Sort difference between 2.1 and 2.3

classic Classic list List threaded Threaded
3 messages Options
adb
Reply | Threaded
Open this post in threaded view
|

Sort difference between 2.1 and 2.3

adb
Hi,

I had a test case that added two documents, each with one untokenized field, and
sorted them.  The data in each document was

char(1) + "First"
char(0xffff) + "Last"

With Lucene 2.1 the documents are sorted correctly, but with Lucene 2.3.1, they
are not.  Looking at the index with Luke shows that the document with "Last" has
not been handled correctly, i.e. the text for the "subject" field is empty.

The test case below shows the problem.

Regards
Antony


import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

public class LastSubjectTest
{
     /**
      *  Set up a number of documents with 1 duplicate ContentId
      *  @throws Exception
      */
     @Before
     public void setUp() throws Exception
     {
         IndexWriter writer = new IndexWriter("TestDir/", new
StandardAnalyzer(), true);
         Document doc = new Document();
         String subject = new StringBuffer(1).append((char)0xffff).toString() +
"Last";
         Field f = new Field("subject", subject, Field.Store.YES,
Field.Index.NO_NORMS);
         doc.add(f);
         writer.addDocument(doc);
         doc = new Document();
         subject = new StringBuffer(1).append((char)0x1).toString() + "First";
         f = new Field("subject", subject, Field.Store.YES, Field.Index.NO_NORMS);
         doc.add(f);
         writer.addDocument(doc);
         writer.close();
     }

     /**
      *  @throws Exception
      */
     @After
     public void tearDown() throws Exception
     {
     }

     /**
      *  Tests that the last is after first document, sorted by subject
      *  @throws IOException
      */
     @Test
     public void testSortDateAscending()
            throws IOException
     {
         IndexSearcher searcher = new IndexSearcher("TestDir/");
         Query q = new MatchAllDocsQuery();
         Sort sort = new Sort(new SortField("subject"));
         Hits hits = searcher.search(q, sort);
         assertEquals("Hits should match all documents",
searcher.getIndexReader().maxDoc(), hits.length());

         Document fd = hits.doc(0);
         Document ld = hits.doc(1);
         String fs = fd.get("subject");
         String ls = ld.get("subject");

         for (int i = 0; i < hits.length(); i++)
         {
             Document doc = hits.doc(i);
             String subject = doc.get("subject");
             System.out.println("Subject:" + subject);
         }
         assertTrue("Subjects have been sorted incorrectly", fs.compareTo(ls) < 0);
     }

}


---------------------------------------------------------------------
To unsubscribe, e-mail: [hidden email]
For additional commands, e-mail: [hidden email]

Reply | Threaded
Open this post in threaded view
|

Re: Sort difference between 2.1 and 2.3

Michael McCandless-2
You're right, Lucene changed wrt the 0xffff character: 2.3 now uses
this character internally as an "end of term" marker when storing term
text.

This was done as part of LUCENE-843 (speeding up indexing).

Technically that character is an invalid UTF16 character (for
interchange), but it looks like a few Lucene users were indeed relying
on older Lucene versions accepting & preserving it.

You could use 0xfffe instead?  Lucene 2.3 will preserve it, though
it's also invalid for interchange (so future Lucene versions might
change wrt that, too).

Or ... it looks like your use case is to sort all "last" values
after all "first" values?  In which case one way to do this (without
using invalid UTF16 characters) might be to add a new field marking
whether you have a "last" or a "first" value, then sort first by that
field and second by your value field?

Mike

Antony Bowesman <[hidden email]> wrote:

> Hi,
>
>  I had a test case that added two documents, each with one untokenized
> field, and sorted them.  The data in each document was
>
>  char(1) + "First"
>  char(0xffff) + "Last"
>
>  With Lucene 2.1 the documents are sorted correctly, but with Lucene 2.3.1,
> they are not.  Looking at the index with Luke shows that the document with
> "Last" has not been handled correctly, i.e. the text for the "subject" field
> is empty.
>
>  The test case below shows the problem.
>
>  Regards
>  Antony
>
>
>  import static org.junit.Assert.assertEquals;
>  import static org.junit.Assert.assertTrue;
>
>  import java.io.IOException;
>
>  import org.apache.lucene.analysis.standard.StandardAnalyzer;
>  import org.apache.lucene.document.Document;
>  import org.apache.lucene.document.Field;
>  import org.apache.lucene.index.IndexWriter;
>  import org.apache.lucene.search.Hits;
>  import org.apache.lucene.search.IndexSearcher;
>  import org.apache.lucene.search.MatchAllDocsQuery;
>  import org.apache.lucene.search.Query;
>  import org.apache.lucene.search.Sort;
>  import org.apache.lucene.search.SortField;
>  import org.junit.After;
>  import org.junit.Before;
>  import org.junit.Test;
>
>  public class LastSubjectTest
>  {
>     /**
>      *  Set up a number of documents with 1 duplicate ContentId
>      *  @throws Exception
>      */
>     @Before
>     public void setUp() throws Exception
>     {
>         IndexWriter writer = new IndexWriter("TestDir/", new
> StandardAnalyzer(), true);
>         Document doc = new Document();
>         String subject = new StringBuffer(1).append((char)0xffff).toString()
> + "Last";
>         Field f = new Field("subject", subject, Field.Store.YES,
> Field.Index.NO_NORMS);
>         doc.add(f);
>         writer.addDocument(doc);
>         doc = new Document();
>         subject = new StringBuffer(1).append((char)0x1).toString() +
> "First";
>         f = new Field("subject", subject, Field.Store.YES,
> Field.Index.NO_NORMS);
>         doc.add(f);
>         writer.addDocument(doc);
>         writer.close();
>     }
>
>     /**
>      *  @throws Exception
>      */
>     @After
>     public void tearDown() throws Exception
>     {
>     }
>
>     /**
>      *  Tests that the last is after first document, sorted by subject
>      *  @throws IOException
>      */
>     @Test
>     public void testSortDateAscending()
>            throws IOException
>     {
>         IndexSearcher searcher = new IndexSearcher("TestDir/");
>         Query q = new MatchAllDocsQuery();
>         Sort sort = new Sort(new SortField("subject"));
>         Hits hits = searcher.search(q, sort);
>         assertEquals("Hits should match all documents",
> searcher.getIndexReader().maxDoc(), hits.length());
>
>         Document fd = hits.doc(0);
>         Document ld = hits.doc(1);
>         String fs = fd.get("subject");
>         String ls = ld.get("subject");
>
>         for (int i = 0; i < hits.length(); i++)
>         {
>             Document doc = hits.doc(i);
>             String subject = doc.get("subject");
>             System.out.println("Subject:" + subject);
>         }
>         assertTrue("Subjects have been sorted incorrectly", fs.compareTo(ls)
> < 0);
>     }
>
>  }
>
>
>  ---------------------------------------------------------------------
>  To unsubscribe, e-mail: [hidden email]
>  For additional commands, e-mail: [hidden email]
>
>

---------------------------------------------------------------------
To unsubscribe, e-mail: [hidden email]
For additional commands, e-mail: [hidden email]

adb
Reply | Threaded
Open this post in threaded view
|

Re: Sort difference between 2.1 and 2.3

adb
Thanks for the explanation, Mike.  It's not a big issue, it's just a test case
where I needed to ensure ordering for the test, so I'll just use a valid
high utf-16 character.  It just seemed odd that the field was showing strangely
in Luke.  Your explanation gives the reason, thanks.

Antony



Michael McCandless wrote:

> You're right, Lucene changed wrt the 0xffff character: 2.3 now uses
> this character internally as an "end of term" marker when storing term
> text.
>
> This was done as part of LUCENE-843 (speeding up indexing).
>
> Technically that character is an invalid UTF16 character (for
> interchange), but it looks like a few Lucene users were indeed relying
> on older Lucene versions accepting & preserving it.
>
> You could use 0xfffe instead?  Lucene 2.3 will preserve it, though
> it's also invalid for interchange (so future Lucene versions might
> change wrt that, too).
>
> Or ... it looks like your use case is to sort all "last" values
> after all "first" values?  In which case one way to do this (without
> using invalid UTF16 characters) might be to add a new field marking
> whether you have a "last" or a "first" value, then sort first by that
> field and second by your value field?
>
> Mike
>
> Antony Bowesman <[hidden email]> wrote:
>> Hi,
>>
>>  I had a test case that added two documents, each with one untokenized
>> field, and sorted them.  The data in each document was
>>
>>  char(1) + "First"
>>  char(0xffff) + "Last"
>>
>>  With Lucene 2.1 the documents are sorted correctly, but with Lucene 2.3.1,
>> they are not.  Looking at the index with Luke shows that the document with
>> "Last" has not been handled correctly, i.e. the text for the "subject" field
>> is empty.
>>
>>  The test case below shows the problem.
>>
>>  Regards
>>  Antony
>>
>>
>>  import static org.junit.Assert.assertEquals;
>>  import static org.junit.Assert.assertTrue;
>>
>>  import java.io.IOException;
>>
>>  import org.apache.lucene.analysis.standard.StandardAnalyzer;
>>  import org.apache.lucene.document.Document;
>>  import org.apache.lucene.document.Field;
>>  import org.apache.lucene.index.IndexWriter;
>>  import org.apache.lucene.search.Hits;
>>  import org.apache.lucene.search.IndexSearcher;
>>  import org.apache.lucene.search.MatchAllDocsQuery;
>>  import org.apache.lucene.search.Query;
>>  import org.apache.lucene.search.Sort;
>>  import org.apache.lucene.search.SortField;
>>  import org.junit.After;
>>  import org.junit.Before;
>>  import org.junit.Test;
>>
>>  public class LastSubjectTest
>>  {
>>     /**
>>      *  Set up a number of documents with 1 duplicate ContentId
>>      *  @throws Exception
>>      */
>>     @Before
>>     public void setUp() throws Exception
>>     {
>>         IndexWriter writer = new IndexWriter("TestDir/", new
>> StandardAnalyzer(), true);
>>         Document doc = new Document();
>>         String subject = new StringBuffer(1).append((char)0xffff).toString()
>> + "Last";
>>         Field f = new Field("subject", subject, Field.Store.YES,
>> Field.Index.NO_NORMS);
>>         doc.add(f);
>>         writer.addDocument(doc);
>>         doc = new Document();
>>         subject = new StringBuffer(1).append((char)0x1).toString() +
>> "First";
>>         f = new Field("subject", subject, Field.Store.YES,
>> Field.Index.NO_NORMS);
>>         doc.add(f);
>>         writer.addDocument(doc);
>>         writer.close();
>>     }
>>
>>     /**
>>      *  @throws Exception
>>      */
>>     @After
>>     public void tearDown() throws Exception
>>     {
>>     }
>>
>>     /**
>>      *  Tests that the last is after first document, sorted by subject
>>      *  @throws IOException
>>      */
>>     @Test
>>     public void testSortDateAscending()
>>            throws IOException
>>     {
>>         IndexSearcher searcher = new IndexSearcher("TestDir/");
>>         Query q = new MatchAllDocsQuery();
>>         Sort sort = new Sort(new SortField("subject"));
>>         Hits hits = searcher.search(q, sort);
>>         assertEquals("Hits should match all documents",
>> searcher.getIndexReader().maxDoc(), hits.length());
>>
>>         Document fd = hits.doc(0);
>>         Document ld = hits.doc(1);
>>         String fs = fd.get("subject");
>>         String ls = ld.get("subject");
>>
>>         for (int i = 0; i < hits.length(); i++)
>>         {
>>             Document doc = hits.doc(i);
>>             String subject = doc.get("subject");
>>             System.out.println("Subject:" + subject);
>>         }
>>         assertTrue("Subjects have been sorted incorrectly", fs.compareTo(ls)
>> < 0);
>>     }
>>
>>  }
>>
>>
>>  ---------------------------------------------------------------------
>>  To unsubscribe, e-mail: [hidden email]
>>  For additional commands, e-mail: [hidden email]
>>
>>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: [hidden email]
> For additional commands, e-mail: [hidden email]
>
>


---------------------------------------------------------------------
To unsubscribe, e-mail: [hidden email]
For additional commands, e-mail: [hidden email]