making realtime deletion through indexWriter, deletion not synced to indexReader

Previous Topic Next Topic
 
classic Classic list List threaded Threaded
3 messages Options
Reply | Threaded
Open this post in threaded view
|

making realtime deletion through indexWriter, deletion not synced to indexReader

Armnotstrong
Hi, all

I am using lucene 5.5.4.

To try out real-time deletion, I made two singletons:
IndexGenerator.java is responsible for generating the index and for
deletion, and Searcher.java is responsible for doing the search.

In order to make the deletion immediately change the search result, I
expose the indexWriter used in the generation/deletion period and
build the indexSearcher on that.

I have checked the documentation of IndexWriter
[https://lucene.apache.org/core/5_0_0/core/org/apache/lucene/index/DirectoryReader.html]
and quoting:

"flushing just moves the internal buffered state in IndexWriter into
the index, but these changes are not visible to IndexReader until
either commit() or close() is called."

After deleting the document, I call iwriter.flush(); iwriter.commit();

But weird things happen:

  - the query result is always the same before and after the deletion.
It is not until the next run of the program (which means
reloading the index files from disk, I guess?) that the deletion
takes effect.

So I think that the deletion was not synced to the reader.

Note:
Below are the two classes, Searcher.java and IndexGenerator.java, that I
tested with; the main method is in Searcher.java.
There will be a dir in /tmp/test/lucene created by running the code,
but when changing the directory to RAMDirectory, searching will

///////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////Searcher.java////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;

import java.io.IOException;

public class Searcher {
    private static Searcher instance = null;
    private final Analyzer analyzer = new SimpleAnalyzer();
    private final IndexReader ireader ;
    private final IndexSearcher isearcher;

    public static Searcher getInstance() throws IOException {
        if(instance == null){
            synchronized (Searcher.class){
                if(instance == null){
                    instance = new Searcher();
                }
            }
        }
        return instance;
    }

    private Searcher() throws IOException {
        IndexWriter indexWriter = IndexGenerator.getInstance().getWriter();
        ireader = DirectoryReader.open(indexWriter, true);
        System.out.println("total doc num: " + ireader.numDocs());
        isearcher = new IndexSearcher(ireader);
    }

    public void search(String keyword) throws ParseException, IOException {
        System.out.println("deleted : " + ireader.hasDeletions() + ",
" + "deleted num: " + ireader.numDeletedDocs());
        Query query = new QueryParser("text", analyzer).parse(keyword);
        ScoreDoc[] hits = isearcher.search(query, 100).scoreDocs;
        for(ScoreDoc hitDoc: hits){
            Document doc = isearcher.doc(hitDoc.doc);
            System.out.println("matching text: '"+doc.get("text")+"'");
        }
    }

    public static void main(String[] argvs) throws IOException, ParseException {
        IndexGenerator indexGenerator = IndexGenerator.getInstance();
        indexGenerator.rebuildIndex();
        Searcher searcher = Searcher.getInstance();

        System.out.println("--before delete--");
        searcher.search("world");
        indexGenerator.deleteById("123456");
        System.out.println("--after delete--");
        searcher.search("world");
    }
}

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////IndexGenerator.java///////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.*;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

public class IndexGenerator {
    private static IndexGenerator instance = null;

    private final Map<String, FieldType> fieldTypes;
    private final IndexWriter iwriter;

    SimpleAnalyzer analyzer = new SimpleAnalyzer();

    public static IndexGenerator getInstance() throws IOException {
        if (instance == null) {
            synchronized (IndexGenerator.class) {
                if (instance == null) {
                    instance = new IndexGenerator();
                }
            }
        }
        return instance;
    }

    private IndexGenerator() throws IOException {
        fieldTypes = new HashMap<>();
        FieldType idFt = new FieldType();
        idFt.setStored(true);
        idFt.setOmitNorms(true);
        idFt.setIndexOptions(IndexOptions.DOCS);
        idFt.setTokenized(false);
        idFt.freeze();
        fieldTypes.put("_id", idFt);

        FieldType textFt = new FieldType();
        textFt.setStored(true);
        textFt.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        fieldTypes.put("text", textFt);

        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        iwriter = new
IndexWriter(FSDirectory.open(Paths.get("/tmp/test/lucene")), config);
//        iwriter = new IndexWriter(new RAMDirectory(), config);
    }


    public void deleteById(String id) {
        try {
            Term term = new Term("_id", id);
            iwriter.deleteDocuments(term);
            iwriter.flush();
            iwriter.commit();
        }catch (IOException e){
            e.printStackTrace();
        }
    }

    public IndexWriter getWriter(){
        return iwriter;
    }

    public void rebuildIndex() throws IOException {
        iwriter.deleteAll();
        iwriter.flush();
        iwriter.commit();

        Document doc1 = new Document();
        Field idField1 = new Field("_id", "123456", fieldTypes.get("_id"));
        Field textField1 = new Field("text", "hello world",
fieldTypes.get("text"));
        doc1.add(idField1);
        doc1.add(textField1);
        iwriter.addDocument(doc1);

        Document doc2 = new Document();
        Field idField2 = new Field("_id", "abcdefg", fieldTypes.get("_id"));
        Field textField2 = new Field("text", "hi world",
fieldTypes.get("text"));
        doc2.add(idField2);
        doc2.add(textField2);
        iwriter.addDocument(doc2);
        iwriter.flush();
        iwriter.commit();
    }

}

---------------------------------------------------------------------
To unsubscribe, e-mail: [hidden email]
For additional commands, e-mail: [hidden email]

Reply | Threaded
Open this post in threaded view
|

Re: making realtime deletion through indexWriter, deletion not synced to indexReader

Michael McCandless-2
An IndexSearcher only searches the point-in-time view of your index as
of when it was opened.

So any changes to the index (newly indexed documents, or deleted
documents) will not be reflected until you refresh the underlying
IndexReader, to see the next point-in-time view.

The refresh is efficient: the cost is generally only in proportion to
how much has changed in the index since you last refreshed; this is
what Lucene calls "near real time" search.

In your case, if you simply close the old searcher, and open a new one
(passing in your IW instance) then it will be near-real-time.

But more generally for easier thread safety it's best to use Lucene's
SearcherManager class to manage the lifetime of each searcher ...

Mike McCandless

http://blog.mikemccandless.com


On Sun, Feb 19, 2017 at 9:37 PM, ximing <[hidden email]> wrote:

> Hi, all
>
> I am using lucene 5.5.4.
>
> As trying to make a real-time deletion, I made two singletons,
> IndexGenerator.java is responsible for the index generating and
> deletion, and Searcher.java is responsible for doing the search.
>
> In order to make the deletion immediately change the search result, I
> expose the indexWriter used in the generation/deletion period and
> build the indexSearcher on that.
>
> I have checked the documentation of IndexWriter
> [https://lucene.apache.org/core/5_0_0/core/org/apache/lucene/index/DirectoryReader.html]
> and quoting:
>
> "flushing just moves the internal buffered state in IndexWriter into
> the index, but these changes are not visible to IndexReader until
> either commit() or close() is called."
>
> After deleting the document, I do a iwriter.flush(); iwriter.commit();
>
> But wired things happen:
>
>   - query result always the same before or after the deletion happen.
> it will be not until next time running the programming(which means
> reloading index files from the disk again, I guess?) that the deletion
> was made functional.
>
> So I think that the deletion was not synced to the reader.
>
> Note:
> Bellowing are the two classes Searcher.java and IndexGenerator.java I
> do the test with, main method has been put in Searcher.java.
> There will be a dir in /tmp/test/lucene created by running the code,
> but when changing the directory to RAMDirectory, searching will
>
> ///////////////////////////////////////////////////////////////////////////////////////////////////////
> ////////////////////////////////Searcher.java////////////////////////////////////////////////////
> ///////////////////////////////////////////////////////////////////////////////////////////////////////
>
> import org.apache.lucene.analysis.Analyzer;
> import org.apache.lucene.analysis.core.SimpleAnalyzer;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.index.DirectoryReader;
> import org.apache.lucene.index.IndexReader;
> import org.apache.lucene.index.IndexWriter;
> import org.apache.lucene.queryparser.classic.ParseException;
> import org.apache.lucene.queryparser.classic.QueryParser;
> import org.apache.lucene.search.*;
>
> import java.io.IOException;
>
> public class Searcher {
>     private static Searcher instance = null;
>     private final Analyzer analyzer = new SimpleAnalyzer();
>     private final IndexReader ireader ;
>     private final IndexSearcher isearcher;
>
>     public static Searcher getInstance() throws IOException {
>         if(instance == null){
>             synchronized (Searcher.class){
>                 if(instance == null){
>                     instance = new Searcher();
>                 }
>             }
>         }
>         return instance;
>     }
>
>     private Searcher() throws IOException {
>         IndexWriter indexWriter = IndexGenerator.getInstance().getWriter();
>         ireader = DirectoryReader.open(indexWriter, true);
>         System.out.println("total doc num: " + ireader.numDocs());
>         isearcher = new IndexSearcher(ireader);
>     }
>
>     public void search(String keyword) throws ParseException, IOException {
>         System.out.println("deleted : " + ireader.hasDeletions() + ",
> " + "deleted num: " + ireader.numDeletedDocs());
>         Query query = new QueryParser("text", analyzer).parse(keyword);
>         ScoreDoc[] hits = isearcher.search(query, 100).scoreDocs;
>         for(ScoreDoc hitDoc: hits){
>             Document doc = isearcher.doc(hitDoc.doc);
>             System.out.println("matching text: '"+doc.get("text")+"'");
>         }
>     }
>
>     public static void main(String[] argvs) throws IOException, ParseException {
>         IndexGenerator indexGenerator = IndexGenerator.getInstance();
>         indexGenerator.rebuildIndex();
>         Searcher searcher = Searcher.getInstance();
>
>         System.out.println("--before delete--");
>         searcher.search("world");
>         indexGenerator.deleteById("123456");
>         System.out.println("--after delete--");
>         searcher.search("world");
>     }
> }
>
> //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
> //////////////////////////////////IndexGenerator.java///////////////////////////////////////////////////////////////
> /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
>
> import org.apache.lucene.analysis.core.SimpleAnalyzer;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.document.Field;
> import org.apache.lucene.document.FieldType;
> import org.apache.lucene.index.*;
> import org.apache.lucene.store.FSDirectory;
>
> import java.io.IOException;
> import java.nio.file.Paths;
> import java.util.HashMap;
> import java.util.Map;
>
> public class IndexGenerator {
>     private static IndexGenerator instance = null;
>
>     private final Map<String, FieldType> fieldTypes;
>     private final IndexWriter iwriter;
>
>     SimpleAnalyzer analyzer = new SimpleAnalyzer();
>
>     public static IndexGenerator getInstance() throws IOException {
>         if (instance == null) {
>             synchronized (IndexGenerator.class) {
>                 if (instance == null) {
>                     instance = new IndexGenerator();
>                 }
>             }
>         }
>         return instance;
>     }
>
>     private IndexGenerator() throws IOException {
>         fieldTypes = new HashMap<>();
>         FieldType idFt = new FieldType();
>         idFt.setStored(true);
>         idFt.setOmitNorms(true);
>         idFt.setIndexOptions(IndexOptions.DOCS);
>         idFt.setTokenized(false);
>         idFt.freeze();
>         fieldTypes.put("_id", idFt);
>
>         FieldType textFt = new FieldType();
>         textFt.setStored(true);
>         textFt.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
>         fieldTypes.put("text", textFt);
>
>         IndexWriterConfig config = new IndexWriterConfig(analyzer);
>         config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
>         iwriter = new
> IndexWriter(FSDirectory.open(Paths.get("/tmp/test/lucene")), config);
> //        iwriter = new IndexWriter(new RAMDirectory(), config);
>     }
>
>
>     public void deleteById(String id) {
>         try {
>             Term term = new Term("_id", id);
>             iwriter.deleteDocuments(term);
>             iwriter.flush();
>             iwriter.commit();
>         }catch (IOException e){
>             e.printStackTrace();
>         }
>     }
>
>     public IndexWriter getWriter(){
>         return iwriter;
>     }
>
>     public void rebuildIndex() throws IOException {
>         iwriter.deleteAll();
>         iwriter.flush();
>         iwriter.commit();
>
>         Document doc1 = new Document();
>         Field idField1 = new Field("_id", "123456", fieldTypes.get("_id"));
>         Field textField1 = new Field("text", "hello world",
> fieldTypes.get("text"));
>         doc1.add(idField1);
>         doc1.add(textField1);
>         iwriter.addDocument(doc1);
>
>         Document doc2 = new Document();
>         Field idField2 = new Field("_id", "abcdefg", fieldTypes.get("_id"));
>         Field textField2 = new Field("text", "hi world",
> fieldTypes.get("text"));
>         doc2.add(idField2);
>         doc2.add(textField2);
>         iwriter.addDocument(doc2);
>         iwriter.flush();
>         iwriter.commit();
>     }
>
> }
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: [hidden email]
> For additional commands, e-mail: [hidden email]
>

---------------------------------------------------------------------
To unsubscribe, e-mail: [hidden email]
For additional commands, e-mail: [hidden email]

Reply | Threaded
Open this post in threaded view
|

Re: making realtime deletion through indexWriter, deletion not synced to indexReader

Armnotstrong
Thanks, Michael:

I made a simple API: whenever I want the updated index to take
effect, it re-assigns the singleton instance of Searcher to null,
and that seems to work.
I'll check the SearcherManager for further optimization.

Thanks for your trouble to review this problem and it really helps.

good day

2017-02-20 17:45 GMT+08:00 Michael McCandless <[hidden email]>:

> An IndexSearcher only searches the point-in-time view of your index as
> of when it was opened.
>
> So any changes to the index (newly indexed documents, or deleted
> documents) will not be reflected until you refresh the underlying
> IndexReader, to see the next point-in-time view.
>
> The refresh is efficient: the cost is generally only in proportion to
> how much has changed in the index since you last refreshed; this is
> what Lucene calls "near real time" search.
>
> In your case, if you simply close the old searcher, and open a new one
> (passing in your IW instance) then it will be near-real-time.
>
> But more generally for easier thread safety it's best to use Lucene's
> SearcherManager class to manage the lifetime of each searcher ...
>
> Mike McCandless
>
> http://blog.mikemccandless.com
>
>
> On Sun, Feb 19, 2017 at 9:37 PM, ximing <[hidden email]> wrote:
>> Hi, all
>>
>> I am using lucene 5.5.4.
>>
>> As trying to make a real-time deletion, I made two singletons,
>> IndexGenerator.java is responsible for the index generating and
>> deletion, and Searcher.java is responsible for doing the search.
>>
>> In order to make the deletion immediately change the search result, I
>> expose the indexWriter used in the generation/deletion period and
>> build the indexSearcher on that.
>>
>> I have checked the documentation of IndexWriter
>> [https://lucene.apache.org/core/5_0_0/core/org/apache/lucene/index/DirectoryReader.html]
>> and quoting:
>>
>> "flushing just moves the internal buffered state in IndexWriter into
>> the index, but these changes are not visible to IndexReader until
>> either commit() or close() is called."
>>
>> After deleting the document, I do a iwriter.flush(); iwriter.commit();
>>
>> But wired things happen:
>>
>>   - query result always the same before or after the deletion happen.
>> it will be not until next time running the programming(which means
>> reloading index files from the disk again, I guess?) that the deletion
>> was made functional.
>>
>> So I think that the deletion was not synced to the reader.
>>
>> Note:
>> Bellowing are the two classes Searcher.java and IndexGenerator.java I
>> do the test with, main method has been put in Searcher.java.
>> There will be a dir in /tmp/test/lucene created by running the code,
>> but when changing the directory to RAMDirectory, searching will
>>
>> ///////////////////////////////////////////////////////////////////////////////////////////////////////
>> ////////////////////////////////Searcher.java////////////////////////////////////////////////////
>> ///////////////////////////////////////////////////////////////////////////////////////////////////////
>>
>> import org.apache.lucene.analysis.Analyzer;
>> import org.apache.lucene.analysis.core.SimpleAnalyzer;
>> import org.apache.lucene.document.Document;
>> import org.apache.lucene.index.DirectoryReader;
>> import org.apache.lucene.index.IndexReader;
>> import org.apache.lucene.index.IndexWriter;
>> import org.apache.lucene.queryparser.classic.ParseException;
>> import org.apache.lucene.queryparser.classic.QueryParser;
>> import org.apache.lucene.search.*;
>>
>> import java.io.IOException;
>>
>> public class Searcher {
>>     private static Searcher instance = null;
>>     private final Analyzer analyzer = new SimpleAnalyzer();
>>     private final IndexReader ireader ;
>>     private final IndexSearcher isearcher;
>>
>>     public static Searcher getInstance() throws IOException {
>>         if(instance == null){
>>             synchronized (Searcher.class){
>>                 if(instance == null){
>>                     instance = new Searcher();
>>                 }
>>             }
>>         }
>>         return instance;
>>     }
>>
>>     private Searcher() throws IOException {
>>         IndexWriter indexWriter = IndexGenerator.getInstance().getWriter();
>>         ireader = DirectoryReader.open(indexWriter, true);
>>         System.out.println("total doc num: " + ireader.numDocs());
>>         isearcher = new IndexSearcher(ireader);
>>     }
>>
>>     public void search(String keyword) throws ParseException, IOException {
>>         System.out.println("deleted : " + ireader.hasDeletions() + ",
>> " + "deleted num: " + ireader.numDeletedDocs());
>>         Query query = new QueryParser("text", analyzer).parse(keyword);
>>         ScoreDoc[] hits = isearcher.search(query, 100).scoreDocs;
>>         for(ScoreDoc hitDoc: hits){
>>             Document doc = isearcher.doc(hitDoc.doc);
>>             System.out.println("matching text: '"+doc.get("text")+"'");
>>         }
>>     }
>>
>>     public static void main(String[] argvs) throws IOException, ParseException {
>>         IndexGenerator indexGenerator = IndexGenerator.getInstance();
>>         indexGenerator.rebuildIndex();
>>         Searcher searcher = Searcher.getInstance();
>>
>>         System.out.println("--before delete--");
>>         searcher.search("world");
>>         indexGenerator.deleteById("123456");
>>         System.out.println("--after delete--");
>>         searcher.search("world");
>>     }
>> }
>>
>> //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
>> //////////////////////////////////IndexGenerator.java///////////////////////////////////////////////////////////////
>> /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
>>
>> import org.apache.lucene.analysis.core.SimpleAnalyzer;
>> import org.apache.lucene.document.Document;
>> import org.apache.lucene.document.Field;
>> import org.apache.lucene.document.FieldType;
>> import org.apache.lucene.index.*;
>> import org.apache.lucene.store.FSDirectory;
>>
>> import java.io.IOException;
>> import java.nio.file.Paths;
>> import java.util.HashMap;
>> import java.util.Map;
>>
>> public class IndexGenerator {
>>     private static IndexGenerator instance = null;
>>
>>     private final Map<String, FieldType> fieldTypes;
>>     private final IndexWriter iwriter;
>>
>>     SimpleAnalyzer analyzer = new SimpleAnalyzer();
>>
>>     public static IndexGenerator getInstance() throws IOException {
>>         if (instance == null) {
>>             synchronized (IndexGenerator.class) {
>>                 if (instance == null) {
>>                     instance = new IndexGenerator();
>>                 }
>>             }
>>         }
>>         return instance;
>>     }
>>
>>     private IndexGenerator() throws IOException {
>>         fieldTypes = new HashMap<>();
>>         FieldType idFt = new FieldType();
>>         idFt.setStored(true);
>>         idFt.setOmitNorms(true);
>>         idFt.setIndexOptions(IndexOptions.DOCS);
>>         idFt.setTokenized(false);
>>         idFt.freeze();
>>         fieldTypes.put("_id", idFt);
>>
>>         FieldType textFt = new FieldType();
>>         textFt.setStored(true);
>>         textFt.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
>>         fieldTypes.put("text", textFt);
>>
>>         IndexWriterConfig config = new IndexWriterConfig(analyzer);
>>         config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
>>         iwriter = new
>> IndexWriter(FSDirectory.open(Paths.get("/tmp/test/lucene")), config);
>> //        iwriter = new IndexWriter(new RAMDirectory(), config);
>>     }
>>
>>
>>     public void deleteById(String id) {
>>         try {
>>             Term term = new Term("_id", id);
>>             iwriter.deleteDocuments(term);
>>             iwriter.flush();
>>             iwriter.commit();
>>         }catch (IOException e){
>>             e.printStackTrace();
>>         }
>>     }
>>
>>     public IndexWriter getWriter(){
>>         return iwriter;
>>     }
>>
>>     public void rebuildIndex() throws IOException {
>>         iwriter.deleteAll();
>>         iwriter.flush();
>>         iwriter.commit();
>>
>>         Document doc1 = new Document();
>>         Field idField1 = new Field("_id", "123456", fieldTypes.get("_id"));
>>         Field textField1 = new Field("text", "hello world",
>> fieldTypes.get("text"));
>>         doc1.add(idField1);
>>         doc1.add(textField1);
>>         iwriter.addDocument(doc1);
>>
>>         Document doc2 = new Document();
>>         Field idField2 = new Field("_id", "abcdefg", fieldTypes.get("_id"));
>>         Field textField2 = new Field("text", "hi world",
>> fieldTypes.get("text"));
>>         doc2.add(idField2);
>>         doc2.add(textField2);
>>         iwriter.addDocument(doc2);
>>         iwriter.flush();
>>         iwriter.commit();
>>     }
>>
>> }
>>
>> ---------------------------------------------------------------------
>> To unsubscribe, e-mail: [hidden email]
>> For additional commands, e-mail: [hidden email]
>>

---------------------------------------------------------------------
To unsubscribe, e-mail: [hidden email]
For additional commands, e-mail: [hidden email]