Howto Search word which contains the character "

classic Classic list List threaded Threaded
3 messages Options
Reply | Threaded
Open this post in threaded view
|

Howto Search word which contains the character "

heyyo
In Hebrew, words can contain the character "
ex: דו"ח

I would like to know how to configure my schema.xml to be able to index and search correctly those types of words.

If I search for this character " in the Solr query tool, I get this debug output:

"debug": {
    "rawquerystring": "\"",
    "querystring": "\"",
    "parsedquery": "(+())/no_coord",
    "parsedquery_toString": "+()",


So, if I understand correctly, Solr removes the " when the query is parsed.


I'm using this schema:

<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">

        <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
       
       
       
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"
                />
        <filter class="solr.WordDelimiterFilterFactory"
                protected="protwords.txt"
                generateWordParts="1"
                generateNumberParts="1"
                catenateWords="1"
                catenateNumbers="1"
                catenateAll="0"
                splitOnCaseChange="1"
                preserveOriginal="1"/>
        <filter class="solr.LengthFilterFactory" min="2" max="100" />

        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
        <filter class="solr.EdgeNGramFilterFactory" minGramSize="2" maxGramSize="25" />
      </analyzer>
      <analyzer type="query">
        <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"
                />
        <filter class="solr.WordDelimiterFilterFactory"
                protected="protwords.txt"
                generateWordParts="1"
                generateNumberParts="1"
                catenateWords="0"
                catenateNumbers="0"
                catenateAll="0"
                splitOnCaseChange="1"
                preserveOriginal="1"/>
        <filter class="solr.LengthFilterFactory" min="2" max="100" />
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>




Reply | Threaded
Open this post in threaded view
|

Re: Howto Search word which contains the character "

iorixxx
Hi,

It is a special query parser character, so it needs to be escaped.

http://lucene.apache.org/core/2_9_4/queryparsersyntax.html#Escaping%20Special%20Characters

Ahmet





On Tuesday, May 20, 2014 10:57 AM, heyyo <[hidden email]> wrote:
In hebrew words could contain the character *"*
ex: דו"ח

I would like to know how to configure my schema.xml to be able to index and
search correctly those types of words.

If I search this character *"* inside solr query tool I got this debug:

/"debug": {
    "rawquerystring": "\"",
    "querystring": "\"",
    "parsedquery": "(+())/no_coord",
    "parsedquery_toString": "+()",
/

So if I understand correctly solr remove the " when the query is parsed.


I'm using this schema:

<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">

        <charFilter class="solr.MappingCharFilterFactory"
mapping="mapping-ISOLatin1Accent.txt"/>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
       
       
       
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"
                />
        <filter class="solr.WordDelimiterFilterFactory"
                protected="protwords.txt"
                generateWordParts="1"
                generateNumberParts="1"
                catenateWords="1"
                catenateNumbers="1"
                catenateAll="0"
                splitOnCaseChange="1"
                preserveOriginal="1"/>
        <filter class="solr.LengthFilterFactory" min="2" max="100" />

        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="English"
protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
        <filter class="solr.EdgeNGramFilterFactory" minGramSize="2"
maxGramSize="25" />
      </analyzer>
      <analyzer type="query">
        <charFilter class="solr.MappingCharFilterFactory"
mapping="mapping-ISOLatin1Accent.txt"/>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"
                />
        <filter class="solr.WordDelimiterFilterFactory"
                protected="protwords.txt"
                generateWordParts="1"
                generateNumberParts="1"
                catenateWords="0"
                catenateNumbers="0"
                catenateAll="0"
                splitOnCaseChange="1"
                preserveOriginal="1"/>
        <filter class="solr.LengthFilterFactory" min="2" max="100" />
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="English"
protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>








--
View this message in context: http://lucene.472066.n3.nabble.com/Howto-Search-word-which-contains-the-character-tp4137083.html
Sent from the Solr - User mailing list archive at Nabble.com.
Reply | Threaded
Open this post in threaded view
|

Re: Howto Search word which contains the character "

Jack Krupansky-2
It looks like it was escaped in the query, but the word delimiter filter
will remove it and treat it as if it were white space.

The "types" attribute for WDF can point to a file containing the types for
various characters, so you could map a quote to ALPHA.

The doc is sketchy, but there are some examples in my e-book that show how
to map @ and _ to ALPHA.

-- Jack Krupansky

-----Original Message-----
From: Ahmet Arslan
Sent: Tuesday, May 20, 2014 4:55 AM
To: [hidden email]
Subject: Re: Howto Search word which contains the character "

Hi,

It is a special query parser character, so it needs to be escaped.

http://lucene.apache.org/core/2_9_4/queryparsersyntax.html#Escaping%20Special%20Characters

Ahmet





On Tuesday, May 20, 2014 10:57 AM, heyyo <[hidden email]> wrote:
In hebrew words could contain the character *"*
ex: דו"ח

I would like to know how to configure my schema.xml to be able to index and
search correctly those types of words.

If I search this character *"* inside solr query tool I got this debug:

/"debug": {
    "rawquerystring": "\"",
    "querystring": "\"",
    "parsedquery": "(+())/no_coord",
    "parsedquery_toString": "+()",
/

So if I understand correctly solr remove the " when the query is parsed.


I'm using this schema:

<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">

        <charFilter class="solr.MappingCharFilterFactory"
mapping="mapping-ISOLatin1Accent.txt"/>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>



        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"
                />
        <filter class="solr.WordDelimiterFilterFactory"
                protected="protwords.txt"
                generateWordParts="1"
                generateNumberParts="1"
                catenateWords="1"
                catenateNumbers="1"
                catenateAll="0"
                splitOnCaseChange="1"
                preserveOriginal="1"/>
        <filter class="solr.LengthFilterFactory" min="2" max="100" />

        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="English"
protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
        <filter class="solr.EdgeNGramFilterFactory" minGramSize="2"
maxGramSize="25" />
      </analyzer>
      <analyzer type="query">
        <charFilter class="solr.MappingCharFilterFactory"
mapping="mapping-ISOLatin1Accent.txt"/>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"
                />
        <filter class="solr.WordDelimiterFilterFactory"
                protected="protwords.txt"
                generateWordParts="1"
                generateNumberParts="1"
                catenateWords="0"
                catenateNumbers="0"
                catenateAll="0"
                splitOnCaseChange="1"
                preserveOriginal="1"/>
        <filter class="solr.LengthFilterFactory" min="2" max="100" />
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="English"
protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>








--
View this message in context:
http://lucene.472066.n3.nabble.com/Howto-Search-word-which-contains-the-character-tp4137083.html
Sent from the Solr - User mailing list archive at Nabble.com.