`

solr的索引和查询顺序

阅读更多

 拜读了solr的部分源码,却急于弄明白solr的索引顺序和查询顺序,如下是探访结果.

 所有的配置都在solr/example/solr/conf/schema.xml当中.

 

<!-- 如下是对text类型的处理 -->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
 <!-- 索引顺序1空格2同义词3过滤词4拆字5小写过滤6关键字7词干抽取算法-->
      <analyzer type="index">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <!-- in this example, we will only use synonyms at query time
        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
        -->
        <!-- Case insensitive stop word removal.
          add enablePositionIncrements=true in both the index and query
          analyzers to leave a 'gap' for more accurate phrase queries.
        -->
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"
                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
        <filter class="solr.PorterStemFilterFactory"/>
      </analyzer>
     <!-- 查询顺序1空格2同义词3过滤词4拆字5小写过滤6关键字7词干抽取算法-->
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"
                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
        <filter class="solr.PorterStemFilterFactory"/>
      </analyzer>
    </fieldType>


    <!-- Less flexible matching, but less false matches.  Probably not ideal for product names,
         but may be good for SKUs.  Can insert dashes in the wrong place and still match. -->
   <!-- 针对textTight类型-->
    <fieldType name="textTight" class="solr.TextField" positionIncrementGap="100" >
 <!-- 查询顺序1空格2同义词3过滤词4拆字5小写过滤6关键字7英文相近词8去除重复词
 -->
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
        <filter class="solr.EnglishMinimalStemFilterFactory"/>
        <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
             possible with WordDelimiterFilter in conjuncton with stemming. -->
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>


    <!-- A general unstemmed text field - good if one does not know the language of the field -->
 <!-- 针对textgen类型 -->
    <fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
     <!-- 索引顺序1空格2过滤词3拆字4小写过滤-->
      <analyzer type="index">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
<!-- 查询顺序1空格2同义词3过滤词4小写过滤-->
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"
                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>


    <!-- A general unstemmed text field that indexes tokens normally and also
         reversed (via ReversedWildcardFilterFactory), to enable more efficient 
	 leading wildcard queries. -->
   <!-- 针对text_rev类型 -->
    <fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100">
  <!-- 索引顺序1空格2过滤词3拆字4小写过滤6转义通配符-->
      <analyzer type="index">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
           maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
      </analyzer>
 <!-- 查询顺序1空格2同义词3过滤词4拆字5小写过滤 -->
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true"
                words="stopwords.txt"
                enablePositionIncrements="true"
                />
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- charFilter + WhitespaceTokenizer  -->
    <!--
    <fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100" >
      <analyzer>
        <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
      </analyzer>
    </fieldType>
    -->

    <!-- This is an example of using the KeywordTokenizer along
         With various TokenFilterFactories to produce a sortable field
         that does not include some properties of the source text
      -->
    <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
      <analyzer>
        <!-- KeywordTokenizer does no actual tokenizing, so the entire
             input string is preserved as a single token
          -->
        <tokenizer class="solr.KeywordTokenizerFactory"/>
        <!-- The LowerCase TokenFilter does what you expect, which can be
             when you want your sorting to be case insensitive
          -->
        <filter class="solr.LowerCaseFilterFactory" />
        <!-- The TrimFilter removes any leading or trailing whitespace -->
        <filter class="solr.TrimFilterFactory" />
        <!-- The PatternReplaceFilter gives you the flexibility to use
             Java Regular expression to replace any sequence of characters
             matching a pattern with an arbitrary replacement string, 
             which may include back references to portions of the original
             string matched by the pattern.
             
             See the Java Regular Expression documentation for more
             information on pattern and replacement string syntax.
             
             http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/package-summary.html
          -->
        <filter class="solr.PatternReplaceFilterFactory"
                pattern="([^a-z])" replacement="" replace="all"
        />
      </analyzer>
    </fieldType>
    
    <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
      </analyzer>
    </fieldtype>

    <fieldtype name="payloads" stored="false" indexed="true" class="solr.TextField" >
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <!--
        The DelimitedPayloadTokenFilter can put payloads on tokens... for example,
        a token of "foo|1.4"  would be indexed as "foo" with a payload of 1.4f
        Attributes of the DelimitedPayloadTokenFilterFactory : 
         "delimiter" - a one character delimiter. Default is | (pipe)
	 "encoder" - how to encode the following value into a playload
	    float -> org.apache.lucene.analysis.payloads.FloatEncoder,
	    integer -> o.a.l.a.p.IntegerEncoder
	    identity -> o.a.l.a.p.IdentityEncoder
            Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor.
         -->
        <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
      </analyzer>
    </fieldtype>

    <!-- lowercases the entire field value, keeping it as a single token.  -->
    <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.KeywordTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory" />
      </analyzer>
    </fieldType>

    大致的索引顺序会是:

 1.空格..............................solr.WhitespaceTokenizerFactory

    2同义词............................solr.SynonymFilterFactory

    3过滤词...........................solr.StopFilterFactory

    4拆字..............................solr.WordDelimiterFilterFactory

    5小写过滤.....................solr.LowerCaseFilterFactory

    6关键字.........................solr.KeywordMarkerFilterFactory

    7词干抽取算法............solr.PorterStemFilterFactory

 

 大致的搜素顺序是:

 

 1.空格..............................solr.WhitespaceTokenizerFactory

    2同义词............................solr.SynonymFilterFactory

    3过滤词...........................solr.StopFilterFactory

    4拆字..............................solr.WordDelimiterFilterFactory

    5小写过滤.....................solr.LowerCaseFilterFactory

    6关键字.........................solr.KeywordMarkerFilterFactory

    7英文相近词..................solr.EnglishMinimalStemFilterFactory

    8去除重复词.................solr.RemoveDuplicatesTokenFilterFactory

 

    当然了,你可以根据自己的权重来重新分配索引和搜素顺序

 

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics