Merge pull request #233 from scott1702/feature/enable-macrons-default
Enable macrons and non-ASCII chars in search by default Via the use of [`ASCIIFoldingFilter`](https://lucene.apache.org/solr/guide/6_6/filter-descriptions.html#FilterDescriptions-ASCIIFoldingFilter) which will convert chars from outside the ASCII range to their closest equivalent - e.g. an ō into a simple o. >## ASCII Folding Filter >This filter converts alphabetic, numeric, and symbolic Unicode characters which are not in the Basic Latin Unicode block (the first 127 ASCII characters) to their ASCII equivalents, if one exists. This filter converts characters from the following Unicode blocks: > > - C1 Controls and Latin-1 Supplement (PDF) > - Latin Extended-A (PDF) > - Latin Extended-B (PDF) > - Latin Extended Additional (PDF) > - Latin Extended-C (PDF) > - Latin Extended-D (PDF) > - IPA Extensions (PDF) > - Phonetic Extensions (PDF) > - Phonetic Extensions Supplement (PDF) > - General Punctuation (PDF) > - Superscripts and Subscripts (PDF) > - Enclosed Alphanumerics (PDF) > - Dingbats (PDF) > - Supplemental Punctuation (PDF) > - Alphabetic Presentation Forms (PDF) > - Halfwidth and Fullwidth Forms (PDF) > > > **Factory class**: solr.ASCIIFoldingFilterFactory > > **Arguments**: > `preserveOriginal` - (boolean, default false) If true, the original token is preserved: "thé" → "the", "thé" > > **Example**: > ```xml > <analyzer> > <tokenizer class="solr.WhitespaceTokenizer"/> > <filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="false" /> > </analyzer> > ``` > **In**: "á" (Unicode character 00E1) > **Out**: "a" (ASCII character 97)
This commit is contained in:
commit
6eb6ebacb4
|
@ -8,7 +8,7 @@
|
||||||
|
|
||||||
<!-- The optional sortMissingLast and sortMissingFirst attributes are
|
<!-- The optional sortMissingLast and sortMissingFirst attributes are
|
||||||
currently supported on types that are sorted internally as strings.
|
currently supported on types that are sorted internally as strings.
|
||||||
This includes "string","boolean","sint","slong","sfloat","sdouble","pdate"
|
This includes "string","boolean","sint","slong","sfloat","sdouble","pdate"
|
||||||
- If sortMissingLast="true", then a sort on this field will cause documents
|
- If sortMissingLast="true", then a sort on this field will cause documents
|
||||||
without the field to come after documents with the field,
|
without the field to come after documents with the field,
|
||||||
regardless of the requested sort order (asc or desc).
|
regardless of the requested sort order (asc or desc).
|
||||||
|
@ -136,9 +136,11 @@
|
||||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
|
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
|
||||||
|
<filter class="solr.ASCIIFoldingFilterFactory"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
<analyzer type="query">
|
<analyzer type="query">
|
||||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.ASCIIFoldingFilterFactory"/>
|
||||||
<filter class="solr.KeywordRepeatFilterFactory"/>
|
<filter class="solr.KeywordRepeatFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory"
|
<filter class="solr.StopFilterFactory"
|
||||||
ignoreCase="true"
|
ignoreCase="true"
|
||||||
|
@ -162,9 +164,11 @@
|
||||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
|
||||||
<filter class="solr.LowerCaseFilterFactory"/>
|
<filter class="solr.LowerCaseFilterFactory"/>
|
||||||
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
|
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
|
||||||
|
<filter class="solr.ASCIIFoldingFilterFactory"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
<analyzer type="query">
|
<analyzer type="query">
|
||||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.ASCIIFoldingFilterFactory"/>
|
||||||
<filter class="solr.KeywordRepeatFilterFactory"/>
|
<filter class="solr.KeywordRepeatFilterFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
|
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
|
||||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
|
||||||
|
@ -239,7 +243,7 @@
|
||||||
|
|
||||||
<!-- A general unstemmed text field that indexes tokens normally and also
|
<!-- A general unstemmed text field that indexes tokens normally and also
|
||||||
reversed (via ReversedWildcardFilterFactory), to enable more efficient
|
reversed (via ReversedWildcardFilterFactory), to enable more efficient
|
||||||
leading wildcard queries. -->
|
leading wildcard queries. -->
|
||||||
<fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100">
|
<fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100">
|
||||||
<analyzer type="index">
|
<analyzer type="index">
|
||||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
@ -320,10 +324,10 @@
|
||||||
a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f
|
a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f
|
||||||
Attributes of the DelimitedPayloadTokenFilterFactory :
|
Attributes of the DelimitedPayloadTokenFilterFactory :
|
||||||
"delimiter" - a one character delimiter. Default is | (pipe)
|
"delimiter" - a one character delimiter. Default is | (pipe)
|
||||||
"encoder" - how to encode the following value into a payload
|
"encoder" - how to encode the following value into a payload
|
||||||
float -> org.apache.lucene.analysis.payloads.FloatEncoder,
|
float -> org.apache.lucene.analysis.payloads.FloatEncoder,
|
||||||
integer -> o.a.l.a.p.IntegerEncoder
|
integer -> o.a.l.a.p.IntegerEncoder
|
||||||
identity -> o.a.l.a.p.IdentityEncoder
|
identity -> o.a.l.a.p.IdentityEncoder
|
||||||
Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor.
|
Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor.
|
||||||
-->
|
-->
|
||||||
<filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
|
<filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
|
||||||
|
@ -364,8 +368,8 @@
|
||||||
<!-- A specialized field for geospatial search. If indexed, this fieldType must not be multivalued. -->
|
<!-- A specialized field for geospatial search. If indexed, this fieldType must not be multivalued. -->
|
||||||
<fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/>
|
<fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/>
|
||||||
|
|
||||||
<!--
|
<!--
|
||||||
A Geohash is a compact representation of a latitude longitude pair in a single field.
|
A Geohash is a compact representation of a latitude longitude pair in a single field.
|
||||||
See http://wiki.apache.org/solr/SpatialSearch
|
See http://wiki.apache.org/solr/SpatialSearch
|
||||||
-->
|
-->
|
||||||
<fieldtype name="geohash" class="solr.GeoHashField"/>
|
<fieldtype name="geohash" class="solr.GeoHashField"/>
|
|
@ -428,7 +428,7 @@ To allow searches on words containing numeric tokens, you'll need to change the
|
||||||
|
|
||||||
The `ASCIIFoldingFilterFactory` filter converts alphabetic, numeric, and symbolic Unicode characters which are not in the Basic Latin Unicode block (the first 128 ASCII characters) to their ASCII equivalents, if one exists.
|
The `ASCIIFoldingFilterFactory` filter converts alphabetic, numeric, and symbolic Unicode characters which are not in the Basic Latin Unicode block (the first 128 ASCII characters) to their ASCII equivalents, if one exists.
|
||||||
|
|
||||||
Find the fields in your overloaded `types.ss` that you want to enable this behaviour in, for example inside the `<fieldType name="htmltext">` block, add the following to both its index analyzer and query analyzer records.
|
By default, this functionality is enabled on the `htmltext` and `text` fieldTypes. To enable it for any other fieldType, find that fieldType's block in your overloaded `types.ss` (for example, the `<fieldType name="textTight">` block) and add the following filter to both its index analyzer and its query analyzer.
|
||||||
|
|
||||||
```xml
|
```xml
|
||||||
<filter class="solr.ASCIIFoldingFilterFactory"/>
|
<filter class="solr.ASCIIFoldingFilterFactory"/>
|
||||||
|
|
Loading…
Reference in New Issue