mirror of
https://github.com/silverstripe/silverstripe-fulltextsearch
synced 2024-10-22 12:05:29 +00:00
Merge pull request #13 from silverstripe-big-o/pull/spell-checking
ENHANCEMENT Better spell checking default config and docs
This commit is contained in:
commit
b10c263784
@ -339,6 +339,10 @@ abstract class SolrIndex extends SearchIndex {
|
||||
$ret['Matches']->setPageStart($offset);
|
||||
// Results per page
|
||||
$ret['Matches']->setPageLength($limit);
|
||||
// Suggestions (requires custom setup, assumes spellcheck.collate=true)
|
||||
if(isset($res->spellcheck->suggestions->collation)) {
|
||||
$ret['Suggestion'] = $res->spellcheck->suggestions->collation;
|
||||
}
|
||||
|
||||
return new ArrayData($ret);
|
||||
}
|
||||
|
@ -761,6 +761,11 @@
|
||||
<str>nameOfCustomComponent2</str>
|
||||
</arr>
|
||||
-->
|
||||
|
||||
<arr name="last-components">
|
||||
<str>spellcheck</str>
|
||||
</arr>
|
||||
|
||||
</requestHandler>
|
||||
|
||||
<!-- A Robust Example
|
||||
@ -1082,13 +1087,13 @@
|
||||
component
|
||||
-->
|
||||
|
||||
<!-- a spellchecker built from a field of hte main index, and
|
||||
written to disk
|
||||
-->
|
||||
<!-- a spellchecker built from a field of the main index, and written to disk -->
|
||||
<lst name="spellchecker">
|
||||
<str name="name">default</str>
|
||||
<str name="field">name</str>
|
||||
<str name="spellcheckIndexDir">spellchecker</str>
|
||||
<str name="field">_text</str>
|
||||
<str name="spellcheckIndexDir">./spellchecker</str>
|
||||
<str name="classname">solr.IndexBasedSpellChecker</str>
|
||||
<str name="buildOnCommit">true</str>
|
||||
</lst>
|
||||
|
||||
<!-- a spellchecker that uses a different distance measure -->
|
||||
|
@ -192,6 +192,17 @@
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- Text optimized for spelling corrections, with minimal alterations (e.g. no stemming) -->
|
||||
<fieldType name="textSpell" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory" />
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
|
||||
<filter class="solr.LengthFilterFactory" min="4" max="20" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
|
||||
<!-- A general unstemmed text field - good if one does not know the language of the field -->
|
||||
<fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
|
||||
|
75
docs/Solr.md
75
docs/Solr.md
@ -138,6 +138,81 @@ from a new file `mysite/solr/templates/types.ss` instead:
|
||||
}
|
||||
}
|
||||
|
||||
### Spell Checking ("Did you mean...")
|
||||
|
||||
Solr has various spell checking strategies (see the ["SpellCheckComponent" docs](http://wiki.apache.org/solr/SpellCheckComponent)), all of which are configured through `solrconfig.xml`.
|
||||
In the default config which is copied into your index,
|
||||
spell checking data is collected from all fulltext fields
|
||||
(everything you added through `SolrIndex->addFulltextField()`).
|
||||
The values of these fields are collected in a special `_text` field.
|
||||
|
||||
$index = new MyIndex();
|
||||
$query = new SearchQuery();
|
||||
$query->search('My Term');
|
||||
$params = array('spellcheck' => 'true', 'spellcheck.collate' => 'true');
|
||||
$results = $index->search($query, -1, -1, $params);
|
||||
$results->Suggestion; // collated suggestion, only set when Solr returns one
|
||||
|
||||
The built-in `_text` data is better than nothing, but also has some problems:
|
||||
It's heavily processed, for example by stemming filters which butcher words.
|
||||
So misspelling "Govnernance" will suggest "govern" rather than "Governance".
|
||||
This can be fixed by aggregating spell checking data in a separate field which uses the less invasive `textSpell` type (no stemming):
|
||||
|
||||
<?php
|
||||
class MyIndex extends SolrIndex {
|
||||
|
||||
function init() {
|
||||
// ...
|
||||
$this->addCopyField('SiteTree_Title', 'spellcheckData');
|
||||
$this->addCopyField('DMSDocument_Title', 'spellcheckData');
|
||||
$this->addCopyField('SiteTree_Content', 'spellcheckData');
|
||||
$this->addCopyField('DMSDocument_Content', 'spellcheckData');
|
||||
}
|
||||
|
||||
// ...
|
||||
|
||||
function getFieldDefinitions() {
|
||||
$xml = parent::getFieldDefinitions();
|
||||
|
||||
$xml .= "\n\n\t\t<!-- Additional custom fields for spell checking -->";
|
||||
$xml .= "\n\t\t<field name='spellcheckData' type='textSpell' indexed='true' stored='false' multiValued='true' />";
|
||||
|
||||
return $xml;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Now you need to tell Solr to use our new field for gathering spelling data.
|
||||
In order to customize the spell checking configuration,
|
||||
create your own `solrconfig.xml` (see "File-based configuration").
|
||||
In there, change the following directive:
|
||||
|
||||
<!-- ... -->
|
||||
<searchComponent name="spellcheck" class="solr.SpellCheckComponent">
|
||||
<!-- ... -->
|
||||
<str name="field">spellcheckData</str>
|
||||
</searchComponent>
|
||||
|
||||
Don't forget to copy the new configuration via a call to the `Solr_Configure`
|
||||
task, and reindex your data before using the spell checker.
|
||||
|
||||
### Custom Types
|
||||
|
||||
Solr supports custom field type definitions which are written to its XML schema.
|
||||
Many standard ones are already included in the default schema.
|
||||
As the XML file is generated dynamically, we can add our own types
|
||||
by overloading the template responsible for it: `types.ss`.
|
||||
|
||||
In the following example, we read out type definitions
|
||||
from a new file `mysite/solr/templates/types.ss` instead:
|
||||
|
||||
<?php
|
||||
class MyIndex extends SolrIndex {
|
||||
function getTemplatesPath() {
|
||||
return Director::baseFolder() . '/mysite/solr/templates/';
|
||||
}
|
||||
}
|
||||
|
||||
## Debugging
|
||||
|
||||
### Using the web admin interface
|
||||
|
Loading…
x
Reference in New Issue
Block a user