This commit is contained in:
Ingo Schommer 2012-10-05 06:16:26 -07:00
commit 427d60ad9c
7 changed files with 351 additions and 46 deletions

View File

@ -11,6 +11,7 @@ An attempt to add stable support for Fulltext Search engines like Sphinx and Sol
## Requirements
* SilverStripe 3.0
* (optional) [silverstripe-phockito](https://github.com/hafriedlander/silverstripe-phockito) (for testing)
## Documentation

View File

@ -47,7 +47,7 @@ abstract class SearchIndex extends ViewableData {
* Examines the classes this index is built on to try and find defined fields in the class hierarchy for those classes.
* Looks for db and viewable-data fields, although it can't necessarily find the type for viewable-data fields.
*/
function fieldData($field, $forceType = null) {
function fieldData($field, $forceType = null, $extraOptions = array()) {
$fullfield = str_replace(".", "_", $field);
$sources = $this->getClasses();
@ -150,7 +150,8 @@ abstract class SearchIndex extends ViewableData {
'class' => $dataclass,
'lookup_chain' => $fieldoptions['lookup_chain'],
'type' => $forceType ? $forceType : $type,
'multi_valued' => isset($fieldoptions['multi_valued']) ? true : false
'multi_valued' => isset($fieldoptions['multi_valued']) ? true : false,
'extra_options' => $extraOptions
);
}
}
@ -200,9 +201,10 @@ abstract class SearchIndex extends ViewableData {
* Add a field that should be fulltext searchable
* @param String $field - The field to add
* @param String $forceType - The type to force this field as (required in some cases, when not detectable from metadata)
* @param Array $extraOptions - Extra field options; their meaning depends on the search implementation
*/
public function addFulltextField($field, $forceType = null) {
$this->fulltextFields = array_merge($this->fulltextFields, $this->fieldData($field, $forceType));
public function addFulltextField($field, $forceType = null, $extraOptions = array()) {
	// Resolve the field into its per-class specs, then append them to what is already registered
	$registered = $this->fulltextFields;
	$resolved = $this->fieldData($field, $forceType, $extraOptions);

	$this->fulltextFields = array_merge($registered, $resolved);
}
/**
 * @return Array The fulltext field specs currently registered on this index
 */
public function getFulltextFields() {
	return $this->fulltextFields;
}
@ -211,9 +213,10 @@ abstract class SearchIndex extends ViewableData {
* Add a field that should be filterable
* @param String $field - The field to add
* @param String $forceType - The type to force this field as (required in some cases, when not detectable from metadata)
* @param Array $extraOptions - Extra field options; their meaning depends on the search implementation
*/
public function addFilterField($field, $forceType = null) {
$this->filterFields = array_merge($this->filterFields, $this->fieldData($field, $forceType));
public function addFilterField($field, $forceType = null, $extraOptions = array()) {
	// Resolve the field into its per-class specs, then append them to what is already registered
	$registered = $this->filterFields;
	$resolved = $this->fieldData($field, $forceType, $extraOptions);

	$this->filterFields = array_merge($registered, $resolved);
}
/**
 * @return Array The filter field specs currently registered on this index
 */
public function getFilterFields() {
	return $this->filterFields;
}
@ -222,9 +225,10 @@ abstract class SearchIndex extends ViewableData {
* Add a field that should be sortable
* @param String $field - The field to add
* @param String $forceType - The type to force this field as (required in some cases, when not detectable from metadata)
* @param Array $extraOptions - Extra field options; their meaning depends on the search implementation
*/
public function addSortField($field, $forceType = null) {
$this->sortFields = array_merge($this->sortFields, $this->fieldData($field, $forceType));
public function addSortField($field, $forceType = null, $extraOptions = array()) {
	// Resolve the field into its per-class specs, then append them to what is already registered
	$registered = $this->sortFields;
	$resolved = $this->fieldData($field, $forceType, $extraOptions);

	$this->sortFields = array_merge($registered, $resolved);
}
/**
 * @return Array The sort field specs currently registered on this index
 */
public function getSortFields() {
	return $this->sortFields;
}

View File

@ -31,11 +31,23 @@ class SearchQuery extends ViewableData {
if (self::$present === null) self::$present = new stdClass();
}
function search($text, $fields = null, $boost = 1) {
/**
 * Adds an exact (non-fuzzy) search criterion to this query.
 *
 * @param String $text Text to search for
 * @param String|Array $fields Field name(s) to search in. Cast to an array;
 *                             an empty value is recorded as null.
 * @param Array $boost Map of field names to float values. The higher the value,
 *                     the more important the field gets for relevancy.
 */
function search($text, $fields = null, $boost = array()) {
	$criterion = array(
		'text' => $text,
		'fields' => $fields ? (array)$fields : null,
		'boost' => $boost,
		'fuzzy' => false
	);

	$this->search[] = $criterion;
}
function fuzzysearch($text, $fields = null, $boost = 1) {
/**
 * Adds a fuzzy search criterion to this query. Identical to search(),
 * except that the criterion is flagged as fuzzy.
 *
 * @param String $text Text to search for
 * @param String|Array $fields Field name(s) to search in. Cast to an array;
 *                             an empty value is recorded as null.
 * @param Array $boost Map of field names to float values. The higher the value,
 *                     the more important the field gets for relevancy.
 */
function fuzzysearch($text, $fields = null, $boost = array()) {
	$criterion = array(
		'text' => $text,
		'fields' => $fields ? (array)$fields : null,
		'boost' => $boost,
		'fuzzy' => true
	);

	$this->search[] = $criterion;
}

View File

@ -24,6 +24,10 @@ abstract class SolrIndex extends SearchIndex {
static $sortTypeMap = array();
protected $analyzerFields = array();
protected $copyFields = array();
function generateSchema() {
	// The schema template ships with the module, below the project's base folder
	$template = Director::baseFolder() . '/fulltextsearch/conf/templates/schema.ss';

	return $this->renderWith($template);
}
@ -36,6 +40,24 @@ abstract class SolrIndex extends SearchIndex {
return $this->renderWith(Director::baseFolder() . '/fulltextsearch/conf/templates/types.ss');
}
/**
 * Registers an index-time analyzer component for a specific field.
 * Can be used to remove HTML tags, apply stemming, etc.
 *
 * The field is resolved through fieldData(), so the analyzer is recorded for
 * every full field name the given field maps to. Registering the same $type
 * twice for a field replaces the earlier parameters.
 *
 * @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.WhitespaceTokenizerFactory
 *
 * @param String $field
 * @param String $type Name of the XML tag rendered inside <analyzer>, e.g. "filter" or "charFilter"
 * @param Array $params Parameters for the analyzer, usually at least a "class"
 */
function addAnalyzer($field, $type, $params) {
	$specs = $this->fieldData($field);
	if(!$specs) return;

	foreach($specs as $fullName => $spec) {
		if(!isset($this->analyzerFields[$fullName])) {
			$this->analyzerFields[$fullName] = array();
		}
		$this->analyzerFields[$fullName][$type] = $params;
	}
}
function getFieldDefinitions() {
$xml = array();
$stored = Director::isDev() ? "stored='true'" : "stored='false'";
@ -57,33 +79,89 @@ abstract class SolrIndex extends SearchIndex {
// Add the user-specified fields
foreach ($this->fulltextFields as $name => $field) {
$type = isset(self::$fulltextTypeMap[$field['type']]) ? self::$fulltextTypeMap[$field['type']] : self::$fulltextTypeMap['*'];
$xml[] = "<field name='{$name}' type='$type' indexed='true' $stored />";
$xml[] = $this->getFieldDefinition($name, $field, self::$fulltextTypeMap);
}
foreach ($this->filterFields as $name => $field) {
if ($field['fullfield'] == 'ID' || $field['fullfield'] == 'ClassName') continue;
$multiValued = (isset($field['multi_valued']) && $field['multi_valued']) ? "multiValued='true'" : '';
$type = isset(self::$filterTypeMap[$field['type']]) ? self::$filterTypeMap[$field['type']] : self::$filterTypeMap['*'];
$xml[] = "<field name='{$name}' type='{$type}' indexed='true' $stored $multiValued />";
$xml[] = $this->getFieldDefinition($name, $field);
}
foreach ($this->sortFields as $name => $field) {
if ($field['fullfield'] == 'ID' || $field['fullfield'] == 'ClassName') continue;
$multiValued = (isset($field['multi_valued']) && $field['multi_valued']) ? "multiValued='true'" : '';
$typeMap = array_merge(self::$filterTypeMap, self::$sortTypeMap);
$type = isset($typeMap[$field['type']]) ? $typeMap[$field['type']] : $typeMap['*'];
$xml[] = "<field name='{$name}' type='{$type}' indexed='true' $stored $multiValued />";
$xml[] = $this->getFieldDefinition($name, $field);
}
return implode("\n\t\t", $xml);
}
/**
 * Builds the <field> schema tag for a single field.
 *
 * The tag carries name, type, indexed, stored and multiValued attributes;
 * anything in the spec's 'extra_options' is merged on top and can override
 * them. Analyzers registered for the field via addAnalyzer() are rendered
 * inside a nested <analyzer> element.
 *
 * @param String $name Full field name, used as the "name" attribute
 * @param Array $spec Field spec; reads 'type', 'multi_valued' and 'extra_options'
 * @param Array $typeMap Map of field types to Solr types with a '*' fallback entry.
 *                       Defaults to the filter type map.
 * @return String XML
 */
protected function getFieldDefinition($name, $spec, $typeMap = null) {
	if(!$typeMap) $typeMap = self::$filterTypeMap;

	// Types missing from the map fall back to its '*' entry
	$type = isset($typeMap[$spec['type']]) ? $typeMap[$spec['type']] : $typeMap['*'];

	// One child tag per analyzer component registered for this field
	$analyzerTags = array();
	if(isset($this->analyzerFields[$name])) {
		foreach($this->analyzerFields[$name] as $analyzerType => $analyzerParams) {
			$analyzerTags[] = $this->toXmlTag($analyzerType, $analyzerParams);
		}
	}
	$analyzerXml = implode('', $analyzerTags);

	$defaults = array(
		'name' => $name,
		'type' => $type,
		'indexed' => 'true',
		'stored' => Director::isDev() ? 'true' : 'false',
		'multiValued' => !empty($spec['multi_valued']) ? "true" : ''
	);
	$overrides = isset($spec['extra_options']) ? $spec['extra_options'] : array();

	$content = $analyzerXml ? "<analyzer>$analyzerXml</analyzer>" : null;

	return $this->toXmlTag("field", array_merge($defaults, $overrides), $content);
}
/**
 * Convert definition to XML tag
 *
 * Attribute values are XML-escaped before being written, so values containing
 * quotes, ampersands or angle brackets still produce a well-formed tag.
 * Tag name, attribute names and $content are written as given.
 *
 * @param String $tag Tag name
 * @param Array $attrs Map of attribute names to values
 * @param String $content Inner content, expected to be valid XML already.
 *                        When empty, a self-closing tag is returned.
 * @return String XML tag
 */
protected function toXmlTag($tag, $attrs, $content = null) {
	$xml = "<$tag ";
	if($attrs) {
		$attrStrs = array();
		foreach($attrs as $attrName => $attrVal) {
			// Values sit inside single quotes, hence ENT_QUOTES in addition to & < >
			$escaped = htmlspecialchars((string)$attrVal, ENT_QUOTES, 'UTF-8');
			$attrStrs[] = "$attrName='$escaped'";
		}
		$xml .= implode(' ', $attrStrs);
	}
	$xml .= $content ? ">$content</$tag>" : '/>';
	return $xml;
}
/**
 * Registers an additional <copyField> definition for the schema.
 * Several destinations can be registered for the same source.
 *
 * @param String $source Composite field name (<class>_<fieldname>)
 * @param String $dest Name of the field the source is copied to
 * @param Array $extraOptions Further attributes for the copyField tag. Merged over
 *                            'source' and 'dest', so same-named keys override them.
 */
function addCopyField($source, $dest, $extraOptions = array()) {
	if(!isset($this->copyFields[$source])) {
		$this->copyFields[$source] = array();
	}

	$attrs = array_merge(array('source' => $source, 'dest' => $dest), $extraOptions);
	$this->copyFields[$source][] = $attrs;
}
function getCopyFieldDefinitions() {
$xml = array();
@ -91,6 +169,12 @@ abstract class SolrIndex extends SearchIndex {
$xml[] = "<copyField source='{$name}' dest='_text' />";
}
foreach ($this->copyFields as $source => $fields) {
foreach($fields as $fieldAttrs) {
$xml[] = $this->toXmlTag('copyField', $fieldAttrs);
}
}
return implode("\n\t", $xml);
}
@ -99,11 +183,16 @@ abstract class SolrIndex extends SearchIndex {
if ($class != $field['origin'] && !is_subclass_of($class, $field['origin'])) return;
$value = $this->_getFieldValue($object, $field);
$type = isset(self::$filterTypeMap[$field['type']]) ? self::$filterTypeMap[$field['type']] : self::$filterTypeMap['*'];
if (is_array($value)) foreach($value as $sub) {
/* Solr requires dates in the form 1995-12-31T23:59:59Z */
if ($type == 'tdate') $sub = gmdate('Y-m-d\TH:i:s\Z', strtotime($sub));
if ($type == 'tdate') {
if(!$sub) continue;
$sub = gmdate('Y-m-d\TH:i:s\Z', strtotime($sub));
}
/* Solr requires numbers to be valid if presented, not just empty */
if (($type == 'tint' || $type == 'tfloat' || $type == 'tdouble') && !is_numeric($sub)) continue;
@ -112,7 +201,11 @@ abstract class SolrIndex extends SearchIndex {
else {
/* Solr requires dates in the form 1995-12-31T23:59:59Z */
if ($type == 'tdate') $value = gmdate('Y-m-d\TH:i:s\Z', strtotime($value));
if ($type == 'tdate') {
if(!$value) return;
$value = gmdate('Y-m-d\TH:i:s\Z', strtotime($value));
}
/* Solr requires numbers to be valid if presented, not just empty */
if (($type == 'tint' || $type == 'tfloat' || $type == 'tdouble') && !is_numeric($value)) return;
@ -139,17 +232,22 @@ abstract class SolrIndex extends SearchIndex {
if ($field['base'] == $base) $this->_addField($doc, $object, $field);
}
Solr::service(get_class($this))->addDocument($doc);
$this->getService()->addDocument($doc);
return $doc;
}
function add($object) {
$class = get_class($object);
$docs = array();
foreach ($this->getClasses() as $searchclass => $options) {
if ($searchclass == $class || ($options['include_children'] && is_subclass_of($class, $searchclass))) {
$this->_addAs($object, $searchclass, $options);
$docs[] = $this->_addAs($object, $searchclass, $options);
}
}
return $docs;
}
function canAdd($class) {
@ -162,15 +260,22 @@ abstract class SolrIndex extends SearchIndex {
function delete($base, $id, $state) {
$documentID = $this->getDocumentIDForState($base, $id, $state);
Solr::service(get_class($this))->deleteById($documentID);
$this->getService()->deleteById($documentID);
}
function commit() {
Solr::service(get_class($this))->commit(false, false, false);
$this->getService()->commit(false, false, false);
}
public function search($query, $offset = -1, $limit = -1) {
$service = Solr::service(get_class($this));
/**
* @param SearchQuery $query
* @param integer $offset
* @param integer $limit
* @return ArrayData Map with the following keys:
* - 'Matches': ArrayList of the matched object instances
*/
public function search(SearchQuery $query, $offset = -1, $limit = -1) {
$service = $this->getService();
SearchVariant::with(count($query->classes) == 1 ? $query->classes[0]['class'] : null)->call('alterQuery', $query, $this);
@ -186,12 +291,15 @@ abstract class SolrIndex extends SearchIndex {
$fuzzy = $search['fuzzy'] ? '~' : '';
foreach ($parts[0] as $part) {
if ($search['fields']) {
$fields = (isset($search['fields'])) ? $search['fields'] : array();
if(isset($search['boost'])) $fields = array_merge($fields, array_keys($search['boost']));
if ($fields) {
$searchq = array();
foreach ($search['fields'] as $field) {
$searchq[] = "{$field}:".$part.$fuzzy;
foreach ($fields as $field) {
$boost = (isset($search['boost'][$field])) ? '^' . $search['boost'][$field] : '';
$searchq[] = "{$field}:".$part.$fuzzy.$boost;
}
$q[] = '+('.implode(' ', $searchq).')';
$q[] = '+('.implode(' OR ', $searchq).')';
}
else {
$q[] = '+'.$part;
@ -259,27 +367,39 @@ abstract class SolrIndex extends SearchIndex {
$fq[] = ($missing ? "+{$field}:[* TO *] " : '') . '-('.implode(' ', $excludeq).')';
}
if ($q) header('X-Query: '.implode(' ', $q));
if ($fq) header('X-Filters: "'.implode('", "', $fq).'"');
if(!headers_sent()) {
if ($q) header('X-Query: '.implode(' ', $q));
if ($fq) header('X-Filters: "'.implode('", "', $fq).'"');
}
if ($offset == -1) $offset = $query->start;
if ($limit == -1) $limit = $query->limit;
if ($limit == -1) $limit = SearchQuery::$default_page_size;
$res = $service->search($q ? implode(' ', $q) : '*:*', $offset, $limit, array('fq' => implode(' ', $fq)), Apache_Solr_Service::METHOD_POST);
$res = $service->search(
$q ? implode(' ', $q) : '*:*',
$offset,
$limit,
array('fq' => implode(' ', $fq)),
Apache_Solr_Service::METHOD_POST
);
$results = new ArrayList();
foreach ($res->response->docs as $doc) {
$result = DataObject::get_by_id($doc->ClassName, $doc->ID);
if($result) $results->push($result);
if($res->getHttpStatus() >= 200 && $res->getHttpStatus() < 300) {
foreach ($res->response->docs as $doc) {
$result = DataObject::get_by_id($doc->ClassName, $doc->ID);
if($result) $results->push($result);
}
$numFound = $res->response->numFound;
} else {
$numFound = 0;
}
$ret = array();
$ret['Matches'] = new PaginatedList($results);
$ret['Matches']->setLimitItems(false);
// Tell PaginatedList how many results there are
$ret['Matches']->setTotalItems($res->response->numFound);
$ret['Matches']->setTotalItems($numFound);
// Results for current page start at $offset
$ret['Matches']->setPageStart($offset);
// Results per page
@ -287,4 +407,19 @@ abstract class SolrIndex extends SearchIndex {
return new ArrayData($ret);
}
protected $service;
/**
 * Returns the service this index talks to. Unless one has been set through
 * setService(), it is fetched from Solr::service() for this index class on
 * first use and kept for later calls.
 *
 * @return SolrService
 */
public function getService() {
	if($this->service) return $this->service;

	$this->service = Solr::service(get_class($this));
	return $this->service;
}
/**
 * Sets the service this index talks to, e.g. to inject a test double.
 *
 * @param SolrService $service
 * @return SolrIndex This index, to allow method chaining
 */
public function setService(SolrService $service) {
	$this->service = $service;

	return $this;
}
}

View File

@ -76,6 +76,27 @@ You can also copy the `thirdparty/`solr directory somewhere else,
just set the path value in `mysite/_config.php` to point to the new location.
And of course run `java -jar start.jar` from the new directory.
### Adding Analyzers, Tokenizers and Token Filters
When a document is indexed, its individual fields pass through analyzers, tokenizers and token filters that can transform and normalize the data in those fields, for example by removing blank spaces, stripping HTML code, stemming, or replacing a particular character with another
(see [Solr Wiki](http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters)).
Example: Replace synonyms on indexing (e.g. "i-pad" with "iPad")
<?php
class MyIndex extends SolrIndex {
function init() {
$this->addClass('Page');
$this->addField('Content');
$this->addAnalyzer('Content', 'filter', array(
'class' => 'solr.SynonymFilterFactory',
'synonyms' => 'syn.txt',
'ignoreCase' => 'true',
'expand' => 'false'
));
}
}
// Generates the following XML schema definition:
// <field name="Page_Content" ...>
//   <analyzer>
//     <filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/>
//   </analyzer>
// </field>
## Debugging
### Using the web admin interface
@ -89,4 +110,18 @@ to Solr when saving/publishing in SilverStripe,
which is useful when debugging front-end queries,
see `thirdparty/fulltextsearch/server/silverstripe-solr-test.xml`.
java -Durl=http://localhost:8983/solr/MyIndex/update/ -Dtype=text/xml -jar post.jar silverstripe-solr-test.xml
java -Durl=http://localhost:8983/solr/MyIndex/update/ -Dtype=text/xml -jar post.jar silverstripe-solr-test.xml
## FAQ
### How do I use date ranges where dates might not be defined?
The Solr index updater only includes dates with values,
so the field might not exist in all your index entries.
A simple bounded range query (`<field>:[* TO <date>]`) will fail in this case.
In order to query the field, reverse the search conditions and exclude the ranges you don't want:
// Wrong: Filter will ignore all empty field values
$myQuery->filter(<field>, new SearchQuery_Range('*', <date>));
// Better: Exclude the opposite range
$myQuery->exclude(<field>, new SearchQuery_Range(<date>, '*'));

View File

@ -3,7 +3,8 @@
class SearchUpdaterTest_Container extends DataObject {
static $db = array(
'Field1' => 'Varchar',
'Field2' => 'Varchar'
'Field2' => 'Varchar',
'MyDate' => 'Date',
);
static $has_one = array(

117
tests/SolrIndexTest.php Normal file
View File

@ -0,0 +1,117 @@
<?php
/**
 * Tests for SolrIndex: query boosting, skipping of empty values on indexing,
 * and generation of field, analyzer and copyField schema definitions.
 *
 * The Solr service is replaced with a Phockito spy (see getServiceSpy()),
 * so no running Solr server is contacted.
 */
class SolrIndexTest extends SapphireTest {

	function setUpOnce() {
		parent::setUpOnce();

		// The tests below use the Hamcrest matcher anything()
		Phockito::include_hamcrest();
	}

	/**
	 * Boosted fields are searched even without an explicit field list,
	 * and their boost factor is appended to each term.
	 */
	function testBoost() {
		$serviceSpy = $this->getServiceSpy();

		$index = new SolrIndexTest_FakeIndex();
		$index->setService($serviceSpy);

		$query = new SearchQuery();
		$query->search(
			'term',
			null,
			array('Field1' => 1.5, 'HasOneObject_Field1' => 3)
		);
		$index->search($query);

		Phockito::verify($serviceSpy)->search(
			'+(Field1:term^1.5 OR HasOneObject_Field1:term^3)',
			anything(), anything(), anything(), anything()
		);
	}

	/**
	 * NULL values must not end up in the indexed document.
	 */
	function testIndexExcludesNullValues() {
		$serviceSpy = $this->getServiceSpy();

		$index = new SolrIndexTest_FakeIndex();
		$index->setService($serviceSpy);

		$obj = new SearchUpdaterTest_Container();
		$obj->Field1 = 'Field1 val';
		$obj->Field2 = null;
		$obj->MyDate = null;
		$docs = $index->add($obj);

		$value = $docs[0]->getField('SearchUpdaterTest_Container_Field1');
		$this->assertEquals('Field1 val', $value['value'], 'Writes non-NULL string fields');

		$value = $docs[0]->getField('SearchUpdaterTest_Container_Field2');
		$this->assertFalse($value, 'Ignores string fields if they are NULL');

		$value = $docs[0]->getField('SearchUpdaterTest_Container_MyDate');
		$this->assertFalse($value, 'Ignores date fields if they are NULL');

		$obj->MyDate = '2010-12-30';
		$docs = $index->add($obj);

		$value = $docs[0]->getField('SearchUpdaterTest_Container_MyDate');
		$this->assertEquals('2010-12-30T00:00:00Z', $value['value'], 'Writes non-NULL dates');
	}

	/**
	 * Extra options passed when adding a field override the default attributes.
	 */
	function testAddFieldExtraOptions() {
		$origMode = Director::get_environment_type();
		Director::set_environment_type('live'); // dev mode would force stored=true for everything

		$index = new SolrIndexTest_FakeIndex();

		$defs = simplexml_load_string('<fields>' . $index->getFieldDefinitions() . '</fields>');
		$defField1 = $defs->xpath('field[@name="SearchUpdaterTest_Container_Field1"]');
		$this->assertEquals('false', (string)$defField1[0]['stored']);

		$index->addFilterField('Field1', null, array('stored' => 'true'));

		$defs = simplexml_load_string('<fields>' . $index->getFieldDefinitions() . '</fields>');
		$defField1 = $defs->xpath('field[@name="SearchUpdaterTest_Container_Field1"]');
		$this->assertEquals('true', (string)$defField1[0]['stored']);

		Director::set_environment_type($origMode);
	}

	/**
	 * Analyzers only show up in a field definition once they have been added.
	 */
	function testAddAnalyzer() {
		$index = new SolrIndexTest_FakeIndex();

		$defs = simplexml_load_string('<fields>' . $index->getFieldDefinitions() . '</fields>');
		$defField1 = $defs->xpath('field[@name="SearchUpdaterTest_Container_Field1"]');
		$analyzers = $defField1[0]->analyzer;
		$this->assertFalse((bool)$analyzers);

		$index->addAnalyzer('Field1', 'charFilter', array('class' => 'solr.HTMLStripCharFilterFactory'));

		$defs = simplexml_load_string('<fields>' . $index->getFieldDefinitions() . '</fields>');
		$defField1 = $defs->xpath('field[@name="SearchUpdaterTest_Container_Field1"]');
		$analyzers = $defField1[0]->analyzer;
		$this->assertTrue((bool)$analyzers);
		$this->assertEquals(
			'solr.HTMLStripCharFilterFactory',
			(string)$analyzers[0]->charFilter[0]['class']
		);
	}

	/**
	 * A custom copy field is written out after the default definitions.
	 */
	function testAddCopyField() {
		$index = new SolrIndexTest_FakeIndex();
		$index->addCopyField('sourceField', 'destField');

		$defs = simplexml_load_string('<fields>' . $index->getCopyFieldDefinitions() . '</fields>');

		// array_pop() needs a real array, not the SimpleXMLElement itself,
		// so collect the <copyField> nodes first
		$copyFields = $defs->xpath('copyField');
		$this->assertTrue(count($copyFields) > 0, 'At least one copyField is defined');
		$lastDef = array_pop($copyFields);

		$this->assertEquals('sourceField', (string)$lastDef['source']);
		$this->assertEquals('destField', (string)$lastDef['dest']);
	}

	/**
	 * Creates a spy on SolrService whose _sendRawPost() is stubbed to return
	 * an empty fake response instead of performing the call.
	 *
	 * @return SolrService Phockito spy
	 */
	protected function getServiceSpy() {
		$serviceSpy = Phockito::spy('SolrService');
		$fakeResponse = new Apache_Solr_Response(new Apache_Solr_HttpTransport_Response(null, null, null));

		Phockito::when($serviceSpy)
			->_sendRawPost(anything(), anything(), anything(), anything())
			->return($fakeResponse);

		return $serviceSpy;
	}
}
/**
 * Index fixture for SolrIndexTest, built on SearchUpdaterTest_Container
 * with a handful of filter fields.
 */
class SolrIndexTest_FakeIndex extends SolrIndex {

	function init() {
		$this->addClass('SearchUpdaterTest_Container');

		// Filter field => forced type (null leaves the type to be detected)
		$filterFields = array(
			'Field1' => null,
			'MyDate' => 'Date',
			'HasOneObject.Field1' => null,
			'HasManyObjects.Field1' => null,
		);
		foreach($filterFields as $fieldName => $forcedType) {
			$this->addFilterField($fieldName, $forcedType);
		}
	}
}