Import first version of fulltextsearch module

This commit is contained in:
Hamish Friedlander 2011-05-02 16:33:05 +12:00
parent 85dfbaaf32
commit 17be5a3e63
82 changed files with 17656 additions and 0 deletions

View File

@ -0,0 +1,48 @@
# WARNING: Heavily experimental API. Likely to change without notice.
# FullTextSearch module
An attempt to add stable support for Fulltext Search engines like Sphinx and Solr to SilverStripe CMS
## Maintainer Contact
* Hamish Friedlander <hamish (at) silverstripe (dot) com>
## Requirements
* SilverStripe 2.4. Untested in 3, but probably won't work.
## Documentation
See docs/README.md
## TODO
* Get rid of includeSubclasses - isn't actually used in practice, makes the codebase uglier, and ClassHierarchy can be
used at query time for most of the same use cases
* Fix field referencing in queries. Should be able to do `$query->search('Text', 'Content')`, not
`$query->search('Text', 'SiteTree_Content')` like you have to do now
- Make sure that when field exists in multiple classes, searching against bare fields searches all of them
- Allow searching against specific instances too
* Make fields restrictable by class in an index - 'SiteTree#Content' to limit fields to a particular class,
maybe 'Content->Summary' to allow calling a specific method on the field object to get the text
* Allow following user relationships (Children.Foo for example)
* Be clearer about what happens with relationships to stateful objects (e.g. Parent.Foo where Parent is versioned)
* Improvements to SearchUpdater
- Make it work properly when in-between objects (the A in A.B.Foo) update
- Allow user logic to trigger a reindex of documents when a field is user generated
* Add sphinx connector
* Add generic APIs for spell correction, file text extraction and snippet generation
* Better docs

3
_config.php Normal file
View File

@ -0,0 +1,3 @@
<?php
// Replace the configured database connector type with a generated subclass that passes every
// manipulation on to SearchUpdater::handle_manipulation
SearchUpdater::bind_manipulation_capture();

View File

@ -0,0 +1,88 @@
<?php
/**
* Base class to manage active search indexes.
*/
class FullTextSearch {

	/** Cache of all instantiable indexes as class name => instance, or null when not built yet */
	static protected $all_indexes = null;

	/** Per-filter-class cache of the subset of $all_indexes that are subclasses of that class */
	static protected $indexes_by_subclass = array();

	/**
	 * Get all the instantiable search indexes (so all the user created indexes, but not the connector or library
	 * level abstract indexes). Can optionally be filtered to only return indexes that are subclasses of some class
	 *
	 * @static
	 * @param String $class - Class name to filter indexes by, so that all returned indexes are subclasses of it
	 * @param bool $rebuild - If true, don't use cached values
	 * @return array - Index class name => index instance
	 */
	static function get_indexes($class = null, $rebuild = false) {
		if ($rebuild) {
			self::$all_indexes = null;
			self::$indexes_by_subclass = array();
		}

		// Filtered request: answer from the per-class cache, building the entry on first use
		if ($class) {
			if (isset(self::$indexes_by_subclass[$class])) return self::$indexes_by_subclass[$class];

			$matching = array();
			foreach (self::get_indexes() as $name => $index) {
				if (ClassInfo::is_subclass_of($name, $class)) $matching[$name] = $index;
			}

			self::$indexes_by_subclass[$class] = $matching;
			return $matching;
		}

		// Unfiltered request: discover every concrete SearchIndex subclass once
		if (self::$all_indexes === null) {
			$instantiable = array();
			foreach (ClassInfo::subclassesFor('SearchIndex') as $candidate) {
				$reflector = new ReflectionClass($candidate);
				if ($reflector->isInstantiable()) $instantiable[$candidate] = singleton($candidate);
			}
			self::$all_indexes = $instantiable;
		}

		return self::$all_indexes;
	}

	/**
	 * Sometimes, like when in tests, you want to restrain the actual indexes to a subset
	 *
	 * Call with one argument - an array of class names, index instances or classname => indexinstance pairs
	 * (can be mixed). Alternatively call with multiple arguments, each of which is a class name or index instance
	 *
	 * From then on, fulltext search system will only see those indexes passed in this most recent call.
	 *
	 * Passing in no arguments resets back to automatic index list
	 */
	static function force_index_list() {
		$args = func_get_args();

		// No arguments = back to automatic
		if (!$args) {
			self::get_indexes(null, true);
			return;
		}

		// A single array argument carries the whole list
		$list = is_array($args[0]) ? $args[0] : $args;

		// Start again from an empty set
		self::$all_indexes = array();
		self::$indexes_by_subclass = array();

		foreach ($list as $key => $entry) {
			// A class name: instantiate it, and use the name as the key
			if (is_string($entry)) {
				$key = $entry;
				$entry = singleton($key);
			}
			// An instance without an explicit name: key it by its class
			if (is_numeric($key)) $key = get_class($entry);

			self::$all_indexes[$key] = $entry;
		}
	}
}

548
code/search/SearchIndex.php Normal file
View File

@ -0,0 +1,548 @@
<?php
/**
* SearchIndex is the base index class. Each connector will provide a subclass of this that
* provides search engine specific behavior.
*
* This class is responsible for:
*
* - Taking index calls adding classes and fields, and resolving those to value sources and types
*
* - Determining which records in this index need updating when a DataObject is changed
*
* - Providing utilities to the connector indexes
*
* The connector indexes are responsible for
*
* - Mapping types to index configuration
*
* - Adding and removing items to index
*
* - Parsing and converting SearchQueries into a form the engine will understand, and executing those queries
*
* The user indexes are responsible for
*
* - Specifying which classes and fields this index contains
*
* - Specifying update rules that are not extractable from metadata (because the values come from functions for instance)
*
*/
abstract class SearchIndex extends ViewableData {
function __construct() {
	// The concrete index declares its classes and fields first
	$this->init();

	// Then 'alterDefinition' is invoked, via SearchVariant, once per class in the index
	$classes = $this->getClasses();
	foreach ($classes as $class => $options) {
		$variants = SearchVariant::with($class, $options['include_children']);
		$variants->call('alterDefinition', $class, $this);
	}

	// Dependencies are computed last, once the definition is final
	$this->buildDependancyList();
}
function __toString() {
	// Human readable label: the fixed prefix followed by the concrete index class name
	$name = get_class($this);
	return "Search Index {$name}";
}
/**
* Examines the classes this index is built on to try and find defined fields in the class hierarchy for those classes.
* Looks for db and viewable-data fields, although can't necessarily find type for viewable-data fields.
*
* A dotted field ("Relation.Other.Field") is resolved one relation at a time through has_one, has_many and
* many_many, recording each hop in the definition's 'lookup_chain'.
*
* @param String $field - The field name, optionally prefixed with a dot separated chain of relation names
* @param String $forceType - If set, used as the 'type' of every returned definition instead of the detected type
* @return array - Field definitions keyed by "{origin}_{fullfield}", each with the keys name, field, fullfield, base,
* origin, class, lookup_chain, type and multi_valued. Empty when a relation in the chain can't be resolved
*/
function fieldData($field, $forceType = null) {
// The full dotted name with dots replaced, used when building the definition name
$fullfield = str_replace(".", "_", $field);
// Every class in the index starts out as a source, with itself as base and an empty lookup chain
$sources = $this->getClasses();
foreach ($sources as $source => $options) {
$sources[$source]['base'] = $source;
$sources[$source]['lookup_chain'] = array();
}
$found = array();
if (strpos($field, '.') !== false) {
$lookups = explode(".", $field);
// The last segment is the actual field, everything before it is a relation to follow
$field = array_pop($lookups);
foreach ($lookups as $lookup) {
$next = array();
foreach ($sources as $source => $options) {
$class = null;
foreach (SearchIntrospection::hierarchy($source, $options['include_children']) as $dataclass) {
$singleton = singleton($dataclass);
if ($hasOne = $singleton->has_one($lookup)) {
$class = $hasOne;
$options['lookup_chain'][] = array(
'call' => 'method', 'method' => $lookup,
'through' => 'has_one', 'class' => $dataclass, 'otherclass' => $class, 'foreignkey' => "{$lookup}ID"
);
}
else if ($hasMany = $singleton->has_many($lookup)) {
$class = $hasMany;
$options['multi_valued'] = true;
$options['lookup_chain'][] = array(
'call' => 'method', 'method' => $lookup,
'through' => 'has_many', 'class' => $dataclass, 'otherclass' => $class, 'foreignkey' => $singleton->getRemoteJoinField($lookup, 'has_many')
);
}
else if ($manyMany = $singleton->many_many($lookup)) {
$class = $manyMany[0];
$options['multi_valued'] = true;
$options['lookup_chain'][] = array(
'call' => 'method', 'method' => $lookup,
'through' => 'many_many', 'class' => $dataclass, 'otherclass' => $class, 'details' => $manyMany
);
}
if ($class) {
// The origin is the class the first relation was found on; later hops keep it
if (!isset($options['origin'])) $options['origin'] = $dataclass;
$next[$class] = $options;
// First class in the hierarchy that has the relation wins - move on to the next source
continue 2;
}
}
}
if (!$next) return $next; // Early out to avoid excessive empty looping
// The related classes become the sources for the next relation (or for the field lookup below)
$sources = $next;
}
}
foreach ($sources as $class => $options) {
$dataclasses = SearchIntrospection::hierarchy($class, $options['include_children']);
while (count($dataclasses)) {
$dataclass = array_shift($dataclasses);
// $options is copied per data class so a chain step added here doesn't leak into the next class
$type = null; $fieldoptions = $options;
$fields = DataObject::database_fields($dataclass);
if (isset($fields[$field])) {
// A database field: the type is known and the value is read as a property
$type = $fields[$field];
$fieldoptions['lookup_chain'][] = array('call' => 'property', 'property' => $field);
}
else {
$singleton = singleton($dataclass);
if ($singleton->hasMethod("get$field") || $singleton->hasField($field)) {
// Not a database field: take the casting class, falling back to String when there isn't one
$type = $singleton->castingClass($field);
if (!$type) $type = 'String';
if ($singleton->hasMethod("get$field")) $fieldoptions['lookup_chain'][] = array('call' => 'method', 'method' => "get$field");
else $fieldoptions['lookup_chain'][] = array('call' => 'property', 'property' => $field);
}
}
if ($type) {
// Don't search through child classes of a class we matched on. TODO: Should we?
$dataclasses = array_diff($dataclasses, array_values(ClassInfo::subclassesFor($dataclass)));
// Trim arguments off the type string
if (preg_match('/^(\w+)\(/', $type, $match)) $type = $match[1];
// Get the origin
$origin = isset($fieldoptions['origin']) ? $fieldoptions['origin'] : $dataclass;
$found["{$origin}_{$fullfield}"] = array(
'name' => "{$origin}_{$fullfield}",
'field' => $field,
'fullfield' => $fullfield,
'base' => $fieldoptions['base'],
'origin' => $origin,
'class' => $dataclass,
'lookup_chain' => $fieldoptions['lookup_chain'],
'type' => $forceType ? $forceType : $type,
'multi_valued' => isset($fieldoptions['multi_valued']) ? true : false
);
}
}
}
return $found;
}
/**
* Public, but should only be altered by variants
*
* NOTE(review): of the four properties below only $filterFields is actually declared public - confirm which
* of them variants are expected to alter.
*/
/** Classes in this index, as class name => options array. Written by addClass, read through getClasses */
protected $classes = array();
/** Field definitions (as returned by fieldData) collected by addFulltextField */
protected $fulltextFields = array();
/** Field definitions (as returned by fieldData) collected by addFilterField */
public $filterFields = array();
/** Field definitions (as returned by fieldData) collected by addSortField */
protected $sortFields = array();
/**
 * Register a DataObject subclass whose instances should be included in this index.
 *
 * Must be called before any of addFulltextField, addFilterField, addSortField or addAllFulltextFields has
 * added a field to this index instance.
 *
 * @throws Exception - When fields have already been added to the index
 * @param String $class - The class to include
 * @param array $options - TODO: Remove. Merged over the default of array('include_children' => true)
 */
public function addClass($class, $options = array()) {
	$fieldsAlreadyAdded = $this->fulltextFields || $this->filterFields || $this->sortFields;
	if ($fieldsAlreadyAdded) {
		throw new Exception('Can\'t add class to Index after fields have already been added');
	}

	$defaults = array('include_children' => true);
	$this->classes[$class] = array_merge($defaults, $options);
}

/**
 * Get the classes added by addClass
 *
 * @return array - Class name => options array
 */
public function getClasses() {
	return $this->classes;
}
/**
 * Add a field that should be fulltext searchable
 *
 * @param String $field - The field to add
 * @param String $forceType - The type to force this field as (required in some cases, when not detectable from metadata)
 */
public function addFulltextField($field, $forceType = null) {
	$definitions = $this->fieldData($field, $forceType);
	$this->fulltextFields = array_merge($this->fulltextFields, $definitions);
}

/**
 * @return array - The field definitions added through addFulltextField
 */
public function getFulltextFields() {
	return $this->fulltextFields;
}

/**
 * Add a field that should be filterable
 *
 * @param String $field - The field to add
 * @param String $forceType - The type to force this field as (required in some cases, when not detectable from metadata)
 */
public function addFilterField($field, $forceType = null) {
	$definitions = $this->fieldData($field, $forceType);
	$this->filterFields = array_merge($this->filterFields, $definitions);
}

/**
 * @return array - The field definitions added through addFilterField
 */
public function getFilterFields() {
	return $this->filterFields;
}

/**
 * Add a field that should be sortable
 *
 * @param String $field - The field to add
 * @param String $forceType - The type to force this field as (required in some cases, when not detectable from metadata)
 */
public function addSortField($field, $forceType = null) {
	$definitions = $this->fieldData($field, $forceType);
	$this->sortFields = array_merge($this->sortFields, $definitions);
}

/**
 * @return array - The field definitions added through addSortField
 */
public function getSortFields() {
	return $this->sortFields;
}
/**
 * Add all database-backed text fields as fulltext searchable fields.
 *
 * For every class included in the index, walks the data classes of its hierarchy and adds each database field
 * whose type is a subclass of StringField (Varchar, Text, HTMLText, etc) as a fulltext field.
 *
 * @param bool $includeSubclasses - Passed on to SearchIntrospection::hierarchy to include subclasses in the walk
 */
public function addAllFulltextFields($includeSubclasses = true) {
	foreach (array_keys($this->getClasses()) as $class) {
		$dataclasses = SearchIntrospection::hierarchy($class, $includeSubclasses, true);

		foreach ($dataclasses as $dataclass) {
			foreach (DataObject::database_fields($dataclass) as $field => $spec) {
				// Trim arguments off the type string
				$type = preg_match('/^(\w+)\(/', $spec, $match) ? $match[1] : $spec;

				if (!ClassInfo::is_subclass_of($type, 'StringField')) continue;
				$this->addFulltextField($field);
			}
		}
	}
}
/**
 * Returns an iterator that will let you iterate through all added fields, regardless of whether they
 * were added as fulltext, filter or sort fields.
 *
 * @return MultipleArrayIterator
 */
public function getFieldsIterator() {
	$everyField = new MultipleArrayIterator($this->fulltextFields, $this->filterFields, $this->sortFields);
	return $everyField;
}
/** Class names this index depends on: the indexed classes plus the classes its fields are read from */
public $dependancyList = array();

/**
 * Rebuild $dependancyList from the classes in the index and the 'class' of every added field definition.
 */
function buildDependancyList() {
	// Start from the classes that are directly in the index
	$this->dependancyList = array_keys($this->getClasses());

	// Then fold in the class each field lives on, keeping only the parent when parent & child are both present
	foreach ($this->getFieldsIterator() as $field) {
		if (isset($field['class'])) {
			SearchIntrospection::add_unique_by_ancestor($this->dependancyList, $field['class']);
		}
	}
}
/** Cache for getDerivedFields; null until first built */
public $derivedFields = null;

/**
 * Returns an array where each member is all the fields and the classes that are at the end of some
 * specific lookup chain from one of the base classes
 *
 * Only fields with a lookup chain of two or more steps are included. Each member has the keys:
 *  - 'base': the base class the chain starts from
 *  - 'fields': "Class:Field" => "Class:Field" for every field reached through the chain
 *  - 'classes': the classes those fields live on (only the parent is kept when parent & child both appear)
 *  - 'chain': the lookup chain reversed, with the final field access step removed
 *
 * @return array - Derivations keyed by a hash of base class and lookup chain
 */
function getDerivedFields() {
	if ($this->derivedFields === null) {
		$this->derivedFields = array();

		foreach ($this->getFieldsIterator() as $name => $field) {
			if (count($field['lookup_chain']) < 2) continue;

			$key = sha1($field['base'].serialize($field['lookup_chain']));
			$fieldname = "{$field['class']}:{$field['field']}";

			if (isset($this->derivedFields[$key])) {
				$this->derivedFields[$key]['fields'][$fieldname] = $fieldname;
				// Merge into this derivation's own class list. Indexing by $key matters: without it a stray
				// top-level 'classes' entry is created and the derivation itself is never updated
				SearchIntrospection::add_unique_by_ancestor($this->derivedFields[$key]['classes'], $field['class']);
			}
			else {
				// Reverse the chain so it can be walked from the changed object back towards the base class,
				// and drop what is then the first step: the access of the field itself
				$chain = array_reverse($field['lookup_chain']);
				array_shift($chain);

				$this->derivedFields[$key] = array(
					'base' => $field['base'],
					'fields' => array($fieldname => $fieldname),
					'classes' => array($field['class']),
					'chain' => $chain
				);
			}
		}
	}

	return $this->derivedFields;
}
/**
 * Get the "document ID" (a database & variant unique id) given some "Base" class, DataObject ID and state array
 *
 * The result is the ID, the base class and the JSON encoded state (sorted by key), joined with dashes.
 *
 * @param String $base - The base class of the object
 * @param Integer $id - The ID of the object
 * @param Array $state - The variant state of the object
 * @return string - The document ID as a string
 */
function getDocumentIDForState($base, $id, $state) {
	// Sort by key so the same state always encodes to the same string
	ksort($state);
	$encodedState = json_encode($state);

	return implode('-', array($id, $base, $encodedState));
}
/**
 * Get the "document ID" (a database & variant unique id) given some "Base" class and DataObject
 *
 * Uses the state reported by SearchVariant::current_state for the base class.
 *
 * @param DataObject $object - The object
 * @param String $base - The base class of the object
 * @param Boolean $includesubs - TODO: Probably going away
 * @return string - The document ID as a string
 */
function getDocumentID($object, $base, $includesubs) {
	$state = SearchVariant::current_state($base, $includesubs);
	return $this->getDocumentIDForState($base, $object->ID, $state);
}
/**
 * Given an object and a field definition (as returned by fieldData) get the current value of that field on that object
 *
 * Walks the definition's 'lookup_chain' one step at a time. When the current value is an array or DataObjectSet
 * the step is applied to every item and the results are merged into a flat array.
 *
 * Any PHP error or exception raised while walking the chain results in null being returned.
 *
 * @param DataObject $object - The object to get the value from
 * @param Array $field - The field definition to use
 * @return Mixed - The value of the field, or null if we couldn't look it up for some reason
 */
protected function _getFieldValue($object, $field) {
	// Convert PHP errors into exceptions for the duration of the lookup, so they end up in the catch below.
	// NOTE(review): create_function() no longer exists in PHP 8 - left as is to keep the current PHP requirements
	set_error_handler(create_function('$no, $str', 'throw new Exception("HTML Parse Error: ".$str);'), E_ALL);

	try {
		foreach ($field['lookup_chain'] as $step) {
			// Just fail if we've fallen off the end of the chain. This must break rather than return, otherwise
			// the error handler installed above is never restored and stays active for the rest of the request
			if (!$object) {
				$object = null;
				break;
			}

			// If we're looking up this step on an array or DataObjectSet, do the step on every item, merge result
			if (is_array($object) || $object instanceof DataObjectSet) {
				$next = array();

				foreach ($object as $item) {
					if ($step['call'] == 'method') {
						$method = $step['method'];
						$item = $item->$method();
					}
					else {
						$property = $step['property'];
						$item = $item->$property;
					}

					if ($item instanceof DataObjectSet) $next = array_merge($next, $item->toArray());
					elseif (is_array($item)) $next = array_merge($next, $item);
					else $next[] = $item;
				}

				$object = $next;
			}
			// Otherwise, just call
			else {
				if ($step['call'] == 'method') {
					$method = $step['method'];
					$object = $object->$method();
				}
				elseif ($step['call'] == 'variant') {
					// The step names a variant and a method on it; the variant computes the value from the object
					$variants = SearchVariant::variants($field['base'], true);
					$variant = $variants[$step['variant']]; $method = $step['method'];
					$object = $variant->$method($object);
				}
				else {
					$property = $step['property'];
					$object = $object->$property;
				}
			}
		}
	}
	catch (Exception $e) {
		// Deliberately best-effort: a value that can't be read is treated as missing
		$object = null;
	}

	restore_error_handler();
	return $object;
}
/**
 * Given a class, object id, set of stateful ids and a list of changed fields (in a special format),
 * return what statefulids need updating in this index
 *
 * Internal function used by SearchUpdater.
 *
 * @param String $class - The class of the object that was written
 * @param Integer $id - The ID of the object that was written
 * @param array $statefulids - array('id' => ..., 'state' => ...) pairs for the written object
 * @param array $fields - The changed fields, keyed by "Class:Field"
 * @return array - Class name => array of array('id' => ..., 'state' => ...) pairs, keyed by their serialized form
 */
function getDirtyIDs($class, $id, $statefulids, $fields) {
	$dirty = array();

	// First, if this object is directly contained in the index, add it
	foreach ($this->classes as $searchclass => $options) {
		if ($searchclass == $class || ($options['include_children'] && ClassInfo::is_subclass_of($class, $searchclass))) {
			$dirty[$searchclass] = array();
			foreach ($statefulids as $statefulid) {
				$key = serialize($statefulid);
				$dirty[$searchclass][$key] = $statefulid;
			}
		}
	}

	// Remember the active state so it can be put back after each temporary state change below
	$current = SearchVariant::current_state();

	// Then, for every derived field
	foreach ($this->getDerivedFields() as $derivation) {
		// If this object is a subclass of any of the classes we want a field from
		if (!SearchIntrospection::is_subclass_of($class, $derivation['classes'])) continue;
		// And at least one of the fields we want was changed
		if (!array_intersect_key($fields, $derivation['fields'])) continue;

		foreach (SearchVariant::reindex_states($class, false) as $state) {
			SearchVariant::activate_state($state);

			// Walk the chain back from the written object to the IDs of the base objects that include it
			$ids = array($id);

			foreach ($derivation['chain'] as $step) {
				if ($step['through'] == 'has_one') {
					$sql = new SQLQuery('ID', $step['class'], $step['foreignkey'].' IN ('.implode(',', $ids).')');
					singleton($step['class'])->extend('augmentSQL', $sql);
					$ids = $sql->execute()->column();
				}
				else if ($step['through'] == 'has_many') {
					$sql = new SQLQuery('"'.$step['class'].'"."ID"', $step['class'], '"'.$step['otherclass'].'"."ID" IN ('.implode(',', $ids).')');
					$sql->innerJoin($step['otherclass'], '"'.$step['class'].'"."ID" = "'.$step['otherclass'].'"."'.$step['foreignkey'].'"');
					singleton($step['class'])->extend('augmentSQL', $sql);
					$ids = $sql->execute()->column();
				}
				// NOTE(review): 'many_many' steps are not handled here, so $ids passes through them unchanged

				// Nothing links back any further - stop, rather than building an invalid "IN ()" clause
				if (!$ids) break;
			}

			SearchVariant::activate_state($current);

			if ($ids) {
				$base = $derivation['base'];
				if (!isset($dirty[$base])) $dirty[$base] = array();

				// Uses its own loop variable: reusing $id here would overwrite the $id parameter, which every
				// following state and derivation starts its chain walk from
				foreach ($ids as $dirtyid) {
					$statefulid = array('id' => $dirtyid, 'state' => $state);
					$key = serialize($statefulid);
					$dirty[$base][$key] = $statefulid;
				}
			}
		}
	}

	return $dirty;
}
/** !! These should be implemented by the full text search engine */
/**
* Add (or re-add) an object to the index. SearchUpdater calls this for every dirty object it could load.
* @param DataObject $object - The object to index
*/
abstract function add($object) ;
/**
* Remove a document from the index. SearchUpdater calls this for every dirty ID it could not load.
* @param String $base - The base class of the object
* @param Integer $id - The ID of the object
* @param Array $state - The variant state the object was in
*/
abstract function delete($base, $id, $state) ;
/**
* Called by SearchUpdater once per touched index, after all its add and delete calls for a batch have been made.
*/
abstract function commit();
/** !! These should be implemented by the specific index */
/**
* Called during construction, this is the method that builds the structure.
* Used instead of overriding __construct as we have specific execution order - code that has
* to be run before _and/or_ after this.
*/
abstract function init();
}
/**
* A search index that does nothing. Useful for testing
*/
abstract class SearchIndex_Null extends SearchIndex {

	/**
	 * Does nothing; the object is not indexed anywhere.
	 */
	function add($object) {
	}

	/**
	 * Does nothing; there is nothing to remove.
	 */
	function delete($base, $id, $state) {
	}

	/**
	 * Does nothing; there is nothing to commit.
	 */
	function commit() {
	}
}
/**
* A search index that just records actions. Useful for testing
*/
abstract class SearchIndex_Recording extends SearchIndex {

	/** One entry per add() call: 'ID' plus the value of every field in the index, keyed by field name */
	public $added = array();

	/** One entry per delete() call: array('base' => ..., 'id' => ..., 'state' => ...) */
	public $deleted = array();

	/**
	 * Forget everything recorded so far.
	 */
	function reset() {
		$this->deleted = array();
		$this->added = array();
	}

	/**
	 * Record the object's ID and the current value of every field in this index.
	 *
	 * @param DataObject $object - The object being "indexed"
	 */
	function add($object) {
		$record = array('ID' => $object->ID);

		foreach ($this->getFieldsIterator() as $name => $definition) {
			$record[$name] = $this->_getFieldValue($object, $definition);
		}

		$this->added[] = $record;
	}

	/**
	 * Get the recorded adds, reduced to the given fields.
	 *
	 * @param array $fields - The keys to keep in each record. Keys that are missing or null in a record are left out
	 * @return array - One (possibly empty) array per recorded add, in the order they were recorded
	 */
	function getAdded($fields = array()) {
		$result = array();

		foreach ($this->added as $record) {
			$kept = array();
			foreach ($fields as $wanted) {
				if (!isset($record[$wanted])) continue;
				$kept[$wanted] = $record[$wanted];
			}
			$result[] = $kept;
		}

		return $result;
	}

	/**
	 * Record the delete request.
	 */
	function delete($base, $id, $state) {
		$entry = array('base' => $base, 'id' => $id, 'state' => $state);
		$this->deleted[] = $entry;
	}

	/**
	 * Does nothing; recorded actions are available immediately.
	 */
	function commit() {
	}
}

View File

@ -0,0 +1,78 @@
<?php
/**
* Some additional introspection tools that are used often by the fulltext search code
*/
class SearchIntrospection {

	/** Cache of ClassInfo::ancestry results, keyed by class name */
	protected static $ancestry = array();

	/**
	 * Check if class is subclass of (a) the class in $of, or (b) any of the classes in the array $of
	 *
	 * @static
	 * @param String $class - The class to check
	 * @param String|array $of - A class name, or an array of class names
	 * @return bool
	 */
	static function is_subclass_of ($class, $of) {
		if (!isset(self::$ancestry[$class])) {
			self::$ancestry[$class] = ClassInfo::ancestry($class);
		}
		$ancestry = self::$ancestry[$class];

		if (is_array($of)) return (bool)array_intersect($of, $ancestry);
		return array_key_exists($of, $ancestry);
	}

	/** Cache of hierarchy() results, keyed by class name and flags */
	protected static $hierarchy = array();

	/**
	 * Get all the classes involved in a DataObject hierarchy - both super and optionally subclasses
	 *
	 * @static
	 * @param String $class - The class to query
	 * @param bool $includeSubclasses - True to return subclasses as well as super classes
	 * @param bool $dataOnly - True to only return classes that have tables
	 * @return Array - Integer keys, String values as classes sorted by depth (most super first)
	 */
	static function hierarchy ($class, $includeSubclasses = true, $dataOnly = false) {
		$subclassFlag = $includeSubclasses ? 'sc' : 'an';
		$dataFlag = $dataOnly ? 'do' : 'al';
		$key = "$class!" . $subclassFlag . '!' . $dataFlag;

		if (isset(self::$hierarchy[$key])) return self::$hierarchy[$key];

		$classes = array_values(ClassInfo::ancestry($class));

		if ($includeSubclasses) {
			$descendants = array_values(ClassInfo::subclassesFor($class));
			$classes = array_unique(array_merge($classes, $descendants));
		}

		// Drop everything up to and including DataObject itself
		$idx = array_search('DataObject', $classes);
		if ($idx !== false) array_splice($classes, 0, $idx+1);

		if ($dataOnly) {
			foreach ($classes as $i => $candidate) {
				if (!DataObject::has_own_table($candidate)) unset($classes[$i]);
			}
		}

		self::$hierarchy[$key] = $classes;
		return $classes;
	}

	/**
	 * Add classes to list, keeping only the parent when parent & child are both in list after add
	 *
	 * @param array $list - The list to add to; altered in place
	 * @param String $class - The class to add
	 */
	static function add_unique_by_ancestor(&$list, $class) {
		// If class already has parent in list, just ignore
		if (self::is_subclass_of($class, $list)) return;

		// Strip out any subclasses of $class already in the list, then add the class in
		$list = array_diff($list, ClassInfo::subclassesFor($class));
		$list[] = $class;
	}

	/**
	 * Does this class, its parent (or optionally one of its children) have the passed extension attached?
	 *
	 * @param String $class - The class to check
	 * @param String $extension - The extension to look for
	 * @param bool $includeSubclasses - True to check subclasses as well as super classes
	 * @return bool
	 */
	static function has_extension($class, $extension, $includeSubclasses = true) {
		$related = self::hierarchy($class, $includeSubclasses);

		foreach ($related as $relatedclass) {
			if (Object::has_extension($relatedclass, $extension)) return true;
		}

		return false;
	}
}

105
code/search/SearchQuery.php Normal file
View File

@ -0,0 +1,105 @@
<?php
/**
* Represents a search query
*
* API very much still in flux. Generally, calling with multiple arguments = OR, calling multiple times = AND.
*/
class SearchQuery extends ViewableData {

	/**
	 * Shared marker objects, created when the first query is constructed.
	 * NOTE(review): presumably passed as values to filter() / exclude() to mean "field is missing" and
	 * "field is present" - confirm against the connector code.
	 */
	static $missing = null;
	static $present = null;

	/** Number of results per page used by page() */
	static $default_page_size = 10;

	/** These are public, but only for index & variant access - API users should not manually access these */
	public $search = array();
	public $classes = array();
	public $require = array();
	public $exclude = array();

	protected $start = 0;
	protected $limit = -1;

	/** These are the API functions */

	function __construct() {
		if (self::$missing === null) self::$missing = new stdClass();
		if (self::$present === null) self::$present = new stdClass();
	}

	/**
	 * Add a search term.
	 *
	 * @param String $text - The text to search for
	 * @param String|array $fields - Field(s) to search in, or null for no restriction
	 * @param Number $boost - Boost recorded alongside this term
	 */
	function search($text, $fields = null, $boost = 1) {
		$this->search[] = array(
			'text' => $text,
			'fields' => $fields ? (array)$fields : null,
			'boost' => $boost,
			'fuzzy' => false
		);
	}

	/**
	 * Same as search(), but the term is flagged as fuzzy.
	 */
	function fuzzysearch($text, $fields = null, $boost = 1) {
		$this->search[] = array(
			'text' => $text,
			'fields' => $fields ? (array)$fields : null,
			'boost' => $boost,
			'fuzzy' => true
		);
	}

	/**
	 * Restrict the query to a class.
	 *
	 * @param String $class - The class name
	 * @param bool $includeSubclasses - Recorded with the class
	 */
	function inClass($class, $includeSubclasses = true) {
		$restriction = array('class' => $class, 'includeSubclasses' => $includeSubclasses);
		$this->classes[] = $restriction;
	}

	/**
	 * Require a field to match. Values from repeated calls for the same field are appended.
	 *
	 * @param String $field - The field name
	 * @param Mixed $values - A single value or an array of values
	 */
	function filter($field, $values) {
		if (!is_array($values)) $values = array($values);
		$existing = isset($this->require[$field]) ? $this->require[$field] : array();
		$this->require[$field] = array_merge($existing, $values);
	}

	/**
	 * Exclude on a field. Values from repeated calls for the same field are appended.
	 *
	 * @param String $field - The field name
	 * @param Mixed $values - A single value or an array of values
	 */
	function exclude($field, $values) {
		if (!is_array($values)) $values = array($values);
		$existing = isset($this->exclude[$field]) ? $this->exclude[$field] : array();
		$this->exclude[$field] = array_merge($existing, $values);
	}

	/** Set the offset of the first result */
	function start($start) {
		$this->start = $start;
	}

	/** Set the maximum number of results */
	function limit($limit) {
		$this->limit = $limit;
	}

	/**
	 * Set start and limit for a zero-based page of self::$default_page_size results.
	 */
	function page($page) {
		$size = self::$default_page_size;
		$this->limit = $size;
		$this->start = $page * $size;
	}

	/**
	 * @return bool - True when any search term, class restriction, filter or exclusion has been added
	 */
	function isfiltered() {
		if ($this->search || $this->classes) return true;
		return $this->require || $this->exclude;
	}

	function __toString() {
		return "Search Query\n";
	}
}
/**
* Create one of these and pass as one of the values in filter or exclude to filter or exclude by a (possibly
* open ended) range
*/
class SearchQuery_Range {

	/** Lower bound, or null when the range is open at the start */
	public $start = null;

	/** Upper bound, or null when the range is open at the end */
	public $end = null;

	/**
	 * @param Mixed $start - Lower bound, null for none
	 * @param Mixed $end - Upper bound, null for none
	 */
	function __construct($start = null, $end = null) {
		$this->end = $end;
		$this->start = $start;
	}

	/** Set the lower bound */
	function start($start) {
		$this->start = $start;
	}

	/** Set the upper bound */
	function end($end) {
		$this->end = $end;
	}

	/**
	 * @return bool - True when at least one of the bounds is set
	 */
	function isfiltered() {
		return !($this->start === null && $this->end === null);
	}
}

View File

@ -0,0 +1,257 @@
<?php
/**
* This class is responsible for capturing changes to DataObjects and triggering index updates of the resulting dirty index
* items.
*
* Attached automatically by _config calling SearchUpdater#bind_manipulation_capture. Overloads the current database connector's
* manipulate method - basically we need to capture a manipulation _after_ all the augmentManipulation code (for instance Version's)
* is run
*
* Pretty closely tied to the field structure of SearchIndex.
*
* TODO: The way we bind in is awful hacky. The config stuff in 3 will hopefully allow us to force ourselves as the very last
* augmentManipulation.
*/
class SearchUpdater extends Object {
// Possible values for self::$update_method - see there for what each one means
const AUTO = 0;
const DEFERRED = 1;
const IMMEDIATE = 2;
const DISABLED = 3;
/**
* How to schedule index updates at the end of the request.
*
* AUTO = IMMEDIATE if not _many_ dirty records, DEFERRED if _many_ where many is self::$auto_threshold
* DEFERRED = Use messagequeue to trigger updating indexes sometime soonish
* IMMEDIATE = Update indexes at end of request
* DISABLED = Don't update indexes
*
* If messagequeue module not installed, AUTO => IMMEDIATE and DEFERRED => DISABLED
*/
static $update_method = SearchUpdater::DEFERRED;
// How many items can be dirty before we defer updates
static $auto_threshold = 6;
// The indexing message queue
static $reindex_queue = "search_indexing";
// Change the name of the message queue that deferred reindex messages are sent to
static function set_reindexing_queue($queue) { self::$reindex_queue = $queue; }
/**
* Replace the database object with a subclass that captures all manipulations and passes them to us
*
* Generates (once, into TEMP_FOLDER) a PHP file declaring SearchManipulateCapture_{type}, a subclass of the
* configured database class whose manipulate() calls the parent and then SearchUpdater::handle_manipulation.
* The global $databaseConfig['type'] is then pointed at that subclass.
*
* NOTE(review): calling this more than once per request would wrap the already wrapped type again - confirm
* it is only ever called from _config.php.
*/
static function bind_manipulation_capture() {
global $databaseConfig;
$type = $databaseConfig['type'];
// One cached file per database class
$file = TEMP_FOLDER."/.cache.SMC.$type";
if (!is_file($file)) {
file_put_contents($file, "<?php
class SearchManipulateCapture_$type extends $type {
function manipulate(\$manipulation) {
\$res = parent::manipulate(\$manipulation);
SearchUpdater::handle_manipulation(\$manipulation);
return \$res;
}
}
");
}
require_once($file);
$databaseConfig['type'] = 'SearchManipulateCapture_'.$type;
}
/** Dirty records: base class => serialized state => array('state' => ..., 'ids' => array(id => index names)) */
static $dirty = array();

/** Number of distinct (base class, state, id) combinations currently in self::$dirty */
static $dirtycount = 0;

/**
 * Record that the given stateful ids need updating in the given index.
 *
 * @param String $class - The class of the dirty objects; recorded under its base data class
 * @param array $statefulids - array('id' => ..., 'state' => ...) pairs
 * @param String $index - The name of the index that needs updating
 */
static function add_dirty_ids($class, $statefulids, $index) {
	$base = ClassInfo::baseDataClass($class);
	$forclass = isset(self::$dirty[$base]) ? self::$dirty[$base] : array();

	foreach ($statefulids as $statefulid) {
		$id = $statefulid['id'];
		$state = $statefulid['state'];
		$statekey = serialize($state);

		// First record seen for this state: start an empty bucket for it
		if (!isset($forclass[$statekey])) {
			$forclass[$statekey] = array('state' => $state, 'ids' => array());
		}

		if (!isset($forclass[$statekey]['ids'][$id])) {
			// A new dirty record
			$forclass[$statekey]['ids'][$id] = array($index);
			self::$dirtycount += 1;
		}
		else if (array_search($index, $forclass[$statekey]['ids'][$id]) === false) {
			// Already dirty, but for other indexes - dirty count stays the same
			$forclass[$statekey]['ids'][$id][] = $index;
		}
	}

	self::$dirty[$base] = $forclass;
}
// True once flush_dirty_indexes has been registered as a shutdown function, so it's only registered once
static $registered = false;
/**
* Called by the SearchManipulateCapture database adapter with every manipulation made against the database.
*
* Check every index to see what objects need re-inserting into what indexes to keep the index fresh,
* but doesn't actually do it yet.
*
* TODO: This is pretty sensitive to the format of manipulation that DataObject::write produces. Specifically,
* it expects the actual class of the object to be present as a table, regardless of if any fields changed in that table
* (so a class => array( 'fields' => array() ) item), in order to find the actual class for a set of table manipulations
*
* @param array $manipulation - Table name => details, where the details of interest have 'id' and 'fields' keys
*/
static function handle_manipulation($manipulation) {
// First, extract any state that is in the manipulation itself
foreach ($manipulation as $table => $details) {
// Defaults: the table name is the class, and there is no state
$manipulation[$table]['class'] = $table;
$manipulation[$table]['state'] = array();
}
// NOTE(review): presumably variants adjust 'class' and 'state' in place here - confirm against SearchVariant
SearchVariant::call('extractManipulationState', $manipulation);
// Then combine the manipulation back into object field sets
$writes = array();
foreach ($manipulation as $table => $details) {
// Skip table entries that don't identify a record or carry no field list
if (!isset($details['id']) || !isset($details['fields'])) continue;
$id = $details['id'];
$state = $details['state'];
$class = $details['class'];
$fields = $details['fields'];
$base = ClassInfo::baseDataClass($class);
// All tables of the same object in the same state are combined under one key
$key = "$id:$base:".serialize($state);
$statefulids = array(array('id' => $id, 'state' => $state));
// Is this the first table for this particular object? Then add an item to $writes
if (!isset($writes[$key])) $writes[$key] = array('base' => $base, 'class' => $class, 'id' => $id, 'statefulids' => $statefulids, 'fields' => array());
// Otherwise update the class label if it's more specific than the currently recorded one
else if (ClassInfo::is_subclass_of($class, $writes[$key]['class'])) $writes[$key]['class'] = $class;
// Update the fields
foreach ($fields as $field => $value) {
$writes[$key]['fields']["$class:$field"] = $value;
}
}
// Then extract any state that is needed for the writes
SearchVariant::call('extractManipulationWriteState', $writes);
// Then for each write, figure out what objects need updating
foreach ($writes as $write) {
// For every index
foreach (FullTextSearch::get_indexes() as $index => $instance) {
// If that index has a field from this class
if (SearchIntrospection::is_subclass_of($write['class'], $instance->dependancyList)) {
// Get the dirty IDs
$dirtyids = $instance->getDirtyIDs($write['class'], $write['id'], $write['statefulids'], $write['fields']);
// Then add them to the global list to deal with later
foreach ($dirtyids as $dirtyclass => $ids) {
if ($ids) self::add_dirty_ids($dirtyclass, $ids, $index);
}
}
}
}
// Finally, if we do have some work to do register the shutdown function to actually do the work
// Don't do it if we're testing - there's no database connection outside the test methods, so we'd
// just get errors
if (self::$dirty && !self::$registered && !(class_exists('SapphireTest',false) && SapphireTest::is_running_test())) {
register_shutdown_function(array("SearchUpdater", "flush_dirty_indexes"));
self::$registered = true;
}
}
/**
 * Throw away the recorded dirty IDs without doing anything with them.
 */
static function clear_dirty_indexes() {
	self::$dirtycount = 0;
	self::$dirty = array();
}
/**
* Do something with the recorded dirty IDs, where that "something" depends on the value of self::$update_method,
* either immediately update the indexes, queue a messsage to update the indexes at some point in the future, or
* just throw the dirty IDs away.
*/
static function flush_dirty_indexes() {
    // Nothing has been recorded, so there is nothing to flush
    if (!self::$dirty) return;

    $strategy = self::$update_method;
    $queueAvailable = class_exists("MessageQueue");

    if ($strategy == self::AUTO) {
        // AUTO only defers when a MessageQueue class exists and the dirty count is not below the threshold
        $handleNow = !$queueAvailable || self::$dirtycount < self::$auto_threshold;
        $strategy = $handleNow ? self::IMMEDIATE : self::DEFERRED;
    }
    elseif (!$queueAvailable && $strategy == self::DEFERRED) {
        // Deferral was asked for explicitly but there is no queue to defer to, so the update is dropped
        $strategy = self::DISABLED;
    }

    if ($strategy == self::IMMEDIATE) {
        self::process_dirty_indexes(self::$dirty);
    }
    elseif ($strategy == self::DEFERRED) {
        MessageQueue::send(
            self::$reindex_queue,
            new MethodInvocationMessage("SearchUpdater", "process_dirty_indexes", self::$dirty)
        );
    }
    // self::DISABLED (or any other value) intentionally does nothing with the dirty IDs

    // Whatever was done with them above, the recorded IDs are discarded afterwards
    self::clear_dirty_indexes();
}
/**
* Internal function. Process the passed list of dirty ids. Split from flush_dirty_indexes so it can be called both
* directly and via messagequeue message.
*
* WARNING: Changes state (subsite, stage) and doesn't reset it. Should only be called after request has ended
*/
static function process_dirty_indexes($dirty) {
    $indexes = FullTextSearch::get_indexes();
    // Names of the indexes that had something added or deleted, and so need committing at the end
    $touched = array();

    foreach ($dirty as $base => $statefulids) {
        if ($statefulids) {
            foreach ($statefulids as $entry) {
                $state = $entry['state'];
                // Record ID => list of index names that asked about that record
                $pending = $entry['ids'];

                // NOTE: the state is switched here and never switched back (see the warning on this method)
                SearchVariant::activate_state($state);

                $idList = implode(',', array_keys($pending));
                $found = DataObject::get($base, sprintf('"%s"."ID" IN (%s)', $base, $idList));

                // Every record that could be loaded in this state is (re)added to the indexes that wanted it
                if ($found) {
                    foreach ($found as $record) {
                        foreach ($pending[$record->ID] as $name) {
                            $indexes[$name]->add($record);
                            $touched[$name] = $name;
                        }
                        unset($pending[$record->ID]);
                    }
                }

                // Whatever is still pending could not be loaded, so it is deleted from those indexes instead
                foreach ($pending as $id => $names) {
                    foreach ($names as $name) {
                        $indexes[$name]->delete($base, $id, $state);
                        $touched[$name] = $name;
                    }
                }
            }
        }
    }

    // One commit per index that was actually modified
    foreach ($touched as $name) {
        $indexes[$name]->commit();
    }
}
}

View File

@ -0,0 +1,191 @@
<?php
/**
* A Search Variant handles decorators and other situations where the items to reindex or search through are modified
* from the default state - for instance, dealing with Versioned or Subsite
*/
abstract class SearchVariant {

    function __construct() {}

    /*** OVERRIDES start here */

    /**
     * Concrete variants may add any other methods they like (they are invoked by name through
     * SearchVariant::call / SearchVariant::with), but the four abstract methods below are mandatory.
     */

    /**
     * Whether this variant is relevant to the passed class.
     *
     * @param string $class - The class name being asked about
     * @param bool $includeSubclasses - Passed through from the caller; see SearchVariant::variants()
     * @return bool
     */
    abstract function appliesTo($class, $includeSubclasses);

    /**
     * The state this variant is currently in
     */
    abstract function currentState();

    /**
     * The list of states that have to be stepped through in order to reindex every item
     */
    abstract function reindexStates();

    /**
     * Switch this variant to the passed state
     */
    abstract function activateState($state);

    /*** OVERRIDES end here*/

    /** Cache of every instantiable variant, keyed by class name. Null until first requested */
    protected static $variants = null;

    /** Cache of per-class variant lists. The key is the class name, then "!", then "1" when subclasses are included */
    protected static $class_variants = array();

    /**
     * Returns an array of variants.
     *
     * Called without a class, the result is every instantiable SearchVariant subclass.
     * Called with a class, the result is only those variants whose appliesTo() accepts that class.
     * Both kinds of result are cached for the remainder of the request.
     *
     * @static
     * @param string $class - The class name to get variants for
     * @param bool $includeSubclasses - Handed to each variant's appliesTo() alongside $class
     * @return array - An array of (string)$variantClassName => (Object)$variantInstance pairs
     */
    public static function variants($class = null, $includeSubclasses = true) {
        if ($class) {
            $key = $class . '!' . $includeSubclasses;
            if (isset(self::$class_variants[$key])) return self::$class_variants[$key];

            // The cache entry exists (empty) before any variant is consulted
            self::$class_variants[$key] = array();
            foreach (self::variants() as $name => $variant) {
                if (!$variant->appliesTo($class, $includeSubclasses)) continue;
                self::$class_variants[$key][$name] = $variant;
            }
            return self::$class_variants[$key];
        }

        if (self::$variants === null) {
            $instantiable = array();
            foreach (ClassInfo::subclassesFor('SearchVariant') as $name) {
                // Abstract classes (including SearchVariant itself, if listed) are filtered out here
                $reflector = new ReflectionClass($name);
                if (!$reflector->isInstantiable()) continue;
                $instantiable[$name] = singleton($name);
            }
            self::$variants = $instantiable;
        }
        return self::$variants;
    }

    /** Cache of SearchVariant_Caller instances, one per class/includeSubclasses combination */
    protected static $call_instances = array();

    /**
     * Lets you call a method on every variant that has it, in the same manner as "Object#extend" calls
     * a method on extensions.
     *
     * Usage: SearchVariant::with(...)->call($method, $arg1, ...);
     *
     * @static
     * @param string $class - (Optional) a class name. If passed, only variants that apply to that class are called
     * @param bool $includeSubclasses - (Optional, default true) forwarded to SearchVariant::variants()
     * @return SearchVariant_Caller - An object with one method, call()
     */
    static function with($class = null, $includeSubclasses = true) {
        $key = '!';
        if ($class) $key = $class . '!' . $includeSubclasses;

        // Build the caller on first use, then keep handing back the same instance
        if (!isset(self::$call_instances[$key])) {
            $applicable = self::variants($class, $includeSubclasses);
            self::$call_instances[$key] = new SearchVariant_Caller($applicable);
        }
        return self::$call_instances[$key];
    }

    /**
     * Shortcut for the common case of calling a method on all variants, regardless of class.
     *
     * SearchVariant::call(...) ==== SearchVariant::with()->call(...);
     *
     * Arguments are taken by reference so that variants are able to modify them in place.
     */
    static function call($method, &$a1=null, &$a2=null, &$a3=null, &$a4=null, &$a5=null, &$a6=null, &$a7=null) {
        $caller = self::with();
        return $caller->call($method, $a1, $a2, $a3, $a4, $a5, $a6, $a7);
    }

    /**
     * Collect the current state of every variant (or of every variant that applies to $class)
     *
     * @static
     * @param string $class - (Optional) limit to variants applying to this class
     * @param bool $includeSubclasses - Forwarded to SearchVariant::variants()
     * @return array - (string)$variantClassName => (any)$state pairs
     */
    static function current_state($class = null, $includeSubclasses = true) {
        $states = array();
        foreach (self::variants($class, $includeSubclasses) as $name => $variant) {
            $states[$name] = $variant->currentState();
        }
        return $states;
    }

    /**
     * Activate all the states in the passed argument
     *
     * @static
     * @param (array) $state. A set of (string)$variantClass => (any)$state pairs, e.g. as returned by
     * SearchVariant::current_state(). Variants without an entry (or with a null entry) are left alone
     * @return void
     */
    static function activate_state($state) {
        foreach (self::variants() as $name => $variant) {
            if (!isset($state[$name])) continue;
            $variant->activateState($state[$name]);
        }
    }

    /**
     * Gather the reindex states of each applicable variant into something that can be foreach'd over.
     *
     * Variants whose reindexStates() returns nothing are left out. If no variant contributes any states the
     * result is array(array()), so a loop over it runs exactly once with an empty state.
     *
     * NOTE(review): this method only builds the iterator - nothing visible here activates or restores any
     * state. Confirm against CombinationsArrayIterator and the callers before relying on that.
     *
     * @static
     * @param string $class - The class name to get variants for
     * @param bool $includeSubclasses - Forwarded to SearchVariant::variants()
     * @return CombinationsArrayIterator|array - The value to foreach over
     */
    static function reindex_states($class = null, $includeSubclasses = true) {
        $combinable = array();
        foreach (self::variants($class, $includeSubclasses) as $name => $variant) {
            $states = $variant->reindexStates();
            if ($states) $combinable[$name] = $states;
        }

        if (!$combinable) return array(array());
        return new CombinationsArrayIterator($combinable);
    }
}
/**
* Internal utility class used to hold the state of the SearchVariant::with call
*/
class SearchVariant_Caller {

    /** The variant instances that call() dispatches to */
    protected $variants = null;

    /**
     * @param array $variants - The variant instances to dispatch to
     */
    function __construct($variants) {
        $this->variants = $variants;
    }

    /**
     * Invoke $method on every held variant that defines it.
     *
     * Arguments are taken (and handed on) by reference, so a variant method that declares a reference
     * parameter can modify the caller's variable.
     *
     * @param string $method - Name of the method to invoke
     * @return array - The non-null return values, in variant order
     */
    function call($method, &$a1=null, &$a2=null, &$a3=null, &$a4=null, &$a5=null, &$a6=null, &$a7=null) {
        $collected = array();
        foreach ($this->variants as $variant) {
            // Variants that do not implement the method are silently skipped
            if (!method_exists($variant, $method)) continue;

            $result = $variant->$method($a1, $a2, $a3, $a4, $a5, $a6, $a7);
            if ($result === null) continue;
            $collected[] = $result;
        }
        return $collected;
    }
}

View File

@ -0,0 +1,81 @@
<?php
class SearchVariantSiteTreeSubsitesPolyhome extends SearchVariant {

    /**
     * This variant applies to classes that SearchIntrospection reports as having the
     * SiteTreeSubsitesPolyhome extension.
     */
    function appliesTo($class, $includeSubclasses) {
        return SearchIntrospection::has_extension($class, 'SiteTreeSubsitesPolyhome', $includeSubclasses);
    }

    /**
     * The state of this variant is the ID of the current subsite
     */
    function currentState() {
        return Subsite::currentSubsiteID();
    }

    /**
     * The states to reindex are 0 plus the ID of every Subsite record.
     * The list is built once and then reused for every later call.
     *
     * @return array - List of subsite IDs, always starting with 0
     */
    function reindexStates() {
        static $ids = null;

        if ($ids === null) {
            $ids = array(0);

            // DataObject::get() can hand back a falsy value rather than a set when nothing matches
            // (SilverStripe 2.4 behaviour), which foreach would raise a warning on - so guard it
            $subsites = DataObject::get('Subsite');
            if ($subsites) {
                foreach ($subsites as $subsite) $ids[] = $subsite->ID;
            }
        }

        return $ids;
    }

    /**
     * Switch to the subsite with the passed ID
     */
    function activateState($state) {
        if (Controller::has_curr()) {
            Subsite::changeSubsite($state);
        }
        else {
            // TODO: This is a nasty hack - calling Subsite::changeSubsite after request ends
            // throws error because no current controller to access session on
            $_REQUEST['SubsiteID'] = $state;
        }
    }

    /**
     * Register a '_subsite' Int filter field on the index. Its lookup chain names this variant's
     * currentState() method - presumably so the indexed value is the active subsite ID (confirm against
     * the index code that consumes lookup_chain).
     */
    function alterDefinition($base, $index) {
        $self = get_class($this);

        $index->filterFields['_subsite'] = array(
            'name' => '_subsite',
            'field' => '_subsite',
            'fullfield' => '_subsite',
            'base' => $base,
            'origin' => $base,
            'type' => 'Int',
            'lookup_chain' => array(array('call' => 'variant', 'variant' => $self, 'method' => 'currentState'))
        );
    }

    /**
     * Restrict the query to documents whose '_subsite' is the current subsite, or SearchQuery::$missing
     */
    function alterQuery($query, $index) {
        $subsite = Subsite::currentSubsiteID();
        $query->filter('_subsite', array($subsite, SearchQuery::$missing));
    }

    /** Cache of subsite IDs ('0' plus every ID in the Subsite table), filled on first use */
    static $subsites = null;

    /**
     * We need _really_ complicated logic to find just the changed subsites (because we use versions there's no explicit
     * deletes, just new versions with different members) so just always use all of them
     *
     * For every write whose class this variant applies to, each existing statefulid is replaced by one copy
     * per subsite ID, with that subsite ID merged into the copy's state under this variant's class name.
     *
     * @param array $writes - The recorded writes, modified in place
     */
    function extractManipulationWriteState(&$writes) {
        $self = get_class($this);

        foreach ($writes as $key => $write) {
            if (!$this->appliesTo($write['class'], true)) continue;

            // Only hit the database the first time a relevant write is seen
            if (self::$subsites === null) {
                $query = new SQLQuery('ID', 'Subsite');
                self::$subsites = array_merge(array('0'), $query->execute()->column());
            }

            $next = array();

            foreach ($write['statefulids'] as $statefulid) {
                foreach (self::$subsites as $subsiteID) {
                    $next[] = array('id' => $statefulid['id'], 'state' => array_merge($statefulid['state'], array($self => $subsiteID)));
                }
            }

            $writes[$key]['statefulids'] = $next;
        }
    }
}

View File

@ -0,0 +1,66 @@
<?php
class SearchVariantVersioned extends SearchVariant {
function appliesTo($class, $includeSubclasses) {
return SearchIntrospection::has_extension($class, 'Versioned', $includeSubclasses);
}
function currentState() { return Versioned::current_stage(); }
function reindexStates() { return array('Stage', 'Live'); }
function activateState($state) { Versioned::reading_stage($state); }
function alterDefinition($base, $index) {
$self = get_class($this);
$index->filterFields['_versionedstage'] = array(
'name' => '_versionedstage',
'field' => '_versionedstage',
'fullfield' => '_versionedstage',
'base' => $base,
'origin' => $base,
'type' => 'String',
'lookup_chain' => array(array('call' => 'variant', 'variant' => $self, 'method' => 'currentState'))
);
}
function alterQuery($query) {
$stage = Versioned::current_stage();
$query->filter('_versionedstage', array($stage, SearchQuery::$missing));
}
function extractManipulationState(&$manipulation) {
$self = get_class($this);
foreach ($manipulation as $table => $details)