ENHANCEMENT: included zend/search/lucene with the module to avoid including the entire lucene module. ENHANCEMENT: added hourly task as well as buildtask for dealing with cron updates. MINOR: updated documentation

This commit is contained in:
Will Rossiter 2011-01-11 01:35:59 +00:00
parent 59c9762dc0
commit 221cd700b6
93 changed files with 23670 additions and 36 deletions

View File

@ -1,17 +1,20 @@
# Documentation Viewer Module
## Maintainer Content
* Will Rossiter (Nickname: willr, wrossiter) <will@silverstripe.com>
## Maintainer Contact
* Will Rossiter (Nickname: willr, wrossiter)
<will@silverstripe.com>
## Requirements
* Tested on 2.4 and trunk
* SilverStripe 2.4
## Summary
Read nested documentation files from the /docs/ folder in modules. To read documentation go to yoursite.com/dev/docs/.
It is likely this will be integrated into the core in future versions once it is polished.
Reads markdown documentation files from the /docs/ folder in each module. To read documentation go to yoursite.com/dev/docs/.
For more documentation on how to use the module please read /docs/Writing-Documentation.md (or via this in /dev/docs/sapphiredocs/Writing-Documentation)
For more documentation on how to use the module please read /docs/Writing-Documentation.md
(or via this in /dev/docs/sapphiredocs/Writing-Documentation in your webbrowser)
## Syntax Highlighting ##
@ -55,4 +58,4 @@ To include the syntax highlighter source, add the following to your `Documentati
Requirements::css('sapphiredocs/thirdparty/syntaxhighlighter/styles/shThemeRDark.css');
You can overload the `DocumentationViewer` class and add a custom route through `Director::addRule()`
if you prefer not to modify the module file.
if you prefer not to modify the module file.

View File

@ -1,5 +1,13 @@
<?php
/**
* Documentation Configuration
*
* Please override any of these options in your own project's _config.php file.
* For more information and documentation see sapphiredocs/docs/en
*/
// default location for documentation: requests to dev/docs are handled by
// DocumentationViewer. NOTE(review): the first argument (100) is presumably the
// rule priority used by Director - confirm against the Director API.
Director::addRules(100, array(
'dev/docs' => 'DocumentationViewer'
));

View File

@ -67,13 +67,10 @@ class DocumentationSearch {
* Enable searching documentation
*/
public static function enable() {
if(!class_exists('ZendSearchLuceneSearchable')) {
return user_error('DocumentationSearch requires the ZendSearchLucene library', E_ERROR);
}
self::$enabled = true;
ZendSearchLuceneSearchable::enable(array());
// include the zend search functionality
set_include_path(get_include_path() . PATH_SEPARATOR . dirname(dirname(__FILE__)) . '/thirdparty/');
}
/**
@ -103,12 +100,19 @@ class DocumentationSearch {
* Rebuilds the index if it out of date
*/
public function performSearch($query) {
$index = Zend_Search_Lucene::open(self::get_index_location());
try {
$index = Zend_Search_Lucene::open(self::get_index_location());
Zend_Search_Lucene::setResultSetLimit(200);
Zend_Search_Lucene::setResultSetLimit(200);
$this->results = $index->find($query);
$this->totalResults = $index->numDocs();
$this->results = $index->find($query);
$this->totalResults = $index->numDocs();
}
catch(Zend_Search_Lucene_Exception $e) {
// the reindexing task has not been run
user_error('DocumentationSearch::performSearch() could not perform search as index does not exist.
Please run /dev/tasks/RebuildLuceneDocsIndex', E_USER_ERROR);
}
}
/**

View File

@ -1,37 +1,48 @@
<?php
/**
* Rebuilds the search indexes for the documentation pages.
*
* For the hourly cron rebuild use RebuildLuceneDocusIndex_Hourly
*
* @package sapphiredocs
* @subpackage tasks
*/
class RebuildLuceneDocsIndex extends BuildTask {
/**
* Builds the document index
*
* Perhaps we run this via a hourly / daily task rather than
* based on the user. It's a
*/
protected $title = "Rebuild Documentation Search Indexes";
protected $description = "Rebuilds the indexes used for the search engine in sapphiredocs. Gathers all documentation files from your modules";
function run($request) {
$this->rebuildIndexes();
}
function rebuildIndexes($quiet = false) {
require_once('../sapphiredocs/thirdparty/markdown/markdown.php');
ini_set("memory_limit", -1);
ini_set('max_execution_time', 0);
// only rebuild the index if we have to. Check for either flush or the time write.lock.file
// was last altered
$lock = DocumentationSearch::get_index_location() .'/write.lock.file';
$lockFileFresh = (file_exists($lock) && filemtime($lock) > (time() - (60 * 60 * 24)));
if($lockFileFresh && !isset($_REQUEST['flush'])) return true;
if($lockFileFresh && !isset($_REQUEST['flush'])) {
if(!$quiet) {
echo "Index recently rebuilt. If you want to force reindex use ?flush=1";
}
return true;
}
try {
$index = Zend_Search_Lucene::open(DocumentationSearch::get_index_location());
$index->removeReference();
}
catch (Zend_Search_Lucene_Exception $e) {
}
catch (Zend_Search_Lucene_Exception $e) {}
try {
$index = Zend_Search_Lucene::create(DocumentationSearch::get_index_location());
@ -45,14 +56,15 @@ class RebuildLuceneDocsIndex extends BuildTask {
if($pages) {
$count = 0;
// iconv complains about all the markdown formatting
// turn off notices while we parse
$error = error_reporting();
error_reporting('E_ALL ^ E_NOTICE');
foreach($pages as $page) {
$count++;
// iconv complains about all the markdown formatting
// turn off notices while we parse
$error = error_reporting();
error_reporting('E_ALL ^ E_NOTICE');
if(!is_dir($page->getPath())) {
$doc = new Zend_Search_Lucene_Document();
$content = $page->getMarkdown();
@ -65,11 +77,28 @@ class RebuildLuceneDocsIndex extends BuildTask {
$doc->addField(Zend_Search_Lucene_Field::Keyword('Link', $page->Link()));
$index->addDocument($doc);
}
error_reporting($error);
if(!$quiet) echo "adding ". $page->getTitle() ."\n\n";
}
error_reporting($error);
}
$index->commit();
if(!$quiet) echo "complete.";
}
}
/**
* @package sapphiredocs
* @subpackage tasks
*/
class RebuildLuceneDocusIndex_Hourly extends HourlyTask {
    /**
     * Rebuilds the documentation search index without producing output.
     *
     * Delegates to RebuildLuceneDocsIndex::rebuildIndexes() in quiet mode, so
     * the "recently rebuilt" check performed there still applies.
     */
    function process() {
        // The build task class is RebuildLuceneDocsIndex. The previous spelling
        // ("RebuildLuceneDocusIndex") referred to a class that is not defined
        // alongside this task and would fail on instantiation.
        $reindex = new RebuildLuceneDocsIndex();
        $reindex->rebuildIndexes(true);
    }
}

View File

@ -11,6 +11,20 @@ page.
## Setup
### Enabling Search
The module provides automatic search functionality via [Lucene Search](http://lucene.apache.org/java/docs/index.html). To enable search
you need to add the following to your applications _config.php file:
DocumentationSearch::enable();
After adding that line you will also need to build the search indexes by running the `RebuildLuceneDocsIndex` task, either from your web browser (`/dev/tasks/RebuildLuceneDocsIndex`) or via sake.
### Using a URL other than /dev/docs/
By default, the documentation is available in `dev/docs`. If you want it to live on the webroot instead of a subfolder,
add the following configuration to your `mysite/_config.php`:
@ -18,4 +32,5 @@ add the following configuration to your `mysite/_config.php`:
Director::addRules(1, array(
'$Action' => 'DocumentationViewer',
'' => 'DocumentationViewer'
));
));

37
thirdparty/Zend/Search/Exception.php vendored Normal file
View File

@ -0,0 +1,37 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Exception.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* Framework base exception
*/
require_once 'Zend/Exception.php';
/**
* @category Zend
* @package Zend_Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Exception extends Zend_Exception
{
// Intentionally empty: this class only introduces a distinct exception type
// on top of Zend_Exception and declares no members of its own.
}

1577
thirdparty/Zend/Search/Lucene.php vendored Normal file
View File

@ -0,0 +1,1577 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Lucene.php 21640 2010-03-24 18:28:32Z alexander $
*/
/** User land classes and interfaces turned on by Zend/Search/Lucene.php file inclusion. */
/** @todo Section should be removed with ZF 2.0 release as obsolete */
/** Zend_Search_Lucene_Document_Html */
require_once 'Zend/Search/Lucene/Document/Html.php';
/** Zend_Search_Lucene_Document_Docx */
require_once 'Zend/Search/Lucene/Document/Docx.php';
/** Zend_Search_Lucene_Document_Pptx */
require_once 'Zend/Search/Lucene/Document/Pptx.php';
/** Zend_Search_Lucene_Document_Xlsx */
require_once 'Zend/Search/Lucene/Document/Xlsx.php';
/** Zend_Search_Lucene_Search_QueryParser */
require_once 'Zend/Search/Lucene/Search/QueryParser.php';
/** Zend_Search_Lucene_Search_QueryHit */
require_once 'Zend/Search/Lucene/Search/QueryHit.php';
/** Zend_Search_Lucene_Analysis_Analyzer */
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
/** Zend_Search_Lucene_Search_Query_Term */
require_once 'Zend/Search/Lucene/Search/Query/Term.php';
/** Zend_Search_Lucene_Search_Query_Phrase */
require_once 'Zend/Search/Lucene/Search/Query/Phrase.php';
/** Zend_Search_Lucene_Search_Query_MultiTerm */
require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
/** Zend_Search_Lucene_Search_Query_Wildcard */
require_once 'Zend/Search/Lucene/Search/Query/Wildcard.php';
/** Zend_Search_Lucene_Search_Query_Range */
require_once 'Zend/Search/Lucene/Search/Query/Range.php';
/** Zend_Search_Lucene_Search_Query_Fuzzy */
require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';
/** Zend_Search_Lucene_Search_Query_Boolean */
require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
/** Zend_Search_Lucene_Search_Query_Empty */
require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
/** Zend_Search_Lucene_Search_Query_Insignificant */
require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
/** Internally used classes */
/** Zend_Search_Lucene_Interface */
require_once 'Zend/Search/Lucene/Interface.php';
/** Zend_Search_Lucene_Index_SegmentInfo */
require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
/** Zend_Search_Lucene_LockManager */
require_once 'Zend/Search/Lucene/LockManager.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene implements Zend_Search_Lucene_Interface
{
/**
* Default field name for search
*
* Null means search through all fields
*
* @var string
*/
private static $_defaultSearchField = null;
/**
* Result set limit
*
* 0 means no limit
*
* @var integer
*/
private static $_resultSetLimit = 0;
/**
* Terms per query limit
*
* 0 means no limit
*
* @var integer
*/
private static $_termsPerQueryLimit = 1024;
/**
* File system adapter.
*
* @var Zend_Search_Lucene_Storage_Directory
*/
private $_directory = null;
/**
* File system adapter closing option
*
* @var boolean
*/
private $_closeDirOnExit = true;
/**
* Writer for this index, not instantiated unless required.
*
* @var Zend_Search_Lucene_Index_Writer
*/
private $_writer = null;
/**
* Array of Zend_Search_Lucene_Index_SegmentInfo objects for current version of index.
*
* @var array Zend_Search_Lucene_Index_SegmentInfo
*/
private $_segmentInfos = array();
/**
* Number of documents in this index.
*
* @var integer
*/
private $_docCount = 0;
/**
* Flag for index changes
*
* @var boolean
*/
private $_hasChanges = false;
/**
* Signal, that index is already closed, changes are fixed and resources are cleaned up
*
* @var boolean
*/
private $_closed = false;
/**
* Number of references to the index object
*
* @var integer
*/
private $_refCount = 0;
/**
* Current segment generation
*
* @var integer
*/
private $_generation;
// Index format version identifiers. These are the only values accepted by
// setFormatVersion(); _readSegmentsFile() selects FORMAT_2_1 or FORMAT_2_3
// from the marker at the start of the segments file.
const FORMAT_PRE_2_1 = 0;
const FORMAT_2_1 = 1;
const FORMAT_2_3 = 2;
/**
* Index format version
*
* @var integer
*/
private $_formatVersion;
/**
* Create index
*
* @param mixed $directory
* @return Zend_Search_Lucene_Interface
*/
public static function create($directory)
{
    // Load the proxy class on demand.
    require_once 'Zend/Search/Lucene/Proxy.php';

    // Passing true as the second constructor argument requests index creation.
    $index = new Zend_Search_Lucene($directory, true);

    // Callers always receive the index wrapped in a proxy.
    return new Zend_Search_Lucene_Proxy($index);
}
/**
* Open index
*
* @param mixed $directory
* @return Zend_Search_Lucene_Interface
*/
public static function open($directory)
{
    // Load the proxy class on demand.
    require_once 'Zend/Search/Lucene/Proxy.php';

    // Passing false as the second constructor argument opens an existing index.
    $index = new Zend_Search_Lucene($directory, false);

    // Callers always receive the index wrapped in a proxy.
    return new Zend_Search_Lucene_Proxy($index);
}
/** Generation retrieving counter */
const GENERATION_RETRIEVE_COUNT = 10;
/** Pause between generation retrieving attempts in milliseconds */
const GENERATION_RETRIEVE_PAUSE = 50;
/**
* Get current generation number
*
* Returns generation number
* 0 means pre-2.1 index format
* -1 means there are no segments files.
*
* @param Zend_Search_Lucene_Storage_Directory $directory
* @return integer
* @throws Zend_Search_Lucene_Exception
*/
public static function getActualGeneration(Zend_Search_Lucene_Storage_Directory $directory)
{
    /**
     * The generation is taken from the segments.gen file. The Apache Lucene
     * index format documentation describes this only as a fallback method,
     * but it is used here for performance reasons.
     *
     * @todo check if we can use some modification of Apache Lucene generation determination algorithm
     *       without performance problems
     */
    require_once 'Zend/Search/Lucene/Exception.php';

    try {
        for ($attempt = 0; $attempt < self::GENERATION_RETRIEVE_COUNT; $attempt++) {
            // Re-open the generation file on every attempt
            $generationFile = $directory->getFileObject('segments.gen', false);

            $marker = $generationFile->readInt();
            if ($marker != (int)0xFFFFFFFE) {
                throw new Zend_Search_Lucene_Exception('Wrong segments.gen file format');
            }

            // The generation is stored twice; the value is accepted only
            // when both copies agree.
            $firstCopy  = $generationFile->readLong();
            $secondCopy = $generationFile->readLong();
            if ($firstCopy == $secondCopy) {
                return $firstCopy;
            }

            // Copies differ: pause, then try again.
            usleep(self::GENERATION_RETRIEVE_PAUSE * 1000);
        }

        // Every attempt saw mismatching copies
        throw new Zend_Search_Lucene_Exception('Index is under processing now');
    } catch (Zend_Search_Lucene_Exception $outer) {
        // Anything other than an unreadable segments.gen is re-thrown (wrapped).
        if (strpos($outer->getMessage(), 'is not readable') === false) {
            throw new Zend_Search_Lucene_Exception($outer->getMessage(), $outer->getCode(), $outer);
        }

        try {
            // No readable segments.gen: probe for an old style segments file
            $legacySegmentsFile = $directory->getFileObject('segments', false);

            // It could be opened, so this is a pre-2.1 index
            return 0;
        } catch (Zend_Search_Lucene_Exception $inner) {
            if (strpos($inner->getMessage(), 'is not readable') === false) {
                throw new Zend_Search_Lucene_Exception($inner->getMessage(), $inner->getCode(), $inner);
            }

            // Neither file is readable: there are no segments files
            return -1;
        }
    }

    return -1;
}
/**
* Get generation number associated with this index instance
*
* The same generation number in pair with document number or query string
* guarantees to give the same result while index retrieving.
* So it may be used for search result caching.
*
* @return integer
*/
public function getGeneration()
{
    // Value assigned by the constructor from getActualGeneration()
    // (and advanced there when an index is created).
    return $this->_generation;
}
/**
* Get segments file name
*
* @param integer $generation
* @return string
*/
public static function getSegmentFileName($generation)
{
    // Generation 0 maps to the bare "segments" file name; any other
    // generation is appended as a base-36 suffix.
    return ($generation == 0)
        ? 'segments'
        : 'segments_' . base_convert($generation, 10, 36);
}
/**
* Get index format version
*
* @return integer
*/
public function getFormatVersion()
{
    // One of the FORMAT_* constants, set while reading the segments file
    // or through setFormatVersion().
    return $this->_formatVersion;
}
/**
* Set index format version.
* Index is converted to this format at the nearest update time
*
* @param int $formatVersion
* @throws Zend_Search_Lucene_Exception
*/
public function setFormatVersion($formatVersion)
{
    // Only the three FORMAT_* constants are accepted.
    $isSupported = $formatVersion == self::FORMAT_PRE_2_1
        || $formatVersion == self::FORMAT_2_1
        || $formatVersion == self::FORMAT_2_3;

    if (!$isSupported) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Unsupported index format');
    }

    $this->_formatVersion = $formatVersion;
}
/**
* Read segments file for pre-2.1 Lucene index format
*
* @throws Zend_Search_Lucene_Exception
*/
private function _readPre21SegmentsFile()
{
    $file = $this->_directory->getFileObject('segments');

    // The file must start with the expected format marker.
    $marker = $file->readInt();
    if ($marker != (int)0xFFFFFFFF) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong segments file format');
    }

    // Header fields that are read only to advance past them.
    $file->readLong(); // version
    $file->readInt();  // segment name counter

    $segmentTotal = $file->readInt();

    $this->_docCount = 0;

    // One (name, size) record per segment
    $index = 0;
    while ($index < $segmentTotal) {
        $name = $file->readString();
        $size = $file->readInt();

        $this->_docCount += $size;
        $this->_segmentInfos[$name] = new Zend_Search_Lucene_Index_SegmentInfo(
            $this->_directory,
            $name,
            $size
        );

        $index++;
    }

    // Use 2.1 as a target version. Index will be reorganized at update time.
    $this->_formatVersion = self::FORMAT_2_1;
}
/**
* Read segments file
*
* @throws Zend_Search_Lucene_Exception
*/
private function _readSegmentsFile()
{
// Reads the segments file of the current generation and rebuilds
// $_formatVersion, $_docCount and $_segmentInfos from it.
// The reads below are sequential, so their order must not be changed.
$segmentsFile = $this->_directory->getFileObject(self::getSegmentFileName($this->_generation));
// The leading int is a format marker that selects the index format version
$format = $segmentsFile->readInt();
if ($format == (int)0xFFFFFFFC) {
$this->_formatVersion = self::FORMAT_2_3;
} else if ($format == (int)0xFFFFFFFD) {
$this->_formatVersion = self::FORMAT_2_1;
} else {
require_once 'Zend/Search/Lucene/Exception.php';
throw new Zend_Search_Lucene_Exception('Unsupported segments file format');
}
// read version
$segmentsFile->readLong();
// read segment name counter
$segmentsFile->readInt();
// number of segment records that follow
$segments = $segmentsFile->readInt();
$this->_docCount = 0;
// read segmentInfos
for ($count = 0; $count < $segments; $count++) {
$segName = $segmentsFile->readString();
$segSize = $segmentsFile->readInt();
// 2.1+ specific properties
$delGen = $segmentsFile->readLong();
// Document store options are only present in the 2.3 format
if ($this->_formatVersion == self::FORMAT_2_3) {
$docStoreOffset = $segmentsFile->readInt();
// An offset of (int)0xFFFFFFFF means no doc store details follow
if ($docStoreOffset != (int)0xFFFFFFFF) {
$docStoreSegment = $segmentsFile->readString();
$docStoreIsCompoundFile = $segmentsFile->readByte();
$docStoreOptions = array('offset' => $docStoreOffset,
'segment' => $docStoreSegment,
'isCompound' => ($docStoreIsCompoundFile == 1));
} else {
$docStoreOptions = null;
}
} else {
$docStoreOptions = null;
}
$hasSingleNormFile = $segmentsFile->readByte();
$numField = $segmentsFile->readInt();
$normGens = array();
// Any field count other than (int)0xFFFFFFFF is rejected: the values
// are read, then the exception below is always thrown.
if ($numField != (int)0xFFFFFFFF) {
for ($count1 = 0; $count1 < $numField; $count1++) {
$normGens[] = $segmentsFile->readLong();
}
require_once 'Zend/Search/Lucene/Exception.php';
throw new Zend_Search_Lucene_Exception('Separate norm files are not supported. Optimize index to use it with Zend_Search_Lucene.');
}
// NOTE(review): $isCompound is left unassigned when the byte is none of
// 0xFF / 0x00 / 0x01 - confirm that readByte() cannot return other values here.
$isCompoundByte = $segmentsFile->readByte();
if ($isCompoundByte == 0xFF) {
// The segment is not a compound file
$isCompound = false;
} else if ($isCompoundByte == 0x00) {
// The status is unknown
$isCompound = null;
} else if ($isCompoundByte == 0x01) {
// The segment is a compound file
$isCompound = true;
}
$this->_docCount += $segSize;
// Segment infos are keyed by segment name
$this->_segmentInfos[$segName] =
new Zend_Search_Lucene_Index_SegmentInfo($this->_directory,
$segName,
$segSize,
$delGen,
$docStoreOptions,
$hasSingleNormFile,
$isCompound);
}
}
/**
* Opens the index.
*
* IndexReader constructor needs Directory as a parameter. It should be
* a string with a path to the index folder or a Directory object.
*
* @param Zend_Search_Lucene_Storage_Directory_Filesystem|string $directory
* @throws Zend_Search_Lucene_Exception
*/
public function __construct($directory = null, $create = false)
{
    // $directory: path string or directory object holding the index (required).
    // $create:    when true, a new (empty) index generation is written first.
    if ($directory === null) {
        require_once 'Zend/Search/Lucene/Exception.php';
        // Throw the Lucene-specific exception (a Zend_Search_Exception subclass),
        // matching the documented @throws and the type callers catch; the base
        // Zend_Search_Exception used previously slipped past those handlers.
        throw new Zend_Search_Lucene_Exception('No index directory specified');
    }

    if (is_string($directory)) {
        // A path was given: wrap it in a filesystem directory that this
        // object owns and closes in _close().
        require_once 'Zend/Search/Lucene/Storage/Directory/Filesystem.php';

        $this->_directory      = new Zend_Search_Lucene_Storage_Directory_Filesystem($directory);
        $this->_closeDirOnExit = true;
    } else {
        // A directory object was supplied by the caller; it is not closed here.
        $this->_directory      = $directory;
        $this->_closeDirOnExit = false;
    }

    $this->_segmentInfos = array();

    // Mark index as "under processing" to prevent other processes from premature index cleaning
    Zend_Search_Lucene_LockManager::obtainReadLock($this->_directory);

    $this->_generation = self::getActualGeneration($this->_directory);

    if ($create) {
        require_once 'Zend/Search/Lucene/Exception.php';
        try {
            Zend_Search_Lucene_LockManager::obtainWriteLock($this->_directory);
        } catch (Zend_Search_Lucene_Exception $e) {
            // The write lock could not be taken: give the read lock back
            // before reporting the failure.
            Zend_Search_Lucene_LockManager::releaseReadLock($this->_directory);

            if (strpos($e->getMessage(), 'Can\'t obtain exclusive index lock') === false) {
                throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
            } else {
                throw new Zend_Search_Lucene_Exception('Can\'t create index. It\'s under processing now', 0, $e);
            }
        }

        if ($this->_generation == -1) {
            // Directory doesn't contain existing index, start from 1
            $this->_generation = 1;
            $nameCounter = 0;
        } else {
            // Directory contains existing index
            $segmentsFile = $this->_directory->getFileObject(self::getSegmentFileName($this->_generation));
            $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version)

            $nameCounter = $segmentsFile->readInt();
            $this->_generation++;
        }

        require_once 'Zend/Search/Lucene/Index/Writer.php';
        Zend_Search_Lucene_Index_Writer::createIndex($this->_directory, $this->_generation, $nameCounter);

        Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);
    }

    // Load segment information according to the generation found (or just created).
    if ($this->_generation == -1) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Index doesn\'t exists in the specified directory.');
    } else if ($this->_generation == 0) {
        $this->_readPre21SegmentsFile();
    } else {
        $this->_readSegmentsFile();
    }
}
/**
* Close current index and free resources
*/
private function _close()
{
    // Nothing to do when the index was already closed and cleaned up.
    if (!$this->_closed) {
        // Write out pending changes before letting go of anything.
        $this->commit();

        // Release "under processing" flag
        Zend_Search_Lucene_LockManager::releaseReadLock($this->_directory);

        // Only close a directory this object is flagged to close.
        if ($this->_closeDirOnExit) {
            $this->_directory->close();
        }

        // Drop references in the same order as before, then mark as closed.
        $this->_directory    = null;
        $this->_writer       = null;
        $this->_segmentInfos = null;
        $this->_closed       = true;
    }
}
/**
* Add reference to the index object
*
* @internal
*/
public function addReference()
{
    // Counterpart of removeReference(), which closes the index at zero.
    ++$this->_refCount;
}
/**
* Remove reference from the index object
*
* When reference count becomes zero, index is closed and resources are cleaned up
*
* @internal
*/
public function removeReference()
{
    // Decrement first; the index is closed exactly when the count hits zero.
    if (--$this->_refCount == 0) {
        $this->_close();
    }
}
/**
* Object destructor
*/
public function __destruct()
{
    // _close() returns immediately if the index was already closed.
    $this->_close();
}
/**
* Returns an instance of Zend_Search_Lucene_Index_Writer for the index
*
* @return Zend_Search_Lucene_Index_Writer
*/
private function _getIndexWriter()
{
    // Reuse the writer once it has been created.
    if ($this->_writer !== null) {
        return $this->_writer;
    }

    // First use: load the writer class and build the instance.
    require_once 'Zend/Search/Lucene/Index/Writer.php';
    $this->_writer = new Zend_Search_Lucene_Index_Writer(
        $this->_directory,
        $this->_segmentInfos,
        $this->_formatVersion
    );

    return $this->_writer;
}
/**
* Returns the Zend_Search_Lucene_Storage_Directory instance for this index.
*
* @return Zend_Search_Lucene_Storage_Directory
*/
public function getDirectory()
{
    // Set by the constructor; reset to null once the index is closed.
    return $this->_directory;
}
/**
* Returns the total number of documents in this index (including deleted documents).
*
* @return integer
*/
public function count()
{
    // Sum of the segment sizes accumulated while reading the segments file.
    return $this->_docCount;
}
/**
* Returns one greater than the largest possible document number.
* This may be used to, e.g., determine how big to allocate a structure which will have
* an element for every document number in an index.
*
* @return integer
*/
public function maxDoc()
{
    // Same value as count().
    return $this->count();
}
/**
* Returns the total number of non-deleted documents in this index.
*
* @return integer
*/
public function numDocs()
{
    // Add up the per-segment numDocs() values.
    $total = 0;

    foreach ($this->_segmentInfos as $segment) {
        $total += $segment->numDocs();
    }

    return $total;
}
/**
* Checks, that document is deleted
*
* @param integer $id
* @return boolean
* @throws Zend_Search_Lucene_Exception Exception is thrown if $id is out of the range
*/
public function isDeleted($id)
{
    if ($id >= $this->_docCount) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
    }

    // Walk the segments, tracking the first document id of each, until the
    // segment whose id range contains $id is reached.
    $firstIdInSegment = 0;
    foreach ($this->_segmentInfos as $segment) {
        $firstIdOfNext = $firstIdInSegment + $segment->count();
        if ($firstIdOfNext > $id) {
            break;
        }

        $firstIdInSegment = $firstIdOfNext;
    }

    // Ask that segment, using the id relative to the segment start.
    return $segment->isDeleted($id - $firstIdInSegment);
}
/**
* Set default search field.
*
* Null means, that search is performed through all fields by default
*
* Default value is null
*
* @param string $fieldName
*/
public static function setDefaultSearchField($fieldName)
{
    // Stored statically on the class, not on an index instance.
    self::$_defaultSearchField = $fieldName;
}
/**
* Get default search field.
*
* Null means, that search is performed through all fields by default
*
* @return string
*/
public static function getDefaultSearchField()
{
    // Null unless changed through setDefaultSearchField().
    return self::$_defaultSearchField;
}
/**
* Set result set limit.
*
* 0 (default) means no limit
*
* @param integer $limit
*/
public static function setResultSetLimit($limit)
{
    // Stored statically; find() stops collecting hits once this many are found.
    self::$_resultSetLimit = $limit;
}
/**
* Get result set limit.
*
* 0 means no limit
*
* @return integer
*/
public static function getResultSetLimit()
{
    // 0 unless changed through setResultSetLimit().
    return self::$_resultSetLimit;
}
/**
* Set terms per query limit.
*
* 0 means no limit
*
* @param integer $limit
*/
public static function setTermsPerQueryLimit($limit)
{
    // Stored statically on the class, not on an index instance.
    self::$_termsPerQueryLimit = $limit;
}
/**
* Get terms per query limit.
*
* 0 means no limit (the default limit is 1024)
*
* @return integer
*/
public static function getTermsPerQueryLimit()
{
    // 1024 unless changed through setTermsPerQueryLimit().
    return self::$_termsPerQueryLimit;
}
/**
* Retrieve index maxBufferedDocs option
*
* maxBufferedDocs is a minimal number of documents required before
* the buffered in-memory documents are written into a new Segment
*
* Default value is 10
*
* @return integer
*/
public function getMaxBufferedDocs()
{
    // The option is held by the index writer, which is created on first use.
    $writer = $this->_getIndexWriter();

    return $writer->maxBufferedDocs;
}
/**
* Set index maxBufferedDocs option
*
* maxBufferedDocs is a minimal number of documents required before
* the buffered in-memory documents are written into a new Segment
*
* Default value is 10
*
* @param integer $maxBufferedDocs
*/
public function setMaxBufferedDocs($maxBufferedDocs)
{
    // The option is held by the index writer, which is created on first use.
    $writer = $this->_getIndexWriter();

    $writer->maxBufferedDocs = $maxBufferedDocs;
}
/**
* Retrieve index maxMergeDocs option
*
* maxMergeDocs is a largest number of documents ever merged by addDocument().
* Small values (e.g., less than 10,000) are best for interactive indexing,
* as this limits the length of pauses while indexing to a few seconds.
* Larger values are best for batched indexing and speedier searches.
*
* Default value is PHP_INT_MAX
*
* @return integer
*/
public function getMaxMergeDocs()
{
    // The option is held by the index writer, which is created on first use.
    $writer = $this->_getIndexWriter();

    return $writer->maxMergeDocs;
}
/**
* Set index maxMergeDocs option
*
* maxMergeDocs is a largest number of documents ever merged by addDocument().
* Small values (e.g., less than 10,000) are best for interactive indexing,
* as this limits the length of pauses while indexing to a few seconds.
* Larger values are best for batched indexing and speedier searches.
*
* Default value is PHP_INT_MAX
*
* @param integer $maxMergeDocs
*/
public function setMaxMergeDocs($maxMergeDocs)
{
    // The option is held by the index writer, which is created on first use.
    $writer = $this->_getIndexWriter();

    $writer->maxMergeDocs = $maxMergeDocs;
}
/**
* Retrieve index mergeFactor option
*
* mergeFactor determines how often segment indices are merged by addDocument().
* With smaller values, less RAM is used while indexing,
* and searches on unoptimized indices are faster,
* but indexing speed is slower.
* With larger values, more RAM is used during indexing,
* and while searches on unoptimized indices are slower,
* indexing is faster.
* Thus larger values (> 10) are best for batch index creation,
* and smaller values (< 10) for indices that are interactively maintained.
*
* Default value is 10
*
* @return integer
*/
public function getMergeFactor()
{
    // The option is held by the index writer, which is created on first use.
    $writer = $this->_getIndexWriter();

    return $writer->mergeFactor;
}
/**
* Set index mergeFactor option
*
* mergeFactor determines how often segment indices are merged by addDocument().
* With smaller values, less RAM is used while indexing,
* and searches on unoptimized indices are faster,
* but indexing speed is slower.
* With larger values, more RAM is used during indexing,
* and while searches on unoptimized indices are slower,
* indexing is faster.
* Thus larger values (> 10) are best for batch index creation,
* and smaller values (< 10) for indices that are interactively maintained.
*
* Default value is 10
*
* @param integer $mergeFactor
*/
public function setMergeFactor($mergeFactor)
{
    // The option is held by the index writer, which is created on first use.
    $writer = $this->_getIndexWriter();

    $writer->mergeFactor = $mergeFactor;
}
/**
* Performs a query against the index and returns an array
* of Zend_Search_Lucene_Search_QueryHit objects.
* Input is a string or Zend_Search_Lucene_Search_Query.
*
* @param Zend_Search_Lucene_Search_Query|string $query
* @return array Zend_Search_Lucene_Search_QueryHit
* @throws Zend_Search_Lucene_Exception
*/
public function find($query)
{
// Runs $query against the index and returns the matching hits.
// With a single argument the hits are ordered by score (descending), then id.
// Additional arguments name fields to sort by, each optionally followed by
// integer sort flags, and are turned into array_multisort() arguments.
// String queries are parsed into query objects first
if (is_string($query)) {
require_once 'Zend/Search/Lucene/Search/QueryParser.php';
$query = Zend_Search_Lucene_Search_QueryParser::parse($query);
}
if (!$query instanceof Zend_Search_Lucene_Search_Query) {
require_once 'Zend/Search/Lucene/Exception.php';
throw new Zend_Search_Lucene_Exception('Query must be a string or Zend_Search_Lucene_Search_Query object');
}
// commit() is called before the query is executed
$this->commit();
// $hits, $scores and $ids are parallel arrays: one entry per accepted hit
$hits = array();
$scores = array();
$ids = array();
$query = $query->rewrite($this)->optimize($this);
$query->execute($this);
$topScore = 0;
/** Zend_Search_Lucene_Search_QueryHit */
require_once 'Zend/Search/Lucene/Search/QueryHit.php';
foreach ($query->matchedDocs() as $id => $num) {
$docScore = $query->score($id, $this);
// Documents scoring exactly 0 are not reported as hits
if( $docScore != 0 ) {
$hit = new Zend_Search_Lucene_Search_QueryHit($this);
$hit->id = $id;
$hit->score = $docScore;
$hits[] = $hit;
$ids[] = $id;
$scores[] = $docScore;
if ($docScore > $topScore) {
$topScore = $docScore;
}
}
// Stop collecting once the static result set limit is reached (0 = no limit)
if (self::$_resultSetLimit != 0 && count($hits) >= self::$_resultSetLimit) {
break;
}
}
if (count($hits) == 0) {
// skip sorting, which may cause a error on empty index
return array();
}
// Normalise hit scores by the top score, but only when it exceeds 1.
// $scores keeps the raw values; it is only used as a sort key below.
if ($topScore > 1) {
foreach ($hits as $hit) {
$hit->score /= $topScore;
}
}
if (func_num_args() == 1) {
// sort by scores
array_multisort($scores, SORT_DESC, SORT_NUMERIC,
$ids, SORT_ASC, SORT_NUMERIC,
$hits);
} else {
// sort by given field names
$argList = func_get_args();
$fieldNames = $this->getFieldNames();
$sortArgs = array();
// PHP 5.3 now expects all arguments to array_multisort be passed by
// reference (if it's invoked through call_user_func_array());
// since constants can't be passed by reference, create some placeholder variables.
$sortReg = SORT_REGULAR;
$sortAsc = SORT_ASC;
$sortNum = SORT_NUMERIC;
$sortFieldValues = array();
require_once 'Zend/Search/Lucene/Exception.php';
// Index 0 is the query itself, so sort arguments start at index 1
for ($count = 1; $count < count($argList); $count++) {
$fieldName = $argList[$count];
if (!is_string($fieldName)) {
throw new Zend_Search_Lucene_Exception('Field name must be a string.');
}
// "score" (case-insensitive) sorts by the collected scores instead of a stored field
if (strtolower($fieldName) == 'score') {
$sortArgs[] = &$scores;
} else {
if (!in_array($fieldName, $fieldNames)) {
throw new Zend_Search_Lucene_Exception('Wrong field name.');
}
// Load each field's values once, even if the field is named repeatedly
if (!isset($sortFieldValues[$fieldName])) {
$valuesArray = array();
foreach ($hits as $hit) {
try {
$value = $hit->getDocument()->getFieldValue($fieldName);
} catch (Zend_Search_Lucene_Exception $e) {
// A "not found" failure yields a null sort value; anything else is re-thrown
if (strpos($e->getMessage(), 'not found') === false) {
throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
} else {
$value = null;
}
}
$valuesArray[] = $value;
}
// Collect loaded values in $sortFieldValues
// Required for PHP 5.3 which translates references into values when source
// variable is destroyed
$sortFieldValues[$fieldName] = $valuesArray;
}
$sortArgs[] = &$sortFieldValues[$fieldName];
}
// Up to two integer flags may follow a field name. When only one is
// given, the missing one is filled in: SORT_REGULAR if the given flag is
// SORT_ASC/SORT_DESC, otherwise SORT_ASC.
if ($count + 1 < count($argList) && is_integer($argList[$count+1])) {
$count++;
$sortArgs[] = &$argList[$count];
if ($count + 1 < count($argList) && is_integer($argList[$count+1])) {
$count++;
$sortArgs[] = &$argList[$count];
} else {
if ($argList[$count] == SORT_ASC || $argList[$count] == SORT_DESC) {
$sortArgs[] = &$sortReg;
} else {
$sortArgs[] = &$sortAsc;
}
}
} else {
// No flags given for this field: ascending, regular comparison
$sortArgs[] = &$sortAsc;
$sortArgs[] = &$sortReg;
}
}
// Sort by id's if values are equal
$sortArgs[] = &$ids;
$sortArgs[] = &$sortAsc;
$sortArgs[] = &$sortNum;
// Array to be sorted
$sortArgs[] = &$hits;
// Do sort
call_user_func_array('array_multisort', $sortArgs);
}
return $hits;
}
/**
* Returns a list of all unique field names that exist in this index.
*
* @param boolean $indexed
* @return array
*/
public function getFieldNames($indexed = false)
{
    // Fold the field list reported by each segment into one array,
    // visiting the segments in their stored order.
    $fieldNames = array();

    foreach ($this->_segmentInfos as $segment) {
        $segmentFields = $segment->getFields($indexed);
        $fieldNames    = array_merge($fieldNames, $segmentFields);
    }

    return $fieldNames;
}
/**
* Returns a Zend_Search_Lucene_Document object for the document
* number $id in this index.
*
* @param integer|Zend_Search_Lucene_Search_QueryHit $id
* @return Zend_Search_Lucene_Document
* @throws Zend_Search_Lucene_Exception Exception is thrown if $id is out of the range
*/
public function getDocument($id)
{
    // A query hit may be passed in place of a raw document number.
    if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
        /* @var $id Zend_Search_Lucene_Search_QueryHit */
        $id = $id->id;
    }

    if ($id >= $this->_docCount) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
    }

    // Walk the segments until the one whose id range covers $id is reached.
    // $segment still refers to that segment once the loop has been left.
    $segmentBase = 0;
    foreach ($this->_segmentInfos as $segment) {
        if ($id < $segmentBase + $segment->count()) {
            break;
        }
        $segmentBase += $segment->count();
    }
    $localId = $id - $segmentBase;

    // The .fdx file is addressed at 8 bytes per document; the long read there
    // is the position of the document's stored fields inside the .fdt file.
    $indexFile = $segment->openCompoundFile('.fdx');
    $indexFile->seek($localId * 8, SEEK_CUR);
    $dataOffset = $indexFile->readLong();

    $dataFile = $segment->openCompoundFile('.fdt');
    $dataFile->seek($dataOffset, SEEK_CUR);
    $storedFields = $dataFile->readVInt();

    $document = new Zend_Search_Lucene_Document();
    for ($read = 0; $read < $storedFields; $read++) {
        $fieldNumber = $dataFile->readVInt();
        $flags       = $dataFile->readByte();
        $info        = $segment->getField($fieldNumber);

        // Bit 0 is forwarded as the sixth constructor argument
        // (presumably the "tokenized" flag - confirm against Zend_Search_Lucene_Field).
        $lowBit = $flags & 1;

        if ($flags & 2) {
            // Binary data
            $field = new Zend_Search_Lucene_Field(
                $info->name,
                $dataFile->readBinary(),
                '',
                true,
                $info->isIndexed,
                $lowBit,
                true
            );
        } else {
            // Text data
            $field = new Zend_Search_Lucene_Field(
                $info->name,
                $dataFile->readString(),
                'UTF-8',
                true,
                $info->isIndexed,
                $lowBit
            );
        }

        $document->addField($field);
    }

    return $document;
}
/**
* Returns true if the index contains documents with the specified term.
*
* Is used for query optimization.
*
* @param Zend_Search_Lucene_Index_Term $term
* @return boolean
*/
public function hasTerm(Zend_Search_Lucene_Index_Term $term)
{
    // The first segment that has term info for $term settles the answer.
    foreach ($this->_segmentInfos as $segment) {
        $termInfo = $segment->getTermInfo($term);
        if ($termInfo === null) {
            continue;
        }

        return true;
    }

    return false;
}
/**
* Returns IDs of all documents containing term.
*
* @param Zend_Search_Lucene_Index_Term $term
* @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
* @return array
*/
public function termDocs(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
{
    // Query every segment, handing it the index-wide id of its first document.
    $perSegment = array();
    $firstDocId = 0;
    foreach ($this->_segmentInfos as $segment) {
        $perSegment[] = $segment->termDocs($term, $firstDocId, $docsFilter);
        $firstDocId  += $segment->count();
    }

    $segmentCount = count($perSegment);

    if ($segmentCount == 0) {
        return array();
    }

    if ($segmentCount == 1) {
        // Index is optimized (only one segment)
        // Do not perform array reindexing
        return reset($perSegment);
    }

    return call_user_func_array('array_merge', $perSegment);
}
/**
* Returns documents filter for all documents containing term.
*
* It performs the same operation as termDocs, but return result as
* Zend_Search_Lucene_Index_DocsFilter object
*
* @param Zend_Search_Lucene_Index_Term $term
* @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
* @return Zend_Search_Lucene_Index_DocsFilter
*/
public function termDocsFilter(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
{
    // Must start out as an array: with no segments the loop below never runs
    // and count() would otherwise be given an undefined variable.
    $subResults        = array();
    $segmentStartDocId = 0;

    // Ask each segment for its matches, passing the index-wide id of the
    // segment's first document.
    foreach ($this->_segmentInfos as $segmentInfo) {
        $subResults[] = $segmentInfo->termDocs($term, $segmentStartDocId, $docsFilter);
        $segmentStartDocId += $segmentInfo->count();
    }

    // NOTE(review): the value returned is whatever the segments' termDocs()
    // calls produce (merged with array_merge when there are several), not a
    // Zend_Search_Lucene_Index_DocsFilter instance - confirm the intended contract.
    if (count($subResults) == 0) {
        return array();
    }

    if (count($subResults) == 1) {
        // Index is optimized (only one segment)
        // Do not perform array reindexing
        return reset($subResults);
    }

    return call_user_func_array('array_merge', $subResults);
}
/**
* Returns an array of all term freqs.
* Result array structure: array(docId => freq, ...)
*
* @param Zend_Search_Lucene_Index_Term $term
* @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
* @return array
*/
public function termFreqs(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
{
    // Combine the per-segment results with the array union operator, so keys
    // already present in the accumulated array are kept as they are.
    $frequencies = array();
    $firstDocId  = 0;

    foreach ($this->_segmentInfos as $segment) {
        $frequencies = $frequencies + $segment->termFreqs($term, $firstDocId, $docsFilter);
        $firstDocId  = $firstDocId + $segment->count();
    }

    return $frequencies;
}
/**
* Returns an array of all term positions in the documents.
* Result array structure: array(docId => array(pos1, pos2, ...), ...)
*
* @param Zend_Search_Lucene_Index_Term $term
* @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
* @return array
*/
public function termPositions(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
{
    // Combine the per-segment results with the array union operator, so keys
    // already present in the accumulated array are kept as they are.
    $positions  = array();
    $firstDocId = 0;

    foreach ($this->_segmentInfos as $segment) {
        $positions  = $positions + $segment->termPositions($term, $firstDocId, $docsFilter);
        $firstDocId = $firstDocId + $segment->count();
    }

    return $positions;
}
/**
* Returns the number of documents in this index containing the $term.
*
* @param Zend_Search_Lucene_Index_Term $term
* @return integer
*/
public function docFreq(Zend_Search_Lucene_Index_Term $term)
{
    // Sum the docFreq of the term over every segment that knows it.
    $total = 0;

    foreach ($this->_segmentInfos as $segment) {
        $info = $segment->getTermInfo($term);
        if ($info === null) {
            continue;
        }

        $total += $info->docFreq;
    }

    return $total;
}
/**
* Retrieve similarity used by index reader
*
* @return Zend_Search_Lucene_Search_Similarity
*/
public function getSimilarity()
{
    /** Zend_Search_Lucene_Search_Similarity */
    require_once 'Zend/Search/Lucene/Search/Similarity.php';

    // The index holds no similarity of its own; the default one is handed out.
    $similarity = Zend_Search_Lucene_Search_Similarity::getDefault();

    return $similarity;
}
/**
* Returns a normalization factor for "field, document" pair.
*
* @param integer $id
* @param string $fieldName
* @return float
*/
public function norm($id, $fieldName)
{
    // Ids at or beyond the document count yield null.
    if ($id >= $this->_docCount) {
        return null;
    }

    // Find the segment whose id range covers $id; $segment keeps referring
    // to it after the loop.
    $segmentBase = 0;
    foreach ($this->_segmentInfos as $segment) {
        if ($id < $segmentBase + $segment->count()) {
            break;
        }
        $segmentBase += $segment->count();
    }

    // Deleted documents get a norm of 0.
    if ($segment->isDeleted($id - $segmentBase)) {
        return 0;
    }

    return $segment->norm($id - $segmentBase, $fieldName);
}
/**
* Returns true if any documents have been deleted from this index.
*
* @return boolean
*/
public function hasDeletions()
{
    // Stop at the first segment that reports deletions.
    foreach ($this->_segmentInfos as $segment) {
        if (!$segment->hasDeletions()) {
            continue;
        }

        return true;
    }

    return false;
}
/**
* Deletes a document from the index.
* $id is an internal document id
*
* @param integer|Zend_Search_Lucene_Search_QueryHit $id
* @throws Zend_Search_Lucene_Exception
*/
public function delete($id)
{
    // A query hit may be passed in place of a raw document number.
    if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
        /* @var $id Zend_Search_Lucene_Search_QueryHit */
        $id = $id->id;
    }

    if ($id >= $this->_docCount) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
    }

    // Find the segment whose id range covers $id; $segment keeps referring
    // to it after the loop.
    $segmentBase = 0;
    foreach ($this->_segmentInfos as $segment) {
        if ($id < $segmentBase + $segment->count()) {
            break;
        }
        $segmentBase += $segment->count();
    }

    // The segment is addressed with its own, segment-relative document number.
    $segment->delete($id - $segmentBase);
    $this->_hasChanges = true;
}
/**
* Adds a document to this index.
*
* @param Zend_Search_Lucene_Document $document
*/
public function addDocument(Zend_Search_Lucene_Document $document)
{
    // Hand the document to the index writer first; the bookkeeping below is
    // only reached once that call has returned.
    $writer = $this->_getIndexWriter();
    $writer->addDocument($document);

    ++$this->_docCount;
    $this->_hasChanges = true;
}
/**
* Update document counter
*/
private function _updateDocCount()
{
    // Recompute the counter from scratch as the sum of all segment sizes.
    $this->_docCount = 0;

    foreach ($this->_segmentInfos as $segment) {
        $this->_docCount = $this->_docCount + $segment->count();
    }
}
/**
* Commit changes resulting from delete() or undeleteAll() operations.
*
* @todo undeleteAll processing.
*/
public function commit()
{
    // Nothing to do unless a change has been recorded since the last commit.
    if (!$this->_hasChanges) {
        return;
    }

    $writer = $this->_getIndexWriter();
    $writer->commit();

    // Refresh the document counter, then clear the pending-changes flag.
    $this->_updateDocCount();
    $this->_hasChanges = false;
}
/**
* Optimize index.
*
* Merges all segments into one
*/
public function optimize()
{
    // Commit changes if any changes have been made
    $this->commit();

    // With at most one segment and no deletions there is nothing to optimize.
    if (!(count($this->_segmentInfos) > 1) && !$this->hasDeletions()) {
        return;
    }

    $writer = $this->_getIndexWriter();
    $writer->optimize();
    $this->_updateDocCount();
}
/**
* Returns an array of all terms in this index.
*
* @return array
*/
public function terms()
{
    /** Zend_Search_Lucene_Index_TermsPriorityQueue */
    require_once 'Zend/Search/Lucene/Index/TermsPriorityQueue.php';

    // Rewind each segment's terms stream and queue every segment that has
    // at least one term.
    $queue = new Zend_Search_Lucene_Index_TermsPriorityQueue();
    foreach ($this->_segmentInfos as $segment) {
        $segment->resetTermsStream();

        // Skip "empty" segments
        if ($segment->currentTerm() === null) {
            continue;
        }

        $queue->put($segment);
    }

    $terms = array();
    for ($segment = $queue->pop(); $segment !== null; $segment = $queue->pop()) {
        // A term is recorded only when the segment now at the top of the
        // queue is not positioned on a term with the same key.
        $isNewTerm = $queue->top() === null
                  || $queue->top()->currentTerm()->key() != $segment->currentTerm()->key();

        if ($isNewTerm) {
            $terms[] = $segment->currentTerm();
        }

        if ($segment->nextTerm() !== null) {
            // Put segment back into the priority queue
            $queue->put($segment);
        }
    }

    return $terms;
}
/**
* Terms stream priority queue object
*
* Created by resetTermsStream() when it is still null and set back to null by
* closeTermsStream(); skipTo(), nextTerm() and currentTerm() dereference it
* without checking for null.
*
* @var Zend_Search_Lucene_TermStreamsPriorityQueue
*/
private $_termsStream = null;
/**
* Reset terms stream.
*/
public function resetTermsStream()
{
    // An existing stream is simply reset.
    if ($this->_termsStream !== null) {
        $this->_termsStream->resetTermsStream();
        return;
    }

    // Otherwise build a new stream over the current segments.
    /** Zend_Search_Lucene_TermStreamsPriorityQueue */
    require_once 'Zend/Search/Lucene/TermStreamsPriorityQueue.php';

    $this->_termsStream = new Zend_Search_Lucene_TermStreamsPriorityQueue($this->_segmentInfos);
}
/**
* Skip terms stream up to specified term prefix.
*
* Prefix contains fully specified field info and portion of searched term
*
* @param Zend_Search_Lucene_Index_Term $prefix
*/
public function skipTo(Zend_Search_Lucene_Index_Term $prefix)
{
    // Delegates to the terms stream; no null check is made on it here.
    $stream = $this->_termsStream;
    $stream->skipTo($prefix);
}
/**
* Scans terms dictionary and returns next term
*
* @return Zend_Search_Lucene_Index_Term|null
*/
public function nextTerm()
{
    // Delegates to the terms stream; no null check is made on it here.
    $stream = $this->_termsStream;

    return $stream->nextTerm();
}
/**
* Returns term in current position
*
* @return Zend_Search_Lucene_Index_Term|null
*/
public function currentTerm()
{
    // Delegates to the terms stream; no null check is made on it here.
    $stream = $this->_termsStream;

    return $stream->currentTerm();
}
/**
* Close terms stream
*
* Should be used for resources clean up if stream is not read up to the end
*/
public function closeTermsStream()
{
    // Close the stream, then drop the reference so that the next
    // resetTermsStream() call builds a new one.
    $stream = $this->_termsStream;
    $stream->closeTermsStream();

    $this->_termsStream = null;
}
/*************************************************************************
@todo UNIMPLEMENTED
*************************************************************************/
/**
* Undeletes all documents currently marked as deleted in this index.
*
* @todo Implementation
*/
public function undeleteAll()
{
    // Intentionally empty: not implemented yet.
}
}

View File

@ -0,0 +1,175 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Analyzer.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** User land classes and interfaces turned on by Zend/Search/Analyzer.php file inclusion. */
/** @todo Section should be removed with ZF 2.0 release as obsolete */
// Load every bundled Common analyzer, unless the constant
// ZEND_SEARCH_LUCENE_COMMON_ANALYZER_PROCESSED is already defined when this
// file is processed.
if (!defined('ZEND_SEARCH_LUCENE_COMMON_ANALYZER_PROCESSED')) {
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8/CaseInsensitive.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num/CaseInsensitive.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';
/** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php';
}
/**
* An Analyzer is used to analyze text.
* It thus represents a policy for extracting index terms from text.
*
* Note:
* Lucene Java implementation is oriented to streams. It provides effective work
* with huge documents (more than 20Mb).
* But the engine itself is not oriented to such documents.
* Thus Zend_Search_Lucene analysis API works with data strings and sets (arrays).
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Analysis_Analyzer
{
    /**
     * Analyzer returned by getDefault(); replaced through setDefault().
     *
     * @var Zend_Search_Lucene_Analysis_Analyzer
     */
    private static $_defaultImpl;

    /**
     * Text currently being tokenized (null until setInput() is called).
     *
     * @var string
     */
    protected $_input = null;

    /**
     * Encoding of the text held in $_input.
     *
     * @var string
     */
    protected $_encoding = '';

    /**
     * Tokenize a text into terms.
     *
     * Sets the input and then collects everything nextToken() yields until
     * it returns null.
     *
     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
     *
     * @param string $data
     * @param string $encoding
     * @return array array of Zend_Search_Lucene_Analysis_Token objects
     */
    public function tokenize($data, $encoding = '')
    {
        $this->setInput($data, $encoding);

        $tokens = array();
        for ($token = $this->nextToken(); $token !== null; $token = $this->nextToken()) {
            $tokens[] = $token;
        }

        return $tokens;
    }

    /**
     * Tokenization stream API
     * Store the text and its encoding, then reset the token stream.
     *
     * @param string $data
     * @param string $encoding
     */
    public function setInput($data, $encoding = '')
    {
        $this->_input    = $data;
        $this->_encoding = $encoding;

        $this->reset();
    }

    /**
     * Reset token stream
     */
    abstract public function reset();

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    abstract public function nextToken();

    /**
     * Set the default Analyzer implementation used by indexing code.
     *
     * @param Zend_Search_Lucene_Analysis_Analyzer $analyzer
     */
    public static function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
    {
        self::$_defaultImpl = $analyzer;
    }

    /**
     * Return the default Analyzer implementation used by indexing code.
     *
     * When none has been set, a case insensitive Text analyzer is created,
     * remembered and returned.
     *
     * @return Zend_Search_Lucene_Analysis_Analyzer
     */
    public static function getDefault()
    {
        /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
        require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';

        if (self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer) {
            return self::$_defaultImpl;
        }

        self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();

        return self::$_defaultImpl;
    }
}

View File

@ -0,0 +1,92 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Common.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Define constant used to provide correct file processing order */
/** @todo Section should be removed with ZF 2.0 release as obsolete */
// The constant is defined before Analyzer.php is required below; Analyzer.php
// only loads the bundled Common analyzers when this constant is not defined.
define('ZEND_SEARCH_LUCENE_COMMON_ANALYZER_PROCESSED', true);
/** Zend_Search_Lucene_Analysis_Analyzer */
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
/** Zend_Search_Lucene_Analysis_Token */
require_once 'Zend/Search/Lucene/Analysis/Token.php';
/** Zend_Search_Lucene_Analysis_TokenFilter */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter.php';
/**
* Common implementation of the Zend_Search_Lucene_Analysis_Analyzer interface.
* There are several standard subclasses provided by Zend_Search_Lucene/Analysis
* subpackage: Zend_Search_Lucene_Analysis_Analyzer_Common_Text, ZSearchHTMLAnalyzer, ZSearchXMLAnalyzer.
*
* @todo ZSearchHTMLAnalyzer and ZSearchXMLAnalyzer implementation
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Analysis_Analyzer_Common extends Zend_Search_Lucene_Analysis_Analyzer
{
    /**
     * Token filters, applied by normalize() in the order they were added.
     * Array of Zend_Search_Lucene_Analysis_TokenFilter objects.
     *
     * @var array
     */
    private $_filters = array();

    /**
     * Append a Token filter to the Analyzer's filter chain.
     *
     * @param Zend_Search_Lucene_Analysis_TokenFilter $filter
     */
    public function addFilter(Zend_Search_Lucene_Analysis_TokenFilter $filter)
    {
        $this->_filters[] = $filter;
    }

    /**
     * Run the token through every registered filter.
     *
     * Processing stops as soon as a filter returns null, and null is
     * returned to signal that the token was removed.
     *
     * @param Zend_Search_Lucene_Analysis_Token $token
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function normalize(Zend_Search_Lucene_Analysis_Token $token)
    {
        $current = $token;

        foreach ($this->_filters as $filter) {
            $current = $filter->normalize($current);

            // a filter signals removal of the token by returning null
            if ($current === null) {
                return null;
            }
        }

        return $current;
    }
}

View File

@ -0,0 +1,96 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Text.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_Text extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Offset in the input at which the next search for a token starts
     *
     * @var integer
     */
    private $_position;

    /**
     * Reset token stream
     *
     * Rewinds the position and, when there is input, marks it as ASCII
     * (transliterating it with iconv on every platform except AIX).
     */
    public function reset()
    {
        $this->_position = 0;

        if ($this->_input !== null) {
            // convert input into ascii
            if (PHP_OS != 'AIX') {
                $this->_input = iconv($this->_encoding, 'ASCII//TRANSLIT', $this->_input);
            }
            $this->_encoding = 'ASCII';
        }
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * A token is a run of ASCII letters; tokens removed by the filters are
     * skipped and the search continues behind them.
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        while (true) {
            $found = preg_match('/[a-zA-Z]+/', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_position);
            if (!$found) {
                // Covers both "no further match" (0) and "error occurred" (false)
                return null;
            }

            list($word, $start) = $match[0];
            $end = $start + strlen($word);

            $this->_position = $end;

            $token = $this->normalize(new Zend_Search_Lucene_Analysis_Token($word, $start, $end));
            if ($token !== null) {
                return $token;
            }
            // token was removed by a filter: look for the next one
        }
    }
}

View File

@ -0,0 +1,47 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: CaseInsensitive.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
/** Zend_Search_Lucene_Analysis_TokenFilter_LowerCase */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive extends Zend_Search_Lucene_Analysis_Analyzer_Common_Text
{
    /**
     * Registers a LowerCase token filter on the Text analyzer.
     */
    public function __construct()
    {
        $lowerCase = new Zend_Search_Lucene_Analysis_TokenFilter_LowerCase();

        $this->addFilter($lowerCase);
    }
}

View File

@ -0,0 +1,95 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: TextNum.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Offset in the input at which the next search for a token starts
     *
     * @var integer
     */
    private $_position;

    /**
     * Reset token stream
     *
     * Rewinds the position and, when there is input, marks it as ASCII
     * (transliterating it with iconv on every platform except AIX).
     */
    public function reset()
    {
        $this->_position = 0;

        if ($this->_input !== null) {
            // convert input into ascii
            if (PHP_OS != 'AIX') {
                $this->_input = iconv($this->_encoding, 'ASCII//TRANSLIT', $this->_input);
            }
            $this->_encoding = 'ASCII';
        }
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * A token is a run of ASCII letters and digits; tokens removed by the
     * filters are skipped and the search continues behind them.
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        while (true) {
            $found = preg_match('/[a-zA-Z0-9]+/', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_position);
            if (!$found) {
                // Covers both "no further match" (0) and "error occurred" (false)
                return null;
            }

            list($word, $start) = $match[0];
            $end = $start + strlen($word);

            $this->_position = $end;

            $token = $this->normalize(new Zend_Search_Lucene_Analysis_Token($word, $start, $end));
            if ($token !== null) {
                return $token;
            }
            // token was removed by a filter: look for the next one
        }
    }
}

View File

@ -0,0 +1,47 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: CaseInsensitive.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';
/** Zend_Search_Lucene_Analysis_TokenFilter_LowerCase */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter/LowerCase.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive extends Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum
{
    /**
     * Registers a LowerCase token filter on the TextNum analyzer.
     */
    public function __construct()
    {
        $lowerCase = new Zend_Search_Lucene_Analysis_TokenFilter_LowerCase();

        $this->addFilter($lowerCase);
    }
}

View File

@ -0,0 +1,126 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Utf8.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Current char position in an UTF-8 stream
     *
     * @var integer
     */
    private $_position;

    /**
     * Current binary position in an UTF-8 stream
     *
     * @var integer
     */
    private $_bytePosition;

    /**
     * Object constructor
     *
     * @throws Zend_Search_Lucene_Exception when PCRE cannot match \pL with the /u modifier
     */
    public function __construct()
    {
        if (@preg_match('/\pL/u', 'a') != 1) {
            // PCRE unicode support is turned off
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Utf8 analyzer needs PCRE unicode support to be enabled.');
        }
    }

    /**
     * Reset token stream
     *
     * Rewinds both positions and converts the input to UTF-8 unless its
     * encoding is already named 'utf8' or 'utf-8' (case insensitive).
     */
    public function reset()
    {
        $this->_position     = 0;
        $this->_bytePosition = 0;

        // Nothing to convert without input. Passing null to iconv() is
        // deprecated on PHP 8.1+ and would turn the null input into ''.
        // This mirrors the guard in the Text and TextNum analyzers.
        if ($this->_input === null) {
            return;
        }

        // convert input into UTF-8
        if (strcasecmp($this->_encoding, 'utf8' ) != 0 &&
            strcasecmp($this->_encoding, 'utf-8') != 0 ) {
            $this->_input    = iconv($this->_encoding, 'UTF-8', $this->_input);
            $this->_encoding = 'UTF-8';
        }
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * A token is a run of Unicode letters. The token offsets passed to the
     * Token constructor are character positions, while the search itself
     * advances by byte position.
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        do {
            if (! preg_match('/[\p{L}]+/u', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) {
                // It covers both cases a) there are no matches (preg_match(...) === 0)
                // b) error occurred (preg_match(...) === FALSE)
                return null;
            }

            // matched string
            $matchedWord = $match[0][0];

            // binary position of the matched word in the input stream
            $binStartPos = $match[0][1];

            // character position of the matched word in the input stream:
            // previous character position plus the number of characters skipped
            // between the previous byte position and the start of the match
            $startPos = $this->_position +
                        iconv_strlen(substr($this->_input,
                                            $this->_bytePosition,
                                            $binStartPos - $this->_bytePosition),
                                     'UTF-8');

            // character position of the end of matched word in the input stream
            $endPos = $startPos + iconv_strlen($matchedWord, 'UTF-8');

            $this->_bytePosition = $binStartPos + strlen($matchedWord);
            $this->_position     = $endPos;

            $token = $this->normalize(new Zend_Search_Lucene_Analysis_Token($matchedWord, $startPos, $endPos));
        } while ($token === null); // try again if token is skipped

        return $token;
    }
}

View File

@ -0,0 +1,49 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: CaseInsensitive.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';
/** Zend_Search_Lucene_Analysis_TokenFilter_LowerCaseUtf8 */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter/LowerCaseUtf8.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive extends Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8
{
    /**
     * Runs the parent constructor's check first, then registers a
     * LowerCaseUtf8 token filter.
     */
    public function __construct()
    {
        parent::__construct();

        $lowerCase = new Zend_Search_Lucene_Analysis_TokenFilter_LowerCaseUtf8();

        $this->addFilter($lowerCase);
    }
}

View File

@ -0,0 +1,126 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Utf8Num.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Character offset (not byte offset) reached so far in the UTF-8 input
     *
     * @var integer
     */
    private $_position;

    /**
     * Byte offset reached so far in the UTF-8 input
     *
     * @var integer
     */
    private $_bytePosition;

    /**
     * Object constructor
     *
     * Probes PCRE with a unicode pattern and refuses to construct the analyzer
     * when the probe does not match.
     *
     * @throws Zend_Search_Lucene_Exception
     */
    public function __construct()
    {
        // preg_match() yields 1 only when the /u pattern compiled and matched
        $unicodeSupported = (@preg_match('/\pL/u', 'a') == 1);
        if ($unicodeSupported) {
            return;
        }

        // PCRE unicode support is turned off
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Utf8Num analyzer needs PCRE unicode support to be enabled.');
    }

    /**
     * Reset token stream
     *
     * Rewinds both offsets to the start of the input and, unless the declared
     * encoding is already "utf8"/"utf-8" (case-insensitive), converts the
     * input to UTF-8 with iconv().
     */
    public function reset()
    {
        $this->_bytePosition = 0;
        $this->_position     = 0;

        $alreadyUtf8 = strcasecmp($this->_encoding, 'utf8' ) == 0
                    || strcasecmp($this->_encoding, 'utf-8') == 0;
        if ($alreadyUtf8) {
            return;
        }

        // convert input into UTF-8
        $this->_input    = iconv($this->_encoding, 'UTF-8', $this->_input);
        $this->_encoding = 'UTF-8';
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * A token is a maximal run of unicode letters and digits. Each candidate
     * is passed through normalize(); candidates for which normalize() returns
     * null are skipped and scanning continues after them.
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        while (true) {
            $found = preg_match('/[\p{L}\p{N}]+/u', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition);
            if (! $found) {
                // Covers both "no more matches" (0) and "PCRE error" (false)
                return null;
            }

            // matched string and its byte offset in the input stream
            list($word, $wordByteOffset) = $match[0];

            // text between the previous token and this one; its character
            // count advances the character position up to the match
            $skippedText = substr($this->_input,
                                  $this->_bytePosition,
                                  $wordByteOffset - $this->_bytePosition);
            $startPos = $this->_position + iconv_strlen($skippedText, 'UTF-8');

            // character position just past the matched word
            $endPos = $startPos + iconv_strlen($word, 'UTF-8');

            $this->_bytePosition = $wordByteOffset + strlen($word);
            $this->_position     = $endPos;

            $token = $this->normalize(new Zend_Search_Lucene_Analysis_Token($word, $startPos, $endPos));
            if ($token !== null) {
                return $token;
            }
            // token was filtered out; look for the next one
        }
    }
}

View File

@ -0,0 +1,49 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: CaseInsensitive.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';
/** Zend_Search_Lucene_Analysis_TokenFilter_LowerCaseUtf8 */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter/LowerCaseUtf8.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive extends Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num
{
    /**
     * Object constructor
     *
     * Runs the Utf8Num analyzer constructor and then hands a UTF-8 aware
     * lower-casing token filter to addFilter().
     */
    public function __construct()
    {
        parent::__construct();

        $lowerCaseFilter = new Zend_Search_Lucene_Analysis_TokenFilter_LowerCaseUtf8();
        $this->addFilter($lowerCaseFilter);
    }
}

View File

@ -0,0 +1,154 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Token.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_Token
{
    /**
     * Term text carried by this token.
     *
     * @var string
     */
    private $_termText;

    /**
     * Offset in the source text where the token starts.
     *
     * @var integer
     */
    private $_startOffset;

    /**
     * Offset in the source text where the token ends.
     *
     * @var integer
     */
    private $_endOffset;

    /**
     * Position of this token relative to the previous token (one by default).
     *
     * Typical uses:
     *  - Zero places several terms at the same position, e.g. when a word has
     *    multiple stems. Phrase searches including either stem then match.
     *    Only the first stem keeps an increment of one; the others use zero.
     *    Repeating a token with a zero increment can also boost the scores of
     *    matches on that token.
     *  - Values above one inhibit exact phrase matches. A stop word filter
     *    can, for instance, set the increment of each surviving word to the
     *    number of stop words removed before it, so exact phrase queries only
     *    match when no stop words intervened.
     *
     * @var integer
     */
    private $_positionIncrement;

    /**
     * Object constructor
     *
     * The position increment starts at one.
     *
     * @param string  $text  term text
     * @param integer $start start offset in the source text
     * @param integer $end   end offset in the source text
     */
    public function __construct($text, $start, $end)
    {
        $this->_positionIncrement = 1;

        $this->_termText    = $text;
        $this->_startOffset = $start;
        $this->_endOffset   = $end;
    }

    /**
     * Returns the Token's term text.
     *
     * @return string
     */
    public function getTermText()
    {
        return $this->_termText;
    }

    /**
     * Returns this Token's starting offset, the position of the first
     * character corresponding to this token in the source text.
     *
     * Note: getEndOffset() - getStartOffset() is not guaranteed to equal
     * strlen(getTermText()), because a stemmer or another filter may have
     * altered the term text.
     *
     * @return integer
     */
    public function getStartOffset()
    {
        return $this->_startOffset;
    }

    /**
     * Returns this Token's ending offset, one greater than the position of
     * the last character corresponding to this token in the source text.
     *
     * @return integer
     */
    public function getEndOffset()
    {
        return $this->_endOffset;
    }

    /**
     * Returns the position increment of this Token.
     *
     * @return integer
     */
    public function getPositionIncrement()
    {
        return $this->_positionIncrement;
    }

    /**
     * positionIncrement setter
     *
     * @param integer $positionIncrement
     */
    public function setPositionIncrement($positionIncrement)
    {
        $this->_positionIncrement = $positionIncrement;
    }
}

View File

@ -0,0 +1,47 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: TokenFilter.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Analysis_Token */
require_once 'Zend/Search/Lucene/Analysis/Token.php';
/**
* Token filter converts (normalizes) a Token or removes it from a token stream.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Analysis_TokenFilter
{
/**
* Normalize a Token or remove it from the stream.
*
* Implementations return the token that should be kept (either $srcToken
* itself or a replacement token), or null to signal that the token is removed.
*
* @param Zend_Search_Lucene_Analysis_Token $srcToken token to inspect
* @return Zend_Search_Lucene_Analysis_Token|null token to keep, or null if it is removed
*/
abstract public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken);
}

View File

@ -0,0 +1,58 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: LowerCase.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Analysis_TokenFilter */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter.php';
/**
* Lower case Token filter.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_TokenFilter_LowerCase extends Zend_Search_Lucene_Analysis_TokenFilter
{
    /**
     * Produce a lower-cased copy of the token.
     *
     * The term text is lower-cased with strtolower(); offsets and position
     * increment are copied from the source token. This filter never removes
     * a token.
     *
     * @param Zend_Search_Lucene_Analysis_Token $srcToken
     * @return Zend_Search_Lucene_Analysis_Token
     */
    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
    {
        $loweredText = strtolower( $srcToken->getTermText() );

        $loweredToken = new Zend_Search_Lucene_Analysis_Token(
            $loweredText,
            $srcToken->getStartOffset(),
            $srcToken->getEndOffset()
        );
        $loweredToken->setPositionIncrement($srcToken->getPositionIncrement());

        return $loweredToken;
    }
}

View File

@ -0,0 +1,70 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: LowerCaseUtf8.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Analysis_TokenFilter */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter.php';
/**
* Lower case Token filter.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_TokenFilter_LowerCaseUtf8 extends Zend_Search_Lucene_Analysis_TokenFilter
{
    /**
     * Object constructor
     *
     * Refuses to construct the filter when mb_strtolower() is unavailable.
     *
     * @throws Zend_Search_Lucene_Exception
     */
    public function __construct()
    {
        if (function_exists('mb_strtolower')) {
            return;
        }

        // mbstring extension is disabled
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Utf8 compatible lower case filter needs mbstring extension to be enabled.');
    }

    /**
     * Produce a lower-cased copy of the token.
     *
     * The term text is lower-cased as UTF-8 with mb_strtolower(); offsets and
     * position increment are copied from the source token. This filter never
     * removes a token.
     *
     * @param Zend_Search_Lucene_Analysis_Token $srcToken
     * @return Zend_Search_Lucene_Analysis_Token
     */
    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken)
    {
        $loweredText = mb_strtolower($srcToken->getTermText(), 'UTF-8');

        $loweredToken = new Zend_Search_Lucene_Analysis_Token(
            $loweredText,
            $srcToken->getStartOffset(),
            $srcToken->getEndOffset()
        );
        $loweredToken->setPositionIncrement($srcToken->getPositionIncrement());

        return $loweredToken;
    }
}

View File

@ -0,0 +1,69 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: ShortWords.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Analysis_TokenFilter */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter.php';
/**
* Token filter that removes short words. The minimum length a word needs to pass is set through the constructor.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_TokenFilter_ShortWords extends Zend_Search_Lucene_Analysis_TokenFilter
{
    /**
     * Minimum allowed term length (compared against strlen() of the term text)
     * @var integer
     */
    private $length;

    /**
     * Constructs new instance of this filter.
     *
     * @param integer $length minimum allowed length of term which passes this filter (default 2)
     */
    public function __construct($length = 2) {
        $this->length = $length;
    }

    /**
     * Keep the token unchanged, or remove it (null) when its term text is
     * shorter than the configured minimum.
     *
     * The length is measured with strlen(), i.e. in bytes.
     *
     * @param Zend_Search_Lucene_Analysis_Token $srcToken
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken) {
        $isTooShort = strlen($srcToken->getTermText()) < $this->length;

        return $isTooShort ? null : $srcToken;
    }
}

View File

@ -0,0 +1,101 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: StopWords.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Analysis_TokenFilter */
require_once 'Zend/Search/Lucene/Analysis/TokenFilter.php';
/**
* Token filter that removes stop words. These words must be provided as a plain list (the
* constructor flips it into a lookup set), example: $stopwords = array('the', 'an');
*
* We do recommend to provide all words in lowercase and concatenate this class after the lowercase filter.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Analysis
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Analysis_TokenFilter_StopWords extends Zend_Search_Lucene_Analysis_TokenFilter
{
    /**
     * Stop word set. The words are the array KEYS; the values carry no meaning.
     * @var array
     */
    private $_stopSet;

    /**
     * Constructs new instance of this filter.
     *
     * @param array $stopwords list of words that will be filtered out, e.g. array('the', 'an').
     *                         The list is flipped so that the words become lookup keys.
     */
    public function __construct($stopwords = array()) {
        $this->_stopSet = array_flip($stopwords);
    }

    /**
     * Keep the token unchanged, or remove it (null) when its term text is a stop word.
     *
     * The comparison is an exact key lookup, so it is case-sensitive.
     *
     * @param Zend_Search_Lucene_Analysis_Token $srcToken
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function normalize(Zend_Search_Lucene_Analysis_Token $srcToken) {
        if (array_key_exists($srcToken->getTermText(), $this->_stopSet)) {
            return null;
        } else {
            return $srcToken;
        }
    }

    /**
     * Fills stopwords set from a text file. Each line contains one stopword. Lines are trimmed
     * first; empty lines and lines whose first remaining character is '#' are ignored (comments).
     *
     * You can call this method one or more times. New stopwords are always added to current set.
     *
     * @param string $filepath full path for text file with stopwords
     * @throws Zend_Search_Lucene_Exception When the path is empty, does not exist, is a directory,
     *                                      or the file cannot be opened or closed.
     */
    public function loadFromFile($filepath = null) {
        // A directory passes file_exists() but cannot be read line by line, so reject it here.
        if (! $filepath || ! file_exists($filepath) || is_dir($filepath)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('You have to provide valid file path');
        }
        $fd = fopen($filepath, "r");
        if (! $fd) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Cannot open file ' . $filepath);
        }
        // Loop on the result of fgets() rather than on feof(): when a read fails,
        // feof() may never become true and a "while (!feof())" loop would spin forever.
        while (($line = fgets($fd)) !== false) {
            $buffer = trim($line);
            if (strlen($buffer) > 0 && $buffer[0] != '#') {
                $this->_stopSet[$buffer] = 1;
            }
        }
        if (!fclose($fd)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Cannot close file ' . $filepath);
        }
    }
}

View File

@ -0,0 +1,131 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Document.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Field */
require_once 'Zend/Search/Lucene/Field.php';
/**
* A Document is a set of fields. Each field has a name and a textual value.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Document
{
    /**
     * Fields of this document, keyed by field name
     * (name => Zend_Search_Lucene_Field).
     *
     * @var array
     */
    protected $_fields = array();

    /**
     * Document boost factor
     * It is not stored directly in the index, but it affects the normalization factor
     *
     * @var float
     */
    public $boost = 1.0;

    /**
     * Property-style shortcut for getFieldValue(): $doc->title is the same as
     * $doc->getFieldValue('title').
     *
     * @param string $offset field name
     * @return string
     * @throws Zend_Search_Lucene_Exception when no field with that name exists
     */
    public function __get($offset)
    {
        return $this->getFieldValue($offset);
    }

    /**
     * Add a field object to this document.
     *
     * A field added under a name that is already present replaces the
     * earlier field.
     *
     * @param Zend_Search_Lucene_Field $field
     * @return Zend_Search_Lucene_Document this document, to allow chaining
     */
    public function addField(Zend_Search_Lucene_Field $field)
    {
        $fieldName = $field->name;
        $this->_fields[$fieldName] = $field;

        return $this;
    }

    /**
     * Return an array with the names of the fields in this document.
     *
     * @return array
     */
    public function getFieldNames()
    {
        return array_keys($this->_fields);
    }

    /**
     * Returns Zend_Search_Lucene_Field object for a named field in this document.
     *
     * @param string $fieldName
     * @return Zend_Search_Lucene_Field
     * @throws Zend_Search_Lucene_Exception when no field with that name exists
     */
    public function getField($fieldName)
    {
        if (array_key_exists($fieldName, $this->_fields)) {
            return $this->_fields[$fieldName];
        }

        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception("Field name \"$fieldName\" not found in document.");
    }

    /**
     * Returns the string value of a named field in this document.
     *
     * @see __get()
     * @param string $fieldName
     * @return string
     * @throws Zend_Search_Lucene_Exception when no field with that name exists
     */
    public function getFieldValue($fieldName)
    {
        $field = $this->getField($fieldName);

        return $field->value;
    }

    /**
     * Returns the string value of a named field in UTF-8 encoding.
     *
     * @see __get()
     * @param string $fieldName
     * @return string
     * @throws Zend_Search_Lucene_Exception when no field with that name exists
     */
    public function getFieldUtf8Value($fieldName)
    {
        $field = $this->getField($fieldName);

        return $field->getUtf8Value();
    }
}

View File

@ -0,0 +1,151 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Docx.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Document_OpenXml */
require_once 'Zend/Search/Lucene/Document/OpenXml.php';
/**
* Docx document.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Document_Docx extends Zend_Search_Lucene_Document_OpenXml {
    /**
     * Xml Schema - WordprocessingML
     *
     * @var string
     */
    const SCHEMA_WORDPROCESSINGML = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';

    /**
     * Object constructor
     *
     * Opens the .docx package, locates the main office document part through
     * '_rels/.rels', collects the text of its paragraphs and adds the fields
     * 'filename', 'body', the package meta data properties and (when the meta
     * data has none) 'title'.
     *
     * Any failure to open the archive or to parse the relations / main
     * document XML is reported as Zend_Search_Lucene_Exception, and the
     * archive handle is closed before the exception is thrown.
     *
     * NOTE(review): the XML parts are parsed with simplexml_load_string()
     * without explicit entity hardening - confirm this is acceptable if
     * untrusted files are indexed.
     *
     * @param string  $fileName
     * @param boolean $storeContent true stores the body text (Text field), false only indexes it (UnStored field)
     * @throws Zend_Search_Lucene_Exception
     */
    private function __construct($fileName, $storeContent) {
        if (!class_exists('ZipArchive', false)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('MS Office documents processing functionality requires Zip extension to be loaded');
        }

        // Document data holder
        $documentBody = array();

        // Open OpenXML package. open() returns true on success and an error
        // code otherwise, so anything but true means the archive is unusable.
        $package = new ZipArchive();
        if ($package->open($fileName) !== true) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .docx file.');
        }

        // Read relations and search for officeDocument
        $relationsXml = $package->getFromName('_rels/.rels');
        $relations    = ($relationsXml === false) ? false : simplexml_load_string($relationsXml);
        if ($relations === false) {
            $package->close();
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .docx file.');
        }

        foreach ($relations->Relationship as $rel) {
            // Loose comparison on purpose: $rel["Type"] is a SimpleXMLElement attribute
            if ($rel["Type"] == Zend_Search_Lucene_Document_OpenXml::SCHEMA_OFFICEDOCUMENT) {
                // Found office document! Read in contents...
                $target      = (string) $rel['Target'];
                $contentsXml = $package->getFromName(
                                   $this->absoluteZipPath(dirname($target)
                                                          . '/'
                                                          . basename($target))
                               );
                $contents = ($contentsXml === false) ? false : simplexml_load_string($contentsXml);
                if ($contents === false) {
                    // Main document part is missing or is not well-formed XML
                    $package->close();
                    require_once 'Zend/Search/Lucene/Exception.php';
                    throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .docx file.');
                }

                $contents->registerXPathNamespace('w', Zend_Search_Lucene_Document_Docx::SCHEMA_WORDPROCESSINGML);
                $paragraphs = $contents->xpath('//w:body/w:p');
                if (!is_array($paragraphs)) {
                    // XPath evaluation failed; treat as a document without paragraphs
                    $paragraphs = array();
                }

                foreach ($paragraphs as $paragraph) {
                    $runs = $paragraph->xpath('.//w:r/*[name() = "w:t" or name() = "w:br"]');
                    if (!is_array($runs)) {
                        // Paragraph doesn't contain any text or breaks
                        continue;
                    }

                    foreach ($runs as $run) {
                        if ($run->getName() == 'br') {
                            // Break element
                            $documentBody[] = ' ';
                        } else {
                            $documentBody[] = (string)$run;
                        }
                    }

                    // Add space after each paragraph. So they are not bound together.
                    $documentBody[] = ' ';
                }

                // Only the first office document relationship is processed
                break;
            }
        }

        // Read core properties (presumably a name => value array; it is iterated that way below)
        $coreProperties = $this->extractMetaData($package);

        // Close file
        $package->close();

        // Store filename
        $this->addField(Zend_Search_Lucene_Field::Text('filename', $fileName, 'UTF-8'));

        // Store contents
        $bodyText = implode('', $documentBody);
        if ($storeContent) {
            $this->addField(Zend_Search_Lucene_Field::Text('body', $bodyText, 'UTF-8'));
        } else {
            $this->addField(Zend_Search_Lucene_Field::UnStored('body', $bodyText, 'UTF-8'));
        }

        // Store meta data properties
        foreach ($coreProperties as $key => $value) {
            $this->addField(Zend_Search_Lucene_Field::Text($key, $value, 'UTF-8'));
        }

        // Store title (if not present in meta data)
        if (! isset($coreProperties['title'])) {
            $this->addField(Zend_Search_Lucene_Field::Text('title', $fileName, 'UTF-8'));
        }
    }

    /**
     * Load Docx document from a file
     *
     * @param string  $fileName
     * @param boolean $storeContent
     * @return Zend_Search_Lucene_Document_Docx
     * @throws Zend_Search_Lucene_Document_Exception when the file is not readable
     * @throws Zend_Search_Lucene_Exception          when the file is not a valid .docx package
     */
    public static function loadDocxFile($fileName, $storeContent = false) {
        if (!is_readable($fileName)) {
            require_once 'Zend/Search/Lucene/Document/Exception.php';
            throw new Zend_Search_Lucene_Document_Exception('Provided file \'' . $fileName . '\' is not readable.');
        }

        return new Zend_Search_Lucene_Document_Docx($fileName, $storeContent);
    }
}

View File

@ -0,0 +1,37 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Exception.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* Framework base exception
*/
require_once 'Zend/Search/Lucene/Exception.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Document_Exception extends Zend_Search_Lucene_Exception
{
// Intentionally empty: this subclass adds no members of its own. It only gives
// document-related failures a distinct exception type.
}

View File

@ -0,0 +1,481 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Html.php 21946 2010-04-19 08:21:02Z alexander $
*/
/** Zend_Search_Lucene_Document */
require_once 'Zend/Search/Lucene/Document.php';
/**
* HTML document.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Document_Html extends Zend_Search_Lucene_Document
{
/**
* List of document links
*
* @var array
*/
private $_links = array();
/**
* List of document header links
*
* @var array
*/
private $_headerLinks = array();
/**
* Stored DOM representation
*
* @var DOMDocument
*/
private $_doc;
/**
* Exclude nofollow links flag
*
* If true then links with rel='nofollow' attribute are not included into
* document links.
*
* @var boolean
*/
private static $_excludeNoFollowLinks = false;
/**
*
* List of inline tags
*
* @var array
*/
private $_inlineTags = array('a', 'abbr', 'acronym', 'dfn', 'em', 'strong', 'code',
'samp', 'kbd', 'var', 'b', 'i', 'big', 'small', 'strike',
'tt', 'u', 'font', 'span', 'bdo', 'cite', 'del', 'ins',
'q', 'sub', 'sup');
/**
* Object constructor
*
* @param string $data HTML string (may be HTML fragment, )
* @param boolean $isFile
* @param boolean $storeContent
* @param string $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
*/
private function __construct($data, $isFile, $storeContent, $defaultEncoding = '')
{
$this->_doc = new DOMDocument();
$this->_doc->substituteEntities = true;
if ($isFile) {
$htmlData = file_get_contents($data);
} else {
$htmlData = $data;
}
@$this->_doc->loadHTML($htmlData);
if ($this->_doc->encoding === null) {
// Document encoding is not recognized
/** @todo improve HTML vs HTML fragment recognition */
if (preg_match('/<html>/i', $htmlData, $matches, PREG_OFFSET_CAPTURE)) {
// It's an HTML document
// Add additional HEAD section and recognize document
$htmlTagOffset = $matches[0][1] + strlen($matches[0][0]);
@$this->_doc->loadHTML(iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset))
. '<head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head>'
. iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset)));
// Remove additional HEAD section
$xpath = new DOMXPath($this->_doc);
$head = $xpath->query('/html/head')->item(0);
$head->parentNode->removeChild($head);
} else {
// It's an HTML fragment
@$this->_doc->loadHTML('<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>'
. iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData)
. '</body></html>');
}
}
/** @todo Add correction of wrong HTML encoding recognition processing
* The case is:
* Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used,
* even $this->_doc->encoding demonstrates another recognized encoding
*/
$xpath = new DOMXPath($this->_doc);
$docTitle = '';
$titleNodes = $xpath->query('/html/head/title');
foreach ($titleNodes as $titleNode) {
// title should always have only one entry, but we process all nodeset entries
$docTitle .= $titleNode->nodeValue . ' ';
}
$this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, 'UTF-8'));
$metaNodes = $xpath->query('/html/head/meta[@name]');
foreach ($metaNodes as $metaNode) {
$this->addField(Zend_Search_Lucene_Field::Text($metaNode->getAttribute('name'),
$metaNode->getAttribute('content'),
'UTF-8'));
}
$docBody = '';
$bodyNodes = $xpath->query('/html/body');
foreach ($bodyNodes as $bodyNode) {
// body should always have only one entry, but we process all nodeset entries
$this->_retrieveNodeText($bodyNode, $docBody);
}
if ($storeContent) {
$this->addField(Zend_Search_Lucene_Field::Text('body', $docBody, 'UTF-8'));
} else {
$this->addField(Zend_Search_Lucene_Field::UnStored('body', $docBody, 'UTF-8'));
}
$linkNodes = $this->_doc->getElementsByTagName('a');
foreach ($linkNodes as $linkNode) {
if (($href = $linkNode->getAttribute('href')) != '' &&
(!self::$_excludeNoFollowLinks || strtolower($linkNode->getAttribute('rel')) != 'nofollow' )
) {
$this->_links[] = $href;
}
}
$linkNodes = $this->_doc->getElementsByTagName('area');
foreach ($linkNodes as $linkNode) {
if (($href = $linkNode->getAttribute('href')) != '' &&
(!self::$_excludeNoFollowLinks || strtolower($linkNode->getAttribute('rel')) != 'nofollow' )
) {
$this->_links[] = $href;
}
}
$this->_links = array_unique($this->_links);
$linkNodes = $xpath->query('/html/head/link');
foreach ($linkNodes as $linkNode) {
if (($href = $linkNode->getAttribute('href')) != '') {
$this->_headerLinks[] = $href;
}
}
$this->_headerLinks = array_unique($this->_headerLinks);
}
/**
 * Switch the class-wide "exclude nofollow links" flag.
 *
 * While links are collected from <a> and <area> elements, a link whose
 * rel attribute is "nofollow" is left out when this flag is on.
 *
 * @param boolean $newValue New value of the flag
 */
public static function setExcludeNoFollowLinks($newValue)
{
    self::$_excludeNoFollowLinks = $newValue;
}
/**
 * Read the class-wide "exclude nofollow links" flag.
 *
 * @return boolean Current value, as last set by setExcludeNoFollowLinks()
 */
public static function getExcludeNoFollowLinks()
{
    return self::$_excludeNoFollowLinks;
}
/**
 * Append the text found in $node (and, recursively, in its children) to $text.
 *
 * - A text node contributes its value; a single space is added after it
 *   unless the parent element's tag is listed in $this->_inlineTags.
 * - An element node contributes the text of its children, except for
 *   <script> elements, which are skipped entirely.
 * - Every other node type (comments, CDATA sections, ...) contributes nothing.
 *
 * @param DOMNode $node  Node to read
 * @param string  &$text Accumulator the text is appended to
 */
private function _retrieveNodeText(DOMNode $node, &$text)
{
    if ($node->nodeType == XML_TEXT_NODE) {
        $insideInlineTag = in_array($node->parentNode->tagName, $this->_inlineTags);
        $text .= $insideInlineTag ? $node->nodeValue : $node->nodeValue . ' ';
        return;
    }

    if ($node->nodeType != XML_ELEMENT_NODE || $node->nodeName == 'script') {
        return;
    }

    foreach ($node->childNodes as $child) {
        $this->_retrieveNodeText($child, $text);
    }
}
/**
 * Get the HREF values collected from the document's <a> and <area> elements.
 *
 * @return array List of link targets
 */
public function getLinks()
{
    return $this->_links;
}
/**
 * Get the HREF values collected from the <link> elements of the document head.
 *
 * @return array List of header link targets
 */
public function getHeaderLinks()
{
    return $this->_headerLinks;
}
/**
 * Build an HTML document object from an HTML string.
 *
 * The constructor's second argument is passed as false here; the
 * file-based factory loadHTMLFile() passes true instead.
 *
 * @param string  $data            HTML source
 * @param boolean $storeContent    Forwarded to the constructor
 * @param string  $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
 * @return Zend_Search_Lucene_Document_Html
 */
public static function loadHTML($data, $storeContent = false, $defaultEncoding = '')
{
    $document = new Zend_Search_Lucene_Document_Html($data, false, $storeContent, $defaultEncoding);

    return $document;
}
/**
 * Build an HTML document object from a file.
 *
 * The constructor's second argument is passed as true here; the
 * string-based factory loadHTML() passes false instead.
 *
 * @param string  $file            Path of the HTML file
 * @param boolean $storeContent    Forwarded to the constructor
 * @param string  $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
 * @return Zend_Search_Lucene_Document_Html
 */
public static function loadHTMLFile($file, $storeContent = false, $defaultEncoding = '')
{
    $document = new Zend_Search_Lucene_Document_Html($file, true, $storeContent, $defaultEncoding);

    return $document;
}
/**
 * Highlight the matching words inside a single text node.
 *
 * The node text is tokenized with the default analyzer. Every token whose
 * term text is a key of $wordsToHighlight is cut out of the text node,
 * passed through $callback, and replaced by the nodes obtained from
 * parsing the HTML string the callback returned.
 *
 * @param DOMText  $node             Text node to process (it is split in place)
 * @param array    $wordsToHighlight Map keyed by the term texts to highlight
 * @param callback $callback         Callback used to transform (highlight) the matched text
 * @param array    $params           Additional callback parameters; the matched text is prepended as first argument
 * @throws Zend_Search_Lucene_Exception When the HTML returned by the callback cannot be loaded
 */
protected function _highlightTextNode(DOMText $node, $wordsToHighlight, $callback, $params)
{
    /** Zend_Search_Lucene_Analysis_Analyzer */
    require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

    $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
    $analyzer->setInput($node->nodeValue, 'UTF-8');

    // Collect the tokens that have to be highlighted
    $hits = array();
    while (($token = $analyzer->nextToken()) !== null) {
        if (isset($wordsToHighlight[$token->getTermText()])) {
            $hits[] = $token;
        }
    }

    if (count($hits) == 0) {
        return;
    }

    // Walk the matches from last to first: splitting the tail of the text
    // leaves the offsets of the earlier matches untouched
    foreach (array_reverse($hits) as $hit) {
        // Detach everything after the match, then the match itself
        $node->splitText($hit->getEndOffset());
        $wordNode = $node->splitText($hit->getStartOffset());

        // Ask the callback for the HTML representation of the matched word
        $callbackArguments = $params;
        array_unshift($callbackArguments, $wordNode->nodeValue);
        $fragmentHtml = call_user_func_array($callback, $callbackArguments);

        // Parse the returned HTML; loadHTML() also turns it into a valid DOM tree
        $fragmentDocument = new DOMDocument('1.0', 'UTF-8');
        $loaded = @$fragmentDocument->loadHTML(
            '<html><head><meta http-equiv="Content-type" content="text/html; charset=UTF-8"/></head><body>'
            . $fragmentHtml
            . '</body></html>'
        );
        if (!$loaded) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception("Error occured while loading highlighted text fragment: '$fragmentHtml'.");
        }

        $fragmentXpath = new DOMXPath($fragmentDocument);
        $fragmentNodes = $fragmentXpath->query('/html/body')->item(0)->childNodes;

        // Copy the parsed nodes in front of the matched word, then drop the word
        foreach ($fragmentNodes as $fragmentNode) {
            $node->parentNode->insertBefore(
                $this->_doc->importNode($fragmentNode, true /* deep copy */),
                $wordNode
            );
        }

        $node->parentNode->removeChild($wordNode);
    }
}
/**
 * Highlight words in all text found below the given node.
 *
 * Child elements are visited recursively (<script> elements are skipped).
 * The direct text children are only collected during that walk and are
 * processed afterwards, because highlighting a text node modifies the
 * child list that is being iterated.
 *
 * @param DOMNode  $contextNode      Node whose content is processed
 * @param array    $wordsToHighlight Map keyed by the term texts to highlight
 * @param callback $callback         Callback used to transform (highlight) the matched text
 * @param array    $params           Additional callback parameters
 */
protected function _highlightNodeRecursive(DOMNode $contextNode, $wordsToHighlight, $callback, $params)
{
    if (!$contextNode->hasChildNodes()) {
        return;
    }

    $pendingTextNodes = array();

    foreach ($contextNode->childNodes as $child) {
        if ($child->nodeType == XML_TEXT_NODE) {
            // postponed: see the second loop
            $pendingTextNodes[] = $child;
            continue;
        }

        if ($child->nodeName != 'script') {
            $this->_highlightNodeRecursive($child, $wordsToHighlight, $callback, $params);
        }
    }

    foreach ($pendingTextNodes as $pendingTextNode) {
        $this->_highlightTextNode($pendingTextNode, $wordsToHighlight, $callback, $params);
    }
}
/**
 * Default highlighting callback: wrap the text in a <b> element with
 * black text on the given background colour.
 *
 * Both arguments are inserted into the markup as they are.
 *
 * @param string $stringToHighlight Text to wrap
 * @param string $colour            Value used for the CSS background-color
 * @return string HTML fragment
 * @internal
 */
public function applyColour($stringToHighlight, $colour)
{
    return sprintf('<b style="color:black;background-color:%s">%s</b>', $colour, $stringToHighlight);
}
/**
 * Highlight words with the specified colour.
 *
 * Shortcut for highlightExtended() using applyColour() as callback and
 * $colour as its only additional parameter.
 *
 * @param string|array $words  Words to highlight
 * @param string       $colour Background colour handed to applyColour()
 * @return string The value returned by highlightExtended()
 */
public function highlight($words, $colour = '#66ffff')
{
    $callback       = array($this, 'applyColour');
    $callbackParams = array($colour);

    return $this->highlightExtended($words, $callback, $callbackParams);
}
/**
 * Highlight text using the specified callback.
 *
 * The given words are tokenized with the default analyzer; every text node
 * below /html/body is then searched for those terms and each match is
 * replaced by the HTML the callback returns for it. The document held by
 * this object is modified in place.
 *
 * @param string|array $words    Words to highlight. Words could be organized using the array or string.
 * @param callback     $callback Callback method, used to transform (highlighting) text.
 * @param array        $params   Array of additional callback parameters passed through into it
 *                               (first non-optional parameter is an HTML fragment for highlighting)
 * @return string HTML of the whole document after highlighting
 * @throws Zend_Search_Lucene_Exception When there is something to highlight but $callback is not callable
 */
public function highlightExtended($words, $callback, $params = array())
{
    /** Zend_Search_Lucene_Analysis_Analyzer */
    require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

    if (!is_array($words)) {
        $words = array($words);
    }

    $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();

    // Flatten the tokens of all word strings into one list. Done with a
    // plain loop so that an empty $words array simply yields an empty list
    // (array_merge() cannot be called without arguments on older PHP).
    $wordsToHighlight = array();
    foreach ($words as $wordString) {
        foreach ($analyzer->tokenize($wordString) as $token) {
            $wordsToHighlight[] = $token;
        }
    }

    if (count($wordsToHighlight) == 0) {
        // Nothing to highlight: hand back the untouched document
        return $this->_doc->saveHTML();
    }

    // Index the terms by their text for constant-time lookups
    $wordsToHighlightFlipped = array();
    foreach ($wordsToHighlight as $id => $token) {
        $wordsToHighlightFlipped[$token->getTermText()] = $id;
    }

    if (!is_callable($callback)) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('$viewHelper parameter mast be a View Helper name, View Helper object or callback.');
    }

    $xpath = new DOMXPath($this->_doc);

    $matchedNodes = $xpath->query("/html/body");
    foreach ($matchedNodes as $matchedNode) {
        $this->_highlightNodeRecursive($matchedNode, $wordsToHighlightFlipped, $callback, $params);
    }

    // The documented return value was previously missing on this path
    return $this->_doc->saveHTML();
}
/**
 * Get the HTML of the whole document.
 *
 * @return string Result of saveHTML() on the underlying DOM document
 */
public function getHTML()
{
    $html = $this->_doc->saveHTML();

    return $html;
}
/**
 * Get the content of the document's <body> element.
 *
 * Each child node of /html/body is serialized with saveXML() and the
 * pieces are concatenated; the <body> tag itself is not part of the result.
 *
 * @return string Serialized body content, or an empty string when the
 *                document has no <body> element
 */
public function getHtmlBody()
{
    $xpath = new DOMXPath($this->_doc);
    $body  = $xpath->query('/html/body')->item(0);

    if ($body === null) {
        // No body element: nothing to serialize
        return '';
    }

    $outputFragments = array();
    foreach ($body->childNodes as $bodyChild) {
        $outputFragments[] = $this->_doc->saveXML($bodyChild);
    }

    return implode('', $outputFragments);
}
}

View File

@ -0,0 +1,129 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: OpenXml.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Document */
require_once 'Zend/Search/Lucene/Document.php';
/**
 * OpenXML document.
 *
 * Common base of the Office Open XML document types. It offers the
 * relationship/namespace URIs, core property (meta data) extraction and
 * ZIP path normalization used by the concrete document classes.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
abstract class Zend_Search_Lucene_Document_OpenXml extends Zend_Search_Lucene_Document
{
    /**
     * Xml Schema - Relationships
     *
     * @var string
     */
    const SCHEMA_RELATIONSHIP = 'http://schemas.openxmlformats.org/package/2006/relationships';

    /**
     * Xml Schema - Office document
     *
     * @var string
     */
    const SCHEMA_OFFICEDOCUMENT = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument';

    /**
     * Xml Schema - Core properties (relationship type of the core properties part)
     *
     * @var string
     */
    const SCHEMA_COREPROPERTIES = 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties';

    /**
     * Xml Schema - Dublin Core
     *
     * @var string
     */
    const SCHEMA_DUBLINCORE = 'http://purl.org/dc/elements/1.1/';

    /**
     * Xml Schema - Dublin Core Terms
     *
     * @var string
     */
    const SCHEMA_DUBLINCORETERMS = 'http://purl.org/dc/terms/';

    /**
     * Extract metadata from document
     *
     * Looks up the core properties relationship in "_rels/.rels", loads the
     * part it points to and collects the child elements of the Dublin Core,
     * core properties and Dublin Core Terms namespaces. When the same element
     * name occurs in several namespaces the later namespace wins.
     *
     * Missing or unparsable parts are skipped, so the result may be empty.
     *
     * @param ZipArchive $package ZipArchive OpenXML package
     * @return array Key-value pairs containing document meta data
     */
    protected function extractMetaData(ZipArchive $package)
    {
        // Data holders
        $coreProperties = array();

        // Namespaces that are read from the core properties part, in order.
        // The relationship-type URI is kept for backward compatibility; the
        // entry after it is the namespace the cp:* elements actually live in.
        $namespaces = array(
            self::SCHEMA_DUBLINCORE,
            self::SCHEMA_COREPROPERTIES,
            'http://schemas.openxmlformats.org/package/2006/metadata/core-properties',
            self::SCHEMA_DUBLINCORETERMS,
        );

        // Read relations and search for core properties
        $relationsXml = $package->getFromName('_rels/.rels');
        $relations    = ($relationsXml === false) ? false : simplexml_load_string($relationsXml);
        if ($relations === false) {
            return $coreProperties;
        }

        foreach ($relations->Relationship as $rel) {
            if ((string)$rel['Type'] !== self::SCHEMA_COREPROPERTIES) {
                continue;
            }

            // Found core properties! Read in contents...
            // The target is normalized the same way the document classes do
            // it, so "core.xml" or "/docProps/core.xml" resolve as well.
            $partXml  = $package->getFromName($this->absoluteZipPath((string)$rel['Target']));
            $contents = ($partXml === false) ? false : simplexml_load_string($partXml);
            if ($contents === false) {
                continue;
            }

            foreach ($namespaces as $namespace) {
                foreach ($contents->children($namespace) as $child) {
                    $coreProperties[$child->getName()] = (string)$child;
                }
            }
        }

        return $coreProperties;
    }

    /**
     * Determine absolute zip path
     *
     * Accepts both slash styles, drops empty and "." segments, resolves
     * ".." against the preceding segment and joins the rest with "/".
     * The result never starts with a slash.
     *
     * @param string $path
     * @return string
     */
    protected function absoluteZipPath($path)
    {
        $path  = str_replace(array('/', '\\'), DIRECTORY_SEPARATOR, $path);
        $parts = array_filter(explode(DIRECTORY_SEPARATOR, $path), 'strlen');

        $absolutes = array();
        foreach ($parts as $part) {
            if ('.' === $part) {
                continue;
            }

            if ('..' === $part) {
                array_pop($absolutes);
            } else {
                $absolutes[] = $part;
            }
        }

        return implode('/', $absolutes);
    }
}

View File

@ -0,0 +1,200 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Pptx.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Document_OpenXml */
require_once 'Zend/Search/Lucene/Document/OpenXml.php';
/**
 * Pptx document.
 *
 * Indexes the text of the slides and slide notes of a PresentationML
 * package together with the package's core properties.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Document_Pptx extends Zend_Search_Lucene_Document_OpenXml
{
    /**
     * Xml Schema - PresentationML
     *
     * @var string
     */
    const SCHEMA_PRESENTATIONML = 'http://schemas.openxmlformats.org/presentationml/2006/main';

    /**
     * Xml Schema - DrawingML
     *
     * @var string
     */
    const SCHEMA_DRAWINGML = 'http://schemas.openxmlformats.org/drawingml/2006/main';

    /**
     * Xml Schema - Slide relation
     *
     * @var string
     */
    const SCHEMA_SLIDERELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide';

    /**
     * Xml Schema - Slide notes relation
     *
     * @var string
     */
    const SCHEMA_SLIDENOTESRELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide';

    /**
     * Object constructor
     *
     * Adds the fields "filename", "body" (slide and notes text joined by
     * spaces), one field per core property and, when the core properties
     * contain no title, a "title" field holding the file name.
     *
     * @param string  $fileName
     * @param boolean $storeContent Store the body in the index (Text field) or only index it (UnStored field)
     * @throws Zend_Search_Lucene_Exception When the Zip extension is missing, the archive cannot
     *                                      be opened or its "_rels/.rels" part cannot be read
     */
    private function __construct($fileName, $storeContent)
    {
        if (!class_exists('ZipArchive', false)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('MS Office documents processing functionality requires Zip extension to be loaded');
        }

        // Document data holders
        $slides       = array();
        $slideNotes   = array();
        $documentBody = array();

        // Open OpenXML package; open() returns true or an error code
        $package = new ZipArchive();
        if ($package->open($fileName) !== true) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .pptx file.');
        }

        // Read relations and search for officeDocument
        $relationsXml = $package->getFromName('_rels/.rels');
        $relations    = ($relationsXml === false) ? false : simplexml_load_string($relationsXml);
        if ($relations === false) {
            $package->close();
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .pptx file.');
        }

        foreach ($relations->Relationship as $rel) {
            if ((string)$rel['Type'] !== Zend_Search_Lucene_Document_OpenXml::SCHEMA_OFFICEDOCUMENT) {
                continue;
            }

            // Found office document! Search for slides...
            $documentTarget = (string)$rel['Target'];
            $documentDir    = dirname($documentTarget);

            $slideRelations = $this->_loadPart(
                $package,
                $documentDir . '/_rels/' . basename($documentTarget) . '.rels'
            );

            if ($slideRelations !== null) {
                foreach ($slideRelations->Relationship as $slideRel) {
                    if ((string)$slideRel['Type'] !== self::SCHEMA_SLIDERELATION) {
                        continue;
                    }

                    // Found slide! Slides and their notes share the same key
                    $slideKey    = str_replace('rId', '', (string)$slideRel['Id']);
                    $slideTarget = (string)$slideRel['Target'];
                    $slideDir    = $documentDir . '/' . dirname($slideTarget);

                    $slide = $this->_loadPart($package, $slideDir . '/' . basename($slideTarget));
                    if ($slide !== null) {
                        $slides[$slideKey] = $slide;
                    }

                    // Search for slide notes
                    $slideNotesRelations = $this->_loadPart(
                        $package,
                        $slideDir . '/_rels/' . basename($slideTarget) . '.rels'
                    );
                    if ($slideNotesRelations === null) {
                        continue;
                    }

                    foreach ($slideNotesRelations->Relationship as $slideNoteRel) {
                        if ((string)$slideNoteRel['Type'] !== self::SCHEMA_SLIDENOTESRELATION) {
                            continue;
                        }

                        // Found slide notes!
                        $noteTarget = (string)$slideNoteRel['Target'];
                        $note = $this->_loadPart(
                            $package,
                            $slideDir . '/' . dirname($noteTarget) . '/' . basename($noteTarget)
                        );
                        if ($note !== null) {
                            $slideNotes[$slideKey] = $note;
                        }

                        // Only the first notes relation of a slide is used
                        break;
                    }
                }
            }

            // Only the first office document relation is processed
            break;
        }

        // Sort slides
        ksort($slides);
        ksort($slideNotes);

        // Extract contents from slides, each followed by its notes
        foreach ($slides as $slideKey => $slide) {
            $documentBody = array_merge($documentBody, $this->_extractText($slide));

            if (isset($slideNotes[$slideKey])) {
                $documentBody = array_merge($documentBody, $this->_extractText($slideNotes[$slideKey]));
            }
        }

        // Read core properties
        $coreProperties = $this->extractMetaData($package);

        // Close file
        $package->close();

        // Store filename
        $this->addField(Zend_Search_Lucene_Field::Text('filename', $fileName, 'UTF-8'));

        // Store contents
        if ($storeContent) {
            $this->addField(Zend_Search_Lucene_Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
        } else {
            $this->addField(Zend_Search_Lucene_Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
        }

        // Store meta data properties
        foreach ($coreProperties as $key => $value) {
            $this->addField(Zend_Search_Lucene_Field::Text($key, $value, 'UTF-8'));
        }

        // Store title (if not present in meta data)
        if (!isset($coreProperties['title'])) {
            $this->addField(Zend_Search_Lucene_Field::Text('title', $fileName, 'UTF-8'));
        }
    }

    /**
     * Load one XML part of the package.
     *
     * @param ZipArchive $package Opened package
     * @param string     $path    Part path; it is normalized with absoluteZipPath()
     * @return SimpleXMLElement|null Parsed part, or null when it is missing or not parsable
     */
    private function _loadPart(ZipArchive $package, $path)
    {
        $xml = $package->getFromName($this->absoluteZipPath($path));
        if ($xml === false) {
            return null;
        }

        $element = simplexml_load_string($xml);

        return ($element === false) ? null : $element;
    }

    /**
     * Collect the content of all DrawingML text elements (a:t) of a part.
     *
     * @param SimpleXMLElement $part Slide or slide notes part
     * @return array List of strings in document order
     */
    private function _extractText(SimpleXMLElement $part)
    {
        // Register namespaces
        $part->registerXPathNamespace("p", self::SCHEMA_PRESENTATIONML);
        $part->registerXPathNamespace("a", self::SCHEMA_DRAWINGML);

        $texts = array();

        // Fetch all text
        $textElements = $part->xpath('//a:t');
        if (is_array($textElements)) {
            foreach ($textElements as $textElement) {
                $texts[] = (string)$textElement;
            }
        }

        return $texts;
    }

    /**
     * Load Pptx document from a file
     *
     * @param string  $fileName
     * @param boolean $storeContent
     * @return Zend_Search_Lucene_Document_Pptx
     */
    public static function loadPptxFile($fileName, $storeContent = false)
    {
        return new Zend_Search_Lucene_Document_Pptx($fileName, $storeContent);
    }
}

View File

@ -0,0 +1,263 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Xlsx.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Document_OpenXml */
require_once 'Zend/Search/Lucene/Document/OpenXml.php';
/**
 * Xlsx document.
 *
 * Indexes the cell values of all worksheets of a SpreadsheetML package
 * together with the package's core properties.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Document
 * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Document_Xlsx extends Zend_Search_Lucene_Document_OpenXml
{
    /**
     * Xml Schema - SpreadsheetML
     *
     * @var string
     */
    const SCHEMA_SPREADSHEETML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';

    /**
     * Xml Schema - DrawingML
     *
     * @var string
     */
    const SCHEMA_DRAWINGML = 'http://schemas.openxmlformats.org/drawingml/2006/main';

    /**
     * Xml Schema - Shared Strings
     *
     * @var string
     */
    const SCHEMA_SHAREDSTRINGS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings';

    /**
     * Xml Schema - Worksheet relation
     *
     * @var string
     */
    const SCHEMA_WORKSHEETRELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet';

    /**
     * Xml Schema - Slide notes relation
     *
     * Not referenced by this class; kept because it is part of the public constants.
     *
     * @var string
     */
    const SCHEMA_SLIDENOTESRELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide';

    /**
     * Object constructor
     *
     * Adds the fields "filename", "body" (all cell values joined by spaces),
     * one field per core property and, when the core properties contain no
     * title, a "title" field holding the file name.
     *
     * @param string  $fileName
     * @param boolean $storeContent Store the body in the index (Text field) or only index it (UnStored field)
     * @throws Zend_Search_Lucene_Exception When the Zip extension is missing, the archive cannot
     *                                      be opened or its "_rels/.rels" part cannot be read
     */
    private function __construct($fileName, $storeContent)
    {
        if (!class_exists('ZipArchive', false)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('MS Office documents processing functionality requires Zip extension to be loaded');
        }

        // Document data holders
        $sharedStrings = array();
        $worksheets    = array();
        $documentBody  = array();

        // Open OpenXML package; open() returns true or an error code
        $package = new ZipArchive();
        if ($package->open($fileName) !== true) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .xlsx file.');
        }

        // Read relations and search for officeDocument
        $relationsXml = $package->getFromName('_rels/.rels');
        $relations    = ($relationsXml === false) ? false : simplexml_load_string($relationsXml);
        if ($relations === false) {
            $package->close();
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .xlsx file.');
        }

        foreach ($relations->Relationship as $rel) {
            if ((string)$rel['Type'] !== Zend_Search_Lucene_Document_OpenXml::SCHEMA_OFFICEDOCUMENT) {
                continue;
            }

            // Found office document! Read relations for workbook...
            $workbookTarget = (string)$rel['Target'];
            $workbookDir    = dirname($workbookTarget);

            $workbookRelations = $this->_loadPart(
                $package,
                $workbookDir . '/_rels/' . basename($workbookTarget) . '.rels'
            );
            if ($workbookRelations === null) {
                break;
            }
            $workbookRelations->registerXPathNamespace("rel", Zend_Search_Lucene_Document_OpenXml::SCHEMA_RELATIONSHIP);

            // Read shared strings (a workbook without strings has no such relation)
            $sharedStringsRelation = $workbookRelations->xpath(
                "rel:Relationship[@Type='" . self::SCHEMA_SHAREDSTRINGS . "']"
            );
            if (is_array($sharedStringsRelation) && isset($sharedStringsRelation[0])) {
                $xmlStrings = $this->_loadPart(
                    $package,
                    $workbookDir . '/' . (string)$sharedStringsRelation[0]['Target']
                );
                if ($xmlStrings !== null && isset($xmlStrings->si)) {
                    foreach ($xmlStrings->si as $val) {
                        if (isset($val->t)) {
                            $sharedStrings[] = (string)$val->t;
                        } elseif (isset($val->r)) {
                            $sharedStrings[] = $this->_parseRichText($val);
                        } else {
                            // Cells refer to shared strings by position, so
                            // an entry without text still occupies its slot
                            $sharedStrings[] = '';
                        }
                    }
                }
            }

            // Loop relations for workbook and extract worksheets...
            foreach ($workbookRelations->Relationship as $workbookRelation) {
                if ((string)$workbookRelation['Type'] !== self::SCHEMA_WORKSHEETRELATION) {
                    continue;
                }

                $sheetTarget = (string)$workbookRelation['Target'];
                $worksheet   = $this->_loadPart(
                    $package,
                    $workbookDir . '/' . dirname($sheetTarget) . '/' . basename($sheetTarget)
                );
                if ($worksheet !== null) {
                    $worksheets[str_replace('rId', '', (string)$workbookRelation['Id'])] = $worksheet;
                }
            }

            // Only the first office document relation is processed
            break;
        }

        // Sort worksheets
        ksort($worksheets);

        // Extract contents from worksheets
        foreach ($worksheets as $worksheet) {
            foreach ($worksheet->sheetData->row as $row) {
                foreach ($row->c as $c) {
                    $documentBody[] = $this->_cellValue($c, $sharedStrings);
                }
            }
        }

        // Read core properties
        $coreProperties = $this->extractMetaData($package);

        // Close file
        $package->close();

        // Store filename
        $this->addField(Zend_Search_Lucene_Field::Text('filename', $fileName, 'UTF-8'));

        // Store contents
        if ($storeContent) {
            $this->addField(Zend_Search_Lucene_Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
        } else {
            $this->addField(Zend_Search_Lucene_Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
        }

        // Store meta data properties
        foreach ($coreProperties as $key => $value) {
            $this->addField(Zend_Search_Lucene_Field::Text($key, $value, 'UTF-8'));
        }

        // Store title (if not present in meta data)
        if (!isset($coreProperties['title'])) {
            $this->addField(Zend_Search_Lucene_Field::Text('title', $fileName, 'UTF-8'));
        }
    }

    /**
     * Load one XML part of the package.
     *
     * @param ZipArchive $package Opened package
     * @param string     $path    Part path; it is normalized with absoluteZipPath()
     * @return SimpleXMLElement|null Parsed part, or null when it is missing or not parsable
     */
    private function _loadPart(ZipArchive $package, $path)
    {
        $xml = $package->getFromName($this->absoluteZipPath($path));
        if ($xml === false) {
            return null;
        }

        $element = simplexml_load_string($xml);

        return ($element === false) ? null : $element;
    }

    /**
     * Determine the value of a worksheet cell from its "t" (type) attribute.
     *
     * @param SimpleXMLElement $c             Cell element
     * @param array            $sharedStrings Shared strings table, indexed by position
     * @return string|boolean|integer|float Cell value
     */
    private function _cellValue(SimpleXMLElement $c, array $sharedStrings)
    {
        // Determine data type
        $dataType = (string)$c["t"];

        switch ($dataType) {
            case "s":
                // Value is a shared string, referenced by its index
                if ((string)$c->v == '') {
                    return '';
                }
                $index = intval((string)$c->v);
                return isset($sharedStrings[$index]) ? $sharedStrings[$index] : '';

            case "b":
                // Value is boolean
                $value = (string)$c->v;
                if ($value == '0') {
                    return false;
                }
                if ($value == '1') {
                    return true;
                }
                return (bool)$c->v;

            case "inlineStr":
                // Value is rich text inline
                return $this->_parseRichText($c->is);

            case "e":
                // Value is an error message
                return (string)$c->v;

            default:
                // Value is a string
                $value = (string)$c->v;

                // Numeric strings become int when that loses nothing, float otherwise
                if (is_numeric($value)) {
                    if ($value == (int)$value) {
                        $value = (int)$value;
                    } else {
                        $value = (float)$value;
                    }
                }
                return $value;
        }
    }

    /**
     * Parse rich text XML
     *
     * Returns the content of the element's direct <t> child when there is
     * one, otherwise the concatenated <t> content of all its <r> runs.
     *
     * @param SimpleXMLElement $is
     * @return string
     */
    private function _parseRichText($is = null)
    {
        if ($is === null) {
            return '';
        }

        $value = array();

        if (isset($is->t)) {
            $value[] = (string)$is->t;
        } else {
            foreach ($is->r as $run) {
                $value[] = (string)$run->t;
            }
        }

        return implode('', $value);
    }

    /**
     * Load Xlsx document from a file
     *
     * @param string  $fileName
     * @param boolean $storeContent
     * @return Zend_Search_Lucene_Document_Xlsx
     */
    public static function loadXlsxFile($fileName, $storeContent = false)
    {
        return new Zend_Search_Lucene_Document_Xlsx($fileName, $storeContent);
    }
}

View File

@ -0,0 +1,37 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Exception.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* Framework base exception
*/
require_once 'Zend/Search/Exception.php';
/**
 * Exception type of the Zend_Search_Lucene package.
 *
 * Adds nothing to Zend_Search_Exception; it only provides a distinct class
 * that can be caught separately.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Exception extends Zend_Search_Exception
{
}

443
thirdparty/Zend/Search/Lucene/FSM.php vendored Normal file
View File

@ -0,0 +1,443 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: FSM.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_FSMAction */
require_once 'Zend/Search/Lucene/FSMAction.php';
/**
* Abstract Finite State Machine
*
* Take a look on Wikipedia state machine description: http://en.wikipedia.org/wiki/Finite_state_machine
*
* Any type of Transducers (Moore machine or Mealy machine) also may be implemented by using this abstract FSM.
* process() methods invokes a specified actions which may construct FSM output.
* Actions may be also used to signal, that we have reached Accept State
*
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_FSM
{
/**
* Machine States alphabet
*
* @var array
*/
private $_states = array();
/**
* Current state
*
* @var integer|string
*/
private $_currentState = null;
/**
* Input alphabet
*
* @var array
*/
private $_inputAphabet = array();
/**
* State transition table
*
* [sourceState][input] => targetState
*
* @var array
*/
private $_rules = array();
/**
* List of entry actions
* Each action executes when entering the state
*
* [state] => action
*
* @var array
*/
private $_entryActions = array();
/**
* List of exit actions
* Each action executes when exiting the state
*
* [state] => action
*
* @var array
*/
private $_exitActions = array();
/**
* List of input actions
* Each action executes when entering the state
*
* [state][input] => action
*
* @var array
*/
private $_inputActions = array();
/**
* List of transition actions
* Each action is registered for a pair of states (state1, state2)
*
* [state1][state2] => action
*
* @var array
*/
private $_transitionActions = array();
/**
 * Finite State machine constructor
 *
 * $states is an array of integers or strings with a list of possible machine states.
 * The first state that is added becomes the current (start) state;
 * it may be reassigned by a setState() call.
 * The states list may be empty and can be extended later by addState() or addStates() calls.
 *
 * $inputAphabet is the same as $states, but represents the input alphabet;
 * it also may be extended later by addInputSymbols() or addInputSymbol() calls.
 *
 * $rules parameter describes FSM transitions and has a structure:
 * array( array(sourceState, input, targetState[, inputAction]),
 *        array(sourceState, input, targetState[, inputAction]),
 *        array(sourceState, input, targetState[, inputAction]),
 *        ...
 *      )
 * Rules also can be added later by addRules() and addRule() calls.
 *
 * FSM actions are very flexible and may be defined by addEntryAction(), addExitAction(),
 * addInputAction() and addTransitionAction() calls.
 *
 * @param array $states
 * @param array $inputAphabet
 * @param array $rules
 */
public function __construct($states = array(), $inputAphabet = array(), $rules = array())
{
    // States and input symbols have to be known before rules can refer to them
    $this->addStates($states);
    $this->addInputSymbols($inputAphabet);

    $this->addRules($rules);
}
/**
 * Add several states to the state machine, in the given order.
 *
 * @param array $states List of states (integers or strings)
 */
public function addStates($states)
{
    foreach ($states as $newState) {
        $this->addState($newState);
    }
}
/**
 * Add a state to the state machine.
 *
 * As long as the machine has no current state, the added state also
 * becomes the current one.
 *
 * @param integer|string $state
 */
public function addState($state)
{
    $this->_states[$state] = $state;

    if (null === $this->_currentState) {
        $this->_currentState = $state;
    }
}
/**
 * Set the current FSM state directly.
 * This method itself invokes no actions.
 *
 * @param integer|string $state One of the states known to the machine
 * @throws Zend_Search_Exception When $state was never added to the machine
 */
public function setState($state)
{
    $isKnownState = isset($this->_states[$state]);

    if (!$isKnownState) {
        require_once 'Zend/Search/Exception.php';
        throw new Zend_Search_Exception('State \'' . $state . '\' is not on of the possible FSM states.');
    }

    $this->_currentState = $state;
}
/**
 * Get the current FSM state.
 *
 * @return integer|string|null Current state; null while no state has been added
 */
public function getState()
{
    return $this->_currentState;
}
/**
 * Add several symbols to the input alphabet, in the given order.
 *
 * @param array $inputAphabet List of input symbols (integers or strings)
 */
public function addInputSymbols($inputAphabet)
{
    foreach ($inputAphabet as $symbol) {
        $this->addInputSymbol($symbol);
    }
}
/**
 * Add a symbol to the input alphabet.
 *
 * Adding a symbol that is already known has no further effect.
 *
 * @param integer|string $inputSymbol
 */
public function addInputSymbol($inputSymbol)
{
    $this->_inputAphabet[$inputSymbol] = $inputSymbol;
}
/**
 * Add transition rules
 *
 * array structure:
 * array( array(sourceState, input, targetState[, inputAction]),
 *        array(sourceState, input, targetState[, inputAction]),
 *        array(sourceState, input, targetState[, inputAction]),
 *        ...
 *      )
 *
 * Each entry is handed to addRule(); a missing fourth element is passed as null.
 *
 * @param array $rules
 */
public function addRules($rules)
{
    foreach ($rules as $rule) {
        $inputAction = isset($rule[3]) ? $rule[3] : null;

        $this->addRule($rule[0], $rule[1], $rule[2], $inputAction);
    }
}
/**
 * Add a single transition rule: in $sourceState, on $input, go to $targetState.
 *
 * Both states and the input symbol must already be registered, and only one
 * rule may exist per {state, input} pair.
 *
 * @param integer|string $sourceState
 * @param integer|string $input
 * @param integer|string $targetState
 * @param Zend_Search_Lucene_FSMAction|null $inputAction optional action registered via addInputAction()
 * @throws Zend_Search_Exception on an unknown state or symbol, or a duplicate rule
 */
public function addRule($sourceState, $input, $targetState, $inputAction = null)
{
    if (!isset($this->_states[$sourceState])) {
        require_once 'Zend/Search/Exception.php';
        throw new Zend_Search_Exception('Undefined source state (' . $sourceState . ').');
    } elseif (!isset($this->_states[$targetState])) {
        require_once 'Zend/Search/Exception.php';
        throw new Zend_Search_Exception('Undefined target state (' . $targetState . ').');
    } elseif (!isset($this->_inputAphabet[$input])) {
        require_once 'Zend/Search/Exception.php';
        throw new Zend_Search_Exception('Undefined input symbol (' . $input . ').');
    }

    if (isset($this->_rules[$sourceState])) {
        // A rule table for this state exists already: refuse to overwrite an entry.
        if (isset($this->_rules[$sourceState][$input])) {
            require_once 'Zend/Search/Exception.php';
            throw new Zend_Search_Exception('Rule for {state,input} pair (' . $sourceState . ', '. $input . ') is already defined.');
        }
    } else {
        $this->_rules[$sourceState] = array();
    }

    $this->_rules[$sourceState][$input] = $targetState;

    if (null !== $inputAction) {
        $this->addInputAction($sourceState, $input, $inputAction);
    }
}
/**
 * Add a state entry action.
 *
 * Several entry actions may be attached to one state; they are kept in the
 * order of the addEntryAction() calls.
 *
 * @param integer|string $state
 * @param Zend_Search_Lucene_FSMAction $action
 * @throws Zend_Search_Exception if the state has not been registered
 */
public function addEntryAction($state, Zend_Search_Lucene_FSMAction $action)
{
    if (!isset($this->_states[$state])) {
        require_once 'Zend/Search/Exception.php';
        throw new Zend_Search_Exception('Undefined state (' . $state. ').');
    }

    if (isset($this->_entryActions[$state])) {
        $this->_entryActions[$state][] = $action;
    } else {
        $this->_entryActions[$state] = array($action);
    }
}
/**
 * Add a state exit action.
 *
 * Several exit actions may be attached to one state; they are kept in the
 * order of the addExitAction() calls.
 *
 * @param integer|string $state
 * @param Zend_Search_Lucene_FSMAction $action
 * @throws Zend_Search_Exception if the state has not been registered
 */
public function addExitAction($state, Zend_Search_Lucene_FSMAction $action)
{
    if (!isset($this->_states[$state])) {
        require_once 'Zend/Search/Exception.php';
        throw new Zend_Search_Exception('Undefined state (' . $state. ').');
    }

    if (isset($this->_exitActions[$state])) {
        $this->_exitActions[$state][] = $action;
    } else {
        $this->_exitActions[$state] = array($action);
    }
}
/**
 * Add an input action (identified by a {state, input symbol} pair).
 *
 * Several input actions may be attached to one pair; they are kept in the
 * order of the addInputAction() calls.
 *
 * @param integer|string $state
 * @param integer|string $inputSymbol
 * @param Zend_Search_Lucene_FSMAction $action
 * @throws Zend_Search_Exception if the state or the input symbol has not been registered
 */
public function addInputAction($state, $inputSymbol, Zend_Search_Lucene_FSMAction $action)
{
    if (!isset($this->_states[$state])) {
        require_once 'Zend/Search/Exception.php';
        throw new Zend_Search_Exception('Undefined state (' . $state. ').');
    }
    if (!isset($this->_inputAphabet[$inputSymbol])) {
        require_once 'Zend/Search/Exception.php';
        throw new Zend_Search_Exception('Undefined input symbol (' . $inputSymbol. ').');
    }

    // Pair already has an action list: append and finish.
    if (isset($this->_inputActions[$state][$inputSymbol])) {
        $this->_inputActions[$state][$inputSymbol][] = $action;
        return;
    }

    if (!isset($this->_inputActions[$state])) {
        $this->_inputActions[$state] = array();
    }
    $this->_inputActions[$state][$inputSymbol] = array($action);
}
/**
 * Add a transition action (identified by a {source state, target state} pair).
 *
 * Several transition actions may be attached to one pair; they are kept in
 * the order of the addTransitionAction() calls.
 *
 * @param integer|string $sourceState
 * @param integer|string $targetState
 * @param Zend_Search_Lucene_FSMAction $action
 * @throws Zend_Search_Exception if either state has not been registered
 */
public function addTransitionAction($sourceState, $targetState, Zend_Search_Lucene_FSMAction $action)
{
    if (!isset($this->_states[$sourceState])) {
        require_once 'Zend/Search/Exception.php';
        throw new Zend_Search_Exception('Undefined source state (' . $sourceState. ').');
    }
    if (!isset($this->_states[$targetState])) {
        require_once 'Zend/Search/Exception.php';
        // The message names the target state; it previously said "source state".
        throw new Zend_Search_Exception('Undefined target state (' . $targetState. ').');
    }

    if (!isset($this->_transitionActions[$sourceState])) {
        $this->_transitionActions[$sourceState] = array();
    }
    if (!isset($this->_transitionActions[$sourceState][$targetState])) {
        $this->_transitionActions[$sourceState][$targetState] = array();
    }

    $this->_transitionActions[$sourceState][$targetState][] = $action;
}
/**
 * Feed one input symbol to the FSM and perform the matching transition.
 *
 * Actions run in this order: exit actions of the source state (only when the
 * state changes), input actions of the {source state, input} pair, then the
 * current state is switched, then transition actions of the {source, target}
 * pair, and finally entry actions of the target state (only when the state
 * changes).
 *
 * @param mixed $input
 * @throws Zend_Search_Exception if no rule exists for the current state / input
 */
public function process($input)
{
    $sourceState = $this->_currentState;

    if (!isset($this->_rules[$sourceState])) {
        require_once 'Zend/Search/Exception.php';
        throw new Zend_Search_Exception('There is no any rule for current state (' . $sourceState . ').');
    }
    if (!isset($this->_rules[$sourceState][$input])) {
        require_once 'Zend/Search/Exception.php';
        throw new Zend_Search_Exception('There is no any rule for {current state, input} pair (' . $sourceState . ', ' . $input . ').');
    }

    $targetState  = $this->_rules[$sourceState][$input];
    $stateChanges = ($sourceState != $targetState);

    if ($stateChanges && isset($this->_exitActions[$sourceState])) {
        foreach ($this->_exitActions[$sourceState] as $exitAction) {
            $exitAction->doAction();
        }
    }

    if (isset($this->_inputActions[$sourceState][$input])) {
        foreach ($this->_inputActions[$sourceState][$input] as $inputAction) {
            $inputAction->doAction();
        }
    }

    // The state switch happens between the input and the transition actions.
    $this->_currentState = $targetState;

    if (isset($this->_transitionActions[$sourceState][$targetState])) {
        foreach ($this->_transitionActions[$sourceState][$targetState] as $transitionAction) {
            $transitionAction->doAction();
        }
    }

    if ($stateChanges && isset($this->_entryActions[$targetState])) {
        foreach ($this->_entryActions[$targetState] as $entryAction) {
            $entryAction->doAction();
        }
    }
}
/**
 * Reset the FSM to its start state, i.e. the first registered state.
 *
 * No action is invoked.
 *
 * @throws Zend_Search_Exception if no state has been registered
 */
public function reset()
{
    if (count($this->_states) == 0) {
        require_once 'Zend/Search/Exception.php';
        throw new Zend_Search_Exception('There is no any state defined for FSM.');
    }

    // addState() keys every state by its own value, so index 0 exists only when
    // some state happens to be 0. Take the first registered element instead.
    $this->_currentState = reset($this->_states);
}
}

View File

@ -0,0 +1,66 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: FSMAction.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
 * Finite State Machine action
 *
 * Pairs an object with the name of one of its methods so that the method can
 * be invoked later, without arguments, through doAction().
 *
 * @category Zend
 * @package Zend_Search_Lucene
 * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license http://framework.zend.com/license/new-bsd New BSD License
 */
class Zend_Search_Lucene_FSMAction
{
    /**
     * Object the action is performed on
     *
     * @var object
     */
    private $_object;

    /**
     * Name of the method to call on the object
     *
     * @var string
     */
    private $_method;

    /**
     * Object constructor
     *
     * @param object $object receiver of the call
     * @param string $method name of the method invoked by doAction()
     */
    public function __construct($object, $method)
    {
        $this->_method = $method;
        $this->_object = $object;
    }

    /**
     * Invoke the stored method on the stored object with no arguments.
     * The method's return value is discarded.
     */
    public function doAction()
    {
        $this->_object->{$this->_method}();
    }
}

226
thirdparty/Zend/Search/Lucene/Field.php vendored Normal file
View File

@ -0,0 +1,226 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Document
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Field.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
 * A field is a named section of a Document: a name plus a value.
 *
 * Values may be free text or atomic keywords that are not processed any
 * further (dates, urls, etc.). Fields may optionally be stored in the index
 * so that they can be returned with hits on the document.
 *
 * @category Zend
 * @package Zend_Search_Lucene
 * @subpackage Document
 * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license http://framework.zend.com/license/new-bsd New BSD License
 */
class Zend_Search_Lucene_Field
{
    /**
     * Field name
     *
     * @var string
     */
    public $name;

    /**
     * Field value
     *
     * @var string
     */
    public $value;

    /**
     * Field is to be stored in the index for return with search hits.
     *
     * @var boolean
     */
    public $isStored = false;

    /**
     * Field is to be indexed, so that it may be searched on.
     *
     * @var boolean
     */
    public $isIndexed = true;

    /**
     * Field should be tokenized as text prior to indexing.
     *
     * @var boolean
     */
    public $isTokenized = true;

    /**
     * Field is stored as binary.
     *
     * @var boolean
     */
    public $isBinary = false;

    /**
     * Field is stored as a term vector
     *
     * @var boolean
     */
    public $storeTermVector = false;

    /**
     * Field boost factor.
     * It is not stored directly in the index, but affects the normalization factor.
     *
     * @var float
     */
    public $boost = 1.0;

    /**
     * Field value encoding.
     *
     * @var string
     */
    public $encoding;

    /**
     * Object constructor
     *
     * A binary field always gets an empty encoding and is never tokenized,
     * whatever $encoding and $isTokenized say.
     *
     * @param string $name
     * @param string $value
     * @param string $encoding
     * @param boolean $isStored
     * @param boolean $isIndexed
     * @param boolean $isTokenized
     * @param boolean $isBinary
     */
    public function __construct($name, $value, $encoding, $isStored, $isIndexed, $isTokenized, $isBinary = false)
    {
        $this->name      = $name;
        $this->value     = $value;
        $this->isStored  = $isStored;
        $this->isIndexed = $isIndexed;
        $this->isBinary  = $isBinary;

        $this->encoding    = $isBinary ? ''    : $encoding;
        $this->isTokenized = $isBinary ? false : $isTokenized;

        $this->storeTermVector = false;
        $this->boost           = 1.0;
    }

    /**
     * Build a string field that is stored and indexed but not tokenized.
     * Useful for non-text fields, e.g. date or url.
     *
     * @param string $name
     * @param string $value
     * @param string $encoding
     * @return Zend_Search_Lucene_Field
     */
    public static function keyword($name, $value, $encoding = '')
    {
        $isStored    = true;
        $isIndexed   = true;
        $isTokenized = false;

        return new self($name, $value, $encoding, $isStored, $isIndexed, $isTokenized);
    }

    /**
     * Build a string field that is stored for return with hits, but is
     * neither indexed nor tokenized.
     *
     * @param string $name
     * @param string $value
     * @param string $encoding
     * @return Zend_Search_Lucene_Field
     */
    public static function unIndexed($name, $value, $encoding = '')
    {
        $isStored    = true;
        $isIndexed   = false;
        $isTokenized = false;

        return new self($name, $value, $encoding, $isStored, $isIndexed, $isTokenized);
    }

    /**
     * Build a binary field that is stored for return with hits, but is
     * neither indexed nor tokenized. Its encoding is always empty.
     *
     * @param string $name
     * @param string $value
     * @return Zend_Search_Lucene_Field
     */
    public static function binary($name, $value)
    {
        $isStored    = true;
        $isIndexed   = false;
        $isTokenized = false;
        $isBinary    = true;

        return new self($name, $value, '', $isStored, $isIndexed, $isTokenized, $isBinary);
    }

    /**
     * Build a string field that is tokenized, indexed and stored.
     * Useful for short text fields such as "title" or "subject".
     * No term vector is stored for this field.
     *
     * @param string $name
     * @param string $value
     * @param string $encoding
     * @return Zend_Search_Lucene_Field
     */
    public static function text($name, $value, $encoding = '')
    {
        $isStored    = true;
        $isIndexed   = true;
        $isTokenized = true;

        return new self($name, $value, $encoding, $isStored, $isIndexed, $isTokenized);
    }

    /**
     * Build a string field that is tokenized and indexed, but not stored.
     *
     * @param string $name
     * @param string $value
     * @param string $encoding
     * @return Zend_Search_Lucene_Field
     */
    public static function unStored($name, $value, $encoding = '')
    {
        $isStored    = false;
        $isIndexed   = true;
        $isTokenized = true;

        return new self($name, $value, $encoding, $isStored, $isIndexed, $isTokenized);
    }

    /**
     * Get the field value in UTF-8 encoding.
     *
     * A value whose encoding is already 'utf8' / 'utf-8' (case-insensitive) is
     * returned untouched; anything else goes through iconv(). On AIX the
     * source encoding is always taken to be ISO8859-1.
     *
     * @return string
     */
    public function getUtf8Value()
    {
        $alreadyUtf8 = strcasecmp($this->encoding, 'utf8') == 0
                    || strcasecmp($this->encoding, 'utf-8') == 0;

        if ($alreadyUtf8) {
            return $this->value;
        }

        $sourceEncoding = (PHP_OS != 'AIX') ? $this->encoding : 'ISO8859-1';

        return iconv($sourceEncoding, 'UTF-8', $this->value);
    }
}

View File

@ -0,0 +1,268 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: DictionaryLoader.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* Dictionary loader
*
* Helper class that isolates deliberately unstructured code: the file-reading
* calls (readInt/readLong/readVInt/readString, shown in the comments next to each
* step) are inlined by hand to speed up loading of the dictionary index, which
* is a major bottleneck for search performance.
*
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_DictionaryLoader
{
/**
* Dictionary index loader.
*
* It takes a string which is actually <segment_name>.tii index file data and
* returns two parallel lists:
*  - terms:     each entry is array($termFieldNum, $termValue)
*  - termInfos: each entry is array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer)
*
* The first entry must be the special index mark (field number 0xFFFFFFFF);
* on builds with integers wider than 4 bytes its field number is rewritten to -1.
*
* See Zend_Search_Lucene_Index_SegmentInfo class for details
*
* @param string $data raw contents of the .tii file
* @return array array($termDictionary, $termInfos)
* @throws Zend_Search_Lucene_Exception on an unknown format version, a term count below 1,
*         a term count that does not fit (32-bit mode), or a missing special index mark
*/
public static function load($data)
{
$termDictionary = array();
$termInfos = array();
// $pos is the byte cursor into $data; every inlined read advances it.
$pos = 0;
// Header, field 1: format version (4 bytes, big-endian).
// $tiVersion = $tiiFile->readInt();
$tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8 | ord($data[3]);
$pos += 4;
if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
$tiVersion != (int)0xFFFFFFFD /* 2.1+ format */) {
require_once 'Zend/Search/Lucene/Exception.php';
throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
}
// Header, field 2: number of index terms (8 bytes, big-endian).
// $indexTermCount = $tiiFile->readLong();
if (PHP_INT_SIZE > 4) {
$indexTermCount = ord($data[$pos]) << 56 |
ord($data[$pos+1]) << 48 |
ord($data[$pos+2]) << 40 |
ord($data[$pos+3]) << 32 |
ord($data[$pos+4]) << 24 |
ord($data[$pos+5]) << 16 |
ord($data[$pos+6]) << 8 |
ord($data[$pos+7]);
} else {
// 32-bit integers: the upper four bytes and the sign bit of the lower
// four must be zero, otherwise the value cannot be represented.
if ((ord($data[$pos]) != 0) ||
(ord($data[$pos+1]) != 0) ||
(ord($data[$pos+2]) != 0) ||
(ord($data[$pos+3]) != 0) ||
((ord($data[$pos+4]) & 0x80) != 0)) {
require_once 'Zend/Search/Lucene/Exception.php';
throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
}
$indexTermCount = ord($data[$pos+4]) << 24 |
ord($data[$pos+5]) << 16 |
ord($data[$pos+6]) << 8 |
ord($data[$pos+7]);
}
$pos += 8;
// Header, field 3: index interval (4 bytes) - skipped, the value is not used here.
// $tiiFile->readInt(); // IndexInterval
$pos += 4;
// Header, field 4: skip interval (4 bytes, big-endian).
// $skipInterval = $tiiFile->readInt();
$skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8 | ord($data[$pos+3]);
$pos += 4;
if ($indexTermCount < 1) {
require_once 'Zend/Search/Lucene/Exception.php';
throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index');
}
if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
/* Skip MaxSkipLevels value */
$pos += 4;
}
// The three pointers are stored as deltas and accumulated across entries;
// each term is stored as a shared-prefix length plus a suffix relative to
// the previous term.
$prevTerm = '';
$freqPointer = 0;
$proxPointer = 0;
$indexPointer = 0;
for ($count = 0; $count < $indexTermCount; $count++) {
// Variable-length integer: 7 payload bits per byte, least significant
// group first, high bit set on every byte except the last.
//$termPrefixLength = $tiiFile->readVInt();
$nbyte = ord($data[$pos++]);
$termPrefixLength = $nbyte & 0x7F;
for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
$nbyte = ord($data[$pos++]);
$termPrefixLength |= ($nbyte & 0x7F) << $shift;
}
// String: a VInt length followed by the character data. The loop below
// pulls in extra bytes for every multi-byte lead byte it finds, so $len
// ends up as a byte count.
// $termSuffix = $tiiFile->readString();
$nbyte = ord($data[$pos++]);
$len = $nbyte & 0x7F;
for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
$nbyte = ord($data[$pos++]);
$len |= ($nbyte & 0x7F) << $shift;
}
if ($len == 0) {
$termSuffix = '';
} else {
$termSuffix = substr($data, $pos, $len);
$pos += $len;
for ($count1 = 0; $count1 < $len; $count1++ ) {
if (( ord($termSuffix[$count1]) & 0xC0 ) == 0xC0) {
// Lead byte of a multi-byte character: fetch its continuation bytes.
$addBytes = 1;
if (ord($termSuffix[$count1]) & 0x20 ) {
$addBytes++;
// Never used for Java Lucene created index.
// Java2 doesn't encode strings in four bytes
if (ord($termSuffix[$count1]) & 0x10 ) {
$addBytes++;
}
}
$termSuffix .= substr($data, $pos, $addBytes);
$pos += $addBytes;
$len += $addBytes;
// Check for null character. Java2 encodes null character
// in two bytes.
if (ord($termSuffix[$count1]) == 0xC0 &&
ord($termSuffix[$count1+1]) == 0x80 ) {
// NOTE(review): assigning integer 0 to a string offset appears to store
// the character '0' rather than a NUL byte in PHP - confirm the intent.
$termSuffix[$count1] = 0;
$termSuffix = substr($termSuffix,0,$count1+1)
. substr($termSuffix,$count1+2);
}
$count1 += $addBytes;
}
}
}
// Inlined prefix extraction: $pb counts bytes and $pc counts characters of
// $prevTerm until $termPrefixLength characters have been covered.
// $termValue = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . $termSuffix;
$pb = 0; $pc = 0;
while ($pb < strlen($prevTerm) && $pc < $termPrefixLength) {
$charBytes = 1;
if ((ord($prevTerm[$pb]) & 0xC0) == 0xC0) {
$charBytes++;
if (ord($prevTerm[$pb]) & 0x20 ) {
$charBytes++;
if (ord($prevTerm[$pb]) & 0x10 ) {
$charBytes++;
}
}
}
// NOTE(review): this bound uses strlen($data) although the loop walks
// $prevTerm; strlen($prevTerm) looks like the intended limit - confirm.
if ($pb + $charBytes > strlen($data)) {
// wrong character
break;
}
$pc++;
$pb += $charBytes;
}
$termValue = substr($prevTerm, 0, $pb) . $termSuffix;
// $termFieldNum = $tiiFile->readVInt();
$nbyte = ord($data[$pos++]);
$termFieldNum = $nbyte & 0x7F;
for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
$nbyte = ord($data[$pos++]);
$termFieldNum |= ($nbyte & 0x7F) << $shift;
}
// $docFreq = $tiiFile->readVInt();
$nbyte = ord($data[$pos++]);
$docFreq = $nbyte & 0x7F;
for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
$nbyte = ord($data[$pos++]);
$docFreq |= ($nbyte & 0x7F) << $shift;
}
// $freqPointer += $tiiFile->readVInt();
$nbyte = ord($data[$pos++]);
$vint = $nbyte & 0x7F;
for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
$nbyte = ord($data[$pos++]);
$vint |= ($nbyte & 0x7F) << $shift;
}
$freqPointer += $vint;
// $proxPointer += $tiiFile->readVInt();
$nbyte = ord($data[$pos++]);
$vint = $nbyte & 0x7F;
for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
$nbyte = ord($data[$pos++]);
$vint |= ($nbyte & 0x7F) << $shift;
}
$proxPointer += $vint;
// A skip delta is present in the data only when the term's document
// frequency reaches the skip interval; otherwise it is taken as 0.
if( $docFreq >= $skipInterval ) {
// $skipDelta = $tiiFile->readVInt();
$nbyte = ord($data[$pos++]);
$vint = $nbyte & 0x7F;
for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
$nbyte = ord($data[$pos++]);
$vint |= ($nbyte & 0x7F) << $shift;
}
$skipDelta = $vint;
} else {
$skipDelta = 0;
}
// $indexPointer += $tiiFile->readVInt();
$nbyte = ord($data[$pos++]);
$vint = $nbyte & 0x7F;
for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
$nbyte = ord($data[$pos++]);
$vint |= ($nbyte & 0x7F) << $shift;
}
$indexPointer += $vint;
// Plain arrays are stored instead of Term / TermInfo objects.
// $this->_termDictionary[] = new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
$termDictionary[] = array($termFieldNum, $termValue);
$termInfos[] =
// new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
$prevTerm = $termValue;
}
// Check special index entry mark
if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
require_once 'Zend/Search/Lucene/Exception.php';
throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
}
if (PHP_INT_SIZE > 4) {
// Treat 64-bit 0xFFFFFFFF as -1
$termDictionary[0][0] = -1;
}
return array($termDictionary, $termInfos);
}
}

View File

@ -0,0 +1,59 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: DocsFilter.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* A Zend_Search_Lucene_Index_DocsFilter is used to filter documents while searching.
*
* It may or may not be used for actual filtering: it is only a hint that an
* enclosing (upper-level) query limits the search result to the listed documents.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_DocsFilter
{
/**
* Set of segment filters, keyed by segment name. Each filter is keyed by
* document id; the values carry no meaning:
* array( <segmentName> => array(<docId> => <undefined_value>,
* <docId> => <undefined_value>,
* <docId> => <undefined_value>,
* ... ),
* <segmentName> => array(<docId> => <undefined_value>,
* <docId> => <undefined_value>,
* <docId> => <undefined_value>,
* ... ),
* <segmentName> => array(<docId> => <undefined_value>,
* <docId> => <undefined_value>,
* <docId> => <undefined_value>,
* ... ),
* ...
* )
*
* @var array
*/
public $segmentFilters = array();
}

View File

@ -0,0 +1,50 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: FieldInfo.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
 * Plain value holder describing one field of an index segment.
 *
 * The constructor stores every argument unchanged; no validation or type
 * conversion takes place.
 *
 * @category Zend
 * @package Zend_Search_Lucene
 * @subpackage Index
 * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license http://framework.zend.com/license/new-bsd New BSD License
 */
class Zend_Search_Lucene_Index_FieldInfo
{
    /** @var string field name */
    public $name;

    /** @var mixed truthy when the field is indexed */
    public $isIndexed;

    /** @var integer field number */
    public $number;

    /** @var mixed truthy when term vectors are stored */
    public $storeTermVector;

    /** @var mixed truthy when norms are omitted */
    public $normsOmitted;

    /** @var mixed truthy when payloads are stored */
    public $payloadsStored;

    /**
     * Object constructor
     *
     * @param string  $name
     * @param mixed   $isIndexed
     * @param integer $number
     * @param mixed   $storeTermVector
     * @param mixed   $normsOmitted
     * @param mixed   $payloadsStored
     */
    public function __construct($name, $isIndexed, $number, $storeTermVector, $normsOmitted = false, $payloadsStored = false)
    {
        // Identity of the field
        $this->name   = $name;
        $this->number = $number;

        // Flags
        $this->isIndexed       = $isIndexed;
        $this->storeTermVector = $storeTermVector;
        $this->normsOmitted    = $normsOmitted;
        $this->payloadsStored  = $payloadsStored;
    }
}

View File

@ -0,0 +1,2130 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: SegmentInfo.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Index_TermsStream_Interface */
require_once 'Zend/Search/Lucene/Index/TermsStream/Interface.php';
/** Zend_Search_Lucene_Search_Similarity */
require_once 'Zend/Search/Lucene/Search/Similarity.php';
/** Zend_Search_Lucene_Index_FieldInfo */
require_once 'Zend/Search/Lucene/Index/FieldInfo.php';
/** Zend_Search_Lucene_Index_Term */
require_once 'Zend/Search/Lucene/Index/Term.php';
/** Zend_Search_Lucene_Index_TermInfo */
require_once 'Zend/Search/Lucene/Index/TermInfo.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_SegmentInfo implements Zend_Search_Lucene_Index_TermsStream_Interface
{
/**
* "Full scan vs fetch" boundary.
*
* If filter selectivity is less than this value, then full scan is performed
* (since term entries fetching has some additional overhead).
*/
const FULL_SCAN_VS_FETCH_BOUNDARY = 5;
/**
* Number of docs in a segment
*
* @var integer
*/
private $_docCount;
/**
* Segment name
*
* @var string
*/
private $_name;
/**
* Term Dictionary Index
*
* Array of arrays (Zend_Search_Lucene_Index_Term objects are represented as arrays because
* of performance considerations)
* [0] -> $termValue
* [1] -> $termFieldNum
*
* Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
*
* @var array
*/
private $_termDictionary;
/**
* Term Dictionary Index TermInfos
*
* Array of arrays (Zend_Search_Lucene_Index_TermInfo objects are represented as arrays because
* of performance considerations)
* [0] -> $docFreq
* [1] -> $freqPointer
* [2] -> $proxPointer
* [3] -> $skipOffset
* [4] -> $indexPointer
*
* @var array
*/
private $_termDictionaryInfos;
/**
* Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
*
* @var array
*/
private $_fields;
/**
* Field positions in a dictionary.
* (Term dictionary contains fields ordered by names)
*
* @var array
*/
private $_fieldsDicPositions;
/**
* Associative array where the key is the file name and the value is data offset
* in a compound segment file (.cfs).
*
* @var array
*/
private $_segFiles;
/**
* Associative array where the key is the file name and the value is file size (.cfs).
*
* @var array
*/
private $_segFileSizes;
/**
* Delete file generation number
*
* -2 means autodetect latest delete generation
* -1 means 'there is no delete file'
* 0 means pre-2.1 format delete file
* X specifies used delete file
*
* @var integer
*/
private $_delGen;
/**
* Segment has single norms file
*
* If true then one .nrm file is used for all fields
* Otherwise .fN files are used
*
* @var boolean
*/
private $_hasSingleNormFile;
/**
* Use compound segment file (*.cfs) to collect all other segment files
* (excluding .del files)
*
* @var boolean
*/
private $_isCompound;
/**
* File system adapter.
*
* @var Zend_Search_Lucene_Storage_Directory_Filesystem
*/
private $_directory;
/**
* Normalization factors.
* An array fieldName => normVector
* normVector is a binary string.
* Each byte corresponds to an indexed document in a segment and
* encodes normalization factor (float value, encoded by
* Zend_Search_Lucene_Search_Similarity::encodeNorm())
*
* @var array
*/
private $_norms = array();
/**
* List of deleted documents.
* bitset if bitset extension is loaded or array otherwise.
*
* @var mixed
*/
private $_deleted = null;
/**
* $this->_deleted update flag
*
* @var boolean
*/
private $_deletedDirty = false;
/**
* True if segment uses shared doc store
*
* @var boolean
*/
private $_usesSharedDocStore;
/**
* Shared doc store options.
* It's an associative array with the following items:
* - 'offset' => $docStoreOffset The starting document in the shared doc store files where this segment's documents begin
* - 'segment' => $docStoreSegment The name of the segment that has the shared doc store files.
* - 'isCompound' => $docStoreIsCompoundFile True, if compound file format is used for the shared doc store files (.cfx file).
*
* @var array
*/
private $_sharedDocStoreOptions;
/**
 * Zend_Search_Lucene_Index_SegmentInfo constructor
 *
 * Reads the segment's metadata from the directory:
 *  - the file table of the shared doc store compound file (.cfx), when shared
 *    doc store options are given and marked as compound;
 *  - the file table of the segment compound file (.cfs), when the segment is
 *    compound (auto-detected if $isCompound is null);
 *  - the field infos (.fnm);
 *  - the deletions file.
 *
 * @param Zend_Search_Lucene_Storage_Directory $directory
 * @param string     $name              segment name
 * @param integer    $docCount          number of docs in the segment
 * @param integer    $delGen            delete file generation; -2 triggers auto-detection
 * @param array|null $docStoreOptions   shared doc store options, null if not used
 * @param boolean    $hasSingleNormFile
 * @param boolean    $isCompound        null means "detect by trying to open the .cfs file"
 */
public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name, $docCount, $delGen = 0, $docStoreOptions = null, $hasSingleNormFile = false, $isCompound = null)
{
    $this->_directory = $directory;
    $this->_name      = $name;
    $this->_docCount  = $docCount;

    if ($docStoreOptions !== null) {
        $this->_usesSharedDocStore    = true;
        $this->_sharedDocStoreOptions = $docStoreOptions;

        if ($docStoreOptions['isCompound']) {
            $storeFileName = $docStoreOptions['segment'] . '.cfx';
            $storeFile     = $this->_directory->getFileObject($storeFileName);
            $entryCount    = $storeFile->readVInt();

            $storeOffsets = array();
            $storeSizes   = array();

            // Each table entry is (offset, name). The size of an entry is only
            // known once the next entry's offset has been read.
            for ($entry = 0; $entry < $entryCount; $entry++) {
                $entryOffset = $storeFile->readLong();
                if ($entry != 0) {
                    $storeSizes[$entryName] = $entryOffset - end($storeOffsets);
                }
                $entryName = $storeFile->readString();
                $storeOffsets[$entryName] = $entryOffset;
            }
            if ($entry != 0) {
                // The last entry extends to the end of the compound file.
                $storeSizes[$entryName] = $this->_directory->fileLength($storeFileName) - $entryOffset;
            }

            $this->_sharedDocStoreOptions['files']     = $storeOffsets;
            $this->_sharedDocStoreOptions['fileSizes'] = $storeSizes;
        }
    }

    $this->_hasSingleNormFile = $hasSingleNormFile;
    $this->_delGen            = $delGen;
    $this->_termDictionary    = null;

    if ($isCompound === null) {
        // It's a pre-2.1 segment or isCompound is set to 'unknown':
        // detect whether the segment uses a compound file by opening it.
        require_once 'Zend/Search/Lucene/Exception.php';
        try {
            $this->_directory->getFileObject($name . '.cfs');
            $this->_isCompound = true;
        } catch (Zend_Search_Lucene_Exception $e) {
            if (strpos($e->getMessage(), 'is not readable') === false) {
                // Any other failure is passed on to the caller.
                throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
            }
            // Compound file is not found or is not readable
            $this->_isCompound = false;
        }
    } else {
        $this->_isCompound = $isCompound;
    }

    $this->_segFiles = array();
    if ($this->_isCompound) {
        $compoundFileName = $name . '.cfs';
        $compoundFile     = $this->_directory->getFileObject($compoundFileName);
        $segFilesCount    = $compoundFile->readVInt();

        for ($entry = 0; $entry < $segFilesCount; $entry++) {
            $entryOffset = $compoundFile->readLong();
            if ($entry != 0) {
                $this->_segFileSizes[$entryName] = $entryOffset - end($this->_segFiles);
            }
            $entryName = $compoundFile->readString();
            $this->_segFiles[$entryName] = $entryOffset;
        }
        if ($entry != 0) {
            $this->_segFileSizes[$entryName] = $this->_directory->fileLength($compoundFileName) - $entryOffset;
        }
    }

    $fieldInfosFile = $this->openCompoundFile('.fnm');
    $fieldsCount    = $fieldInfosFile->readVInt();

    $namesByNumber = array();
    $fieldNumbers  = array();
    $this->_fields = array();

    for ($fieldNum = 0; $fieldNum < $fieldsCount; $fieldNum++) {
        $fieldName = $fieldInfosFile->readString();
        $fieldBits = $fieldInfosFile->readByte();

        $this->_fields[$fieldNum] = new Zend_Search_Lucene_Index_FieldInfo(
            $fieldName,
            $fieldBits & 0x01 /* field is indexed */,
            $fieldNum,
            $fieldBits & 0x02 /* termvectors are stored */,
            $fieldBits & 0x10 /* norms are omitted */,
            $fieldBits & 0x20 /* payloads are stored */
        );

        if ($fieldBits & 0x10) {
            // Norms are omitted for this field: use the encoded norm of 1.0 for every document.
            $this->_norms[$fieldNum] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount);
        }

        $fieldNumbers[$fieldNum]  = $fieldNum;
        $namesByNumber[$fieldNum] = $fieldName;
    }

    // Sort the field numbers by field name, then map number => position in that order.
    array_multisort($namesByNumber, SORT_ASC, SORT_REGULAR, $fieldNumbers);
    $this->_fieldsDicPositions = array_flip($fieldNumbers);

    if ($this->_delGen == -2) {
        // SegmentInfo constructor is invoked from index writer:
        // autodetect the current delete file generation number.
        $this->_delGen = $this->_detectLatestDelGen();
    }

    // Load deletions
    $this->_deleted = $this->_loadDelFile();
}
/**
 * Load the deletions file selected by the current delete generation.
 *
 * The generation number ($this->_delGen) decides which loader is used:
 *  - -1: the segment has no deletions file, so null is returned;
 *  -  0: a pre-2.1 format deletions file is loaded;
 *  - anything else: a 2.1+ format deletions file is loaded.
 *
 * Returns bitset or an array depending on bitset extension availability
 *
 * @return mixed
 * @throws Zend_Search_Lucene_Exception
 */
private function _loadDelFile()
{
    // No deletions file exists for this segment
    if ($this->_delGen == -1) {
        return null;
    }

    // Generation 0 marks the pre-2.1 deletions file format
    if ($this->_delGen == 0) {
        return $this->_loadPre21DelFile();
    }

    // Every other generation uses the 2.1+ deletions file format
    return $this->_load21DelFile();
}
/**
 * Load pre-2.1 deletions file ('<segment name>.del')
 *
 * Returns bitset or an array depending on bitset extension availability.
 * Without the bitset extension the result is an array keyed by the ids of
 * deleted documents (each value is 1).
 *
 * If the file cannot be read (exception message contains 'is not readable'),
 * the segment is treated as having no deletions: $this->_delGen is reset
 * to -1 and null is returned. Any other storage exception is re-thrown.
 *
 * @return mixed
 * @throws Zend_Search_Lucene_Exception
 */
private function _loadPre21DelFile()
{
    require_once 'Zend/Search/Lucene/Exception.php';
    try {
        // '.del' files always stored in a separate file
        // Segment compound is not used
        $delFile = $this->_directory->getFileObject($this->_name . '.del');

        // The first integer is converted from a bit size to a byte size
        $byteCount = $delFile->readInt();
        $byteCount = ceil($byteCount/8);
        $bitCount  = $delFile->readInt();

        if ($bitCount == 0) {
            $delBytes = '';
        } else {
            $delBytes = $delFile->readBytes($byteCount);
        }

        if (extension_loaded('bitset')) {
            return $delBytes;
        }

        // Decode only the bytes that were actually loaded. When $bitCount is 0
        // $delBytes is empty, and indexing it up to $byteCount would read
        // uninitialized string offsets.
        $decodableBytes = min($byteCount, strlen($delBytes));

        $deletions = array();
        for ($count = 0; $count < $decodableBytes; $count++) {
            $byte = ord($delBytes[$count]);
            for ($bit = 0; $bit < 8; $bit++) {
                if ($byte & (1<<$bit)) {
                    $deletions[$count*8 + $bit] = 1;
                }
            }
        }

        return $deletions;
    } catch(Zend_Search_Lucene_Exception $e) {
        if (strpos($e->getMessage(), 'is not readable') === false) {
            throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
        }
        // There is no deletion file
        $this->_delGen = -1;

        return null;
    }
}
/**
 * Load 2.1+ format deletions file ('<segment name>_<generation in base 36>.del')
 *
 * Two layouts are handled, selected by the first integer of the file:
 *  - 0xFFFFFFFF: a sequence of (VInt gap, byte) pairs, where each gap is added
 *    to a running byte number and the byte carries the deletion bits of that
 *    position;
 *  - anything else: the value is a bit size, followed by the number of set
 *    bits and the raw bit vector bytes.
 *
 * Returns bitset or an array depending on bitset extension availability.
 * Without the bitset extension null is returned when no deleted document
 * is found.
 *
 * @return mixed
 */
private function _load21DelFile()
{
    $delFileName = $this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del';
    $delFile     = $this->_directory->getFileObject($delFileName);

    $format = $delFile->readInt();

    if ($format == (int)0xFFFFFFFF) {
        $useBitset = extension_loaded('bitset');
        $deletions = $useBitset ? bitset_empty() : array();

        // Two header integers precede the entries. Their values are not
        // needed here, but they must be consumed to reach the first entry.
        $delFile->readInt(); // byte count
        $delFile->readInt(); // bit count

        $delFileSize = $this->_directory->fileLength($delFileName);
        $byteNum = 0;

        // Decode EVERY entry up to the end of the file. Returning from inside
        // the loop would stop after the first non-zero byte and lose all
        // remaining deletions.
        do {
            $dgap        = $delFile->readVInt();
            $nonZeroByte = $delFile->readByte();

            $byteNum += $dgap;

            for ($bit = 0; $bit < 8; $bit++) {
                if ($nonZeroByte & (1<<$bit)) {
                    if ($useBitset) {
                        bitset_incl($deletions, $byteNum*8 + $bit);
                    } else {
                        $deletions[$byteNum*8 + $bit] = 1;
                    }
                }
            }
        } while ($delFile->tell() < $delFileSize);

        if ($useBitset) {
            return $deletions;
        }

        return (count($deletions) > 0) ? $deletions : null;
    }

    // $format is actually byte count
    $byteCount = ceil($format/8);
    $bitCount  = $delFile->readInt();

    if ($bitCount == 0) {
        $delBytes = '';
    } else {
        $delBytes = $delFile->readBytes($byteCount);
    }

    if (extension_loaded('bitset')) {
        return $delBytes;
    }

    // Decode only the bytes that were actually loaded ($delBytes is empty
    // when $bitCount is 0), so no uninitialized string offset is read.
    $decodableBytes = min($byteCount, strlen($delBytes));

    $deletions = array();
    for ($count = 0; $count < $decodableBytes; $count++) {
        $byte = ord($delBytes[$count]);
        for ($bit = 0; $bit < 8; $bit++) {
            if ($byte & (1<<$bit)) {
                $deletions[$count*8 + $bit] = 1;
            }
        }
    }

    return (count($deletions) > 0) ? $deletions : null;
}
/**
 * Opens index file stored within compound index file
 *
 * The returned file object is positioned at the start of the requested data.
 * For '.fdx'/'.fdt' requests on a segment that uses a shared doc store, the
 * file is additionally advanced to the section belonging to this segment
 * (using the 'offset' entry of the shared doc store options).
 *
 * @param string $extension     file extension including the leading dot (e.g. '.fnm')
 * @param boolean $shareHandler passed through to the directory's getFileObject()
 * @throws Zend_Search_Lucene_Exception if a compound file doesn't contain the requested file
 * @return Zend_Search_Lucene_Storage_File
 */
public function openCompoundFile($extension, $shareHandler = true)
{
    if (($extension == '.fdx' || $extension == '.fdt') && $this->_usesSharedDocStore) {
        $fdxFName = $this->_sharedDocStoreOptions['segment'] . '.fdx';
        $fdtFName = $this->_sharedDocStoreOptions['segment'] . '.fdt';

        if (!$this->_sharedDocStoreOptions['isCompound']) {
            // Shared doc store files are stored as separate files
            $fdxFile = $this->_directory->getFileObject($fdxFName, $shareHandler);
            $fdxFile->seek($this->_sharedDocStoreOptions['offset']*8, SEEK_CUR);

            if ($extension == '.fdx') {
                // '.fdx' file is requested
                return $fdxFile;
            } else {
                // '.fdt' file is requested
                $fdtStartOffset = $fdxFile->readLong();

                $fdtFile = $this->_directory->getFileObject($fdtFName, $shareHandler);
                $fdtFile->seek($fdtStartOffset, SEEK_CUR);

                return $fdtFile;
            }
        }

        if( !isset($this->_sharedDocStoreOptions['files'][$fdxFName]) ) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Shared doc storage segment compound file doesn\'t contain '
                                                 . $fdxFName . ' file.' );
        }
        if( !isset($this->_sharedDocStoreOptions['files'][$fdtFName]) ) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Shared doc storage segment compound file doesn\'t contain '
                                                 . $fdtFName . ' file.' );
        }

        // Open shared docstore segment file
        $cfxFile = $this->_directory->getFileObject($this->_sharedDocStoreOptions['segment'] . '.cfx', $shareHandler);
        // Seek to the start of '.fdx' file within compound file
        $cfxFile->seek($this->_sharedDocStoreOptions['files'][$fdxFName]);
        // Seek to the start of current segment documents section
        $cfxFile->seek($this->_sharedDocStoreOptions['offset']*8, SEEK_CUR);

        if ($extension == '.fdx') {
            // '.fdx' file is requested
            return $cfxFile;
        }

        // '.fdt' file is requested
        $fdtStartOffset = $cfxFile->readLong();

        // Seek to the start of '.fdt' file within compound file
        $cfxFile->seek($this->_sharedDocStoreOptions['files'][$fdtFName]);
        // Seek to the start of current segment documents section
        $cfxFile->seek($fdtStartOffset, SEEK_CUR);

        // The '.fdt' data lives inside the compound file, so the positioned
        // compound file object is the result. No separate $fdtFile object
        // exists in this branch.
        return $cfxFile;
    }

    $filename = $this->_name . $extension;

    if (!$this->_isCompound) {
        return $this->_directory->getFileObject($filename, $shareHandler);
    }

    if( !isset($this->_segFiles[$filename]) ) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Segment compound file doesn\'t contain '
                                             . $filename . ' file.' );
    }

    $file = $this->_directory->getFileObject($this->_name . '.cfs', $shareHandler);
    $file->seek($this->_segFiles[$filename]);

    return $file;
}
/**
 * Get the length of a segment file, whether it is stored on its own or
 * inside a compound file.
 *
 * '.fdx' and '.fdt' lengths are resolved against the shared doc store when
 * the segment uses one. Every other file is looked up as a standalone file
 * first, then in the compound file size table.
 *
 * @param string $extension
 * @return integer
 * @throws Zend_Search_Lucene_Exception if the file is not part of the compound file
 */
public function compoundFileLength($extension)
{
    $isDocStoreFile = ($extension == '.fdx' || $extension == '.fdt');

    if ($isDocStoreFile && $this->_usesSharedDocStore) {
        $sharedFileName = $this->_sharedDocStoreOptions['segment'] . $extension;

        if ($this->_sharedDocStoreOptions['isCompound']) {
            if (isset($this->_sharedDocStoreOptions['fileSizes'][$sharedFileName])) {
                return $this->_sharedDocStoreOptions['fileSizes'][$sharedFileName];
            }

            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Shared doc store compound file doesn\'t contain '
                                                 . $sharedFileName . ' file.' );
        }

        // Shared doc store files are kept as separate files
        return $this->_directory->fileLength($sharedFileName);
    }

    $segmentFileName = $this->_name . $extension;

    // A standalone file takes precedence over the compound file entry
    if ($this->_directory->fileExists($segmentFileName)) {
        return $this->_directory->fileLength($segmentFileName);
    }

    if (isset($this->_segFileSizes[$segmentFileName])) {
        return $this->_segFileSizes[$segmentFileName];
    }

    require_once 'Zend/Search/Lucene/Exception.php';
    throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
                                         . $segmentFileName . ' file.' );
}
/**
 * Look up the number of the field with the given name.
 *
 * @param string $fieldName
 * @return integer field number, or -1 if the segment has no such field
 */
public function getFieldNum($fieldName)
{
    foreach ($this->_fields as $fieldInfo) {
        if ($fieldInfo->name != $fieldName) {
            continue;
        }

        return $fieldInfo->number;
    }

    // No field with this name
    return -1;
}
/**
 * Get the field info object stored under the given field number.
 *
 * @param integer $fieldNum
 * @return Zend_Search_Lucene_Index_FieldInfo
 */
public function getField($fieldNum)
{
    $fieldInfo = $this->_fields[$fieldNum];

    return $fieldInfo;
}
/**
 * Get the names of the segment fields.
 * The result maps each field name to itself: array(name => name, ...).
 * If $indexed is true, only indexed fields are included.
 *
 * @param boolean $indexed
 * @return array
 */
public function getFields($indexed = false)
{
    $fieldNames = array();

    foreach ($this->_fields as $fieldInfo) {
        // Skip non-indexed fields when only indexed ones are requested
        if ($indexed && !$fieldInfo->isIndexed) {
            continue;
        }

        $fieldNames[$fieldInfo->name] = $fieldInfo->name;
    }

    return $fieldNames;
}
/**
 * Get all FieldInfo objects of the segment, keyed by field number.
 *
 * @return array
 */
public function getFieldInfos()
{
    $fieldInfos = $this->_fields;

    return $fieldInfos;
}
/**
 * Get the generation number of the deletions file currently in use.
 *
 * @return integer
 */
public function getDelGen()
{
    $generation = $this->_delGen;

    return $generation;
}
/**
 * Get the total number of documents in this segment.
 * Deleted documents are included in the count.
 *
 * @return integer
 */
public function count()
{
    $totalDocs = $this->_docCount;

    return $totalDocs;
}
/**
 * Count the documents marked as deleted.
 *
 * @return integer
 */
private function _deletedCount()
{
    // null means that no deletions are registered for this segment
    if ($this->_deleted === null) {
        return 0;
    }

    // With the bitset extension deletions are held as a bitset, which has
    // to be expanded into an array before it can be counted
    $deletedIds = extension_loaded('bitset')
                ? bitset_to_array($this->_deleted)
                : $this->_deleted;

    return count($deletedIds);
}
/**
 * Get the number of documents in this segment that are not deleted.
 *
 * @return integer
 */
public function numDocs()
{
    return $this->hasDeletions()
         ? $this->_docCount - $this->_deletedCount()
         : $this->_docCount;
}
/**
 * Translate a field number into its position in the fields dictionary.
 *
 * @param integer $fieldNum
 * @return integer
 */
private function _getFieldPosition($fieldNum) {
    if (isset($this->_fieldsDicPositions[$fieldNum])) {
        return $this->_fieldsDicPositions[$fieldNum];
    }

    // Values missing from the translation table are used as they are
    return $fieldNum;
}
/**
 * Get the name of this segment.
 *
 * @return string
 */
public function getName()
{
    $segmentName = $this->_name;

    return $segmentName;
}
/**
 * TermInfo cache
 *
 * Maps a term key (Zend_Search_Lucene_Index_Term::key()) to the term info
 * found for it by getTermInfo(). An entry that is hit is re-inserted at the
 * end of the array, so the oldest entries are always at the front.
 *
 * Size is 1024.
 * Numbers are used instead of class constants because of performance considerations
 *
 * @var array
 */
private $_termInfoCache = array();
/**
 * Drop entries from the front of the term info cache.
 *
 * Entries are removed in insertion order and removal stops as soon as
 * exactly 768 entries remain. For a full cache of 1024 entries this
 * discards the 256 oldest ones.
 */
private function _cleanUpTermInfoCache()
{
    foreach (array_keys($this->_termInfoCache) as $cacheKey) {
        unset($this->_termInfoCache[$cacheKey]);

        // leave 768 last used term infos
        if (count($this->_termInfoCache) == 768) {
            return;
        }
    }
}
/**
 * Load terms dictionary index
 *
 * A serialized copy ('<segment name>.sti') is used when it exists and can be
 * unserialized. Otherwise the index is built from the '.tii' file and a new
 * '.sti' copy is written.
 *
 * @throws Zend_Search_Lucene_Exception
 */
private function _loadDictionaryIndex()
{
    $stiFileName = $this->_name . '.sti';

    // Check, if index is already serialized
    if ($this->_directory->fileExists($stiFileName)) {
        // Load serialized dictionary index data
        $stiFile     = $this->_directory->getFileObject($stiFileName);
        $stiFileData = $stiFile->readBytes($this->_directory->fileLength($stiFileName));

        // NOTE(review): unserialize() runs on index file content - presumably
        // the index directory is trusted; confirm.
        $unserializedData = @unserialize($stiFileData);
        if ($unserializedData !== false) {
            list($this->_termDictionary, $this->_termDictionaryInfos) = $unserializedData;
            return;
        }
        // Unreadable serialized data: fall through and rebuild it
    }

    // Load data from .tii file and generate .sti file

    // Prefetch dictionary index data
    $tiiFile     = $this->openCompoundFile('.tii');
    $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii'));

    /** Zend_Search_Lucene_Index_DictionaryLoader */
    require_once 'Zend/Search/Lucene/Index/DictionaryLoader.php';

    // Load dictionary index data
    $loadedIndex = Zend_Search_Lucene_Index_DictionaryLoader::load($tiiFileData);
    list($this->_termDictionary, $this->_termDictionaryInfos) = $loadedIndex;

    // Store a serialized copy for subsequent loads
    $serializedIndex = serialize(array($this->_termDictionary, $this->_termDictionaryInfos));
    $stiFile = $this->_directory->createFile($stiFileName);
    $stiFile->writeBytes($serializedIndex);
}
/**
 * Scans terms dictionary and returns term info
 *
 * Lookup order:
 *  1. the in-memory term info cache;
 *  2. binary search in the terms dictionary index (exact hit);
 *  3. sequential scan of the '.tis' file, starting from the closest
 *     preceding dictionary index entry.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @return Zend_Search_Lucene_Index_TermInfo|null null if the term is not in this segment
 * @throws Zend_Search_Lucene_Exception if the '.tis' file has an unknown format version
 */
public function getTermInfo(Zend_Search_Lucene_Index_Term $term)
{
    $termKey = $term->key();
    if (isset($this->_termInfoCache[$termKey])) {
        $termInfo = $this->_termInfoCache[$termKey];

        // Move termInfo to the end of cache
        unset($this->_termInfoCache[$termKey]);
        $this->_termInfoCache[$termKey] = $termInfo;

        return $termInfo;
    }

    if ($this->_termDictionary === null) {
        $this->_loadDictionaryIndex();
    }

    $searchField = $this->getFieldNum($term->field);

    if ($searchField == -1) {
        return null;
    }
    $searchDicField = $this->_getFieldPosition($searchField);

    // search for appropriate value in dictionary
    $lowIndex = 0;
    $highIndex = count($this->_termDictionary)-1;
    while ($highIndex >= $lowIndex) {
        $mid = ($highIndex + $lowIndex) >> 1;
        $midTerm = $this->_termDictionary[$mid];

        // Entries are ordered by field dictionary position, then by term text
        $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
        $delta = $searchDicField - $fieldNum;
        if ($delta == 0) {
            $delta = strcmp($term->text, $midTerm[1] /* text */);
        }

        if ($delta < 0) {
            $highIndex = $mid-1;
        } elseif ($delta > 0) {
            $lowIndex = $mid+1;
        } else {
            // We got it!
            $a = $this->_termDictionaryInfos[$mid];
            $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);

            // Put loaded termInfo into cache
            $this->_termInfoCache[$termKey] = $termInfo;
            // Keep the cache bounded on this path too. Otherwise the size can
            // step over the limit here and never be trimmed again.
            if (count($this->_termInfoCache) >= 1024) {
                $this->_cleanUpTermInfoCache();
            }

            return $termInfo;
        }
    }

    if ($highIndex == -1) {
        // Term is out of the dictionary range
        return null;
    }

    // Closest dictionary index entry that precedes the requested term
    $prevPosition = $highIndex;
    $prevTerm     = $this->_termDictionary[$prevPosition];
    $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

    $tisFile = $this->openCompoundFile('.tis');
    $tiVersion = $tisFile->readInt();
    if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
        $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
    }

    $termCount     = $tisFile->readLong();
    $indexInterval = $tisFile->readInt();
    $skipInterval  = $tisFile->readInt();
    if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
        // Value is not used, but it has to be consumed as part of the header
        $maxSkipLevels = $tisFile->readInt();
    }

    // The header has already been read, so its size is subtracted from the
    // index pointer before the relative seek
    $tisFile->seek($prevTermInfo[4] /* indexPointer */ - (($tiVersion == (int)0xFFFFFFFD)? 24 : 20) /* header size*/, SEEK_CUR);

    $termValue    = $prevTerm[1] /* text */;
    $termFieldNum = $prevTerm[0] /* field */;
    $freqPointer  = $prevTermInfo[1] /* freqPointer */;
    $proxPointer  = $prevTermInfo[2] /* proxPointer */;

    // Read terms sequentially until the first one that is not smaller than
    // the requested term (or until the end of the dictionary)
    for ($count = $prevPosition*$indexInterval + 1;
         $count <= $termCount &&
         ( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
          ($this->_getFieldPosition($termFieldNum) == $searchDicField &&
           strcmp($termValue, $term->text) < 0) );
         $count++) {
        $termPrefixLength = $tisFile->readVInt();
        $termSuffix       = $tisFile->readString();
        $termFieldNum     = $tisFile->readVInt();
        $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;

        $docFreq      = $tisFile->readVInt();
        $freqPointer += $tisFile->readVInt();
        $proxPointer += $tisFile->readVInt();
        if( $docFreq >= $skipInterval ) {
            $skipOffset = $tisFile->readVInt();
        } else {
            $skipOffset = 0;
        }
    }

    // Compare term text byte-wise, exactly as the scan above does. A loose
    // '==' would treat numeric-looking strings such as '1.0' and '1.00' as
    // equal and report a match for a term that is not in the dictionary.
    if ($termFieldNum == $searchField && strcmp($termValue, $term->text) == 0) {
        $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
    } else {
        $termInfo = null;
    }

    // Put loaded termInfo into cache
    $this->_termInfoCache[$termKey] = $termInfo;

    // '>=' rather than '==' so the cache is trimmed even if its size has
    // already moved past the limit
    if (count($this->_termInfoCache) >= 1024) {
        $this->_cleanUpTermInfoCache();
    }

    return $termInfo;
}
/**
 * Returns IDs of all the documents containing term.
 *
 * Each returned id is the segment-local document id plus $shift.
 *
 * If a documents filter is given:
 *  - when it already holds data for this segment, only documents present in
 *    that data are returned and the segment data is narrowed to the matches;
 *  - when it holds no data for this segment yet, all matching documents are
 *    returned and recorded as the segment data.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param integer $shift
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @return array
 * @throws Zend_Search_Lucene_Exception if $docsFilter is neither null nor a documents filter
 */
public function termDocs(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
{
    $filterGiven = ($docsFilter !== null);

    $termInfo = $this->getTermInfo($term);
    if (!($termInfo instanceof Zend_Search_Lucene_Index_TermInfo)) {
        // Unknown term: nothing in this segment can pass the filter
        if ($filterGiven && $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            $docsFilter->segmentFilters[$this->_name] = array();
        }
        return array();
    }

    $frqFile = $this->openCompoundFile('.frq');
    $frqFile->seek($termInfo->freqPointer,SEEK_CUR);

    // true when matches must be restricted to existing segment filter data
    $restrictToFilter = false;

    if ($filterGiven) {
        if (!($docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
        }

        if (isset($docsFilter->segmentFilters[$this->_name])) {
            // Make short name for the filter (which doesn't need additional dereferencing)
            $segmentFilter = &$docsFilter->segmentFilters[$this->_name];

            // An empty filter can't match anything
            if (count($segmentFilter) == 0) {
                return array();
            }

            $restrictToFilter = true;
        }
    }

    $matchedIds    = array();
    $newFilterData = array();
    $currentDocId  = 0;

    for ($entry = 0; $entry < $termInfo->docFreq; $entry++) {
        $docDelta = $frqFile->readVInt();
        if ($docDelta % 2 == 1) {
            // Odd delta: no separate frequency value follows
            $currentDocId += ($docDelta-1)/2;
        } else {
            $currentDocId += $docDelta/2;
            // The frequency value is not needed here, but it has to be skipped
            $frqFile->readVInt();
        }

        if ($restrictToFilter && !isset($segmentFilter[$currentDocId])) {
            continue;
        }

        $matchedIds[] = $shift + $currentDocId;
        if ($filterGiven) {
            $newFilterData[$currentDocId] = 1; // 1 is just a constant marker value
        }
    }

    if ($filterGiven) {
        $docsFilter->segmentFilters[$this->_name] = $newFilterData;
    }

    return $matchedIds;
}
/**
 * Returns term freqs array.
 * Result array structure: array(docId => freq, ...)
 *
 * Each docId key is the segment-local document id plus $shift.
 *
 * If a documents filter is given:
 *  - when it already holds data for this segment, only documents present in
 *    that data are returned and the segment data is narrowed to the matches;
 *  - when it holds no data for this segment yet, all matching documents are
 *    returned and recorded as the segment data.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param integer $shift
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @return array
 * @throws Zend_Search_Lucene_Exception if $docsFilter is neither null nor a documents filter
 */
public function termFreqs(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
{
    $termInfo = $this->getTermInfo($term);

    if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
        // Unknown term: nothing in this segment can pass the filter
        if ($docsFilter !== null && $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            $docsFilter->segmentFilters[$this->_name] = array();
        }
        return array();
    }

    $frqFile = $this->openCompoundFile('.frq');
    $frqFile->seek($termInfo->freqPointer,SEEK_CUR);

    $filterGiven = ($docsFilter !== null);
    // true when matches must be restricted to existing segment filter data
    $restrictToFilter = false;

    if ($filterGiven) {
        if (!$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
        }

        if (isset($docsFilter->segmentFilters[$this->_name])) {
            // Make short name for the filter (which doesn't need additional dereferencing)
            $filter = &$docsFilter->segmentFilters[$this->_name];

            // An empty filter can't match anything
            if (count($filter) == 0) {
                return array();
            }

            $restrictToFilter = true;
        }
    }

    $result        = array();
    $newFilterData = array();
    $docId         = 0;

    for ($count = 0; $count < $termInfo->docFreq; $count++) {
        $docDelta = $frqFile->readVInt();
        if ($docDelta % 2 == 1) {
            // Odd delta: the frequency is 1 and is not stored separately
            $docId += ($docDelta-1)/2;
            $freq = 1;
        } else {
            $docId += $docDelta/2;
            // The frequency MUST be consumed for every document, including the
            // ones rejected by the filter. Otherwise the next read would take
            // this value as a document delta.
            $freq = $frqFile->readVInt();
        }

        if ($restrictToFilter && !isset($filter[$docId])) {
            continue;
        }

        $result[$shift + $docId] = $freq;
        if ($filterGiven) {
            $newFilterData[$docId] = 1; // 1 is just a constant marker value
        }
    }

    if ($filterGiven) {
        $docsFilter->segmentFilters[$this->_name] = $newFilterData;
    }

    return $result;
}
/**
 * Returns term positions array.
 * Result array structure: array(docId => array(pos1, pos2, ...), ...)
 *
 * Each docId key is the segment-local document id plus $shift.
 *
 * If a documents filter is given:
 *  - when it already holds data for this segment, only documents present in
 *    that data are returned and the segment data is narrowed to the matches;
 *  - when it holds no data for this segment yet, all matching documents are
 *    returned and recorded as the segment data.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param integer $shift
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @return array
 * @throws Zend_Search_Lucene_Exception if $docsFilter is neither null nor a documents filter
 */
public function termPositions(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
{
    $filterGiven = ($docsFilter !== null);

    $termInfo = $this->getTermInfo($term);
    if (!($termInfo instanceof Zend_Search_Lucene_Index_TermInfo)) {
        // Unknown term: nothing in this segment can pass the filter
        if ($filterGiven && $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            $docsFilter->segmentFilters[$this->_name] = array();
        }
        return array();
    }

    $frqFile = $this->openCompoundFile('.frq');
    $frqFile->seek($termInfo->freqPointer,SEEK_CUR);

    // true when matches must be restricted to existing segment filter data
    $restrictToFilter = false;

    if ($filterGiven) {
        if (!($docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
        }

        if (isset($docsFilter->segmentFilters[$this->_name])) {
            // Make short name for the filter (which doesn't need additional dereferencing)
            $segmentFilter = &$docsFilter->segmentFilters[$this->_name];

            // An empty filter can't match anything
            if (count($segmentFilter) == 0) {
                return array();
            }

            $restrictToFilter = true;
        }
    }

    // Step 1: collect the term frequency of every document containing the term
    $docFreqs     = array();
    $currentDocId = 0;
    for ($entry = 0; $entry < $termInfo->docFreq; $entry++) {
        $docDelta = $frqFile->readVInt();
        if ($docDelta % 2 == 1) {
            // Odd delta: the frequency is 1 and is not stored separately
            $currentDocId += ($docDelta-1)/2;
            $docFreqs[$currentDocId] = 1;
        } else {
            $currentDocId += $docDelta/2;
            $docFreqs[$currentDocId] = $frqFile->readVInt();
        }
    }

    // Step 2: read the positions of every document from the '.prx' file
    $newFilterData = array();
    $result        = array();

    $prxFile = $this->openCompoundFile('.prx');
    $prxFile->seek($termInfo->proxPointer, SEEK_CUR);

    foreach ($docFreqs as $docId => $freq) {
        // Positions are read for every document, even for those rejected by
        // the filter, to keep the '.prx' file positioned for the next document
        $termPosition = 0;
        $positions    = array();
        for ($occurrence = 0; $occurrence < $freq; $occurrence++) {
            $termPosition += $prxFile->readVInt();
            $positions[] = $termPosition;
        }

        if ($restrictToFilter && !isset($segmentFilter[$docId])) {
            continue;
        }

        if ($filterGiven) {
            $newFilterData[$docId] = 1; // 1 is just a constant marker value
        }
        $result[$shift + $docId] = $positions;
    }

    if ($filterGiven) {
        $docsFilter->segmentFilters[$this->_name] = $newFilterData;
    }

    return $result;
}
/**
 * Load normalization factors from an index file
 *
 * With a single norms file ('.nrm') the norms of all indexed fields are
 * loaded at once. Otherwise only the '.f<field number>' file of the
 * requested field is read.
 *
 * @param integer $fieldNum
 * @throws Zend_Search_Lucene_Exception if the '.nrm' file header is not recognized
 */
private function _loadNorm($fieldNum)
{
    if (!$this->_hasSingleNormFile) {
        // One norms file per field
        $fFile = $this->openCompoundFile('.f' . $fieldNum);
        $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount);
        return;
    }

    $normfFile = $this->openCompoundFile('.nrm');

    // The file starts with 'NRM' followed by a format version byte
    $header              = $normfFile->readBytes(3);
    $headerFormatVersion = $normfFile->readByte();

    if (!($header == 'NRM' && $headerFormatVersion == (int)0xFF)) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw  new Zend_Search_Lucene_Exception('Wrong norms file format.');
    }

    // One block of $this->_docCount bytes is read per indexed field
    foreach ($this->_fields as $fNum => $fieldInfo) {
        if (!$fieldInfo->isIndexed) {
            continue;
        }
        $this->_norms[$fNum] = $normfFile->readBytes($this->_docCount);
    }
}
/**
 * Returns normalization factor for specified documents
 *
 * @param integer $id        internal document id
 * @param string $fieldName
 * @return float|null null if the field is unknown or is not indexed
 */
public function norm($id, $fieldName)
{
    $fieldNum = $this->getFieldNum($fieldName);

    // getFieldNum() reports an unknown field as -1, which must not be used
    // as an index into the fields list (normVector() has the same guard)
    if ($fieldNum == -1 || !($this->_fields[$fieldNum]->isIndexed)) {
        return null;
    }

    // Norms are loaded on first use
    if (!isset($this->_norms[$fieldNum])) {
        $this->_loadNorm($fieldNum);
    }

    return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum][$id]) );
}
/**
 * Returns norm vector, encoded in a byte string
 *
 * The string holds one byte per document of the segment. For an unknown or
 * non-indexed field every byte is the encoded default norm
 * (lengthNorm($fieldName, 0) of the default similarity).
 *
 * @param string $fieldName
 * @return string
 */
public function normVector($fieldName)
{
    $fieldNum = $this->getFieldNum($fieldName);

    $hasStoredNorms = ($fieldNum != -1) && $this->_fields[$fieldNum]->isIndexed;

    if (!$hasStoredNorms) {
        $similarity  = Zend_Search_Lucene_Search_Similarity::getDefault();
        $defaultNorm = chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) ));

        return str_repeat($defaultNorm, $this->_docCount);
    }

    // Norms are loaded on first use
    if (!isset($this->_norms[$fieldNum])) {
        $this->_loadNorm($fieldNum);
    }

    return $this->_norms[$fieldNum];
}
/**
 * Tells whether deletions are registered for this index segment
 * (the deletions holder is not null).
 *
 * @return boolean
 */
public function hasDeletions()
{
    return !($this->_deleted === null);
}
/**
 * Tells whether the segment keeps its norms in a single norms file.
 *
 * @return boolean
 */
public function hasSingleNormFile()
{
    // Normalize the stored flag to a strict boolean
    return (bool) $this->_hasSingleNormFile;
}
/**
 * Tells whether the segment is stored using a compound segment file.
 *
 * @return boolean
 */
public function isCompound()
{
    $compound = $this->_isCompound;

    return $compound;
}
/**
 * Marks a document of the index segment as deleted.
 * $id is an internal document id
 *
 * The change is kept in memory only and the segment is flagged as having
 * unsaved deletions.
 *
 * @param integer $id
 */
public function delete($id)
{
    $this->_deletedDirty = true;

    if (!extension_loaded('bitset')) {
        // Plain array: deleted ids are stored as keys
        if ($this->_deleted === null) {
            $this->_deleted = array();
        }
        $this->_deleted[$id] = 1;

        return;
    }

    // bitset extension is available
    if ($this->_deleted === null) {
        $this->_deleted = bitset_empty($id);
    }
    bitset_incl($this->_deleted, $id);
}
/**
 * Checks whether a document is marked as deleted
 *
 * @param integer $id internal document id
 * @return boolean
 */
public function isDeleted($id)
{
    // No deletions are registered for this segment
    if ($this->_deleted === null) {
        return false;
    }

    return extension_loaded('bitset')
         ? bitset_in($this->_deleted, $id)
         : isset($this->_deleted[$id]);
}
/**
 * Detect latest delete generation
 *
 * Is actually used from writeChanges() method or from the constructor if it's invoked from
 * Index writer. In both cases index write lock is already obtained, so we shouldn't care
 * about it
 *
 * @return integer highest generation found among the deletions files of this
 *                 segment (0 for a '<segment name>.del' file), or -1 if there is none
 */
private function _detectLatestDelGen()
{
    // Highest generation seen so far; null until a deletions file is found
    $latestGen = null;

    foreach ($this->_directory->fileList() as $file) {
        if ($file == $this->_name . '.del') {
            // Matches <segment_name>.del file name
            $generation = 0;
        } else if (preg_match('/^' . $this->_name . '_([a-zA-Z0-9]+)\.del$/i', $file, $matches)) {
            // Matches <segment_name>_NNN.del file names
            $generation = (int)base_convert($matches[1], 36, 10);
        } else {
            // Not a deletions file of this segment
            continue;
        }

        if ($latestGen === null || $generation > $latestGen) {
            $latestGen = $generation;
        }
    }

    if ($latestGen === null) {
        // There is no deletions file for current segment in the directory
        return -1;
    }

    return $latestGen;
}
/**
 * Write changes if it's necessary.
 *
 * This method must be invoked only from the Writer _updateSegments() method,
 * so index Write lock has to be already obtained.
 *
 * Without local deletions the method only reloads the deletions file when a
 * newer generation is found. With local deletions it merges them with the
 * newest deletions file (if any) and writes the result as the next generation.
 *
 * @internal
 * @throws Zend_Search_Lucene_Exception
 */
public function writeChanges()
{
    // Get new generation number
    $latestDelGen = $this->_detectLatestDelGen();

    if (!$this->_deletedDirty) {
        // There was no deletions by current process

        if ($latestDelGen == $this->_delGen) {
            // Delete file hasn't been updated by any concurrent process
            return;
        } else if ($latestDelGen > $this->_delGen) {
            // Delete file has been updated by some concurrent process
            // Reload deletions file
            $this->_delGen  = $latestDelGen;
            $this->_deleted = $this->_loadDelFile();

            return;
        } else {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Delete file processing workflow is corrupted for the segment \'' . $this->_name . '\'.');
        }
    }

    if ($latestDelGen > $this->_delGen) {
        // Merge current deletions with latest deletions file
        $this->_delGen = $latestDelGen;

        $latestDelete = $this->_loadDelFile();

        // The loader returns null when the file holds no deletions (or can't
        // be read). There is nothing to merge then, and null must not be
        // used as an operand of the merge.
        if ($latestDelete !== null) {
            if (extension_loaded('bitset')) {
                $this->_deleted = bitset_union($this->_deleted, $latestDelete);
            } else {
                $this->_deleted += $latestDelete;
            }
        }
    }

    if (extension_loaded('bitset')) {
        $delBytes = $this->_deleted;
        $bitCount = count(bitset_to_array($delBytes));
    } else {
        // Pack the deleted ids into a bit vector, 8 documents per byte
        $byteCount = floor($this->_docCount/8)+1;
        $delBytes = str_repeat(chr(0), $byteCount);
        for ($count = 0; $count < $byteCount; $count++) {
            $byte = 0;
            for ($bit = 0; $bit < 8; $bit++) {
                if (isset($this->_deleted[$count*8 + $bit])) {
                    $byte |= (1<<$bit);
                }
            }
            $delBytes[$count] = chr($byte);
        }
        $bitCount = count($this->_deleted);
    }

    if ($this->_delGen == -1) {
        // Set delete file generation number to 1
        $this->_delGen = 1;
    } else {
        // Increase delete file generation number by 1
        $this->_delGen++;
    }

    $delFile = $this->_directory->createFile($this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del');
    $delFile->writeInt($this->_docCount);
    $delFile->writeInt($bitCount);
    $delFile->writeBytes($delBytes);

    $this->_deletedDirty = false;
}
/**
* Term Dictionary (.tis) file object used for stream-like terms reading.
* It is null while no terms stream is open (before resetTermsStream(), after the
* last term has been read, or after closeTermsStream()).
*
* @var Zend_Search_Lucene_Storage_File
*/
private $_tisFile = null;
/**
* Offset of the .tis file data, as reported by tell() right after the file
* is opened in resetTermsStream()
*
* @var integer
*/
private $_tisFileOffset;
/**
* Frequencies (.frq) file object used for stream-like terms reading.
* It is only opened in the SM_FULL_INFO and SM_MERGE_INFO scan modes.
*
* @var Zend_Search_Lucene_Storage_File
*/
private $_frqFile = null;
/**
* Offset of the .frq file data, as reported by tell() right after the file
* is opened in resetTermsStream()
*
* @var integer
*/
private $_frqFileOffset;
/**
* Positions (.prx) file object used for stream-like terms reading.
* It is only opened in the SM_FULL_INFO and SM_MERGE_INFO scan modes.
*
* @var Zend_Search_Lucene_Storage_File
*/
private $_prxFile = null;
/**
* Offset of the .prx file data, as reported by tell() right after the file
* is opened in resetTermsStream() (presumably its position inside the compound file)
*
* @var integer
*/
private $_prxFileOffset;
/**
* Number of terms which are still to be read from the terms stream.
* It is initialized with the overall terms count and decreased by nextTerm().
*
* @var integer
*/
private $_termCount = 0;
/**
* Overall number of terms in the terms stream (read from the .tis file header)
*
* @var integer
*/
private $_termNum = 0;
/**
* Segment index interval (read from the .tis file header)
*
* @var integer
*/
private $_indexInterval;
/**
* Segment skip interval (read from the .tis file header)
*
* @var integer
*/
private $_skipInterval;
/**
* TermInfo of the last term read from the terms stream
*
* @var Zend_Search_Lucene_Index_TermInfo
*/
private $_lastTermInfo = null;
/**
* Last term read from the terms stream
*
* @var Zend_Search_Lucene_Index_Term
*/
private $_lastTerm = null;
/**
* Map of the document IDs: array( segment docId => docId to be reported, ... )
* Used to get the new docID after removing deleted documents; deleted documents
* have no entry in the map.
* It's not very efficient from the memory usage point of view,
* but it is much faster than other methods.
*
* @var array|null
*/
private $_docMap = null;
/**
* An array of all positions of the last term in the documents.
* Array structure: array( docId => array( pos1, pos2, ...), ...)
*
* Is set to null if term positions loading has to be skipped
*
* @var array|null
*/
private $_lastTermPositions;
/**
* Terms scan mode
*
* Values:
*
* self::SM_TERMS_ONLY - terms are scanned, no additional info is retrieved
* self::SM_FULL_INFO - terms are scanned, frequency and position info is retrieved
* self::SM_MERGE_INFO - terms are scanned, frequency and position info is retrieved
* document numbers are compacted (shifted if segment has deleted documents)
*
* @var integer
*/
private $_termsScanMode;
/** Scan modes (accepted as the $mode argument of resetTermsStream()) */
const SM_TERMS_ONLY = 0; // terms are scanned, no additional info is retrieved
const SM_FULL_INFO = 1; // terms are scanned, frequency and position info is retrieved
const SM_MERGE_INFO = 2; // terms are scanned, frequency and position info is retrieved
// document numbers are compacted (shifted if segment contains deleted documents)
/**
 * Reset terms stream
 *
 * (Re)opens the term dictionary file, validates its header and positions the
 * stream at the first term of the segment.
 *
 * $startId - id for the first document of this segment
 * $mode    - terms scan mode, one of the self::SM_* constants. In the
 *            self::SM_MERGE_INFO mode deleted documents are removed and
 *            the remaining document ids are compacted
 *
 * Returns start document id for the next segment
 *
 * @param integer $startId
 * @param integer $mode
 * @throws Zend_Search_Lucene_Exception
 * @return integer
 */
public function resetTermsStream(/** $startId = 0, $mode = self::SM_TERMS_ONLY */)
{
    /**
     * SegmentInfo->resetTermsStream() method actually takes two optional parameters:
     *   $startId (default value is 0)
     *   $mode (default value is self::SM_TERMS_ONLY)
     */
    $args = func_get_args();
    $argCount = count($args);
    if ($argCount > 2) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong number of arguments');
    }
    $startId = ($argCount >= 1) ? $args[0] : 0;
    $mode = ($argCount == 2) ? $args[1] : self::SM_TERMS_ONLY;

    // Drop the previous dictionary file object (if any) before opening a new one
    $this->_tisFile = null;
    $this->_tisFile = $this->openCompoundFile('.tis', false);
    $this->_tisFileOffset = $this->_tisFile->tell();

    $tiVersion = $this->_tisFile->readInt();
    $isPre21Format = ($tiVersion == (int)0xFFFFFFFE);
    $is21Format = ($tiVersion == (int)0xFFFFFFFD);
    if (!($isPre21Format || $is21Format)) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
    }

    $this->_termNum = $this->_tisFile->readLong(); // Read terms count
    $this->_termCount = $this->_termNum;
    $this->_indexInterval = $this->_tisFile->readInt(); // Read Index interval
    $this->_skipInterval = $this->_tisFile->readInt(); // Read skip interval
    if ($is21Format) {
        // maxSkipLevels: the value isn't used, but it has to be consumed
        // to move the file pointer to the first term entry
        $this->_tisFile->readInt();
    }

    $this->_frqFile = null;
    $this->_prxFile = null;

    $this->_docMap = array();
    $this->_lastTerm = new Zend_Search_Lucene_Index_Term('', -1);
    $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);
    $this->_lastTermPositions = null;
    $this->_termsScanMode = $mode;

    $compactIds = ($mode == self::SM_MERGE_INFO);

    if ($mode == self::SM_TERMS_ONLY) {
        // Nothing else has to be prepared
    } else if ($mode == self::SM_FULL_INFO || $compactIds) {
        $this->_frqFile = $this->openCompoundFile('.frq', false);
        $this->_frqFileOffset = $this->_frqFile->tell();

        $this->_prxFile = $this->openCompoundFile('.prx', false);
        $this->_prxFileOffset = $this->_prxFile->tell();

        // Build the document id map. Deleted documents get no entry.
        for ($docId = 0; $docId < $this->_docCount; $docId++) {
            if ($this->isDeleted($docId)) {
                continue;
            }
            $this->_docMap[$docId] = $startId + ($compactIds ? count($this->_docMap) : $docId);
        }
    } else {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong terms scaning mode specified.');
    }

    // Calculate next segment start id (since $this->_docMap structure may be cleaned by $this->nextTerm() call)
    $nextSegmentStartId = $startId + ($compactIds ? count($this->_docMap) : $this->_docCount);

    $this->nextTerm();

    return $nextSegmentStartId;
}
/**
 * Skip terms stream up to the specified term prefix.
 *
 * Prefix contains fully specified field info and portion of searched term.
 *
 * The in-memory dictionary index is searched first to find the closest preceding
 * index entry, the .tis file is positioned at it, and then terms are scanned
 * sequentially until a term which is equal to or greater than the prefix is found.
 * If there is no such term, the stream is left in the "end of stream" state
 * (currentTerm() returns null).
 *
 * @param Zend_Search_Lucene_Index_Term $prefix
 * @throws Zend_Search_Lucene_Exception
 */
public function skipTo(Zend_Search_Lucene_Index_Term $prefix)
{
    if ($this->_termDictionary === null) {
        $this->_loadDictionaryIndex();
    }

    $searchField = $this->getFieldNum($prefix->field);

    if ($searchField == -1) {
        /**
         * Field is not presented in this segment
         * Go to the end of dictionary
         */
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        $this->_lastTerm = null;
        $this->_lastTermInfo = null;
        $this->_lastTermPositions = null;

        return;
    }
    $searchDicField = $this->_getFieldPosition($searchField);

    // Binary search for the appropriate entry in the dictionary index
    $lowIndex = 0;
    $highIndex = count($this->_termDictionary)-1;
    while ($highIndex >= $lowIndex) {
        $mid = ($highIndex + $lowIndex) >> 1;
        $midTerm = $this->_termDictionary[$mid];

        // Entries are compared by field position first and by term text second
        $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
        $delta = $searchDicField - $fieldNum;
        if ($delta == 0) {
            $delta = strcmp($prefix->text, $midTerm[1] /* text */);
        }

        if ($delta < 0) {
            $highIndex = $mid-1;
        } elseif ($delta > 0) {
            $lowIndex = $mid+1;
        } else {
            // We have reached term we are looking for.
            // $highIndex is used as the start position below, so it has to point
            // to the matched entry. Otherwise the stream would be positioned at
            // a later index entry and the terms in between would be skipped.
            $highIndex = $mid;
            break;
        }
    }

    if ($highIndex == -1) {
        // Term is out of the dictionary range
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        $this->_lastTerm = null;
        $this->_lastTermInfo = null;
        $this->_lastTermPositions = null;

        return;
    }

    // $highIndex now points to the last index entry which is not greater than the prefix
    $prevPosition = $highIndex;
    $prevTerm = $this->_termDictionary[$prevPosition];
    $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

    if ($this->_tisFile === null) {
        // The end of terms stream is reached and terms dictionary file is closed
        // Perform mini-reset operation
        // NOTE(review): $this->_docMap is not rebuilt here although nextTerm() sets it to null
        // once it is called on an exhausted stream; in that case no positions would be
        // collected below - confirm that this is intended
        $this->_tisFile = $this->openCompoundFile('.tis', false);

        if ($this->_termsScanMode == self::SM_FULL_INFO || $this->_termsScanMode == self::SM_MERGE_INFO) {
            $this->_frqFile = $this->openCompoundFile('.frq', false);
            $this->_prxFile = $this->openCompoundFile('.prx', false);
        }
    }
    // Position the dictionary file at the selected index entry and restore the
    // stream state (last term, its info and the number of remaining terms)
    $this->_tisFile->seek($this->_tisFileOffset + $prevTermInfo[4], SEEK_SET);

    $this->_lastTerm = new Zend_Search_Lucene_Index_Term($prevTerm[1] /* text */,
                                                         ($prevTerm[0] == -1) ? '' : $this->_fields[$prevTerm[0] /* field */]->name);
    $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($prevTermInfo[0] /* docFreq */,
                                                                 $prevTermInfo[1] /* freqPointer */,
                                                                 $prevTermInfo[2] /* proxPointer */,
                                                                 $prevTermInfo[3] /* skipOffset */);
    $this->_termCount = $this->_termNum - $prevPosition*$this->_indexInterval;

    if ($highIndex == 0) {
        // skip start entry
        $this->nextTerm();
    } else if ($prefix->field == $this->_lastTerm->field && $prefix->text == $this->_lastTerm->text) {
        // We got exact match in the dictionary index

        if ($this->_termsScanMode == self::SM_FULL_INFO || $this->_termsScanMode == self::SM_MERGE_INFO) {
            $this->_lastTermPositions = array();

            // Read document frequencies: an odd delta means "term occurs once in the document",
            // an even delta is followed by the explicit frequency value
            $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
            $freqs = array(); $docId = 0;
            for( $count = 0; $count < $this->_lastTermInfo->docFreq; $count++ ) {
                $docDelta = $this->_frqFile->readVInt();
                if( $docDelta % 2 == 1 ) {
                    $docId += ($docDelta-1)/2;
                    $freqs[ $docId ] = 1;
                } else {
                    $docId += $docDelta/2;
                    $freqs[ $docId ] = $this->_frqFile->readVInt();
                }
            }

            // Read delta encoded term positions for each document
            $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
            foreach ($freqs as $docId => $freq) {
                $termPosition = 0; $positions = array();

                for ($count = 0; $count < $freq; $count++ ) {
                    $termPosition += $this->_prxFile->readVInt();
                    $positions[] = $termPosition;
                }

                // Documents without a map entry (deleted ones) are dropped
                if (isset($this->_docMap[$docId])) {
                    $this->_lastTermPositions[$this->_docMap[$docId]] = $positions;
                }
            }
        }

        return;
    }

    // Search term matching specified prefix
    while ($this->_lastTerm !== null) {
        if ( strcmp($this->_lastTerm->field, $prefix->field) > 0 ||
             ($prefix->field == $this->_lastTerm->field && strcmp($this->_lastTerm->text, $prefix->text) >= 0) ) {
            // Current term matches or is greater than the pattern
            return;
        }

        $this->nextTerm();
    }
}
/**
 * Scans terms dictionary and returns next term
 *
 * Reads the next entry of the .tis file and rebuilds the term text from the
 * previous term. In the SM_FULL_INFO and SM_MERGE_INFO scan modes it also
 * loads the positions of the term. The stream files are released as soon as
 * the last term has been read.
 *
 * @return Zend_Search_Lucene_Index_Term|null null if the stream is exhausted or not opened
 */
public function nextTerm()
{
    $streamExhausted = ($this->_tisFile === null) || ($this->_termCount == 0);
    if ($streamExhausted) {
        $this->_lastTerm = null;
        $this->_lastTermInfo = null;
        $this->_lastTermPositions = null;
        $this->_docMap = null;

        // may be necessary for "empty" segment
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        return null;
    }

    // Term text is stored as a number of characters shared with the previous term plus a suffix
    $sharedPrefixLength = $this->_tisFile->readVInt();
    $suffix = $this->_tisFile->readString();
    $fieldNumber = $this->_tisFile->readVInt();
    $text = Zend_Search_Lucene_Index_Term::getPrefix($this->_lastTerm->text, $sharedPrefixLength) . $suffix;

    $this->_lastTerm = new Zend_Search_Lucene_Index_Term($text, $this->_fields[$fieldNumber]->name);

    // Frequency and positions pointers are stored as deltas to the previous term info
    $docFreq = $this->_tisFile->readVInt();
    $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt();
    $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt();
    $skipOffset = ($docFreq >= $this->_skipInterval) ? $this->_tisFile->readVInt() : 0;

    $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);

    $loadPositions = ($this->_termsScanMode == self::SM_FULL_INFO)
                  || ($this->_termsScanMode == self::SM_MERGE_INFO);
    if ($loadPositions) {
        $this->_lastTermPositions = array();

        // Collect "document id => term frequency" pairs
        $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
        $docFreqs = array();
        $docId = 0;
        for ($docNum = 0; $docNum < $this->_lastTermInfo->docFreq; $docNum++) {
            $docDelta = $this->_frqFile->readVInt();
            // An odd delta marks a document which contains the term exactly once
            $singleOccurrence = ($docDelta % 2 == 1);

            $docId += ($singleOccurrence ? $docDelta - 1 : $docDelta) / 2;
            $docFreqs[$docId] = $singleOccurrence ? 1 : $this->_frqFile->readVInt();
        }

        // Collect delta encoded positions for each of the documents
        $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
        foreach ($docFreqs as $docId => $freq) {
            $position = 0;
            $docPositions = array();
            for ($occurrence = 0; $occurrence < $freq; $occurrence++) {
                $position += $this->_prxFile->readVInt();
                $docPositions[] = $position;
            }

            // Documents which are absent in the map (deleted ones) are skipped
            if (isset($this->_docMap[$docId])) {
                $this->_lastTermPositions[$this->_docMap[$docId]] = $docPositions;
            }
        }
    }

    if (--$this->_termCount == 0) {
        // It was the last term. Release the stream files.
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;
    }

    return $this->_lastTerm;
}
/**
 * Close terms stream
 *
 * Releases the stream files and clears the stream state.
 * Should be used for resources clean up if stream is not read up to the end
 */
public function closeTermsStream()
{
    // Stream files go first, the remaining stream state follows
    $streamProperties = array('_tisFile', '_frqFile', '_prxFile',
                              '_lastTerm', '_lastTermInfo', '_lastTermPositions',
                              '_docMap');

    foreach ($streamProperties as $property) {
        $this->$property = null;
    }
}
/**
* Returns term in current position of the terms stream
*
* It's the term which was loaded by the last resetTermsStream(), skipTo() or nextTerm() call.
*
* @return Zend_Search_Lucene_Index_Term|null null if the stream is exhausted or closed
*/
public function currentTerm()
{
return $this->_lastTerm;
}
/**
* Returns an array of all positions of the current term in the documents.
* Return array structure: array( docId => array( pos1, pos2, ...), ...)
*
* The value is null if positions were not loaded (for example, in the
* SM_TERMS_ONLY scan mode) or if the stream is exhausted or closed.
*
* @return array|null
*/
public function currentTermPositions()
{
return $this->_lastTermPositions;
}
}

View File

@ -0,0 +1,271 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: SegmentMerger.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Index_SegmentInfo */
require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_SegmentMerger
{
    /**
     * Writer of the target (merged) segment
     *
     * @var Zend_Search_Lucene_Index_SegmentWriter_StreamWriter
     */
    private $_writer;

    /**
     * Number of non-deleted documents copied into the new segment.
     * It's calculated while stored fields are merged.
     *
     * @var integer
     */
    private $_docCount;

    /**
     * Source segments, indexed by segment name
     *
     * @var array Zend_Search_Lucene_Index_SegmentInfo
     */
    private $_segmentInfos = array();

    /**
     * Is set to true as soon as merge() has been completed.
     * A merger object can be used only once.
     *
     * @var boolean
     */
    private $_mergeDone = false;

    /**
     * Field map
     * [<segment_name>][<field_number>] => <target_field_number>
     *
     * @var array
     */
    private $_fieldsMap = array();


    /**
     * Object constructor.
     *
     * Prepares a merger which writes the merged data into a new segment
     * named $name within the $directory
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct($directory, $name)
    {
        /** Zend_Search_Lucene_Index_SegmentWriter_StreamWriter */
        require_once 'Zend/Search/Lucene/Index/SegmentWriter/StreamWriter.php';

        $this->_writer = new Zend_Search_Lucene_Index_SegmentWriter_StreamWriter($directory, $name);
    }


    /**
     * Register a segment as a source of the merge.
     * A segment which is registered twice replaces its previous registration,
     * since sources are indexed by segment name.
     *
     * @param Zend_Search_Lucene_Index_SegmentInfo $segmentInfo
     */
    public function addSource(Zend_Search_Lucene_Index_SegmentInfo $segmentInfo)
    {
        $segmentName = $segmentInfo->getName();
        $this->_segmentInfos[$segmentName] = $segmentInfo;
    }


    /**
     * Do merge.
     *
     * Merges field infos, norms, stored fields and terms of all source segments
     * (in this order) and returns whatever the target writer's close() returns.
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo
     * @throws Zend_Search_Lucene_Exception
     */
    public function merge()
    {
        if ($this->_mergeDone) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Merge is already done.');
        }

        $sourceCount = count($this->_segmentInfos);
        if ($sourceCount < 1) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong number of segments to be merged ('
                                                 . $sourceCount
                                                 . ').');
        }

        $this->_mergeFields();
        $this->_mergeNorms();
        $this->_mergeStoredFields();
        $this->_mergeTerms();

        $this->_mergeDone = true;

        return $this->_writer->close();
    }


    /**
     * Merge field infos.
     *
     * Registers the fields of all source segments in the target writer and
     * remembers the target field number for each source field.
     */
    private function _mergeFields()
    {
        foreach ($this->_segmentInfos as $sourceName => $source) {
            foreach ($source->getFieldInfos() as $sourceField) {
                $targetNumber = $this->_writer->addFieldInfo($sourceField);
                $this->_fieldsMap[$sourceName][$sourceField->number] = $targetNumber;
            }
        }
    }


    /**
     * Merge field's normalization factors.
     *
     * Norms of deleted documents are left out of the target norm vector.
     */
    private function _mergeNorms()
    {
        foreach ($this->_writer->getFieldInfos() as $targetField) {
            if (!$targetField->isIndexed) {
                // Only indexed fields have norms
                continue;
            }

            foreach ($this->_segmentInfos as $sourceName => $source) {
                if (!$source->hasDeletions()) {
                    // The whole norm vector can be taken as it is
                    $this->_writer->addNorm($targetField->name, $source->normVector($targetField->name));
                    continue;
                }

                // Copy norms of live documents only
                $sourceNorm = $source->normVector($targetField->name);
                $liveNorm = '';
                $docTotal = $source->count();
                for ($docId = 0; $docId < $docTotal; $docId++) {
                    if (!$source->isDeleted($docId)) {
                        $liveNorm .= $sourceNorm[$docId];
                    }
                }
                $this->_writer->addNorm($targetField->name, $liveNorm);
            }
        }
    }


    /**
     * Merge stored fields.
     *
     * Stored fields of every document are read sequentially from the '.fdt'
     * file of each source segment. Fields of deleted documents are read as
     * well (the file is parsed sequentially), but they aren't written to the target.
     */
    private function _mergeStoredFields()
    {
        $this->_docCount = 0;

        foreach ($this->_segmentInfos as $sourceName => $source) {
            $fdtFile = $source->openCompoundFile('.fdt');
            $docTotal = $source->count();

            for ($docId = 0; $docId < $docTotal; $docId++) {
                $fieldTotal = $fdtFile->readVInt();
                $docFields = array();

                for ($fieldIdx = 0; $fieldIdx < $fieldTotal; $fieldIdx++) {
                    $fieldNumber = $fdtFile->readVInt();
                    $flags = $fdtFile->readByte();
                    $fieldInfo = $source->getField($fieldNumber);

                    // Bit 0x01 is passed on as the "is tokenized" flag,
                    // bit 0x02 marks binary data
                    if ($flags & 2) {
                        $docFields[] = new Zend_Search_Lucene_Field($fieldInfo->name,
                                                                    $fdtFile->readBinary(),
                                                                    '',
                                                                    true,
                                                                    $fieldInfo->isIndexed,
                                                                    $flags & 1,
                                                                    true);
                    } else {
                        $docFields[] = new Zend_Search_Lucene_Field($fieldInfo->name,
                                                                    $fdtFile->readString(),
                                                                    'UTF-8',
                                                                    true,
                                                                    $fieldInfo->isIndexed,
                                                                    $flags & 1);
                    }
                }

                if ($source->isDeleted($docId)) {
                    continue;
                }

                $this->_docCount++;
                $this->_writer->addStoredFields($docFields);
            }
        }
    }


    /**
     * Merge terms dictionaries.
     *
     * Terms streams of all source segments are combined through a priority
     * queue. Positions of equal terms are accumulated and written to the
     * target as a single term.
     */
    private function _mergeTerms()
    {
        /** Zend_Search_Lucene_Index_TermsPriorityQueue */
        require_once 'Zend/Search/Lucene/Index/TermsPriorityQueue.php';

        $queue = new Zend_Search_Lucene_Index_TermsPriorityQueue();

        // Open terms streams. Each segment gets the start document id returned by the previous one.
        $startId = 0;
        foreach ($this->_segmentInfos as $sourceName => $source) {
            $startId = $source->resetTermsStream($startId, Zend_Search_Lucene_Index_SegmentInfo::SM_MERGE_INFO);

            // Skip "empty" segments
            if ($source->currentTerm() !== null) {
                $queue->put($source);
            }
        }

        $this->_writer->initializeDictionaryFiles();

        $mergedPositions = array();
        while (($source = $queue->pop()) !== null) {
            // Merge positions array
            $mergedPositions += $source->currentTermPositions();

            $following = $queue->top();
            $termCompleted = ($following === null)
                          || ($following->currentTerm()->key() != $source->currentTerm()->key());

            if ($termCompleted) {
                // The next term in the queue differs, so the current one may be flushed.
                // It's only added if it's contained in any document.
                if (count($mergedPositions) > 0) {
                    ksort($mergedPositions, SORT_NUMERIC);
                    $this->_writer->addTerm($source->currentTerm(), $mergedPositions);
                }
                $mergedPositions = array();
            }

            $source->nextTerm();
            // Put segment back into the priority queue unless its dictionary is finished
            if ($source->currentTerm() !== null) {
                $queue->put($source);
            }
        }

        $this->_writer->closeDictionaryFiles();
    }
}

View File

@ -0,0 +1,634 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: SegmentWriter.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Index_FieldInfo */
require_once 'Zend/Search/Lucene/Index/FieldInfo.php';
/** Zend_Search_Lucene_Index_Term */
require_once 'Zend/Search/Lucene/Index/Term.php';
/** Zend_Search_Lucene_Index_TermInfo */
require_once 'Zend/Search/Lucene/Index/TermInfo.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Index_SegmentWriter
{
/**
* Expert: The fraction of terms in the "dictionary" which should be stored
* in RAM. Smaller values use more memory, but make searching slightly
* faster, while larger values use less memory and make searching slightly
* slower. Searching is typically not dominated by dictionary lookup, so
* tweaking this is rarely useful.
*
* The value is written into the headers of the '.tis' and '.tii' files and
* every $indexInterval-th term is also written into the '.tii' file.
*
* @var integer
*/
public static $indexInterval = 128;
/**
* Expert: The fraction of TermDocs entries stored in skip tables.
* Larger values result in smaller indexes, greater acceleration, but fewer
* accelerable cases, while smaller values result in bigger indexes,
* less acceleration and more
* accelerable cases. More detailed experiments would be useful here.
*
* 0x7FFFFFFF indicates that we don't use skip data
*
* Note: not used in current implementation
*
* @var integer
*/
public static $skipInterval = 0x7FFFFFFF;
/**
* Expert: The maximum number of skip levels. Smaller values result in
* slightly smaller indexes, but slower skipping in big posting lists.
*
* 0 indicates that we don't use skip data
*
* Note: not used in current implementation
*
* @var integer
*/
public static $maxSkipLevels = 0;
/**
* Number of docs in a segment.
* It's increased by each addStoredFields() call.
*
* @var integer
*/
protected $_docCount = 0;
/**
* Segment name
*
* @var string
*/
protected $_name;
/**
* File system adapter.
*
* @var Zend_Search_Lucene_Storage_Directory
*/
protected $_directory;
/**
* List of the names of the index files created for this segment.
* Used for automatic compound file generation
*
* @var array
*/
protected $_files = array();
/**
* Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment,
* indexed by field name
*
* @var array
*/
protected $_fields = array();
/**
* Normalization factors.
* An array fieldName => normVector
* normVector is a binary string.
* Each byte corresponds to an indexed document in a segment and
* encodes normalization factor (float value, encoded by
* Zend_Search_Lucene_Search_Similarity::encodeNorm())
*
* @var array
*/
protected $_norms = array();
/**
* '.fdx' file - Stored Fields, the field index.
* It's created by the first addStoredFields() call.
*
* @var Zend_Search_Lucene_Storage_File
*/
protected $_fdxFile = null;
/**
* '.fdt' file - Stored Fields, the field data.
* It's created by the first addStoredFields() call.
*
* @var Zend_Search_Lucene_Storage_File
*/
protected $_fdtFile = null;
/**
 * Object constructor.
 *
 * Only remembers the segment name and the directory the segment is written to.
 * No files are created at this point.
 *
 * @param Zend_Search_Lucene_Storage_Directory $directory
 * @param string $name
 */
public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
{
    $this->_name = $name;
    $this->_directory = $directory;
}
/**
 * Add field to the segment
 *
 * A field which isn't known yet gets the next free field number. For an already
 * registered field the "is indexed" and "store term vector" flags are OR-ed
 * with the flags of the given field.
 *
 * Returns actual field number
 *
 * @param Zend_Search_Lucene_Field $field
 * @return integer
 */
public function addField(Zend_Search_Lucene_Field $field)
{
    if (isset($this->_fields[$field->name])) {
        // Known field: only its flags may have to be extended
        $registered = $this->_fields[$field->name];
        $registered->isIndexed |= $field->isIndexed;
        $registered->storeTermVector |= $field->storeTermVector;

        return $registered->number;
    }

    // New field: field numbers are assigned sequentially
    $newNumber = count($this->_fields);
    $this->_fields[$field->name] = new Zend_Search_Lucene_Index_FieldInfo($field->name,
                                                                          $field->isIndexed,
                                                                          $newNumber,
                                                                          $field->storeTermVector);

    return $newNumber;
}
/**
 * Add fieldInfo to the segment
 *
 * The given object isn't stored itself. A new FieldInfo with a segment specific
 * field number is created for an unknown field; for an already registered field
 * the "is indexed" and "store term vector" flags are OR-ed with the given ones.
 *
 * Returns actual field number
 *
 * @param Zend_Search_Lucene_Index_FieldInfo $fieldInfo
 * @return integer
 */
public function addFieldInfo(Zend_Search_Lucene_Index_FieldInfo $fieldInfo)
{
    if (isset($this->_fields[$fieldInfo->name])) {
        // Known field: only its flags may have to be extended
        $registered = $this->_fields[$fieldInfo->name];
        $registered->isIndexed |= $fieldInfo->isIndexed;
        $registered->storeTermVector |= $fieldInfo->storeTermVector;

        return $registered->number;
    }

    // New field: field numbers are assigned sequentially
    $newNumber = count($this->_fields);
    $this->_fields[$fieldInfo->name] = new Zend_Search_Lucene_Index_FieldInfo($fieldInfo->name,
                                                                              $fieldInfo->isIndexed,
                                                                              $newNumber,
                                                                              $fieldInfo->storeTermVector);

    return $newNumber;
}
/**
* Returns array of FieldInfo objects registered for this segment.
*
* The array is indexed by field name (see addField() and addFieldInfo()).
*
* @return array
*/
public function getFieldInfos()
{
return $this->_fields;
}
/**
 * Add stored fields information of one document
 *
 * The '.fdx' and '.fdt' files are created by the first call. Each call writes
 * the current '.fdt' position into the '.fdx' file, dumps the fields into
 * the '.fdt' file and increases the segment's document count.
 *
 * @param array $storedFields array of Zend_Search_Lucene_Field objects
 */
public function addStoredFields($storedFields)
{
    if ($this->_fdxFile === null) {
        // First document of the segment: create stored fields files
        $fdxName = $this->_name . '.fdx';
        $fdtName = $this->_name . '.fdt';

        $this->_fdxFile = $this->_directory->createFile($fdxName);
        $this->_fdtFile = $this->_directory->createFile($fdtName);

        $this->_files[] = $fdxName;
        $this->_files[] = $fdtName;
    }

    // Field index entry points to the start of the document's field data
    $this->_fdxFile->writeLong($this->_fdtFile->tell());

    $this->_fdtFile->writeVInt(count($storedFields));
    foreach ($storedFields as $storedField) {
        $this->_fdtFile->writeVInt($this->_fields[$storedField->name]->number);

        // 0x01 - tokenized, 0x02 - binary, 0x04 - compressed (ZLIB), never set here
        $flags = 0x00;
        if ($storedField->isTokenized) {
            $flags |= 0x01;
        }
        if ($storedField->isBinary) {
            $flags |= 0x02;
        }
        $this->_fdtFile->writeByte($flags);

        if ($storedField->isBinary) {
            // Binary value is written as its length followed by raw bytes
            $this->_fdtFile->writeVInt(strlen($storedField->value));
            $this->_fdtFile->writeBytes($storedField->value);
        } else {
            $this->_fdtFile->writeString($storedField->getUtf8Value());
        }
    }

    $this->_docCount++;
}
/**
* Returns the total number of documents in this segment.
*
* The counter is increased by each addStoredFields() call.
*
* @return integer
*/
public function count()
{
return $this->_docCount;
}
/**
* Return segment name (as it was passed to the constructor)
*
* @return string
*/
public function getName()
{
return $this->_name;
}
/**
 * Dump Field Info (.fnm) segment file
 *
 * The norms (.nrm) file is written in the same pass: the norm vector of
 * each indexed field is appended to it in field order.
 */
protected function _dumpFNM()
{
    $fnmName = $this->_name . '.fnm';
    $nrmName = $this->_name . '.nrm';

    $fnmFile = $this->_directory->createFile($fnmName);
    $fnmFile->writeVInt(count($this->_fields));

    $nrmFile = $this->_directory->createFile($nrmName);
    // Write header
    $nrmFile->writeBytes('NRM');
    // Write format specifier
    $nrmFile->writeByte((int)0xFF);

    foreach ($this->_fields as $fieldInfo) {
        // 0x01 - field is indexed, 0x02 - term vector is stored.
        // 0x04 (term positions are stored with the term vectors) and
        // 0x08 (term offsets are stored with the term vectors) are not supported yet
        $flags = ($fieldInfo->isIndexed ? 0x01 : 0x00)
               | ($fieldInfo->storeTermVector ? 0x02 : 0x00);

        $fnmFile->writeString($fieldInfo->name);
        $fnmFile->writeByte($flags);

        if ($fieldInfo->isIndexed) {
            // Norms of all fields go into the single .nrm file
            // (separate per-field norm files of the pre-2.1 index format aren't written)
            $nrmFile->writeBytes($this->_norms[$fieldInfo->name]);
        }
    }

    $this->_files[] = $fnmName;
    $this->_files[] = $nrmName;
}
/**
* Term Dictionary ('.tis') file.
* It's created by initializeDictionaryFiles().
*
* @var Zend_Search_Lucene_Storage_File
*/
private $_tisFile = null;
/**
* Term Dictionary index ('.tii') file.
* It's created by initializeDictionaryFiles().
*
* @var Zend_Search_Lucene_Storage_File
*/
private $_tiiFile = null;
/**
* Frequencies ('.frq') file.
* It's created by initializeDictionaryFiles().
*
* @var Zend_Search_Lucene_Storage_File
*/
private $_frqFile = null;
/**
* Positions ('.prx') file.
* It's created by initializeDictionaryFiles().
*
* @var Zend_Search_Lucene_Storage_File
*/
private $_prxFile = null;
/**
* Number of terms written by addTerm() since initializeDictionaryFiles()
*
* @var integer
*/
private $_termCount;
/**
* Last term saved into the '.tis' file (null until the first term is written)
*
* @var Zend_Search_Lucene_Index_Term
*/
private $_prevTerm;
/**
* Term info of the last term saved into the '.tis' file
*
* @var Zend_Search_Lucene_Index_TermInfo
*/
private $_prevTermInfo;
/**
* Last term saved into the '.tii' (index) file
*
* @var Zend_Search_Lucene_Index_Term
*/
private $_prevIndexTerm;
/**
* Term info of the last term saved into the '.tii' (index) file
*
* @var Zend_Search_Lucene_Index_TermInfo
*/
private $_prevIndexTermInfo;
/**
* Position within the '.tis' file which was recorded for the last index entry.
* It's used to write the position of the next index entry as a delta.
*
* @var integer
*/
private $_lastIndexPosition;
/**
* Create dictionary, frequency and positions files and write necessary headers
*
* Creates the '.tis', '.tii', '.frq' and '.prx' files of the segment, registers them
* in the list of segment files and resets the state used by addTerm().
* The terms counts in the '.tis' and '.tii' headers are written as placeholders here;
* closeDictionaryFiles() overwrites them with the actual values.
*/
public function initializeDictionaryFiles()
{
// '.tis' header: format marker (0xFFFFFFFD is the marker of the 2.1+ format),
// terms count, index interval, skip interval, max skip levels
$this->_tisFile = $this->_directory->createFile($this->_name . '.tis');
$this->_tisFile->writeInt((int)0xFFFFFFFD);
$this->_tisFile->writeLong(0 /* dummy data for terms count */);
$this->_tisFile->writeInt(self::$indexInterval);
$this->_tisFile->writeInt(self::$skipInterval);
$this->_tisFile->writeInt(self::$maxSkipLevels);
// '.tii' header has the same layout as the '.tis' one
$this->_tiiFile = $this->_directory->createFile($this->_name . '.tii');
$this->_tiiFile->writeInt((int)0xFFFFFFFD);
$this->_tiiFile->writeLong(0 /* dummy data for terms count */);
$this->_tiiFile->writeInt(self::$indexInterval);
$this->_tiiFile->writeInt(self::$skipInterval);
$this->_tiiFile->writeInt(self::$maxSkipLevels);
/** Dump dictionary header: special first index entry (empty term) */
$this->_tiiFile->writeVInt(0); // prefix length
$this->_tiiFile->writeString(''); // suffix
// NOTE(review): the following int + byte pair looks like a hand-encoded VInt
// representation of the field number -1 - confirm against writeVInt() encoding
$this->_tiiFile->writeInt((int)0xFFFFFFFF); // field number
$this->_tiiFile->writeByte((int)0x0F);
$this->_tiiFile->writeVInt(0); // DocFreq
$this->_tiiFile->writeVInt(0); // FreqDelta
$this->_tiiFile->writeVInt(0); // ProxDelta
// NOTE(review): 24 presumably is the size of the '.tis' header written above
// (one 4-byte int, one 8-byte long and three 4-byte ints) - confirm the sizes
// written by writeInt()/writeLong()
$this->_tiiFile->writeVInt(24); // IndexDelta
$this->_frqFile = $this->_directory->createFile($this->_name . '.frq');
$this->_prxFile = $this->_directory->createFile($this->_name . '.prx');
// Register created files in the list of segment files
$this->_files[] = $this->_name . '.tis';
$this->_files[] = $this->_name . '.tii';
$this->_files[] = $this->_name . '.frq';
$this->_files[] = $this->_name . '.prx';
// Reset the terms writing state
$this->_prevTerm = null;
$this->_prevTermInfo = null;
$this->_prevIndexTerm = null;
$this->_prevIndexTermInfo = null;
// Has to match the IndexDelta value written for the first index entry above
$this->_lastIndexPosition = 24;
$this->_termCount = 0;
}
/**
 * Add term
 *
 * Term positions is an array( docId => array(pos1, pos2, pos3, ...), ... )
 *
 * Writes the documents and positions of the term into the frequencies and
 * positions files, adds the term to the terms dictionary and, for every
 * self::$indexInterval-th term, to the dictionary index as well.
 *
 * @param Zend_Search_Lucene_Index_Term $termEntry
 * @param array $termDocs
 */
public function addTerm($termEntry, $termDocs)
{
    $freqPointer = $this->_frqFile->tell();
    $proxPointer = $this->_prxFile->tell();

    $lastDocId = 0;
    foreach ($termDocs as $docId => $positions) {
        // Document ids are delta encoded. The lowest bit of the written value is
        // set if the term occurs in the document only once (no explicit frequency follows).
        $encodedDelta = ($docId - $lastDocId)*2;
        $lastDocId = $docId;

        $freq = count($positions);
        if ($freq <= 1) {
            $this->_frqFile->writeVInt($encodedDelta + 1);
        } else {
            $this->_frqFile->writeVInt($encodedDelta);
            $this->_frqFile->writeVInt($freq);
        }

        // Positions are delta encoded too
        $lastPosition = 0;
        foreach ($positions as $position) {
            $this->_prxFile->writeVInt($position - $lastPosition);
            $lastPosition = $position;
        }
    }

    $docFreq = count($termDocs);

    /**
     * @todo Write Skip Data to a freq file.
     * It's not used now, but make index more optimal
     */
    $skipOffset = ($docFreq >= self::$skipInterval)
                ? $this->_frqFile->tell() - $freqPointer
                : 0;

    // Dictionary entries refer to the field by its number
    $term = new Zend_Search_Lucene_Index_Term($termEntry->text,
                                              $this->_fields[$termEntry->field]->number);
    $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);

    $this->_dumpTermDictEntry($this->_tisFile, $this->_prevTerm, $term, $this->_prevTermInfo, $termInfo);

    $isIndexTerm = (($this->_termCount + 1) % self::$indexInterval == 0);
    if ($isIndexTerm) {
        $this->_dumpTermDictEntry($this->_tiiFile, $this->_prevIndexTerm, $term, $this->_prevIndexTermInfo, $termInfo);

        // Index entry additionally stores the dictionary file position as a delta
        $tisPosition = $this->_tisFile->tell();
        $this->_tiiFile->writeVInt($tisPosition - $this->_lastIndexPosition);
        $this->_lastIndexPosition = $tisPosition;
    }

    $this->_termCount++;
}
/**
 * Close dictionary
 *
 * Replaces the terms count placeholders written by initializeDictionaryFiles()
 * (located at offset 4 of the '.tis' and '.tii' files) with the actual values.
 */
public function closeDictionaryFiles()
{
    $this->_tisFile->seek(4);
    $this->_tisFile->writeLong($this->_termCount);

    // Number of complete index intervals, i.e. the number of terms dumped into the index file
    $indexedTerms = ($this->_termCount - $this->_termCount % self::$indexInterval)/self::$indexInterval;

    $this->_tiiFile->seek(4);
    // + 1 is used to count an additional special index entry (empty term at the start of the list)
    $this->_tiiFile->writeLong($indexedTerms + 1);
}
/**
 * Dump Term Dictionary segment file entry.
 * Used to write entry to .tis or .tii files
 *
 * The entry consists of: the number of leading characters shared with the
 * previous term (only when it belongs to the same field, otherwise 0), the
 * remaining suffix, the field number, the document frequency, the frequency
 * and proximity pointers (as differences from the previous entry when there
 * is one) and, if it is non-zero, the skip offset.
 *
 * $prevTerm and $prevTermInfo are passed by reference and are replaced with
 * $term and $termInfo, so the next call is encoded relative to this entry.
 *
 * @param Zend_Search_Lucene_Storage_File $dicFile
 * @param Zend_Search_Lucene_Index_Term $prevTerm
 * @param Zend_Search_Lucene_Index_Term $term
 * @param Zend_Search_Lucene_Index_TermInfo $prevTermInfo
 * @param Zend_Search_Lucene_Index_TermInfo $termInfo
 */
protected function _dumpTermDictEntry(Zend_Search_Lucene_Storage_File $dicFile,
                                      &$prevTerm, Zend_Search_Lucene_Index_Term $term,
                                      &$prevTermInfo, Zend_Search_Lucene_Index_TermInfo $termInfo)
{
    $sharedChars = 0;
    $suffix      = $term->text;

    if (isset($prevTerm) && $prevTerm->field == $term->field) {
        // Count the leading bytes which are identical in both terms
        $limit = min(strlen($prevTerm->text), strlen($term->text));
        for ($sameBytes = 0; $sameBytes < $limit; $sameBytes++) {
            if ($prevTerm->text[$sameBytes] != $term->text[$sameBytes]) {
                break;
            }
        }

        // Cut the byte match back to whole UTF-8 characters
        $sharedBytes = 0;
        while ($sharedBytes < $sameBytes) {
            $lead = ord($term->text[$sharedBytes]);
            if (($lead & 0xF0) == 0xF0) {
                $width = 4;
            } elseif (($lead & 0xE0) == 0xE0) {
                $width = 3;
            } elseif (($lead & 0xC0) == 0xC0) {
                $width = 2;
            } else {
                $width = 1;
            }

            if ($sharedBytes + $width > $sameBytes) {
                // The character crosses the matched bytes boundary, so it is not shared
                break;
            }

            $sharedChars++;
            $sharedBytes += $width;
        }

        $suffix = substr($term->text, $sharedBytes);
    }

    // Prefix length (in characters) followed by the suffix
    $dicFile->writeVInt($sharedChars);
    $dicFile->writeString($suffix);

    // Field number
    $dicFile->writeVInt($term->field);
    // DocFreq (the count of documents which contain the term)
    $dicFile->writeVInt($termInfo->docFreq);

    $prevTerm = $term;

    // FreqDelta and ProxDelta: absolute values for the first entry, differences afterwards
    $hasPrevInfo = isset($prevTermInfo);
    $freqDelta = $hasPrevInfo ? $termInfo->freqPointer - $prevTermInfo->freqPointer
                              : $termInfo->freqPointer;
    $proxDelta = $hasPrevInfo ? $termInfo->proxPointer - $prevTermInfo->proxPointer
                              : $termInfo->proxPointer;
    $dicFile->writeVInt($freqDelta);
    $dicFile->writeVInt($proxDelta);

    // SkipOffset is written only when it is non-zero
    // (addTerm() leaves it at 0 unless docFreq >= self::$skipInterval)
    if ($termInfo->skipOffset != 0) {
        $dicFile->writeVInt($termInfo->skipOffset);
    }

    $prevTermInfo = $termInfo;
}
/**
 * Generate compound index file
 *
 * Creates "<segment name>.cfs" and packs every file listed in $this->_files
 * into it. The file starts with the number of entries, followed by one
 * (data offset, file name) record per file, followed by the raw contents of
 * the files in the same order. Each source file is deleted from the
 * directory once it has been copied.
 */
protected function _generateCFS()
{
    $cfsFile = $this->_directory->createFile($this->_name . '.cfs');
    $cfsFile->writeVInt(count($this->_files));

    // First pass: write the entry table with placeholder offsets
    // and remember where each placeholder is located
    $offsetSlots = array();
    foreach ($this->_files as $fileName) {
        $offsetSlots[$fileName] = $cfsFile->tell();
        $cfsFile->writeLong(0); // placeholder, patched in the second pass
        $cfsFile->writeString($fileName);
    }

    // Second pass: append the data and patch the real offsets
    $chunkSize = 131072; /*128Kb*/
    foreach ($this->_files as $fileName) {
        $dataStart = $cfsFile->tell();

        // Jump back to the placeholder, store the actual offset, return to the end of file
        $cfsFile->seek($offsetSlots[$fileName]);
        $cfsFile->writeLong($dataStart);
        $cfsFile->seek($dataStart);

        $source    = $this->_directory->getFileObject($fileName);
        $remaining = $this->_directory->fileLength($fileName);

        // NOTE(review): the loop relies on readBytes() returning at least one byte
        // while $remaining > 0; an empty read would never terminate - confirm
        while ($remaining > 0) {
            $chunk = $source->readBytes(min($remaining, $chunkSize));
            $remaining -= strlen($chunk);
            $cfsFile->writeBytes($chunk);
        }

        $this->_directory->deleteFile($fileName);
    }
}
/**
* Close segment, write it to disk and return segment info
*
* Abstract: each concrete segment writer supplies its own implementation.
* The DocumentWriter and StreamWriter implementations return null instead
* of a segment info object when no documents were added to the segment.
*
* @return Zend_Search_Lucene_Index_SegmentInfo|null
*/
abstract public function close();
}

View File

@ -0,0 +1,230 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: DocumentWriter.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Index_SegmentWriter */
require_once 'Zend/Search/Lucene/Index/SegmentWriter.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter extends Zend_Search_Lucene_Index_SegmentWriter
{
    /**
     * Term Dictionary
     *
     * Maps term key (see Zend_Search_Lucene_Index_Term::key()) to the
     * corresponding Zend_Search_Lucene_Index_Term object.
     *
     * @var array
     */
    protected $_termDictionary;

    /**
     * Documents, which contain the term
     *
     * array( termKey => array( docId => array(pos1, pos2, ...), ... ), ... )
     *
     * @var array
     */
    protected $_termDocs;

    /**
     * Object constructor.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
    {
        parent::__construct($directory, $name);

        $this->_termDictionary = array();
        $this->_termDocs       = array();
    }

    /**
     * Adds a document to this segment.
     *
     * Indexed fields are split into terms (tokenized fields through the default
     * analyzer, untokenized ones as a single term at position 0) and their
     * positions are collected in $_termDocs. A norm byte is appended for every
     * indexed field known to the segment, and stored fields are passed on to
     * addStoredFields(). Indexed fields which produce no terms are registered
     * as non-indexed and non-tokenized.
     *
     * @param Zend_Search_Lucene_Document $document
     * @throws Zend_Search_Lucene_Exception  if a field requests term vector storing
     */
    public function addDocument(Zend_Search_Lucene_Document $document)
    {
        /** Zend_Search_Lucene_Search_Similarity */
        require_once 'Zend/Search/Lucene/Search/Similarity.php';

        $similarity   = Zend_Search_Lucene_Search_Similarity::getDefault();
        $storedFields = array();
        $docNorms     = array();

        foreach ($document->getFieldNames() as $fieldName) {
            $field = $document->getField($fieldName);

            if ($field->storeTermVector) {
                /**
                 * @todo term vector storing support
                 */
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
            }

            if ($field->isIndexed) {
                if ($field->isTokenized) {
                    /** Zend_Search_Lucene_Analysis_Analyzer */
                    require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

                    $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
                    $analyzer->setInput($field->value, $field->encoding);

                    $position   = 0;
                    $tokenCount = 0;
                    while (($token = $analyzer->nextToken()) !== null) {
                        $tokenCount++;

                        $term    = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
                        $termKey = $this->_registerTerm($term);

                        $position += $token->getPositionIncrement();
                        $this->_termDocs[$termKey][$this->_docCount][] = $position;
                    }

                    if ($tokenCount == 0) {
                        // No tokens were produced: treat the field as non-indexed and non-tokenized.
                        // A copy is modified, so the document's own field object stays untouched.
                        $field = clone $field;
                        $field->isTokenized = false;
                        $field->isIndexed   = false;
                    } else {
                        $docNorms[$field->name] = chr($similarity->encodeNorm( $similarity->lengthNorm($field->name,
                                                                                                       $tokenCount)*
                                                                               $document->boost*
                                                                               $field->boost ));
                    }
                } else {
                    $fieldUtf8Value = $field->getUtf8Value();

                    if ($fieldUtf8Value == '') {
                        // Empty value: treat the field as non-indexed and non-tokenized
                        $field = clone $field;
                        $field->isTokenized = false;
                        $field->isIndexed   = false;
                    } else {
                        // The whole value becomes a single term
                        $term    = new Zend_Search_Lucene_Index_Term($fieldUtf8Value, $field->name);
                        $termKey = $this->_registerTerm($term);

                        $this->_termDocs[$termKey][$this->_docCount][] = 0; // position

                        $docNorms[$field->name] = chr($similarity->encodeNorm( $similarity->lengthNorm($field->name, 1)*
                                                                               $document->boost*
                                                                               $field->boost ));
                    }
                }
            }

            if ($field->isStored) {
                $storedFields[] = $field;
            }

            $this->addField($field);
        }

        // Every indexed field of the segment gets exactly one norm byte per document
        foreach ($this->_fields as $fieldName => $field) {
            if ($field->isIndexed) {
                if (!isset($this->_norms[$fieldName])) {
                    // Field is new for the segment: pad the norms of the preceding documents
                    $this->_norms[$fieldName] = str_repeat(chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )),
                                                           $this->_docCount);
                }

                $this->_norms[$fieldName] .= isset($docNorms[$fieldName])
                                           ? $docNorms[$fieldName]
                                           : chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) ));
            }
        }

        $this->addStoredFields($storedFields);
    }

    /**
     * Make sure the term is present in the term dictionary and has a
     * positions list for the document which is currently being added.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @return string  term key
     */
    private function _registerTerm(Zend_Search_Lucene_Index_Term $term)
    {
        $termKey = $term->key();

        if (!isset($this->_termDictionary[$termKey])) {
            // New term
            $this->_termDictionary[$termKey] = $term;
            $this->_termDocs[$termKey]       = array($this->_docCount => array());
        } elseif (!isset($this->_termDocs[$termKey][$this->_docCount])) {
            // Existing term, but first occurrence within the current document
            $this->_termDocs[$termKey][$this->_docCount] = array();
        }

        return $termKey;
    }

    /**
     * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
     *
     * Terms are written ordered by their keys (string comparison).
     */
    protected function _dumpDictionary()
    {
        ksort($this->_termDictionary, SORT_STRING);

        $this->initializeDictionaryFiles();

        foreach ($this->_termDictionary as $termKey => $term) {
            $this->addTerm($term, $this->_termDocs[$termKey]);
        }

        $this->closeDictionaryFiles();
    }

    /**
     * Close segment, write it to disk and return segment info
     *
     * Nothing is written and null is returned if no documents were added.
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo|null
     */
    public function close()
    {
        if ($this->_docCount == 0) {
            return null;
        }

        $this->_dumpFNM();
        $this->_dumpDictionary();
        $this->_generateCFS();

        /** Zend_Search_Lucene_Index_SegmentInfo */
        require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';

        return new Zend_Search_Lucene_Index_SegmentInfo($this->_directory,
                                                        $this->_name,
                                                        $this->_docCount,
                                                        -1,
                                                        null,
                                                        true,
                                                        true);
    }
}

View File

@ -0,0 +1,94 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: StreamWriter.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Index_SegmentWriter */
require_once 'Zend/Search/Lucene/Index/SegmentWriter.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_SegmentWriter_StreamWriter extends Zend_Search_Lucene_Index_SegmentWriter
{
    /**
     * Object constructor.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
    {
        parent::__construct($directory, $name);
    }

    /**
     * Create stored fields files and open them for write
     *
     * Both files (.fdx and .fdt) are also added to the list of segment files.
     */
    public function createStoredFieldsFiles()
    {
        $fdxName = $this->_name . '.fdx';
        $fdtName = $this->_name . '.fdt';

        $this->_fdxFile = $this->_directory->createFile($fdxName);
        $this->_fdtFile = $this->_directory->createFile($fdtName);

        $this->_files[] = $fdxName;
        $this->_files[] = $fdtName;
    }

    /**
     * Append a norms vector to the norms collected for the field.
     *
     * The first vector given for a field starts its norms string;
     * later vectors are concatenated to it.
     *
     * @param string $fieldName
     * @param string $normVector
     */
    public function addNorm($fieldName, $normVector)
    {
        if (!isset($this->_norms[$fieldName])) {
            $this->_norms[$fieldName] = $normVector;
            return;
        }

        $this->_norms[$fieldName] .= $normVector;
    }

    /**
     * Close segment, write it to disk and return segment info
     *
     * Nothing is written and null is returned if the segment has no documents.
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo|null
     */
    public function close()
    {
        if ($this->_docCount == 0) {
            return null;
        }

        $this->_dumpFNM();
        $this->_generateCFS();

        /** Zend_Search_Lucene_Index_SegmentInfo */
        require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';

        $segmentInfo = new Zend_Search_Lucene_Index_SegmentInfo($this->_directory,
                                                                $this->_name,
                                                                $this->_docCount,
                                                                -1,
                                                                null,
                                                                true,
                                                                true);

        return $segmentInfo;
    }
}

View File

@ -0,0 +1,144 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Term.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* A Term represents a word from text. This is the unit of search. It is
* composed of two elements, the text of the word, as a string, and the name of
* the field that the text occurred in, an interned string.
*
* Note that terms may represent more than words from text fields, but also
* things like dates, email addresses, urls, etc.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_Term
{
    /**
     * Field name or field number (depending from context)
     *
     * @var mixed
     */
    public $field;

    /**
     * Term value
     *
     * @var string
     */
    public $text;

    /**
     * Object constructor
     *
     * @param string $text   term value
     * @param mixed  $field  field name or number; when null, the value of
     *                       Zend_Search_Lucene::getDefaultSearchField() is used
     */
    public function __construct($text, $field = null)
    {
        $this->field = ($field === null)? Zend_Search_Lucene::getDefaultSearchField() : $field;
        $this->text  = $text;
    }

    /**
     * Returns term key
     *
     * The key is the field followed by a zero byte and the term text.
     *
     * @return string
     */
    public function key()
    {
        return $this->field . chr(0) . $this->text;
    }

    /**
     * Get term prefix
     *
     * Returns at most $length leading UTF-8 characters of $str. A character
     * which is cut off by the end of the string is not included.
     *
     * @param string $str
     * @param integer $length  prefix length in characters
     * @return string
     */
    public static function getPrefix($str, $length)
    {
        $totalBytes  = strlen($str);
        $prefixBytes = 0;
        $prefixChars = 0;

        while ($prefixBytes < $totalBytes  &&  $prefixChars < $length) {
            $charBytes = self::_utf8CharBytes($str[$prefixBytes]);

            if ($prefixBytes + $charBytes > $totalBytes) {
                // wrong character
                break;
            }

            $prefixChars++;
            $prefixBytes += $charBytes;
        }

        return substr($str, 0, $prefixBytes);
    }

    /**
     * Get UTF-8 string length
     *
     * Counts characters, not bytes. A character which is cut off by the end
     * of the string is not counted.
     *
     * @param string $str
     * @return integer
     */
    public static function getLength($str)
    {
        $totalBytes = strlen($str);
        $bytes      = 0;
        $chars      = 0;

        while ($bytes < $totalBytes) {
            $charBytes = self::_utf8CharBytes($str[$bytes]);

            if ($bytes + $charBytes > $totalBytes) {
                // wrong character
                break;
            }

            $chars++;
            $bytes += $charBytes;
        }

        return $chars;
    }

    /**
     * Number of bytes occupied by a UTF-8 character, derived from its first byte.
     *
     * A byte with the two highest bits set starts a 2-byte character; the next
     * lower bits extend it to 3 and 4 bytes. Any other byte counts as one byte.
     *
     * @param string $leadByte  first byte of the character
     * @return integer  1, 2, 3 or 4
     */
    private static function _utf8CharBytes($leadByte)
    {
        $code = ord($leadByte);

        if (($code & 0xC0) != 0xC0) {
            return 1;
        }
        if (!($code & 0x20)) {
            return 2;
        }

        return ($code & 0x10) ? 4 : 3;
    }
}

View File

@ -0,0 +1,80 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: TermInfo.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* A Zend_Search_Lucene_Index_TermInfo represents a record of information stored for a term.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_TermInfo
{
    /**
     * Document frequency: the number of documents which contain the term.
     *
     * @var integer
     */
    public $docFreq;

    /**
     * Offset of the term's data in a Frequencies file.
     *
     * @var integer
     */
    public $freqPointer;

    /**
     * Offset of the term's data in a Positions file.
     *
     * @var integer
     */
    public $proxPointer;

    /**
     * Skip data offset in a Frequencies file.
     *
     * @var integer
     */
    public $skipOffset;

    /**
     * Term offset of the _next_ term in a TermDictionary file.
     * Used only for Term Index
     *
     * @var integer
     */
    public $indexPointer;

    /**
     * Object constructor
     *
     * Stores the given values in the public properties of the same names.
     *
     * @param integer $docFreq
     * @param integer $freqPointer
     * @param integer $proxPointer
     * @param integer $skipOffset
     * @param integer $indexPointer  optional, null by default
     */
    public function __construct($docFreq, $freqPointer, $proxPointer, $skipOffset, $indexPointer = null)
    {
        $this->indexPointer = $indexPointer;
        $this->skipOffset   = $skipOffset;
        $this->proxPointer  = $proxPointer;
        $this->freqPointer  = $freqPointer;
        $this->docFreq      = $docFreq;
    }
}

View File

@ -0,0 +1,49 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: TermsPriorityQueue.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_PriorityQueue */
require_once 'Zend/Search/Lucene/PriorityQueue.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_TermsPriorityQueue extends Zend_Search_Lucene_PriorityQueue
{
    /**
     * Compare elements
     *
     * Elements are ordered by the key of their current term (byte-wise string
     * comparison). Returns true, if $termsStream1 is "less" than $termsStream2;
     * false otherwise.
     *
     * @param mixed $termsStream1
     * @param mixed $termsStream2
     * @return boolean
     */
    protected function _less($termsStream1, $termsStream2)
    {
        $firstKey  = $termsStream1->currentTerm()->key();
        $secondKey = $termsStream2->currentTerm()->key();

        return strcmp($firstKey, $secondKey) < 0;
    }
}

View File

@ -0,0 +1,66 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Interface.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
interface Zend_Search_Lucene_Index_TermsStream_Interface
{
/**
* Reset terms stream.
*/
public function resetTermsStream();
/**
* Skip terms stream up to specified term prefix.
*
* Prefix contains fully specified field info and portion of searched term
*
* @param Zend_Search_Lucene_Index_Term $prefix
*/
public function skipTo(Zend_Search_Lucene_Index_Term $prefix);
/**
* Scans terms dictionary and returns next term
*
* @return Zend_Search_Lucene_Index_Term|null
*/
public function nextTerm();
/**
* Returns term in current position
*
* Zend_Search_Lucene_Index_TermsPriorityQueue compares its elements by
* currentTerm()->key(), so a stream placed into that queue is expected
* to have a current term.
*
* @return Zend_Search_Lucene_Index_Term|null
*/
public function currentTerm();
/**
* Close terms stream
*
* Should be used for resources clean up if stream is not read up to the end
*/
public function closeTermsStream();
}

View File

@ -0,0 +1,841 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Writer.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_LockManager */
require_once 'Zend/Search/Lucene/LockManager.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Index_Writer
{
/**
* @todo Implement Analyzer substitution
* @todo Implement Zend_Search_Lucene_Storage_DirectoryRAM and Zend_Search_Lucene_Storage_FileRAM to use it for
* temporary index files
* @todo Directory lock processing
*/
/**
* Number of documents required before the buffered in-memory
* documents are written into a new Segment
*
* Default value is 10
*
* @var integer
*/
public $maxBufferedDocs = 10;
/**
* Largest number of documents ever merged by addDocument().
* Small values (e.g., less than 10,000) are best for interactive indexing,
* as this limits the length of pauses while indexing to a few seconds.
* Larger values are best for batched indexing and speedier searches.
*
* Default value is PHP_INT_MAX
*
* @var integer
*/
public $maxMergeDocs = PHP_INT_MAX;
/**
* Determines how often segment indices are merged by addDocument().
*
* With smaller values, less RAM is used while indexing,
* and searches on unoptimized indices are faster,
* but indexing speed is slower.
*
* With larger values, more RAM is used during indexing,
* and while searches on unoptimized indices are slower,
* indexing is faster.
*
* Thus larger values (> 10) are best for batch index creation,
* and smaller values (< 10) for indices that are interactively maintained.
*
* Default value is 10
*
* @var integer
*/
public $mergeFactor = 10;
/**
* File system adapter.
*
* @var Zend_Search_Lucene_Storage_Directory
*/
private $_directory = null;
/**
* Changes counter.
*
* @var integer
*/
private $_versionUpdate = 0;
/**
* List of the segments, created by index writer
* Array of Zend_Search_Lucene_Index_SegmentInfo objects
*
* @var array
*/
private $_newSegments = array();
/**
* List of segments to be deleted on commit
*
* @var array
*/
private $_segmentsToDelete = array();
/**
* Current segment to add documents
*
* @var Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter
*/
private $_currentSegment = null;
/**
* Array of Zend_Search_Lucene_Index_SegmentInfo objects for this index.
*
* It's a reference to the corresponding Zend_Search_Lucene::$_segmentInfos array
*
* @var array Zend_Search_Lucene_Index_SegmentInfo
*/
private $_segmentInfos;
/**
* Index target format version
*
* @var integer
*/
private $_targetFormatVersion;
/**
* List of indexfiles extensions
*
* @var array
*/
private static $_indexExtensions = array('.cfs' => '.cfs',
'.cfx' => '.cfx',
'.fnm' => '.fnm',
'.fdx' => '.fdx',
'.fdt' => '.fdt',
'.tis' => '.tis',
'.tii' => '.tii',
'.frq' => '.frq',
'.prx' => '.prx',
'.tvx' => '.tvx',
'.tvd' => '.tvd',
'.tvf' => '.tvf',
'.del' => '.del',
'.sti' => '.sti' );
/**
 * Create empty index
 *
 * With $generation equal to 0 the index is created in pre-2.1 mode: existing
 * index files are removed from the directory, then a 'segments' file and a
 * 'deletable' file are written. Otherwise 'segments.gen' and the segments
 * file of the given generation are written.
 *
 * In both cases the segments file contains a format marker, a version
 * (initialized by current time), the segment name counter and a zero
 * segment count.
 *
 * @param Zend_Search_Lucene_Storage_Directory $directory
 * @param integer $generation
 * @param integer $nameCount
 */
public static function createIndex(Zend_Search_Lucene_Storage_Directory $directory, $generation, $nameCount)
{
    $isPre21Mode = ($generation == 0);

    if ($isPre21Mode) {
        // Remove everything which looks like an index file
        foreach ($directory->fileList() as $file) {
            $isIndexFile = $file == 'deletable'
                        || $file == 'segments'
                        || isset(self::$_indexExtensions[ substr($file, strlen($file)-4)])
                        || preg_match('/\.f\d+$/i', $file) /* matches <segment_name>.f<decimal_nmber> file names */;

            if ($isIndexFile) {
                $directory->deleteFile($file);
            }
        }

        $segmentsFile = $directory->createFile('segments');
        $formatMarker = (int)0xFFFFFFFF;
    } else {
        $genFile = $directory->createFile('segments.gen');
        $genFile->writeInt((int)0xFFFFFFFE);
        // Write generation two times
        $genFile->writeLong($generation);
        $genFile->writeLong($generation);

        $segmentsFile = $directory->createFile(Zend_Search_Lucene::getSegmentFileName($generation));
        $formatMarker = (int)0xFFFFFFFD;
    }

    // Segments file header is the same for both modes except for the format marker
    $segmentsFile->writeInt($formatMarker);
    // version (initialized by current time)
    $segmentsFile->writeLong(round(microtime(true)));
    // name counter
    $segmentsFile->writeInt($nameCount);
    // segment counter
    $segmentsFile->writeInt(0);

    if ($isPre21Mode) {
        // Empty 'deletable' file: only a zero counter
        $deletableFile = $directory->createFile('deletable');
        $deletableFile->writeInt(0);
    }
}
/**
 * Open the index for writing
 *
 * $segmentInfos is taken by reference: the writer works on the caller's
 * array of segment info objects instead of a copy.
 *
 * @param Zend_Search_Lucene_Storage_Directory $directory
 * @param array $segmentInfos
 * @param integer $targetFormatVersion
 */
public function __construct(Zend_Search_Lucene_Storage_Directory $directory, &$segmentInfos, $targetFormatVersion)
{
    $this->_targetFormatVersion = $targetFormatVersion;
    $this->_segmentInfos        = &$segmentInfos;
    $this->_directory           = $directory;
}
/**
 * Adds a document to this index.
 *
 * The document goes into the current in-memory segment, which is created on
 * first use. Once that segment holds $maxBufferedDocs documents the changes
 * are committed. Afterwards the segments are merged if necessary and the
 * changes counter is increased.
 *
 * @param Zend_Search_Lucene_Document $document
 */
public function addDocument(Zend_Search_Lucene_Document $document)
{
    /** Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter */
    require_once 'Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php';

    if ($this->_currentSegment === null) {
        $segmentName = $this->_newSegmentName();
        $this->_currentSegment =
            new Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter($this->_directory, $segmentName);
    }

    $segment = $this->_currentSegment;
    $segment->addDocument($document);

    $bufferIsFull = ($segment->count() >= $this->maxBufferedDocs);
    if ($bufferIsFull) {
        $this->commit();
    }

    $this->_maybeMergeSegments();

    $this->_versionUpdate++;
}
/**
 * Check if we have anything to merge
 *
 * Walks through the segments in ascending order of their sizes and
 * accumulates the sizes into a pool. The merge threshold starts at
 * $maxBufferedDocs and is multiplied by $mergeFactor every time a segment
 * reaches it (the pool is restarted at that point). A merge is needed as
 * soon as a pool is at least as large as the threshold it was collected for.
 * The check stops with false once the threshold exceeds $maxMergeDocs.
 *
 * Only the sizes matter here; the segments themselves are not collected.
 *
 * @return boolean
 */
private function _hasAnythingToMerge()
{
    $segmentSizes = array();
    foreach ($this->_segmentInfos as $segName => $segmentInfo) {
        $segmentSizes[$segName] = $segmentInfo->count();
    }
    asort($segmentSizes, SORT_NUMERIC);

    $poolSize    = 0;
    $sizeToMerge = $this->maxBufferedDocs;

    foreach ($segmentSizes as $size) {
        // Check, if segment comes into a new merging block
        while ($size >= $sizeToMerge) {
            // Previous block is large enough to be merged
            if ($poolSize >= $sizeToMerge) {
                return true;
            }

            $poolSize     = 0;
            $sizeToMerge *= $this->mergeFactor;

            if ($sizeToMerge > $this->maxMergeDocs) {
                return false;
            }
        }

        $poolSize += $size;
    }

    return $poolSize >= $sizeToMerge;
}
/**
 * Merge segments if necessary
 *
 * Does nothing if the optimization lock can't be obtained or if there is
 * nothing to merge. Otherwise segments are pooled in ascending order of their
 * sizes and every pool which reaches the current merge threshold is merged.
 * The threshold starts at $maxBufferedDocs and grows by $mergeFactor per
 * level; processing stops once it exceeds $maxMergeDocs.
 *
 * NOTE(review): the optimization lock is released on the regular return
 * paths only; nothing here releases it if _updateSegments() or
 * _mergeSegments() throws - confirm whether that is intended.
 */
private function _maybeMergeSegments()
{
    if (Zend_Search_Lucene_LockManager::obtainOptimizationLock($this->_directory) === false) {
        return;
    }

    if (!$this->_hasAnythingToMerge()) {
        Zend_Search_Lucene_LockManager::releaseOptimizationLock($this->_directory);
        return;
    }

    // Refresh the segments list first, to be sure that the segments have not
    // been merged by another process in the meantime.
    //
    // All merging happens in this class under the optimization lock, and
    // _updateSegments() reloads the list from the latest index generation.
    // So the only thing which can happen while we are merging existing segments
    // is that new segments are added to the index. Those are picked up by a later
    // _updateSegments() call, made either by another process or by this one through
    // the commit() at the end of _mergeSegments(). This is guaranteed because
    // _updateSegments() executions are serialised with exclusive locks.
    $this->_updateSegments();

    // Standard auto-optimization procedure
    $docCounts = array();
    foreach ($this->_segmentInfos as $name => $info) {
        $docCounts[$name] = $info->count();
    }
    asort($docCounts, SORT_NUMERIC);

    $pool       = array();
    $pooledDocs = 0;
    $threshold  = $this->maxBufferedDocs;

    foreach ($docCounts as $name => $docCount) {
        // The segment is too large for the current level: close the level and move up
        while ($docCount >= $threshold) {
            // Merge what has been pooled for this level if it is large enough
            if ($pooledDocs >= $threshold) {
                $this->_mergeSegments($pool);
            }

            $pool       = array();
            $pooledDocs = 0;
            $threshold *= $this->mergeFactor;

            if ($threshold > $this->maxMergeDocs) {
                Zend_Search_Lucene_LockManager::releaseOptimizationLock($this->_directory);
                return;
            }
        }

        $pool[]      = $this->_segmentInfos[$name];
        $pooledDocs += $docCount;
    }

    // Segments pooled for the last level
    if ($pooledDocs >= $threshold) {
        $this->_mergeSegments($pool);
    }

    Zend_Search_Lucene_LockManager::releaseOptimizationLock($this->_directory);
}
/**
 * Merge specified segments
 *
 * $segments is an array of SegmentInfo objects
 *
 * All given segments are merged into one new segment. The source segments
 * are scheduled for deletion, the merged segment (if the merger produced
 * one) is registered as a new segment, and the changes are committed.
 *
 * @param array $segments
 */
private function _mergeSegments($segments)
{
    $mergedName = $this->_newSegmentName();

    /** Zend_Search_Lucene_Index_SegmentMerger */
    require_once 'Zend/Search/Lucene/Index/SegmentMerger.php';
    $merger = new Zend_Search_Lucene_Index_SegmentMerger($this->_directory, $mergedName);

    foreach ($segments as $sourceSegment) {
        $merger->addSource($sourceSegment);

        $sourceName = $sourceSegment->getName();
        $this->_segmentsToDelete[$sourceName] = $sourceName;
    }

    $mergedSegment = $merger->merge();
    if ($mergedSegment !== null) {
        $this->_newSegments[$mergedSegment->getName()] = $mergedSegment;
    }

    $this->commit();
}
/**
 * Write the next generation of the segments file and clean up the index directory
 *
 * Steps performed, all while holding the exclusive index write lock:
 *  - pending per-segment changes are written via writeChanges();
 *  - the segments file of the current generation is copied into a newly created
 *    file for generation + 1, skipping segments listed in $this->_segmentsToDelete
 *    and appending the segments queued in $this->_newSegments;
 *  - 'segments.gen' receives the new generation number before and after the copy;
 *  - if the read lock can be escalated, files that are no longer referenced are
 *    deleted; otherwise only the files of removed segments are purged;
 *  - $this->_segmentInfos is reduced to the segments present in the new file.
 *
 * If the copy fails, the previous generation number is written back into
 * 'segments.gen', the write lock is released and the error is rethrown wrapped
 * into Zend_Search_Lucene_Exception.
 *
 * @throws Zend_Search_Lucene_Exception
 */
private function _updateSegments()
{
// Get an exclusive index lock
Zend_Search_Lucene_LockManager::obtainWriteLock($this->_directory);
// Write down changes for the segments
foreach ($this->_segmentInfos as $segInfo) {
$segInfo->writeChanges();
}
// Source file is the current generation; destination is the next one
// ($generation is incremented here and refers to the NEW generation from now on)
$generation = Zend_Search_Lucene::getActualGeneration($this->_directory);
$segmentsFile = $this->_directory->getFileObject(Zend_Search_Lucene::getSegmentFileName($generation), false);
$newSegmentFile = $this->_directory->createFile(Zend_Search_Lucene::getSegmentFileName(++$generation), false);
// Open 'segments.gen', creating it only when the failure message says it is not readable;
// any other failure is rethrown
try {
$genFile = $this->_directory->getFileObject('segments.gen', false);
} catch (Zend_Search_Lucene_Exception $e) {
if (strpos($e->getMessage(), 'is not readable') !== false) {
$genFile = $this->_directory->createFile('segments.gen');
} else {
throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
}
}
// Leading 4-byte marker of 'segments.gen' (the error path below seeks past it to offset 4)
$genFile->writeInt((int)0xFFFFFFFE);
// Write generation (first copy)
$genFile->writeLong($generation);
try {
// Write format marker
// (no marker is written when the target version is neither FORMAT_2_1 nor FORMAT_2_3)
if ($this->_targetFormatVersion == Zend_Search_Lucene::FORMAT_2_1) {
$newSegmentFile->writeInt((int)0xFFFFFFFD);
} else if ($this->_targetFormatVersion == Zend_Search_Lucene::FORMAT_2_3) {
$newSegmentFile->writeInt((int)0xFFFFFFFC);
}
// Read src file format identifier
$format = $segmentsFile->readInt();
if ($format == (int)0xFFFFFFFF) {
$srcFormat = Zend_Search_Lucene::FORMAT_PRE_2_1;
} else if ($format == (int)0xFFFFFFFD) {
$srcFormat = Zend_Search_Lucene::FORMAT_2_1;
} else if ($format == (int)0xFFFFFFFC) {
$srcFormat = Zend_Search_Lucene::FORMAT_2_3;
} else {
throw new Zend_Search_Lucene_Exception('Unsupported segments file format');
}
// Index version: value from the source file plus the pending increment, which is then reset
$version = $segmentsFile->readLong() + $this->_versionUpdate;
$this->_versionUpdate = 0;
$newSegmentFile->writeLong($version);
// Write segment name counter
$newSegmentFile->writeInt($segmentsFile->readInt());
// Get number of segments offset
$numOfSegmentsOffset = $newSegmentFile->tell();
// Write dummy data (segment counter)
// The real count is only known after filtering and is patched in at this offset at the end
$newSegmentFile->writeInt(0);
// Read number of segments
$segmentsCount = $segmentsFile->readInt();
// Map of segment name => document count for every segment written to the new file
$segments = array();
for ($count = 0; $count < $segmentsCount; $count++) {
$segName = $segmentsFile->readString();
$segSize = $segmentsFile->readInt();
if ($srcFormat == Zend_Search_Lucene::FORMAT_PRE_2_1) {
// pre-2.1 index format
// Nothing more is read for the entry; the remaining fields get fixed defaults
$delGen = 0;
$hasSingleNormFile = false;
$numField = (int)0xFFFFFFFF;
$isCompoundByte = 0;
$docStoreOptions = null;
} else {
$delGen = $segmentsFile->readLong();
if ($srcFormat == Zend_Search_Lucene::FORMAT_2_3) {
// Doc store fields are only read for 2.3 sources; offset -1 means no doc store data follows
$docStoreOffset = $segmentsFile->readInt();
if ($docStoreOffset != (int)0xFFFFFFFF) {
$docStoreSegment = $segmentsFile->readString();
$docStoreIsCompoundFile = $segmentsFile->readByte();
$docStoreOptions = array('offset' => $docStoreOffset,
'segment' => $docStoreSegment,
'isCompound' => ($docStoreIsCompoundFile == 1));
} else {
$docStoreOptions = null;
}
} else {
$docStoreOptions = null;
}
$hasSingleNormFile = $segmentsFile->readByte();
$numField = $segmentsFile->readInt();
// Norm generations follow only when the field count is not -1
$normGens = array();
if ($numField != (int)0xFFFFFFFF) {
for ($count1 = 0; $count1 < $numField; $count1++) {
$normGens[] = $segmentsFile->readLong();
}
}
$isCompoundByte = $segmentsFile->readByte();
}
// Segments scheduled for deletion are read (to advance the source file) but not copied
if (!in_array($segName, $this->_segmentsToDelete)) {
// Load segment if necessary
if (!isset($this->_segmentInfos[$segName])) {
// NOTE(review): $isCompound is not assigned in this iteration when the byte is
// none of 0xFF/0x00/0x01 - confirm that such values cannot occur
if ($isCompoundByte == 0xFF) {
// The segment is not a compound file
$isCompound = false;
} else if ($isCompoundByte == 0x00) {
// The status is unknown
$isCompound = null;
} else if ($isCompoundByte == 0x01) {
// The segment is a compound file
$isCompound = true;
}
/** Zend_Search_Lucene_Index_SegmentInfo */
require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
$this->_segmentInfos[$segName] =
new Zend_Search_Lucene_Index_SegmentInfo($this->_directory,
$segName,
$segSize,
$delGen,
$docStoreOptions,
$hasSingleNormFile,
$isCompound);
} else {
// Retrieve actual deletions file generation number
$delGen = $this->_segmentInfos[$segName]->getDelGen();
}
// Copy the segment entry into the new segments file
$newSegmentFile->writeString($segName);
$newSegmentFile->writeInt($segSize);
$newSegmentFile->writeLong($delGen);
if ($this->_targetFormatVersion == Zend_Search_Lucene::FORMAT_2_3) {
if ($docStoreOptions !== null) {
$newSegmentFile->writeInt($docStoreOffset);
$newSegmentFile->writeString($docStoreSegment);
$newSegmentFile->writeByte($docStoreIsCompoundFile);
} else {
// Set DocStoreOffset to -1
$newSegmentFile->writeInt((int)0xFFFFFFFF);
}
} else if ($docStoreOptions !== null) {
// The source entry carries doc store data but the target format is not 2.3
// NOTE(review): this throw is inside the try block, so the catch below releases
// the write lock a second time - confirm the double release is harmless
// Release index write lock
Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);
throw new Zend_Search_Lucene_Exception('Index conversion to lower format version is not supported.');
}
$newSegmentFile->writeByte($hasSingleNormFile);
$newSegmentFile->writeInt($numField);
if ($numField != (int)0xFFFFFFFF) {
foreach ($normGens as $normGen) {
$newSegmentFile->writeLong($normGen);
}
}
$newSegmentFile->writeByte($isCompoundByte);
$segments[$segName] = $segSize;
}
}
$segmentsFile->close();
// Final count: segments kept from the source file plus the newly created ones
$segmentsCount = count($segments) + count($this->_newSegments);
foreach ($this->_newSegments as $segName => $segmentInfo) {
$newSegmentFile->writeString($segName);
$newSegmentFile->writeInt($segmentInfo->count());
// delete file generation: -1 (there is no delete file yet)
// written as two ints of 0xFFFFFFFF each
$newSegmentFile->writeInt((int)0xFFFFFFFF);$newSegmentFile->writeInt((int)0xFFFFFFFF);
if ($this->_targetFormatVersion == Zend_Search_Lucene::FORMAT_2_3) {
// docStoreOffset: -1 (segment doesn't use shared doc store)
$newSegmentFile->writeInt((int)0xFFFFFFFF);
}
// HasSingleNormFile
$newSegmentFile->writeByte($segmentInfo->hasSingleNormFile());
// NumField
$newSegmentFile->writeInt((int)0xFFFFFFFF);
// IsCompoundFile
$newSegmentFile->writeByte($segmentInfo->isCompound() ? 1 : -1);
$segments[$segmentInfo->getName()] = $segmentInfo->count();
$this->_segmentInfos[$segName] = $segmentInfo;
}
// All queued segments are now part of the segments file
$this->_newSegments = array();
// Go back to the placeholder written earlier and store the real count
$newSegmentFile->seek($numOfSegmentsOffset);
$newSegmentFile->writeInt($segmentsCount); // Update segments count
$newSegmentFile->close();
} catch (Exception $e) {
/** Restore previous index generation */
$generation--;
// Offset 4 skips the marker int written at the start of 'segments.gen'
$genFile->seek(4, SEEK_SET);
// Write generation number twice
$genFile->writeLong($generation); $genFile->writeLong($generation);
// Release index write lock
Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);
// Throw the exception
require_once 'Zend/Search/Lucene/Exception.php';
throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
}
// Write generation (second copy)
$genFile->writeLong($generation);
// Check if another update or read process is not running now
// If yes, skip clean-up procedure
if (Zend_Search_Lucene_LockManager::escalateReadLock($this->_directory)) {
/**
* Clean-up directory
*/
// Three parallel arrays (file name, deletion group, order within the group);
// they are sorted together by array_multisort() below
$filesToDelete = array();
$filesTypes = array();
$filesNumbers = array();
// list of .del files of currently used segments
// each segment can have several generations of .del files
// only last should not be deleted
$delFiles = array();
foreach ($this->_directory->fileList() as $file) {
if ($file == 'deletable') {
// 'deletable' file
$filesToDelete[] = $file;
$filesTypes[] = 0; // delete this file first, since it's not used starting from Lucene v2.1
$filesNumbers[] = 0;
} else if ($file == 'segments') {
// 'segments' file
$filesToDelete[] = $file;
$filesTypes[] = 1; // second file to be deleted "zero" version of segments file (Lucene pre-2.1)
$filesNumbers[] = 0;
} else if (preg_match('/^segments_[a-zA-Z0-9]+$/i', $file)) {
// 'segments_xxx' file
// Check if it's not a just created generation file
if ($file != Zend_Search_Lucene::getSegmentFileName($generation)) {
$filesToDelete[] = $file;
$filesTypes[] = 2; // first group of files for deletions
$filesNumbers[] = (int)base_convert(substr($file, 9), 36, 10); // ordered by segment generation numbers
}
} else if (preg_match('/(^_([a-zA-Z0-9]+))\.f\d+$/i', $file, $matches)) {
// one of per segment files ('<segment_name>.f<decimal_number>')
// Check if it's not one of the segments in the current segments set
if (!isset($segments[$matches[1]])) {
$filesToDelete[] = $file;
$filesTypes[] = 3; // second group of files for deletions
$filesNumbers[] = (int)base_convert($matches[2], 36, 10); // order by segment number
}
} else if (preg_match('/(^_([a-zA-Z0-9]+))(_([a-zA-Z0-9]+))\.del$/i', $file, $matches)) {
// one of per segment files ('<segment_name>_<del_generation>.del' where <segment_name> is '_<segment_number>')
// Check if it's not one of the segments in the current segments set
if (!isset($segments[$matches[1]])) {
$filesToDelete[] = $file;
$filesTypes[] = 3; // second group of files for deletions
$filesNumbers[] = (int)base_convert($matches[2], 36, 10); // order by segment number
} else {
// .del file of a live segment: collect it by segment number and delete generation
$segmentNumber = (int)base_convert($matches[2], 36, 10);
$delGeneration = (int)base_convert($matches[4], 36, 10);
if (!isset($delFiles[$segmentNumber])) {
$delFiles[$segmentNumber] = array();
}
$delFiles[$segmentNumber][$delGeneration] = $file;
}
} else if (isset(self::$_indexExtensions[substr($file, strlen($file)-4)])) {
// one of per segment files ('<segment_name>.<ext>')
$segmentName = substr($file, 0, strlen($file) - 4);
// Check if it's not one of the segments in the current segments set
// (files of the segment currently being written are kept as well)
if (!isset($segments[$segmentName]) &&
($this->_currentSegment === null || $this->_currentSegment->getName() != $segmentName)) {
$filesToDelete[] = $file;
$filesTypes[] = 3; // second group of files for deletions
$filesNumbers[] = (int)base_convert(substr($file, 1 /* skip '_' */, strlen($file)-5), 36, 10); // order by segment number
}
}
}
$maxGenNumber = 0;
// process .del files of currently used segments
foreach ($delFiles as $segmentNumber => $segmentDelFiles) {
ksort($delFiles[$segmentNumber], SORT_NUMERIC);
array_pop($delFiles[$segmentNumber]); // remove last delete file generation from candidates for deleting
// Highest generation still listed for this segment (key() yields null when nothing is left)
end($delFiles[$segmentNumber]);
$lastGenNumber = key($delFiles[$segmentNumber]);
if ($lastGenNumber > $maxGenNumber) {
$maxGenNumber = $lastGenNumber;
}
}
foreach ($delFiles as $segmentNumber => $segmentDelFiles) {
foreach ($segmentDelFiles as $delGeneration => $file) {
$filesToDelete[] = $file;
$filesTypes[] = 4; // third group of files for deletions
$filesNumbers[] = $segmentNumber*$maxGenNumber + $delGeneration; // order by <segment_number>,<del_generation> pair
}
}
// Reorder files for deleting
// (by group, then by number within the group, then by file name)
array_multisort($filesTypes, SORT_ASC, SORT_NUMERIC,
$filesNumbers, SORT_ASC, SORT_NUMERIC,
$filesToDelete, SORT_ASC, SORT_STRING);
foreach ($filesToDelete as $file) {
try {
/** Skip shared docstore segments deleting */
/** @todo Process '.cfx' files to check if they are already unused */
if (substr($file, strlen($file)-4) != '.cfx') {
$this->_directory->deleteFile($file);
}
} catch (Zend_Search_Lucene_Exception $e) {
if (strpos($e->getMessage(), 'Can\'t delete file') === false) {
// That's not "file is under processing or already deleted" exception
// Pass it through
// NOTE(review): at this point the read lock is still escalated and the write lock
// is still held; neither is released before rethrowing - confirm this is intended
throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
}
}
}
// Return read lock into the previous state
Zend_Search_Lucene_LockManager::deEscalateReadLock($this->_directory);
} else {
// Only release resources if another index reader is running now
foreach ($this->_segmentsToDelete as $segName) {
foreach (self::$_indexExtensions as $ext) {
$this->_directory->purgeFile($segName . $ext);
}
}
}
// Clean-up _segmentsToDelete container
$this->_segmentsToDelete = array();
// Release index write lock
Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);
// Remove unused segments from segments list
foreach ($this->_segmentInfos as $segName => $segmentInfo) {
if (!isset($segments[$segName])) {
unset($this->_segmentInfos[$segName]);
}
}
}
/**
 * Commit current changes
 *
 * Closes the segment that is currently being written (if there is one),
 * queues the segment returned by close() for inclusion into the index and
 * then rewrites the segments file.
 */
public function commit()
{
    $openSegment = $this->_currentSegment;

    if ($openSegment !== null) {
        // close() may return null; in that case nothing is queued
        $finishedSegment = $openSegment->close();
        if ($finishedSegment !== null) {
            $this->_newSegments[$finishedSegment->getName()] = $finishedSegment;
        }

        $this->_currentSegment = null;
    }

    $this->_updateSegments();
}
/**
 * Merges the provided indexes into this index.
 *
 * Not implemented yet: the body is empty, so calling this method currently
 * does nothing and $readers is ignored.
 *
 * @param array $readers
 * @return void
 */
public function addIndexes($readers)
{
/**
 * @todo implementation
 */
}
/**
 * Merges all segments together into new one
 *
 * Returns true on success and false if another optimization or auto-optimization process
 * is running now.
 *
 * The optimization lock is released on every exit path: if refreshing the
 * segments list or merging fails, the lock is released before the exception
 * is passed on to the caller.
 *
 * @return boolean
 * @throws Exception  Any exception raised while updating or merging segments is rethrown unchanged
 */
public function optimize()
{
    if (Zend_Search_Lucene_LockManager::obtainOptimizationLock($this->_directory) === false) {
        return false;
    }

    try {
        // Update segments list to be sure all segments are not merged yet by another process
        //
        // Segment merging functionality is concentrated in this class and surrounded
        // by optimization lock obtaining/releasing.
        // _updateSegments() refreshes segments list from the latest index generation.
        // So only new segments can be added to the index while we are merging some already existing
        // segments.
        // Newly added segments will be also included into the index by the _updateSegments() call
        // either by another process or by the current process with the commit() call at the end of _mergeSegments() method.
        // That's guaranteed by the serialisation of _updateSegments() execution using exclusive locks.
        $this->_updateSegments();

        $this->_mergeSegments($this->_segmentInfos);
    } catch (Exception $e) {
        // Do not keep the optimization lock after a failure, otherwise later
        // optimize() calls that reuse the same lock would keep returning false.
        // (catch + rethrow is used instead of "finally" to stay compatible with
        // the PHP versions the rest of this file targets)
        Zend_Search_Lucene_LockManager::releaseOptimizationLock($this->_directory);
        throw $e;
    }

    Zend_Search_Lucene_LockManager::releaseOptimizationLock($this->_directory);

    return true;
}
/**
 * Get name for new segment
 *
 * Reads the segment name counter stored in the current segments file,
 * writes back the counter incremented by one and returns the name built
 * from the value that was read ('_' followed by the counter in base 36).
 * The read-modify-write is performed under the exclusive index write lock,
 * which is released again even if accessing the segments file fails.
 *
 * @return string
 * @throws Exception  Any exception raised while accessing the segments file is rethrown unchanged
 */
private function _newSegmentName()
{
    Zend_Search_Lucene_LockManager::obtainWriteLock($this->_directory);

    try {
        $generation = Zend_Search_Lucene::getActualGeneration($this->_directory);
        $segmentsFile = $this->_directory->getFileObject(Zend_Search_Lucene::getSegmentFileName($generation), false);

        $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version)
        $segmentNameCounter = $segmentsFile->readInt();

        // Go back to the counter position and store the incremented value
        $segmentsFile->seek(12); // 12 = 4 (int, file format marker) + 8 (long, index version)
        $segmentsFile->writeInt($segmentNameCounter + 1);

        // Flush output to guarantee that wrong value will not be loaded between unlock and
        // return (which calls $segmentsFile destructor)
        $segmentsFile->flush();
    } catch (Exception $e) {
        // Do not leave the index write-locked when the counter could not be updated
        Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);
        throw $e;
    }

    Zend_Search_Lucene_LockManager::releaseWriteLock($this->_directory);

    return '_' . base_convert($segmentNameCounter, 10, 36);
}
}

View File

@ -0,0 +1,417 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Interface.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Index_TermsStream_Interface */
require_once 'Zend/Search/Lucene/Index/TermsStream/Interface.php';
/** Classes used within Zend_Search_Lucene_Interface API */
/** Zend_Search_Lucene_Document */
require_once 'Zend/Search/Lucene/Document.php';
/** Zend_Search_Lucene_Index_Term */
require_once 'Zend/Search/Lucene/Index/Term.php';
/** Zend_Search_Lucene_Index_DocsFilter */
require_once 'Zend/Search/Lucene/Index/DocsFilter.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
interface Zend_Search_Lucene_Interface extends Zend_Search_Lucene_Index_TermsStream_Interface
{
/**
 * Get current generation number
 *
 * Returns generation number
 * 0 means pre-2.1 index format
 * -1 means there are no segments files.
 *
 * @param Zend_Search_Lucene_Storage_Directory $directory
 * @return integer
 * @throws Zend_Search_Lucene_Exception
 */
public static function getActualGeneration(Zend_Search_Lucene_Storage_Directory $directory);
/**
 * Get segments file name
 *
 * @param integer $generation
 * @return string
 */
public static function getSegmentFileName($generation);
/**
 * Get index format version
 *
 * @return integer
 */
public function getFormatVersion();
/**
 * Set index format version.
 * Index is converted to this format at the nearest update time
 *
 * @param int $formatVersion
 * @throws Zend_Search_Lucene_Exception
 */
public function setFormatVersion($formatVersion);
/**
 * Returns the Zend_Search_Lucene_Storage_Directory instance for this index.
 *
 * @return Zend_Search_Lucene_Storage_Directory
 */
public function getDirectory();
/**
 * Returns the total number of documents in this index (including deleted documents).
 *
 * @return integer
 */
public function count();
/**
 * Returns one greater than the largest possible document number.
 * This may be used to, e.g., determine how big to allocate a structure which will have
 * an element for every document number in an index.
 *
 * @return integer
 */
public function maxDoc();
/**
 * Returns the total number of non-deleted documents in this index.
 *
 * @return integer
 */
public function numDocs();
/**
 * Checks whether a document is deleted
 *
 * @param integer $id
 * @return boolean
 * @throws Zend_Search_Lucene_Exception Exception is thrown if $id is out of the range
 */
public function isDeleted($id);
/**
 * Set default search field.
 *
 * Null means that search is performed through all fields by default
 *
 * Default value is null
 *
 * @param string $fieldName
 */
public static function setDefaultSearchField($fieldName);
/**
 * Get default search field.
 *
 * Null means that search is performed through all fields by default
 *
 * @return string
 */
public static function getDefaultSearchField();
/**
 * Set result set limit.
 *
 * 0 (default) means no limit
 *
 * @param integer $limit
 */
public static function setResultSetLimit($limit);
/**
 * Get result set limit.
 *
 * 0 means no limit
 *
 * @return integer
 */
public static function getResultSetLimit();
/**
 * Retrieve index maxBufferedDocs option
 *
 * maxBufferedDocs is a minimal number of documents required before
 * the buffered in-memory documents are written into a new Segment
 *
 * Default value is 10
 *
 * @return integer
 */
public function getMaxBufferedDocs();
/**
 * Set index maxBufferedDocs option
 *
 * maxBufferedDocs is a minimal number of documents required before
 * the buffered in-memory documents are written into a new Segment
 *
 * Default value is 10
 *
 * @param integer $maxBufferedDocs
 */
public function setMaxBufferedDocs($maxBufferedDocs);
/**
 * Retrieve index maxMergeDocs option
 *
 * maxMergeDocs is a largest number of documents ever merged by addDocument().
 * Small values (e.g., less than 10,000) are best for interactive indexing,
 * as this limits the length of pauses while indexing to a few seconds.
 * Larger values are best for batched indexing and speedier searches.
 *
 * Default value is PHP_INT_MAX
 *
 * @return integer
 */
public function getMaxMergeDocs();
/**
 * Set index maxMergeDocs option
 *
 * maxMergeDocs is a largest number of documents ever merged by addDocument().
 * Small values (e.g., less than 10,000) are best for interactive indexing,
 * as this limits the length of pauses while indexing to a few seconds.
 * Larger values are best for batched indexing and speedier searches.
 *
 * Default value is PHP_INT_MAX
 *
 * @param integer $maxMergeDocs
 */
public function setMaxMergeDocs($maxMergeDocs);
/**
 * Retrieve index mergeFactor option
 *
 * mergeFactor determines how often segment indices are merged by addDocument().
 * With smaller values, less RAM is used while indexing,
 * and searches on unoptimized indices are faster,
 * but indexing speed is slower.
 * With larger values, more RAM is used during indexing,
 * and while searches on unoptimized indices are slower,
 * indexing is faster.
 * Thus larger values (> 10) are best for batch index creation,
 * and smaller values (< 10) for indices that are interactively maintained.
 *
 * Default value is 10
 *
 * @return integer
 */
public function getMergeFactor();
/**
 * Set index mergeFactor option
 *
 * mergeFactor determines how often segment indices are merged by addDocument().
 * With smaller values, less RAM is used while indexing,
 * and searches on unoptimized indices are faster,
 * but indexing speed is slower.
 * With larger values, more RAM is used during indexing,
 * and while searches on unoptimized indices are slower,
 * indexing is faster.
 * Thus larger values (> 10) are best for batch index creation,
 * and smaller values (< 10) for indices that are interactively maintained.
 *
 * Default value is 10
 *
 * @param integer $mergeFactor
 */
public function setMergeFactor($mergeFactor);
/**
 * Performs a query against the index and returns an array
 * of Zend_Search_Lucene_Search_QueryHit objects.
 * Input is a string or Zend_Search_Lucene_Search_Query.
 *
 * @param mixed $query
 * @return array Zend_Search_Lucene_Search_QueryHit
 * @throws Zend_Search_Lucene_Exception
 */
public function find($query);
/**
 * Returns a list of all unique field names that exist in this index.
 *
 * @param boolean $indexed
 * @return array
 */
public function getFieldNames($indexed = false);
/**
 * Returns a Zend_Search_Lucene_Document object for the document
 * number $id in this index.
 *
 * @param integer|Zend_Search_Lucene_Search_QueryHit $id
 * @return Zend_Search_Lucene_Document
 */
public function getDocument($id);
/**
 * Returns true if index contains documents with specified term.
 *
 * Is used for query optimization.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @return boolean
 */
public function hasTerm(Zend_Search_Lucene_Index_Term $term);
/**
 * Returns IDs of all the documents containing term.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @return array
 */
public function termDocs(Zend_Search_Lucene_Index_Term $term, $docsFilter = null);
/**
 * Returns documents filter for all documents containing term.
 *
 * It performs the same operation as termDocs, but returns the result as
 * Zend_Search_Lucene_Index_DocsFilter object
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @return Zend_Search_Lucene_Index_DocsFilter
 */
public function termDocsFilter(Zend_Search_Lucene_Index_Term $term, $docsFilter = null);
/**
 * Returns an array of all term freqs.
 * Return array structure: array( docId => freq, ...)
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @return array
 */
public function termFreqs(Zend_Search_Lucene_Index_Term $term, $docsFilter = null);
/**
 * Returns an array of all term positions in the documents.
 * Return array structure: array( docId => array( pos1, pos2, ...), ...)
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @return array
 */
public function termPositions(Zend_Search_Lucene_Index_Term $term, $docsFilter = null);
/**
 * Returns the number of documents in this index containing the $term.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @return integer
 */
public function docFreq(Zend_Search_Lucene_Index_Term $term);
/**
 * Retrieve similarity used by index reader
 *
 * @return Zend_Search_Lucene_Search_Similarity
 */
public function getSimilarity();
/**
 * Returns a normalization factor for "field, document" pair.
 *
 * @param integer $id
 * @param string $fieldName
 * @return float
 */
public function norm($id, $fieldName);
/**
 * Returns true if any documents have been deleted from this index.
 *
 * @return boolean
 */
public function hasDeletions();
/**
 * Deletes a document from the index.
 * $id is an internal document id
 *
 * @param integer|Zend_Search_Lucene_Search_QueryHit $id
 * @throws Zend_Search_Lucene_Exception
 */
public function delete($id);
/**
 * Adds a document to this index.
 *
 * @param Zend_Search_Lucene_Document $document
 */
public function addDocument(Zend_Search_Lucene_Document $document);
/**
 * Commit changes resulting from delete() or undeleteAll() operations.
 */
public function commit();
/**
 * Optimize index.
 *
 * Merges all segments into one
 */
public function optimize();
/**
 * Returns an array of all terms in this index.
 *
 * @return array
 */
public function terms();
/**
 * Undeletes all documents currently marked as deleted in this index.
 */
public function undeleteAll();
/**
 * Add reference to the index object
 *
 * @internal
 */
public function addReference();
/**
 * Remove reference from the index object
 *
 * When reference count becomes zero, index is closed and resources are cleaned up
 *
 * @internal
 */
public function removeReference();
}

View File

@ -0,0 +1,236 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: LockManager.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Storage_Directory */
require_once 'Zend/Search/Lucene/Storage/Directory.php';
/** Zend_Search_Lucene_Storage_File */
require_once 'Zend/Search/Lucene/Storage/File.php';
/**
 * Utility class providing the lock handling used by the index
 *
 * Every lock is a file inside the given directory object; locking and
 * unlocking are delegated to the lock()/unlock() methods of that file object.
 *
 * @category Zend
 * @package Zend_Search_Lucene
 * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license http://framework.zend.com/license/new-bsd New BSD License
 */
class Zend_Search_Lucene_LockManager
{
    /**
     * Names of the files used to represent each kind of lock
     */
    const WRITE_LOCK_FILE = 'write.lock.file';
    const READ_LOCK_FILE = 'read.lock.file';
    const READ_LOCK_PROCESSING_LOCK_FILE = 'read-lock-processing.lock.file';
    const OPTIMIZATION_LOCK_FILE = 'optimization.lock.file';

    /**
     * Obtain exclusive write lock on the index
     *
     * @param Zend_Search_Lucene_Storage_Directory $lockDirectory
     * @return Zend_Search_Lucene_Storage_File
     * @throws Zend_Search_Lucene_Exception  When the exclusive lock can't be taken
     */
    public static function obtainWriteLock(Zend_Search_Lucene_Storage_Directory $lockDirectory)
    {
        $writeLock = $lockDirectory->createFile(self::WRITE_LOCK_FILE);

        if ($writeLock->lock(LOCK_EX)) {
            return $writeLock;
        }

        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Can\'t obtain exclusive index lock');
    }

    /**
     * Release exclusive write lock
     *
     * @param Zend_Search_Lucene_Storage_Directory $lockDirectory
     */
    public static function releaseWriteLock(Zend_Search_Lucene_Storage_Directory $lockDirectory)
    {
        $lockDirectory->getFileObject(self::WRITE_LOCK_FILE)->unlock();
    }

    /**
     * Obtain the exclusive "read escalation/de-escalation" lock
     *
     * Required to protect the escalate/de-escalate read lock process
     * on GFS (and potentially other) mounted filesystems.
     *
     * Background: GFS supports cluster-wide locking via flock(), but its
     * locking semantics differ from those of a local filesystem:
     *  - conditional promotion of a shared lock to an exclusive one always fails;
     *  - lock release requests are honored but not processed immediately,
     *    which makes subsequent conditional requests fail erratically;
     *  - when a lock is demoted, the exclusive lock is released before the
     *    shared lock is set, leaving a window in which another process can
     *    gain an exclusive lock it shouldn't be allowed to get.
     *
     * @param Zend_Search_Lucene_Storage_Directory $lockDirectory
     * @return Zend_Search_Lucene_Storage_File
     * @throws Zend_Search_Lucene_Exception  When the exclusive lock can't be taken
     */
    private static function _startReadLockProcessing(Zend_Search_Lucene_Storage_Directory $lockDirectory)
    {
        $processingLock = $lockDirectory->createFile(self::READ_LOCK_PROCESSING_LOCK_FILE);

        if ($processingLock->lock(LOCK_EX)) {
            return $processingLock;
        }

        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Can\'t obtain exclusive lock for the read lock processing file');
    }

    /**
     * Release the exclusive "read escalation/de-escalation" lock
     *
     * Required to protect the escalate/de-escalate read lock process
     * on GFS (and potentially other) mounted filesystems.
     *
     * @param Zend_Search_Lucene_Storage_Directory $lockDirectory
     */
    private static function _stopReadLockProcessing(Zend_Search_Lucene_Storage_Directory $lockDirectory)
    {
        $lockDirectory->getFileObject(self::READ_LOCK_PROCESSING_LOCK_FILE)->unlock();
    }

    /**
     * Obtain shared read lock on the index
     *
     * It doesn't block other read or update processes, but prevents the index
     * from being cleaned up prematurely
     *
     * @param Zend_Search_Lucene_Storage_Directory $lockDirectory
     * @return Zend_Search_Lucene_Storage_File
     * @throws Zend_Search_Lucene_Exception  When the shared lock can't be taken
     */
    public static function obtainReadLock(Zend_Search_Lucene_Storage_Directory $lockDirectory)
    {
        $readLock = $lockDirectory->createFile(self::READ_LOCK_FILE);

        if ($readLock->lock(LOCK_SH)) {
            return $readLock;
        }

        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Can\'t obtain shared reading index lock');
    }

    /**
     * Release shared read lock
     *
     * @param Zend_Search_Lucene_Storage_Directory $lockDirectory
     */
    public static function releaseReadLock(Zend_Search_Lucene_Storage_Directory $lockDirectory)
    {
        $lockDirectory->getFileObject(self::READ_LOCK_FILE)->unlock();
    }

    /**
     * Escalate Read lock to exclusive level
     *
     * The whole procedure runs under the read-lock-processing lock. On
     * failure the read lock is put back to shared level.
     *
     * @param Zend_Search_Lucene_Storage_Directory $lockDirectory
     * @return boolean  True if the exclusive lock was obtained, false otherwise
     */
    public static function escalateReadLock(Zend_Search_Lucene_Storage_Directory $lockDirectory)
    {
        self::_startReadLockProcessing($lockDirectory);

        $readLock = $lockDirectory->getFileObject(self::READ_LOCK_FILE);

        // First, release the shared lock for the benefit of GFS since
        // it will fail the conditional request to promote the lock to
        // "exclusive" while the shared lock is held (even when we are
        // the only holder).
        $readLock->unlock();

        // GFS doesn't clean up its tables right away after the "unlock"
        // above returns, which can make the conditional request for the
        // "exclusive" lock fail. The request is therefore retried several
        // times. The extra cost is negligible and is only paid on GFS
        // filesystems, or on local filesystems when another local process
        // holds the shared lock.
        $attemptsLeft = 10;
        while ($attemptsLeft > 0) {
            if ($readLock->lock(LOCK_EX, true)) {
                // Exclusive lock is obtained!
                self::_stopReadLockProcessing($lockDirectory);
                return true;
            }

            // wait 1 microsecond
            usleep(1);
            $attemptsLeft--;
        }

        // Restore lock state
        $readLock->lock(LOCK_SH);
        self::_stopReadLockProcessing($lockDirectory);

        return false;
    }

    /**
     * De-escalate Read lock to shared level
     *
     * @param Zend_Search_Lucene_Storage_Directory $lockDirectory
     */
    public static function deEscalateReadLock(Zend_Search_Lucene_Storage_Directory $lockDirectory)
    {
        $lockDirectory->getFileObject(self::READ_LOCK_FILE)->lock(LOCK_SH);
    }

    /**
     * Obtain exclusive optimization lock on the index
     *
     * Returns lock object on success and false otherwise (doesn't block execution)
     *
     * @param Zend_Search_Lucene_Storage_Directory $lockDirectory
     * @return mixed
     */
    public static function obtainOptimizationLock(Zend_Search_Lucene_Storage_Directory $lockDirectory)
    {
        $optimizationLock = $lockDirectory->createFile(self::OPTIMIZATION_LOCK_FILE);

        return $optimizationLock->lock(LOCK_EX, true) ? $optimizationLock : false;
    }

    /**
     * Release exclusive optimization lock
     *
     * @param Zend_Search_Lucene_Storage_Directory $lockDirectory
     */
    public static function releaseOptimizationLock(Zend_Search_Lucene_Storage_Directory $lockDirectory)
    {
        $lockDirectory->getFileObject(self::OPTIMIZATION_LOCK_FILE)->unlock();
    }
}

View File

@ -0,0 +1,973 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: MultiSearcher.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Interface */
require_once 'Zend/Search/Lucene/Interface.php';
/**
* Multisearcher allows searching through several independent indexes.
*
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Interface_MultiSearcher implements Zend_Search_Lucene_Interface
{
/**
* List of indices for searching.
* Array of Zend_Search_Lucene_Interface objects
*
* @var array
*/
protected $_indices;
/**
 * Object constructor.
 *
 * Stores the given list and verifies that each entry implements
 * Zend_Search_Lucene_Interface.
 *
 * @param array $indices Arrays of indices for search
 * @throws Zend_Search_Lucene_Exception  If an entry is not a Zend_Search_Lucene_Interface
 */
public function __construct($indices = array())
{
    $this->_indices = $indices;

    foreach ($indices as $candidate) {
        if ($candidate instanceof Zend_Search_Lucene_Interface) {
            continue;
        }

        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('sub-index objects have to implement Zend_Search_Lucene_Interface.');
    }
}
/**
* Add index for searching.
*
* @param Zend_Search_Lucene_Interface $index
*/
public function addIndex(Zend_Search_Lucene_Interface $index)
{
$this->_indices[] = $index;
}
/**
* Get current generation number
*
* Returns generation number
* 0 means pre-2.1 index format
* -1 means there are no segments files.
*
* @param Zend_Search_Lucene_Storage_Directory $directory
* @return integer
* @throws Zend_Search_Lucene_Exception
*/
public static function getActualGeneration(Zend_Search_Lucene_Storage_Directory $directory)
{
require_once 'Zend/Search/Lucene/Exception.php';
throw new Zend_Search_Lucene_Exception("Generation number can't be retrieved for multi-searcher");
}
/**
* Get segments file name
*
* @param integer $generation
* @return string
*/
public static function getSegmentFileName($generation)
{
return Zend_Search_Lucene::getSegmentFileName($generation);
}
/**
* Get index format version
*
* @return integer
* @throws Zend_Search_Lucene_Exception
*/
public function getFormatVersion()
{
require_once 'Zend/Search/Lucene/Exception.php';
throw new Zend_Search_Lucene_Exception("Format version can't be retrieved for multi-searcher");
}
/**
* Set index format version.
* Index is converted to this format at the nearest update time
*
* @param int $formatVersion
*/
public function setFormatVersion($formatVersion)
{
foreach ($this->_indices as $index) {
$index->setFormatVersion($formatVersion);
}
}
/**
* Returns the Zend_Search_Lucene_Storage_Directory instance for this index.
*
* @return Zend_Search_Lucene_Storage_Directory
*/
public function getDirectory()
{
require_once 'Zend/Search/Lucene/Exception.php';
throw new Zend_Search_Lucene_Exception("Index directory can't be retrieved for multi-searcher");
}
/**
 * Returns the total number of documents in this index (including deleted documents).
 *
 * The result is the sum of count() over all sub-indices; 0 if there are none.
 *
 * @return integer
 */
public function count()
{
    $count = 0;

    foreach ($this->_indices as $index) {
        // Must be called on the sub-index; $this->_indices is a plain array
        // and has no count() method.
        $count += $index->count();
    }

    return $count;
}
/**
* Returns one greater than the largest possible document number.
* This may be used to, e.g., determine how big to allocate a structure which will have
* an element for every document number in an index.
*
* @return integer
*/
public function maxDoc()
{
return $this->count();
}
/**
 * Returns the total number of non-deleted documents in this index.
 *
 * The result is the sum of numDocs() over all sub-indices; 0 if there are none.
 *
 * @return integer
 */
public function numDocs()
{
    $docs = 0;

    foreach ($this->_indices as $index) {
        // Must be called on the sub-index; $this->_indices is a plain array
        // and has no numDocs() method.
        $docs += $index->numDocs();
    }

    return $docs;
}
/**
 * Checks whether a document is deleted.
 *
 * The multi-searcher document id is translated into a sub-index local id by
 * walking the sub-indices in order and subtracting each one's count().
 *
 * @param integer $id
 * @return boolean
 * @throws Zend_Search_Lucene_Exception Exception is thrown if $id is out of the range
 */
public function isDeleted($id)
{
    $localId = $id;

    foreach ($this->_indices as $subIndex) {
        $size = $subIndex->count();

        if ($localId < $size) {
            // The id falls inside this sub-index.
            return $subIndex->isDeleted($localId);
        }

        $localId -= $size;
    }

    require_once 'Zend/Search/Lucene/Exception.php';
    throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
}
/**
 * Set default search field.
 *
 * Null means, that search is performed through all fields by default
 *
 * Default value is null
 *
 * This method is static, so it has no $this and cannot iterate over the
 * sub-indices of a particular multi-searcher. It forwards to the static
 * setting on Zend_Search_Lucene instead.
 * NOTE(review): assumes the sub-indices are Zend_Search_Lucene based and
 * therefore share that static setting - confirm for custom implementations.
 *
 * @param string $fieldName
 */
public static function setDefaultSearchField($fieldName)
{
    require_once 'Zend/Search/Lucene.php';

    Zend_Search_Lucene::setDefaultSearchField($fieldName);
}
/**
 * Get default search field.
 *
 * Null means, that search is performed through all fields by default
 *
 * This method is static, so it has no $this and cannot inspect the
 * sub-indices of a particular multi-searcher. It returns the static
 * setting held by Zend_Search_Lucene instead.
 * NOTE(review): assumes the sub-indices are Zend_Search_Lucene based and
 * therefore share that static setting - confirm for custom implementations.
 *
 * @return string
 */
public static function getDefaultSearchField()
{
    require_once 'Zend/Search/Lucene.php';

    return Zend_Search_Lucene::getDefaultSearchField();
}
/**
 * Set result set limit.
 *
 * 0 (default) means no limit
 *
 * This method is static, so it has no $this and cannot iterate over the
 * sub-indices of a particular multi-searcher. It forwards to the static
 * setting on Zend_Search_Lucene instead.
 * NOTE(review): assumes the sub-indices are Zend_Search_Lucene based and
 * therefore share that static setting - confirm for custom implementations.
 *
 * @param integer $limit
 */
public static function setResultSetLimit($limit)
{
    require_once 'Zend/Search/Lucene.php';

    Zend_Search_Lucene::setResultSetLimit($limit);
}
/**
 * Get result set limit.
 *
 * 0 means no limit
 *
 * This method is static, so it has no $this and cannot inspect the
 * sub-indices of a particular multi-searcher. It returns the static
 * setting held by Zend_Search_Lucene instead.
 * NOTE(review): assumes the sub-indices are Zend_Search_Lucene based and
 * therefore share that static setting - confirm for custom implementations.
 *
 * @return integer
 */
public static function getResultSetLimit()
{
    require_once 'Zend/Search/Lucene.php';

    return Zend_Search_Lucene::getResultSetLimit();
}
/**
 * Retrieve index maxBufferedDocs option
 *
 * maxBufferedDocs is a minimal number of documents required before
 * the buffered in-memory documents are written into a new Segment
 *
 * Default value is 10
 *
 * All sub-indices must report the same value.
 *
 * @return integer
 * @throws Zend_Search_Lucene_Exception if there are no sub-indices or their values differ
 */
public function getMaxBufferedDocs()
{
    if (count($this->_indices) == 0) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Indices list is empty');
    }

    // The first sub-index provides the reference value the others are compared to.
    $maxBufferedDocs = reset($this->_indices)->getMaxBufferedDocs();

    foreach ($this->_indices as $index) {
        if ($index->getMaxBufferedDocs() !== $maxBufferedDocs) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Indices have different maxBufferedDocs option values.');
        }
    }

    return $maxBufferedDocs;
}
/**
* Set index maxBufferedDocs option
*
* maxBufferedDocs is a minimal number of documents required before
* the buffered in-memory documents are written into a new Segment
*
* Default value is 10
*
* @param integer $maxBufferedDocs
*/
public function setMaxBufferedDocs($maxBufferedDocs)
{
foreach ($this->_indices as $index) {
$index->setMaxBufferedDocs($maxBufferedDocs);
}
}
/**
 * Retrieve index maxMergeDocs option
 *
 * maxMergeDocs is a largest number of documents ever merged by addDocument().
 * Small values (e.g., less than 10,000) are best for interactive indexing,
 * as this limits the length of pauses while indexing to a few seconds.
 * Larger values are best for batched indexing and speedier searches.
 *
 * Default value is PHP_INT_MAX
 *
 * All sub-indices must report the same value.
 *
 * @return integer
 * @throws Zend_Search_Lucene_Exception if there are no sub-indices or their values differ
 */
public function getMaxMergeDocs()
{
    if (count($this->_indices) == 0) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Indices list is empty');
    }

    // The first sub-index provides the reference value the others are compared to.
    $maxMergeDocs = reset($this->_indices)->getMaxMergeDocs();

    foreach ($this->_indices as $index) {
        if ($index->getMaxMergeDocs() !== $maxMergeDocs) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Indices have different maxMergeDocs option values.');
        }
    }

    return $maxMergeDocs;
}
/**
* Set index maxMergeDocs option
*
* maxMergeDocs is a largest number of documents ever merged by addDocument().
* Small values (e.g., less than 10,000) are best for interactive indexing,
* as this limits the length of pauses while indexing to a few seconds.
* Larger values are best for batched indexing and speedier searches.
*
* Default value is PHP_INT_MAX
*
* @param integer $maxMergeDocs
*/
public function setMaxMergeDocs($maxMergeDocs)
{
foreach ($this->_indices as $index) {
$index->setMaxMergeDocs($maxMergeDocs);
}
}
/**
 * Retrieve index mergeFactor option
 *
 * mergeFactor determines how often segment indices are merged by addDocument().
 * With smaller values, less RAM is used while indexing,
 * and searches on unoptimized indices are faster,
 * but indexing speed is slower.
 * With larger values, more RAM is used during indexing,
 * and while searches on unoptimized indices are slower,
 * indexing is faster.
 * Thus larger values (> 10) are best for batch index creation,
 * and smaller values (< 10) for indices that are interactively maintained.
 *
 * Default value is 10
 *
 * All sub-indices must report the same value.
 *
 * @return integer
 * @throws Zend_Search_Lucene_Exception if there are no sub-indices or their values differ
 */
public function getMergeFactor()
{
    if (count($this->_indices) == 0) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Indices list is empty');
    }

    // The first sub-index provides the reference value the others are compared to.
    $mergeFactor = reset($this->_indices)->getMergeFactor();

    foreach ($this->_indices as $index) {
        if ($index->getMergeFactor() !== $mergeFactor) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Indices have different mergeFactor option values.');
        }
    }

    return $mergeFactor;
}
/**
 * Set index mergeFactor option
 *
 * mergeFactor determines how often segment indices are merged by addDocument().
 * With smaller values, less RAM is used while indexing,
 * and searches on unoptimized indices are faster,
 * but indexing speed is slower.
 * With larger values, more RAM is used during indexing,
 * and while searches on unoptimized indices are slower,
 * indexing is faster.
 * Thus larger values (> 10) are best for batch index creation,
 * and smaller values (< 10) for indices that are interactively maintained.
 *
 * Default value is 10
 *
 * The value is applied to every sub-index.
 *
 * @param integer $mergeFactor
 */
public function setMergeFactor($mergeFactor)
{
    foreach ($this->_indices as $index) {
        // Forward to the matching setter (not setMaxMergeDocs(), which is a different option).
        $index->setMergeFactor($mergeFactor);
    }
}
/**
 * Runs a query against every sub-index and returns the combined array
 * of Zend_Search_Lucene_Search_QueryHit objects.
 * Input is a string or Zend_Search_Lucene_Search_Query.
 *
 * Hit ids coming from the second and later sub-indices are shifted by the
 * total count() of the preceding sub-indices, so ids are unique within the
 * multi-searcher.
 *
 * @param mixed $query
 * @return array Zend_Search_Lucene_Search_QueryHit
 * @throws Zend_Search_Lucene_Exception
 */
public function find($query)
{
    if (count($this->_indices) == 0) {
        return array();
    }

    $hitsPerIndex = array();
    $offset       = 0;

    foreach ($this->_indices as $subIndex) {
        $subHits = $subIndex->find($query);

        if ($offset != 0) {
            // Translate sub-index local ids into multi-searcher ids.
            foreach ($subHits as $hit) {
                $hit->id += $offset;
            }
        }

        $hitsPerIndex[] = $subHits;
        $offset        += $subIndex->count();
    }

    /** @todo Implement advanced sorting */

    return call_user_func_array('array_merge', $hitsPerIndex);
}
/**
 * Returns a list of all unique field names that exist in this index.
 *
 * Field names of all sub-indices are merged and de-duplicated. An empty
 * array is returned when there are no sub-indices.
 *
 * @param boolean $indexed
 * @return array
 */
public function getFieldNames($indexed = false)
{
    // array_merge() must not be invoked with an empty argument list.
    if (count($this->_indices) == 0) {
        return array();
    }

    $fieldNamesList = array();
    foreach ($this->_indices as $index) {
        $fieldNamesList[] = $index->getFieldNames($indexed);
    }

    return array_unique(call_user_func_array('array_merge', $fieldNamesList));
}
/**
 * Returns a Zend_Search_Lucene_Document object for the document
 * number $id in this index.
 *
 * A query hit is accepted in place of a number; its id property is used.
 * The id is translated into a sub-index local id by walking the sub-indices
 * in order and subtracting each one's count().
 *
 * @param integer|Zend_Search_Lucene_Search_QueryHit $id
 * @return Zend_Search_Lucene_Document
 * @throws Zend_Search_Lucene_Exception Exception is thrown if $id is out of the range
 */
public function getDocument($id)
{
    $localId = ($id instanceof Zend_Search_Lucene_Search_QueryHit) ? $id->id : $id;

    foreach ($this->_indices as $subIndex) {
        $size = $subIndex->count();

        if ($localId < $size) {
            // The id falls inside this sub-index.
            return $subIndex->getDocument($localId);
        }

        $localId -= $size;
    }

    require_once 'Zend/Search/Lucene/Exception.php';
    throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
}
/**
* Returns true if index contain documents with specified term.
*
* Is used for query optimization.
*
* @param Zend_Search_Lucene_Index_Term $term
* @return boolean
*/
public function hasTerm(Zend_Search_Lucene_Index_Term $term)
{
foreach ($this->_indices as $index) {
if ($index->hasTerm($term)) {
return true;
}
}
return false;
}
/**
 * Returns IDs of all the documents containing term.
 *
 * Ids reported by the second and later sub-indices are shifted by the total
 * count() of the preceding sub-indices. An empty array is returned when
 * there are no sub-indices.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter  Not supported; must be null
 * @return array
 * @throws Zend_Search_Lucene_Exception if a documents filter is given
 */
public function termDocs(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
{
    if ($docsFilter != null) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Document filters could not used with multi-searcher');
    }

    // array_merge() must not be invoked with an empty argument list.
    if (count($this->_indices) == 0) {
        return array();
    }

    $docsList   = array();
    $indexShift = 0;

    foreach ($this->_indices as $index) {
        $docs = $index->termDocs($term);

        if ($indexShift != 0) {
            // Translate sub-index local ids into multi-searcher ids.
            foreach ($docs as $id => $docId) {
                $docs[$id] += $indexShift;
            }
        }

        $indexShift += $index->count();
        $docsList[]  = $docs;
    }

    return call_user_func_array('array_merge', $docsList);
}
/**
* Returns documents filter for all documents containing term.
*
* It performs the same operation as termDocs, but return result as
* Zend_Search_Lucene_Index_DocsFilter object
*
* @param Zend_Search_Lucene_Index_Term $term
* @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
* @return Zend_Search_Lucene_Index_DocsFilter
* @throws Zend_Search_Lucene_Exception
*/
public function termDocsFilter(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
{
require_once 'Zend/Search/Lucene/Exception.php';
throw new Zend_Search_Lucene_Exception('Document filters could not used with multi-searcher');
}
/**
 * Returns an array of all term freqs.
 * Return array structure: array( docId => freq, ...)
 *
 * Document ids reported by the second and later sub-indices are shifted by
 * the total count() of the preceding sub-indices. An empty array is returned
 * when there are no sub-indices.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter  Not supported; must be null
 * @return array
 * @throws Zend_Search_Lucene_Exception if a documents filter is given
 */
public function termFreqs(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
{
    if ($docsFilter != null) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Document filters could not used with multi-searcher');
    }

    $freqsMerged = array();
    $indexShift  = 0;

    foreach ($this->_indices as $index) {
        // Results are combined by direct key assignment rather than
        // array_merge(): array_merge() renumbers integer keys, which would
        // discard the document ids this method is documented to return.
        foreach ($index->termFreqs($term) as $docId => $freq) {
            $freqsMerged[$docId + $indexShift] = $freq;
        }

        $indexShift += $index->count();
    }

    return $freqsMerged;
}
/**
 * Returns an array of all term positions in the documents.
 * Return array structure: array( docId => array( pos1, pos2, ...), ...)
 *
 * Document ids reported by the second and later sub-indices are shifted by
 * the total count() of the preceding sub-indices. An empty array is returned
 * when there are no sub-indices.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter  Not supported; must be null
 * @return array
 * @throws Zend_Search_Lucene_Exception if a documents filter is given
 */
public function termPositions(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
{
    if ($docsFilter != null) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Document filters could not used with multi-searcher');
    }

    $positionsMerged = array();
    $indexShift      = 0;

    foreach ($this->_indices as $index) {
        // Results are combined by direct key assignment rather than
        // array_merge(): array_merge() renumbers integer keys, which would
        // discard the document ids this method is documented to return.
        foreach ($index->termPositions($term) as $docId => $positions) {
            $positionsMerged[$docId + $indexShift] = $positions;
        }

        $indexShift += $index->count();
    }

    return $positionsMerged;
}
/**
* Returns the number of documents in this index containing the $term.
*
* @param Zend_Search_Lucene_Index_Term $term
* @return integer
*/
public function docFreq(Zend_Search_Lucene_Index_Term $term)
{
$docFreq = 0;
foreach ($this->_indices as $index) {
$docFreq += $index->docFreq($term);
}
return $docFreq;
}
/**
 * Retrieve the similarity used by the index reader.
 *
 * Every sub-index must report the identical similarity object.
 *
 * @return Zend_Search_Lucene_Search_Similarity
 * @throws Zend_Search_Lucene_Exception if there are no sub-indices or their similarities differ
 */
public function getSimilarity()
{
    if (count($this->_indices) == 0) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Indices list is empty');
    }

    // The first sub-index provides the reference the others are compared to.
    $reference = reset($this->_indices)->getSimilarity();

    foreach ($this->_indices as $subIndex) {
        if ($subIndex->getSimilarity() === $reference) {
            continue;
        }

        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Indices have different similarity.');
    }

    return $reference;
}
/**
 * Returns a normalization factor for "field, document" pair.
 *
 * The id is translated into a sub-index local id by walking the sub-indices
 * in order and subtracting each one's count(). Null is returned when the id
 * lies beyond the last sub-index.
 *
 * @param integer $id
 * @param string $fieldName
 * @return float
 */
public function norm($id, $fieldName)
{
    $localId = $id;

    foreach ($this->_indices as $subIndex) {
        $size = $subIndex->count();

        if ($localId < $size) {
            // The id falls inside this sub-index.
            return $subIndex->norm($localId, $fieldName);
        }

        $localId -= $size;
    }

    return null;
}
/**
* Returns true if any documents have been deleted from this index.
*
* @return boolean
*/
public function hasDeletions()
{
foreach ($this->_indices as $index) {
if ($index->hasDeletions()) {
return true;
}
}
return false;
}
/**
 * Deletes a document from the index.
 * $id is an internal document id
 *
 * A query hit is accepted in place of a number; its id property is used.
 * The id is translated into a sub-index local id by walking the sub-indices
 * in order and subtracting each one's count().
 *
 * @param integer|Zend_Search_Lucene_Search_QueryHit $id
 * @throws Zend_Search_Lucene_Exception Exception is thrown if $id is out of the range
 */
public function delete($id)
{
    if ($id instanceof Zend_Search_Lucene_Search_QueryHit) {
        /* @var $id Zend_Search_Lucene_Search_QueryHit */
        // Unwrap the hit so the numeric comparisons below work as intended.
        $id = $id->id;
    }

    foreach ($this->_indices as $index) {
        $indexCount = $index->count();

        if ($indexCount > $id) {
            $index->delete($id);
            return;
        }

        $id -= $indexCount;
    }

    require_once 'Zend/Search/Lucene/Exception.php';
    throw new Zend_Search_Lucene_Exception('Document id is out of the range.');
}
/**
* Callback used to choose target index for new documents
*
* Function/method signature:
* Zend_Search_Lucene_Interface callbackFunction(Zend_Search_Lucene_Document $document, array $indices);
*
* null means "default documents distributing algorithm"
*
* @var callback
*/
protected $_documentDistributorCallBack = null;
/**
 * Set callback for choosing target index.
 *
 * Null is accepted; any other value has to pass is_callable().
 *
 * @param callback $callback
 * @throws Zend_Search_Lucene_Exception if $callback is neither null nor callable
 */
public function setDocumentDistributorCallback($callback)
{
    $isAcceptable = ($callback === null) || is_callable($callback);

    if (!$isAcceptable) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('$callback parameter must be a valid callback.');
    }

    $this->_documentDistributorCallBack = $callback;
}
/**
* Get callback for choosing target index.
*
* @return callback
*/
public function getDocumentDistributorCallback()
{
return $this->_documentDistributorCallBack;
}
/**
 * Adds a document to this index.
 *
 * The target sub-index is chosen by the document distributor callback when
 * one is set, otherwise a random sub-index is picked.
 *
 * @param Zend_Search_Lucene_Document $document
 * @throws Zend_Search_Lucene_Exception if no target index can be determined
 */
public function addDocument(Zend_Search_Lucene_Document $document)
{
    if ($this->_documentDistributorCallBack !== null) {
        $index = call_user_func($this->_documentDistributorCallBack, $document, $this->_indices);

        // Report a misbehaving callback as an exception instead of failing
        // on the method call below.
        if (!$index instanceof Zend_Search_Lucene_Interface) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception(
                'Document distributor callback must return a Zend_Search_Lucene_Interface object.'
            );
        }
    } else {
        // array_rand() cannot pick from an empty list.
        if (count($this->_indices) == 0) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Indices list is empty');
        }

        $index = $this->_indices[array_rand($this->_indices)];
    }

    $index->addDocument($document);
}
/**
* Commit changes resulting from delete() or undeleteAll() operations.
*/
public function commit()
{
foreach ($this->_indices as $index) {
$index->commit();
}
}
/**
 * Optimize index.
 *
 * Calls optimize() on every sub-index (merges all segments into one).
 */
public function optimize()
{
    foreach ($this->_indices as $index) {
        // The method is spelled optimize(), as in this class; "optimise" does not exist.
        $index->optimize();
    }
}
/**
 * Returns an array of all terms in this index.
 *
 * Terms of all sub-indices are combined; a term present in several
 * sub-indices is returned once (first occurrence wins). An empty array is
 * returned when there are no sub-indices.
 *
 * @return array
 */
public function terms()
{
    $uniqueTerms = array();

    foreach ($this->_indices as $index) {
        foreach ($index->terms() as $term) {
            // De-duplicate on the term key instead of array_unique():
            // array_unique() compares elements as strings, which does not
            // work for term objects.
            // NOTE(review): relies on Zend_Search_Lucene_Index_Term::key() - confirm.
            $termKey = $term->key();

            if (!isset($uniqueTerms[$termKey])) {
                $uniqueTerms[$termKey] = $term;
            }
        }
    }

    return array_values($uniqueTerms);
}
/**
* Terms stream priority queue object
*
* @var Zend_Search_Lucene_TermStreamsPriorityQueue
*/
private $_termsStream = null;
/**
 * Reset terms stream.
 *
 * An existing stream is reset in place; otherwise a new priority queue
 * stream is created over the sub-indices.
 */
public function resetTermsStream()
{
    if ($this->_termsStream !== null) {
        $this->_termsStream->resetTermsStream();
        return;
    }

    /** Zend_Search_Lucene_TermStreamsPriorityQueue */
    require_once 'Zend/Search/Lucene/TermStreamsPriorityQueue.php';

    $this->_termsStream = new Zend_Search_Lucene_TermStreamsPriorityQueue($this->_indices);
}
/**
* Skip terms stream up to specified term prefix.
*
* Prefix contains fully specified field info and portion of searched term
*
* @param Zend_Search_Lucene_Index_Term $prefix
*/
public function skipTo(Zend_Search_Lucene_Index_Term $prefix)
{
$this->_termsStream->skipTo($prefix);
}
/**
* Scans terms dictionary and returns next term
*
* @return Zend_Search_Lucene_Index_Term|null
*/
public function nextTerm()
{
return $this->_termsStream->nextTerm();
}
/**
* Returns term in current position
*
* @return Zend_Search_Lucene_Index_Term|null
*/
public function currentTerm()
{
return $this->_termsStream->currentTerm();
}
/**
* Close terms stream
*
* Should be used for resources clean up if stream is not read up to the end
*/
public function closeTermsStream()
{
$this->_termsStream->closeTermsStream();
$this->_termsStream = null;
}
/**
* Undeletes all documents currently marked as deleted in this index.
*/
public function undeleteAll()
{
foreach ($this->_indices as $index) {
$index->undeleteAll();
}
}
/**
* Add reference to the index object
*
* @internal
*/
public function addReference()
{
// Do nothing, since it's never referenced by indices
}
/**
* Remove reference from the index object
*
* When reference count becomes zero, index is closed and resources are cleaned up
*
* @internal
*/
public function removeReference()
{
// Do nothing, since it's never referenced by indices
}
}

View File

@ -0,0 +1,171 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: PriorityQueue.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* Abstract Priority Queue
*
* It implements a priority queue.
* Please go to "Data Structures and Algorithms",
* Aho, Hopcroft, and Ullman, Addison-Wesley, 1983 (corrected 1987 edition),
* for implementation details.
*
* It provides O(log(N)) time of put/pop operations, where N is a size of queue
*
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_PriorityQueue
{
    /**
     * Queue heap
     *
     * The heap is a partially ordered binary tree stored in an array:
     *   [0]       - top of the tree
     *   [2*n + 1] - first child of [n]
     *   [2*n + 2] - second child of [n]
     *
     * @var array
     */
    private $_heap = array();

    /**
     * Add element to the queue
     *
     * Starts at the first free slot and moves parents down while the new
     * element is less than them, then stores the element in the freed slot.
     *
     * @param mixed $element
     */
    public function put($element)
    {
        $slot = count($this->_heap);

        while ($slot != 0) {
            $parent = ($slot - 1) >> 1; // floor( ($slot-1)/2 )

            if (!$this->_less($element, $this->_heap[$parent])) {
                break;
            }

            // Parent moves down, the free slot moves up one level
            $this->_heap[$slot] = $this->_heap[$parent];
            $slot = $parent;
        }

        $this->_heap[$slot] = $element;
    }

    /**
     * Return least element of the queue without removing it
     *
     * @return mixed  Null if the queue is empty
     */
    public function top()
    {
        return (count($this->_heap) == 0) ? null : $this->_heap[0];
    }

    /**
     * Removes and return least element of the queue
     *
     * The last element is re-inserted from the top: smaller children are
     * moved up until a position suitable for the last element is found.
     *
     * @return mixed  Null if the queue is empty
     */
    public function pop()
    {
        if (count($this->_heap) == 0) {
            return null;
        }

        $least       = $this->_heap[0];
        $tail        = count($this->_heap) - 1;
        // The tail slot is never written inside the loop below (all written
        // positions are smaller than $tail), so its value can be read once.
        $tailElement = $this->_heap[$tail];

        $hole  = 0; // Free position, starts at the top
        $child = 1; // Candidate child to move up

        // Choose smaller child of the top
        if ($tail > 2 && $this->_less($this->_heap[2], $this->_heap[1])) {
            $child = 2;
        }

        while (true) {
            if ($child >= $tail) {
                break;
            }
            if (!$this->_less($this->_heap[$child], $tailElement)) {
                break;
            }

            // Child moves up, the free position goes down
            $this->_heap[$hole] = $this->_heap[$child];
            $hole  = $child;
            $child = ($hole << 1) + 1; // First child

            // Choose smaller child
            $sibling = $child + 1;
            if ($sibling < $tail && $this->_less($this->_heap[$sibling], $this->_heap[$child])) {
                $child = $sibling;
            }
        }

        // Last element goes to the free position
        $this->_heap[$hole] = $tailElement;
        unset($this->_heap[$tail]);

        return $least;
    }

    /**
     * Clear queue
     */
    public function clear()
    {
        $this->_heap = array();
    }

    /**
     * Compare elements
     *
     * Returns true, if $el1 is less than $el2; false otherwise
     *
     * @param mixed $el1
     * @param mixed $el2
     * @return boolean
     */
    abstract protected function _less($el1, $el2);
}

612
thirdparty/Zend/Search/Lucene/Proxy.php vendored Normal file
View File

@ -0,0 +1,612 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Proxy.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Interface */
require_once 'Zend/Search/Lucene/Interface.php';
/**
* Proxy class intended to be used in userland.
*
* It tracks when the index object goes out of scope and forces index closing
*
* @category Zend
* @package Zend_Search_Lucene
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Proxy implements Zend_Search_Lucene_Interface
{
/**
* Index object
*
* @var Zend_Search_Lucene_Interface
*/
private $_index;
/**
* Object constructor
*
* @param Zend_Search_Lucene_Interface $index
*/
public function __construct(Zend_Search_Lucene_Interface $index)
{
$this->_index = $index;
$this->_index->addReference();
}
/**
* Object destructor
*/
public function __destruct()
{
if ($this->_index !== null) {
// This code is invoked if Zend_Search_Lucene_Interface object constructor throws an exception
$this->_index->removeReference();
}
$this->_index = null;
}
/**
 * Get current generation number
 *
 * Returns generation number
 * 0 means pre-2.1 index format
 * -1 means there are no segments files.
 *
 * Forwards to Zend_Search_Lucene::getActualGeneration() and returns its result.
 *
 * @param Zend_Search_Lucene_Storage_Directory $directory
 * @return integer
 * @throws Zend_Search_Lucene_Exception
 */
public static function getActualGeneration(Zend_Search_Lucene_Storage_Directory $directory)
{
    // The result has to be returned; without "return" callers always get null.
    return Zend_Search_Lucene::getActualGeneration($directory);
}
/**
 * Get segments file name
 *
 * Forwards to Zend_Search_Lucene::getSegmentFileName() and returns its result.
 *
 * @param integer $generation
 * @return string
 */
public static function getSegmentFileName($generation)
{
    // The result has to be returned; without "return" callers always get null.
    return Zend_Search_Lucene::getSegmentFileName($generation);
}
/**
* Get index format version
*
* @return integer
*/
public function getFormatVersion()
{
return $this->_index->getFormatVersion();
}
/**
* Set index format version.
* Index is converted to this format at the nearest update time
*
* @param int $formatVersion
* @throws Zend_Search_Lucene_Exception
*/
public function setFormatVersion($formatVersion)
{
$this->_index->setFormatVersion($formatVersion);
}
/**
* Returns the Zend_Search_Lucene_Storage_Directory instance for this index.
*
* @return Zend_Search_Lucene_Storage_Directory
*/
public function getDirectory()
{
return $this->_index->getDirectory();
}
/**
* Returns the total number of documents in this index (including deleted documents).
*
* @return integer
*/
public function count()
{
return $this->_index->count();
}
/**
* Returns one greater than the largest possible document number.
* This may be used to, e.g., determine how big to allocate a structure which will have
* an element for every document number in an index.
*
* @return integer
*/
public function maxDoc()
{
return $this->_index->maxDoc();
}
/**
* Returns the total number of non-deleted documents in this index.
*
* @return integer
*/
public function numDocs()
{
return $this->_index->numDocs();
}
/**
* Checks, that document is deleted
*
* @param integer $id
* @return boolean
* @throws Zend_Search_Lucene_Exception Exception is thrown if $id is out of the range
*/
public function isDeleted($id)
{
return $this->_index->isDeleted($id);
}
/**
* Set default search field.
*
* Null means, that search is performed through all fields by default
*
* Default value is null
*
* @param string $fieldName
*/
public static function setDefaultSearchField($fieldName)
{
Zend_Search_Lucene::setDefaultSearchField($fieldName);
}
/**
* Get default search field.
*
* Null means, that search is performed through all fields by default
*
* @return string
*/
public static function getDefaultSearchField()
{
return Zend_Search_Lucene::getDefaultSearchField();
}
/**
* Set result set limit.
*
* 0 (default) means no limit
*
* @param integer $limit
*/
public static function setResultSetLimit($limit)
{
Zend_Search_Lucene::setResultSetLimit($limit);
}
/**
* Get result set limit.
*
* 0 means no limit
*
* @return integer
*/
public static function getResultSetLimit()
{
return Zend_Search_Lucene::getResultSetLimit();
}
/**
* Retrieve index maxBufferedDocs option
*
* maxBufferedDocs is a minimal number of documents required before
* the buffered in-memory documents are written into a new Segment
*
* Default value is 10
*
* @return integer
*/
public function getMaxBufferedDocs()
{
return $this->_index->getMaxBufferedDocs();
}
/**
* Set index maxBufferedDocs option
*
* maxBufferedDocs is a minimal number of documents required before
* the buffered in-memory documents are written into a new Segment
*
* Default value is 10
*
* @param integer $maxBufferedDocs
*/
public function setMaxBufferedDocs($maxBufferedDocs)
{
$this->_index->setMaxBufferedDocs($maxBufferedDocs);
}
/**
* Retrieve index maxMergeDocs option
*
* maxMergeDocs is a largest number of documents ever merged by addDocument().
* Small values (e.g., less than 10,000) are best for interactive indexing,
* as this limits the length of pauses while indexing to a few seconds.
* Larger values are best for batched indexing and speedier searches.
*
* Default value is PHP_INT_MAX
*
* @return integer
*/
public function getMaxMergeDocs()
{
return $this->_index->getMaxMergeDocs();
}
/**
* Set index maxMergeDocs option
*
* maxMergeDocs is a largest number of documents ever merged by addDocument().
* Small values (e.g., less than 10,000) are best for interactive indexing,
* as this limits the length of pauses while indexing to a few seconds.
* Larger values are best for batched indexing and speedier searches.
*
* Default value is PHP_INT_MAX
*
* @param integer $maxMergeDocs
*/
public function setMaxMergeDocs($maxMergeDocs)
{
$this->_index->setMaxMergeDocs($maxMergeDocs);
}
/**
* Retrieve index mergeFactor option
*
* mergeFactor determines how often segment indices are merged by addDocument().
* With smaller values, less RAM is used while indexing,
* and searches on unoptimized indices are faster,
* but indexing speed is slower.
* With larger values, more RAM is used during indexing,
* and while searches on unoptimized indices are slower,
* indexing is faster.
* Thus larger values (> 10) are best for batch index creation,
* and smaller values (< 10) for indices that are interactively maintained.
*
* Default value is 10
*
* @return integer
*/
public function getMergeFactor()
{
return $this->_index->getMergeFactor();
}
/**
* Set index mergeFactor option
*
* mergeFactor determines how often segment indices are merged by addDocument().
* With smaller values, less RAM is used while indexing,
* and searches on unoptimized indices are faster,
* but indexing speed is slower.
* With larger values, more RAM is used during indexing,
* and while searches on unoptimized indices are slower,
* indexing is faster.
* Thus larger values (> 10) are best for batch index creation,
* and smaller values (< 10) for indices that are interactively maintained.
*
* Default value is 10
*
* @param integer $mergeFactor
*/
public function setMergeFactor($mergeFactor)
{
$this->_index->setMergeFactor($mergeFactor);
}
/**
 * Performs a query against the index and returns an array
 * of Zend_Search_Lucene_Search_QueryHit objects.
 * Input is a string or Zend_Search_Lucene_Search_Query.
 *
 * Every argument given to this method (not only $query) is forwarded,
 * in the same order, to the find() method of the object held in
 * $this->_index.
 *
 * @param mixed $query
 * @return array Zend_Search_Lucene_Search_QueryHit
 * @throws Zend_Search_Lucene_Exception
 */
public function find($query)
{
    // Capture the actual argument list in a local first: PHP releases
    // before 5.3 do not accept func_get_args() directly as a call argument.
    $parameters = func_get_args();

    // Objects are passed by handle, so the callable array does not need
    // the PHP 4-style "&" reference to the index.
    return call_user_func_array(array($this->_index, 'find'), $parameters);
}
/**
* Returns a list of all unique field names that exist in this index.
*
* The $indexed flag is passed through unchanged to the object held in
* $this->_index.
*
* @param boolean $indexed
* @return array
*/
public function getFieldNames($indexed = false)
{
return $this->_index->getFieldNames($indexed);
}
/**
* Returns a Zend_Search_Lucene_Document object for the document
* number $id in this index.
*
* The lookup is delegated to the object held in $this->_index.
*
* @param integer|Zend_Search_Lucene_Search_QueryHit $id
* @return Zend_Search_Lucene_Document
*/
public function getDocument($id)
{
return $this->_index->getDocument($id);
}
/**
* Returns true if the index contains documents with the specified term.
*
* Is used for query optimization.
* The check is delegated to the object held in $this->_index.
*
* @param Zend_Search_Lucene_Index_Term $term
* @return boolean
*/
public function hasTerm(Zend_Search_Lucene_Index_Term $term)
{
return $this->_index->hasTerm($term);
}
/**
* Returns IDs of all the documents containing term.
*
* Both arguments are passed through unchanged to the object held in
* $this->_index.
*
* @param Zend_Search_Lucene_Index_Term $term
* @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
* @return array
*/
public function termDocs(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
{
return $this->_index->termDocs($term, $docsFilter);
}
/**
* Returns documents filter for all documents containing term.
*
* It performs the same operation as termDocs(), but returns the result as
* a Zend_Search_Lucene_Index_DocsFilter object.
* Both arguments are passed through unchanged to the object held in
* $this->_index.
*
* @param Zend_Search_Lucene_Index_Term $term
* @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
* @return Zend_Search_Lucene_Index_DocsFilter
*/
public function termDocsFilter(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
{
return $this->_index->termDocsFilter($term, $docsFilter);
}
/**
* Returns an array of all term freqs.
* Return array structure: array( docId => freq, ...)
*
* Both arguments are passed through unchanged to the object held in
* $this->_index.
*
* @param Zend_Search_Lucene_Index_Term $term
* @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
* @return array
*/
public function termFreqs(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
{
return $this->_index->termFreqs($term, $docsFilter);
}
/**
* Returns an array of all term positions in the documents.
* Return array structure: array( docId => array( pos1, pos2, ...), ...)
*
* Both arguments are passed through unchanged to the object held in
* $this->_index.
*
* @param Zend_Search_Lucene_Index_Term $term
* @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
* @return array
*/
public function termPositions(Zend_Search_Lucene_Index_Term $term, $docsFilter = null)
{
return $this->_index->termPositions($term, $docsFilter);
}
/**
* Returns the number of documents in this index containing the $term.
*
* The count is obtained from the object held in $this->_index.
*
* @param Zend_Search_Lucene_Index_Term $term
* @return integer
*/
public function docFreq(Zend_Search_Lucene_Index_Term $term)
{
return $this->_index->docFreq($term);
}
/**
* Retrieve similarity used by index reader
*
* The similarity object is obtained from the object held in $this->_index.
*
* @return Zend_Search_Lucene_Search_Similarity
*/
public function getSimilarity()
{
return $this->_index->getSimilarity();
}
/**
* Returns a normalization factor for "field, document" pair.
*
* Both arguments are passed through unchanged to the object held in
* $this->_index.
*
* @param integer $id
* @param string $fieldName
* @return float
*/
public function norm($id, $fieldName)
{
return $this->_index->norm($id, $fieldName);
}
/**
* Returns true if any documents have been deleted from this index.
*
* The answer is obtained from the object held in $this->_index.
*
* @return boolean
*/
public function hasDeletions()
{
return $this->_index->hasDeletions();
}
/**
* Deletes a document from the index.
* $id is an internal document id
*
* The call is delegated to the object held in $this->_index.
*
* @param integer|Zend_Search_Lucene_Search_QueryHit $id
* @return mixed Whatever the delegated delete() call returns, passed through unchanged
* @throws Zend_Search_Lucene_Exception
*/
public function delete($id)
{
return $this->_index->delete($id);
}
/**
* Adds a document to this index.
*
* The document is handed unchanged to the object held in $this->_index.
*
* @param Zend_Search_Lucene_Document $document
* @return void
*/
public function addDocument(Zend_Search_Lucene_Document $document)
{
$this->_index->addDocument($document);
}
/**
* Commit changes resulting from delete() or undeleteAll() operations.
*
* The call is delegated to the object held in $this->_index.
*
* @return void
*/
public function commit()
{
$this->_index->commit();
}
/**
* Optimize index.
*
* Merges all segments into one.
* The call is delegated to the object held in $this->_index.
*
* @return void
*/
public function optimize()
{
$this->_index->optimize();
}
/**
* Returns an array of all terms in this index.
*
* The list is obtained from the object held in $this->_index.
*
* @return array
*/
public function terms()
{
return $this->_index->terms();
}
/**
* Reset terms stream.
*
* The call is delegated to the object held in $this->_index.
*
* @return void
*/
public function resetTermsStream()
{
$this->_index->resetTermsStream();
}
/**
* Skip terms stream up to the specified term prefix.
*
* Prefix contains fully specified field info and portion of searched term.
* The call is delegated to the object held in $this->_index.
*
* @param Zend_Search_Lucene_Index_Term $prefix
* @return mixed Whatever the delegated skipTo() call returns, passed through unchanged
*/
public function skipTo(Zend_Search_Lucene_Index_Term $prefix)
{
return $this->_index->skipTo($prefix);
}
/**
* Scans terms dictionary and returns next term
*
* The call is delegated to the object held in $this->_index.
*
* @return Zend_Search_Lucene_Index_Term|null
*/
public function nextTerm()
{
return $this->_index->nextTerm();
}
/**
* Returns term in current position
*
* The call is delegated to the object held in $this->_index.
*
* @return Zend_Search_Lucene_Index_Term|null
*/
public function currentTerm()
{
return $this->_index->currentTerm();
}
/**
* Close terms stream
*
* Should be used for resources clean up if stream is not read up to the end.
* The call is delegated to the object held in $this->_index.
*
* @return void
*/
public function closeTermsStream()
{
$this->_index->closeTermsStream();
}
/**
* Undeletes all documents currently marked as deleted in this index.
*
* The call is delegated to the object held in $this->_index.
*
* @return mixed Whatever the delegated undeleteAll() call returns, passed through unchanged
*/
public function undeleteAll()
{
return $this->_index->undeleteAll();
}
/**
* Add reference to the index object
*
* The call is delegated to the object held in $this->_index.
*
* @internal
* @return mixed Whatever the delegated addReference() call returns, passed through unchanged
*/
public function addReference()
{
return $this->_index->addReference();
}
/**
* Remove reference from the index object
*
* When reference count becomes zero, index is closed and resources are cleaned up.
* The call is delegated to the object held in $this->_index.
*
* @internal
* @return mixed Whatever the delegated removeReference() call returns, passed through unchanged
*/
public function removeReference()
{
return $this->_index->removeReference();
}
}

View File

@ -0,0 +1,278 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: BooleanExpressionRecognizer.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_FSM */
require_once 'Zend/Search/Lucene/FSM.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_BooleanExpressionRecognizer extends Zend_Search_Lucene_FSM
{
    /** State Machine states */
    const ST_START        = 0;
    const ST_LITERAL      = 1;
    const ST_NOT_OPERATOR = 2;
    const ST_AND_OPERATOR = 3;
    const ST_OR_OPERATOR  = 4;

    /** Input symbols */
    const IN_LITERAL      = 0;
    const IN_NOT_OPERATOR = 1;
    const IN_AND_OPERATOR = 2;
    const IN_OR_OPERATOR  = 3;

    /**
     * NOT operator signal
     *
     * Raised by notOperatorAction() and lowered again by literalAction()
     * once the next literal has been recorded.
     *
     * @var boolean
     */
    private $_negativeLiteral = false;

    /**
     * Current literal (the value most recently given to processLiteral())
     *
     * @var mixed
     */
    private $_literal;

    /**
     * Set of already closed boolean query conjunctions
     *
     * Each conjunction is an array of conjunction elements.
     * Each conjunction element is a two-element array:
     *     array(<literal>, <sign>)
     * where <sign> is true for a plain literal and false for a literal
     * that was preceded by the NOT operator (see literalAction()).
     *
     * So, it has a structure:
     * array( array( array(<literal>, <sign>), // first literal of first conjunction
     *               array(<literal>, <sign>), // second literal of first conjunction
     *               ...
     *             ), // end of first conjunction
     *        array( array(<literal>, <sign>), // first literal of second conjunction
     *               ...
     *             ), // end of second conjunction
     *        ...
     *      ) // end of structure
     *
     * @var array
     */
    private $_conjunctions = array();

    /**
     * Conjunction that is currently being collected
     *
     * @var array
     */
    private $_currentConjunction = array();

    /**
     * Object constructor
     *
     * Declares the states and input symbols, installs the transition table
     * and registers the entry actions for the NOT, OR and LITERAL states.
     */
    public function __construct()
    {
        $states = array(
            self::ST_START,
            self::ST_LITERAL,
            self::ST_NOT_OPERATOR,
            self::ST_AND_OPERATOR,
            self::ST_OR_OPERATOR,
        );
        $inputSymbols = array(
            self::IN_LITERAL,
            self::IN_NOT_OPERATOR,
            self::IN_AND_OPERATOR,
            self::IN_OR_OPERATOR,
        );
        parent::__construct($states, $inputSymbols);

        // Transition actions for the cases where no explicit operator
        // separates a literal from what follows it.
        $implicitBeforeLiteral = new Zend_Search_Lucene_FSMAction($this, 'emptyOperatorAction');
        $implicitBeforeNot     = new Zend_Search_Lucene_FSMAction($this, 'emptyNotOperatorAction');

        // Rule layout: array(source state, input symbol, target state[, transition action])
        $transitions = array(
            // from the start state
            array(self::ST_START,        self::IN_LITERAL,      self::ST_LITERAL),
            array(self::ST_START,        self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR),

            // after a literal
            array(self::ST_LITERAL,      self::IN_AND_OPERATOR, self::ST_AND_OPERATOR),
            array(self::ST_LITERAL,      self::IN_OR_OPERATOR,  self::ST_OR_OPERATOR),
            array(self::ST_LITERAL,      self::IN_LITERAL,      self::ST_LITERAL,      $implicitBeforeLiteral),
            array(self::ST_LITERAL,      self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR, $implicitBeforeNot),

            // after NOT
            array(self::ST_NOT_OPERATOR, self::IN_LITERAL,      self::ST_LITERAL),

            // after AND
            array(self::ST_AND_OPERATOR, self::IN_LITERAL,      self::ST_LITERAL),
            array(self::ST_AND_OPERATOR, self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR),

            // after OR
            array(self::ST_OR_OPERATOR,  self::IN_LITERAL,      self::ST_LITERAL),
            array(self::ST_OR_OPERATOR,  self::IN_NOT_OPERATOR, self::ST_NOT_OPERATOR),
        );
        $this->addRules($transitions);

        $onNot     = new Zend_Search_Lucene_FSMAction($this, 'notOperatorAction');
        $onOr      = new Zend_Search_Lucene_FSMAction($this, 'orOperatorAction');
        $onLiteral = new Zend_Search_Lucene_FSMAction($this, 'literalAction');

        $this->addEntryAction(self::ST_NOT_OPERATOR, $onNot);
        $this->addEntryAction(self::ST_OR_OPERATOR,  $onOr);
        $this->addEntryAction(self::ST_LITERAL,      $onLiteral);
    }

    /**
     * Process next operator.
     *
     * Operators are defined by class constants: IN_AND_OPERATOR, IN_OR_OPERATOR and IN_NOT_OPERATOR
     *
     * @param integer $operator
     */
    public function processOperator($operator)
    {
        $this->process($operator);
    }

    /**
     * Process expression literal.
     *
     * The literal is remembered first, then the IN_LITERAL input symbol is
     * fed to the state machine.
     *
     * @param mixed $literal
     */
    public function processLiteral($literal)
    {
        $this->_literal = $literal;
        $this->process(self::IN_LITERAL);
    }

    /**
     * Finish an expression and return result
     *
     * The conjunction still being collected is closed and the complete set
     * of conjunctions is returned.
     *
     * Each conjunction is an array of conjunction elements.
     * Each conjunction element is a two-element array:
     *     array(<literal>, <sign>)
     * where <sign> is true for a plain literal and false for a literal
     * that was preceded by the NOT operator.
     *
     * So, the result has a structure:
     * array( array( array(<literal>, <sign>), // first literal of first conjunction
     *               array(<literal>, <sign>), // second literal of first conjunction
     *               ...
     *             ), // end of first conjunction
     *        array( array(<literal>, <sign>), // first literal of second conjunction
     *               ...
     *             ), // end of second conjunction
     *        ...
     *      ) // end of structure
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception if the machine is not in the ST_LITERAL state
     */
    public function finishExpression()
    {
        $endedOnLiteral = ($this->getState() == self::ST_LITERAL);
        if (!$endedOnLiteral) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Literal expected.');
        }

        // Close the conjunction that was still open.
        $this->_conjunctions[] = $this->_currentConjunction;

        return $this->_conjunctions;
    }

    /*********************************************************************
     * Actions implementation
     *********************************************************************/

    /**
     * Default (omitted) operator processing
     *
     * Installed on the LITERAL -> LITERAL rule. Unless the query parser's
     * default operator is AND, the current conjunction is closed as for an
     * explicit OR. The new literal is then recorded by calling
     * literalAction() directly.
     */
    public function emptyOperatorAction()
    {
        /** Zend_Search_Lucene_Search_QueryParser */
        require_once 'Zend/Search/Lucene/Search/QueryParser.php';

        $defaultOperator = Zend_Search_Lucene_Search_QueryParser::getDefaultOperator();
        if ($defaultOperator != Zend_Search_Lucene_Search_QueryParser::B_AND) {
            $this->orOperatorAction();
        }

        // Process literal
        $this->literalAction();
    }

    /**
     * Default (omitted) + NOT operator processing
     *
     * Installed on the LITERAL -> NOT rule. Unless the query parser's
     * default operator is AND, the current conjunction is closed as for an
     * explicit OR. The NOT signal is then raised by calling
     * notOperatorAction() directly.
     */
    public function emptyNotOperatorAction()
    {
        /** Zend_Search_Lucene_Search_QueryParser */
        require_once 'Zend/Search/Lucene/Search/QueryParser.php';

        $defaultOperator = Zend_Search_Lucene_Search_QueryParser::getDefaultOperator();
        if ($defaultOperator != Zend_Search_Lucene_Search_QueryParser::B_AND) {
            $this->orOperatorAction();
        }

        // Process NOT operator
        $this->notOperatorAction();
    }

    /**
     * NOT operator processing
     *
     * Marks the next literal as negated.
     */
    public function notOperatorAction()
    {
        $this->_negativeLiteral = true;
    }

    /**
     * OR operator processing
     *
     * Closes the current conjunction and starts an empty one.
     */
    public function orOperatorAction()
    {
        $this->_conjunctions[]     = $this->_currentConjunction;
        $this->_currentConjunction = array();
    }

    /**
     * Literal processing
     *
     * Appends array(<literal>, <sign>) to the current conjunction, where
     * <sign> is the inverse of the NOT signal, then lowers the NOT signal.
     */
    public function literalAction()
    {
        $sign = !$this->_negativeLiteral;
        $this->_currentConjunction[] = array($this->_literal, $sign);

        // Switch off negative signal
        $this->_negativeLiteral = false;
    }
}

View File

@ -0,0 +1,94 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Default.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** @see Zend_Search_Lucene_Search_Highlighter_Interface */
require_once 'Zend/Search/Lucene/Search/Highlighter/Interface.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Highlighter_Default implements Zend_Search_Lucene_Search_Highlighter_Interface
{
    /**
     * List of colors for text highlighting, used in rotation
     *
     * @var array
     */
    protected $_highlightColors = array(
        '#66ffff', '#ff66ff', '#ffff66',
        '#ff8888', '#88ff88', '#8888ff',
        '#88dddd', '#dd88dd', '#dddd88',
        '#aaddff', '#aaffdd', '#ddaaff',
        '#ddffaa', '#ffaadd', '#ffddaa',
    );

    /**
     * Index of current color for highlighting
     *
     * Index is advanced at each highlight() call (wrapping around at the end
     * of the color list), so terms matching different queries are
     * highlighted using different colors.
     *
     * @var integer
     */
    protected $_currentColorIndex = 0;

    /**
     * HTML document for highlighting
     *
     * @var Zend_Search_Lucene_Document_Html
     */
    protected $_doc;

    /**
     * Set document for highlighting.
     *
     * @param Zend_Search_Lucene_Document_Html $document
     */
    public function setDocument(Zend_Search_Lucene_Document_Html $document)
    {
        $this->_doc = $document;
    }

    /**
     * Get document for highlighting.
     *
     * @return Zend_Search_Lucene_Document_Html $document
     */
    public function getDocument()
    {
        return $this->_doc;
    }

    /**
     * Highlight specified words
     *
     * Takes the color at the current index, advances the index cyclically
     * and asks the document to highlight $words with that color.
     *
     * @param string|array $words Words to highlight. They could be organized using the array or string.
     */
    public function highlight($words)
    {
        $palette  = $this->_highlightColors;
        $position = $this->_currentColorIndex;

        $color = $palette[$position];

        // Move on to the next color, wrapping around at the end of the list.
        $this->_currentColorIndex = ($position + 1) % count($palette);

        $this->_doc->highlight($words, $color);
    }
}

View File

@ -0,0 +1,53 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Interface.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
interface Zend_Search_Lucene_Search_Highlighter_Interface
{
/**
* Set document for highlighting.
*
* @param Zend_Search_Lucene_Document_Html $document Document that later highlight() calls operate on
* @return void
*/
public function setDocument(Zend_Search_Lucene_Document_Html $document);
/**
* Get document for highlighting.
*
* @return Zend_Search_Lucene_Document_Html Document previously given to setDocument()
*/
public function getDocument();
/**
* Highlight specified words (method is invoked once per subquery)
*
* @param string|array $words Words to highlight. They could be organized using the array or string.
* @return void
*/
public function highlight($words);
}

View File

@ -0,0 +1,233 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Query.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Search_Query
{
    /**
     * Query boost factor
     *
     * @var float
     */
    private $_boost = 1;

    /**
     * Query weight
     *
     * Null until a weight has been set up; reset() puts it back to null.
     *
     * @var Zend_Search_Lucene_Search_Weight
     */
    protected $_weight = null;

    /**
     * Current highlight color
     *
     * Not referenced by any method of this class.
     *
     * @var integer
     */
    private $_currentColorIndex = 0;

    /**
     * Gets the boost for this clause. Documents matching
     * this clause will (in addition to the normal weightings) have their score
     * multiplied by boost. The boost is 1.0 by default.
     *
     * @return float
     */
    public function getBoost()
    {
        return $this->_boost;
    }

    /**
     * Sets the boost for this query clause to $boost.
     *
     * @param float $boost
     */
    public function setBoost($boost)
    {
        $this->_boost = $boost;
    }

    /**
     * Score specified document
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    abstract public function score($docId, Zend_Search_Lucene_Interface $reader);

    /**
     * Get document ids likely matching the query
     *
     * It's an array with document ids as keys (performance considerations)
     *
     * @return array
     */
    abstract public function matchedDocs();

    /**
     * Execute query in context of index reader
     * It also initializes necessary internal structures
     *
     * Query specific implementation
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     */
    abstract public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null);

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     */
    abstract public function createWeight(Zend_Search_Lucene_Interface $reader);

    /**
     * Constructs and initializes a Weight for a _top-level_query_.
     *
     * If $this->_weight is already set it is returned untouched. Otherwise
     * createWeight() is called and $this->_weight is normalized with the
     * query norm computed by the reader's similarity object.
     *
     * The weight is returned on both paths; previously only the early-exit
     * path returned it.
     *
     * NOTE(review): this relies on createWeight() storing the new weight in
     * $this->_weight as a side effect; its return value is not used here.
     * Confirm against the concrete query classes.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     */
    protected function _initWeight(Zend_Search_Lucene_Interface $reader)
    {
        // Check, that it's a top-level query and query weight is not initialized yet.
        if ($this->_weight !== null) {
            return $this->_weight;
        }

        $this->createWeight($reader);
        $sum = $this->_weight->sumOfSquaredWeights();
        $queryNorm = $reader->getSimilarity()->queryNorm($sum);
        $this->_weight->normalize($queryNorm);

        return $this->_weight;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    abstract public function rewrite(Zend_Search_Lucene_Interface $index);

    /**
     * Optimize query in the context of specified index
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    abstract public function optimize(Zend_Search_Lucene_Interface $index);

    /**
     * Reset query, so it can be reused within other queries or
     * with other indices
     *
     * Clears the stored weight.
     */
    public function reset()
    {
        $this->_weight = null;
    }

    /**
     * Print a query
     *
     * @return string
     */
    abstract public function __toString();

    /**
     * Return query terms
     *
     * @return array
     */
    abstract public function getQueryTerms();

    /**
     * Query specific matches highlighting
     *
     * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting)
     */
    abstract protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter);

    /**
     * Highlight matches in $inputHTML
     *
     * Loads $inputHTML as an HTML document, gives it to the highlighter,
     * runs the query specific highlighting and returns the document's HTML.
     *
     * @param string $inputHTML
     * @param string $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
     * @param Zend_Search_Lucene_Search_Highlighter_Interface|null $highlighter Default highlighter is used when null
     * @return string
     */
    public function highlightMatches($inputHTML, $defaultEncoding = '', $highlighter = null)
    {
        if ($highlighter === null) {
            require_once 'Zend/Search/Lucene/Search/Highlighter/Default.php';
            $highlighter = new Zend_Search_Lucene_Search_Highlighter_Default();
        }

        /** Zend_Search_Lucene_Document_Html */
        require_once 'Zend/Search/Lucene/Document/Html.php';

        $doc = Zend_Search_Lucene_Document_Html::loadHTML($inputHTML, false, $defaultEncoding);
        $highlighter->setDocument($doc);

        $this->_highlightMatches($highlighter);

        return $doc->getHTML();
    }

    /**
     * Highlight matches in $inputHtmlFragment and return it (without HTML header and body tag)
     *
     * The fragment is converted from $encoding to UTF-8 and wrapped in a
     * minimal HTML document declaring a UTF-8 charset before it is loaded;
     * only the body of the highlighted document is returned.
     *
     * @param string $inputHtmlFragment
     * @param string $encoding Input HTML string encoding
     * @param Zend_Search_Lucene_Search_Highlighter_Interface|null $highlighter Default highlighter is used when null
     * @return string
     */
    public function htmlFragmentHighlightMatches($inputHtmlFragment, $encoding = 'UTF-8', $highlighter = null)
    {
        if ($highlighter === null) {
            require_once 'Zend/Search/Lucene/Search/Highlighter/Default.php';
            $highlighter = new Zend_Search_Lucene_Search_Highlighter_Default();
        }

        $inputHTML = '<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>'
                   . iconv($encoding, 'UTF-8//IGNORE', $inputHtmlFragment) . '</body></html>';

        /** Zend_Search_Lucene_Document_Html */
        require_once 'Zend/Search/Lucene/Document/Html.php';

        $doc = Zend_Search_Lucene_Document_Html::loadHTML($inputHTML);
        $highlighter->setDocument($doc);

        $this->_highlightMatches($highlighter);

        return $doc->getHtmlBody();
    }
}

View File

@ -0,0 +1,815 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Boolean.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Boolean extends Zend_Search_Lucene_Search_Query
{
/**
* Subqueries
* Array of Zend_Search_Lucene_Search_Query
*
* @var array
*/
private $_subqueries = array();
/**
* Subqueries signs, looked up with the same keys as $_subqueries.
* If true then subquery is required.
* If false then subquery is prohibited.
* If null then subquery is neither prohibited, nor required
*
* If the array itself is null then all subqueries are required
*
* @var array|null
*/
private $_signs = array();
/**
* Result vector.
*
* @var array|null
*/
private $_resVector = null;
/**
* A score factor based on the fraction of all query subqueries
* that a document contains.
* float for conjunction queries
* array of float for non conjunction queries
*
* @var mixed
*/
private $_coord = null;
/**
 * Class constructor. Create a new Boolean query object.
 *
 * When $subqueries is not an array nothing is initialised and the
 * property defaults stay in place.
 *
 * When $signs is omitted, or every sign in it is true, the signs list is
 * stored as null, meaning all subqueries are required. This differs from
 * addSubquery() behavior, but should never be used.
 *
 * @param array $subqueries Array of Zend_Search_Lucene_Search_Query objects
 * @param array $signs      Array of signs. Sign is boolean|null.
 * @return void
 */
public function __construct($subqueries = null, $signs = null)
{
    if (!is_array($subqueries)) {
        return;
    }

    $this->_subqueries = $subqueries;
    $this->_signs      = null;

    if (!is_array($signs)) {
        return;
    }

    // Keep the signs only if at least one subquery is not plainly required.
    foreach ($signs as $candidate) {
        if ($candidate !== true) {
            $this->_signs = $signs;
            return;
        }
    }
}
/**
 * Add a $subquery (Zend_Search_Lucene_Search_Query) to this query.
 *
 * The sign is specified as:
 *     TRUE  - subquery is required
 *     FALSE - subquery is prohibited
 *     NULL  - subquery is neither prohibited, nor required
 *
 * While the signs list is null (all subqueries required) and the new
 * subquery is required too, no sign is stored. The first non-required
 * subquery turns the list into an explicit array, with one `true` per
 * subquery added so far.
 *
 * @param  Zend_Search_Lucene_Search_Query $subquery
 * @param  boolean|null $sign
 * @return void
 */
public function addSubquery(Zend_Search_Lucene_Search_Query $subquery, $sign=null) {
    if ($this->_signs === null && $sign !== true) {
        // All previous subqueries were required: spell that out explicitly.
        $existing = count($this->_subqueries);
        $this->_signs = ($existing > 0) ? array_fill(0, $existing, true) : array();
    }

    if ($this->_signs !== null) {
        $this->_signs[] = $sign;
    }

    $this->_subqueries[] = $subquery;
}
/**
 * Re-write queries into primitive queries
 *
 * Builds a new Boolean query with the same boost, in which every
 * subquery is replaced by its own rewrite() result and keeps its sign
 * (`true` when the signs list is null).
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 */
public function rewrite(Zend_Search_Lucene_Interface $index)
{
    $rewritten = new Zend_Search_Lucene_Search_Query_Boolean();
    $rewritten->setBoost($this->getBoost());

    foreach ($this->_subqueries as $key => $clause) {
        $primitive = $clause->rewrite($index);

        if ($this->_signs === null) {
            $sign = true;
        } else {
            $sign = $this->_signs[$key];
        }

        $rewritten->addSubquery($primitive, $sign);
    }

    return $rewritten;
}
/**
* Optimize query in the context of specified index
*
* @param Zend_Search_Lucene_Interface $index
* @return Zend_Search_Lucene_Search_Query
*/
public function optimize(Zend_Search_Lucene_Interface $index)
{
$subqueries = array();
$signs = array();
// Optimize all subqueries
foreach ($this->_subqueries as $id => $subquery) {
$subqueries[] = $subquery->optimize($index);
$signs[] = ($this->_signs === null)? true : $this->_signs[$id];
}
// Remove insignificant subqueries
foreach ($subqueries as $id => $subquery) {
if ($subquery instanceof Zend_Search_Lucene_Search_Query_Insignificant) {
// Insignificant subquery has to be removed anyway
unset($subqueries[$id]);
unset($signs[$id]);
}
}
if (count($subqueries) == 0) {
// Boolean query doesn't has non-insignificant subqueries
require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
return new Zend_Search_Lucene_Search_Query_Insignificant();
}
// Check if all non-insignificant subqueries are prohibited
$allProhibited = true;
foreach ($signs as $sign) {
if ($sign !== false) {
$allProhibited = false;
break;
}
}
if ($allProhibited) {
require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
return new Zend_Search_Lucene_Search_Query_Insignificant();
}
// Check for empty subqueries
foreach ($subqueries as $id => $subquery) {
if ($subquery instanceof Zend_Search_Lucene_Search_Query_Empty) {
if ($signs[$id] === true) {
// Matching is required, but is actually empty
require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
return new Zend_Search_Lucene_Search_Query_Empty();
} else {
// Matching is optional or prohibited, but is empty
// Remove it from subqueries and signs list
unset($subqueries[$id]);
unset($signs[$id]);
}
}
}
// Check, if reduced subqueries list is empty
if (count($subqueries) == 0) {
require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
return new Zend_Search_Lucene_Search_Query_Empty();
}
// Check if all non-empty subqueries are prohibited
$allProhibited = true;
foreach ($signs as $sign) {
if ($sign !== false) {
$allProhibited = false;
break;
}
}
if ($allProhibited) {
require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
return new Zend_Search_Lucene_Search_Query_Empty();
}
// Check, if reduced subqueries list has only one entry
if (count($subqueries) == 1) {
// It's a query with only one required or optional clause
// (it's already checked, that it's not a prohibited clause)
if ($this->getBoost() == 1) {
return reset($subqueries);
}
$optimizedQuery = clone reset($subqueries);
$optimizedQuery->setBoost($optimizedQuery->getBoost()*$this->getBoost());
return $optimizedQuery;
}
// Prepare first candidate for optimized query
$optimizedQuery = new Zend_Search_Lucene_Search_Query_Boolean($subqueries, $signs);
$optimizedQuery->setBoost($this->getBoost());
$terms = array();
$tsigns = array();
$boostFactors = array();
// Try to decompose term and multi-term subqueries
foreach ($subqueries as $id => $subquery) {
if ($subquery instanceof Zend_Search_Lucene_Search_Query_Term) {
$terms[] = $subquery->getTerm();
$tsigns[] = $signs[$id];
$boostFactors[] = $subquery->getBoost();
// remove subquery from a subqueries list
unset($subqueries[$id]);
unset($signs[$id]);
} else if ($subquery instanceof Zend_Search_Lucene_Search_Query_MultiTerm) {
$subTerms = $subquery->getTerms();
$subSigns = $subquery->getSigns();
if ($signs[$id] === true) {
// It's a required multi-term subquery.
// Something like '... +(+term1 -term2 term3 ...) ...'
// Multi-term required subquery can be decomposed only if it contains
// required terms and doesn't contain prohibited terms:
// ... +(+term1 term2 ...) ... => ... +term1 term2 ...
//
// Check this
$hasRequired = false;
$hasProhibited = false;
if ($subSigns === null) {
// All subterms are required
$hasRequired = true;
} else {
foreach ($subSigns as $sign) {
if ($sign === true) {
$hasRequired = true;
} else if ($sign === false) {
$hasProhibited = true;
break;
}
}
}
// Continue if subquery has prohibited terms or doesn't have required terms
if ($hasProhibited || !$hasRequired) {
continue;
}
foreach ($subTerms as $termId => $term) {
$terms[] = $term;
$tsigns[] = ($subSigns === null)? true : $subSigns[$termId];
$boostFactors[] = $subquery->getBoost();
}
// remove subquery from a subqueries list
unset($subqueries[$id]);
unset($signs[$id]);
} else { // $signs[$id] === null || $signs[$id] === false
// It's an optional or prohibited multi-term subquery.
// Something like '... (+term1 -term2 term3 ...) ...'
// or
// something like '... -(+term1 -term2 term3 ...) ...'
// Multi-term optional and required subqueries can be decomposed
// only if all terms are optional.
//
// Check if all terms are optional.
$onlyOptional = true;
if ($subSigns === null) {
// All subterms are required
$onlyOptional = false;
} else {
foreach ($subSigns as $sign) {
if ($sign !== null) {
$onlyOptional = false;
break;
}
}
}
// Continue if non-optional terms are presented in this multi-term subquery
if (!$onlyOptional) {
continue;
}
foreach ($subTerms as $termId => $term) {
$terms[] = $term;
$tsigns[] = ($signs[$id] === null)? null /* optional */ :
false /* prohibited */;
$boostFactors[] = $subquery->getBoost();
}
// remove subquery from a subqueries list
unset($subqueries[$id]);
unset($signs[$id]);
}
}
}
// Check, if there are no decomposed subqueries
if (count($terms) == 0 ) {
// return prepared candidate
return $optimizedQuery;
}
// Check, if all subqueries have been decomposed and all terms has the same boost factor
if (count($subqueries) == 0 && count(array_unique($boostFactors)) == 1) {
require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
$optimizedQuery = new Zend_Search_Lucene_Search_Query_MultiTerm($terms, $tsigns);
$optimizedQuery->setBoost(reset($boostFactors)*$this->getBoost());
return $optimizedQuery;
}
// This boolean query can't be transformed to Term/MultiTerm query and still contains
// several subqueries
// Separate prohibited terms
$prohibitedTerms = array();
foreach ($terms as $id => $term) {
if ($tsigns[$id] === false) {
$prohibitedTerms[] = $term;
unset($terms[$id]);
unset($tsigns[$id]);
unset($boostFactors[$id]);
}
}
if (count($terms) == 1) {
require_once 'Zend/Search/Lucene/Search/Query/Term.php';
$clause = new Zend_Search_Lucene_Search_Query_Term(reset($terms));
$clause->setBoost(reset($boostFactors));
$subqueries[] = $clause;
$signs[] = reset($tsigns);
// Clear terms list
$terms = array();
} else if (count($terms) > 1 && count(array_unique($boostFactors)) == 1) {
require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
$clause = new Zend_Search_Lucene_Search_Query_MultiTerm($terms, $tsigns);
$clause->setBoost(reset($boostFactors));
$subqueries[] = $clause;
// Clause sign is 'required' if clause contains required terms. 'Optional' otherwise.
$signs[] = (in_array(true, $tsigns))? true : null;
// Clear terms list
$terms = array();
}
if (count($prohibitedTerms) == 1) {
// (boost factors are not significant for prohibited clauses)
require_once 'Zend/Search/Lucene/Search/Query/Term.php';
$subqueries[] = new Zend_Search_Lucene_Search_Query_Term(reset($prohibitedTerms));
$signs[] = false;
// Clear prohibited terms list
$prohibitedTerms = array();
} else if (count($prohibitedTerms) > 1) {
// prepare signs array
$prohibitedSigns = array();
foreach ($prohibitedTerms as $id => $term) {
// all prohibited term are grouped as optional into multi-term query
$prohibitedSigns[$id] = null;
}
// (boost factors are not significant for prohibited clauses)
require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
$subqueries[] = new Zend_Search_Lucene_Search_Query_MultiTerm($prohibitedTerms, $prohibitedSigns);
// Clause sign is 'prohibited'
$signs[] = false;
// Clear terms list
$prohibitedTerms = array();
}
/** @todo Group terms with the same boost factors together */
// Check, that all terms are processed
// Replace candidate for optimized query
if (count($terms) == 0 && count($prohibitedTerms) == 0) {
$optimizedQuery = new Zend_Search_Lucene_Search_Query_Boolean($subqueries, $signs);
$optimizedQuery->setBoost($this->getBoost());
}
return $optimizedQuery;
}
/**
* Returns subqueries
*
* Hands back the internal subquery list ($this->_subqueries) unchanged, including its
* original array keys.
*
* @return array
*/
public function getSubqueries()
{
return $this->_subqueries;
}
/**
* Return subqueries signs
*
* Hands back $this->_signs unchanged. Elsewhere in this class a sign of true is treated as
* "required", false as "prohibited" and null as "optional"; a null $this->_signs as a whole
* is treated as "all subqueries are required".
*
* @return array
*/
public function getSigns()
{
return $this->_signs;
}
/**
 * Builds the weight object for this boolean query.
 *
 * The created Zend_Search_Lucene_Search_Weight_Boolean is remembered in $this->_weight
 * and also returned to the caller.
 *
 * @param Zend_Search_Lucene_Interface $reader
 * @return Zend_Search_Lucene_Search_Weight
 */
public function createWeight(Zend_Search_Lucene_Interface $reader)
{
    require_once 'Zend/Search/Lucene/Search/Weight/Boolean.php';

    return $this->_weight = new Zend_Search_Lucene_Search_Weight_Boolean($this, $reader);
}
/**
 * Calculate result vector for Conjunction query
 * (like '<subquery1> AND <subquery2> AND <subquery3>')
 *
 * Stores in $this->_resVector the entries of the smallest subquery result whose document
 * ids are present in every other subquery result. With no subqueries the result is an
 * empty array.
 */
private function _calculateConjunctionResult()
{
    $this->_resVector = (count($this->_subqueries) == 0) ? array() : null;

    $vectors     = array();
    $vectorSizes = array();
    $vectorIds   = array(); // unique tie-breaker, so the vectors themselves are never compared

    foreach ($this->_subqueries as $subqueryId => $subquery) {
        $matched       = $subquery->matchedDocs();
        $vectors[]     = $matched;
        $vectorSizes[] = count($matched);
        $vectorIds[]   = $subqueryId;
    }

    // Process the smallest result sets first
    array_multisort($vectorSizes, SORT_ASC, SORT_NUMERIC,
                    $vectorIds,   SORT_ASC, SORT_NUMERIC,
                    $vectors);

    foreach ($vectors as $candidate) {
        if ($this->_resVector !== null) {
            // Manual key intersection (the original notes array_intersect_key() as too slow)
            $intersection = array();
            foreach ($this->_resVector as $docId => $docValue) {
                if (isset($candidate[$docId])) {
                    $intersection[$docId] = $docValue;
                }
            }
            $this->_resVector = $intersection;
        } else {
            $this->_resVector = $candidate;
        }

        if (count($this->_resVector) == 0) {
            // Nothing left to intersect with
            break;
        }
    }

    // No ksort() needed: the intersection keeps the element order of the first vector
}
/**
 * Calculate result vector for non Conjunction query
 * (like '<subquery1> AND <subquery2> AND NOT <subquery3> OR <subquery4>')
 *
 * If at least one subquery is required, the result is the key intersection of all required
 * subquery results; otherwise it is the union of the optional subquery results. Prohibited
 * subqueries are not evaluated here. The result is stored in $this->_resVector, sorted by key.
 */
private function _calculateNonConjunctionResult()
{
    $requiredVectors      = array();
    $requiredVectorsSizes = array();
    $requiredVectorsIds   = array(); // unique tie-breaker, so the vectors themselves are never compared
    $optional             = array();

    foreach ($this->_subqueries as $subqueryId => $subquery) {
        $sign = $this->_signs[$subqueryId];

        if ($sign === true) {
            // required
            $matched                = $subquery->matchedDocs();
            $requiredVectors[]      = $matched;
            $requiredVectorsSizes[] = count($matched);
            $requiredVectorsIds[]   = $subqueryId;
        } else if ($sign !== false) {
            // optional: array union, first occurrence of a document id wins
            $optional += $subquery->matchedDocs();
        }
        // prohibited subqueries are skipped; the original comments state that computing
        // their vector may be expensive and does not affect this result
    }

    // Process the smallest required result sets first
    array_multisort($requiredVectorsSizes, SORT_ASC, SORT_NUMERIC,
                    $requiredVectorsIds,   SORT_ASC, SORT_NUMERIC,
                    $requiredVectors);

    $required = null;
    foreach ($requiredVectors as $candidate) {
        if ($required !== null) {
            // Manual key intersection (the original notes array_intersect_key() as too slow)
            $intersection = array();
            foreach ($required as $docId => $docValue) {
                if (isset($candidate[$docId])) {
                    $intersection[$docId] = $docValue;
                }
            }
            $required = $intersection;
        } else {
            $required = $candidate;
        }

        if (count($required) == 0) {
            // Nothing left to intersect with
            break;
        }
    }

    if ($required === null) {
        $this->_resVector = &$optional;
    } else {
        $this->_resVector = &$required;
    }

    ksort($this->_resVector, SORT_NUMERIC);
}
/**
 * Score calculator for conjunction queries (all subqueries are required)
 *
 * Returns 0 as soon as any subquery scores 0 for the document. Otherwise the subquery
 * scores are summed and scaled by the coordination factor and the query boost.
 * The coordination factor is computed once and cached in $this->_coord.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function _conjunctionScore($docId, Zend_Search_Lucene_Interface $reader)
{
    if ($this->_coord === null) {
        $subqueryCount = count($this->_subqueries);
        $this->_coord  = $reader->getSimilarity()->coord($subqueryCount, $subqueryCount);
    }

    $score = 0;

    foreach ($this->_subqueries as $subquery) {
        $subscore = $subquery->score($docId, $reader);

        if ($subscore == 0) {
            return 0;
        }

        // Reuse the score computed above; the previous code scored every subquery twice
        $score += $subscore * $this->_coord;
    }

    // NOTE(review): coord is applied per subquery above and once more here. Kept unchanged
    // because altering it would change ranking for similarities where coord(n, n) != 1.
    return $score * $this->_coord * $this->getBoost();
}
/**
 * Score calculator for non conjunction queries (not all subqueries are required)
 *
 * Returns 0 if a prohibited subquery matches the document or a required one does not.
 * Otherwise returns the sum of the non-zero subquery scores, scaled by the coordination
 * factor for the number of matching subqueries and by the query boost.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function _nonConjunctionScore($docId, Zend_Search_Lucene_Interface $reader)
{
    if ($this->_coord === null) {
        // Lazily build the coord table: one entry per possible number of matching
        // non-prohibited subqueries (0 .. $maxCoord)
        $this->_coord = array();

        $maxCoord = 0;
        foreach ($this->_signs as $sign) {
            if ($sign === false /* prohibited */) {
                continue;
            }
            $maxCoord++;
        }

        $count = 0;
        while ($count <= $maxCoord) {
            $this->_coord[$count] = $reader->getSimilarity()->coord($count, $maxCoord);
            $count++;
        }
    }

    $total   = 0;
    $matched = 0;

    foreach ($this->_subqueries as $subqueryId => $subquery) {
        $subscore = $subquery->score($docId, $reader);
        $sign     = $this->_signs[$subqueryId];
        $hit      = ($subscore != 0);

        if ($sign === false && $hit) {
            // prohibited subquery matches the document
            return 0;
        }

        if ($sign === true && !$hit) {
            // required subquery doesn't match the document
            return 0;
        }

        if ($hit) {
            $matched++;
            $total += $subscore;
        }
    }

    return $total * $this->_coord[$matched] * $this->getBoost();
}
/**
 * Execute query in context of index reader
 * It also initializes necessary internal structures
 *
 * Required subqueries are executed with the documents filter (a local one is created when
 * the caller supplies none); all other subqueries are executed without it. Afterwards the
 * result vector is calculated.
 *
 * @param Zend_Search_Lucene_Interface $reader
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 */
public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
{
    // Initialize weight if it's not done yet
    $this->_initWeight($reader);

    if ($docsFilter === null) {
        // No filter handed down by an enclosing query: use a local one
        require_once 'Zend/Search/Lucene/Index/DocsFilter.php';
        $docsFilter = new Zend_Search_Lucene_Index_DocsFilter();
    }

    foreach ($this->_subqueries as $subqueryId => $subquery) {
        $isRequired = ($this->_signs == null || $this->_signs[$subqueryId] === true);

        if ($isRequired) {
            $subquery->execute($reader, $docsFilter);
        } else {
            $subquery->execute($reader);
        }
    }

    if ($this->_signs === null) {
        $this->_calculateConjunctionResult();
        return;
    }

    $this->_calculateNonConjunctionResult();
}
/**
* Get document ids likely matching the query
*
* It's an array with document ids as keys (performance considerations)
*
* Hands back $this->_resVector unchanged; execute() fills it through the
* _calculateConjunctionResult() / _calculateNonConjunctionResult() helpers.
*
* @return array
*/
public function matchedDocs()
{
return $this->_resVector;
}
/**
 * Score specified document
 *
 * Documents missing from the result vector score 0. Otherwise the conjunction or
 * non-conjunction calculator is used, depending on whether signs are defined.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function score($docId, Zend_Search_Lucene_Interface $reader)
{
    if (!isset($this->_resVector[$docId])) {
        return 0;
    }

    return ($this->_signs === null)
         ? $this->_conjunctionScore($docId, $reader)
         : $this->_nonConjunctionScore($docId, $reader);
}
/**
 * Return query terms
 *
 * Collects the terms of every subquery that is not prohibited.
 *
 * @return array
 */
public function getQueryTerms()
{
    $collected = array();

    foreach ($this->_subqueries as $subqueryId => $subquery) {
        $isProhibited = ($this->_signs !== null && $this->_signs[$subqueryId] === false);
        if ($isProhibited) {
            continue;
        }

        $collected = array_merge($collected, $subquery->getQueryTerms());
    }

    return $collected;
}
/**
 * Query specific matches highlighting
 *
 * Delegates to every subquery that is not prohibited.
 *
 * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
{
    foreach ($this->_subqueries as $subqueryId => $subquery) {
        $isProhibited = ($this->_signs !== null && $this->_signs[$subqueryId] === false);
        if ($isProhibited) {
            continue;
        }

        $subquery->_highlightMatches($highlighter);
    }
}
/**
 * Print a query
 *
 * Each subquery is rendered as '(<subquery>)', prefixed with '+' when required and '-'
 * when prohibited, and the clauses are joined with single spaces. A boost other than 1
 * wraps the whole expression as '(<query>)^<boost>'.
 *
 * @return string
 */
public function __toString()
{
    // It's used only for query visualisation, so we don't care about characters escaping
    $clauses = array();

    foreach ($this->_subqueries as $id => $subquery) {
        if ($this->_signs === null || $this->_signs[$id] === true) {
            $signPrefix = '+';
        } else if ($this->_signs[$id] === false) {
            $signPrefix = '-';
        } else {
            $signPrefix = '';
        }

        $clauses[] = $signPrefix . '(' . $subquery->__toString() . ')';
    }

    // Join by position rather than by array key: the subquery list is not guaranteed to
    // start at key 0 (optimize() unsets entries and appends new ones), and the former
    // "$id != 0" test then produced a leading space.
    $query = implode(' ', $clauses);

    if ($this->getBoost() != 1) {
        $query = '(' . $query . ')^' . round($this->getBoost(), 4);
    }

    return $query;
}
}

View File

@ -0,0 +1,138 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Empty.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/**
* Query that matches nothing.
*
* Every operation is a no-op or yields an empty result: rewrite() and optimize() return the
* query itself, matchedDocs() and getQueryTerms() return empty arrays, score() is always 0.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Empty extends Zend_Search_Lucene_Search_Query
{
/**
* Re-write query into primitive queries in the context of specified index
*
* Nothing to rewrite: the query itself is returned.
*
* @param Zend_Search_Lucene_Interface $index
* @return Zend_Search_Lucene_Search_Query
*/
public function rewrite(Zend_Search_Lucene_Interface $index)
{
return $this;
}
/**
* Optimize query in the context of specified index
*
* @param Zend_Search_Lucene_Interface $index
* @return Zend_Search_Lucene_Search_Query
*/
public function optimize(Zend_Search_Lucene_Interface $index)
{
// The "Empty" query is a primitive query and doesn't need to be optimized
return $this;
}
/**
* Constructs an appropriate Weight implementation for this query.
*
* Always a fresh Zend_Search_Lucene_Search_Weight_Empty instance.
*
* @param Zend_Search_Lucene_Interface $reader
* @return Zend_Search_Lucene_Search_Weight
*/
public function createWeight(Zend_Search_Lucene_Interface $reader)
{
require_once 'Zend/Search/Lucene/Search/Weight/Empty.php';
return new Zend_Search_Lucene_Search_Weight_Empty();
}
/**
* Execute query in context of index reader
* It also initializes necessary internal structures
*
* Intentionally empty: there is nothing to execute for this query.
*
* @param Zend_Search_Lucene_Interface $reader
* @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
*/
public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
{
// Do nothing
}
/**
* Get document ids likely matching the query
*
* It's an array with document ids as keys (performance considerations)
*
* Always an empty array for this query.
*
* @return array
*/
public function matchedDocs()
{
return array();
}
/**
* Score specified document
*
* Always 0 for this query.
*
* @param integer $docId
* @param Zend_Search_Lucene_Interface $reader
* @return float
*/
public function score($docId, Zend_Search_Lucene_Interface $reader)
{
return 0;
}
/**
* Return query terms
*
* Always an empty array for this query.
*
* @return array
*/
public function getQueryTerms()
{
return array();
}
/**
* Query specific matches highlighting
*
* Intentionally empty: this query has no terms to highlight.
*
* @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting)
*/
protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
{
// Do nothing
}
/**
* Print a query
*
* @return string the fixed marker '<EmptyQuery>'
*/
public function __toString()
{
return '<EmptyQuery>';
}
}

View File

@ -0,0 +1,493 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Fuzzy.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/**
 * Fuzzy term query.
 *
 * The query is not executable itself: rewrite() scans the index terms, keeps those whose
 * similarity to the base term exceeds the minimum similarity and returns an Empty, Term or
 * Boolean query built from the matches. execute(), score(), matchedDocs(), optimize() and
 * createWeight() always throw.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Search_Query_Fuzzy extends Zend_Search_Lucene_Search_Query
{
    /** Default minimum similarity */
    const DEFAULT_MIN_SIMILARITY = 0.5;

    /**
     * Maximum number of matched terms.
     * Apache Lucene defines this limitation as boolean query maximum number of clauses:
     * org.apache.lucene.search.BooleanQuery.getMaxClauseCount()
     */
    const MAX_CLAUSE_COUNT = 1024;

    /**
     * Array of precalculated max distances
     *
     * keys are integers representing a word size
     */
    private $_maxDistances = array();

    /**
     * Base searching term.
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_term;

    /**
     * A value between 0 and 1 to set the required similarity
     *  between the query term and the matching terms. For example, for a
     *  _minimumSimilarity of 0.5 a term of the same length
     *  as the query term is considered similar to the query term if the edit distance
     *  between both terms is less than length(term)*0.5
     *
     * @var float
     */
    private $_minimumSimilarity;

    /**
     * The length of common (non-fuzzy) prefix
     *
     * @var integer
     */
    private $_prefixLength;

    /**
     * Matched terms.
     *
     * Matched terms list.
     * It's filled during the search (rewrite operation) and may be used for search result
     * post-processing
     *
     * Array of Zend_Search_Lucene_Index_Term objects
     *
     * @var array
     */
    private $_matches = null;

    /**
     * Matched terms scores
     *
     * @var array
     */
    private $_scores = null;

    /**
     * Array of the term keys.
     * Used to sort terms in alphabetical order if terms have the same scores
     *
     * @var array
     */
    private $_termKeys = null;

    /**
     * Default non-fuzzy prefix length
     *
     * @var integer
     */
    private static $_defaultPrefixLength = 3;

    /**
     * Zend_Search_Lucene_Search_Query_Fuzzy constructor.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param float   $minimumSimilarity  must satisfy 0 <= $minimumSimilarity < 1
     * @param integer $prefixLength       non-fuzzy prefix length; null selects the class default
     * @throws Zend_Search_Lucene_Exception  if $minimumSimilarity or $prefixLength is out of range
     */
    public function __construct(Zend_Search_Lucene_Index_Term $term, $minimumSimilarity = self::DEFAULT_MIN_SIMILARITY, $prefixLength = null)
    {
        if ($minimumSimilarity < 0) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('minimumSimilarity cannot be less than 0');
        }
        if ($minimumSimilarity >= 1) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('minimumSimilarity cannot be greater than or equal to 1');
        }
        if ($prefixLength < 0) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('prefixLength cannot be less than 0');
        }

        $this->_term              = $term;
        $this->_minimumSimilarity = $minimumSimilarity;
        $this->_prefixLength      = ($prefixLength !== null)? $prefixLength : self::$_defaultPrefixLength;
    }

    /**
     * Get default non-fuzzy prefix length
     *
     * @return integer
     */
    public static function getDefaultPrefixLength()
    {
        return self::$_defaultPrefixLength;
    }

    /**
     * Set default non-fuzzy prefix length
     *
     * @param integer $defaultPrefixLength
     */
    public static function setDefaultPrefixLength($defaultPrefixLength)
    {
        self::$_defaultPrefixLength = $defaultPrefixLength;
    }

    /**
     * Calculate maximum distance for specified word length
     *
     * The result is also cached in $this->_maxDistances under the key $length.
     *
     * @param integer $prefixLength
     * @param integer $termLength
     * @param integer $length
     * @return integer
     */
    private function _calculateMaxDistance($prefixLength, $termLength, $length)
    {
        $this->_maxDistances[$length] = (int) ((1 - $this->_minimumSimilarity)*(min($termLength, $length) + $prefixLength));
        return $this->_maxDistances[$length];
    }

    /**
     * Calculate similarity between the non-prefix rest of the query term and the
     * non-prefix rest of a candidate term.
     *
     * Shared by rewrite() and _highlightMatches(). Lengths are taken in bytes because
     * levenshtein() is not UTF-8 compatible.
     *
     * @param string  $termRest          query term text with the common prefix removed
     * @param string  $target            candidate term text with the common prefix removed
     * @param integer $prefixUtf8Length  length of the common prefix in characters (0 if none)
     * @return float|integer  a candidate matches when the value exceeds the minimum similarity
     */
    private function _calculateSimilarity($termRest, $target, $prefixUtf8Length)
    {
        $termRestLength = strlen($termRest);
        $targetLength   = strlen($target);

        // Always resolved first so that the per-length cache is filled exactly as before
        $maxDistance = isset($this->_maxDistances[$targetLength])
                     ? $this->_maxDistances[$targetLength]
                     : $this->_calculateMaxDistance($prefixUtf8Length, $termRestLength, $targetLength);

        if ($termRestLength == 0) {
            // we don't have anything to compare.  That means if we just add
            // the letters for current term we get the new word
            return ($prefixUtf8Length == 0)? 0 : 1 - $targetLength/$prefixUtf8Length;
        }

        if ($targetLength == 0) {
            return ($prefixUtf8Length == 0)? 0 : 1 - $termRestLength/$prefixUtf8Length;
        }

        if ($maxDistance < abs($termRestLength - $targetLength)) {
            // Just adding the characters of term to target or vice-versa results in too many
            // edits. For example "pre" length is 3 and "prefixes" length is 8: the edit
            // distance cannot be less than abs(3-8) = 5, so with a maximum edit distance of 4
            // the word can be discarded without looking at it.
            return 0;
        }

        $distance = levenshtein($termRest, $target);
        if ($distance < 0) {
            // PHP < 8.0 reports -1 when an argument is longer than 255 bytes; without this
            // guard such terms would get a similarity above 1 and always match.
            return 0;
        }

        // Both lengths are > 0 here, so the divisor can't be zero even without a prefix
        return 1 - $distance/($prefixUtf8Length + min($termRestLength, $targetLength));
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * Scans the terms of the term's field (or of all indexed fields when the term has no
     * field) and records every term similar enough to the base term, together with its key
     * and score. Returns an Empty query for no matches, a Term query for a single match and
     * otherwise a Boolean query with at most MAX_CLAUSE_COUNT boosted Term subqueries,
     * best scores first.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception  if more terms match than the terms-per-query limit allows
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        $this->_matches  = array();
        $this->_scores   = array();
        $this->_termKeys = array();

        if ($this->_term->field === null) {
            // Search through all fields
            $fields = $index->getFieldNames(true /* indexed fields list */);
        } else {
            $fields = array($this->_term->field);
        }

        require_once 'Zend/Search/Lucene/Index/Term.php';
        $prefix           = Zend_Search_Lucene_Index_Term::getPrefix($this->_term->text, $this->_prefixLength);
        $prefixByteLength = strlen($prefix);
        $hasPrefix        = ($prefix != '');
        // Without a prefix whole term texts are compared and the prefix contributes nothing
        $prefixUtf8Length = $hasPrefix ? Zend_Search_Lucene_Index_Term::getLength($prefix) : 0;

        $termRest    = substr($this->_term->text, $prefixByteLength);
        $scaleFactor = 1/(1 - $this->_minimumSimilarity);

        require_once 'Zend/Search/Lucene.php';
        $maxTerms = Zend_Search_Lucene::getTermsPerQueryLimit();

        foreach ($fields as $field) {
            $index->resetTermsStream();
            $index->skipTo(new Zend_Search_Lucene_Index_Term($hasPrefix ? $prefix : '', $field));

            while (($currentTerm = $index->currentTerm()) !== null && $currentTerm->field == $field) {
                if ($hasPrefix) {
                    // Byte-exact prefix test; as before, the scan of this field ends at the
                    // first term that doesn't carry the prefix
                    if (strncmp($currentTerm->text, $prefix, $prefixByteLength) !== 0) {
                        break;
                    }
                    $target = substr($currentTerm->text, $prefixByteLength);
                } else {
                    $target = $currentTerm->text;
                }

                $similarity = $this->_calculateSimilarity($termRest, $target, $prefixUtf8Length);

                if ($similarity > $this->_minimumSimilarity) {
                    $this->_matches[]  = $currentTerm;
                    $this->_termKeys[] = $currentTerm->key();
                    $this->_scores[]   = ($similarity - $this->_minimumSimilarity)*$scaleFactor;

                    if ($maxTerms != 0  &&  count($this->_matches) > $maxTerms) {
                        require_once 'Zend/Search/Lucene/Exception.php';
                        throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
                    }
                }

                $index->nextTerm();
            }

            $index->closeTermsStream();
        }

        if (count($this->_matches) == 0) {
            require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
            return new Zend_Search_Lucene_Search_Query_Empty();
        } else if (count($this->_matches) == 1) {
            require_once 'Zend/Search/Lucene/Search/Query/Term.php';
            return new Zend_Search_Lucene_Search_Query_Term(reset($this->_matches));
        } else {
            require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
            $rewrittenQuery = new Zend_Search_Lucene_Search_Query_Boolean();

            // Best scores first; equal scores are ordered by term key
            array_multisort($this->_scores,   SORT_DESC, SORT_NUMERIC,
                            $this->_termKeys, SORT_ASC,  SORT_STRING,
                            $this->_matches);

            $termCount = 0;
            require_once 'Zend/Search/Lucene/Search/Query/Term.php';
            foreach ($this->_matches as $id => $matchedTerm) {
                $subquery = new Zend_Search_Lucene_Search_Query_Term($matchedTerm);
                $subquery->setBoost($this->_scores[$id]);

                $rewrittenQuery->addSubquery($subquery);

                $termCount++;
                if ($termCount >= self::MAX_CLAUSE_COUNT) {
                    break;
                }
            }

            return $rewrittenQuery;
        }
    }

    /**
     * Optimize query in the context of specified index
     *
     * Not supported for fuzzy queries; always throws.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Return query terms
     *
     * @return array  the terms matched by the last rewrite()
     * @throws Zend_Search_Lucene_Exception  if rewrite() has not been performed yet
     */
    public function getQueryTerms()
    {
        if ($this->_matches === null) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Search or rewrite operations have to be performed before.');
        }

        return $this->_matches;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * Not supported for fuzzy queries; always throws.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     * @throws Zend_Search_Lucene_Exception
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Execute query in context of index reader
     * It also initializes necessary internal structures
     *
     * Not supported for fuzzy queries; always throws.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     * @throws Zend_Search_Lucene_Exception
     */
    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
    {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Get document ids likely matching the query
     *
     * It's an array with document ids as keys (performance considerations)
     *
     * Not supported for fuzzy queries; always throws.
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception
     */
    public function matchedDocs()
    {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Score specified document
     *
     * Not supported for fuzzy queries; always throws.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     * @throws Zend_Search_Lucene_Exception
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Fuzzy query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Query specific matches highlighting
     *
     * Tokenizes the document's 'body' field with the default analyzer and highlights every
     * token that shares the non-fuzzy prefix and is similar enough to the base term.
     *
     * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
     */
    protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
    {
        $words = array();

        require_once 'Zend/Search/Lucene/Index/Term.php';
        $prefix           = Zend_Search_Lucene_Index_Term::getPrefix($this->_term->text, $this->_prefixLength);
        $prefixByteLength = strlen($prefix);
        $prefixUtf8Length = Zend_Search_Lucene_Index_Term::getLength($prefix);
        $termRest         = substr($this->_term->text, $prefixByteLength);

        $docBody = $highlighter->getDocument()->getFieldUtf8Value('body');
        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
        $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($docBody, 'UTF-8');

        foreach ($tokens as $token) {
            $termText = $token->getTermText();

            // Byte-exact prefix test; with an empty prefix every token is a candidate
            if (strncmp($termText, $prefix, $prefixByteLength) !== 0) {
                continue;
            }

            $target     = substr($termText, $prefixByteLength);
            $similarity = $this->_calculateSimilarity($termRest, $target, $prefixUtf8Length);

            if ($similarity > $this->_minimumSimilarity) {
                $words[] = $termText;
            }
        }

        $highlighter->highlight($words);
    }

    /**
     * Print a query
     *
     * Format: [field:]text~[similarity][^boost]; similarity and boost are printed only when
     * they differ from their defaults.
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        return (($this->_term->field === null)? '' : $this->_term->field . ':')
             . $this->_term->text . '~'
             . (($this->_minimumSimilarity != self::DEFAULT_MIN_SIMILARITY)? round($this->_minimumSimilarity, 4) : '')
             . (($this->getBoost() != 1)? '^' . round($this->getBoost(), 4) : '');
    }
}

View File

@ -0,0 +1,139 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Insignificant.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/**
* The insignificant query returns empty result, but doesn't limit result set as a part of other queries
*
* Within this class every operation is a no-op or yields an empty result: rewrite() and
* optimize() return the query itself, matchedDocs() and getQueryTerms() return empty arrays,
* score() is always 0.
* NOTE(review): the "doesn't limit result set" part is presumably implemented by the queries
* that contain this one; it is not visible in this class.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Insignificant extends Zend_Search_Lucene_Search_Query
{
/**
* Re-write query into primitive queries in the context of specified index
*
* Nothing to rewrite: the query itself is returned.
*
* @param Zend_Search_Lucene_Interface $index
* @return Zend_Search_Lucene_Search_Query
*/
public function rewrite(Zend_Search_Lucene_Interface $index)
{
return $this;
}
/**
* Optimize query in the context of specified index
*
* Nothing to optimize: the query itself is returned.
*
* @param Zend_Search_Lucene_Interface $index
* @return Zend_Search_Lucene_Search_Query
*/
public function optimize(Zend_Search_Lucene_Interface $index)
{
return $this;
}
/**
* Constructs an appropriate Weight implementation for this query.
*
* Always a fresh Zend_Search_Lucene_Search_Weight_Empty instance.
*
* @param Zend_Search_Lucene_Interface $reader
* @return Zend_Search_Lucene_Search_Weight
*/
public function createWeight(Zend_Search_Lucene_Interface $reader)
{
require_once 'Zend/Search/Lucene/Search/Weight/Empty.php';
return new Zend_Search_Lucene_Search_Weight_Empty();
}
/**
* Execute query in context of index reader
* It also initializes necessary internal structures
*
* Intentionally empty: there is nothing to execute for this query.
*
* @param Zend_Search_Lucene_Interface $reader
* @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
*/
public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
{
// Do nothing
}
/**
* Get document ids likely matching the query
*
* It's an array with document ids as keys (performance considerations)
*
* Always an empty array for this query.
*
* @return array
*/
public function matchedDocs()
{
return array();
}
/**
* Score specified document
*
* Always 0 for this query.
*
* @param integer $docId
* @param Zend_Search_Lucene_Interface $reader
* @return float
*/
public function score($docId, Zend_Search_Lucene_Interface $reader)
{
return 0;
}
/**
* Return query terms
*
* Always an empty array for this query.
*
* @return array
*/
public function getQueryTerms()
{
return array();
}
/**
* Query specific matches highlighting
*
* Intentionally empty: this query has no terms to highlight.
*
* @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting)
*/
protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
{
// Do nothing
}
/**
* Print a query
*
* @return string the fixed marker '<InsignificantQuery>'
*/
public function __toString()
{
return '<InsignificantQuery>';
}
}

View File

@ -0,0 +1,668 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: MultiTerm.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_MultiTerm extends Zend_Search_Lucene_Search_Query
{
/**
* Terms to find.
* Array of Zend_Search_Lucene_Index_Term
*
* @var array
*/
private $_terms = array();
/**
* Term signs.
* If true then term is required.
* If false then term is prohibited.
* If null then term is neither prohibited, nor required
*
* If array is null then all terms are required
*
* @var array
*/
private $_signs;
/**
* Result vector.
*
* @var array
*/
private $_resVector = null;
/**
* Terms positions vectors.
* Array of Arrays:
* term1Id => (docId => freq, ...)
* term2Id => (docId => freq, ...)
*
* @var array
*/
private $_termsFreqs = array();
/**
* A score factor based on the fraction of all query terms
* that a document contains.
* float for conjunction queries
* array of float for non conjunction queries
*
* @var mixed
*/
private $_coord = null;
/**
* Terms weights
* array of Zend_Search_Lucene_Search_Weight
*
* @var array
*/
private $_weights = array();
/**
 * Class constructor. Create a new multi-term query object.
 *
 * When $terms is not an array nothing is initialised and the query starts empty.
 *
 * If the $signs array is omitted, or every sign in it is TRUE, all terms are
 * required and the signs are stored as null. This differs from addTerm(),
 * whose default sign is null (optional).
 *
 * @param array $terms Array of Zend_Search_Lucene_Index_Term objects
 * @param array $signs Array of signs. Sign is boolean|null.
 * @throws Zend_Search_Lucene_Exception if the terms-per-query limit is exceeded
 */
public function __construct($terms = null, $signs = null)
{
    if (!is_array($terms)) {
        return;
    }

    require_once 'Zend/Search/Lucene.php';

    // Zend_Search_Lucene documents a limit of 0 as "no limit", so only a
    // non-zero limit may reject the term list.
    $termsLimit = Zend_Search_Lucene::getTermsPerQueryLimit();
    if ($termsLimit != 0 && count($terms) > $termsLimit) {
        // Load the exception class explicitly, as the other query classes do
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
    }

    $this->_terms = $terms;
    $this->_signs = null;

    // Keep the signs only if at least one term is not required
    if (is_array($signs)) {
        foreach ($signs as $sign ) {
            if ($sign !== true) {
                $this->_signs = $signs;
                break;
            }
        }
    }
}
/**
 * Append a term (Zend_Search_Lucene_Index_Term) to the query.
 *
 * Meaning of $sign:
 *     TRUE  - the term is required
 *     FALSE - the term is prohibited
 *     NULL  - the term is neither prohibited, nor required
 *
 * While every term is required the signs list stays null; it is only
 * materialised once the first non-required term is added.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param boolean|null $sign
 * @return void
 */
public function addTerm(Zend_Search_Lucene_Index_Term $term, $sign = null) {
    $allRequiredSoFar = ($this->_signs === null);

    if (!($sign === true && $allRequiredSoFar)) {
        if ($allRequiredSoFar) {
            // Back-fill one "required" sign for each term added earlier
            $this->_signs = array();
            for ($remaining = count($this->_terms); $remaining > 0; $remaining--) {
                $this->_signs[] = true;
            }
        }

        $this->_signs[] = $sign;
    }

    $this->_terms[] = $term;
}
/**
 * Re-write query into primitive queries in the context of specified index.
 *
 * An empty query becomes an Empty query. If every term names a field the
 * query is returned unchanged; otherwise it is turned into a Boolean query
 * with one rewritten Term subquery per term, carrying over boost and signs.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 */
public function rewrite(Zend_Search_Lucene_Interface $index)
{
    if (count($this->_terms) == 0) {
        require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
        return new Zend_Search_Lucene_Search_Query_Empty();
    }

    // Look for the first term that has no field assigned
    $hasUnqualifiedTerm = false;
    foreach ($this->_terms as $candidate) {
        if ($candidate->field === null) {
            $hasUnqualifiedTerm = true;
            break;
        }
    }

    if (!$hasUnqualifiedTerm) {
        return $this;
    }

    // Transform the multi-term query into a boolean one and rewrite each subquery
    require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
    $booleanQuery = new Zend_Search_Lucene_Search_Query_Boolean();
    $booleanQuery->setBoost($this->getBoost());

    require_once 'Zend/Search/Lucene/Search/Query/Term.php';
    foreach ($this->_terms as $termId => $term) {
        $termQuery = new Zend_Search_Lucene_Search_Query_Term($term);
        $rewritten = $termQuery->rewrite($index);

        // Null signs mean "all terms are required"
        if ($this->_signs === null) {
            $sign = true;
        } else {
            $sign = $this->_signs[$termId];
        }

        $booleanQuery->addSubquery($rewritten, $sign);
    }

    return $booleanQuery;
}
/**
 * Optimize query in the context of specified index.
 *
 * Terms missing from the index are dropped when optional or prohibited; a
 * missing required term makes the whole query Empty. A query left with only
 * prohibited terms, or with no terms, is also Empty. A single remaining term
 * becomes a Term query. Otherwise a new MultiTerm query is built from the
 * remaining terms. The boost is carried over to the Term and MultiTerm results.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 */
public function optimize(Zend_Search_Lucene_Interface $index)
{
    $keptTerms = $this->_terms;
    $keptSigns = $this->_signs;

    foreach ($keptTerms as $termId => $term) {
        if ($index->hasTerm($term)) {
            continue;
        }

        if ($keptSigns === null || $keptSigns[$termId] === true) {
            // A required term is absent from the index: nothing can match
            require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        // Optional or prohibited term: drop it together with its sign
        unset($keptTerms[$termId]);
        unset($keptSigns[$termId]);
    }

    // A query consisting solely of prohibited terms matches nothing
    if ($keptSigns !== null) {
        $onlyProhibited = true;
        foreach ($keptSigns as $sign) {
            if ($sign !== false) {
                $onlyProhibited = false;
                break;
            }
        }

        if ($onlyProhibited) {
            require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
            return new Zend_Search_Lucene_Search_Query_Empty();
        }
    }

    /**
     * @todo make an optimization for repeated terms
     *       (they may have different signs)
     */

    $remaining = count($keptTerms);

    if ($remaining == 1) {
        // The prohibited-only case was handled above, so the single term
        // left is either required or optional
        require_once 'Zend/Search/Lucene/Search/Query/Term.php';
        $singleTermQuery = new Zend_Search_Lucene_Search_Query_Term(reset($keptTerms));
        $singleTermQuery->setBoost($this->getBoost());

        return $singleTermQuery;
    }

    if ($remaining == 0) {
        require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
        return new Zend_Search_Lucene_Search_Query_Empty();
    }

    $multiTermQuery = new Zend_Search_Lucene_Search_Query_MultiTerm($keptTerms, $keptSigns);
    $multiTermQuery->setBoost($this->getBoost());

    return $multiTermQuery;
}
/**
 * Get all terms of the query, prohibited ones included.
 *
 * @return array Array of Zend_Search_Lucene_Index_Term objects
 */
public function getTerms()
{
    $allTerms = $this->_terms;

    return $allTerms;
}
/**
 * Get the signs of the query terms.
 *
 * @return array|null Signs list, or null when all terms are required
 */
public function getSigns()
{
    $termSigns = $this->_signs;

    return $termSigns;
}
/**
 * Store the weight object of a single term.
 *
 * @param integer $num Key of the term the weight belongs to
 * @param Zend_Search_Lucene_Search_Weight_Term $weight
 */
public function setWeight($num, $weight)
{
    // The scoring methods read the weights back by term key
    $this->_weights[$num] = $weight;
}
/**
 * Create the Weight implementation for this query and remember it.
 *
 * @param Zend_Search_Lucene_Interface $reader
 * @return Zend_Search_Lucene_Search_Weight
 */
public function createWeight(Zend_Search_Lucene_Interface $reader)
{
    require_once 'Zend/Search/Lucene/Search/Weight/MultiTerm.php';

    $queryWeight = new Zend_Search_Lucene_Search_Weight_MultiTerm($this, $reader);
    $this->_weight = $queryWeight;

    return $queryWeight;
}
/**
 * Calculate result vector for Conjunction query
 * (like '+something +another')
 *
 * Fills $this->_resVector (document ids as keys) and $this->_termsFreqs.
 * As a side effect $this->_terms is reordered by ascending document frequency.
 *
 * @param Zend_Search_Lucene_Interface $reader
 */
private function _calculateConjunctionResult(Zend_Search_Lucene_Interface $reader)
{
    if (count($this->_terms) == 0) {
        // Nothing to intersect. Without this early return the code below
        // would call array_flip() on a docs list that was never assigned.
        $this->_resVector = array();
        return;
    }

    $this->_resVector = null;

    // Order terms by selectivity
    $docFreqs = array();
    $ids      = array();
    foreach ($this->_terms as $id => $term) {
        $docFreqs[] = $reader->docFreq($term);
        $ids[]      = $id; // Used to keep original order for terms with the same selectivity and omit terms comparison
    }
    array_multisort($docFreqs, SORT_ASC, SORT_NUMERIC,
                    $ids,      SORT_ASC, SORT_NUMERIC,
                    $this->_terms);

    require_once 'Zend/Search/Lucene/Index/DocsFilter.php';
    $docsFilter = new Zend_Search_Lucene_Index_DocsFilter();

    // The same filter object is passed to every call; only the docs list
    // returned for the last term is kept.
    foreach ($this->_terms as $termId => $term) {
        $termDocs = $reader->termDocs($term, $docsFilter);
    }

    // Treat last retrieved docs vector as a result set
    // (filter collects data for other terms)
    $this->_resVector = array_flip($termDocs);

    foreach ($this->_terms as $termId => $term) {
        $this->_termsFreqs[$termId] = $reader->termFreqs($term, $docsFilter);
    }

    // ksort($this->_resVector, SORT_NUMERIC);
    // Docs are returned ordered. Used algorithms doesn't change elements order.
}
/**
 * Calculate result vector for non Conjunction query
 * (like '+something -another')
 *
 * The result is the intersection of the documents of all required terms,
 * or the union of the optional terms' documents when no term is required,
 * minus every document containing a prohibited term. Term frequencies are
 * stored in $this->_termsFreqs along the way.
 *
 * @param Zend_Search_Lucene_Interface $reader
 */
private function _calculateNonConjunctionResult(Zend_Search_Lucene_Interface $reader)
{
    $mustVectors = array();
    $mustSizes   = array();
    $mustIds     = array(); // second sort key, prevents arrays comparison
    $mayDocs     = array();
    $bannedDocs  = array();

    foreach ($this->_terms as $termId => $term) {
        $docsOfTerm = array_flip($reader->termDocs($term));
        $sign       = $this->_signs[$termId];

        if ($sign === true) {
            // required
            $mustVectors[] = $docsOfTerm;
            $mustSizes[]   = count($docsOfTerm);
            $mustIds[]     = $termId;
        } elseif ($sign === false) {
            // prohibited (array union)
            $bannedDocs += $docsOfTerm;
        } else {
            // neither required, nor prohibited (array union)
            $mayDocs += $docsOfTerm;
        }

        $this->_termsFreqs[$termId] = $reader->termFreqs($term);
    }

    // Smallest required vectors first
    array_multisort($mustSizes, SORT_ASC, SORT_NUMERIC,
                    $mustIds,   SORT_ASC, SORT_NUMERIC,
                    $mustVectors);

    $mustDocs = null;
    foreach ($mustVectors as $vector) {
        if ($mustDocs === null) {
            $mustDocs = $vector;
        } else {
            // Manual key intersection; the original author uses this as a
            // workaround for array_intersect_key() slowness
            $intersection = array();
            foreach ($mustDocs as $docId => $value) {
                if (isset($vector[$docId])) {
                    $intersection[$docId] = $value;
                }
            }
            $mustDocs = $intersection;
        }

        if (count($mustDocs) == 0) {
            // Empty result set, the remaining terms can't change it
            break;
        }
    }

    $result = ($mustDocs !== null) ? $mustDocs : $mayDocs;

    if (count($bannedDocs) != 0) {
        // Manual key difference (workaround for array_diff_key() slowness);
        // iterate over whichever of the two sets is smaller
        if (count($result) < count($bannedDocs)) {
            foreach ($result as $docId => $value) {
                if (isset($bannedDocs[$docId])) {
                    unset($result[$docId]);
                }
            }
        } else {
            foreach ($bannedDocs as $docId => $value) {
                unset($result[$docId]);
            }
        }
    }

    ksort($result, SORT_NUMERIC);
    $this->_resVector = $result;
}
/**
 * Score calculator for conjunction queries (all terms are required)
 *
 * Sums tf * term weight * field norm over all terms, then multiplies by the
 * coordination factor (computed once and cached) and the query boost.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function _conjunctionScore($docId, Zend_Search_Lucene_Interface $reader)
{
    if ($this->_coord === null) {
        // Every term matches in a conjunction, so overlap equals the term count
        $termCount    = count($this->_terms);
        $this->_coord = $reader->getSimilarity()->coord($termCount, $termCount);
    }

    $sum = 0.0;

    foreach ($this->_terms as $termId => $term) {
        // The frequency is not checked for 0: per the original author's note,
        // scoring is only performed for matched documents
        $tf     = $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId]);
        $weight = $this->_weights[$termId]->getValue();
        $norm   = $reader->norm($docId, $term->field);

        $sum += $tf * $weight * $norm;
    }

    return $sum * $this->_coord * $this->getBoost();
}
/**
 * Score calculator for non conjunction queries (not all terms are required)
 *
 * Only non-prohibited terms that occur in the document contribute. The
 * coordination factor is looked up by the number of such matched terms in a
 * table that is built on first use.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function _nonConjunctionScore($docId, $reader)
{
    if ($this->_coord === null) {
        $this->_coord = array();

        // Largest possible overlap: the number of non-prohibited terms
        $maxOverlap = 0;
        foreach ($this->_signs as $sign) {
            if ($sign !== false) {
                $maxOverlap++;
            }
        }

        for ($overlap = 0; $overlap <= $maxOverlap; $overlap++) {
            $this->_coord[$overlap] = $reader->getSimilarity()->coord($overlap, $maxOverlap);
        }
    }

    $sum          = 0.0;
    $matchedCount = 0;

    foreach ($this->_terms as $termId=>$term) {
        // Skip prohibited terms and terms that don't occur in the document
        if ($this->_signs[$termId] === false || !isset($this->_termsFreqs[$termId][$docId])) {
            continue;
        }

        $matchedCount++;

        $tf     = $reader->getSimilarity()->tf($this->_termsFreqs[$termId][$docId]);
        $weight = $this->_weights[$termId]->getValue();
        $norm   = $reader->norm($docId, $term->field);

        $sum += $tf * $weight * $norm;
    }

    return $sum * $this->_coord[$matchedCount] * $this->getBoost();
}
/**
 * Execute query in context of index reader.
 *
 * Computes the result vector, using the conjunction algorithm when all terms
 * are required and the general one otherwise, then initialises the weight.
 * The $docsFilter argument is not used by this implementation.
 *
 * @param Zend_Search_Lucene_Interface $reader
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 */
public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
{
    if ($this->_signs !== null) {
        $this->_calculateNonConjunctionResult($reader);
    } else {
        // Null signs: every term is required
        $this->_calculateConjunctionResult($reader);
    }

    // Initialize weight if it's not done yet
    $this->_initWeight($reader);
}
/**
 * Get document ids likely matching the query.
 *
 * Document ids are the keys of the returned array (performance considerations).
 * The value is whatever execute() stored; it is null before the first execute().
 *
 * @return array
 */
public function matchedDocs()
{
    $resultVector = $this->_resVector;

    return $resultVector;
}
/**
 * Score specified document.
 *
 * Documents outside the result vector score 0; matched documents are scored
 * by the conjunction or the non-conjunction calculator.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function score($docId, Zend_Search_Lucene_Interface $reader)
{
    if (!isset($this->_resVector[$docId])) {
        return 0;
    }

    if ($this->_signs === null) {
        return $this->_conjunctionScore($docId, $reader);
    }

    return $this->_nonConjunctionScore($docId, $reader);
}
/**
 * Return query terms, leaving out prohibited ones.
 *
 * @return array Array of Zend_Search_Lucene_Index_Term objects
 */
public function getQueryTerms()
{
    // Null signs: all terms are required, so all of them are returned
    if ($this->_signs === null) {
        return $this->_terms;
    }

    $positiveTerms = array();
    foreach ($this->_signs as $termId => $sign) {
        if ($sign === false) {
            continue;
        }
        $positiveTerms[] = $this->_terms[$termId];
    }

    return $positiveTerms;
}
/**
 * Query specific matches highlighting.
 *
 * Passes the text of every non-prohibited term to the highlighter.
 *
 * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
{
    $wordsToHighlight = array();

    if ($this->_signs !== null) {
        foreach ($this->_signs as $termId => $sign) {
            if ($sign === false) {
                // prohibited terms are not highlighted
                continue;
            }
            $wordsToHighlight[] = $this->_terms[$termId]->text;
        }
    } else {
        // all terms are required
        foreach ($this->_terms as $requiredTerm) {
            $wordsToHighlight[] = $requiredTerm->text;
        }
    }

    $highlighter->highlight($wordsToHighlight);
}
/**
 * Print a query
 *
 * Terms are separated by single spaces, prefixed with '+' (required) or '-'
 * (prohibited) and, when set, with "field:". A boost other than 1 is appended
 * as "(...)^boost".
 *
 * @return string
 */
public function __toString()
{
    // It's used only for query visualisation, so we don't care about characters escaping
    $query = '';

    // Term keys are not guaranteed to start at 0 (optimize() builds queries
    // with sparse keys), so the separator is driven by a flag rather than by
    // comparing the key with 0.
    $isFirstTerm = true;

    foreach ($this->_terms as $id => $term) {
        if (!$isFirstTerm) {
            $query .= ' ';
        }
        $isFirstTerm = false;

        if ($this->_signs === null || $this->_signs[$id] === true) {
            $query .= '+';
        } else if ($this->_signs[$id] === false) {
            $query .= '-';
        }

        if ($term->field !== null) {
            $query .= $term->field . ':';
        }
        $query .= $term->text;
    }

    if ($this->getBoost() != 1) {
        $query = '(' . $query . ')^' . round($this->getBoost(), 4);
    }

    return $query;
}
}

View File

@ -0,0 +1,576 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Phrase.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/**
* A Query that matches documents containing a particular sequence of terms.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Phrase extends Zend_Search_Lucene_Search_Query
{
/**
* Terms to find.
* Array of Zend_Search_Lucene_Index_Term objects.
*
* @var array
*/
private $_terms;
/**
* Term positions (relative positions of terms within the phrase).
* Array of integers
*
* @var array
*/
private $_offsets;
/**
* Sets the number of other words permitted between words in query phrase.
* If zero, then this is an exact phrase search. For larger values this works
* like a WITHIN or NEAR operator.
*
* The slop is in fact an edit-distance, where the units correspond to
* moves of terms in the query phrase out of position. For example, to switch
* the order of two words requires two moves (the first move places the words
* atop one another), so to permit re-orderings of phrases, the slop must be
* at least two.
* More exact matches are scored higher than sloppier matches, thus search
* results are sorted by exactness.
*
* The slop is zero by default, requiring exact matches.
*
* @var integer
*/
private $_slop;
/**
* Result vector.
*
* @var array
*/
private $_resVector = null;
/**
* Terms positions vectors.
* Array of Arrays:
* term1Id => (docId => array( pos1, pos2, ... ), ...)
* term2Id => (docId => array( pos1, pos2, ... ), ...)
*
* @var array
*/
private $_termsPositions = array();
/**
 * Class constructor. Create a new phrase query.
 *
 * The keys of $terms are preserved as term ids. When $offsets is omitted,
 * consecutive positions 0, 1, 2, ... are assigned in term order.
 *
 * @param array  $terms    Terms to search. Array of strings.
 * @param array  $offsets  Relative term positions. Array of integers.
 * @param string $field    Field to search.
 * @throws Zend_Search_Lucene_Exception if an argument has a wrong type or the
 *         numbers of terms and offsets differ
 */
public function __construct($terms = null, $offsets = null, $field = null)
{
    $this->_slop = 0;

    if (is_array($terms)) {
        $this->_terms = array();
        require_once 'Zend/Search/Lucene/Index/Term.php';
        foreach ($terms as $termId => $termText) {
            $this->_terms[$termId] = ($field !== null)? new Zend_Search_Lucene_Index_Term($termText, $field):
                                                        new Zend_Search_Lucene_Index_Term($termText);
        }
    } else if ($terms === null) {
        $this->_terms = array();
    } else {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('terms argument must be array of strings or null');
    }

    if (is_array($offsets)) {
        if (count($this->_terms) != count($offsets)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('terms and offsets arguments must have the same size.');
        }
        $this->_offsets = $offsets;
    } else if ($offsets === null) {
        $this->_offsets = array();
        foreach ($this->_terms as $termId => $term) {
            // Next free position equals the number of offsets assigned so far
            $position = count($this->_offsets);
            $this->_offsets[$termId] = $position;
        }
    } else {
        require_once 'Zend/Search/Lucene/Exception.php';
        // Offsets are integers; the message previously said "array of strings"
        throw new Zend_Search_Lucene_Exception('offsets argument must be array of integers or null');
    }
}
/**
 * Set the slop (0 means exact phrase matching).
 *
 * @param integer $slop
 */
public function setSlop($slop)
{
    // Stored as given; no validation is performed
    $this->_slop = $slop;
}
/**
 * Get the current slop.
 *
 * @return integer
 */
public function getSlop()
{
    $currentSlop = $this->_slop;

    return $currentSlop;
}
/**
 * Adds a term to the end of the query phrase.
 *
 * The relative position is either given explicitly or derived: one past the
 * last stored offset, or 0 for the very first term.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param integer $position
 * @throws Zend_Search_Lucene_Exception if the term's field differs from the
 *         field of the last term added
 */
public function addTerm(Zend_Search_Lucene_Index_Term $term, $position = null) {
    if (count($this->_terms) != 0) {
        $previousTerm = end($this->_terms);
        if ($previousTerm->field != $term->field) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('All phrase terms must be in the same field: ' .
                                                   $term->field . ':' . $term->text);
        }
    }

    $this->_terms[] = $term;

    if ($position === null) {
        if (count($this->_offsets) != 0) {
            $position = end($this->_offsets) + 1;
        } else {
            $position = 0;
        }
    }

    $this->_offsets[] = $position;
}
/**
 * Re-write query into primitive queries in the context of specified index
 *
 * An empty phrase becomes an Empty query. A phrase whose first term names a
 * field is returned unchanged. Otherwise a Boolean query is built holding one
 * Phrase subquery per field name returned by $index->getFieldNames(true),
 * each with the same slop, term texts and offsets.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 */
public function rewrite(Zend_Search_Lucene_Interface $index)
{
    if (count($this->_terms) == 0) {
        require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
        return new Zend_Search_Lucene_Search_Query_Empty();
    }

    // The constructor preserves caller-supplied term keys, so the first term
    // is fetched with reset() (as score() does) instead of assuming key 0.
    if (reset($this->_terms)->field !== null) {
        return $this;
    }

    require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
    $query = new Zend_Search_Lucene_Search_Query_Boolean();
    $query->setBoost($this->getBoost());

    foreach ($index->getFieldNames(true) as $fieldName) {
        $subquery = new Zend_Search_Lucene_Search_Query_Phrase();
        $subquery->setSlop($this->getSlop());

        require_once 'Zend/Search/Lucene/Index/Term.php';
        foreach ($this->_terms as $termId => $term) {
            $qualifiedTerm = new Zend_Search_Lucene_Index_Term($term->text, $fieldName);

            $subquery->addTerm($qualifiedTerm, $this->_offsets[$termId]);
        }

        $query->addSubquery($subquery);
    }

    return $query;
}
/**
 * Optimize query in the context of specified index
 *
 * Returns an Empty query when any phrase term is missing from the index or
 * the phrase has no terms, a Term query (with the same boost) for a
 * single-term phrase, and the phrase itself otherwise.
 *
 * @param Zend_Search_Lucene_Interface $index
 * @return Zend_Search_Lucene_Search_Query
 */
public function optimize(Zend_Search_Lucene_Interface $index)
{
    // Every phrase term has to be present in the index
    foreach ($this->_terms as $phraseTerm) {
        if ($index->hasTerm($phraseTerm)) {
            continue;
        }

        require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
        return new Zend_Search_Lucene_Search_Query_Empty();
    }

    $termCount = count($this->_terms);

    if ($termCount == 1) {
        // A one-term phrase is just a term query
        require_once 'Zend/Search/Lucene/Search/Query/Term.php';
        $termQuery = new Zend_Search_Lucene_Search_Query_Term(reset($this->_terms));
        $termQuery->setBoost($this->getBoost());

        return $termQuery;
    }

    if ($termCount == 0) {
        require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
        return new Zend_Search_Lucene_Search_Query_Empty();
    }

    return $this;
}
/**
 * Get the terms of the phrase.
 *
 * @return array Array of Zend_Search_Lucene_Index_Term objects
 */
public function getTerms()
{
    $phraseTerms = $this->_terms;

    return $phraseTerms;
}
/**
 * Store the weight object of a single term.
 *
 * @param integer $num Key under which the weight is stored
 * @param Zend_Search_Lucene_Search_Weight_Term $weight
 */
public function setWeight($num, $weight)
{
    // NOTE(review): no $_weights property is declared among this class's
    // properties; unless the parent class declares one, this creates a
    // dynamic property - confirm.
    $this->_weights[$num] = $weight;
}
/**
 * Create the Weight implementation for this query and remember it.
 *
 * @param Zend_Search_Lucene_Interface $reader
 * @return Zend_Search_Lucene_Search_Weight
 */
public function createWeight(Zend_Search_Lucene_Interface $reader)
{
    require_once 'Zend/Search/Lucene/Search/Weight/Phrase.php';

    $phraseWeight = new Zend_Search_Lucene_Search_Weight_Phrase($this, $reader);
    $this->_weight = $phraseWeight;

    return $phraseWeight;
}
/**
 * Frequency calculator for exact phrase queries (terms sequence is fixed)
 *
 * Counts how many positions of the rarest term (within the document) have
 * all other terms located exactly at their expected relative positions.
 *
 * @param integer $docId
 * @return float
 */
public function _exactPhraseFreq($docId)
{
    // Pick the term with the fewest positions in this document as the anchor
    $anchorId = null;
    foreach ($this->_terms as $termId => $term) {
        if ($anchorId === null) {
            $anchorId = $termId;
            continue;
        }

        if (count($this->_termsPositions[$termId][$docId]) <
            count($this->_termsPositions[$anchorId][$docId])) {
            $anchorId = $termId;
        }
    }

    $occurrences = 0;

    // Try to build the phrase around every position of the anchor term
    foreach ($this->_termsPositions[$anchorId][$docId] as $anchorPosition) {
        $phraseFound = true;

        foreach ($this->_terms as $termId => $term) {
            if ($termId != $anchorId) {
                $offsetDelta      = $this->_offsets[$termId] - $this->_offsets[$anchorId];
                $expectedPosition = $anchorPosition + $offsetDelta;

                if (!in_array($expectedPosition, $this->_termsPositions[$termId][$docId])) {
                    // One term is not where it should be: no phrase here
                    $phraseFound = false;
                    break;
                }
            }
        }

        if ($phraseFound) {
            $occurrences++;
        }
    }

    return $occurrences;
}
/**
* Frequency calculator for sloppy phrase queries (slop is not zero)
*
* Builds candidate phrases (one position per term) from the term positions
* stored for the document, measures how far each candidate deviates from the
* query offsets, and adds up the similarity's sloppyFreq() of every candidate
* whose minimal deviation does not exceed the slop.
*
* @param integer $docId
* @param Zend_Search_Lucene_Interface $reader
* @return float
*/
public function _sloppyPhraseFreq($docId, Zend_Search_Lucene_Interface $reader)
{
$freq = 0;
// Queue of candidate phrases; each candidate maps termId => position
$phraseQueue = array();
$phraseQueue[0] = array(); // empty phrase
// Id of the term handled in the previous iteration (null for the first term)
$lastTerm = null;
// Walk through the terms to create phrases.
foreach ($this->_terms as $termId => $term) {
// Only candidates that existed before this term was processed get extended
$queueSize = count($phraseQueue);
$firstPass = true;
// Walk through the term positions.
// Each term position produces a set of phrases.
foreach ($this->_termsPositions[$termId][$docId] as $termPosition ) {
if ($firstPass) {
// First position of the term: written into the existing candidates in place
for ($count = 0; $count < $queueSize; $count++) {
$phraseQueue[$count][$termId] = $termPosition;
}
} else {
// Further positions: each one forks the existing candidates
for ($count = 0; $count < $queueSize; $count++) {
// Skip forks whose distance to the previous term's position differs
// from the expected offset difference by more than the slop
if ($lastTerm !== null &&
abs( $termPosition - $phraseQueue[$count][$lastTerm] -
($this->_offsets[$termId] - $this->_offsets[$lastTerm])) > $this->_slop) {
continue;
}
// Append a copy of the candidate with this position for the current term
$newPhraseId = count($phraseQueue);
$phraseQueue[$newPhraseId] = $phraseQueue[$count];
$phraseQueue[$newPhraseId][$termId] = $termPosition;
}
}
$firstPass = false;
}
$lastTerm = $termId;
}
// Evaluate every candidate phrase
foreach ($phraseQueue as $phrasePos) {
$minDistance = null;
// Try every phrase start shifted by -slop..+slop and keep the smallest total distance
for ($shift = -$this->_slop; $shift <= $this->_slop; $shift++) {
$distance = 0;
// Start position implied by the candidate's first entry, the first offset and the shift
$start = reset($phrasePos) - reset($this->_offsets) + $shift;
foreach ($this->_terms as $termId => $term) {
$distance += abs($phrasePos[$termId] - $this->_offsets[$termId] - $start);
// Already beyond the slop, no need to add the remaining terms
if($distance > $this->_slop) {
break;
}
}
if ($minDistance === null || $distance < $minDistance) {
$minDistance = $distance;
}
}
// Only candidates within the slop contribute to the frequency
if ($minDistance <= $this->_slop) {
$freq += $reader->getSimilarity()->sloppyFreq($minDistance);
}
}
return $freq;
}
/**
 * Execute query in context of index reader
 *
 * Loads the documents and positions of every term, intersects the document
 * sets (smallest first) into the result vector and initialises the weight.
 * The $docsFilter argument is not used by this implementation.
 *
 * @param Zend_Search_Lucene_Interface $reader
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 */
public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
{
    $this->_resVector = null;

    if (count($this->_terms) == 0) {
        $this->_resVector = array();
    }

    $docVectors  = array();
    $vectorSizes = array();
    $vectorIds   = array(); // second sort key, prevents arrays comparison

    foreach ($this->_terms as $termId => $term) {
        $docsOfTerm = array_flip($reader->termDocs($term));

        $docVectors[]  = $docsOfTerm;
        $vectorSizes[] = count($docsOfTerm);
        $vectorIds[]   = $termId;

        $this->_termsPositions[$termId] = $reader->termPositions($term);
    }

    // Smallest document sets first
    array_multisort($vectorSizes, SORT_ASC, SORT_NUMERIC,
                    $vectorIds,   SORT_ASC, SORT_NUMERIC,
                    $docVectors);

    foreach ($docVectors as $vector) {
        if ($this->_resVector === null) {
            $this->_resVector = $vector;
        } else {
            // Manual key intersection; the original author uses this as a
            // workaround for array_intersect_key() slowness
            $intersection = array();
            foreach ($this->_resVector as $docId => $value) {
                if (isset($vector[$docId])) {
                    $intersection[$docId] = $value;
                }
            }
            $this->_resVector = $intersection;
        }

        if (count($this->_resVector) == 0) {
            // Empty result set, the remaining terms can't change it
            break;
        }
    }

    // ksort($this->_resVector, SORT_NUMERIC);
    // Docs are returned ordered. Used algorithm doesn't change elements order.

    // Initialize weight if it's not done yet
    $this->_initWeight($reader);
}
/**
 * Get document ids likely matching the query.
 *
 * Document ids are the keys of the returned array (performance considerations).
 * The value is whatever execute() stored; it is null before the first execute().
 *
 * @return array
 */
public function matchedDocs()
{
    $resultVector = $this->_resVector;

    return $resultVector;
}
/**
 * Score specified document
 *
 * Documents outside the result vector, and documents whose phrase frequency
 * turns out to be zero, score 0. Otherwise the score is
 * tf(freq) * weight * norm * boost.
 *
 * @param integer $docId
 * @param Zend_Search_Lucene_Interface $reader
 * @return float
 */
public function score($docId, Zend_Search_Lucene_Interface $reader)
{
    if (!isset($this->_resVector[$docId])) {
        return 0;
    }

    // Zero slop means an exact phrase search
    $freq = ($this->_slop == 0)
          ? $this->_exactPhraseFreq($docId)
          : $this->_sloppyPhraseFreq($docId, $reader);

    if ($freq == 0) {
        // Included in result, but calculated freq is zero
        return 0;
    }

    $tf     = $reader->getSimilarity()->tf($freq);
    $weight = $this->_weight->getValue();
    $norm   = $reader->norm($docId, reset($this->_terms)->field);

    return $tf * $weight * $norm * $this->getBoost();
}
/**
 * Return query terms; for a phrase these are all of its terms.
 *
 * @return array Array of Zend_Search_Lucene_Index_Term objects
 */
public function getQueryTerms()
{
    $phraseTerms = $this->_terms;

    return $phraseTerms;
}
/**
 * Query specific matches highlighting.
 *
 * Passes the text of every phrase term to the highlighter.
 *
 * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting)
 */
protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
{
    $wordsToHighlight = array();

    foreach ($this->_terms as $phraseTerm) {
        $wordsToHighlight[] = $phraseTerm->text;
    }

    $highlighter->highlight($wordsToHighlight);
}
/**
 * Print a query
 *
 * Format: [field:]"term1 term2 ..."[~slop][^boost]. The field prefix is taken
 * from the first term; slop and boost are only printed when they differ from
 * 0 and 1 respectively.
 *
 * @return string
 */
public function __toString()
{
    // It's used only for query visualisation, so we don't care about characters escaping

    // Term keys are preserved from the constructor argument and need not
    // start at 0, so the first term is fetched with reset().
    $firstTerm = reset($this->_terms);
    if ($firstTerm !== false && $firstTerm->field !== null) {
        $query = $firstTerm->field . ':';
    } else {
        $query = '';
    }

    $query .= '"';

    // For the same reason the separator is driven by a flag, not by key 0
    $isFirstTerm = true;
    foreach ($this->_terms as $id => $term) {
        if (!$isFirstTerm) {
            $query .= ' ';
        }
        $isFirstTerm = false;

        $query .= $term->text;
    }

    $query .= '"';

    if ($this->_slop != 0) {
        $query .= '~' . $this->_slop;
    }

    if ($this->getBoost() != 1) {
        $query .= '^' . round($this->getBoost(), 4);
    }

    return $query;
}
}

View File

@ -0,0 +1,127 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Preprocessing.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/**
* It's an internal abstract class intended to finalize query processing after query parsing.
* This type of query is not actually involved into query execution.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @internal
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Search_Query_Preprocessing extends Zend_Search_Lucene_Search_Query
{
    /**
     * Matched terms list.
     *
     * According to the original author's note it is filled during the rewrite
     * operation and may be used for search result highlighting.
     *
     * Array of Zend_Search_Lucene_Index_Term objects
     *
     * @var array
     */
    protected $_matches = null;

    /**
     * Optimize query in the context of specified index.
     *
     * Not supported: this implementation always throws.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception always
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        require_once 'Zend/Search/Lucene/Exception.php';

        $message = 'This query is not intended to be executed.';
        throw new Zend_Search_Lucene_Exception($message);
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * Not supported: this implementation always throws.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     * @throws Zend_Search_Lucene_Exception always
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        require_once 'Zend/Search/Lucene/Exception.php';

        $message = 'This query is not intended to be executed.';
        throw new Zend_Search_Lucene_Exception($message);
    }

    /**
     * Execute query in context of index reader.
     *
     * Not supported: this implementation always throws.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     * @throws Zend_Search_Lucene_Exception always
     */
    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
    {
        require_once 'Zend/Search/Lucene/Exception.php';

        $message = 'This query is not intended to be executed.';
        throw new Zend_Search_Lucene_Exception($message);
    }

    /**
     * Get document ids likely matching the query.
     *
     * Not supported: this implementation always throws.
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception always
     */
    public function matchedDocs()
    {
        require_once 'Zend/Search/Lucene/Exception.php';

        $message = 'This query is not intended to be executed.';
        throw new Zend_Search_Lucene_Exception($message);
    }

    /**
     * Score specified document.
     *
     * Not supported: this implementation always throws.
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     * @throws Zend_Search_Lucene_Exception always
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        require_once 'Zend/Search/Lucene/Exception.php';

        $message = 'This query is not intended to be executed.';
        throw new Zend_Search_Lucene_Exception($message);
    }

    /**
     * Return query terms.
     *
     * Not supported: this implementation always throws, because terms are
     * only available after the rewrite operation.
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception always
     */
    public function getQueryTerms()
    {
        require_once 'Zend/Search/Lucene/Exception.php';

        $message = 'Rewrite operation has to be done before retrieving query terms.';
        throw new Zend_Search_Lucene_Exception($message);
    }
}

View File

@ -0,0 +1,287 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Fuzzy.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_Query_Preprocessing */
require_once 'Zend/Search/Lucene/Search/Query/Preprocessing.php';
/**
 * Preprocessing (post-parsing) representation of a fuzzy search for a single word.
 *
 * rewrite() turns it into queries built on Zend_Search_Lucene_Search_Query_Fuzzy
 * (or into a Boolean, Insignificant or Empty query). As a preprocessing query it
 * is not involved in query execution itself.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @internal
 * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Search_Query_Preprocessing_Fuzzy extends Zend_Search_Lucene_Search_Query_Preprocessing
{
    /**
     * Word (query parser lexeme) to find.
     *
     * @var string
     */
    private $_word;

    /**
     * Word encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).
     *
     * @var string
     */
    private $_encoding;

    /**
     * Field name; null means no particular field was specified.
     *
     * @var string
     */
    private $_field;

    /**
     * A value between 0 and 1 to set the required similarity
     * between the query term and the matching terms. For example, for a
     * _minimumSimilarity of 0.5 a term of the same length
     * as the query term is considered similar to the query term if the edit distance
     * between both terms is less than length(term)*0.5
     *
     * @var float
     */
    private $_minimumSimilarity;

    /**
     * Class constructor. Create a new preprocessing object for a fuzzy query.
     *
     * @param string $word       Non-tokenized word (query parser lexeme) to search.
     * @param string $encoding   Word encoding.
     * @param string $fieldName  Field name.
     * @param float  $minimumSimilarity minimum similarity
     */
    public function __construct($word, $encoding, $fieldName, $minimumSimilarity)
    {
        $this->_word              = $word;
        $this->_encoding          = $encoding;
        $this->_field             = $fieldName;
        $this->_minimumSimilarity = $minimumSimilarity;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Search_QueryParserException if the word contains wildcards
     *         (and is not an exact index term) or is tokenized into several tokens
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        if ($this->_field === null) {
            return $this->_rewriteForSearchFields($index);
        }

        // -------------------------------------
        // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
        // encoding is not used since we expect binary matching
        require_once 'Zend/Search/Lucene/Index/Term.php';
        $term = new Zend_Search_Lucene_Index_Term($this->_word, $this->_field);
        if ($index->hasTerm($term)) {
            return $this->_rewriteFuzzyTerm($term, $index);
        }

        // -------------------------------------
        // Recognize wildcard queries
        if ($this->_wordHasWildcards()) {
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search doesn\'t support wildcards (except within Keyword fields).');
        }

        // -------------------------------------
        // Recognize one-term multi-term and "insignificant" queries
        $tokens = $this->_tokenizeWord();

        if (count($tokens) == 0) {
            $this->_matches = array();
            require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
            return new Zend_Search_Lucene_Search_Query_Insignificant();
        }

        if (count($tokens) == 1) {
            require_once 'Zend/Search/Lucene/Index/Term.php';
            $term = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
            return $this->_rewriteFuzzyTerm($term, $index);
        }

        // Word is tokenized into several tokens
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('Fuzzy search is supported only for non-multiple word terms');
    }

    /**
     * Query specific matches highlighting
     *
     * Nothing is highlighted when the word contains wildcards, yields no
     * tokens, or is tokenized into several tokens.
     *
     * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
     */
    protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
    {
        /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */
        /** Skip exact term matching recognition, keyword fields highlighting is not supported */

        if ($this->_wordHasWildcards()) {
            return;
        }

        $tokens = $this->_tokenizeWord();
        if (count($tokens) != 1) {
            // Either nothing to highlight, or the word is tokenized into several
            // tokens (fuzzy search is supported only for non-multiple word terms).
            return;
        }

        require_once 'Zend/Search/Lucene/Index/Term.php';
        $term = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
        require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';
        $query = new Zend_Search_Lucene_Search_Query_Fuzzy($term, $this->_minimumSimilarity);
        $query->_highlightMatches($highlighter);
    }

    /**
     * Print a query
     *
     * The result has the form [field:]word~[similarity][^boost]; the boost is
     * only printed when it differs from 1.
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        $query = ($this->_field !== null) ? $this->_field . ':' : '';

        // The '~' marker is what distinguishes a fuzzy query from a plain term query
        $query .= $this->_word . '~';
        if ($this->_minimumSimilarity !== null) {
            $query .= round($this->_minimumSimilarity, 4);
        }

        if ($this->getBoost() != 1) {
            $query .= '^' . round($this->getBoost(), 4);
        }

        return $query;
    }

    /**
     * Rewrite a query that has no field name by rewriting it once per search field
     * (the default search field if one is set, otherwise all indexed fields).
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    private function _rewriteForSearchFields(Zend_Search_Lucene_Interface $index)
    {
        require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
        $query = new Zend_Search_Lucene_Search_Query_Boolean();

        $hasInsignificantSubqueries = false;

        require_once 'Zend/Search/Lucene.php';
        if (Zend_Search_Lucene::getDefaultSearchField() === null) {
            $searchFields = $index->getFieldNames(true);
        } else {
            $searchFields = array(Zend_Search_Lucene::getDefaultSearchField());
        }

        require_once 'Zend/Search/Lucene/Search/Query/Preprocessing/Fuzzy.php';
        foreach ($searchFields as $fieldName) {
            $subquery = new Zend_Search_Lucene_Search_Query_Preprocessing_Fuzzy($this->_word,
                                                                                $this->_encoding,
                                                                                $fieldName,
                                                                                $this->_minimumSimilarity);

            $rewrittenSubquery = $subquery->rewrite($index);

            if ($rewrittenSubquery instanceof Zend_Search_Lucene_Search_Query_Insignificant) {
                $hasInsignificantSubqueries = true;
            } else if (!($rewrittenSubquery instanceof Zend_Search_Lucene_Search_Query_Empty)) {
                $query->addSubquery($rewrittenSubquery);
            }
        }

        $subqueries = $query->getSubqueries();

        if (count($subqueries) == 0) {
            $this->_matches = array();
            if ($hasInsignificantSubqueries) {
                require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
                return new Zend_Search_Lucene_Search_Query_Insignificant();
            }
            require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        if (count($subqueries) == 1) {
            // A single subquery does not need the boolean wrapper
            $query = reset($subqueries);
        }

        $query->setBoost($this->getBoost());

        $this->_matches = $query->getQueryTerms();
        return $query;
    }

    /**
     * Build a fuzzy query for the given term, rewrite it and record its matched terms.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    private function _rewriteFuzzyTerm($term, Zend_Search_Lucene_Interface $index)
    {
        require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';
        $query = new Zend_Search_Lucene_Search_Query_Fuzzy($term, $this->_minimumSimilarity);
        $query->setBoost($this->getBoost());

        // Get rewritten query. Important! It also fills terms matching container.
        $rewrittenQuery = $query->rewrite($index);
        $this->_matches = $query->getQueryTerms();

        return $rewrittenQuery;
    }

    /**
     * Check whether the word contains '*' or '?' wildcard characters.
     *
     * @return boolean
     */
    private function _wordHasWildcards()
    {
        /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
        if (@preg_match('/\pL/u', 'a') == 1) {
            $subPatterns = preg_split('/[*?]/u', iconv($this->_encoding, 'UTF-8', $this->_word));
        } else {
            $subPatterns = preg_split('/[*?]/', $this->_word);
        }

        return count($subPatterns) > 1;
    }

    /**
     * Tokenize the word with the default analyzer.
     *
     * @return array
     */
    private function _tokenizeWord()
    {
        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
        return Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
    }
}

View File

@ -0,0 +1,270 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Phrase.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_Query_Preprocessing */
require_once 'Zend/Search/Lucene/Search/Query/Preprocessing.php';
/**
 * Preprocessing (post-parsing) representation of a phrase search.
 *
 * rewrite() turns it into a Boolean, Term, Phrase or Insignificant query for the
 * given index. As a preprocessing query it is not involved in query execution itself.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @internal
 * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Search_Query_Preprocessing_Phrase extends Zend_Search_Lucene_Search_Query_Preprocessing
{
    /**
     * Phrase to find.
     *
     * @var string
     */
    private $_phrase;

    /**
     * Phrase encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).
     *
     * @var string
     */
    private $_phraseEncoding;

    /**
     * Field name; null means no particular field was specified.
     *
     * @var string
     */
    private $_field;

    /**
     * Sets the number of other words permitted between words in query phrase.
     * If zero, then this is an exact phrase search.  For larger values this works
     * like a WITHIN or NEAR operator.
     *
     * The slop is in fact an edit-distance, where the units correspond to
     * moves of terms in the query phrase out of position.  For example, to switch
     * the order of two words requires two moves (the first move places the words
     * atop one another), so to permit re-orderings of phrases, the slop must be
     * at least two.
     * More exact matches are scored higher than sloppier matches, thus search
     * results are sorted by exactness.
     *
     * The slop is zero by default, requiring exact matches.
     *
     * @var integer
     */
    private $_slop = 0;

    /**
     * Class constructor. Create a new preprocessing object for a phrase query.
     *
     * @param string $phrase          Phrase to search.
     * @param string $phraseEncoding  Phrase encoding.
     * @param string $fieldName       Field name.
     */
    public function __construct($phrase, $phraseEncoding, $fieldName)
    {
        $this->_phrase         = $phrase;
        $this->_phraseEncoding = $phraseEncoding;
        $this->_field          = $fieldName;
    }

    /**
     * Set slop
     *
     * @param integer $slop
     */
    public function setSlop($slop)
    {
        $this->_slop = $slop;
    }

    /**
     * Get slop
     *
     * @return integer
     */
    public function getSlop()
    {
        return $this->_slop;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        // Wildcards are deliberately allowed within phrases: they are either removed
        // by the text analyzer or used as a part of keyword for keyword fields.

        // Split query into subqueries if field name is not specified
        if ($this->_field === null) {
            require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
            $query = new Zend_Search_Lucene_Search_Query_Boolean();
            $query->setBoost($this->getBoost());

            require_once 'Zend/Search/Lucene.php';
            if (Zend_Search_Lucene::getDefaultSearchField() === null) {
                $searchFields = $index->getFieldNames(true);
            } else {
                $searchFields = array(Zend_Search_Lucene::getDefaultSearchField());
            }

            foreach ($searchFields as $fieldName) {
                $subquery = new Zend_Search_Lucene_Search_Query_Preprocessing_Phrase($this->_phrase,
                                                                                     $this->_phraseEncoding,
                                                                                     $fieldName);
                $subquery->setSlop($this->getSlop());

                $query->addSubquery($subquery->rewrite($index));
            }

            $this->_matches = $query->getQueryTerms();
            return $query;
        }

        // Recognize exact term matching (it corresponds to Keyword fields stored in the index)
        // encoding is not used since we expect binary matching
        require_once 'Zend/Search/Lucene/Index/Term.php';
        $term = new Zend_Search_Lucene_Index_Term($this->_phrase, $this->_field);
        if ($index->hasTerm($term)) {
            require_once 'Zend/Search/Lucene/Search/Query/Term.php';
            $query = new Zend_Search_Lucene_Search_Query_Term($term);
            $query->setBoost($this->getBoost());

            $this->_matches = $query->getQueryTerms();
            return $query;
        }

        // tokenize phrase using current analyzer and process it as a phrase query
        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
        $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding);

        if (count($tokens) == 0) {
            $this->_matches = array();
            require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
            return new Zend_Search_Lucene_Search_Query_Insignificant();
        }

        if (count($tokens) == 1) {
            require_once 'Zend/Search/Lucene/Index/Term.php';
            $term  = new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field);
            require_once 'Zend/Search/Lucene/Search/Query/Term.php';
            $query = new Zend_Search_Lucene_Search_Query_Term($term);
            $query->setBoost($this->getBoost());

            $this->_matches = $query->getQueryTerms();
            return $query;
        }

        // It's non-trivial phrase query
        require_once 'Zend/Search/Lucene/Search/Query/Phrase.php';
        $query = new Zend_Search_Lucene_Search_Query_Phrase();
        $query->setSlop($this->getSlop());
        // Carry the boost over like every other branch does; without this a
        // boosted multi-word phrase on an explicit field loses its boost.
        $query->setBoost($this->getBoost());

        $position = -1;
        require_once 'Zend/Search/Lucene/Index/Term.php';
        foreach ($tokens as $token) {
            // Honour position increments reported by the analyzer
            $position += $token->getPositionIncrement();
            $term = new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field);
            $query->addTerm($term, $position);
        }

        $this->_matches = $query->getQueryTerms();
        return $query;
    }

    /**
     * Query specific matches highlighting
     *
     * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
     */
    protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
    {
        /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */
        /** Skip exact term matching recognition, keyword fields highlighting is not supported */
        /** Skip wildcard queries recognition. Supported wildcards are removed by text analyzer */

        // tokenize phrase using current analyzer and process it as a phrase query
        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
        $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding);

        if (count($tokens) == 0) {
            // Do nothing
            return;
        }

        if (count($tokens) == 1) {
            $highlighter->highlight($tokens[0]->getTermText());
            return;
        }

        // It's non-trivial phrase query: highlight every word of the phrase
        $words = array();
        foreach ($tokens as $token) {
            $words[] = $token->getTermText();
        }
        $highlighter->highlight($words);
    }

    /**
     * Print a query
     *
     * The result has the form [field:]"phrase"[~slop][^boost]; slop and boost
     * are only printed when they differ from 0 and 1 respectively.
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        if ($this->_field !== null) {
            $query = $this->_field . ':';
        } else {
            $query = '';
        }

        $query .= '"' . $this->_phrase . '"';

        if ($this->_slop != 0) {
            $query .= '~' . $this->_slop;
        }

        if ($this->getBoost() != 1) {
            $query .= '^' . round($this->getBoost(), 4);
        }

        return $query;
    }
}

View File

@ -0,0 +1,341 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Term.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_Query_Preprocessing */
require_once 'Zend/Search/Lucene/Search/Query/Preprocessing.php';
/**
 * Preprocessing (post-parsing) representation of a single search word.
 *
 * rewrite() turns it into a Term, MultiTerm, Wildcard-based, Insignificant or
 * Empty query for the given index. As a preprocessing query it is not involved
 * in query execution itself.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 * @internal
 * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Search_Query_Preprocessing_Term extends Zend_Search_Lucene_Search_Query_Preprocessing
{
    /**
     * Word (query parser lexeme) to find.
     *
     * @var string
     */
    private $_word;

    /**
     * Word encoding (field name is always provided using UTF-8 encoding since it may be retrieved from index).
     *
     * @var string
     */
    private $_encoding;

    /**
     * Field name; null means no particular field was specified.
     *
     * @var string
     */
    private $_field;

    /**
     * Class constructor. Create a new preprocessing object for a term query.
     *
     * @param string $word       Non-tokenized word (query parser lexeme) to search.
     * @param string $encoding   Word encoding.
     * @param string $fieldName  Field name.
     */
    public function __construct($word, $encoding, $fieldName)
    {
        $this->_field    = $fieldName;
        $this->_encoding = $encoding;
        $this->_word     = $word;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Search_QueryParserException if a wildcard sub-pattern
     *         is tokenized into more than one token
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        if ($this->_field === null) {
            return $this->_rewriteForSearchFields($index);
        }

        // -------------------------------------
        // Exact term matching (it corresponds to Keyword fields stored in the index);
        // encoding is not used since we expect binary matching
        require_once 'Zend/Search/Lucene/Index/Term.php';
        $exactTerm = new Zend_Search_Lucene_Index_Term($this->_word, $this->_field);
        if ($index->hasTerm($exactTerm)) {
            return $this->_rewriteAsTermQuery($exactTerm);
        }

        // -------------------------------------
        // Wildcard queries
        list($text, $pieces, $piecesEncoding) = $this->_splitOnWildcards();
        if (count($pieces) > 1) {
            $pattern = $this->_composeWildcardPattern($text, $pieces, $piecesEncoding);
            if ($pattern === null) {
                require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
                throw new Zend_Search_Lucene_Search_QueryParserException('Wildcard search is supported only for non-multiple word terms');
            }

            require_once 'Zend/Search/Lucene/Index/Term.php';
            $wildcardTerm  = new Zend_Search_Lucene_Index_Term($pattern, $this->_field);
            require_once 'Zend/Search/Lucene/Search/Query/Wildcard.php';
            $wildcardQuery = new Zend_Search_Lucene_Search_Query_Wildcard($wildcardTerm);
            $wildcardQuery->setBoost($this->getBoost());

            // Get rewritten query. Important! It also fills terms matching container.
            $rewritten      = $wildcardQuery->rewrite($index);
            $this->_matches = $wildcardQuery->getQueryTerms();

            return $rewritten;
        }

        // -------------------------------------
        // One-term, multi-term and "insignificant" queries
        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
        $tokens     = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
        $tokenCount = count($tokens);

        if ($tokenCount == 0) {
            $this->_matches = array();
            require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
            return new Zend_Search_Lucene_Search_Query_Insignificant();
        }

        if ($tokenCount == 1) {
            require_once 'Zend/Search/Lucene/Index/Term.php';
            return $this->_rewriteAsTermQuery(
                new Zend_Search_Lucene_Index_Term($tokens[0]->getTermText(), $this->_field)
            );
        }

        // It's not insignificant or one term query
        require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
        $multiTermQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();
        $multiTermQuery->setBoost($this->getBoost());

        /**
         * @todo Process $token->getPositionIncrement() to support stemming, synonyms and other
         * analizer design features
         */
        require_once 'Zend/Search/Lucene/Index/Term.php';
        foreach ($tokens as $token) {
            // all subterms are required
            $multiTermQuery->addTerm(
                new Zend_Search_Lucene_Index_Term($token->getTermText(), $this->_field),
                true
            );
        }

        $this->_matches = $multiTermQuery->getQueryTerms();
        return $multiTermQuery;
    }

    /**
     * Query specific matches highlighting
     *
     * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
     */
    protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
    {
        /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */
        /** Skip exact term matching recognition, keyword fields highlighting is not supported */

        // -------------------------------------
        // Wildcard queries
        list($text, $pieces, $piecesEncoding) = $this->_splitOnWildcards();
        if (count($pieces) > 1) {
            $pattern = $this->_composeWildcardPattern($text, $pieces, $piecesEncoding);
            if ($pattern === null) {
                // Do nothing (nothing is highlighted)
                return;
            }

            require_once 'Zend/Search/Lucene/Index/Term.php';
            $wildcardTerm  = new Zend_Search_Lucene_Index_Term($pattern, $this->_field);
            require_once 'Zend/Search/Lucene/Search/Query/Wildcard.php';
            $wildcardQuery = new Zend_Search_Lucene_Search_Query_Wildcard($wildcardTerm);
            $wildcardQuery->_highlightMatches($highlighter);
            return;
        }

        // -------------------------------------
        // One-term, multi-term and "insignificant" queries
        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
        $tokens     = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($this->_word, $this->_encoding);
        $tokenCount = count($tokens);

        if ($tokenCount == 0) {
            // Do nothing
            return;
        }

        if ($tokenCount == 1) {
            $highlighter->highlight($tokens[0]->getTermText());
            return;
        }

        // It's not insignificant or one term query: highlight every token text
        $texts = array();
        foreach ($tokens as $token) {
            $texts[] = $token->getTermText();
        }
        $highlighter->highlight($texts);
    }

    /**
     * Print a query
     *
     * The result has the form [field:]word[^boost]; the boost is only printed
     * when it differs from 1.
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        $prefix = ($this->_field !== null) ? $this->_field . ':' : '';
        $suffix = ($this->getBoost() != 1) ? '^' . round($this->getBoost(), 4) : '';

        return $prefix . $this->_word . $suffix;
    }

    /**
     * Rewrite a query that has no field name by rewriting it once per search field
     * (the default search field if one is set, otherwise all indexed fields) and
     * collecting the resulting terms into one multi-term query.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    private function _rewriteForSearchFields(Zend_Search_Lucene_Interface $index)
    {
        require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
        $combined = new Zend_Search_Lucene_Search_Query_MultiTerm();
        $combined->setBoost($this->getBoost());

        require_once 'Zend/Search/Lucene.php';
        $defaultField = Zend_Search_Lucene::getDefaultSearchField();
        $searchFields = ($defaultField === null) ? $index->getFieldNames(true) : array($defaultField);

        $sawInsignificant = false;

        require_once 'Zend/Search/Lucene/Search/Query/Preprocessing/Term.php';
        foreach ($searchFields as $fieldName) {
            $perField  = new Zend_Search_Lucene_Search_Query_Preprocessing_Term($this->_word,
                                                                                $this->_encoding,
                                                                                $fieldName);
            $rewritten = $perField->rewrite($index);

            foreach ($rewritten->getQueryTerms() as $term) {
                $combined->addTerm($term);
            }

            if ($rewritten instanceof Zend_Search_Lucene_Search_Query_Insignificant) {
                $sawInsignificant = true;
            }
        }

        if (count($combined->getTerms()) != 0) {
            $this->_matches = $combined->getQueryTerms();
            return $combined;
        }

        // No field contributed any term
        $this->_matches = array();
        if ($sawInsignificant) {
            require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
            return new Zend_Search_Lucene_Search_Query_Insignificant();
        }

        require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
        return new Zend_Search_Lucene_Search_Query_Empty();
    }

    /**
     * Wrap the term into a boosted term query and record its matched terms.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @return Zend_Search_Lucene_Search_Query_Term
     */
    private function _rewriteAsTermQuery($term)
    {
        require_once 'Zend/Search/Lucene/Search/Query/Term.php';
        $termQuery = new Zend_Search_Lucene_Search_Query_Term($term);
        $termQuery->setBoost($this->getBoost());

        $this->_matches = $termQuery->getQueryTerms();
        return $termQuery;
    }

    /**
     * Split the word at '*' and '?' wildcard characters.
     *
     * @return array  array(text that was split, preg_split() result with offsets, encoding of the pieces)
     */
    private function _splitOnWildcards()
    {
        /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
        if (@preg_match('/\pL/u', 'a') == 1) {
            $text     = iconv($this->_encoding, 'UTF-8', $this->_word);
            $regex    = '/[*?]/u';
            $encoding = 'UTF-8';
        } else {
            $text     = $this->_word;
            $regex    = '/[*?]/';
            $encoding = $this->_encoding;
        }

        return array($text, preg_split($regex, $text, -1, PREG_SPLIT_OFFSET_CAPTURE), $encoding);
    }

    /**
     * Rebuild a wildcard pattern from analyzed sub-patterns and the original wildcard characters.
     *
     * @param string $text      Text that was split
     * @param array  $pieces    preg_split() result (PREG_SPLIT_OFFSET_CAPTURE format)
     * @param string $encoding  Encoding of the pieces
     * @return string|null  Pattern, or null as soon as a sub-pattern is tokenized into several tokens
     */
    private function _composeWildcardPattern($text, $pieces, $encoding)
    {
        $pattern = '';

        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
        foreach ($pieces as $position => $piece) {
            // The wildcard character sits right before each sub-pattern (except the first one)
            if ($position != 0) {
                $pattern .= $text[ $piece[1] - 1 ];
            }

            // Each sub-pattern has to be a single word in terms of the current analyzer
            $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($piece[0], $encoding);
            if (count($tokens) > 1) {
                return null;
            }
            foreach ($tokens as $token) {
                $pattern .= $token->getTermText();
            }
        }

        return $pattern;
    }
}

View File

@ -0,0 +1,377 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Range.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Range extends Zend_Search_Lucene_Search_Query
{
/**
* Lower term.
*
* @var Zend_Search_Lucene_Index_Term
*/
private $_lowerTerm;
/**
* Upper term.
*
* @var Zend_Search_Lucene_Index_Term
*/
private $_upperTerm;
/**
* Search field
*
* @var string
*/
private $_field;
/**
* Inclusive
*
* @var boolean
*/
private $_inclusive;
/**
* Matched terms.
*
* Matched terms list.
* It's filled during the search (rewrite operation) and may be used for search result
* post-processing
*
* Array of Zend_Search_Lucene_Index_Term objects
*
* @var array
*/
private $_matches = null;
/**
 * Zend_Search_Lucene_Search_Query_Range constructor.
 *
 * At least one of the two terms has to be given, and when both are given they
 * must refer to the same field. The query field is taken from the lower term
 * when present, otherwise from the upper term.
 *
 * @param Zend_Search_Lucene_Index_Term|null $lowerTerm
 * @param Zend_Search_Lucene_Index_Term|null $upperTerm
 * @param boolean $inclusive
 * @throws Zend_Search_Lucene_Exception
 */
public function __construct($lowerTerm, $upperTerm, $inclusive)
{
    $hasLower = ($lowerTerm !== null);
    $hasUpper = ($upperTerm !== null);

    if (!$hasLower && !$hasUpper) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('At least one term must be non-null');
    }

    if ($hasLower && $hasUpper && $lowerTerm->field != $upperTerm->field) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Both terms must be for the same field');
    }

    if ($hasLower) {
        $this->_field = $lowerTerm->field;
    } else {
        $this->_field = $upperTerm->field;
    }

    $this->_inclusive = $inclusive;
    $this->_upperTerm = $upperTerm;
    $this->_lowerTerm = $lowerTerm;
}
/**
* Get query field name
*
* Returns the field stored by the constructor, which takes it from the lower
* term or, when no lower term was given, from the upper term.
*
* @return string|null
*/
public function getField()
{
return $this->_field;
}
/**
* Get lower term
*
* Returns the lower term exactly as it was passed to the constructor;
* null when no lower term was given.
*
* @return Zend_Search_Lucene_Index_Term|null
*/
public function getLowerTerm()
{
return $this->_lowerTerm;
}
/**
* Get upper term
*
* Returns the upper term exactly as it was passed to the constructor;
* null when no upper term was given.
*
* @return Zend_Search_Lucene_Index_Term|null
*/
public function getUpperTerm()
{
return $this->_upperTerm;
}
/**
* Get the inclusive flag
*
* Returns the $inclusive value that was passed to the constructor.
*
* @return boolean
*/
public function isInclusive()
{
return $this->_inclusive;
}
/**
* Re-write query into primitive queries in the context of specified index
*
* @param Zend_Search_Lucene_Interface $index
* @return Zend_Search_Lucene_Search_Query
*/
public function rewrite(Zend_Search_Lucene_Interface $index)
{
    // Expands the range into the concrete index terms it covers and returns an
    // Empty, Term or MultiTerm query built from them. The collected terms are kept
    // in $this->_matches so getQueryTerms() can return them afterwards.
    //
    // Boundary texts are compared with strcmp() rather than with '<' / '=='.
    // PHP compares two numeric strings numerically with those operators, so e.g.
    // "10" < "5" is false and the walk would stop (or a boundary would match)
    // at the wrong term.
    // NOTE(review): assumes the index terms stream is ordered byte-wise by term
    // text within a field - confirm against the index implementation.
    $this->_matches = array();

    if ($this->_field === null) {
        // Search through all fields
        $fields = $index->getFieldNames(true /* indexed fields list */);
    } else {
        $fields = array($this->_field);
    }

    require_once 'Zend/Search/Lucene.php';
    require_once 'Zend/Search/Lucene/Index/Term.php';
    $maxTerms = Zend_Search_Lucene::getTermsPerQueryLimit();

    $lowerText = ($this->_lowerTerm !== null) ? $this->_lowerTerm->text : null;
    $upperText = ($this->_upperTerm !== null) ? $this->_upperTerm->text : null;

    foreach ($fields as $field) {
        $index->resetTermsStream();

        // Position the stream on the first candidate term of this field.
        if ($lowerText !== null) {
            $index->skipTo(new Zend_Search_Lucene_Index_Term($lowerText, $field));

            $current = $index->currentTerm();
            if (!$this->_inclusive
                && $current !== null
                && $current->field == $field
                && strcmp($current->text, $lowerText) === 0
            ) {
                // Exclusive range: skip the lower boundary term itself
                $index->nextTerm();
            }
        } else {
            $index->skipTo(new Zend_Search_Lucene_Index_Term('', $field));
        }

        // Collect terms until the field ends or, when an upper boundary exists,
        // until the first term that is not strictly below it.
        while (($current = $index->currentTerm()) !== null
            && $current->field == $field
            && ($upperText === null || strcmp($current->text, $upperText) < 0)
        ) {
            $this->_matches[] = $current;

            if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
            }

            $index->nextTerm();
        }

        if ($upperText !== null && $this->_inclusive) {
            $current = $index->currentTerm();
            if ($current !== null
                && $current->field == $field
                && strcmp($current->text, $upperText) === 0
            ) {
                // Inclusive range: the stream stopped exactly on the upper boundary term
                $this->_matches[] = new Zend_Search_Lucene_Index_Term($upperText, $field);
            }
        }

        $index->closeTermsStream();
    }

    $matchCount = count($this->_matches);

    if ($matchCount == 0) {
        require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
        return new Zend_Search_Lucene_Search_Query_Empty();
    }

    if ($matchCount == 1) {
        require_once 'Zend/Search/Lucene/Search/Query/Term.php';
        return new Zend_Search_Lucene_Search_Query_Term(reset($this->_matches));
    }

    require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
    $rewrittenQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();
    foreach ($this->_matches as $matchedTerm) {
        $rewrittenQuery->addTerm($matchedTerm);
    }

    return $rewrittenQuery;
}
/**
* Optimize query in the context of specified index
*
* @param Zend_Search_Lucene_Interface $index
* @return Zend_Search_Lucene_Search_Query
*/
public function optimize(Zend_Search_Lucene_Interface $index)
{
    // Always fails: callers are told to use the query returned by rewrite() instead.
    require_once 'Zend/Search/Lucene/Exception.php';

    $message = 'Range query should not be directly used for search. Use $query->rewrite($index)';
    throw new Zend_Search_Lucene_Exception($message);
}
/**
* Return query terms
*
* @return array
* @throws Zend_Search_Lucene_Exception
*/
public function getQueryTerms()
{
    // $_matches stays null until rewrite() has initialised it.
    if ($this->_matches !== null) {
        return $this->_matches;
    }

    require_once 'Zend/Search/Lucene/Exception.php';
    throw new Zend_Search_Lucene_Exception('Search or rewrite operations have to be performed before.');
}
/**
* Constructs an appropriate Weight implementation for this query.
*
* @param Zend_Search_Lucene_Interface $reader
* @return Zend_Search_Lucene_Search_Weight
* @throws Zend_Search_Lucene_Exception
*/
public function createWeight(Zend_Search_Lucene_Interface $reader)
{
    // Always fails: callers are told to use the query returned by rewrite() instead.
    require_once 'Zend/Search/Lucene/Exception.php';

    $message = 'Range query should not be directly used for search. Use $query->rewrite($index)';
    throw new Zend_Search_Lucene_Exception($message);
}
/**
* Execute query in context of index reader
* It also initializes necessary internal structures
*
* @param Zend_Search_Lucene_Interface $reader
* @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
* @throws Zend_Search_Lucene_Exception
*/
public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
{
    // Always fails: callers are told to use the query returned by rewrite() instead.
    require_once 'Zend/Search/Lucene/Exception.php';

    $message = 'Range query should not be directly used for search. Use $query->rewrite($index)';
    throw new Zend_Search_Lucene_Exception($message);
}
/**
* Get document ids likely matching the query
*
* It's an array with document ids as keys (performance considerations)
*
* @return array
* @throws Zend_Search_Lucene_Exception
*/
public function matchedDocs()
{
    // Always fails: callers are told to use the query returned by rewrite() instead.
    require_once 'Zend/Search/Lucene/Exception.php';

    $message = 'Range query should not be directly used for search. Use $query->rewrite($index)';
    throw new Zend_Search_Lucene_Exception($message);
}
/**
* Score specified document
*
* @param integer $docId
* @param Zend_Search_Lucene_Interface $reader
* @return float
* @throws Zend_Search_Lucene_Exception
*/
public function score($docId, Zend_Search_Lucene_Interface $reader)
{
    // Always fails: callers are told to use the query returned by rewrite() instead.
    require_once 'Zend/Search/Lucene/Exception.php';

    $message = 'Range query should not be directly used for search. Use $query->rewrite($index)';
    throw new Zend_Search_Lucene_Exception($message);
}
/**
* Query specific matches highlighting
*
* @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter Highlighter object (also contains doc for highlighting)
*/
protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
{
    // Tokenizes the document's 'body' field with the default analyzer and hands
    // every token whose text lies inside the range to the highlighter.
    //
    // Token texts are compared to the boundaries with strcmp() rather than with
    // '<' / '<='. PHP compares two numeric strings numerically with those
    // operators (e.g. "10" < "5" is false), which is not a string ordering.
    $words = array();

    $docBody = $highlighter->getDocument()->getFieldUtf8Value('body');

    require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
    $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($docBody, 'UTF-8');

    $lowerTermText = ($this->_lowerTerm !== null)? $this->_lowerTerm->text : null;
    $upperTermText = ($this->_upperTerm !== null)? $this->_upperTerm->text : null;

    // Same loose check as before: a missing (or empty) boundary text means "unbounded".
    $hasLower = !($lowerTermText == null);
    $hasUpper = !($upperTermText == null);

    foreach ($tokens as $token) {
        $termText = $token->getTermText();

        if ($hasLower) {
            $order = strcmp($lowerTermText, $termText);
            // Reject tokens below the lower boundary, and the boundary itself when exclusive
            if ($order > 0 || ($order === 0 && !$this->_inclusive)) {
                continue;
            }
        }

        if ($hasUpper) {
            $order = strcmp($termText, $upperTermText);
            // Reject tokens above the upper boundary, and the boundary itself when exclusive
            if ($order > 0 || ($order === 0 && !$this->_inclusive)) {
                continue;
            }
        }

        $words[] = $termText;
    }

    $highlighter->highlight($words);
}
/**
* Print a query
*
* @return string
*/
public function __toString()
{
    // It's used only for query visualisation, so we don't care about characters escaping
    $query = '';
    if ($this->_field !== null) {
        $query .= $this->_field . ':';
    }

    // Square brackets denote an inclusive range, curly braces an exclusive one.
    $query .= ($this->_inclusive) ? '[' : '{';

    if ($this->_lowerTerm !== null) {
        $query .= $this->_lowerTerm->text;
    } else {
        $query .= 'null';
    }

    $query .= ' TO ';

    if ($this->_upperTerm !== null) {
        $query .= $this->_upperTerm->text;
    } else {
        $query .= 'null';
    }

    $query .= ($this->_inclusive) ? ']' : '}';

    // The boost suffix is printed only when it differs from 1.
    if ($this->getBoost() != 1) {
        $query .= '^' . round($this->getBoost(), 4);
    }

    return $query;
}
}

View File

@ -0,0 +1,228 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Term.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Term extends Zend_Search_Lucene_Search_Query
{
    /**
     * Term this query looks for.
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_term;

    /**
     * Matched documents, keyed by document id.
     * Stays null until execute() has been called.
     *
     * @var array
     */
    private $_docVector = null;

    /**
     * Term frequencies as returned by the reader.
     * array(docId => freq, ...)
     *
     * @var array
     */
    private $_termFreqs;

    /**
     * Zend_Search_Lucene_Search_Query_Term constructor
     *
     * @param Zend_Search_Lucene_Index_Term $term
     */
    public function __construct(Zend_Search_Lucene_Index_Term $term)
    {
        $this->_term = $term;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * A term that already names a field is returned unchanged; otherwise the term
     * text is searched in every indexed field through a multi-term query.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        if ($this->_term->field != null) {
            return $this;
        }

        require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
        $expanded = new Zend_Search_Lucene_Search_Query_MultiTerm();
        $expanded->setBoost($this->getBoost());

        require_once 'Zend/Search/Lucene/Index/Term.php';
        $indexedFields = $index->getFieldNames(true);
        foreach ($indexedFields as $fieldName) {
            $expanded->addTerm(new Zend_Search_Lucene_Index_Term($this->_term->text, $fieldName));
        }

        return $expanded->rewrite($index);
    }

    /**
     * Optimize query in the context of specified index
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        if ($index->hasTerm($this->_term)) {
            return $this;
        }

        // The index does not contain the term, so nothing can match.
        require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
        return new Zend_Search_Lucene_Search_Query_Empty();
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        require_once 'Zend/Search/Lucene/Search/Weight/Term.php';

        $weight = new Zend_Search_Lucene_Search_Weight_Term($this->_term, $this, $reader);
        $this->_weight = $weight;

        return $weight;
    }

    /**
     * Execute query in context of index reader
     * It also initializes necessary internal structures
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     */
    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
    {
        // Flip the id list so that document ids become the array keys.
        $matchedIds = $reader->termDocs($this->_term, $docsFilter);
        $this->_docVector = array_flip($matchedIds);

        $this->_termFreqs = $reader->termFreqs($this->_term, $docsFilter);

        // Initialize weight if it's not done yet
        $this->_initWeight($reader);
    }

    /**
     * Get document ids likely matching the query
     *
     * It's an array with document ids as keys (performance considerations)
     *
     * @return array
     */
    public function matchedDocs()
    {
        return $this->_docVector;
    }

    /**
     * Score specified document
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        if (!isset($this->_docVector[$docId])) {
            return 0;
        }

        $termFrequency = $reader->getSimilarity()->tf($this->_termFreqs[$docId]);
        $weightValue   = $this->_weight->getValue();
        $fieldNorm     = $reader->norm($docId, $this->_term->field);

        return $termFrequency * $weightValue * $fieldNorm * $this->getBoost();
    }

    /**
     * Return query terms
     *
     * @return array
     */
    public function getQueryTerms()
    {
        return array($this->_term);
    }

    /**
     * Return query term
     *
     * @return Zend_Search_Lucene_Index_Term
     */
    public function getTerm()
    {
        return $this->_term;
    }

    /**
     * Query specific matches highlighting
     *
     * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
     */
    protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
    {
        $highlighter->highlight($this->_term->text);
    }

    /**
     * Print a query
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        $fieldPrefix = ($this->_term->field !== null) ? $this->_term->field . ':' : '';
        $query       = $fieldPrefix . $this->_term->text;

        if ($this->getBoost() != 1) {
            $query .= '^' . round($this->getBoost(), 4);
        }

        return $query;
    }
}

View File

@ -0,0 +1,362 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Wildcard.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_Query */
require_once 'Zend/Search/Lucene/Search/Query.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Query_Wildcard extends Zend_Search_Lucene_Search_Query
{
    /**
     * Search pattern.
     *
     * Field has to be fully specified or has to be null
     * Text may contain '*' or '?' symbols
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_pattern;

    /**
     * Matched terms.
     *
     * Matched terms list.
     * It's filled during the search (rewrite operation) and may be used for search result
     * post-processing
     *
     * Array of Zend_Search_Lucene_Index_Term objects
     *
     * @var array
     */
    private $_matches = null;

    /**
     * Minimum term prefix length (number of minimum non-wildcard characters)
     *
     * @var integer
     */
    private static $_minPrefixLength = 3;

    /**
     * Zend_Search_Lucene_Search_Query_Wildcard constructor.
     *
     * @param Zend_Search_Lucene_Index_Term $pattern
     */
    public function __construct(Zend_Search_Lucene_Index_Term $pattern)
    {
        $this->_pattern = $pattern;
    }

    /**
     * Get minimum prefix length
     *
     * @return integer
     */
    public static function getMinPrefixLength()
    {
        return self::$_minPrefixLength;
    }

    /**
     * Set minimum prefix length
     *
     * @param integer $minPrefixLength
     */
    public static function setMinPrefixLength($minPrefixLength)
    {
        self::$_minPrefixLength = $minPrefixLength;
    }

    /**
     * Get terms prefix
     *
     * Returns the part of the word that precedes the first '?' or '*',
     * or the whole word when it contains no wildcard.
     *
     * @param string $word
     * @return string
     */
    private static function _getPrefix($word)
    {
        // strcspn() gives the length of the leading run without wildcard characters.
        // The cast keeps the result a string on PHP versions where substr() may return false.
        return (string) substr($word, 0, strcspn($word, '?*'));
    }

    /**
     * Build the anchored regular expression equivalent to the wildcard pattern
     *
     * '?' is translated to '.', '*' to '.*'; every other character is quoted.
     * The 'u' modifier is appended when PCRE was built with Unicode support.
     *
     * @return string
     */
    private function _buildMatchExpression()
    {
        $matchExpression = '/^' . str_replace(array('\\?', '\\*'), array('.', '.*') , preg_quote($this->_pattern->text, '/')) . '$/';

        /** @todo check for PCRE unicode support may be performed through Zend_Environment in some future */
        if (@preg_match('/\pL/u', 'a') == 1) {
            // PCRE unicode support is turned on
            // add Unicode modifier to the match expression
            $matchExpression .= 'u';
        }

        return $matchExpression;
    }

    /**
     * Re-write query into primitive queries in the context of specified index
     *
     * Walks the index terms that start with the pattern's non-wildcard prefix,
     * keeps those matching the whole pattern and returns an Empty, Term or
     * MultiTerm query built from them.
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception  when the prefix is shorter than the configured minimum
     *                                       or the terms per query limit is exceeded
     */
    public function rewrite(Zend_Search_Lucene_Interface $index)
    {
        $this->_matches = array();

        if ($this->_pattern->field === null) {
            // Search through all fields
            $fields = $index->getFieldNames(true /* indexed fields list */);
        } else {
            $fields = array($this->_pattern->field);
        }

        $prefix       = self::_getPrefix($this->_pattern->text);
        $prefixLength = strlen($prefix);

        if ($prefixLength < self::$_minPrefixLength) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('At least ' . self::$_minPrefixLength . ' non-wildcard characters are required at the beginning of pattern.');
        }

        $matchExpression = $this->_buildMatchExpression();

        // Loaded explicitly, as the range query does, instead of relying on the caller.
        require_once 'Zend/Search/Lucene.php';
        require_once 'Zend/Search/Lucene/Index/Term.php';
        $maxTerms = Zend_Search_Lucene::getTermsPerQueryLimit();

        foreach ($fields as $field) {
            $index->resetTermsStream();

            // With an empty prefix this positions the stream at the start of the field
            // and the prefix test below accepts every term (strncmp over 0 bytes is 0),
            // so a single loop covers both the prefixed and the unprefixed case.
            $index->skipTo(new Zend_Search_Lucene_Index_Term($prefix, $field));

            while (($current = $index->currentTerm()) !== null
                && $current->field == $field
                && strncmp($current->text, $prefix, $prefixLength) === 0
            ) {
                if (preg_match($matchExpression, $current->text) === 1) {
                    $this->_matches[] = $current;

                    if ($maxTerms != 0 && count($this->_matches) > $maxTerms) {
                        require_once 'Zend/Search/Lucene/Exception.php';
                        throw new Zend_Search_Lucene_Exception('Terms per query limit is reached.');
                    }
                }

                $index->nextTerm();
            }

            $index->closeTermsStream();
        }

        $matchCount = count($this->_matches);

        if ($matchCount == 0) {
            require_once 'Zend/Search/Lucene/Search/Query/Empty.php';
            return new Zend_Search_Lucene_Search_Query_Empty();
        }

        if ($matchCount == 1) {
            require_once 'Zend/Search/Lucene/Search/Query/Term.php';
            return new Zend_Search_Lucene_Search_Query_Term(reset($this->_matches));
        }

        require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
        $rewrittenQuery = new Zend_Search_Lucene_Search_Query_MultiTerm();
        foreach ($this->_matches as $matchedTerm) {
            $rewrittenQuery->addTerm($matchedTerm);
        }

        return $rewrittenQuery;
    }

    /**
     * Optimize query in the context of specified index
     *
     * @param Zend_Search_Lucene_Interface $index
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Exception  always; use the query returned by rewrite()
     */
    public function optimize(Zend_Search_Lucene_Interface $index)
    {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Returns query pattern
     *
     * @return Zend_Search_Lucene_Index_Term
     */
    public function getPattern()
    {
        return $this->_pattern;
    }

    /**
     * Return query terms
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception  when rewrite() has not been run yet
     */
    public function getQueryTerms()
    {
        if ($this->_matches === null) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Search has to be performed first to get matched terms');
        }

        return $this->_matches;
    }

    /**
     * Constructs an appropriate Weight implementation for this query.
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @return Zend_Search_Lucene_Search_Weight
     * @throws Zend_Search_Lucene_Exception
     */
    public function createWeight(Zend_Search_Lucene_Interface $reader)
    {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Execute query in context of index reader
     * It also initializes necessary internal structures
     *
     * @param Zend_Search_Lucene_Interface $reader
     * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
     * @throws Zend_Search_Lucene_Exception
     */
    public function execute(Zend_Search_Lucene_Interface $reader, $docsFilter = null)
    {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Get document ids likely matching the query
     *
     * It's an array with document ids as keys (performance considerations)
     *
     * @return array
     * @throws Zend_Search_Lucene_Exception
     */
    public function matchedDocs()
    {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Score specified document
     *
     * @param integer $docId
     * @param Zend_Search_Lucene_Interface $reader
     * @return float
     * @throws Zend_Search_Lucene_Exception
     */
    public function score($docId, Zend_Search_Lucene_Interface $reader)
    {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wildcard query should not be directly used for search. Use $query->rewrite($index)');
    }

    /**
     * Query specific matches highlighting
     *
     * Tokenizes the document's 'body' field and highlights every token whose
     * text matches the wildcard pattern.
     *
     * @param Zend_Search_Lucene_Search_Highlighter_Interface $highlighter  Highlighter object (also contains doc for highlighting)
     */
    protected function _highlightMatches(Zend_Search_Lucene_Search_Highlighter_Interface $highlighter)
    {
        $words = array();

        $matchExpression = $this->_buildMatchExpression();

        $docBody = $highlighter->getDocument()->getFieldUtf8Value('body');

        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
        $tokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($docBody, 'UTF-8');

        foreach ($tokens as $token) {
            $termText = $token->getTermText();
            if (preg_match($matchExpression, $termText) === 1) {
                $words[] = $termText;
            }
        }

        $highlighter->highlight($words);
    }

    /**
     * Print a query
     *
     * @return string
     */
    public function __toString()
    {
        // It's used only for query visualisation, so we don't care about characters escaping
        if ($this->_pattern->field !== null) {
            $query = $this->_pattern->field . ':';
        } else {
            $query = '';
        }

        $query .= $this->_pattern->text;

        if ($this->getBoost() != 1) {
            $query = $query . '^' . round($this->getBoost(), 4);
        }

        return $query;
    }
}

View File

@ -0,0 +1,67 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: QueryEntry.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Search_QueryEntry
{
    /**
     * Accumulated boost factor of this entry; starts at 1.0 and is
     * multiplied by every factor passed to boost().
     *
     * @var float
     */
    protected $_boost = 1.0;

    /**
     * Process modifier ('~')
     *
     * @param mixed $parameter
     */
    abstract public function processFuzzyProximityModifier($parameter = null);

    /**
     * Transform entry to a subquery
     *
     * @param string $encoding
     * @return Zend_Search_Lucene_Search_Query
     */
    abstract public function getQuery($encoding);

    /**
     * Boost query entry
     *
     * Factors are cumulative: calling boost() twice multiplies both factors in.
     *
     * @param float $boostFactor
     */
    public function boost($boostFactor)
    {
        $this->_boost = $this->_boost * $boostFactor;
    }
}

View File

@ -0,0 +1,116 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Phrase.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_QueryEntry */
require_once 'Zend/Search/Lucene/Search/QueryEntry.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryEntry_Phrase extends Zend_Search_Lucene_Search_QueryEntry
{
    /**
     * Phrase value
     *
     * @var string
     */
    private $_phrase;

    /**
     * Field
     *
     * @var string|null
     */
    private $_field;

    /**
     * Set once a '~' modifier has been applied to the phrase
     *
     * @var boolean
     */
    private $_proximityQuery = false;

    /**
     * Words distance, used for proximity queries
     *
     * @var integer
     */
    private $_wordsDistance = 0;

    /**
     * Object constructor
     *
     * @param string $phrase
     * @param string $field
     */
    public function __construct($phrase, $field)
    {
        $this->_field  = $field;
        $this->_phrase = $phrase;
    }

    /**
     * Process modifier ('~')
     *
     * Marks the phrase as a proximity query; a non-null parameter
     * replaces the current words distance.
     *
     * @param mixed $parameter
     */
    public function processFuzzyProximityModifier($parameter = null)
    {
        $this->_proximityQuery = true;

        if ($parameter === null) {
            return;
        }

        $this->_wordsDistance = $parameter;
    }

    /**
     * Transform entry to a subquery
     *
     * @param string $encoding
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Search_QueryParserException
     */
    public function getQuery($encoding)
    {
        /** Zend_Search_Lucene_Search_Query_Preprocessing_Phrase */
        require_once 'Zend/Search/Lucene/Search/Query/Preprocessing/Phrase.php';

        // The field name is converted to UTF-8; the phrase is passed on with its encoding.
        $utf8Field = null;
        if ($this->_field !== null) {
            $utf8Field = iconv($encoding, 'UTF-8', $this->_field);
        }

        $query = new Zend_Search_Lucene_Search_Query_Preprocessing_Phrase($this->_phrase, $encoding, $utf8Field);

        if ($this->_proximityQuery) {
            $query->setSlop($this->_wordsDistance);
        }

        $query->setBoost($this->_boost);

        return $query;
    }
}

View File

@ -0,0 +1,77 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Subquery.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_QueryEntry */
require_once 'Zend/Search/Lucene/Search/QueryEntry.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryEntry_Subquery extends Zend_Search_Lucene_Search_QueryEntry
{
    /**
     * Wrapped query
     *
     * @var Zend_Search_Lucene_Search_Query
     */
    private $_query;

    /**
     * Object constructor
     *
     * @param Zend_Search_Lucene_Search_Query $query
     */
    public function __construct(Zend_Search_Lucene_Search_Query $query)
    {
        $this->_query = $query;
    }

    /**
     * Process modifier ('~')
     *
     * Not supported for subqueries: always throws.
     *
     * @param mixed $parameter
     * @throws Zend_Search_Lucene_Search_QueryParserException
     */
    public function processFuzzyProximityModifier($parameter = null)
    {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';

        $message = '\'~\' sign must follow term or phrase';
        throw new Zend_Search_Lucene_Search_QueryParserException($message);
    }

    /**
     * Transform entry to a subquery
     *
     * Applies the entry's boost to the wrapped query and returns that same query object.
     *
     * @param string $encoding
     * @return Zend_Search_Lucene_Search_Query
     */
    public function getQuery($encoding)
    {
        $query = $this->_query;
        $query->setBoost($this->_boost);

        return $query;
    }
}

View File

@ -0,0 +1,130 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Term.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_QueryEntry */
require_once 'Zend/Search/Lucene/Search/QueryEntry.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryEntry_Term extends Zend_Search_Lucene_Search_QueryEntry
{
    /**
     * Term value
     *
     * @var string
     */
    private $_term;

    /**
     * Field
     *
     * @var string|null
     */
    private $_field;

    /**
     * Set once a '~' modifier has been applied to the term
     *
     * @var boolean
     */
    private $_fuzzyQuery = false;

    /**
     * Similarity
     *
     * @var float
     */
    private $_similarity = 1.;

    /**
     * Object constructor
     *
     * @param string $term
     * @param string $field
     */
    public function __construct($term, $field)
    {
        $this->_field = $field;
        $this->_term  = $term;
    }

    /**
     * Process modifier ('~')
     *
     * Marks the term as a fuzzy query. The similarity is the given parameter,
     * or the fuzzy query's default minimum similarity when none is given.
     *
     * @param mixed $parameter
     */
    public function processFuzzyProximityModifier($parameter = null)
    {
        $this->_fuzzyQuery = true;

        if ($parameter === null) {
            /** Zend_Search_Lucene_Search_Query_Fuzzy */
            require_once 'Zend/Search/Lucene/Search/Query/Fuzzy.php';
            $parameter = Zend_Search_Lucene_Search_Query_Fuzzy::DEFAULT_MIN_SIMILARITY;
        }

        $this->_similarity = $parameter;
    }

    /**
     * Transform entry to a subquery
     *
     * @param string $encoding
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Search_QueryParserException
     */
    public function getQuery($encoding)
    {
        // The field name is converted to UTF-8; the term is passed on with its encoding.
        $utf8Field = null;
        if ($this->_field !== null) {
            $utf8Field = iconv($encoding, 'UTF-8', $this->_field);
        }

        if ($this->_fuzzyQuery) {
            /** Zend_Search_Lucene_Search_Query_Preprocessing_Fuzzy */
            require_once 'Zend/Search/Lucene/Search/Query/Preprocessing/Fuzzy.php';
            $query = new Zend_Search_Lucene_Search_Query_Preprocessing_Fuzzy(
                $this->_term,
                $encoding,
                $utf8Field,
                $this->_similarity
            );
        } else {
            /** Zend_Search_Lucene_Search_Query_Preprocessing_Term */
            require_once 'Zend/Search/Lucene/Search/Query/Preprocessing/Term.php';
            $query = new Zend_Search_Lucene_Search_Query_Preprocessing_Term(
                $this->_term,
                $encoding,
                $utf8Field
            );
        }

        $query->setBoost($this->_boost);

        return $query;
    }
}

View File

@ -0,0 +1,110 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: QueryHit.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryHit
{
    /**
     * Proxy wrapped around the index that produced this hit
     * @var Zend_Search_Lucene_Interface
     */
    protected $_index = null;

    /**
     * Document associated with this hit; loaded on first use by getDocument()
     * @var Zend_Search_Lucene_Document
     */
    protected $_document = null;

    /**
     * Number of the document in the index
     * @var integer
     */
    public $id;

    /**
     * Score of the hit
     * @var float
     */
    public $score;

    /**
     * Constructor - pass object handle of Zend_Search_Lucene_Interface index that produced
     * the hit so the document can be retrieved easily from the hit.
     *
     * @param Zend_Search_Lucene_Interface $index
     */
    public function __construct(Zend_Search_Lucene_Interface $index)
    {
        require_once 'Zend/Search/Lucene/Proxy.php';

        $proxy = new Zend_Search_Lucene_Proxy($index);
        $this->_index = $proxy;
    }

    /**
     * Convenience function for getting fields from the document
     * associated with this hit.
     *
     * @param string $offset  name of the document field
     * @return string
     */
    public function __get($offset)
    {
        $document = $this->getDocument();

        return $document->getFieldValue($offset);
    }

    /**
     * Return the document object for this hit
     *
     * The document is fetched from the index by $this->id on the first call
     * and reused afterwards.
     *
     * @return Zend_Search_Lucene_Document
     */
    public function getDocument()
    {
        if ($this->_document instanceof Zend_Search_Lucene_Document) {
            return $this->_document;
        }

        $this->_document = $this->_index->getDocument($this->id);

        return $this->_document;
    }

    /**
     * Return the index object for this hit
     *
     * @return Zend_Search_Lucene_Interface
     */
    public function getIndex()
    {
        return $this->_index;
    }
}

View File

@ -0,0 +1,510 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: QueryLexer.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_FSM */
require_once 'Zend/Search/Lucene/FSM.php';
/** Zend_Search_Lucene_Search_QueryToken */
require_once 'Zend/Search/Lucene/Search/QueryToken.php';
/**
 * Lexer for the search query language.
 *
 * A finite state machine that reads a query string character by character
 * and produces a list of Zend_Search_Lucene_Search_QueryToken objects
 * (words, phrases, numbers and syntax elements).
 *
 * @category Zend
 * @package Zend_Search_Lucene
 * @subpackage Search
 * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license http://framework.zend.com/license/new-bsd New BSD License
 */
class Zend_Search_Lucene_Search_QueryLexer extends Zend_Search_Lucene_FSM
{
    /** State Machine states */
    const ST_WHITE_SPACE = 0;
    const ST_SYNT_LEXEME = 1;
    const ST_LEXEME = 2;
    const ST_QUOTED_LEXEME = 3;
    const ST_ESCAPED_CHAR = 4;
    const ST_ESCAPED_QCHAR = 5;
    const ST_LEXEME_MODIFIER = 6;
    const ST_NUMBER = 7;
    const ST_MANTISSA = 8;
    const ST_ERROR = 9;

    /** Input symbols */
    const IN_WHITE_SPACE = 0;
    const IN_SYNT_CHAR = 1;
    const IN_LEXEME_MODIFIER = 2;
    const IN_ESCAPE_CHAR = 3;
    const IN_QUOTE = 4;
    const IN_DECIMAL_POINT = 5;
    const IN_ASCII_DIGIT = 6;
    const IN_CHAR = 7;
    const IN_MUTABLE_CHAR = 8;

    /** Character classes used by _translateInput() */
    const QUERY_WHITE_SPACE_CHARS = " \n\r\t";
    const QUERY_SYNT_CHARS = ':()[]{}!|&';
    const QUERY_MUTABLE_CHARS = '+-';
    const QUERY_DOUBLECHARLEXEME_CHARS = '|&';
    const QUERY_LEXEMEMODIFIER_CHARS = '~^';
    const QUERY_ASCIIDIGITS_CHARS = '0123456789';

    /**
     * List of recognized lexemes
     *
     * @var array
     */
    private $_lexemes;

    /**
     * Query string (array of single- or non single-byte characters)
     *
     * @var array
     */
    private $_queryString;

    /**
     * Current position within a query string
     * Used to create appropriate error messages
     *
     * @var integer
     */
    private $_queryStringPosition;

    /**
     * Recognized part of current lexeme
     *
     * @var string
     */
    private $_currentLexeme;

    /**
     * Build the state machine: register states and input symbols, the
     * transition rules for every state, and the actions that collect
     * characters into lexemes or raise syntax errors.
     */
    public function __construct()
    {
        parent::__construct( array(self::ST_WHITE_SPACE,
                                   self::ST_SYNT_LEXEME,
                                   self::ST_LEXEME,
                                   self::ST_QUOTED_LEXEME,
                                   self::ST_ESCAPED_CHAR,
                                   self::ST_ESCAPED_QCHAR,
                                   self::ST_LEXEME_MODIFIER,
                                   self::ST_NUMBER,
                                   self::ST_MANTISSA,
                                   self::ST_ERROR),
                             array(self::IN_WHITE_SPACE,
                                   self::IN_SYNT_CHAR,
                                   self::IN_MUTABLE_CHAR,
                                   self::IN_LEXEME_MODIFIER,
                                   self::IN_ESCAPE_CHAR,
                                   self::IN_QUOTE,
                                   self::IN_DECIMAL_POINT,
                                   self::IN_ASCII_DIGIT,
                                   self::IN_CHAR));

        // Actions attached directly to the rules that lead into ST_ERROR
        $lexemeModifierErrorAction = new Zend_Search_Lucene_FSMAction($this, 'lexModifierErrException');
        $quoteWithinLexemeErrorAction = new Zend_Search_Lucene_FSMAction($this, 'quoteWithinLexemeErrException');
        $wrongNumberErrorAction = new Zend_Search_Lucene_FSMAction($this, 'wrongNumberErrException');

        $this->addRules(array( array(self::ST_WHITE_SPACE, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE),
                               array(self::ST_WHITE_SPACE, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME),
                               array(self::ST_WHITE_SPACE, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME),
                               array(self::ST_WHITE_SPACE, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
                               array(self::ST_WHITE_SPACE, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_CHAR),
                               array(self::ST_WHITE_SPACE, self::IN_QUOTE, self::ST_QUOTED_LEXEME),
                               array(self::ST_WHITE_SPACE, self::IN_DECIMAL_POINT, self::ST_LEXEME),
                               array(self::ST_WHITE_SPACE, self::IN_ASCII_DIGIT, self::ST_LEXEME),
                               array(self::ST_WHITE_SPACE, self::IN_CHAR, self::ST_LEXEME)
                             ));
        $this->addRules(array( array(self::ST_SYNT_LEXEME, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE),
                               array(self::ST_SYNT_LEXEME, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME),
                               array(self::ST_SYNT_LEXEME, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME),
                               array(self::ST_SYNT_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
                               array(self::ST_SYNT_LEXEME, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_CHAR),
                               array(self::ST_SYNT_LEXEME, self::IN_QUOTE, self::ST_QUOTED_LEXEME),
                               array(self::ST_SYNT_LEXEME, self::IN_DECIMAL_POINT, self::ST_LEXEME),
                               array(self::ST_SYNT_LEXEME, self::IN_ASCII_DIGIT, self::ST_LEXEME),
                               array(self::ST_SYNT_LEXEME, self::IN_CHAR, self::ST_LEXEME)
                             ));
        $this->addRules(array( array(self::ST_LEXEME, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE),
                               array(self::ST_LEXEME, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME),
                               array(self::ST_LEXEME, self::IN_MUTABLE_CHAR, self::ST_LEXEME),
                               array(self::ST_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
                               array(self::ST_LEXEME, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_CHAR),
                               // IN_QUOTE not allowed
                               array(self::ST_LEXEME, self::IN_QUOTE, self::ST_ERROR, $quoteWithinLexemeErrorAction),
                               array(self::ST_LEXEME, self::IN_DECIMAL_POINT, self::ST_LEXEME),
                               array(self::ST_LEXEME, self::IN_ASCII_DIGIT, self::ST_LEXEME),
                               array(self::ST_LEXEME, self::IN_CHAR, self::ST_LEXEME)
                             ));
        $this->addRules(array( array(self::ST_QUOTED_LEXEME, self::IN_WHITE_SPACE, self::ST_QUOTED_LEXEME),
                               array(self::ST_QUOTED_LEXEME, self::IN_SYNT_CHAR, self::ST_QUOTED_LEXEME),
                               array(self::ST_QUOTED_LEXEME, self::IN_MUTABLE_CHAR, self::ST_QUOTED_LEXEME),
                               array(self::ST_QUOTED_LEXEME, self::IN_LEXEME_MODIFIER, self::ST_QUOTED_LEXEME),
                               array(self::ST_QUOTED_LEXEME, self::IN_ESCAPE_CHAR, self::ST_ESCAPED_QCHAR),
                               array(self::ST_QUOTED_LEXEME, self::IN_QUOTE, self::ST_WHITE_SPACE),
                               array(self::ST_QUOTED_LEXEME, self::IN_DECIMAL_POINT, self::ST_QUOTED_LEXEME),
                               array(self::ST_QUOTED_LEXEME, self::IN_ASCII_DIGIT, self::ST_QUOTED_LEXEME),
                               array(self::ST_QUOTED_LEXEME, self::IN_CHAR, self::ST_QUOTED_LEXEME)
                             ));
        $this->addRules(array( array(self::ST_ESCAPED_CHAR, self::IN_WHITE_SPACE, self::ST_LEXEME),
                               array(self::ST_ESCAPED_CHAR, self::IN_SYNT_CHAR, self::ST_LEXEME),
                               array(self::ST_ESCAPED_CHAR, self::IN_MUTABLE_CHAR, self::ST_LEXEME),
                               array(self::ST_ESCAPED_CHAR, self::IN_LEXEME_MODIFIER, self::ST_LEXEME),
                               array(self::ST_ESCAPED_CHAR, self::IN_ESCAPE_CHAR, self::ST_LEXEME),
                               array(self::ST_ESCAPED_CHAR, self::IN_QUOTE, self::ST_LEXEME),
                               array(self::ST_ESCAPED_CHAR, self::IN_DECIMAL_POINT, self::ST_LEXEME),
                               array(self::ST_ESCAPED_CHAR, self::IN_ASCII_DIGIT, self::ST_LEXEME),
                               array(self::ST_ESCAPED_CHAR, self::IN_CHAR, self::ST_LEXEME)
                             ));
        $this->addRules(array( array(self::ST_ESCAPED_QCHAR, self::IN_WHITE_SPACE, self::ST_QUOTED_LEXEME),
                               array(self::ST_ESCAPED_QCHAR, self::IN_SYNT_CHAR, self::ST_QUOTED_LEXEME),
                               array(self::ST_ESCAPED_QCHAR, self::IN_MUTABLE_CHAR, self::ST_QUOTED_LEXEME),
                               array(self::ST_ESCAPED_QCHAR, self::IN_LEXEME_MODIFIER, self::ST_QUOTED_LEXEME),
                               array(self::ST_ESCAPED_QCHAR, self::IN_ESCAPE_CHAR, self::ST_QUOTED_LEXEME),
                               array(self::ST_ESCAPED_QCHAR, self::IN_QUOTE, self::ST_QUOTED_LEXEME),
                               array(self::ST_ESCAPED_QCHAR, self::IN_DECIMAL_POINT, self::ST_QUOTED_LEXEME),
                               array(self::ST_ESCAPED_QCHAR, self::IN_ASCII_DIGIT, self::ST_QUOTED_LEXEME),
                               array(self::ST_ESCAPED_QCHAR, self::IN_CHAR, self::ST_QUOTED_LEXEME)
                             ));
        $this->addRules(array( array(self::ST_LEXEME_MODIFIER, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE),
                               array(self::ST_LEXEME_MODIFIER, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME),
                               array(self::ST_LEXEME_MODIFIER, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME),
                               array(self::ST_LEXEME_MODIFIER, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
                               // IN_ESCAPE_CHAR not allowed
                               array(self::ST_LEXEME_MODIFIER, self::IN_ESCAPE_CHAR, self::ST_ERROR, $lexemeModifierErrorAction),
                               // IN_QUOTE not allowed
                               array(self::ST_LEXEME_MODIFIER, self::IN_QUOTE, self::ST_ERROR, $lexemeModifierErrorAction),
                               array(self::ST_LEXEME_MODIFIER, self::IN_DECIMAL_POINT, self::ST_MANTISSA),
                               array(self::ST_LEXEME_MODIFIER, self::IN_ASCII_DIGIT, self::ST_NUMBER),
                               // IN_CHAR not allowed
                               array(self::ST_LEXEME_MODIFIER, self::IN_CHAR, self::ST_ERROR, $lexemeModifierErrorAction),
                             ));
        $this->addRules(array( array(self::ST_NUMBER, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE),
                               array(self::ST_NUMBER, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME),
                               array(self::ST_NUMBER, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME),
                               array(self::ST_NUMBER, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
                               // IN_ESCAPE_CHAR not allowed
                               array(self::ST_NUMBER, self::IN_ESCAPE_CHAR, self::ST_ERROR, $wrongNumberErrorAction),
                               // IN_QUOTE not allowed
                               array(self::ST_NUMBER, self::IN_QUOTE, self::ST_ERROR, $wrongNumberErrorAction),
                               array(self::ST_NUMBER, self::IN_DECIMAL_POINT, self::ST_MANTISSA),
                               array(self::ST_NUMBER, self::IN_ASCII_DIGIT, self::ST_NUMBER),
                               // IN_CHAR not allowed
                               array(self::ST_NUMBER, self::IN_CHAR, self::ST_ERROR, $wrongNumberErrorAction),
                             ));
        $this->addRules(array( array(self::ST_MANTISSA, self::IN_WHITE_SPACE, self::ST_WHITE_SPACE),
                               array(self::ST_MANTISSA, self::IN_SYNT_CHAR, self::ST_SYNT_LEXEME),
                               array(self::ST_MANTISSA, self::IN_MUTABLE_CHAR, self::ST_SYNT_LEXEME),
                               array(self::ST_MANTISSA, self::IN_LEXEME_MODIFIER, self::ST_LEXEME_MODIFIER),
                               // IN_ESCAPE_CHAR not allowed
                               array(self::ST_MANTISSA, self::IN_ESCAPE_CHAR, self::ST_ERROR, $wrongNumberErrorAction),
                               // IN_QUOTE not allowed
                               array(self::ST_MANTISSA, self::IN_QUOTE, self::ST_ERROR, $wrongNumberErrorAction),
                               // IN_DECIMAL_POINT not allowed
                               array(self::ST_MANTISSA, self::IN_DECIMAL_POINT, self::ST_ERROR, $wrongNumberErrorAction),
                               array(self::ST_MANTISSA, self::IN_ASCII_DIGIT, self::ST_MANTISSA),
                               // IN_CHAR not allowed
                               array(self::ST_MANTISSA, self::IN_CHAR, self::ST_ERROR, $wrongNumberErrorAction),
                             ));

        /** Actions */
        $syntaxLexemeAction = new Zend_Search_Lucene_FSMAction($this, 'addQuerySyntaxLexeme');
        $lexemeModifierAction = new Zend_Search_Lucene_FSMAction($this, 'addLexemeModifier');
        $addLexemeAction = new Zend_Search_Lucene_FSMAction($this, 'addLexeme');
        $addQuotedLexemeAction = new Zend_Search_Lucene_FSMAction($this, 'addQuotedLexeme');
        $addNumberLexemeAction = new Zend_Search_Lucene_FSMAction($this, 'addNumberLexeme');
        $addLexemeCharAction = new Zend_Search_Lucene_FSMAction($this, 'addLexemeChar');

        /** Syntax lexeme */
        $this->addEntryAction(self::ST_SYNT_LEXEME, $syntaxLexemeAction);
        // Two lexemes in succession
        $this->addTransitionAction(self::ST_SYNT_LEXEME, self::ST_SYNT_LEXEME, $syntaxLexemeAction);

        /** Lexeme */
        $this->addEntryAction(self::ST_LEXEME, $addLexemeCharAction);
        $this->addTransitionAction(self::ST_LEXEME, self::ST_LEXEME, $addLexemeCharAction);
        // ST_ESCAPED_CHAR => ST_LEXEME transition is covered by ST_LEXEME entry action
        $this->addTransitionAction(self::ST_LEXEME, self::ST_WHITE_SPACE, $addLexemeAction);
        $this->addTransitionAction(self::ST_LEXEME, self::ST_SYNT_LEXEME, $addLexemeAction);
        $this->addTransitionAction(self::ST_LEXEME, self::ST_QUOTED_LEXEME, $addLexemeAction);
        $this->addTransitionAction(self::ST_LEXEME, self::ST_LEXEME_MODIFIER, $addLexemeAction);
        $this->addTransitionAction(self::ST_LEXEME, self::ST_NUMBER, $addLexemeAction);
        $this->addTransitionAction(self::ST_LEXEME, self::ST_MANTISSA, $addLexemeAction);

        /** Quoted lexeme */
        // We don't need entry action (skip quote)
        $this->addTransitionAction(self::ST_QUOTED_LEXEME, self::ST_QUOTED_LEXEME, $addLexemeCharAction);
        $this->addTransitionAction(self::ST_ESCAPED_QCHAR, self::ST_QUOTED_LEXEME, $addLexemeCharAction);
        // Closing quote changes state to the ST_WHITE_SPACE other states are not used
        $this->addTransitionAction(self::ST_QUOTED_LEXEME, self::ST_WHITE_SPACE, $addQuotedLexemeAction);

        /** Lexeme modifier */
        $this->addEntryAction(self::ST_LEXEME_MODIFIER, $lexemeModifierAction);

        /** Number */
        $this->addEntryAction(self::ST_NUMBER, $addLexemeCharAction);
        $this->addEntryAction(self::ST_MANTISSA, $addLexemeCharAction);
        $this->addTransitionAction(self::ST_NUMBER, self::ST_NUMBER, $addLexemeCharAction);
        // ST_NUMBER => ST_MANTISSA transition is covered by ST_MANTISSA entry action
        $this->addTransitionAction(self::ST_MANTISSA, self::ST_MANTISSA, $addLexemeCharAction);
        $this->addTransitionAction(self::ST_NUMBER, self::ST_WHITE_SPACE, $addNumberLexemeAction);
        $this->addTransitionAction(self::ST_NUMBER, self::ST_SYNT_LEXEME, $addNumberLexemeAction);
        $this->addTransitionAction(self::ST_NUMBER, self::ST_LEXEME_MODIFIER, $addNumberLexemeAction);
        $this->addTransitionAction(self::ST_MANTISSA, self::ST_WHITE_SPACE, $addNumberLexemeAction);
        $this->addTransitionAction(self::ST_MANTISSA, self::ST_SYNT_LEXEME, $addNumberLexemeAction);
        $this->addTransitionAction(self::ST_MANTISSA, self::ST_LEXEME_MODIFIER, $addNumberLexemeAction);
    }

    /**
     * Translate input char to an input symbol of state machine
     *
     * The character is looked up in the QUERY_*_CHARS classes first; quote,
     * decimal point and backslash are matched literally; anything else is
     * an ordinary character.
     *
     * @param string $char
     * @return integer  one of the IN_* constants
     */
    private function _translateInput($char)
    {
        if (strpos(self::QUERY_WHITE_SPACE_CHARS, $char) !== false) {
            return self::IN_WHITE_SPACE;
        } else if (strpos(self::QUERY_SYNT_CHARS, $char) !== false) {
            return self::IN_SYNT_CHAR;
        } else if (strpos(self::QUERY_MUTABLE_CHARS, $char) !== false) {
            return self::IN_MUTABLE_CHAR;
        } else if (strpos(self::QUERY_LEXEMEMODIFIER_CHARS, $char) !== false) {
            return self::IN_LEXEME_MODIFIER;
        } else if (strpos(self::QUERY_ASCIIDIGITS_CHARS, $char) !== false) {
            return self::IN_ASCII_DIGIT;
        } else if ($char === '"' ) {
            return self::IN_QUOTE;
        } else if ($char === '.' ) {
            return self::IN_DECIMAL_POINT;
        } else if ($char === '\\') {
            return self::IN_ESCAPE_CHAR;
        } else {
            return self::IN_CHAR;
        }
    }

    /**
     * This method is used to tokenize query string into lexemes
     *
     * The string is split into characters with iconv, each character is fed
     * to the state machine, and one extra white space symbol is fed at the
     * end so that a lexeme still being collected gets flushed.
     *
     * @param string $inputString
     * @param string $encoding
     * @return array  list of Zend_Search_Lucene_Search_QueryToken objects
     * @throws Zend_Search_Lucene_Search_QueryParserException
     */
    public function tokenize($inputString, $encoding)
    {
        $this->reset();

        $this->_lexemes = array();
        $this->_queryString = array();
        // A previous call may have been aborted by an exception in the middle
        // of a lexeme; drop that partial lexeme so it cannot leak into the
        // first token of this query.
        $this->_currentLexeme = '';

        if (PHP_OS == 'AIX' && $encoding == '') {
            $encoding = 'ISO8859-1';
        }
        $strLength = iconv_strlen($inputString, $encoding);

        // Workaround for iconv_substr bug
        $inputString .= ' ';

        for ($count = 0; $count < $strLength; $count++) {
            $this->_queryString[$count] = iconv_substr($inputString, $count, 1, $encoding);
        }

        // count() is re-evaluated on purpose: addQuerySyntaxLexeme() may
        // advance the position itself when it consumes a two-char lexeme.
        for ($this->_queryStringPosition = 0;
             $this->_queryStringPosition < count($this->_queryString);
             $this->_queryStringPosition++) {
            $this->process($this->_translateInput($this->_queryString[$this->_queryStringPosition]));
        }

        // Terminating white space flushes the last lexeme
        $this->process(self::IN_WHITE_SPACE);

        if ($this->getState() != self::ST_WHITE_SPACE) {
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('Unexpected end of query');
        }

        $this->_queryString = null;

        return $this->_lexemes;
    }

    /*********************************************************************
     * Actions implementation
     *
     * Actions affect on recognized lexemes list
     *********************************************************************/

    /**
     * Add query syntax lexeme
     *
     * '|' and '&' must be doubled; a ':' turns the preceding word token
     * into a field token instead of being stored itself.
     *
     * @throws Zend_Search_Lucene_Search_QueryParserException
     */
    public function addQuerySyntaxLexeme()
    {
        $lexeme = $this->_queryString[$this->_queryStringPosition];

        // Process two char lexemes
        if (strpos(self::QUERY_DOUBLECHARLEXEME_CHARS, $lexeme) !== false) {
            // increase current position in a query string
            $this->_queryStringPosition++;

            // check that the same character follows
            if ($this->_queryStringPosition == count($this->_queryString) ||
                $this->_queryString[$this->_queryStringPosition] != $lexeme) {
                require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
                throw new Zend_Search_Lucene_Search_QueryParserException('Two chars lexeme expected. ' . $this->_positionMsg());
            }

            // duplicate character
            $lexeme .= $lexeme;
        }

        $token = new Zend_Search_Lucene_Search_QueryToken(
                        Zend_Search_Lucene_Search_QueryToken::TC_SYNTAX_ELEMENT,
                        $lexeme,
                        $this->_queryStringPosition);

        // Skip this lexeme if it's a field indicator ':' and treat previous as 'field' instead of 'word'
        if ($token->type == Zend_Search_Lucene_Search_QueryToken::TT_FIELD_INDICATOR) {
            $token = array_pop($this->_lexemes);
            if ($token === null || $token->type != Zend_Search_Lucene_Search_QueryToken::TT_WORD) {
                require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
                throw new Zend_Search_Lucene_Search_QueryParserException('Field mark \':\' must follow field name. ' . $this->_positionMsg());
            }

            $token->type = Zend_Search_Lucene_Search_QueryToken::TT_FIELD;
        }

        $this->_lexemes[] = $token;
    }

    /**
     * Add lexeme modifier
     */
    public function addLexemeModifier()
    {
        $this->_lexemes[] = new Zend_Search_Lucene_Search_QueryToken(
                                    Zend_Search_Lucene_Search_QueryToken::TC_SYNTAX_ELEMENT,
                                    $this->_queryString[$this->_queryStringPosition],
                                    $this->_queryStringPosition);
    }

    /**
     * Add lexeme
     *
     * Stores the collected characters as a word token and starts a new lexeme.
     */
    public function addLexeme()
    {
        $this->_lexemes[] = new Zend_Search_Lucene_Search_QueryToken(
                                    Zend_Search_Lucene_Search_QueryToken::TC_WORD,
                                    $this->_currentLexeme,
                                    $this->_queryStringPosition - 1);

        $this->_currentLexeme = '';
    }

    /**
     * Add quoted lexeme
     *
     * Stores the collected characters as a phrase token and starts a new lexeme.
     */
    public function addQuotedLexeme()
    {
        $this->_lexemes[] = new Zend_Search_Lucene_Search_QueryToken(
                                    Zend_Search_Lucene_Search_QueryToken::TC_PHRASE,
                                    $this->_currentLexeme,
                                    $this->_queryStringPosition);

        $this->_currentLexeme = '';
    }

    /**
     * Add number lexeme
     *
     * Stores the collected characters as a number token and starts a new lexeme.
     */
    public function addNumberLexeme()
    {
        $this->_lexemes[] = new Zend_Search_Lucene_Search_QueryToken(
                                    Zend_Search_Lucene_Search_QueryToken::TC_NUMBER,
                                    $this->_currentLexeme,
                                    $this->_queryStringPosition - 1);

        $this->_currentLexeme = '';
    }

    /**
     * Extend lexeme by one char
     */
    public function addLexemeChar()
    {
        // tokenize() feeds one synthetic white space symbol after the last
        // character. If the machine is then still inside a quoted or escaped
        // lexeme, the position is past the end of the character array and
        // there is nothing to append; skipping avoids an undefined-offset
        // notice right before the "Unexpected end of query" exception.
        if (isset($this->_queryString[$this->_queryStringPosition])) {
            $this->_currentLexeme .= $this->_queryString[$this->_queryStringPosition];
        }
    }

    /**
     * Position message
     *
     * @return string
     */
    private function _positionMsg()
    {
        return 'Position is ' . $this->_queryStringPosition . '.';
    }

    /*********************************************************************
     * Syntax errors actions
     *********************************************************************/

    /**
     * Error action: a lexeme modifier is followed by a character that is not allowed there
     *
     * @throws Zend_Search_Lucene_Search_QueryParserException
     */
    public function lexModifierErrException()
    {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('Lexeme modifier character can be followed only by number, white space or query syntax element. ' . $this->_positionMsg());
    }

    /**
     * Error action: an unescaped quote appears inside a lexeme
     *
     * @throws Zend_Search_Lucene_Search_QueryParserException
     */
    public function quoteWithinLexemeErrException()
    {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('Quote within lexeme must be escaped by \'\\\' char. ' . $this->_positionMsg());
    }

    /**
     * Error action: a number contains a character that is not allowed there
     *
     * @throws Zend_Search_Lucene_Search_QueryParserException
     */
    public function wrongNumberErrException()
    {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        // Trailing space keeps the position text separated, like the other messages
        throw new Zend_Search_Lucene_Search_QueryParserException('Wrong number syntax. ' . $this->_positionMsg());
    }
}

View File

@ -0,0 +1,635 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: QueryParser.php 21638 2010-03-24 17:56:46Z alexander $
*/
/** Internally used classes */
/** Zend_Search_Lucene_Analysis_Analyzer */
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
/** Zend_Search_Lucene_Search_QueryToken */
require_once 'Zend/Search/Lucene/Search/QueryToken.php';
/** Zend_Search_Lucene_FSM */
require_once 'Zend/Search/Lucene/FSM.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryParser extends Zend_Search_Lucene_FSM
{
/**
* Parser instance
*
* @var Zend_Search_Lucene_Search_QueryParser
*/
private static $_instance = null;
/**
* Query lexer
*
* @var Zend_Search_Lucene_Search_QueryLexer
*/
private $_lexer;
/**
* Tokens list
* Array of Zend_Search_Lucene_Search_QueryToken objects
*
* @var array
*/
private $_tokens;
/**
* Current token
*
* @var Zend_Search_Lucene_Search_QueryToken
*/
private $_currentToken;
/**
* Last token
*
* It can be processed within FSM states, but this additional state simplifies FSM
*
* @var Zend_Search_Lucene_Search_QueryToken
*/
private $_lastToken = null;
/**
* Range query first term
*
* @var string
*/
private $_rqFirstTerm = null;
/**
* Current query parser context
*
* @var Zend_Search_Lucene_Search_QueryParserContext
*/
private $_context;
/**
* Context stack
*
* @var array
*/
private $_contextStack;
/**
* Query string encoding
*
* @var string
*/
private $_encoding;
/**
* Query string default encoding
*
* @var string
*/
private $_defaultEncoding = '';
/**
* Defines query parsing mode.
*
* If this option is turned on, then query parser suppress query parser exceptions
* and constructs multi-term query using all words from a query.
*
* That helps to avoid exceptions caused by queries, which don't conform to query language,
* but limits possibilities to check, that query entered by user has some inconsistencies.
*
*
* Default is true.
*
* Use {@link Zend_Search_Lucene_Search_QueryParser::suppressQueryParsingExceptions()},
* {@link Zend_Search_Lucene_Search_QueryParser::dontSuppressQueryParsingExceptions()} and
* {@link Zend_Search_Lucene_Search_QueryParser::queryParsingExceptionsSuppressed()} to operate
* with this setting.
*
* @var boolean
*/
private $_suppressQueryParsingExceptions = true;
/**
* Boolean operators constants
*/
const B_OR = 0;
const B_AND = 1;
/**
* Default boolean queries operator
*
* @var integer
*/
private $_defaultOperator = self::B_OR;
/** Query parser State Machine states */
const ST_COMMON_QUERY_ELEMENT = 0; // Terms, phrases, operators
const ST_CLOSEDINT_RQ_START = 1; // Range query start (closed interval) - '['
const ST_CLOSEDINT_RQ_FIRST_TERM = 2; // First term in '[term1 to term2]' construction
const ST_CLOSEDINT_RQ_TO_TERM = 3; // 'TO' lexeme in '[term1 to term2]' construction
const ST_CLOSEDINT_RQ_LAST_TERM = 4; // Second term in '[term1 to term2]' construction
const ST_CLOSEDINT_RQ_END = 5; // Range query end (closed interval) - ']'
const ST_OPENEDINT_RQ_START = 6; // Range query start (opened interval) - '{'
const ST_OPENEDINT_RQ_FIRST_TERM = 7; // First term in '{term1 to term2}' construction
const ST_OPENEDINT_RQ_TO_TERM = 8; // 'TO' lexeme in '{term1 to term2}' construction
const ST_OPENEDINT_RQ_LAST_TERM = 9; // Second term in '{term1 to term2}' construction
const ST_OPENEDINT_RQ_END = 10; // Range query end (opened interval) - '}'
/**
 * Parser constructor
 *
 * Configures the parser state machine: registers the parser states, uses the
 * query token types as input symbols, adds the transition rules for common
 * query elements and for both range query forms, binds the handler methods
 * to inputs and state entries, and creates the lexer used by parse().
 */
public function __construct()
{
// States of the machine; input symbols are the token types reported by QueryToken
parent::__construct(array(self::ST_COMMON_QUERY_ELEMENT,
self::ST_CLOSEDINT_RQ_START,
self::ST_CLOSEDINT_RQ_FIRST_TERM,
self::ST_CLOSEDINT_RQ_TO_TERM,
self::ST_CLOSEDINT_RQ_LAST_TERM,
self::ST_CLOSEDINT_RQ_END,
self::ST_OPENEDINT_RQ_START,
self::ST_OPENEDINT_RQ_FIRST_TERM,
self::ST_OPENEDINT_RQ_TO_TERM,
self::ST_OPENEDINT_RQ_LAST_TERM,
self::ST_OPENEDINT_RQ_END
),
Zend_Search_Lucene_Search_QueryToken::getTypes());
// Common query elements: every listed token keeps the parser in
// ST_COMMON_QUERY_ELEMENT, except the two range start tokens, which
// switch to the corresponding range query start state.
$this->addRules(
array(array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_WORD, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_PHRASE, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_FIELD, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_REQUIRED, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_PROHIBITED, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_FUZZY_PROX_MARK, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_BOOSTING_MARK, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_RANGE_INCL_START, self::ST_CLOSEDINT_RQ_START),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_RANGE_EXCL_START, self::ST_OPENEDINT_RQ_START),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_SUBQUERY_START, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_SUBQUERY_END, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_AND_LEXEME, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_OR_LEXEME, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_NOT_LEXEME, self::ST_COMMON_QUERY_ELEMENT),
array(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_NUMBER, self::ST_COMMON_QUERY_ELEMENT)
));
// Closed interval range query: start -> word -> TO -> word -> inclusive end,
// after which the parser is back in the common state.
// Note: ST_CLOSEDINT_RQ_END is registered as a state but no rule here leads to it.
$this->addRules(
array(array(self::ST_CLOSEDINT_RQ_START, Zend_Search_Lucene_Search_QueryToken::TT_WORD, self::ST_CLOSEDINT_RQ_FIRST_TERM),
array(self::ST_CLOSEDINT_RQ_FIRST_TERM, Zend_Search_Lucene_Search_QueryToken::TT_TO_LEXEME, self::ST_CLOSEDINT_RQ_TO_TERM),
array(self::ST_CLOSEDINT_RQ_TO_TERM, Zend_Search_Lucene_Search_QueryToken::TT_WORD, self::ST_CLOSEDINT_RQ_LAST_TERM),
array(self::ST_CLOSEDINT_RQ_LAST_TERM, Zend_Search_Lucene_Search_QueryToken::TT_RANGE_INCL_END, self::ST_COMMON_QUERY_ELEMENT)
));
// Opened interval range query: same shape, finished by the exclusive end token.
// Note: ST_OPENEDINT_RQ_END is registered as a state but no rule here leads to it.
$this->addRules(
array(array(self::ST_OPENEDINT_RQ_START, Zend_Search_Lucene_Search_QueryToken::TT_WORD, self::ST_OPENEDINT_RQ_FIRST_TERM),
array(self::ST_OPENEDINT_RQ_FIRST_TERM, Zend_Search_Lucene_Search_QueryToken::TT_TO_LEXEME, self::ST_OPENEDINT_RQ_TO_TERM),
array(self::ST_OPENEDINT_RQ_TO_TERM, Zend_Search_Lucene_Search_QueryToken::TT_WORD, self::ST_OPENEDINT_RQ_LAST_TERM),
array(self::ST_OPENEDINT_RQ_LAST_TERM, Zend_Search_Lucene_Search_QueryToken::TT_RANGE_EXCL_END, self::ST_COMMON_QUERY_ELEMENT)
));
// Each action wraps a method of this parser, referenced by name
$addTermEntryAction = new Zend_Search_Lucene_FSMAction($this, 'addTermEntry');
$addPhraseEntryAction = new Zend_Search_Lucene_FSMAction($this, 'addPhraseEntry');
$setFieldAction = new Zend_Search_Lucene_FSMAction($this, 'setField');
$setSignAction = new Zend_Search_Lucene_FSMAction($this, 'setSign');
$setFuzzyProxAction = new Zend_Search_Lucene_FSMAction($this, 'processFuzzyProximityModifier');
$processModifierParameterAction = new Zend_Search_Lucene_FSMAction($this, 'processModifierParameter');
$subqueryStartAction = new Zend_Search_Lucene_FSMAction($this, 'subqueryStart');
$subqueryEndAction = new Zend_Search_Lucene_FSMAction($this, 'subqueryEnd');
$logicalOperatorAction = new Zend_Search_Lucene_FSMAction($this, 'logicalOperator');
$openedRQFirstTermAction = new Zend_Search_Lucene_FSMAction($this, 'openedRQFirstTerm');
$openedRQLastTermAction = new Zend_Search_Lucene_FSMAction($this, 'openedRQLastTerm');
$closedRQFirstTermAction = new Zend_Search_Lucene_FSMAction($this, 'closedRQFirstTerm');
$closedRQLastTermAction = new Zend_Search_Lucene_FSMAction($this, 'closedRQLastTerm');
// Input actions for the common state, selected by token type.
// TT_BOOSTING_MARK and the range start tokens have rules above but no input action here.
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_WORD, $addTermEntryAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_PHRASE, $addPhraseEntryAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_FIELD, $setFieldAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_REQUIRED, $setSignAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_PROHIBITED, $setSignAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_FUZZY_PROX_MARK, $setFuzzyProxAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_NUMBER, $processModifierParameterAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_SUBQUERY_START, $subqueryStartAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_SUBQUERY_END, $subqueryEndAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_AND_LEXEME, $logicalOperatorAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_OR_LEXEME, $logicalOperatorAction);
$this->addInputAction(self::ST_COMMON_QUERY_ELEMENT, Zend_Search_Lucene_Search_QueryToken::TT_NOT_LEXEME, $logicalOperatorAction);
// Range query terms are handled when their state is entered
$this->addEntryAction(self::ST_OPENEDINT_RQ_FIRST_TERM, $openedRQFirstTermAction);
$this->addEntryAction(self::ST_OPENEDINT_RQ_LAST_TERM, $openedRQLastTermAction);
$this->addEntryAction(self::ST_CLOSEDINT_RQ_FIRST_TERM, $closedRQFirstTermAction);
$this->addEntryAction(self::ST_CLOSEDINT_RQ_LAST_TERM, $closedRQLastTermAction);
// Lexer instance kept by this parser; parse() calls its tokenize()
require_once 'Zend/Search/Lucene/Search/QueryLexer.php';
$this->_lexer = new Zend_Search_Lucene_Search_QueryLexer();
}
/**
 * Return the shared parser object, creating it on the first call.
 *
 * @return Zend_Search_Lucene_Search_QueryParser
 */
private static function _getInstance()
{
    if (self::$_instance !== null) {
        return self::$_instance;
    }

    self::$_instance = new self();

    return self::$_instance;
}
/**
 * Define the encoding that parse() assumes when it is called without one.
 *
 * @param string $encoding
 */
public static function setDefaultEncoding($encoding)
{
    $parser = self::_getInstance();
    $parser->_defaultEncoding = $encoding;
}
/**
 * Report the encoding that parse() assumes when it is called without one.
 *
 * @return string
 */
public static function getDefaultEncoding()
{
    $parser = self::_getInstance();

    return $parser->_defaultEncoding;
}
/**
 * Choose the default boolean operator of the shared parser.
 *
 * @param integer $operator  self::B_OR or self::B_AND
 */
public static function setDefaultOperator($operator)
{
    $parser = self::_getInstance();
    $parser->_defaultOperator = $operator;
}
/**
 * Report the default boolean operator of the shared parser.
 *
 * @return integer  self::B_OR (initial value) or self::B_AND
 */
public static function getDefaultOperator()
{
    $parser = self::_getInstance();

    return $parser->_defaultOperator;
}
/**
 * Switch the shared parser into 'suppress query parser exceptions' mode.
 */
public static function suppressQueryParsingExceptions()
{
    $parser = self::_getInstance();
    $parser->_suppressQueryParsingExceptions = true;
}
/**
 * Switch the shared parser out of 'suppress query parser exceptions' mode.
 */
public static function dontSuppressQueryParsingExceptions()
{
    $parser = self::_getInstance();
    $parser->_suppressQueryParsingExceptions = false;
}
/**
 * Tell whether 'suppress query parser exceptions' mode is currently on.
 *
 * @return boolean
 */
public static function queryParsingExceptionsSuppressed()
{
    $parser = self::_getInstance();

    return $parser->_suppressQueryParsingExceptions;
}
/**
 * Escape keyword to force it to be parsed as one term
 *
 * A backslash is placed in front of every byte of the keyword. An empty
 * keyword is returned unchanged: escaping it used to produce a lone
 * backslash, i.e. an escape character with nothing to escape.
 *
 * NOTE(review): the split is byte-wise (str_split), so a multi-byte
 * character gets a backslash in front of each of its bytes - confirm
 * whether callers pass multi-byte keywords.
 *
 * @param string $keyword
 * @return string
 */
public static function escape($keyword)
{
    $keyword = (string) $keyword;

    if ($keyword === '') {
        return '';
    }

    return '\\' . implode('\\', str_split($keyword));
}
/**
* Parses a query string
*
* @param string $strQuery
* @param string $encoding
* @return Zend_Search_Lucene_Search_Query
* @throws Zend_Search_Lucene_Search_QueryParserException
*/
public static function parse($strQuery, $encoding = null)
{
    // Parses $strQuery and returns the resulting query object.
    //
    // $encoding overrides the default query string encoding for this call
    // only; null means "use the default".
    //
    // Query parser exceptions are either re-thrown as
    // Zend_Search_Lucene_Exception or, when suppression is switched on,
    // answered with a multi-term query built from the analyzer's tokens.

    $parser = self::_getInstance();

    // Reset FSM if previous parse operation didn't return it into a correct state
    $parser->reset();

    require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
    try {
        require_once 'Zend/Search/Lucene/Search/QueryParserContext.php';

        $parser->_encoding     = ($encoding !== null) ? $encoding : $parser->_defaultEncoding;
        $parser->_lastToken    = null;
        $parser->_context      = new Zend_Search_Lucene_Search_QueryParserContext($parser->_encoding);
        $parser->_contextStack = array();
        $parser->_tokens       = $parser->_lexer->tokenize($strQuery, $parser->_encoding);

        // Empty query
        if (count($parser->_tokens) == 0) {
            require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
            return new Zend_Search_Lucene_Search_Query_Insignificant();
        }

        foreach ($parser->_tokens as $token) {
            try {
                $parser->_currentToken = $token;
                $parser->process($token->type);

                $parser->_lastToken = $token;
            } catch (Zend_Search_Lucene_Search_QueryParserException $e) {
                // A parser action already classified this as bad user input.
                // Pass it on untouched so the outer handler (and the
                // suppression mode) can deal with it; rewrapping it in the
                // generic handler below would bypass both.
                throw $e;
            } catch (Exception $e) {
                // The state machine signals "no transition for this token" only
                // through its message text, so it is recognized by substring.
                if (strpos($e->getMessage(), 'There is no any rule for') !== false) {
                    throw new Zend_Search_Lucene_Search_QueryParserException( 'Syntax error at char position ' . $token->position . '.', 0, $e);
                }

                // Anything else is not a query syntax problem.
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
            }
        }

        // Every subqueryStart() pushes a context and every subqueryEnd() pops
        // one, so leftovers mean an unclosed parenthesis.
        if (count($parser->_contextStack) != 0) {
            throw new Zend_Search_Lucene_Search_QueryParserException('Syntax Error: mismatched parentheses, every opening must have closing.' );
        }

        return $parser->_context->getQuery();
    } catch (Zend_Search_Lucene_Search_QueryParserException $e) {
        if ($parser->_suppressQueryParsingExceptions) {
            // Fallback: ignore the query syntax and treat the whole string as
            // a list of terms produced by the default analyzer.
            $queryTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($strQuery, $parser->_encoding);

            require_once 'Zend/Search/Lucene/Search/Query/MultiTerm.php';
            $query = new Zend_Search_Lucene_Search_Query_MultiTerm();
            $termsSign = ($parser->_defaultOperator == self::B_AND) ? true /* required term */ :
                                                                      null /* optional term */;

            require_once 'Zend/Search/Lucene/Index/Term.php';
            foreach ($queryTokens as $token) {
                $query->addTerm(new Zend_Search_Lucene_Index_Term($token->getTermText()), $termsSign);
            }

            return $query;
        } else {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
        }
    }
}
/*********************************************************************
* Actions implementation
*
* Actions operate on the list of recognized lexemes
*********************************************************************/
/**
* Add term to a query
*/
public function addTermEntry()
{
    require_once 'Zend/Search/Lucene/Search/QueryEntry/Term.php';

    // Wrap the current token's text in a term entry bound to the field the
    // context currently reports, then append it to the active context.
    $termText  = $this->_currentToken->text;
    $fieldName = $this->_context->getField();

    $this->_context->addEntry(new Zend_Search_Lucene_Search_QueryEntry_Term($termText, $fieldName));
}
/**
* Add phrase to a query
*/
public function addPhraseEntry()
{
    require_once 'Zend/Search/Lucene/Search/QueryEntry/Phrase.php';

    // Wrap the current token's text in a phrase entry bound to the field the
    // context currently reports, then append it to the active context.
    $phraseText = $this->_currentToken->text;
    $fieldName  = $this->_context->getField();

    $this->_context->addEntry(new Zend_Search_Lucene_Search_QueryEntry_Phrase($phraseText, $fieldName));
}
/**
* Set entry field
*/
public function setField()
{
    // The current token's text is handed to the context as the field for the
    // next entry.
    $fieldName = $this->_currentToken->text;

    $this->_context->setNextEntryField($fieldName);
}
/**
* Set entry sign
*/
public function setSign()
{
    // The context receives the token *type* (not its text) as the sign for
    // the next entry.
    $signType = $this->_currentToken->type;

    $this->_context->setNextEntrySign($signType);
}
/**
* Process fuzzy search/proximity modifier - '~'
*/
public function processFuzzyProximityModifier()
{
    // Delegates to the active context without passing a parameter; the
    // parameterized call is made from processModifierParameter().
    $activeContext = $this->_context;

    $activeContext->processFuzzyProximityModifier();
}
/**
* Process modifier parameter
*
* @throws Zend_Search_Lucene_Exception
*/
public function processModifierParameter()
{
    // Routes the current token's text to the modifier named by the previous
    // token: fuzzy/proximity or boost.
    $previousToken = $this->_lastToken;

    // No previous token means there is no modifier this parameter could
    // belong to; that is reported as a query syntax error.
    if ($previousToken === null) {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('Lexeme modifier parameter must follow lexeme modifier. Char position 0.' );
    }

    if ($previousToken->type == Zend_Search_Lucene_Search_QueryToken::TT_FUZZY_PROX_MARK) {
        $this->_context->processFuzzyProximityModifier($this->_currentToken->text);
        return;
    }

    if ($previousToken->type == Zend_Search_Lucene_Search_QueryToken::TT_BOOSTING_MARK) {
        $this->_context->boost($this->_currentToken->text);
        return;
    }

    // It's not a user input exception
    require_once 'Zend/Search/Lucene/Exception.php';
    throw new Zend_Search_Lucene_Exception('Lexeme modifier parameter must follow lexeme modifier. Char position 0.' );
}
/**
* Start subquery
*/
public function subqueryStart()
{
    require_once 'Zend/Search/Lucene/Search/QueryParserContext.php';

    // Park the enclosing context on the stack, then open a nested context
    // that takes over the query encoding and the enclosing context's
    // current field.
    $enclosingContext      = $this->_context;
    $this->_contextStack[] = $enclosingContext;

    $inheritedField = $enclosingContext->getField();
    $this->_context = new Zend_Search_Lucene_Search_QueryParserContext($this->_encoding, $inheritedField);
}
/**
* End subquery
*/
public function subqueryEnd()
{
    // An empty stack means this closing parenthesis has no opening partner.
    if (count($this->_contextStack) < 1) {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('Syntax Error: mismatched parentheses, every opening must have closing. Char position ' . $this->_currentToken->position . '.' );
    }

    // Finish the nested context first, then step back out to its parent and
    // attach the finished query there as a single subquery entry.
    $nestedQuery    = $this->_context->getQuery();
    $this->_context = array_pop($this->_contextStack);

    require_once 'Zend/Search/Lucene/Search/QueryEntry/Subquery.php';
    $subqueryEntry = new Zend_Search_Lucene_Search_QueryEntry_Subquery($nestedQuery);
    $this->_context->addEntry($subqueryEntry);
}
/**
* Process logical operator
*/
public function logicalOperator()
{
    // The context receives the token *type* of the operator, not its text.
    $operatorType = $this->_currentToken->type;

    $this->_context->addLogicalOperator($operatorType);
}
/**
* Process first range query term (opened interval)
*/
public function openedRQFirstTerm()
{
    // Remember the first boundary text; openedRQLastTerm() reads it back
    // when it builds the range query.
    $firstBoundaryText = $this->_currentToken->text;

    $this->_rqFirstTerm = $firstBoundaryText;
}
/**
* Process last range query term (opened interval)
*
* @throws Zend_Search_Lucene_Search_QueryParserException
*/
public function openedRQLastTerm()
{
    // Builds a range query from the remembered first boundary and the
    // current token's text, and adds it to the context as a subquery entry.
    // The range is constructed with its third argument set to false.
    $boundaryTerms = array();

    // Both boundaries go through the same steps, in order: run the text
    // through the default analyzer, reject multi-token results, and turn a
    // single token into an index term (an empty result becomes null).
    foreach (array($this->_rqFirstTerm, $this->_currentToken->text) as $boundaryText) {
        $boundaryTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($boundaryText, $this->_encoding);
        $tokenCount     = count($boundaryTokens);

        if ($tokenCount > 1) {
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('Range query boundary terms must be non-multiple word terms');
        }

        if ($tokenCount == 1) {
            require_once 'Zend/Search/Lucene/Index/Term.php';
            $boundaryTerms[] = new Zend_Search_Lucene_Index_Term(reset($boundaryTokens)->getTermText(), $this->_context->getField());
        } else {
            $boundaryTerms[] = null;
        }
    }

    list($from, $to) = $boundaryTerms;

    // A range needs at least one usable boundary.
    if ($from === null && $to === null) {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('At least one range query boundary term must be non-empty term');
    }

    require_once 'Zend/Search/Lucene/Search/Query/Range.php';
    require_once 'Zend/Search/Lucene/Search/QueryEntry/Subquery.php';

    $rangeQuery = new Zend_Search_Lucene_Search_Query_Range($from, $to, false);
    $this->_context->addEntry(new Zend_Search_Lucene_Search_QueryEntry_Subquery($rangeQuery));
}
/**
* Process first range query term (closed interval)
*/
public function closedRQFirstTerm()
{
    // Remember the first boundary text; closedRQLastTerm() reads it back
    // when it builds the range query.
    $firstBoundaryText = $this->_currentToken->text;

    $this->_rqFirstTerm = $firstBoundaryText;
}
/**
* Process last range query term (closed interval)
*
* @throws Zend_Search_Lucene_Search_QueryParserException
*/
public function closedRQLastTerm()
{
    // Builds a range query from the remembered first boundary and the
    // current token's text, and adds it to the context as a subquery entry.
    // The range is constructed with its third argument set to true.
    $boundaryTerms = array();

    // Both boundaries go through the same steps, in order: run the text
    // through the default analyzer, reject multi-token results, and turn a
    // single token into an index term (an empty result becomes null).
    foreach (array($this->_rqFirstTerm, $this->_currentToken->text) as $boundaryText) {
        $boundaryTokens = Zend_Search_Lucene_Analysis_Analyzer::getDefault()->tokenize($boundaryText, $this->_encoding);
        $tokenCount     = count($boundaryTokens);

        if ($tokenCount > 1) {
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('Range query boundary terms must be non-multiple word terms');
        }

        if ($tokenCount == 1) {
            require_once 'Zend/Search/Lucene/Index/Term.php';
            $boundaryTerms[] = new Zend_Search_Lucene_Index_Term(reset($boundaryTokens)->getTermText(), $this->_context->getField());
        } else {
            $boundaryTerms[] = null;
        }
    }

    list($from, $to) = $boundaryTerms;

    // A range needs at least one usable boundary.
    if ($from === null && $to === null) {
        require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
        throw new Zend_Search_Lucene_Search_QueryParserException('At least one range query boundary term must be non-empty term');
    }

    require_once 'Zend/Search/Lucene/Search/Query/Range.php';
    require_once 'Zend/Search/Lucene/Search/QueryEntry/Subquery.php';

    $rangeQuery = new Zend_Search_Lucene_Search_Query_Range($from, $to, true);
    $this->_context->addEntry(new Zend_Search_Lucene_Search_QueryEntry_Subquery($rangeQuery));
}
}

View File

@ -0,0 +1,401 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: QueryParserContext.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_QueryToken */
require_once 'Zend/Search/Lucene/Search/QueryToken.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryParserContext
{
    /**
     * Default field for the context.
     *
     * null means that the term should be searched through all fields.
     * Zend_Search_Lucene_Search_Query::rewriteQuery($index) translates such
     * queries into several queries (presumably one per field; not visible here).
     *
     * @var string|null
     */
    private $_defaultField;

    /**
     * Field specified for next entry.
     * Cleared back to null each time an entry is added.
     *
     * @var string|null
     */
    private $_nextEntryField = null;

    /**
     * True means that the term is required.
     * False means that the term is prohibited.
     * null means that the term is neither prohibited nor required.
     * Cleared back to null each time an entry is added.
     *
     * @var boolean|null
     */
    private $_nextEntrySign = null;

    /**
     * Entries grouping mode
     */
    const GM_SIGNS   = 0; // Signs mode: '+term1 term2 -term3 +(subquery1) -(subquery2)'
    const GM_BOOLEAN = 1; // Boolean operators mode: 'term1 and term2 or (subquery1) and not (subquery2)'

    /**
     * Grouping mode.
     * Stays null until the first sign or logical operator is seen.
     *
     * @var integer|null
     */
    private $_mode = null;

    /**
     * Entries signs.
     * Used in GM_SIGNS grouping mode; indexed in step with $_entries.
     *
     * @var array
     */
    private $_signs = array();

    /**
     * Query entries.
     * Each entry is a Zend_Search_Lucene_Search_QueryEntry object or
     * boolean operator (Zend_Search_Lucene_Search_QueryToken class constant)
     *
     * @var array
     */
    private $_entries = array();

    /**
     * Query string encoding
     *
     * @var string
     */
    private $_encoding;

    /**
     * Context object constructor
     *
     * @param string $encoding
     * @param string|null $defaultField
     */
    public function __construct($encoding, $defaultField = null)
    {
        $this->_encoding     = $encoding;
        $this->_defaultField = $defaultField;
    }

    /**
     * Get the field that applies to the next entry: the explicitly set
     * next-entry field if there is one, otherwise the context default.
     *
     * @return string|null
     */
    public function getField()
    {
        return ($this->_nextEntryField !== null) ? $this->_nextEntryField : $this->_defaultField;
    }

    /**
     * Set field for next entry
     *
     * @param string $field
     */
    public function setNextEntryField($field)
    {
        $this->_nextEntryField = $field;
    }

    /**
     * Set sign for next entry and switch the context into signs mode.
     *
     * @param integer $sign  Zend_Search_Lucene_Search_QueryToken::TT_REQUIRED or TT_PROHIBITED
     * @throws Zend_Search_Lucene_Search_QueryParserException  if the context is already in boolean mode
     * @throws Zend_Search_Lucene_Exception  if $sign is neither of the two sign token types
     */
    public function setNextEntrySign($sign)
    {
        if ($this->_mode === self::GM_BOOLEAN) {
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('It\'s not allowed to mix boolean and signs styles in the same subquery.');
        }

        $this->_mode = self::GM_SIGNS;

        if ($sign == Zend_Search_Lucene_Search_QueryToken::TT_REQUIRED) {
            $this->_nextEntrySign = true;
        } else if ($sign == Zend_Search_Lucene_Search_QueryToken::TT_PROHIBITED) {
            $this->_nextEntrySign = false;
        } else {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Unrecognized sign type.');
        }
    }

    /**
     * Add entry to a query.
     * Consumes the pending next-entry field and sign.
     *
     * @param Zend_Search_Lucene_Search_QueryEntry $entry
     */
    public function addEntry(Zend_Search_Lucene_Search_QueryEntry $entry)
    {
        // Signs are only tracked while the context is not in boolean mode, which
        // keeps $_signs aligned index-for-index with $_entries in that case.
        if ($this->_mode !== self::GM_BOOLEAN) {
            $this->_signs[] = $this->_nextEntrySign;
        }

        $this->_entries[] = $entry;

        $this->_nextEntryField = null;
        $this->_nextEntrySign  = null;
    }

    /**
     * Process fuzzy search or proximity search modifier
     *
     * @param mixed $parameter  modifier parameter, passed through to the last entry
     * @throws Zend_Search_Lucene_Search_QueryParserException
     */
    public function processFuzzyProximityModifier($parameter = null)
    {
        // Check that the modifier came just after a word or phrase: a pending
        // field or sign means something else was seen in between.
        if ($this->_nextEntryField !== null || $this->_nextEntrySign !== null) {
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('\'~\' modifier must follow word or phrase.');
        }

        $lastEntry = array_pop($this->_entries);

        if (!$lastEntry instanceof Zend_Search_Lucene_Search_QueryEntry) {
            // there are no entries or last entry is boolean operator
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('\'~\' modifier must follow word or phrase.');
        }

        $lastEntry->processFuzzyProximityModifier($parameter);

        $this->_entries[] = $lastEntry;
    }

    /**
     * Set boost factor to the entry
     *
     * @param float $boostFactor
     * @throws Zend_Search_Lucene_Search_QueryParserException
     */
    public function boost($boostFactor)
    {
        // Check that the modifier came just after a word, phrase or subquery: a
        // pending field or sign means something else was seen in between.
        if ($this->_nextEntryField !== null || $this->_nextEntrySign !== null) {
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('\'^\' modifier must follow word, phrase or subquery.');
        }

        $lastEntry = array_pop($this->_entries);

        if (!$lastEntry instanceof Zend_Search_Lucene_Search_QueryEntry) {
            // there are no entries or last entry is boolean operator
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('\'^\' modifier must follow word, phrase or subquery.');
        }

        $lastEntry->boost($boostFactor);

        $this->_entries[] = $lastEntry;
    }

    /**
     * Process logical operator and switch the context into boolean mode.
     *
     * @param integer $operator
     * @throws Zend_Search_Lucene_Search_QueryParserException  if the context is already in signs mode
     */
    public function addLogicalOperator($operator)
    {
        if ($this->_mode === self::GM_SIGNS) {
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('It\'s not allowed to mix boolean and signs styles in the same subquery.');
        }

        $this->_mode = self::GM_BOOLEAN;

        $this->_entries[] = $operator;
    }

    /**
     * Generate 'signs style' query from the context
     * '+term1 term2 -term3 +(<subquery1>) ...'
     *
     * @return Zend_Search_Lucene_Search_Query
     */
    public function _signStyleExpressionQuery()
    {
        require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
        $query = new Zend_Search_Lucene_Search_Query_Boolean();

        require_once 'Zend/Search/Lucene/Search/QueryParser.php';
        if (Zend_Search_Lucene_Search_QueryParser::getDefaultOperator() == Zend_Search_Lucene_Search_QueryParser::B_AND) {
            $defaultSign = true; // required
        } else {
            // Zend_Search_Lucene_Search_QueryParser::B_OR
            $defaultSign = null; // optional
        }

        // Entries without an explicit sign take the sign implied by the
        // parser's default operator.
        foreach ($this->_entries as $entryId => $entry) {
            $sign = ($this->_signs[$entryId] !== null) ? $this->_signs[$entryId] : $defaultSign;
            $query->addSubquery($entry->getQuery($this->_encoding), $sign);
        }

        return $query;
    }

    /**
     * Generate 'boolean style' query from the context
     * 'term1 and term2 or term3 and (<subquery1>) and not (<subquery2>)'
     *
     * @return Zend_Search_Lucene_Search_Query
     * @throws Zend_Search_Lucene_Search_QueryParserException
     */
    private function _booleanExpressionQuery()
    {
        /**
         * We treat each level of an expression as a boolean expression in
         * a Disjunctive Normal Form
         *
         * AND operator has higher precedence than OR
         *
         * Thus logical query is a disjunction of one or more conjunctions of
         * one or more query entries
         */

        require_once 'Zend/Search/Lucene/Search/BooleanExpressionRecognizer.php';
        $expressionRecognizer = new Zend_Search_Lucene_Search_BooleanExpressionRecognizer();

        require_once 'Zend/Search/Lucene/Exception.php';
        try {
            foreach ($this->_entries as $entry) {
                if ($entry instanceof Zend_Search_Lucene_Search_QueryEntry) {
                    $expressionRecognizer->processLiteral($entry);
                } else {
                    switch ($entry) {
                        case Zend_Search_Lucene_Search_QueryToken::TT_AND_LEXEME:
                            $expressionRecognizer->processOperator(Zend_Search_Lucene_Search_BooleanExpressionRecognizer::IN_AND_OPERATOR);
                            break;

                        case Zend_Search_Lucene_Search_QueryToken::TT_OR_LEXEME:
                            $expressionRecognizer->processOperator(Zend_Search_Lucene_Search_BooleanExpressionRecognizer::IN_OR_OPERATOR);
                            break;

                        case Zend_Search_Lucene_Search_QueryToken::TT_NOT_LEXEME:
                            $expressionRecognizer->processOperator(Zend_Search_Lucene_Search_BooleanExpressionRecognizer::IN_NOT_OPERATOR);
                            break;

                        default:
                            // Throw the exception class that was loaded just above.
                            // The previous code instantiated Zend_Search_Lucene here,
                            // which is not an exception class and cannot be thrown.
                            // NOTE(review): if Zend_Search_Lucene_Exception derives from
                            // Zend_Search_Exception, the catch below reports this as a
                            // generic 'Boolean expression error.'; confirm that is acceptable.
                            throw new Zend_Search_Lucene_Exception('Boolean expression error. Unknown operator type.');
                    }
                }
            }

            $conjunctions = $expressionRecognizer->finishExpression();
        } catch (Zend_Search_Exception $e) {
            // It's a query syntax error message and it should be user friendly,
            // so the recognizer's own message is kept only as the previous exception.
            require_once 'Zend/Search/Lucene/Search/QueryParserException.php';
            throw new Zend_Search_Lucene_Search_QueryParserException('Boolean expression error.', 0, $e);
        }

        // Remove 'only negative' conjunctions.
        // Each conjunction entry is a pair: [0] the query entry, [1] its sign.
        foreach ($conjunctions as $conjunctionId => $conjunction) {
            $nonNegativeEntryFound = false;

            foreach ($conjunction as $conjunctionEntry) {
                if ($conjunctionEntry[1]) {
                    $nonNegativeEntryFound = true;
                    break;
                }
            }

            if (!$nonNegativeEntryFound) {
                unset($conjunctions[$conjunctionId]);
            }
        }

        $subqueries = array();
        foreach ($conjunctions as $conjunction) {
            // Check if it's a one-term conjunction
            if (count($conjunction) == 1) {
                $subqueries[] = $conjunction[0][0]->getQuery($this->_encoding);
            } else {
                require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
                $subquery = new Zend_Search_Lucene_Search_Query_Boolean();

                foreach ($conjunction as $conjunctionEntry) {
                    $subquery->addSubquery($conjunctionEntry[0]->getQuery($this->_encoding), $conjunctionEntry[1]);
                }

                $subqueries[] = $subquery;
            }
        }

        if (count($subqueries) == 0) {
            require_once 'Zend/Search/Lucene/Search/Query/Insignificant.php';
            return new Zend_Search_Lucene_Search_Query_Insignificant();
        }

        if (count($subqueries) == 1) {
            return $subqueries[0];
        }

        require_once 'Zend/Search/Lucene/Search/Query/Boolean.php';
        $query = new Zend_Search_Lucene_Search_Query_Boolean();

        foreach ($subqueries as $subquery) {
            // Non-required entry/subquery
            $query->addSubquery($subquery);
        }

        return $query;
    }

    /**
     * Generate query from current context.
     * Boolean mode uses the boolean expression builder; every other mode
     * (signs, or no mode chosen yet) uses the signs-style builder.
     *
     * @return Zend_Search_Lucene_Search_Query
     */
    public function getQuery()
    {
        if ($this->_mode === self::GM_BOOLEAN) {
            return $this->_booleanExpressionQuery();
        } else {
            return $this->_signStyleExpressionQuery();
        }
    }
}

View File

@ -0,0 +1,41 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: QueryParserException.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* Zend_Search_Lucene base exception
*/
require_once 'Zend/Search/Lucene/Exception.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*
* Special exception type, which may be used to intercept wrong user input
*/
class Zend_Search_Lucene_Search_QueryParserException extends Zend_Search_Lucene_Exception
{
    // Intentionally empty: the class adds no members of its own. It exists
    // only as a distinct type that callers can catch for wrong user input.
}

View File

@ -0,0 +1,225 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: QueryToken.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_QueryToken
{
    /**
     * Token types.
     */
    const TT_WORD             = 0;  // Word
    const TT_PHRASE           = 1;  // Phrase (one or several quoted words)
    const TT_FIELD            = 2;  // Field name in 'field:word', field:<phrase> or field:(<subquery>) pairs
    const TT_FIELD_INDICATOR  = 3;  // ':'
    const TT_REQUIRED         = 4;  // '+'
    const TT_PROHIBITED       = 5;  // '-'
    const TT_FUZZY_PROX_MARK  = 6;  // '~'
    const TT_BOOSTING_MARK    = 7;  // '^'
    const TT_RANGE_INCL_START = 8;  // '['
    const TT_RANGE_INCL_END   = 9;  // ']'
    const TT_RANGE_EXCL_START = 10; // '{'
    const TT_RANGE_EXCL_END   = 11; // '}'
    const TT_SUBQUERY_START   = 12; // '('
    const TT_SUBQUERY_END     = 13; // ')'
    const TT_AND_LEXEME       = 14; // 'AND' or 'and'
    const TT_OR_LEXEME        = 15; // 'OR' or 'or'
    const TT_NOT_LEXEME       = 16; // 'NOT' or 'not'
    const TT_TO_LEXEME        = 17; // 'TO' or 'to'
    const TT_NUMBER           = 18; // Number, like: 10, 0.8, .64, ....

    /**
     * Returns all possible lexeme types.
     * It's used for syntax analyzer state machine initialization
     *
     * @return array
     */
    public static function getTypes()
    {
        return array( self::TT_WORD,
                      self::TT_PHRASE,
                      self::TT_FIELD,
                      self::TT_FIELD_INDICATOR,
                      self::TT_REQUIRED,
                      self::TT_PROHIBITED,
                      self::TT_FUZZY_PROX_MARK,
                      self::TT_BOOSTING_MARK,
                      self::TT_RANGE_INCL_START,
                      self::TT_RANGE_INCL_END,
                      self::TT_RANGE_EXCL_START,
                      self::TT_RANGE_EXCL_END,
                      self::TT_SUBQUERY_START,
                      self::TT_SUBQUERY_END,
                      self::TT_AND_LEXEME,
                      self::TT_OR_LEXEME,
                      self::TT_NOT_LEXEME,
                      self::TT_TO_LEXEME,
                      self::TT_NUMBER
                   );
    }

    /**
     * Token categories
     */
    const TC_WORD           = 0; // Word
    const TC_PHRASE         = 1; // Phrase (one or several quoted words)
    const TC_NUMBER         = 2; // Numbers, which are used with syntax elements. Ex. roam~0.8
    const TC_SYNTAX_ELEMENT = 3; // + - ( ) [ ] { } ! || && ~ ^

    /**
     * Token type (one of the TT_* constants).
     *
     * @var integer
     */
    public $type;

    /**
     * Token text.
     *
     * @var string
     */
    public $text;

    /**
     * Token position within query (1-based).
     *
     * @var integer
     */
    public $position;

    /**
     * Builds a token from its category, text and zero-based position, and
     * derives the token type from the category and text.
     *
     * @param integer $tokenCategory  one of the TC_* constants
     * @param string  $tokenText
     * @param integer $position       zero-based; stored as $position + 1
     * @throws Zend_Search_Lucene_Exception  on an unknown syntax element or token category
     */
    public function __construct($tokenCategory, $tokenText, $position)
    {
        $this->text     = $tokenText;
        $this->position = $position + 1; // Start from 1

        switch ($tokenCategory) {
            case self::TC_WORD:
                // Operator keywords are matched case-insensitively; any other
                // word is a plain word token.
                $loweredText = strtolower($tokenText);

                if ($loweredText === 'and') {
                    $this->type = self::TT_AND_LEXEME;
                } else if ($loweredText === 'or') {
                    $this->type = self::TT_OR_LEXEME;
                } else if ($loweredText === 'not') {
                    $this->type = self::TT_NOT_LEXEME;
                } else if ($loweredText === 'to') {
                    $this->type = self::TT_TO_LEXEME;
                } else {
                    $this->type = self::TT_WORD;
                }
                break;

            case self::TC_PHRASE:
                $this->type = self::TT_PHRASE;
                break;

            case self::TC_NUMBER:
                $this->type = self::TT_NUMBER;
                break;

            case self::TC_SYNTAX_ELEMENT:
                switch ($tokenText) {
                    case ':':
                        $this->type = self::TT_FIELD_INDICATOR;
                        break;

                    case '+':
                        $this->type = self::TT_REQUIRED;
                        break;

                    case '-':
                        $this->type = self::TT_PROHIBITED;
                        break;

                    case '~':
                        $this->type = self::TT_FUZZY_PROX_MARK;
                        break;

                    case '^':
                        $this->type = self::TT_BOOSTING_MARK;
                        break;

                    case '[':
                        $this->type = self::TT_RANGE_INCL_START;
                        break;

                    case ']':
                        $this->type = self::TT_RANGE_INCL_END;
                        break;

                    case '{':
                        $this->type = self::TT_RANGE_EXCL_START;
                        break;

                    case '}':
                        $this->type = self::TT_RANGE_EXCL_END;
                        break;

                    case '(':
                        $this->type = self::TT_SUBQUERY_START;
                        break;

                    case ')':
                        $this->type = self::TT_SUBQUERY_END;
                        break;

                    // '!', '&&' and '||' are symbolic spellings of the
                    // NOT / AND / OR operators.
                    case '!':
                        $this->type = self::TT_NOT_LEXEME;
                        break;

                    case '&&':
                        $this->type = self::TT_AND_LEXEME;
                        break;

                    case '||':
                        $this->type = self::TT_OR_LEXEME;
                        break;

                    default:
                        require_once 'Zend/Search/Lucene/Exception.php';
                        throw new Zend_Search_Lucene_Exception('Unrecognized query syntax lexeme: \'' . $tokenText . '\'');
                }
                break;

            // A second 'case self::TC_NUMBER:' used to sit here. It could never
            // match (the first one above wins) and had no 'break', so it was
            // removed as dead code.
            default:
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Unrecognized lexeme type: \'' . $tokenCategory . '\'');
        }
    }
}

View File

@ -0,0 +1,551 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Similarity.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Search_Similarity
{
/**
* The Similarity implementation used by default.
*
* @var Zend_Search_Lucene_Search_Similarity
*/
private static $_defaultImpl;
/**
* Cache of decoded bytes.
* Array of floats
*
* @var array
*/
private static $_normTable = array( 0 => 0.0,
1 => 5.820766E-10,
2 => 6.9849193E-10,
3 => 8.1490725E-10,
4 => 9.313226E-10,
5 => 1.1641532E-9,
6 => 1.3969839E-9,
7 => 1.6298145E-9,
8 => 1.8626451E-9,
9 => 2.3283064E-9,
10 => 2.7939677E-9,
11 => 3.259629E-9,
12 => 3.7252903E-9,
13 => 4.656613E-9,
14 => 5.5879354E-9,
15 => 6.519258E-9,
16 => 7.4505806E-9,
17 => 9.313226E-9,
18 => 1.1175871E-8,
19 => 1.3038516E-8,
20 => 1.4901161E-8,
21 => 1.8626451E-8,
22 => 2.2351742E-8,
23 => 2.6077032E-8,
24 => 2.9802322E-8,
25 => 3.7252903E-8,
26 => 4.4703484E-8,
27 => 5.2154064E-8,
28 => 5.9604645E-8,
29 => 7.4505806E-8,
30 => 8.940697E-8,
31 => 1.0430813E-7,
32 => 1.1920929E-7,
33 => 1.4901161E-7,
34 => 1.7881393E-7,
35 => 2.0861626E-7,
36 => 2.3841858E-7,
37 => 2.9802322E-7,
38 => 3.5762787E-7,
39 => 4.172325E-7,
40 => 4.7683716E-7,
41 => 5.9604645E-7,
42 => 7.1525574E-7,
43 => 8.34465E-7,
44 => 9.536743E-7,
45 => 1.1920929E-6,
46 => 1.4305115E-6,
47 => 1.66893E-6,
48 => 1.9073486E-6,
49 => 2.3841858E-6,
50 => 2.861023E-6,
51 => 3.33786E-6,
52 => 3.8146973E-6,
53 => 4.7683716E-6,
54 => 5.722046E-6,
55 => 6.67572E-6,
56 => 7.6293945E-6,
57 => 9.536743E-6,
58 => 1.1444092E-5,
59 => 1.335144E-5,
60 => 1.5258789E-5,
61 => 1.9073486E-5,
62 => 2.2888184E-5,
63 => 2.670288E-5,
64 => 3.0517578E-5,
65 => 3.8146973E-5,
66 => 4.5776367E-5,
67 => 5.340576E-5,
68 => 6.1035156E-5,
69 => 7.6293945E-5,
70 => 9.1552734E-5,
71 => 1.0681152E-4,
72 => 1.2207031E-4,
73 => 1.5258789E-4,
74 => 1.8310547E-4,
75 => 2.1362305E-4,
76 => 2.4414062E-4,
77 => 3.0517578E-4,
78 => 3.6621094E-4,
79 => 4.272461E-4,
80 => 4.8828125E-4,
81 => 6.1035156E-4,
82 => 7.324219E-4,
83 => 8.544922E-4,
84 => 9.765625E-4,
85 => 0.0012207031,
86 => 0.0014648438,
87 => 0.0017089844,
88 => 0.001953125,
89 => 0.0024414062,
90 => 0.0029296875,
91 => 0.0034179688,
92 => 0.00390625,
93 => 0.0048828125,
94 => 0.005859375,
95 => 0.0068359375,
96 => 0.0078125,
97 => 0.009765625,
98 => 0.01171875,
99 => 0.013671875,
100 => 0.015625,
101 => 0.01953125,
102 => 0.0234375,
103 => 0.02734375,
104 => 0.03125,
105 => 0.0390625,
106 => 0.046875,
107 => 0.0546875,
108 => 0.0625,
109 => 0.078125,
110 => 0.09375,
111 => 0.109375,
112 => 0.125,
113 => 0.15625,
114 => 0.1875,
115 => 0.21875,
116 => 0.25,
117 => 0.3125,
118 => 0.375,
119 => 0.4375,
120 => 0.5,
121 => 0.625,
122 => 0.75,
123 => 0.875,
124 => 1.0,
125 => 1.25,
126 => 1.5,
127 => 1.75,
128 => 2.0,
129 => 2.5,
130 => 3.0,
131 => 3.5,
132 => 4.0,
133 => 5.0,
134 => 6.0,
135 => 7.0,
136 => 8.0,
137 => 10.0,
138 => 12.0,
139 => 14.0,
140 => 16.0,
141 => 20.0,
142 => 24.0,
143 => 28.0,
144 => 32.0,
145 => 40.0,
146 => 48.0,
147 => 56.0,
148 => 64.0,
149 => 80.0,
150 => 96.0,
151 => 112.0,
152 => 128.0,
153 => 160.0,
154 => 192.0,
155 => 224.0,
156 => 256.0,
157 => 320.0,
158 => 384.0,
159 => 448.0,
160 => 512.0,
161 => 640.0,
162 => 768.0,
163 => 896.0,
164 => 1024.0,
165 => 1280.0,
166 => 1536.0,
167 => 1792.0,
168 => 2048.0,
169 => 2560.0,
170 => 3072.0,
171 => 3584.0,
172 => 4096.0,
173 => 5120.0,
174 => 6144.0,
175 => 7168.0,
176 => 8192.0,
177 => 10240.0,
178 => 12288.0,
179 => 14336.0,
180 => 16384.0,
181 => 20480.0,
182 => 24576.0,
183 => 28672.0,
184 => 32768.0,
185 => 40960.0,
186 => 49152.0,
187 => 57344.0,
188 => 65536.0,
189 => 81920.0,
190 => 98304.0,
191 => 114688.0,
192 => 131072.0,
193 => 163840.0,
194 => 196608.0,
195 => 229376.0,
196 => 262144.0,
197 => 327680.0,
198 => 393216.0,
199 => 458752.0,
200 => 524288.0,
201 => 655360.0,
202 => 786432.0,
203 => 917504.0,
204 => 1048576.0,
205 => 1310720.0,
206 => 1572864.0,
207 => 1835008.0,
208 => 2097152.0,
209 => 2621440.0,
210 => 3145728.0,
211 => 3670016.0,
212 => 4194304.0,
213 => 5242880.0,
214 => 6291456.0,
215 => 7340032.0,
216 => 8388608.0,
217 => 1.048576E7,
218 => 1.2582912E7,
219 => 1.4680064E7,
220 => 1.6777216E7,
221 => 2.097152E7,
222 => 2.5165824E7,
223 => 2.9360128E7,
224 => 3.3554432E7,
225 => 4.194304E7,
226 => 5.0331648E7,
227 => 5.8720256E7,
228 => 6.7108864E7,
229 => 8.388608E7,
230 => 1.00663296E8,
231 => 1.17440512E8,
232 => 1.34217728E8,
233 => 1.6777216E8,
234 => 2.01326592E8,
235 => 2.34881024E8,
236 => 2.68435456E8,
237 => 3.3554432E8,
238 => 4.02653184E8,
239 => 4.69762048E8,
240 => 5.3687091E8,
241 => 6.7108864E8,
242 => 8.0530637E8,
243 => 9.395241E8,
244 => 1.07374182E9,
245 => 1.34217728E9,
246 => 1.61061274E9,
247 => 1.87904819E9,
248 => 2.14748365E9,
249 => 2.68435456E9,
250 => 3.22122547E9,
251 => 3.75809638E9,
252 => 4.2949673E9,
253 => 5.3687091E9,
254 => 6.4424509E9,
255 => 7.5161928E9 );
/**
* Set the default Similarity implementation used by indexing and search
* code.
*
* The instance is stored in the static self::$_defaultImpl slot and replaces
* whatever was stored there before.
*
* @param Zend_Search_Lucene_Search_Similarity $similarity
* @return void
*/
public static function setDefault(Zend_Search_Lucene_Search_Similarity $similarity)
{
self::$_defaultImpl = $similarity;
}
/**
 * Return the default Similarity implementation used by indexing and search
 * code.
 *
 * When no Similarity instance is currently stored, the stock
 * Zend_Search_Lucene_Search_Similarity_Default is loaded, instantiated and
 * remembered for subsequent calls.
 *
 * @return Zend_Search_Lucene_Search_Similarity
 */
public static function getDefault()
{
    if (self::$_defaultImpl instanceof Zend_Search_Lucene_Search_Similarity) {
        return self::$_defaultImpl;
    }

    // Nothing usable stored yet: lazily load and install the default one.
    require_once 'Zend/Search/Lucene/Search/Similarity/Default.php';
    self::$_defaultImpl = new Zend_Search_Lucene_Search_Similarity_Default();

    return self::$_defaultImpl;
}
/**
* Computes the normalization value for a field given the total number of
* terms contained in a field. These values, together with field boosts, are
* stored in an index and multiplied into scores for hits on each field by the
* search code.
*
* Matches in longer fields are less precise, so implementations of this
* method usually return smaller values when 'numTokens' is large,
* and larger values when 'numTokens' is small.
*
* Note that these values are computed under
* IndexWriter::addDocument(Document) and then stored using
* encodeNorm(float). Thus they have limited precision, and documents
* must be re-indexed if this method is altered.
*
* fieldName - name of field
* numTokens - the total number of tokens contained in fields named
* 'fieldName' of 'doc'.
* Returns a normalization factor for hits on this field of this document
*
* @param string $fieldName
* @param integer $numTokens
* @return float
*/
abstract public function lengthNorm($fieldName, $numTokens);
/**
* Computes the normalization value for a query given the sum of the squared
* weights of each of the query terms. This value is then multiplied into the
* weight of each query term.
*
* This does not affect ranking, but rather just attempts to make scores
* from different queries comparable.
*
* sumOfSquaredWeights - the sum of the squares of query term weights
* Returns a normalization factor for query weights
*
* @param float $sumOfSquaredWeights
* @return float
*/
abstract public function queryNorm($sumOfSquaredWeights);
/**
 * Decodes a normalization factor stored in an index.
 *
 * The argument is masked to its low eight bits and used as an index into
 * the static norm lookup table.
 *
 * @param integer $byte
 * @return float
 */
public static function decodeNorm($byte)
{
    // Only the low eight bits select a table slot.
    $slot = $byte & 0xFF;

    return self::$_normTable[$slot];
}
/**
 * Encodes a normalization factor for storage in an index.
 *
 * The encoding uses a five-bit exponent and three-bit mantissa, thus
 * representing values from around 7x10^9 to 2x10^-9 with about one
 * significant decimal digit of accuracy. Zero is also represented.
 * Negative numbers are rounded up to zero. Values too large to represent
 * are rounded down to the largest representable value. Positive values too
 * small to represent are rounded up to the smallest positive representable
 * value.
 *
 * Visibility is now spelled out explicitly; the method was already
 * implicitly public, so callers are unaffected.
 *
 * @param float $f
 * @return integer
 */
public static function encodeNorm($f)
{
    return self::_floatToByte($f);
}
/**
 * Float to byte conversion.
 *
 * Binary-searches the static norm table for $f and returns the index of the
 * entry closest to it. Zero and negative input map to 0. Strictly positive
 * input never maps to 0: a value that would round down to index 0 is
 * rounded up to index 1 instead, as the encodeNorm() contract promises
 * ("positive values too small to represent are rounded up to the smallest
 * positive representable value").
 *
 * NOTE(review): this assumes table entry 0 is the zero value and entry 1 the
 * smallest positive one - confirm against the table definition.
 *
 * @param float $f
 * @return integer index into the norm table (0..255)
 */
private static function _floatToByte($f)
{
    // round negatives up to zero
    if ($f <= 0.0) {
        return 0;
    }

    // search for appropriate value
    $lowIndex  = 0;
    $highIndex = 255;
    while ($highIndex >= $lowIndex) {
        $mid   = ($highIndex + $lowIndex) >> 1;
        $delta = $f - self::$_normTable[$mid];

        if ($delta < 0) {
            $highIndex = $mid - 1;
        } elseif ($delta > 0) {
            $lowIndex = $mid + 1;
        } else {
            return $mid; // We got it!
        }
    }

    // round to closest value
    if ($highIndex != 255 &&
        $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex + 1] - $f) {
        $byte = $highIndex + 1;
    } else {
        $byte = $highIndex;
    }

    // $f is strictly positive here, so it must not collapse to the zero slot
    return ($byte < 1) ? 1 : $byte;
}
/**
* Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the idf(Term, Searcher)
* factor for each term in the query and these products are then summed to
* form the initial score for a document.
*
* Terms and phrases repeated in a document indicate the topic of the
* document, so implementations of this method usually return larger values
* when 'freq' is large, and smaller values when 'freq' is small.
*
* freq - the frequency of a term within a document
* Returns a score factor based on a term's within-document frequency
*
* @param float $freq
* @return float
*/
abstract public function tf($freq);
/**
* Computes the amount of a sloppy phrase match, based on an edit distance.
* This value is summed for each sloppy phrase match in a document to form
* the frequency that is passed to tf(float).
*
* A phrase match with a small edit distance to a document passage more
* closely matches the document, so implementations of this method usually
* return larger values when the edit distance is small and smaller values
* when it is large.
*
* distance - the edit distance of this sloppy phrase match
* Returns the frequency increment for this match
*
* @param integer $distance
* @return float
*/
abstract public function sloppyFreq($distance);
/**
 * Computes a score factor for a simple term or a phrase.
 *
 * For a single term this is idfFreq(docFreq(term), count()). For an array
 * of terms the per-term factors are summed; an empty array yields 0.0.
 *
 * input  - the term in question or array of terms
 * reader - reader of the document collection being searched
 *
 * @param mixed $input
 * @param Zend_Search_Lucene_Interface $reader
 * @return float a score factor for the term(s)
 */
public function idf($input, Zend_Search_Lucene_Interface $reader)
{
    // The collection size is the same for every term: ask for it once
    // instead of once per loop iteration.
    $numDocs = $reader->count();

    if (!is_array($input)) {
        return $this->idfFreq($reader->docFreq($input), $numDocs);
    }

    $idf = 0.0;
    foreach ($input as $term) {
        $idf += $this->idfFreq($reader->docFreq($term), $numDocs);
    }

    return $idf;
}
/**
* Computes a score factor based on a term's document frequency (the number
* of documents which contain the term). This value is multiplied by the
* tf(int) factor for each term in the query and these products are
* then summed to form the initial score for a document.
*
* Terms that occur in fewer documents are better indicators of topic, so
* implementations of this method usually return larger values for rare terms,
* and smaller values for common terms.
*
* docFreq - the number of documents which contain the term
* numDocs - the total number of documents in the collection
* Returns a score factor based on the term's document frequency
*
* @param integer $docFreq
* @param integer $numDocs
* @return float
*/
abstract public function idfFreq($docFreq, $numDocs);
/**
* Computes a score factor based on the fraction of all query terms that a
* document contains. This value is multiplied into scores.
*
* The presence of a large portion of the query terms indicates a better
* match with the query, so implementations of this method usually return
* larger values when the ratio between these parameters is large and smaller
* values when the ratio between them is small.
*
* overlap - the number of query terms matched in the document
* maxOverlap - the total number of terms in the query
* Returns a score factor based on term overlap with the query
*
* @param integer $overlap
* @param integer $maxOverlap
* @return float
*/
abstract public function coord($overlap, $maxOverlap);
}

View File

@ -0,0 +1,110 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Default.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_Similarity */
require_once 'Zend/Search/Lucene/Search/Similarity.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Similarity_Default extends Zend_Search_Lucene_Search_Similarity
{
    /**
     * Length normalisation: 1/sqrt(numTerms).
     *
     * A term count equal to zero yields the fixed value 1E10 instead of a
     * division by zero.
     *
     * @param string $fieldName
     * @param integer $numTerms
     * @return float
     */
    public function lengthNorm($fieldName, $numTerms)
    {
        return ($numTerms == 0) ? 1E10 : 1.0 / sqrt($numTerms);
    }

    /**
     * Query normalisation: 1/sqrt(sumOfSquaredWeights).
     *
     * @param float $sumOfSquaredWeights
     * @return float
     */
    public function queryNorm($sumOfSquaredWeights)
    {
        $root = sqrt($sumOfSquaredWeights);

        return 1.0 / $root;
    }

    /**
     * Term frequency factor: sqrt(freq).
     *
     * @param float $freq
     * @return float
     */
    public function tf($freq)
    {
        return sqrt($freq);
    }

    /**
     * Sloppy phrase frequency: 1/(distance + 1).
     *
     * @param integer $distance
     * @return float
     */
    public function sloppyFreq($distance)
    {
        $denominator = $distance + 1;

        return 1.0 / $denominator;
    }

    /**
     * Inverse document frequency: log(numDocs/(docFreq+1)) + 1.
     *
     * @param integer $docFreq
     * @param integer $numDocs
     * @return float
     */
    public function idfFreq($docFreq, $numDocs)
    {
        $ratio = $numDocs / (float)($docFreq + 1);

        return log($ratio) + 1.0;
    }

    /**
     * Coordination factor: overlap/maxOverlap.
     *
     * @param integer $overlap
     * @param integer $maxOverlap
     * @return float
     */
    public function coord($overlap, $maxOverlap)
    {
        $total = (float)$maxOverlap;

        return $overlap / $total;
    }
}

View File

@ -0,0 +1,85 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Weight.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* Calculate query weights and build query scorers.
*
* A Weight is constructed by a query Query->createWeight().
* The sumOfSquaredWeights() method is then called on the top-level
* query to compute the query normalization factor Similarity->queryNorm(float).
* This factor is then passed to normalize(float). At this point the weighting
* is complete.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Search_Weight
{
    /**
     * Normalization factor.
     * Kept only for query explanation purposes; nothing else reads it.
     *
     * @var float
     */
    protected $_queryNorm;

    /**
     * Weight value.
     *
     * It may be initialized in sumOfSquaredWeights() or in normalize(),
     * because both are invoked either from Query::_initWeight (for a
     * top-level query) or from the corresponding methods of the parent
     * query's weights.
     *
     * @var float
     */
    protected $_value;

    /**
     * The sum of squared weights of contained query clauses.
     *
     * @return float
     */
    abstract public function sumOfSquaredWeights();

    /**
     * Assigns the query normalization factor to this weight.
     *
     * @param float $norm
     */
    abstract public function normalize($norm);

    /**
     * The weight for this query.
     *
     * @return float
     */
    public function getValue()
    {
        return $this->_value;
    }
}

View File

@ -0,0 +1,137 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Boolean.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_Weight */
require_once 'Zend/Search/Lucene/Search/Weight.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Weight_Boolean extends Zend_Search_Lucene_Search_Weight
{
    /**
     * IndexReader.
     *
     * @var Zend_Search_Lucene_Interface
     */
    private $_reader;

    /**
     * The query that this concerns.
     *
     * @var Zend_Search_Lucene_Search_Query
     */
    private $_query;

    /**
     * Weights of the subqueries, keyed by subquery number.
     * Array of Zend_Search_Lucene_Search_Weight
     *
     * @var array
     */
    private $_weights;

    /**
     * Builds one weight per contributing subquery.
     *
     * A subquery contributes when the query has no signs at all, or when
     * its own sign is null or truthy.
     *
     * @param Zend_Search_Lucene_Search_Query $query the query that this concerns
     * @param Zend_Search_Lucene_Interface $reader index reader
     */
    public function __construct(Zend_Search_Lucene_Search_Query $query,
                                Zend_Search_Lucene_Interface    $reader)
    {
        $this->_query   = $query;
        $this->_reader  = $reader;
        $this->_weights = array();

        $signs = $query->getSigns();
        foreach ($query->getSubqueries() as $num => $subquery) {
            // Subqueries whose sign is set and falsy get no weight.
            if ($signs !== null && $signs[$num] !== null && !$signs[$num]) {
                continue;
            }

            $this->_weights[$num] = $subquery->createWeight($reader);
        }
    }

    /**
     * The weight for this query.
     * The inherited $_value is not used for boolean queries; the query
     * boost is returned instead.
     *
     * @return float
     */
    public function getValue()
    {
        return $this->_query->getBoost();
    }

    /**
     * The sum of squared weights of contained query clauses, scaled by the
     * squared query boost. A zero result is replaced by 1.0.
     *
     * @return float
     */
    public function sumOfSquaredWeights()
    {
        $total = 0;
        foreach ($this->_weights as $subWeight) {
            $total += $subWeight->sumOfSquaredWeights();
        }

        // apply the query boost, squared, to the accumulated sum
        $total = $total * ($this->_query->getBoost() * $this->_query->getBoost());

        // an empty query (like '-something -another') would otherwise yield 0
        return ($total == 0) ? 1.0 : $total;
    }

    /**
     * Assigns the query normalization factor to every sub-weight, after
     * multiplying it by the query boost.
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        $boostedNorm = $queryNorm * $this->_query->getBoost();

        foreach ($this->_weights as $subWeight) {
            $subWeight->normalize($boostedNorm);
        }
    }
}

View File

@ -0,0 +1,57 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Empty.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_Weight */
require_once 'Zend/Search/Lucene/Search/Weight.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Weight_Empty extends Zend_Search_Lucene_Search_Weight
{
/**
* The sum of squared weights of contained query clauses.
*
* This weight has no clauses; the constant 1 is returned.
*
* @return float
*/
public function sumOfSquaredWeights()
{
return 1;
}
/**
* Assigns the query normalization factor to this.
*
* Intentionally a no-op: the factor is neither stored nor applied.
*
* @param float $queryNorm
*/
public function normalize($queryNorm)
{
}
}

View File

@ -0,0 +1,139 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: MultiTerm.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_Weight */
require_once 'Zend/Search/Lucene/Search/Weight.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Weight_MultiTerm extends Zend_Search_Lucene_Search_Weight
{
    /**
     * IndexReader.
     *
     * @var Zend_Search_Lucene_Interface
     */
    private $_reader;

    /**
     * The query that this concerns.
     *
     * @var Zend_Search_Lucene_Search_Query
     */
    private $_query;

    /**
     * Query term weights, keyed by term id.
     * Array of Zend_Search_Lucene_Search_Weight_Term
     *
     * @var array
     */
    private $_weights;

    /**
     * Builds one term weight per contributing term and registers it on the
     * query via setWeight().
     *
     * A term contributes when the query has no signs at all, or when its
     * own sign is null or truthy.
     *
     * @param Zend_Search_Lucene_Search_Query $query the query that this concerns
     * @param Zend_Search_Lucene_Interface $reader index reader
     */
    public function __construct(Zend_Search_Lucene_Search_Query $query,
                                Zend_Search_Lucene_Interface    $reader)
    {
        $this->_query   = $query;
        $this->_reader  = $reader;
        $this->_weights = array();

        $signs = $query->getSigns();
        foreach ($query->getTerms() as $id => $term) {
            // Terms whose sign is set and falsy get no weight.
            if ($signs !== null && $signs[$id] !== null && !$signs[$id]) {
                continue;
            }

            require_once 'Zend/Search/Lucene/Search/Weight/Term.php';
            $termWeight = new Zend_Search_Lucene_Search_Weight_Term($term, $query, $reader);

            $this->_weights[$id] = $termWeight;
            $query->setWeight($id, $termWeight);
        }
    }

    /**
     * The weight for this query.
     * The inherited $_value is not used for multi-term queries; the query
     * boost is returned instead.
     *
     * @return float
     */
    public function getValue()
    {
        return $this->_query->getBoost();
    }

    /**
     * The sum of squared weights of contained query clauses, scaled by the
     * squared query boost. A zero result is replaced by 1.0.
     *
     * @return float
     */
    public function sumOfSquaredWeights()
    {
        $total = 0;
        foreach ($this->_weights as $termWeight) {
            $total += $termWeight->sumOfSquaredWeights();
        }

        // apply the query boost, squared, to the accumulated sum
        $total = $total * ($this->_query->getBoost() * $this->_query->getBoost());

        // an empty query (like '-something -another') would otherwise yield 0
        return ($total == 0) ? 1.0 : $total;
    }

    /**
     * Assigns the query normalization factor to every term weight, after
     * multiplying it by the query boost.
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        $boostedNorm = $queryNorm * $this->_query->getBoost();

        foreach ($this->_weights as $termWeight) {
            $termWeight->normalize($boostedNorm);
        }
    }
}

View File

@ -0,0 +1,108 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Phrase.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* Zend_Search_Lucene_Search_Weight
*/
require_once 'Zend/Search/Lucene/Search/Weight.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Weight_Phrase extends Zend_Search_Lucene_Search_Weight
{
    /**
     * IndexReader.
     *
     * @var Zend_Search_Lucene_Interface
     */
    private $_reader;

    /**
     * The query that this concerns.
     *
     * @var Zend_Search_Lucene_Search_Query_Phrase
     */
    private $_query;

    /**
     * Score factor
     *
     * @var float
     */
    private $_idf;

    /**
     * Query weight
     *
     * Declared explicitly (as Zend_Search_Lucene_Search_Weight_Term does)
     * so that sumOfSquaredWeights() no longer creates it as a dynamic
     * property on first assignment.
     *
     * @var float
     */
    private $_queryWeight;

    /**
     * Zend_Search_Lucene_Search_Weight_Phrase constructor
     *
     * @param Zend_Search_Lucene_Search_Query_Phrase $query
     * @param Zend_Search_Lucene_Interface $reader
     */
    public function __construct(Zend_Search_Lucene_Search_Query_Phrase $query,
                                Zend_Search_Lucene_Interface           $reader)
    {
        $this->_query  = $query;
        $this->_reader = $reader;
    }

    /**
     * The sum of squared weights of contained query clauses.
     *
     * Also caches the idf of the phrase terms and the boosted query weight
     * for the later normalize() call.
     *
     * @return float
     */
    public function sumOfSquaredWeights()
    {
        // compute idf
        $this->_idf = $this->_reader->getSimilarity()->idf($this->_query->getTerms(), $this->_reader);

        // compute query weight
        $this->_queryWeight = $this->_idf * $this->_query->getBoost();

        // square it
        return $this->_queryWeight * $this->_queryWeight;
    }

    /**
     * Assigns the query normalization factor to this.
     *
     * Uses the values cached by sumOfSquaredWeights().
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        $this->_queryNorm = $queryNorm;

        // normalize query weight
        $this->_queryWeight *= $queryNorm;

        // idf for documents
        $this->_value = $this->_queryWeight * $this->_idf;
    }
}

View File

@ -0,0 +1,125 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Term.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Search_Weight */
require_once 'Zend/Search/Lucene/Search/Weight.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Search
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Search_Weight_Term extends Zend_Search_Lucene_Search_Weight
{
    /**
     * IndexReader.
     *
     * @var Zend_Search_Lucene_Interface
     */
    private $_reader;

    /**
     * Term
     *
     * @var Zend_Search_Lucene_Index_Term
     */
    private $_term;

    /**
     * The query that this concerns.
     *
     * @var Zend_Search_Lucene_Search_Query
     */
    private $_query;

    /**
     * Score factor
     *
     * @var float
     */
    private $_idf;

    /**
     * Query weight
     *
     * @var float
     */
    private $_queryWeight;

    /**
     * Zend_Search_Lucene_Search_Weight_Term constructor
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param Zend_Search_Lucene_Search_Query $query
     * @param Zend_Search_Lucene_Interface $reader index reader
     */
    public function __construct(Zend_Search_Lucene_Index_Term   $term,
                                Zend_Search_Lucene_Search_Query $query,
                                Zend_Search_Lucene_Interface    $reader)
    {
        $this->_term   = $term;
        $this->_query  = $query;
        $this->_reader = $reader;
    }

    /**
     * The sum of squared weights of contained query clauses.
     *
     * Also caches the term's idf and the boosted query weight for the
     * later normalize() call.
     *
     * @return float
     */
    public function sumOfSquaredWeights()
    {
        // idf of the term, as judged by the reader's similarity
        $similarity = $this->_reader->getSimilarity();
        $this->_idf = $similarity->idf($this->_term, $this->_reader);

        // boosted query weight
        $this->_queryWeight = $this->_idf * $this->_query->getBoost();

        // squared
        return $this->_queryWeight * $this->_queryWeight;
    }

    /**
     * Assigns the query normalization factor to this.
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        $this->_queryNorm = $queryNorm;

        // scale the query weight by the normalization factor
        $this->_queryWeight = $this->_queryWeight * $queryNorm;

        // final value: normalized query weight times idf
        $this->_value = $this->_queryWeight * $this->_idf;
    }
}

View File

@ -0,0 +1,136 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Directory.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Storage_Directory
{
/**
* Closes the store.
*
* @return void
*/
abstract public function close();
/**
* Returns an array of strings, one for each file in the directory.
*
* @return array
*/
abstract public function fileList();
/**
* Creates a new, empty file in the directory with the given $filename.
*
* @param string $filename
* @return Zend_Search_Lucene_Storage_File
*/
abstract public function createFile($filename);
/**
* Removes an existing $filename in the directory.
*
* @param string $filename
* @return void
*/
abstract public function deleteFile($filename);
/**
* Purges the file if it is cached by the directory object.
*
* This method is used to prevent the 'too many open files' error.
*
* @param string $filename
* @return void
*/
abstract public function purgeFile($filename);
/**
* Returns true if a file with the given $filename exists.
*
* @param string $filename
* @return boolean
*/
abstract public function fileExists($filename);
/**
* Returns the length of a $filename in the directory.
*
* @param string $filename
* @return integer
*/
abstract public function fileLength($filename);
/**
* Returns the UNIX timestamp $filename was last modified.
*
* @param string $filename
* @return integer
*/
abstract public function fileModified($filename);
/**
* Renames an existing file in the directory.
*
* @param string $from
* @param string $to
* @return void
*/
abstract public function renameFile($from, $to);
/**
* Sets the modified time of $filename to now.
*
* @param string $filename
* @return void
*/
abstract public function touchFile($filename);
/**
* Returns a Zend_Search_Lucene_Storage_File object for a given $filename in the directory.
*
* If the $shareHandler option is true, the file handler can be shared between
* File object requests. This speeds things up, but causes problems with the
* file position.
* Shared handlers are good for short atomic requests.
* Non-shared handlers are useful for stream file reading (especially for compound files).
*
* @param string $filename
* @param boolean $shareHandler
* @return Zend_Search_Lucene_Storage_File
*/
abstract public function getFileObject($filename, $shareHandler = true);
}

View File

@ -0,0 +1,362 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Filesystem.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Storage_Directory */
require_once 'Zend/Search/Lucene/Storage/Directory.php';
/**
* FileSystem implementation of Directory abstraction.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Storage_Directory_Filesystem extends Zend_Search_Lucene_Storage_Directory
{
/**
* Filesystem path to the directory
*
* @var string
*/
protected $_dirPath = null;
/**
* Cache for Zend_Search_Lucene_Storage_File_Filesystem objects
* Array: filename => Zend_Search_Lucene_Storage_File object
*
* @var array
* @throws Zend_Search_Lucene_Exception
*/
protected $_fileHandlers;
/**
* Default file permissions
*
* @var integer
*/
protected static $_defaultFilePermissions = 0666;
/**
* Get default file permissions
*
* Returns the mode currently held in the static self::$_defaultFilePermissions.
*
* @return integer
*/
public static function getDefaultFilePermissions()
{
return self::$_defaultFilePermissions;
}
/**
* Set default file permissions
*
* The value is stored statically, so it is shared by all instances of
* this class. No validation is performed on $mode.
*
* @param integer $mode
* @return void
*/
public static function setDefaultFilePermissions($mode)
{
self::$_defaultFilePermissions = $mode;
}
/**
 * Utility function for recursive directory creation.
 *
 * Walks up through dirname() until an existing directory (or '/') is
 * found, then creates the missing levels on the way back down.
 *
 * Note: $recursive is only forwarded to the nested call; it is never
 * consulted, so the creation is always recursive.
 *
 * @param string $dir
 * @param integer $mode
 * @param boolean $recursive
 * @return boolean false for a null/empty $dir or when a level cannot be created
 */
public static function mkdirs($dir, $mode = 0777, $recursive = true)
{
    if ($dir === null || $dir === '') {
        return false;
    }

    if (is_dir($dir) || $dir === '/') {
        return true;
    }

    // Make sure the parent exists before attempting this level.
    $parentReady = self::mkdirs(dirname($dir), $mode, $recursive);

    return $parentReady ? mkdir($dir, $mode) : false;
}
/**
 * Object constructor.
 * Accepts $path when it is a directory; otherwise tries to create it.
 *
 * @param string $path
 * @throws Zend_Search_Lucene_Exception when $path exists but is not a
 *         directory, or when it cannot be created
 */
public function __construct($path)
{
    if (!is_dir($path)) {
        if (file_exists($path)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Path exists, but it\'s not a directory');
        }

        if (!self::mkdirs($path)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception("Can't create directory '$path'.");
        }
    }

    $this->_dirPath      = $path;
    $this->_fileHandlers = array();
}
/**
 * Closes the store.
 *
 * Every cached file object is closed and the cache is emptied.
 *
 * @return void
 */
public function close()
{
    foreach (array_keys($this->_fileHandlers) as $cachedName) {
        $this->_fileHandlers[$cachedName]->close();
    }

    $this->_fileHandlers = array();
}
/**
 * Returns an array of strings, one for each file in the directory.
 *
 * Sub-directories and the '.' / '..' entries are left out.
 *
 * @return array
 * @throws Zend_Search_Lucene_Exception when the directory cannot be opened
 */
public function fileList()
{
    $dirContent = opendir($this->_dirPath);
    if ($dirContent === false) {
        // Without this check readdir() would be handed a non-handle below,
        // which never yields the terminating false.
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception("Can't open directory '" . $this->_dirPath . "'.");
    }

    $result = array();
    while (($file = readdir($dirContent)) !== false) {
        if (($file == '..') || ($file == '.')) {
            continue;
        }

        if (!is_dir($this->_dirPath . '/' . $file)) {
            $result[] = $file;
        }
    }
    closedir($dirContent);

    return $result;
}
/**
 * Creates a new, empty file in the directory with the given $filename.
 *
 * A cached handle for the same name is closed and dropped first. The new
 * file object is opened in 'w+b' mode and cached under $filename.
 *
 * @param string $filename
 * @return Zend_Search_Lucene_Storage_File
 * @throws Zend_Search_Lucene_Exception
 */
public function createFile($filename)
{
    if (isset($this->_fileHandlers[$filename])) {
        $this->_fileHandlers[$filename]->close();
    }
    unset($this->_fileHandlers[$filename]);

    $fullPath = $this->_dirPath . '/' . $filename;

    require_once 'Zend/Search/Lucene/Storage/File/Filesystem.php';
    $fileObject = new Zend_Search_Lucene_Storage_File_Filesystem($fullPath, 'w+b');
    $this->_fileHandlers[$filename] = $fileObject;

    // Best effort only: the file may already have been created by another
    // user, who is then responsible for setting the right permissions.
    @chmod($fullPath, self::$_defaultFilePermissions);

    return $fileObject;
}
/**
* Removes an existing $filename in the directory.
*
* @param string $filename
* @return void
* @throws Zend_Search_Lucene_Exception
*/
public function deleteFile($filename)
{
    // Make sure no cached handler keeps the file open while it is removed.
    if (isset($this->_fileHandlers[$filename])) {
        $this->_fileHandlers[$filename]->close();
    }
    unset($this->_fileHandlers[$filename]);

    if (!@unlink($this->_dirPath . '/' . $filename)) {
        // error_get_last() replaces the track_errors / $php_errormsg mechanism,
        // which is deprecated since PHP 7.2 and no longer populated in PHP 8,
        // and it still reports errors silenced with "@".
        $lastError = error_get_last();
        $reason    = ($lastError !== null) ? $lastError['message'] : 'unknown error';

        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Can\'t delete file: ' . $reason);
    }
}
/**
* Purge file if it's cached by directory object
*
* Method is used to prevent 'too many open files' error
*
* @param string $filename
* @return void
*/
public function purgeFile($filename)
{
    // Look the handler up once; a missing (or null) entry means there is nothing to close.
    $cachedHandler = isset($this->_fileHandlers[$filename])
                   ? $this->_fileHandlers[$filename]
                   : null;

    if ($cachedHandler !== null) {
        $cachedHandler->close();
    }

    // Forget the entry whether or not a handler was cached.
    unset($this->_fileHandlers[$filename]);
}
/**
* Returns true if a file with the given $filename exists.
*
* @param string $filename
* @return boolean
*/
public function fileExists($filename)
{
    // A cached handler means the file is known without touching the disk.
    if (isset($this->_fileHandlers[$filename])) {
        return true;
    }

    return file_exists($this->_dirPath . '/' . $filename);
}
/**
* Returns the length of a $filename in the directory.
*
* @param string $filename
* @return integer
*/
public function fileLength($filename)
{
    // Prefer the size reported by an already cached handler; otherwise ask the filesystem.
    return isset($this->_fileHandlers[$filename])
        ? $this->_fileHandlers[$filename]->size()
        : filesize($this->_dirPath . '/' . $filename);
}
/**
* Returns the UNIX timestamp $filename was last modified.
*
* @param string $filename
* @return integer
*/
public function fileModified($filename)
{
    // Resolve the name relative to the index directory and read its mtime.
    $fullPath = $this->_dirPath . '/' . $filename;

    return filemtime($fullPath);
}
/**
* Renames an existing file in the directory.
*
* @param string $from
* @param string $to
* @return boolean
* @throws Zend_Search_Lucene_Exception
*/
public function renameFile($from, $to)
{
    // Close and forget cached handlers for both names (source first, then
    // target) so no open handle refers to a file that is about to change.
    foreach (array($from, $to) as $name) {
        if (isset($this->_fileHandlers[$name])) {
            $this->_fileHandlers[$name]->close();
        }
        unset($this->_fileHandlers[$name]);
    }

    $sourcePath = $this->_dirPath . '/' . $from;
    $targetPath = $this->_dirPath . '/' . $to;

    // An existing target is removed explicitly before the rename.
    if (file_exists($targetPath)) {
        if (!unlink($targetPath)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Delete operation failed');
        }
    }

    $success = @rename($sourcePath, $targetPath);
    if (!$success) {
        // error_get_last() replaces the track_errors / $php_errormsg mechanism,
        // which is deprecated since PHP 7.2 and no longer populated in PHP 8,
        // and it still reports errors silenced with "@".
        $lastError = error_get_last();
        $reason    = ($lastError !== null) ? $lastError['message'] : 'Rename operation failed';

        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception($reason);
    }

    // Always true at this point; kept because the method has always returned it.
    return $success;
}
/**
* Sets the modified time of $filename to now.
*
* @param string $filename
* @return boolean
*/
public function touchFile($filename)
{
    // The result of touch() is handed back to the caller unchanged.
    $fullPath = $this->_dirPath . '/' . $filename;

    return touch($fullPath);
}
/**
* Returns a Zend_Search_Lucene_Storage_File object for a given $filename in the directory.
*
* If $shareHandler option is true, then file handler can be shared between File Object
* requests. It speed-ups performance, but makes problems with file position.
* Shared handler are good for short atomic requests.
* Non-shared handlers are useful for stream file reading (especial for compound files).
*
* @param string $filename
* @param boolean $shareHandler
* @return Zend_Search_Lucene_Storage_File
*/
public function getFileObject($filename, $shareHandler = true)
{
    require_once 'Zend/Search/Lucene/Storage/File/Filesystem.php';
    $path = $this->_dirPath . '/' . $filename;

    if ($shareHandler) {
        if (!isset($this->_fileHandlers[$filename])) {
            // First shared request for this file: open it and remember the handler.
            $this->_fileHandlers[$filename] = new Zend_Search_Lucene_Storage_File_Filesystem($path);
            return $this->_fileHandlers[$filename];
        }

        // Reuse the cached handler, rewound to the start of the file.
        $shared = $this->_fileHandlers[$filename];
        $shared->seek(0);
        return $shared;
    }

    // Private handler: never cached, so its file position is independent.
    return new Zend_Search_Lucene_Storage_File_Filesystem($path);
}
}

View File

@ -0,0 +1,473 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: File.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
abstract class Zend_Search_Lucene_Storage_File
{
    /**
     * Reads $length number of bytes at the current position in the
     * file and advances the file pointer.
     *
     * @param integer $length
     * @return string
     */
    abstract protected function _fread($length=1);

    /**
     * Sets the file position indicator and advances the file pointer.
     * The new position, measured in bytes from the beginning of the file,
     * is obtained by adding offset to the position specified by whence,
     * whose values are defined as follows:
     * SEEK_SET - Set position equal to offset bytes.
     * SEEK_CUR - Set position to current location plus offset.
     * SEEK_END - Set position to end-of-file plus offset. (To move to
     * a position before the end-of-file, you need to pass a negative value
     * in offset.)
     * Upon success, returns 0; otherwise, returns -1
     *
     * @param integer $offset
     * @param integer $whence
     * @return integer
     */
    abstract public function seek($offset, $whence=SEEK_SET);

    /**
     * Get file position.
     *
     * @return integer
     */
    abstract public function tell();

    /**
     * Flush output.
     *
     * Returns true on success or false on failure.
     *
     * @return boolean
     */
    abstract public function flush();

    /**
     * Writes $length number of bytes (all, if $length===null) to the end
     * of the file.
     *
     * @param string $data
     * @param integer $length
     */
    abstract protected function _fwrite($data, $length=null);

    /**
     * Lock file
     *
     * Lock type may be a LOCK_SH (shared lock) or a LOCK_EX (exclusive lock)
     *
     * @param integer $lockType
     * @param boolean $nonBlockinLock
     * @return boolean
     */
    abstract public function lock($lockType, $nonBlockinLock = false);

    /**
     * Unlock file
     */
    abstract public function unlock();

    /**
     * Reads a byte from the current position in the file
     * and advances the file pointer.
     *
     * @return integer
     */
    public function readByte()
    {
        return ord($this->_fread(1));
    }

    /**
     * Writes a byte to the end of the file.
     *
     * @param integer $byte
     */
    public function writeByte($byte)
    {
        return $this->_fwrite(chr($byte), 1);
    }

    /**
     * Read num bytes from the current position in the file
     * and advances the file pointer.
     *
     * @param integer $num
     * @return string
     */
    public function readBytes($num)
    {
        return $this->_fread($num);
    }

    /**
     * Writes num bytes of data (all, if $num===null) to the end
     * of the file.
     *
     * @param string $data
     * @param integer $num
     */
    public function writeBytes($data, $num=null)
    {
        $this->_fwrite($data, $num);
    }

    /**
     * Reads a 4-byte big-endian integer from the current position in the
     * file and advances the file pointer.
     *
     * @return integer
     */
    public function readInt()
    {
        $str = $this->_fread(4);

        return  ord($str[0]) << 24 |
                ord($str[1]) << 16 |
                ord($str[2]) << 8  |
                ord($str[3]);
    }

    /**
     * Writes an integer to the end of file as 4 big-endian bytes.
     *
     * @param integer $value
     */
    public function writeInt($value)
    {
        settype($value, 'integer');
        $this->_fwrite( chr($value>>24 & 0xFF) .
                        chr($value>>16 & 0xFF) .
                        chr($value>>8  & 0xFF) .
                        chr($value     & 0xFF),   4  );
    }

    /**
     * Returns a long integer from the current position in the file
     * and advances the file pointer.
     *
     * @return integer|float
     * @throws Zend_Search_Lucene_Exception
     */
    public function readLong()
    {
        /**
         * Check, that we work in 64-bit mode.
         * fseek() uses long for offset. Thus, largest index segment file size in 32bit mode is 2Gb
         */
        if (PHP_INT_SIZE > 4) {
            $str = $this->_fread(8);

            return  ord($str[0]) << 56  |
                    ord($str[1]) << 48  |
                    ord($str[2]) << 40  |
                    ord($str[3]) << 32  |
                    ord($str[4]) << 24  |
                    ord($str[5]) << 16  |
                    ord($str[6]) << 8   |
                    ord($str[7]);
        } else {
            return $this->readLong32Bit();
        }
    }

    /**
     * Writes long integer to the end of file
     *
     * @param integer $value
     * @throws Zend_Search_Lucene_Exception
     */
    public function writeLong($value)
    {
        /**
         * Check, that we work in 64-bit mode.
         * fseek() and ftell() use long for offset. Thus, largest index segment file size in 32bit mode is 2Gb
         */
        if (PHP_INT_SIZE > 4) {
            settype($value, 'integer');
            $this->_fwrite( chr($value>>56 & 0xFF) .
                            chr($value>>48 & 0xFF) .
                            chr($value>>40 & 0xFF) .
                            chr($value>>32 & 0xFF) .
                            chr($value>>24 & 0xFF) .
                            chr($value>>16 & 0xFF) .
                            chr($value>>8  & 0xFF) .
                            chr($value     & 0xFF),   8  );
        } else {
            $this->writeLong32Bit($value);
        }
    }

    /**
     * Returns a long integer from the current position in the file,
     * advances the file pointer and return it as float (for 32-bit platforms).
     *
     * @return integer|float
     * @throws Zend_Search_Lucene_Exception
     */
    public function readLong32Bit()
    {
        $wordHigh = $this->readInt();
        $wordLow  = $this->readInt();

        if ($wordHigh & (int)0x80000000) {
            // It's a negative value since the highest bit is set
            if ($wordHigh == (int)0xFFFFFFFF  &&  ($wordLow & (int)0x80000000)) {
                return $wordLow;
            } else {
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Long integers lower than -2147483648 (0x80000000) are not supported on 32-bit platforms.');
            }
        }

        if ($wordLow < 0) {
            // Value is larger than 0x7FFF FFFF. Represent low word as float.
            $wordLow &= 0x7FFFFFFF;
            $wordLow += (float)0x80000000;
        }

        if ($wordHigh == 0) {
            // Return value as integer if possible
            return $wordLow;
        }

        return $wordHigh*(float)0x100000000/* 0x00000001 00000000 */ + $wordLow;
    }

    /**
     * Writes long integer to the end of file (32-bit platforms implementation)
     *
     * @param integer|float $value
     * @throws Zend_Search_Lucene_Exception
     */
    public function writeLong32Bit($value)
    {
        if ($value < (int)0x80000000) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Long integers lower than -2147483648 (0x80000000) are not supported on 32-bit platforms.');
        }

        if ($value < 0) {
            $wordHigh = (int)0xFFFFFFFF;
            $wordLow  = (int)$value;
        } else {
            $wordHigh = (int)($value/(float)0x100000000/* 0x00000001 00000000 */);
            $wordLow  = $value - $wordHigh*(float)0x100000000/* 0x00000001 00000000 */;

            if ($wordLow > 0x7FFFFFFF) {
                // Highest bit of low word is set. Translate it to the corresponding negative integer value
                $wordLow -= 0x80000000;
                $wordLow |= 0x80000000;
            }
        }

        $this->writeInt($wordHigh);
        $this->writeInt($wordLow);
    }

    /**
     * Returns a variable-length integer from the current
     * position in the file and advances the file pointer.
     *
     * Each byte carries 7 payload bits, least significant group first;
     * the high bit of a byte signals that another byte follows.
     *
     * @return integer
     */
    public function readVInt()
    {
        $nextByte = ord($this->_fread(1));
        $val = $nextByte & 0x7F;

        for ($shift=7; ($nextByte & 0x80) != 0; $shift += 7) {
            $nextByte = ord($this->_fread(1));
            $val |= ($nextByte & 0x7F) << $shift;
        }
        return $val;
    }

    /**
     * Writes a variable-length integer to the end of file.
     *
     * @param integer $value
     */
    public function writeVInt($value)
    {
        settype($value, 'integer');
        while ($value > 0x7F) {
            $this->_fwrite(chr( ($value & 0x7F)|0x80 ));
            $value >>= 7;
        }
        $this->_fwrite(chr($value));
    }

    /**
     * Reads a string from the current position in the file
     * and advances the file pointer.
     *
     * The stored length prefix counts characters, not bytes, so the bytes
     * are read in steps: first one byte per character, then the extra
     * bytes announced by every multi-byte lead byte that is encountered.
     *
     * This implementation supports only Basic Multilingual Plane
     * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support
     * "supplementary characters" (characters whose code points are
     * greater than 0xFFFF).
     * Java 2 represents these characters as a pair of char (16-bit)
     * values, the first from the high-surrogates range (0xD800-0xDBFF),
     * the second from the low-surrogates range (0xDC00-0xDFFF). Then
     * they are encoded as usual UTF-8 characters in six bytes.
     * Standard UTF-8 representation uses four bytes for supplementary
     * characters.
     *
     * @return string
     */
    public function readString()
    {
        $strlen = $this->readVInt();
        if ($strlen == 0) {
            return '';
        }

        $str_val = $this->_fread($strlen);

        for ($count = 0; $count < $strlen; $count++ ) {
            if (( ord($str_val[$count]) & 0xC0 ) == 0xC0) {
                $addBytes = 1;
                if (ord($str_val[$count]) & 0x20 ) {
                    $addBytes++;

                    // Never used. Java2 doesn't encode strings in four bytes
                    if (ord($str_val[$count]) & 0x10 ) {
                        $addBytes++;
                    }
                }
                $str_val .= $this->_fread($addBytes);
                $strlen  += $addBytes;

                // Check for null character. Java2 encodes null character
                // in two bytes (0xC0 0x80).
                if (ord($str_val[$count])   == 0xC0 &&
                    ord($str_val[$count+1]) == 0x80   ) {
                    // Collapse the two-byte form into a real NUL byte. Assigning
                    // the integer 0 to a string offset would store the character
                    // "0", so the string is rebuilt around "\x00" instead.
                    $str_val = substr($str_val, 0, $count)
                             . "\x00"
                             . substr($str_val, $count+2);
                    // The decoded string is one byte shorter now, and the byte
                    // that follows is a new character that must still be inspected.
                    $strlen--;
                    continue;
                }

                $count += $addBytes;
            }
        }

        return $str_val;
    }

    /**
     * Writes a string to the end of file.
     *
     * The string is expected to be in Java 2 ("modified UTF-8")
     * representation already; only the character count is computed here
     * and \x00 is replaced by \xC0\x80.
     *
     * This implementation supports only Basic Multilingual Plane
     * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support
     * "supplementary characters" (characters whose code points are
     * greater than 0xFFFF); see readString() for details.
     *
     * @param string $str
     * @throws Zend_Search_Lucene_Exception
     */
    public function writeString($str)
    {
        // convert input to a string before iterating string characters
        settype($str, 'string');

        $chars = $strlen = strlen($str);

        for ($count = 0; $count < $strlen; $count++ ) {
            // A lead byte announces 1-3 continuation bytes; those bytes do not
            // count as characters of their own.
            if ((ord($str[$count]) & 0xC0) == 0xC0) {
                $addBytes = 1;
                if (ord($str[$count]) & 0x20 ) {
                    $addBytes++;

                    // Never used. Java2 doesn't encode strings in four bytes
                    // and we don't support non-BMP characters
                    if (ord($str[$count]) & 0x10 ) {
                        $addBytes++;
                    }
                }
                $chars -= $addBytes;
                $count += $addBytes;
            }
        }

        if ($chars < 0) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Invalid UTF-8 string');
        }

        $this->writeVInt($chars);

        // A NUL byte can never be a lead byte, so it has to be detected
        // separately from the multi-byte scan above.
        if (strpos($str, "\x00") !== false) {
            // str_replace() takes (search, replace, subject).
            $this->_fwrite(str_replace("\x00", "\xC0\x80", $str));
        } else {
            $this->_fwrite($str);
        }
    }

    /**
     * Reads binary data from the current position in the file
     * and advances the file pointer.
     *
     * The data is prefixed by its length as a variable-length integer.
     *
     * @return string
     */
    public function readBinary()
    {
        return $this->_fread($this->readVInt());
    }
}

View File

@ -0,0 +1,220 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Filesystem.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Storage_File */
require_once 'Zend/Search/Lucene/Storage/File.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Storage_File_Filesystem extends Zend_Search_Lucene_Storage_File
{
    /**
     * Resource of the open file
     *
     * @var resource
     */
    protected $_fileHandle;

    /**
     * Class constructor.  Open the file.
     *
     * @param string $filename
     * @param string $mode
     * @throws Zend_Search_Lucene_Exception
     */
    public function __construct($filename, $mode='r+b')
    {
        if (strpos($mode, 'w') === false  &&  !is_readable($filename)) {
            // opening for reading non-readable file
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('File \'' . $filename . '\' is not readable.');
        }

        $this->_fileHandle = @fopen($filename, $mode);

        if ($this->_fileHandle === false) {
            // error_get_last() replaces the track_errors / $php_errormsg
            // mechanism, which is deprecated since PHP 7.2 and no longer
            // populated in PHP 8, and it still reports errors silenced with "@".
            $lastError = error_get_last();
            $reason    = ($lastError !== null)
                       ? $lastError['message']
                       : 'Can\'t open file \'' . $filename . '\'.';

            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception($reason);
        }
    }

    /**
     * Sets the file position indicator and advances the file pointer.
     * The new position, measured in bytes from the beginning of the file,
     * is obtained by adding offset to the position specified by whence,
     * whose values are defined as follows:
     * SEEK_SET - Set position equal to offset bytes.
     * SEEK_CUR - Set position to current location plus offset.
     * SEEK_END - Set position to end-of-file plus offset. (To move to
     * a position before the end-of-file, you need to pass a negative value
     * in offset.)
     * SEEK_CUR is the only supported offset type for compound files
     *
     * Upon success, returns 0; otherwise, returns -1
     *
     * @param integer $offset
     * @param integer $whence
     * @return integer
     */
    public function seek($offset, $whence=SEEK_SET)
    {
        return fseek($this->_fileHandle, $offset, $whence);
    }

    /**
     * Get file position.
     *
     * @return integer
     */
    public function tell()
    {
        return ftell($this->_fileHandle);
    }

    /**
     * Flush output.
     *
     * Returns true on success or false on failure.
     *
     * @return boolean
     */
    public function flush()
    {
        return fflush($this->_fileHandle);
    }

    /**
     * Close File object
     *
     * Safe to call more than once: the handle is reset to null after closing.
     */
    public function close()
    {
        if ($this->_fileHandle !== null ) {
            @fclose($this->_fileHandle);
            $this->_fileHandle = null;
        }
    }

    /**
     * Get the size of the already opened file
     *
     * The size is measured by seeking to the end of the file; the previous
     * file position is restored afterwards.
     *
     * @return integer
     */
    public function size()
    {
        $position = ftell($this->_fileHandle);
        fseek($this->_fileHandle, 0, SEEK_END);
        $size = ftell($this->_fileHandle);
        fseek($this->_fileHandle,$position);

        return $size;
    }

    /**
     * Read a $length bytes from the file and advance the file pointer.
     *
     * @param integer $length
     * @return string
     */
    protected function _fread($length=1)
    {
        if ($length == 0) {
            return '';
        }

        if ($length < 1024) {
            return fread($this->_fileHandle, $length);
        }

        // Larger requests are read in a loop because a single fread() call
        // may hand back fewer bytes than requested.
        $data = '';
        while ($length > 0) {
            $nextBlock = fread($this->_fileHandle, $length);

            // Stop on failure or end of file. The comparison is strict on
            // purpose: a chunk consisting of the single character "0" is
            // valid data, yet it compares loosely equal to false.
            if ($nextBlock === false || $nextBlock === '') {
                break;
            }

            $data   .= $nextBlock;
            $length -= strlen($nextBlock);
        }
        return $data;
    }

    /**
     * Writes $length number of bytes (all, if $length===null) to the end
     * of the file.
     *
     * @param string $data
     * @param integer $length
     */
    protected function _fwrite($data, $length=null)
    {
        if ($length === null ) {
            fwrite($this->_fileHandle, $data);
        } else {
            fwrite($this->_fileHandle, $data, $length);
        }
    }

    /**
     * Lock file
     *
     * Lock type may be a LOCK_SH (shared lock) or a LOCK_EX (exclusive lock)
     *
     * @param integer $lockType
     * @param boolean $nonBlockingLock
     * @return boolean
     */
    public function lock($lockType, $nonBlockingLock = false)
    {
        if ($nonBlockingLock) {
            return flock($this->_fileHandle, $lockType | LOCK_NB);
        } else {
            return flock($this->_fileHandle, $lockType);
        }
    }

    /**
     * Unlock file
     *
     * Returns true on success; a file that is already closed is treated
     * as unlocked.
     *
     * @return boolean
     */
    public function unlock()
    {
        if ($this->_fileHandle !== null ) {
            return flock($this->_fileHandle, LOCK_UN);
        } else {
            return true;
        }
    }
}

View File

@ -0,0 +1,601 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: Memory.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Storage_File */
require_once 'Zend/Search/Lucene/Storage/File.php';
/**
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Storage
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
*/
class Zend_Search_Lucene_Storage_File_Memory extends Zend_Search_Lucene_Storage_File
{
/**
* FileData
*
* @var string
*/
private $_data;
/**
* File Position
*
* @var integer
*/
private $_position = 0;
/**
* Object constractor
*
* @param string $data
*/
public function __construct($data)
{
$this->_data = $data;
}
/**
* Reads $length number of bytes at the current position in the
* file and advances the file pointer.
*
* @param integer $length
* @return string
*/
protected function _fread($length = 1)
{
$returnValue = substr($this->_data, $this->_position, $length);
$this->_position += $length;
return $returnValue;
}
/**
* Sets the file position indicator and advances the file pointer.
* The new position, measured in bytes from the beginning of the file,
* is obtained by adding offset to the position specified by whence,
* whose values are defined as follows:
* SEEK_SET - Set position equal to offset bytes.
* SEEK_CUR - Set position to current location plus offset.
* SEEK_END - Set position to end-of-file plus offset. (To move to
* a position before the end-of-file, you need to pass a negative value
* in offset.)
* Upon success, returns 0; otherwise, returns -1
*
* @param integer $offset
* @param integer $whence
* @return integer
*/
public function seek($offset, $whence=SEEK_SET)
{
switch ($whence) {
case SEEK_SET:
$this->_position = $offset;
break;
case SEEK_CUR:
$this->_position += $offset;
break;
case SEEK_END:
$this->_position = strlen($this->_data);
$this->_position += $offset;
break;
default:
break;
}
}
/**
* Get file position.
*
* @return integer
*/
public function tell()
{
return $this->_position;
}
/**
* Flush output.
*
* Returns true on success or false on failure.
*
* @return boolean
*/
public function flush()
{
// Do nothing
return true;
}
/**
* Writes $length number of bytes (all, if $length===null) to the end
* of the file.
*
* @param string $data
* @param integer $length
*/
protected function _fwrite($data, $length=null)
{
// We do not need to check if file position points to the end of "file".
// Only append operation is supported now
if ($length !== null) {
$this->_data .= substr($data, 0, $length);
} else {
$this->_data .= $data;
}
$this->_position = strlen($this->_data);
}
/**
* Lock file
*
* Lock type may be a LOCK_SH (shared lock) or a LOCK_EX (exclusive lock)
*
* @param integer $lockType
* @return boolean
*/
public function lock($lockType, $nonBlockinLock = false)
{
// Memory files can't be shared
// do nothing
return true;
}
/**
* Unlock file
*/
public function unlock()
{
// Memory files can't be shared
// do nothing
}
/**
* Reads a byte from the current position in the file
* and advances the file pointer.
*
* @return integer
*/
public function readByte()
{
return ord($this->_data[$this->_position++]);
}
/**
* Writes a byte to the end of the file.
*
* @param integer $byte
*/
public function writeByte($byte)
{
// We do not need to check if file position points to the end of "file".
// Only append operation is supported now
$this->_data .= chr($byte);
$this->_position = strlen($this->_data);
return 1;
}
/**
* Read num bytes from the current position in the file
* and advances the file pointer.
*
* @param integer $num
* @return string
*/
public function readBytes($num)
{
$returnValue = substr($this->_data, $this->_position, $num);
$this->_position += $num;
return $returnValue;
}
/**
* Writes num bytes of data (all, if $num===null) to the end
* of the string.
*
* @param string $data
* @param integer $num
*/
public function writeBytes($data, $num=null)
{
// We do not need to check if file position points to the end of "file".
// Only append operation is supported now
if ($num !== null) {
$this->_data .= substr($data, 0, $num);
} else {
$this->_data .= $data;
}
$this->_position = strlen($this->_data);
}
/**
* Reads an integer from the current position in the file
* and advances the file pointer.
*
* @return integer
*/
public function readInt()
{
$str = substr($this->_data, $this->_position, 4);
$this->_position += 4;
return ord($str[0]) << 24 |
ord($str[1]) << 16 |
ord($str[2]) << 8 |
ord($str[3]);
}
/**
* Writes an integer to the end of file.
*
* @param integer $value
*/
public function writeInt($value)
{
// We do not need to check if file position points to the end of "file".
// Only append operation is supported now
settype($value, 'integer');
$this->_data .= chr($value>>24 & 0xFF) .
chr($value>>16 & 0xFF) .
chr($value>>8 & 0xFF) .
chr($value & 0xFF);
$this->_position = strlen($this->_data);
}
/**
* Returns a long integer from the current position in the file
* and advances the file pointer.
*
* @return integer
* @throws Zend_Search_Lucene_Exception
*/
public function readLong()
{
/**
* Check, that we work in 64-bit mode.
* fseek() uses long for offset. Thus, largest index segment file size in 32bit mode is 2Gb
*/
if (PHP_INT_SIZE > 4) {
$str = substr($this->_data, $this->_position, 8);
$this->_position += 8;
return ord($str[0]) << 56 |
ord($str[1]) << 48 |
ord($str[2]) << 40 |
ord($str[3]) << 32 |
ord($str[4]) << 24 |
ord($str[5]) << 16 |
ord($str[6]) << 8 |
ord($str[7]);
} else {
return $this->readLong32Bit();
}
}
/**
* Writes long integer to the end of file
*
* @param integer $value
* @throws Zend_Search_Lucene_Exception
*/
public function writeLong($value)
{
// We do not need to check if file position points to the end of "file".
// Only append operation is supported now
/**
* Check, that we work in 64-bit mode.
* fseek() and ftell() use long for offset. Thus, largest index segment file size in 32bit mode is 2Gb
*/
if (PHP_INT_SIZE > 4) {
settype($value, 'integer');
$this->_data .= chr($value>>56 & 0xFF) .
chr($value>>48 & 0xFF) .
chr($value>>40 & 0xFF) .
chr($value>>32 & 0xFF) .
chr($value>>24 & 0xFF) .
chr($value>>16 & 0xFF) .
chr($value>>8 & 0xFF) .
chr($value & 0xFF);
} else {
$this->writeLong32Bit($value);
}
$this->_position = strlen($this->_data);
}
/**
* Returns a long integer from the current position in the file,
* advances the file pointer and return it as float (for 32-bit platforms).
*
* @return integer|float
* @throws Zend_Search_Lucene_Exception
*/
public function readLong32Bit()
{
$wordHigh = $this->readInt();
$wordLow = $this->readInt();
if ($wordHigh & (int)0x80000000) {
// It's a negative value since the highest bit is set
if ($wordHigh == (int)0xFFFFFFFF && ($wordLow & (int)0x80000000)) {
return $wordLow;
} else {
require_once 'Zend/Search/Lucene/Exception.php';
throw new Zend_Search_Lucene_Exception('Long integers lower than -2147483648 (0x80000000) are not supported on 32-bit platforms.');
}
}
if ($wordLow < 0) {
// Value is large than 0x7FFF FFFF. Represent low word as float.
$wordLow &= 0x7FFFFFFF;
$wordLow += (float)0x80000000;
}
if ($wordHigh == 0) {
// Return value as integer if possible
return $wordLow;
}
return $wordHigh*(float)0x100000000/* 0x00000001 00000000 */ + $wordLow;
}
/**
* Writes long integer to the end of file (32-bit platforms implementation)
*
* @param integer|float $value
* @throws Zend_Search_Lucene_Exception
*/
public function writeLong32Bit($value)
{
if ($value < (int)0x80000000) {
require_once 'Zend/Search/Lucene/Exception.php';
throw new Zend_Search_Lucene_Exception('Long integers lower than -2147483648 (0x80000000) are not supported on 32-bit platforms.');
}
if ($value < 0) {
$wordHigh = (int)0xFFFFFFFF;
$wordLow = (int)$value;
} else {
$wordHigh = (int)($value/(float)0x100000000/* 0x00000001 00000000 */);
$wordLow = $value - $wordHigh*(float)0x100000000/* 0x00000001 00000000 */;
if ($wordLow > 0x7FFFFFFF) {
// Highest bit of low word is set. Translate it to the corresponding negative integer value
$wordLow -= 0x80000000;
$wordLow |= 0x80000000;
}
}
$this->writeInt($wordHigh);
$this->writeInt($wordLow);
}
/**
* Returns a variable-length integer from the current
* position in the file and advances the file pointer.
*
* @return integer
*/
public function readVInt()
{
$nextByte = ord($this->_data[$this->_position++]);
$val = $nextByte & 0x7F;
for ($shift=7; ($nextByte & 0x80) != 0; $shift += 7) {
$nextByte = ord($this->_data[$this->_position++]);
$val |= ($nextByte & 0x7F) << $shift;
}
return $val;
}
/**
* Writes a variable-length integer to the end of file.
*
* @param integer $value
*/
public function writeVInt($value)
{
// We do not need to check if file position points to the end of "file".
// Only append operation is supported now
settype($value, 'integer');
while ($value > 0x7F) {
$this->_data .= chr( ($value & 0x7F)|0x80 );
$value >>= 7;
}
$this->_data .= chr($value);
$this->_position = strlen($this->_data);
}
/**
* Reads a string from the current position in the file
* and advances the file pointer.
*
* @return string
*/
public function readString()
{
$strlen = $this->readVInt();
if ($strlen == 0) {
return '';
} else {
/**
* This implementation supports only Basic Multilingual Plane
* (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support
* "supplementary characters" (characters whose code points are
* greater than 0xFFFF)
* Java 2 represents these characters as a pair of char (16-bit)
* values, the first from the high-surrogates range (0xD800-0xDBFF),
* the second from the low-surrogates range (0xDC00-0xDFFF). Then
* they are encoded as usual UTF-8 characters in six bytes.
* Standard UTF-8 representation uses four bytes for supplementary
* characters.
*/
$str_val = substr($this->_data, $this->_position, $strlen);
$this->_position += $strlen;
for ($count = 0; $count < $strlen; $count++ ) {
if (( ord($str_val[$count]) & 0xC0 ) == 0xC0) {
$addBytes = 1;
if (ord($str_val[$count]) & 0x20 ) {
$addBytes++;
// Never used. Java2 doesn't encode strings in four bytes
if (ord($str_val[$count]) & 0x10 ) {
$addBytes++;
}
}
$str_val .= substr($this->_data, $this->_position, $addBytes);
$this->_position += $addBytes;
$strlen += $addBytes;
// Check for null character. Java2 encodes null character
// in two bytes.
if (ord($str_val[$count]) == 0xC0 &&
ord($str_val[$count+1]) == 0x80 ) {
$str_val[$count] = 0;
$str_val = substr($str_val,0,$count+1)
. substr($str_val,$count+2);
}
$count += $addBytes;
}
}
return $str_val;
}
}
/**
 * Writes a string to the end of file.
 *
 * The string is written as a VInt holding its length in characters,
 * followed by its bytes. Null characters are stored in the Java 2
 * two-byte form \xC0\x80, the same sequence readString() looks for.
 *
 * @param string $str
 * @throws Zend_Search_Lucene_Exception if the computed character count is
 *         negative, which only happens for a malformed UTF-8 string
 */
public function writeString($str)
{
    /**
     * This implementation supports only Basic Multilingual Plane
     * (BMP) characters (from 0x0000 to 0xFFFF) and doesn't support
     * "supplementary characters" (characters whose code points are
     * greater than 0xFFFF)
     * Java 2 represents these characters as a pair of char (16-bit)
     * values, the first from the high-surrogates range (0xD800-0xDBFF),
     * the second from the low-surrogates range (0xDC00-0xDFFF). Then
     * they are encoded as usual UTF-8 characters in six bytes.
     * Standard UTF-8 representation uses four bytes for supplementary
     * characters.
     */

    // We do not need to check if file position points to the end of "file".
    // Only append operation is supported now

    // convert input to a string before iterating string characters
    settype($str, 'string');

    // Start from the byte count and subtract continuation bytes below,
    // so that $chars ends up as the number of characters.
    $chars = $strlen = strlen($str);
    $containNullChars = false;

    for ($count = 0; $count < $strlen; $count++) {
        /**
         * String is already in Java 2 representation.
         * We should only calculate actual string length and replace
         * \x00 by \xC0\x80
         */
        $byte = ord($str[$count]);

        if ($byte == 0) {
            // A null character is a single-byte character, so it has to be
            // detected outside of the multi-byte lead byte branch.
            $containNullChars = true;
        } else if (($byte & 0xC0) == 0xC0) {
            // Lead byte of a multi-byte sequence: count its continuation bytes
            $addBytes = 1;
            if ($byte & 0x20) {
                $addBytes++;

                // Never used. Java2 doesn't encode strings in four bytes
                // and we don't support non-BMP characters
                if ($byte & 0x10) {
                    $addBytes++;
                }
            }
            $chars -= $addBytes;

            // Skip the continuation bytes, they belong to the same character
            $count += $addBytes;
        }
    }

    if ($chars < 0) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Invalid UTF-8 string');
    }

    $this->writeVInt($chars);
    if ($containNullChars) {
        // str_replace() takes (search, replace, subject)
        $this->_data .= str_replace("\x00", "\xC0\x80", $str);
    } else {
        $this->_data .= $str;
    }

    // Only append is supported, so the position is always the end of data
    $this->_position = strlen($this->_data);
}
/**
 * Reads a length-prefixed chunk of binary data.
 *
 * The length is read as a VInt from the current position; that many bytes
 * are then taken from the in-memory data and the position is moved past them.
 *
 * @return string
 */
public function readBinary()
{
    $byteCount = $this->readVInt();

    // readVInt() has already moved the position to the start of the payload
    $payloadStart = $this->_position;
    $this->_position = $payloadStart + $byteCount;

    return substr($this->_data, $payloadStart, $byteCount);
}
}

View File

@ -0,0 +1,176 @@
<?php
/**
* Zend Framework
*
* LICENSE
*
* This source file is subject to the new BSD license that is bundled
* with this package in the file LICENSE.txt.
* It is also available through the world-wide-web at this URL:
* http://framework.zend.com/license/new-bsd
* If you did not receive a copy of the license and are unable to
* obtain it through the world-wide-web, please send an email
* to license@zend.com so we can send you a copy immediately.
*
* @category Zend
* @package Zend_Search_Lucene
* @subpackage Index
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
* @license http://framework.zend.com/license/new-bsd New BSD License
* @version $Id: TermStreamsPriorityQueue.php 20096 2010-01-06 02:05:09Z bkarwin $
*/
/** Zend_Search_Lucene_Index_TermsStream_Interface */
require_once 'Zend/Search/Lucene/Index/TermsStream/Interface.php';
/**
 * Terms stream which combines several terms streams into one.
 *
 * The wrapped streams are kept in a Zend_Search_Lucene_Index_TermsPriorityQueue.
 * A term is reported only when the stream popped from the queue has a term key
 * different from the one of the stream which is next in the queue, so equal
 * keys coming from neighbouring streams are reported once.
 * NOTE(review): this relies on the queue ordering streams by their current
 * term; the ordering is implemented in the queue class - confirm there.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_TermStreamsPriorityQueue implements Zend_Search_Lucene_Index_TermsStream_Interface
{
    /**
     * Array of term streams (Zend_Search_Lucene_Index_TermsStream_Interface objects)
     *
     * @var array
     */
    protected $_termStreams;

    /**
     * Terms stream queue.
     *
     * It is null after closeTermsStream() until resetTermsStream() is called.
     *
     * @var Zend_Search_Lucene_Index_TermsPriorityQueue|null
     */
    protected $_termsStreamQueue = null;

    /**
     * Last Term in a terms stream
     *
     * @var Zend_Search_Lucene_Index_Term|null
     */
    protected $_lastTerm = null;


    /**
     * Object constructor
     *
     * Stores the streams and positions the combined stream on its first term.
     *
     * @param array $termStreams  array of term streams (Zend_Search_Lucene_Index_TermsStream_Interface objects)
     */
    public function __construct(array $termStreams)
    {
        $this->_termStreams = $termStreams;

        $this->resetTermsStream();
    }

    /**
     * Reset terms stream.
     *
     * Resets every wrapped stream, rebuilds the queue from the streams which
     * have a current term and moves to the first term.
     */
    public function resetTermsStream()
    {
        /** Zend_Search_Lucene_Index_TermsPriorityQueue */
        require_once 'Zend/Search/Lucene/Index/TermsPriorityQueue.php';

        $this->_termsStreamQueue = new Zend_Search_Lucene_Index_TermsPriorityQueue();

        foreach ($this->_termStreams as $termStream) {
            $termStream->resetTermsStream();

            // Skip "empty" containers
            if ($termStream->currentTerm() !== null) {
                $this->_termsStreamQueue->put($termStream);
            }
        }

        $this->nextTerm();
    }

    /**
     * Skip terms stream up to specified term prefix.
     *
     * Prefix contains fully specified field info and portion of searched term
     *
     * Does nothing if the stream has been closed.
     *
     * @param Zend_Search_Lucene_Index_Term $prefix
     */
    public function skipTo(Zend_Search_Lucene_Index_Term $prefix)
    {
        if ($this->_termsStreamQueue === null) {
            // Stream is closed, there is nothing to skip
            return;
        }

        // Take all streams out of the queue first: their current terms change
        // during skipTo(), so they have to be inserted again afterwards.
        $termStreams = array();

        while (($termStream = $this->_termsStreamQueue->pop()) !== null) {
            $termStreams[] = $termStream;
        }

        foreach ($termStreams as $termStream) {
            $termStream->skipTo($prefix);

            if ($termStream->currentTerm() !== null) {
                $this->_termsStreamQueue->put($termStream);
            }
        }

        $this->nextTerm();
    }

    /**
     * Scans term streams and returns next term
     *
     * Returns null at the end of the stream or if the stream has been closed.
     *
     * @return Zend_Search_Lucene_Index_Term|null
     */
    public function nextTerm()
    {
        if ($this->_termsStreamQueue === null) {
            // Stream is closed
            $this->_lastTerm = null;

            return null;
        }

        while (($termStream = $this->_termsStreamQueue->pop()) !== null) {
            $nextStream = $this->_termsStreamQueue->top();

            if ($nextStream === null ||
                $nextStream->currentTerm()->key() != $termStream->currentTerm()->key()) {
                // We got new term
                $this->_lastTerm = $termStream->currentTerm();
                $this->_advanceAndRequeue($termStream);

                return $this->_lastTerm;
            }

            // The next stream in the queue holds the same term. Move this
            // stream forward, the term is reported when its last holder
            // is popped.
            $this->_advanceAndRequeue($termStream);
        }

        // End of stream
        $this->_lastTerm = null;

        return null;
    }

    /**
     * Returns term in current position
     *
     * @return Zend_Search_Lucene_Index_Term|null
     */
    public function currentTerm()
    {
        return $this->_lastTerm;
    }

    /**
     * Close terms stream
     *
     * Should be used for resources clean up if stream is not read up to the end
     *
     * It is safe to call this method more than once.
     */
    public function closeTermsStream()
    {
        if ($this->_termsStreamQueue !== null) {
            while (($termStream = $this->_termsStreamQueue->pop()) !== null) {
                $termStream->closeTermsStream();
            }

            $this->_termsStreamQueue = null;
        }

        $this->_lastTerm = null;
    }

    /**
     * Moves a stream to its next term and puts it back into the queue
     * unless the stream has no more terms.
     *
     * @param Zend_Search_Lucene_Index_TermsStream_Interface $termStream
     */
    private function _advanceAndRequeue($termStream)
    {
        if ($termStream->nextTerm() !== null) {
            // Put segment back into the priority queue
            $this->_termsStreamQueue->put($termStream);
        }
    }
}