mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 11:06:00 +02:00
NEW SolrCellTextExtractor
This commit is contained in:
parent
14816075b8
commit
9af389f51b
67
README.md
67
README.md
@ -15,15 +15,80 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
|
||||
|
||||
* SilverStripe 3.0
|
||||
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
|
||||
* (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
|
||||
|
||||
### Supported Formats
|
||||
|
||||
* HTML (built-in)
|
||||
* PDF (with XPDF or Solr)
|
||||
* Microsoft Word, Excel, Powerpoint (Solr)
|
||||
* OpenOffice (Solr)
|
||||
* CSV (Solr)
|
||||
* RTF (Solr)
|
||||
* EPub (Solr)
|
||||
|
||||
## Installation
|
||||
|
||||
The recommended installation is through [composer](http://getcomposer.org).
|
||||
Add the following to your `composer.json`:
|
||||
|
||||
:::js
|
||||
{
|
||||
"require": {
|
||||
"silverstripe/textextraction": "*"
|
||||
}
|
||||
}
|
||||
|
||||
The module depends on the [Guzzle HTTP Library](http://guzzlephp.org),
|
||||
which is automatically checked out by composer. Alternatively, install Guzzle
|
||||
through PEAR and ensure it's in your `include_path`.
|
||||
|
||||
## Configuration
|
||||
|
||||
No configuration is required, unless you want to make
|
||||
### Basic
|
||||
|
||||
By default, only extraction from HTML documents is supported.
|
||||
No configuration is required for that, unless you want to make
|
||||
the content available through your `DataObject` subclass.
|
||||
In this case, add the following to `mysite/_config.php`:
|
||||
|
||||
DataObject::add_extension('File', 'FileTextExtractable');
|
||||
|
||||
### XPDF
|
||||
|
||||
PDFs require special handling, for example through the [XPDF](http://www.foolabs.com/xpdf/)
|
||||
command-line utility. Follow its installation instructions; its presence will be automatically
|
||||
detected. You can optionally set the binary path in `mysite/_config/config.yml`:
|
||||
|
||||
:::yml
|
||||
PDFTextExtractor:
|
||||
binary_location: /my/path/pdftotext
|
||||
|
||||
### Apache Solr
|
||||
|
||||
Apache Solr is a fulltext search engine, an aspect which is often used
|
||||
alongside this module. But more importantly for us, it has bindings to [Apache Tika](http://tika.apache.org/)
|
||||
through the [ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) interface.
|
||||
This allows Solr to inspect the contents of various file formats, such as Office documents and PDF files.
|
||||
The textextraction module retrieves the output of this service, rather than altering the index.
|
||||
With the raw text output, you can decide to store it in a database column for fulltext search
|
||||
in your database driver, or even pass it back to Solr as part of a full index update.
|
||||
|
||||
In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`):
|
||||
|
||||
SolrCellTextExtractor:
|
||||
base_url: 'http://localhost:8983/solr/update/extract'
|
||||
|
||||
Note that in case you're using multiple cores, you'll need to add the core name to the URL
|
||||
(e.g. 'http://localhost:8983/solr/PageSolrIndex/update/extract').
|
||||
The ["fulltext" module](https://github.com/silverstripe-labs/silverstripe-fulltextsearch)
|
||||
uses multiple cores by default, and comes prepackaged with a Solr server.
|
||||
It's a stripped-down version of Solr; follow the module README on how to add
|
||||
Apache Tika text extraction capabilities.
|
||||
|
||||
Note: This isn't a terribly efficient way to process large amounts of files, since
|
||||
each HTTP request is run synchronously.
|
||||
|
||||
## Usage
|
||||
|
||||
Manual extraction:
|
||||
|
2
_config/config.yml
Normal file
2
_config/config.yml
Normal file
@ -0,0 +1,2 @@
|
||||
SolrCellTextExtractor:
|
||||
# base_url: 'http://localhost:8983/solr/update/extract'
|
64
code/extractors/SolrCellTextExtractor.php
Normal file
64
code/extractors/SolrCellTextExtractor.php
Normal file
@ -0,0 +1,64 @@
|
||||
<?php
|
||||
use Guzzle\Http\Client;
|
||||
|
||||
/**
 * Text extractor that calls an Apache Solr instance
 * and extracts content via the "ExtractingRequestHandler" endpoint.
 * Does not alter the Solr index itself, but uses it purely
 * for its file parsing abilities.
 *
 * The endpoint is read from the "base_url" config value of this class,
 * e.g. 'http://localhost:8983/solr/update/extract'.
 *
 * @author ischommer
 * @see http://wiki.apache.org/solr/ExtractingRequestHandler
 */
class SolrCellTextExtractor extends FileTextExtractor {

	/**
	 * URL of the Solr "extract" handler. The extractor reports itself
	 * as unavailable while this is empty.
	 *
	 * @var String
	 */
	public static $base_url;

	/**
	 * NOTE(review): presumably used by the parent class to rank this
	 * extractor against others handling the same extension - confirm.
	 *
	 * @var Int
	 */
	public static $priority = 75;

	/**
	 * Lazily created HTTP client, or one injected via {@link setHttpClient()}.
	 *
	 * @var Client
	 */
	protected $httpClient;

	/**
	 * Returns the HTTP client used to talk to Solr. If none has been set,
	 * one is created on first use with the configured "base_url".
	 *
	 * @return Client
	 */
	public function getHttpClient() {
		if(!$this->httpClient) {
			$this->httpClient = new Client($this->config()->get('base_url'));
		}

		return $this->httpClient;
	}

	/**
	 * Replaces the HTTP client, e.g. to inject a stub.
	 *
	 * @param Client $client
	 */
	public function setHttpClient($client) {
		$this->httpClient = $client;
	}

	/**
	 * The extractor is usable as soon as a "base_url" is configured.
	 * No request is made to check that the URL is actually reachable.
	 *
	 * @return Boolean
	 */
	public function isAvailable() {
		// Always return a boolean: previously a configured URL fell through
		// and returned null, so the extractor never reported itself available.
		return (bool)$this->config()->get('base_url');
	}

	/**
	 * @see http://tika.apache.org/1.3/formats.html
	 * @return Array
	 */
	public function supportedExtensions() {
		return array(
			'pdf', 'doc', 'docx', 'xls', 'xlsx',
			'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
			'ppt', 'pptx', 'odp', 'fodp', 'csv'
		);
	}

	/**
	 * Posts the file to Solr with extractOnly=true and returns the text found
	 * in the response element named after the file.
	 *
	 * Errors raised by the HTTP client while sending are not caught here.
	 *
	 * @param String $path Path of the file to upload
	 * @return String|null Empty string if no path was given, null if the
	 *  response contains no element for the file, otherwise the matched text
	 *  exactly as it appears in the response body.
	 */
	public function getContent($path) {
		if (!$path) return ""; // no file

		$fileName = basename($path);
		$client = $this->getHttpClient();
		$request = $client
			->post('?extractOnly=true&extractFormat=text')
			->addPostFiles(array('myfile' => $path));
		$response = $request->send();

		// Use preg match to avoid SimpleXML running out of memory on large text nodes.
		// The capture is non-greedy so it stops at the first closing tag instead of
		// swallowing any further <str> elements that follow in the response.
		// The delimiter is passed to preg_quote() so a "/" could never end the pattern early.
		preg_match(
			sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName, '/')),
			(string)$response->getBody(),
			$matches
		);

		return $matches ? $matches[1] : null;
	}

}
|
@ -18,6 +18,7 @@
|
||||
"require": {
|
||||
"php": ">=5.3.2",
|
||||
"composer/installers": "*",
|
||||
"silverstripe/framework": "~3.0"
|
||||
"silverstripe/framework": "~3.0",
|
||||
"guzzle/http": "*"
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user