NEW SolrCellTextExtractor

2024-10-22 11:06:00 +02:00 · 2013-02-01 15:35:16 +01:00 · 2013-02-01 15:35:16 +01:00 · 9af389f51b
commit 9af389f51b
parent 14816075b8
4 changed files with 134 additions and 2 deletions
--- a/README.md
+++ b/README.md
@ -15,15 +15,80 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil

 * SilverStripe 3.0
 * (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
+ * (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
+
+### Supported Formats
+
+ * HTML (built-in)
+ * PDF (with XPDF or Solr)
+ * Microsoft Word, Excel, Powerpoint (Solr)
+ * OpenOffice (Solr)
+ * CSV (Solr)
+ * RTF (Solr)
+ * EPub (Solr)
+
+## Installation
+
+The recommended installation is through [composer](http://getcomposer.org).
+Add the following to your `composer.json`:
+
+	:::js
+	{
+		"require": {
+			"silverstripe/textextraction": "*"
+		}
+	}
+
+The module depends on the [Guzzle HTTP Library](http://guzzlephp.org),
+which is automatically checked out by composer. Alternatively, install Guzzle
+through PEAR and ensure it's in your `include_path`.

 ## Configuration

-No configuration is required, unless you want to make
+### Basic
+
+By default, only extraction from HTML documents is supported.
+No configuration is required for that, unless you want to make
 the content available through your `DataObject` subclass.
 In this case, add the following to `mysite/_config.php`:

 	DataObject::add_extension('File', 'FileTextExtractable');

+### XPDF
+
+PDFs require special handling, for example through the [XPDF](http://www.foolabs.com/xpdf/)
+commandline utility. Follow their installation instructions; its presence will be automatically
+detected. You can optionally set the binary path in `mysite/_config/config.yml`:
+
+	:::yml
+	PDFTextExtractor:
+		binary_location: /my/path/pdftotext
+
+### Apache Solr
+
+Apache Solr is a fulltext search engine, an aspect which is often used
+alongside this module. But more importantly for us, it has bindings to [Apache Tika](http://tika.apache.org/)
+through the [ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) interface.
+This allows Solr to inspect the contents of various file formats, such as Office documents and PDF files.
+The textextraction module retrieves the output of this service, rather than altering the index.
+With the raw text output, you can decide to store it in a database column for fulltext search
+in your database driver, or even pass it back to Solr as part of a full index update.
+
+In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`):
+
+	SolrCellTextExtractor:
+		base_url: 'http://localhost:8983/solr/update/extract'
+
+Note that in case you're using multiple cores, you'll need to add the core name to the URL 
+(e.g. 'http://localhost:8983/solr/PageSolrIndex/update/extract').
+The ["fulltext" module](https://github.com/silverstripe-labs/silverstripe-fulltextsearch)
+uses multiple cores by default, and comes prepackaged with a Solr server.
+It's a stripped-down version of Solr; follow the module README on how to add
+Apache Tika text extraction capabilities.
+
+Note: This isn't a terribly efficient way to process large amounts of files, since 
+each HTTP request is run synchronously.
+
 ## Usage

 Manual extraction:
--- a/_config/config.yml
+++ b/_config/config.yml
@ -0,0 +1,2 @@
+SolrCellTextExtractor:
+#  base_url: 'http://localhost:8983/solr/update/extract'
--- a/code/extractors/SolrCellTextExtractor.php
+++ b/code/extractors/SolrCellTextExtractor.php
@ -0,0 +1,64 @@
+<?php
+use Guzzle\Http\Client;
+
+/**
+ * Text extractor that calls an Apache Solr instance
+ * and extracts content via the "ExtractingRequestHandler" endpoint.
+ * Does not alter the Solr index itself, but uses it purely
+ * for its file parsing abilities.
+ *
+ * The endpoint URL is read from the "base_url" config value of this class.
+ *
+ * @author ischommer
+ * @see  http://wiki.apache.org/solr/ExtractingRequestHandler
+ */
+class SolrCellTextExtractor extends FileTextExtractor {
+
+	/**
+	 * URL of the Solr extract endpoint. Read through $this->config()->get('base_url').
+	 *
+	 * @var String
+	 */
+	public static $base_url;
+
+	/**
+	 * NOTE(review): presumably used by the parent class to rank this extractor
+	 * against others - confirm against FileTextExtractor.
+	 *
+	 * @var Int
+	 */
+	public static $priority = 75;
+
+	/**
+	 * Lazily created HTTP client, or one injected through setHttpClient().
+	 *
+	 * @var Client
+	 */
+	protected $httpClient;
+
+	/**
+	 * Returns the HTTP client used to talk to Solr, creating one
+	 * for the configured "base_url" on first use.
+	 *
+	 * @return Client
+	 */
+	public function getHttpClient() {
+		if(!$this->httpClient) $this->httpClient = new Client($this->config()->get('base_url'));
+		return $this->httpClient;
+	}
+
+	/**
+	 * Replaces the HTTP client, e.g. to inject a preconfigured or stubbed one.
+	 *
+	 * @param Client $client
+	 */
+	public function setHttpClient($client) {
+		$this->httpClient = $client;
+	}
+
+	/**
+	 * Reports whether this extractor is configured.
+	 * Only checks that a "base_url" is set; it does not contact the server.
+	 *
+	 * @return Boolean
+	 */
+	public function isAvailable() {
+		// Always return an explicit boolean: previously a configured URL
+		// fell through without a return statement and yielded NULL.
+		return (bool)$this->config()->get('base_url');
+	}
+
+	/**
+	 * @see  http://tika.apache.org/1.3/formats.html
+	 * @return Array
+	 */
+	public function supportedExtensions() {
+		return array(
+			'pdf', 'doc', 'docx', 'xls', 'xlsx',
+			'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
+			'ppt', 'pptx', 'odp', 'fodp', 'csv'
+		);
+	}
+
+	/**
+	 * Posts the file to the extract endpoint and returns the text found
+	 * in the response node named after the file.
+	 *
+	 * @param String $path Path of the file to extract text from
+	 * @return String|null Empty string if no path was given, the extracted text
+	 *                     on success, or NULL if the response did not contain
+	 *                     a node for this file.
+	 */
+	public function getContent($path) {
+		if (!$path) return ""; // no file
+
+		$fileName = basename($path);
+		$request = $this->getHttpClient()
+			->post('?extractOnly=true&extractFormat=text')
+			->addPostFiles(array('myfile' => $path));
+		$response = $request->send();
+
+		// Use preg match to avoid SimpleXML running out of memory on large text nodes.
+		// The capture is non-greedy so it ends at the first closing tag after the
+		// file's node, instead of swallowing everything up to the last "</str>"
+		// in the response body. The file name is quoted with the pattern delimiter.
+		$pattern = sprintf(
+			'/\<str name\="%s"\>(.*?)\<\/str\>/s',
+			preg_quote($fileName, '/')
+		);
+		$matches = array();
+		preg_match($pattern, (string)$response->getBody(), $matches);
+
+		return $matches ? $matches[1] : null;
+	}
+}
--- a/composer.json
+++ b/composer.json
@ -18,6 +18,7 @@
 	"require": {
 		"php": ">=5.3.2",
 		"composer/installers": "*",
-		"silverstripe/framework": "~3.0"
+		"silverstripe/framework": "~3.0",
+		"guzzle/http": "*"
 	}
 }