Merge pull request #5 from tractorcow/pulls/tika-support

API Support tika server
2024-10-22 11:06:00 +02:00 · 2015-02-26 22:50:36 +13:00 · 2015-02-26 22:50:36 +13:00 · c813d234f0
commit c813d234f0
parent 23d83b7d01 1ad9e46727
9 changed files with 300 additions and 56 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -9,8 +9,9 @@ env:
  - DB=MYSQL CORE_RELEASE=3

 before_script:
-  - mkdir $HOME/bin
+  - mkdir -p $HOME/bin
  - export PATH=$PATH:$HOME/bin
+  - export SS_TIKA_ENDPOINT="http://localhost:9998/"
  - ./.travis/install_tika.sh
  - sudo ./.travis/install_pdftotext.sh
  - git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support
@ -18,4 +19,5 @@ before_script:
  - cd ~/builds/ss

 script:
+  - ($HOME/bin/tika-rest-server &) &> /dev/null
  - vendor/bin/phpunit --verbose textextraction/tests/
--- a/.travis/install_tika.sh
+++ b/.travis/install_tika.sh
@ -1,6 +1,14 @@
 #!/usr/bin/env bash
-mkdir $HOME/bin
-wget 'http://www.us.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar"
+
+## Install tika app
+wget 'https://archive.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar"
 echo -e '#!/usr/bin/env bash\nexec java -jar $HOME/bin/tika.jar "$@"' >> $HOME/bin/tika
 chmod ug+x $HOME/bin/tika
 $HOME/bin/tika --version
+
+
+## Install tika server
+wget 'https://archive.apache.org/dist/tika/tika-server-1.7.jar' -O "$HOME/bin/tika-rest-server.jar"
+echo -e '#!/usr/bin/env bash\nexec java -jar $HOME/bin/tika-rest-server.jar "$@"' >> $HOME/bin/tika-rest-server
+chmod ug+x $HOME/bin/tika-rest-server
+
--- a/README.md
+++ b/README.md
@ -36,13 +36,13 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
 The recommended installation is through [composer](http://getcomposer.org).
 Add the following to your `composer.json`:

-	```js
-	{
-		"require": {
-			"silverstripe/textextraction": "2.0.x-dev"
-		}
+```js
+{
+	"require": {
+		"silverstripe/textextraction": "2.0.x-dev"
 	}
-	```
+}
+```

 The module depends on the [Guzzle HTTP Library](http://guzzlephp.org),
 which is automatically checked out by composer. Alternatively, install Guzzle
@ -57,11 +57,11 @@ No configuration is required for that, unless you want to make
 the content available through your `DataObject` subclass.
 In this case, add the following to `mysite/_config/config.yml`:

-	```yaml
-	File:
-	  extensions:
-	    - FileTextExtractable
-	```
+```yaml
+File:
+  extensions:
+    - FileTextExtractable
+```

 ### XPDF

@ -69,10 +69,10 @@ PDFs require special handling, for example through the [XPDF](http://www.foolabs
 commandline utility. Follow their installation instructions, its presence will be automatically
 detected. You can optionally set the binary path in `mysite/_config/config.yml`:

-	```yml
-	PDFTextExtractor:
-		binary_location: /my/path/pdftotext
-	```
+```yml
+PDFTextExtractor:
+  binary_location: /my/path/pdftotext
+```

 ### Apache Solr

@ -86,10 +86,10 @@ in your database driver, or even pass it back to Solr as part of a full index up

 In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`):

-	```yml
-	SolrCellTextExtractor:
-		base_url: 'http://localhost:8983/solr/update/extract'
-	```
+```yml
+SolrCellTextExtractor:
+  base_url: 'http://localhost:8983/solr/update/extract'
+```

 Note that in case you're using multiple cores, you'll need to add the core name to the URL 
 (e.g. 'http://localhost:8983/solr/PageSolrIndex/update/extract').
@ -103,28 +103,28 @@ returns the contents, either by directly accessing `FileTextExtractable->extract
 or by writing your own method around `FileTextExtractor->getContent()` (see "Usage" below).
 The property should be listed in your `SolrIndex` subclass, e.g. as follows:

-	```php
-	class MyDocument extends DataObject {
-		static $db = array('Path' => 'Text');
-		function getContent() {
-			$extractor = FileTextExtractor::for_file($this->Path);
-			return $extractor ? $extractor->getContent($this->Path) : null;		
-		}
+```php
+class MyDocument extends DataObject {
+	static $db = array('Path' => 'Text');
+	function getContent() {
+		$extractor = FileTextExtractor::for_file($this->Path);
+		return $extractor ? $extractor->getContent($this->Path) : null;		
 	}
-	class MySolrIndex extends SolrIndex {
-		function init() {
-			$this->addClass('MyDocument');
-			$this->addStoredField('Content', 'HTMLText');
-		}
+}
+class MySolrIndex extends SolrIndex {
+	function init() {
+		$this->addClass('MyDocument');
+		$this->addStoredField('Content', 'HTMLText');
 	}
-	```
+}
+```

 Note: This isn't a terribly efficient way to process large amounts of files, since 
 each HTTP request is run synchronously.

 ### Tika

-Support for Apache Tika (1.7 and above) is included for the standalone command line utility.
+Support for Apache Tika (1.7 and above) is included. This can be run in one of two ways: Server or CLI.

 See [the Apache Tika home page](http://tika.apache.org/1.7/index.html) for instructions on installing and
 configuring this.
@ -132,15 +132,54 @@ configuring this.
 This extension will best work with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php)
 installed to perform mime detection. Tika validates support via mime type rather than file extensions.

+### Tika - CLI
+
+Ensure that your machine has a 'tika' command available which will run the CLI script.
+
+```bash
+#!/bin/bash
+exec java -jar /usr/local/Cellar/tika/1.7/libexec/tika-app-1.7.jar "$@"
+```
+
+### Tika Rest Server
+
+Tika can also be run as a server.
+
+You can configure your server endpoint either via the `SS_TIKA_ENDPOINT` define, or by setting the URL via config.
+
+```yaml
+TikaServerTextExtractor:
+  server_endpoint: 'http://localhost:9998'
+```
+
+Alternatively this may be specified via the `SS_TIKA_ENDPOINT` directive in your `_ss_environment.php` file.
+
+
+Then start up your server as shown below:
+
+```bash
+java -jar tika-server-1.7.jar --host=localhost --port=9998
+```
+
 ## Usage

 Manual extraction:

-	$myFile = '/my/path/myfile.pdf';
-	$extractor = FileTextExtractor::for_file($myFile);
-	$content = $extractor->getContent($myFile);
+```php
+$myFile = '/my/path/myfile.pdf';
+$extractor = FileTextExtractor::for_file($myFile);
+$content = $extractor->getContent($myFile);
+```

 Extraction with `FileTextExtractable` extension applied:

-	$myFileObj = File::get()->First();
-	$content = $myFileObj->extractFileAsText();
+```php
+$myFileObj = File::get()->First();
+$content = $myFileObj->getFileContent();
+```
+
+This content can also be embedded directly within a template.
+
+```
+$MyFile.FileContent
+```
--- a/code/extractors/FileTextExtractor.php
+++ b/code/extractors/FileTextExtractor.php
@ -35,11 +35,14 @@ abstract class FileTextExtractor extends Object {
 		// Generate the sorted list of extractors on demand.
 		$classes = ClassInfo::subclassesFor("FileTextExtractor");
 		array_shift($classes);
-		foreach($classes as $class) $classes[$class] = Config::inst()->get($class, 'priority');
-		arsort($classes);
+		$classPriorities = array();
+		foreach($classes as $class) {
+			$classPriorities[$class] = Config::inst()->get($class, 'priority');
+		}
+		arsort($classPriorities);

 		// Save classes
-		$sortedClasses = array_keys($classes);
+		$sortedClasses = array_keys($classPriorities);
 		return self::$sorted_extractor_classes = $sortedClasses;
 	}

--- a/code/extractors/TikaServerTextExtractor.php
+++ b/code/extractors/TikaServerTextExtractor.php
@ -0,0 +1,104 @@
+<?php
+
+/**
+ * Text extractor that delegates all work to a running Tika REST server
+ * through a {@see TikaRestClient}.
+ *
+ * {@link http://tika.apache.org/1.7/gettingstarted.html}
+ */
+class TikaServerTextExtractor extends FileTextExtractor {
+
+	/**
+	 * Priority of this extractor relative to other extractors.
+	 * Set fairly high because the Tika server is pretty efficient and
+	 * should be used straight away whenever it is available.
+	 *
+	 * @var integer
+	 * @config
+	 */
+	private static $priority = 80;
+
+	/**
+	 * URL of the Tika server, used when no SS_TIKA_ENDPOINT constant or
+	 * environment variable is present.
+	 *
+	 * @var string
+	 * @config
+	 */
+	private static $server_endpoint;
+
+	/**
+	 * Lazily created REST client
+	 *
+	 * @var TikaRestClient
+	 */
+	protected $client = null;
+
+	/**
+	 * Supported mime type data as returned by the client, kept after the
+	 * first successful lookup
+	 *
+	 * @var array
+	 */
+	protected $supportedMimes = array();
+
+	/**
+	 * Returns the REST client, building it through the Injector on first use.
+	 *
+	 * @return TikaRestClient
+	 */
+	public function getClient() {
+		if(!$this->client) {
+			$this->client = Injector::inst()->createWithArgs(
+				'TikaRestClient',
+				array($this->getServerEndpoint())
+			);
+		}
+
+		return $this->client;
+	}
+
+	/**
+	 * Resolves the server URL. Lookup order: the SS_TIKA_ENDPOINT constant,
+	 * then the SS_TIKA_ENDPOINT environment variable, then the
+	 * `server_endpoint` config value.
+	 *
+	 * @return string
+	 */
+	public function getServerEndpoint() {
+		// A define takes precedence over everything else
+		if(defined('SS_TIKA_ENDPOINT')) return SS_TIKA_ENDPOINT;
+
+		// Next preference is the process environment
+		$fromEnvironment = getenv('SS_TIKA_ENDPOINT');
+		if($fromEnvironment) {
+			return $fromEnvironment;
+		}
+
+		// Otherwise fall back to the configured endpoint
+		return $this->config()->server_endpoint;
+	}
+
+	/**
+	 * Version reported by the Tika server, as provided by the REST client
+	 *
+	 * @return float version of tika
+	 */
+	public function getVersion() {
+		$client = $this->getClient();
+		return $client->getVersion();
+	}
+
+	/**
+	 * This extractor is usable only when an endpoint is set, the server
+	 * responds, and the server reports version 1.7 or later.
+	 *
+	 * @return bool
+	 */
+	public function isAvailable() {
+		if(!$this->getServerEndpoint()) {
+			return false;
+		}
+
+		if(!$this->getClient()->isAvailable()) {
+			return false;
+		}
+
+		return $this->getVersion() >= 1.7;
+	}
+
+	/**
+	 * File extensions are never used to decide support; see supportsMime().
+	 *
+	 * @param string $extension
+	 * @return bool Always false
+	 */
+	public function supportsExtension($extension) {
+		return false;
+	}
+
+	/**
+	 * Checks whether the server lists the given mime type, either as a
+	 * primary type or as one of a type's aliases.
+	 *
+	 * @param string $mime
+	 * @return bool
+	 */
+	public function supportsMime($mime) {
+		// Fetch the mime list from the server the first time it is needed
+		if(!$this->supportedMimes) {
+			$this->supportedMimes = $this->getClient()->getSupportedMimes();
+		}
+		$known = $this->supportedMimes;
+
+		// Direct key match is the common and cheapest case
+		if(isset($known[$mime])) {
+			return true;
+		}
+
+		// Otherwise look through each type's alias list
+		foreach($known as $details) {
+			if(!isset($details['alias'])) continue;
+			if(in_array($mime, $details['alias'])) return true;
+		}
+
+		return false;
+	}
+
+	/**
+	 * Sends the file at the given path to the server and returns the
+	 * extracted text.
+	 *
+	 * @param string $path
+	 * @return string
+	 */
+	public function getContent($path) {
+		$client = $this->getClient();
+		return $client->tika($path);
+	}
+
+}
--- a/code/extractors/TikaTextExtractor.php
+++ b/code/extractors/TikaTextExtractor.php
@ -7,15 +7,6 @@
 */
 class TikaTextExtractor extends FileTextExtractor {

-	/**
-	 * Text extraction locale. Use {locale} as a placeholder for the current locale, {default}
-	 * as the placeholder for the default locale
-	 *
-	 * @var string
-	 * @config
-	 */
-	private static $locale = '{default}.utf-8';
-
 	/**
 	 * Text extraction mode. Defaults to -t (plain text)
 	 *
--- a/code/tika/TikaRestClient.php
+++ b/code/tika/TikaRestClient.php
@ -0,0 +1,82 @@
+<?php
+
+use GuzzleHttp\Client;
+use GuzzleHttp\Exception\RequestException;
+
+/**
+ * Minimal HTTP client for the Tika REST server, built on the Guzzle client.
+ */
+class TikaRestClient extends Client {
+
+	/**
+	 * Cached result of the `mime-types` request; empty until first fetched
+	 *
+	 * @var array
+	 */
+	protected $mimes = array();
+
+	/**
+	 * @param string $url Base URL of the Tika server (passed to Guzzle as `base_url`)
+	 */
+	public function __construct($url) {
+		parent::__construct(array(
+			'base_url' => $url
+		));
+	}
+
+	/**
+	 * Detect if the service is available
+	 *
+	 * Issues a GET without a path and treats anything other than a 200
+	 * response, or any request failure, as "not available".
+	 *
+	 * @return bool
+	 */
+	public function isAvailable() {
+		try {
+			return $this
+				->get()
+				->getStatusCode() == 200;
+		} catch (RequestException $ex) {
+			return false;
+		}
+	}
+
+	/**
+	 * Get version code
+	 *
+	 * Returns 0.0 when the `version` request fails or the response does not
+	 * contain an "Apache Tika x.y" string, so callers probing for availability
+	 * get a comparable number instead of an exception.
+	 *
+	 * NOTE(review): the version is cast to float, so only the leading
+	 * "major.minor" survives and it is read as a decimal ('1.10' becomes 1.1).
+	 *
+	 * @return float
+	 */
+	public function getVersion() {
+		try {
+			$response = $this->get('version');
+		} catch (RequestException $ex) {
+			// Handled the same way as isAvailable(): a failed request means
+			// no usable version could be determined.
+			return 0.0;
+		}
+
+		// Parse output
+		if($response->getStatusCode() == 200 &&
+			preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody()->getContents(), $matches)
+		) {
+			return (float)$matches['version'];
+		}
+
+		return 0.0;
+	}
+
+	/**
+	 * Gets supported mime data. May include aliased mime types.
+	 *
+	 * The decoded JSON response of the `mime-types` resource is cached on
+	 * this instance; a non-empty cached value is returned without a new request.
+	 *
+	 * @return array
+	 */
+	public function getSupportedMimes() {
+		if($this->mimes) return $this->mimes;
+
+		$response = $this->get(
+			'mime-types',
+			array(
+				'headers' => array("Accept" => "application/json")
+			)
+		);
+		return $this->mimes = $response->json();
+	}
+
+	/**
+	 * Extract text content from a given file
+	 *
+	 * The whole file is read into memory and sent as the body of a PUT to
+	 * the `tika` resource, asking for a plain text response.
+	 *
+	 * NOTE(review): the result of file_get_contents() is not checked, so an
+	 * unreadable path is sent as an empty body - confirm whether callers
+	 * always validate the path first.
+	 *
+	 * @param string $file Full filesystem path to a file to post
+	 * @return string Content of the file extracted as plain text
+	 */
+	public function tika($file) {
+		$response = $this->put(
+			'tika',
+			array(
+				'body' => file_get_contents($file),
+				'headers' => array("Accept" => "text/plain")
+			)
+		);
+		return $response->getBody()->getContents();
+	}
+
+}
--- a/composer.json
+++ b/composer.json
@ -19,7 +19,7 @@
 		"php": ">=5.3.2",
 		"composer/installers": "*",
 		"silverstripe/framework": "~3.1",
-		"guzzle/http": "*"
+        "guzzlehttp/guzzle": "~4.0"
 	},
 	"require-dev": {
 		"phpunit/PHPUnit": "~3.7@stable"
--- a/tests/TikaTextExtractorTest.php
+++ b/tests/TikaTextExtractorTest.php
@ -7,7 +7,7 @@ class TikaTextExtractorTest extends SapphireTest {
 	
 	function testExtraction() {
 		$extractor = new TikaTextExtractor();
-		if(!$extractor->isAvailable()) $this->markTestSkipped('tika not available');
+		if(!$extractor->isAvailable()) $this->markTestSkipped('tika cli not available');

 		// Check file
 		$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
@ -20,4 +20,19 @@ class TikaTextExtractorTest extends SapphireTest {
 		$this->assertFalse($extractor->supportsMime('application/not-supported'));
 	}

-}
+	/**
+	 * Runs extraction and mime checks against a Tika server; the test is
+	 * skipped when the server extractor reports itself unavailable.
+	 */
+	function testServerExtraction() {
+		$server = new TikaServerTextExtractor();
+		if(!$server->isAvailable()) {
+			$this->markTestSkipped('tika server not available');
+		}
+
+		// Text extracted from the PDF fixture must contain the known sentence
+		$fixture = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
+		$extracted = $server->getContent($fixture);
+		$this->assertContains('This is a test file with a link', $extracted);
+
+		// Known mime types are accepted, unknown ones rejected
+		$this->assertTrue($server->supportsMime('application/pdf'));
+		$this->assertTrue($server->supportsMime('text/html'));
+		$this->assertFalse($server->supportsMime('application/not-supported'));
+	}
+
+}