From 1ad9e4672767b72961cc79881bf060066807afb5 Mon Sep 17 00:00:00 2001 From: Damian Mooyman Date: Wed, 25 Feb 2015 14:44:03 +1300 Subject: [PATCH] API Support tika server --- .travis.yml | 4 +- .travis/install_tika.sh | 12 +- README.md | 115 +++++++++++++------- code/extractors/FileTextExtractor.php | 9 +- code/extractors/TikaServerTextExtractor.php | 104 ++++++++++++++++++ code/extractors/TikaTextExtractor.php | 9 -- code/tika/TikaRestClient.php | 82 ++++++++++++++ composer.json | 2 +- tests/TikaTextExtractorTest.php | 19 +++- 9 files changed, 300 insertions(+), 56 deletions(-) create mode 100644 code/extractors/TikaServerTextExtractor.php create mode 100644 code/tika/TikaRestClient.php diff --git a/.travis.yml b/.travis.yml index 8f43db4..0c07f3f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,8 +9,9 @@ env: - DB=MYSQL CORE_RELEASE=3 before_script: - - mkdir $HOME/bin + - mkdir -p $HOME/bin - export PATH=$PATH:$HOME/bin + - export SS_TIKA_ENDPOINT="http://localhost:9998/" - ./.travis/install_tika.sh - sudo ./.travis/install_pdftotext.sh - git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support @@ -18,4 +19,5 @@ before_script: - cd ~/builds/ss script: + - ($HOME/bin/tika-rest-server &) &> /dev/null - vendor/bin/phpunit --verbose textextraction/tests/ diff --git a/.travis/install_tika.sh b/.travis/install_tika.sh index fc14055..92c61c4 100755 --- a/.travis/install_tika.sh +++ b/.travis/install_tika.sh @@ -1,6 +1,14 @@ #!/usr/bin/env bash -mkdir $HOME/bin -wget 'http://www.us.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar" + +## Install tika app +wget 'https://archive.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar" echo -e '#!/usr/bin/env bash\nexec java -jar $HOME/bin/tika.jar "$@"' >> $HOME/bin/tika chmod ug+x $HOME/bin/tika $HOME/bin/tika --version + + +## Install tika server +wget 'https://archive.apache.org/dist/tika/tika-server-1.7.jar' -O "$HOME/bin/tika-rest-server.jar" +echo -e 
'#!/usr/bin/env bash\nexec java -jar $HOME/bin/tika-rest-server.jar "$@"' >> $HOME/bin/tika-rest-server +chmod ug+x $HOME/bin/tika-rest-server + diff --git a/README.md b/README.md index ee16c8f..a327817 100644 --- a/README.md +++ b/README.md @@ -36,13 +36,13 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil The recommended installation is through [composer](http://getcomposer.org). Add the following to your `composer.json`: - ```js - { - "require": { - "silverstripe/textextraction": "2.0.x-dev" - } +```js +{ + "require": { + "silverstripe/textextraction": "2.0.x-dev" } - ``` +} +``` The module depends on the [Guzzle HTTP Library](http://guzzlephp.org), which is automatically checked out by composer. Alternatively, install Guzzle @@ -57,11 +57,11 @@ No configuration is required for that, unless you want to make the content available through your `DataObject` subclass. In this case, add the following to `mysite/_config/config.yml`: - ```yaml - File: - extensions: - - FileTextExtractable - ``` +```yaml +File: + extensions: + - FileTextExtractable +``` ### XPDF @@ -69,10 +69,10 @@ PDFs require special handling, for example through the [XPDF](http://www.foolabs commandline utility. Follow their installation instructions, its presence will be automatically detected. 
You can optionally set the binary path in `mysite/_config/config.yml`: - ```yml - PDFTextExtractor: - binary_location: /my/path/pdftotext - ``` +```yml +PDFTextExtractor: + binary_location: /my/path/pdftotext +``` ### Apache Solr @@ -86,10 +86,10 @@ in your database driver, or even pass it back to Solr as part of a full index up In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`): - ```yml - SolrCellTextExtractor: - base_url: 'http://localhost:8983/solr/update/extract' - ``` +```yml +SolrCellTextExtractor: + base_url: 'http://localhost:8983/solr/update/extract' +``` Note that in case you're using multiple cores, you'll need to add the core name to the URL (e.g. 'http://localhost:8983/solr/PageSolrIndex/update/extract'). @@ -103,28 +103,28 @@ returns the contents, either by directly accessing `FileTextExtractable->extract or by writing your own method around `FileTextExtractor->getContent()` (see "Usage" below). The property should be listed in your `SolrIndex` subclass, e.g. as follows: - ```php - class MyDocument extends DataObject { - static $db = array('Path' => 'Text'); - function getContent() { - $extractor = FileTextExtractor::for_file($this->Path); - return $extractor ? $extractor->getContent($this->Path) : null; - } +```php +class MyDocument extends DataObject { + static $db = array('Path' => 'Text'); + function getContent() { + $extractor = FileTextExtractor::for_file($this->Path); + return $extractor ? $extractor->getContent($this->Path) : null; } - class MySolrIndex extends SolrIndex { - function init() { - $this->addClass('MyDocument'); - $this->addStoredField('Content', 'HTMLText'); - } +} +class MySolrIndex extends SolrIndex { + function init() { + $this->addClass('MyDocument'); + $this->addStoredField('Content', 'HTMLText'); } - ``` +} +``` Note: This isn't a terribly efficient way to process large amounts of files, since each HTTP request is run synchronously. 
### Tika -Support for Apache Tika (1.7 and above) is included for the standalone command line utility. +Support for Apache Tika (1.7 and above) is included. This can be run in one of two ways: Server or CLI. See [the Apache Tika home page](http://tika.apache.org/1.7/index.html) for instructions on installing and configuring this. @@ -132,15 +132,54 @@ configuring this. This extension will best work with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php) installed to perform mime detection. Tika validates support via mime type rather than file extensions. +### Tika - CLI + +Ensure that your machine has a 'tika' command available which will run the CLI script. + +```bash +#!/bin/bash +exec java -jar /usr/local/Cellar/tika/1.7/libexec/tika-app-1.7.jar "$@" +``` + +### Tika Rest Server + +Tika can also be run as a server. + +You can configure your server endpoint either by the SS_TIKA_ENDPOINT define, or by setting the URL via config. + +```yaml +TikaServerTextExtractor: + server_endpoint: 'http://localhost:9998' +``` + +Alternatively this may be specified via the `SS_TIKA_ENDPOINT` directive in your `_ss_environment.php` file. + + +Then start up your server as below: + +```bash +java -jar tika-server-1.7.jar --host=localhost --port=9998 +``` + ## Usage Manual extraction: - $myFile = '/my/path/myfile.pdf'; - $extractor = FileTextExtractor::for_file($myFile); - $content = $extractor->getContent($myFile); +```php +$myFile = '/my/path/myfile.pdf'; +$extractor = FileTextExtractor::for_file($myFile); +$content = $extractor->getContent($myFile); +``` Extraction with `FileTextExtractable` extension applied: - $myFileObj = File::get()->First(); - $content = $myFileObj->extractFileAsText(); +```php +$myFileObj = File::get()->First(); +$content = $myFileObj->getFileContent(); +``` + +This content can also be embedded directly within a template. 
+ +``` +$MyFile.FileContent +``` diff --git a/code/extractors/FileTextExtractor.php b/code/extractors/FileTextExtractor.php index 576037b..208dd47 100644 --- a/code/extractors/FileTextExtractor.php +++ b/code/extractors/FileTextExtractor.php @@ -35,11 +35,14 @@ abstract class FileTextExtractor extends Object { // Generate the sorted list of extractors on demand. $classes = ClassInfo::subclassesFor("FileTextExtractor"); array_shift($classes); - foreach($classes as $class) $classes[$class] = Config::inst()->get($class, 'priority'); - arsort($classes); + $classPriorities = array(); + foreach($classes as $class) { + $classPriorities[$class] = Config::inst()->get($class, 'priority'); + } + arsort($classPriorities); // Save classes - $sortedClasses = array_keys($classes); + $sortedClasses = array_keys($classPriorities); return self::$sorted_extractor_classes = $sortedClasses; } diff --git a/code/extractors/TikaServerTextExtractor.php b/code/extractors/TikaServerTextExtractor.php new file mode 100644 index 0000000..b4a74e5 --- /dev/null +++ b/code/extractors/TikaServerTextExtractor.php @@ -0,0 +1,104 @@ +client ?: + ($this->client = + Injector::inst()->createWithArgs( + 'TikaRestClient', + array($this->getServerEndpoint()) + ) + ); + } + + public function getServerEndpoint() { + if(defined('SS_TIKA_ENDPOINT')) { + return SS_TIKA_ENDPOINT; + } + + if(getenv('SS_TIKA_ENDPOINT')) return getenv('SS_TIKA_ENDPOINT'); + + // Default to configured endpoint + return $this->config()->server_endpoint; + } + + /** + * Get the version of tika installed, or 0 if not installed + * + * @return float version of tika + */ + public function getVersion() { + return $this + ->getClient() + ->getVersion(); + } + + public function isAvailable() { + return $this->getServerEndpoint() && + $this->getClient()->isAvailable() && + $this->getVersion() >= 1.7; + } + + public function supportsExtension($extension) { + // Determine support via mime type only + return false; + } + + + /** + * Cache of 
supported mime types + * + * @var array + */ + protected $supportedMimes = array(); + + public function supportsMime($mime) { + $supported = $this->supportedMimes ?: + ($this->supportedMimes = $this->getClient()->getSupportedMimes()); + + // Check if supported (most common / quickest lookup) + if(isset($supported[$mime])) return true; + + // Check aliases + foreach($supported as $info) { + if(isset($info['alias']) && in_array($mime, $info['alias'])) return true; + } + + return false; + } + + public function getContent($path) { + return $this->getClient()->tika($path); + } + +} diff --git a/code/extractors/TikaTextExtractor.php b/code/extractors/TikaTextExtractor.php index 4bd3f68..871bcca 100644 --- a/code/extractors/TikaTextExtractor.php +++ b/code/extractors/TikaTextExtractor.php @@ -7,15 +7,6 @@ */ class TikaTextExtractor extends FileTextExtractor { - /** - * Text extraction locale. Use {locale} as a placeholder for the current locale, {default} - * as the placeholder for the default locale - * - * @var string - * @config - */ - private static $locale = '{default}.utf-8'; - /** * Text extraction mode. Defaults to -t (plain text) * diff --git a/code/tika/TikaRestClient.php b/code/tika/TikaRestClient.php new file mode 100644 index 0000000..d5af31b --- /dev/null +++ b/code/tika/TikaRestClient.php @@ -0,0 +1,82 @@ + $url + )); + } + + /** + * Detect if the service is available + * + * @return bool + */ + public function isAvailable() { + try { + return $this + ->get() + ->getStatusCode() == 200; + } catch (RequestException $ex) { + return false; + } + } + + /** + * Get version code + * + * @return float + */ + public function getVersion() { + $response = $this->get('version'); + // Parse output + if($response->getStatusCode() == 200 && + preg_match('/Apache Tika (?[\.\d]+)/', $response->getBody()->getContents(), $matches) + ) { + return (float)$matches['version']; + } + + return 0.0; + } + + protected $mimes = array(); + + /** + * Gets supported mime data. 
May include aliased mime types. + * + * @return array + */ + public function getSupportedMimes() { + if($this->mimes) return $this->mimes; + + $response = $this->get( + 'mime-types', + array( + 'headers' => array("Accept" => "application/json") + ) + ); + return $this->mimes = $response->json(); + } + + /** + * Extract text content from a given file + * + * @param string $file Full filesystem path to a file to post + * @return string Content of the file extracted as plain text + */ + public function tika($file) { + $response = $this->put( + 'tika', + array( + 'body' => file_get_contents($file), + 'headers' => array("Accept" => "text/plain") + ) + ); + return $response->getBody()->getContents(); + } + +} diff --git a/composer.json b/composer.json index b9bac55..7715180 100644 --- a/composer.json +++ b/composer.json @@ -19,7 +19,7 @@ "php": ">=5.3.2", "composer/installers": "*", "silverstripe/framework": "~3.1", - "guzzle/http": "*" + "guzzlehttp/guzzle": "~4.0" }, "require-dev": { "phpunit/PHPUnit": "~3.7@stable" diff --git a/tests/TikaTextExtractorTest.php b/tests/TikaTextExtractorTest.php index 0a7d912..a35c636 100644 --- a/tests/TikaTextExtractorTest.php +++ b/tests/TikaTextExtractorTest.php @@ -7,7 +7,7 @@ class TikaTextExtractorTest extends SapphireTest { function testExtraction() { $extractor = new TikaTextExtractor(); - if(!$extractor->isAvailable()) $this->markTestSkipped('tika not available'); + if(!$extractor->isAvailable()) $this->markTestSkipped('tika cli not available'); // Check file $file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'; @@ -20,4 +20,19 @@ class TikaTextExtractorTest extends SapphireTest { $this->assertFalse($extractor->supportsMime('application/not-supported')); } -} \ No newline at end of file + function testServerExtraction() { + $extractor = new TikaServerTextExtractor(); + if(!$extractor->isAvailable()) $this->markTestSkipped('tika server not available'); + + // Check file + $file = Director::baseFolder() . 
'/textextraction/tests/fixtures/test1.pdf'; + $content = $extractor->getContent($file); + $this->assertContains('This is a test file with a link', $content); + + // Check mime validation + $this->assertTrue($extractor->supportsMime('application/pdf')); + $this->assertTrue($extractor->supportsMime('text/html')); + $this->assertFalse($extractor->supportsMime('application/not-supported')); + } + +}