Merge pull request #5 from tractorcow/pulls/tika-support

API Support tika server
This commit is contained in:
Ingo Schommer 2015-02-26 22:50:36 +13:00
commit c813d234f0
9 changed files with 300 additions and 56 deletions

View File

@ -9,8 +9,9 @@ env:
- DB=MYSQL CORE_RELEASE=3
before_script:
- mkdir $HOME/bin
- mkdir -p $HOME/bin
- export PATH=$PATH:$HOME/bin
- export SS_TIKA_ENDPOINT="http://localhost:9998/"
- ./.travis/install_tika.sh
- sudo ./.travis/install_pdftotext.sh
- git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support
@ -18,4 +19,5 @@ before_script:
- cd ~/builds/ss
script:
- ($HOME/bin/tika-rest-server &) &> /dev/null
- vendor/bin/phpunit --verbose textextraction/tests/

View File

@ -1,6 +1,14 @@
#!/usr/bin/env bash
mkdir $HOME/bin
wget 'http://www.us.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar"
## Install tika app
wget 'https://archive.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar"
echo -e '#!/usr/bin/env bash\nexec java -jar $HOME/bin/tika.jar "$@"' >> $HOME/bin/tika
chmod ug+x $HOME/bin/tika
$HOME/bin/tika --version
## Install tika server
wget 'https://archive.apache.org/dist/tika/tika-server-1.7.jar' -O "$HOME/bin/tika-rest-server.jar"
echo -e '#!/usr/bin/env bash\nexec java -jar $HOME/bin/tika-rest-server.jar "$@"' >> $HOME/bin/tika-rest-server
chmod ug+x $HOME/bin/tika-rest-server

115
README.md
View File

@ -36,13 +36,13 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
The recommended installation is through [composer](http://getcomposer.org).
Add the following to your `composer.json`:
```js
{
"require": {
"silverstripe/textextraction": "2.0.x-dev"
}
```js
{
"require": {
"silverstripe/textextraction": "2.0.x-dev"
}
```
}
```
The module depends on the [Guzzle HTTP Library](http://guzzlephp.org),
which is automatically checked out by composer. Alternatively, install Guzzle
@ -57,11 +57,11 @@ No configuration is required for that, unless you want to make
the content available through your `DataObject` subclass.
In this case, add the following to `mysite/_config/config.yml`:
```yaml
File:
extensions:
- FileTextExtractable
```
```yaml
File:
extensions:
- FileTextExtractable
```
### XPDF
@ -69,10 +69,10 @@ PDFs require special handling, for example through the [XPDF](http://www.foolabs
commandline utility. Follow their installation instructions, its presence will be automatically
detected. You can optionally set the binary path in `mysite/_config/config.yml`:
```yml
PDFTextExtractor:
binary_location: /my/path/pdftotext
```
```yml
PDFTextExtractor:
binary_location: /my/path/pdftotext
```
### Apache Solr
@ -86,10 +86,10 @@ in your database driver, or even pass it back to Solr as part of a full index up
In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`):
```yml
SolrCellTextExtractor:
base_url: 'http://localhost:8983/solr/update/extract'
```
```yml
SolrCellTextExtractor:
base_url: 'http://localhost:8983/solr/update/extract'
```
Note that in case you're using multiple cores, you'll need to add the core name to the URL
(e.g. 'http://localhost:8983/solr/PageSolrIndex/update/extract').
@ -103,28 +103,28 @@ returns the contents, either by directly accessing `FileTextExtractable->extract
or by writing your own method around `FileTextExtractor->getContent()` (see "Usage" below).
The property should be listed in your `SolrIndex` subclass, e.g. as follows:
```php
class MyDocument extends DataObject {
static $db = array('Path' => 'Text');
function getContent() {
$extractor = FileTextExtractor::for_file($this->Path);
return $extractor ? $extractor->getContent($this->Path) : null;
}
```php
class MyDocument extends DataObject {
static $db = array('Path' => 'Text');
function getContent() {
$extractor = FileTextExtractor::for_file($this->Path);
return $extractor ? $extractor->getContent($this->Path) : null;
}
class MySolrIndex extends SolrIndex {
function init() {
$this->addClass('MyDocument');
$this->addStoredField('Content', 'HTMLText');
}
}
class MySolrIndex extends SolrIndex {
function init() {
$this->addClass('MyDocument');
$this->addStoredField('Content', 'HTMLText');
}
```
}
```
Note: This isn't a terribly efficient way to process large amounts of files, since
each HTTP request is run synchronously.
### Tika
Support for Apache Tika (1.7 and above) is included for the standalone command line utility.
Support for Apache Tika (1.7 and above) is included. This can be run in one of two ways: Server or CLI.
See [the Apache Tika home page](http://tika.apache.org/1.7/index.html) for instructions on installing and
configuring this.
@ -132,15 +132,54 @@ configuring this.
This extension will best work with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php)
installed to perform mime detection. Tika validates support via mime type rather than file extensions.
### Tika - CLI
Ensure that your machine has a 'tika' command available which will run the CLI script.
```bash
#!/bin/bash
exec java -jar /usr/local/Cellar/tika/1.7/libexec/tika-app-1.7.jar "$@"
```
### Tika Rest Server
Tika can also be run as a server.
You can configure your server endpoint either via the `SS_TIKA_ENDPOINT` define, or by setting the URL via config.
```yaml
TikaServerTextExtractor:
server_endpoint: 'http://localhost:9998'
```
Alternatively this may be specified via the `SS_TIKA_ENDPOINT` directive in your `_ss_environment.php` file.
Then start up your server as shown below:
```bash
java -jar tika-server-1.7.jar --host=localhost --port=9998
```
## Usage
Manual extraction:
$myFile = '/my/path/myfile.pdf';
$extractor = FileTextExtractor::for_file($myFile);
$content = $extractor->getContent($myFile);
```php
$myFile = '/my/path/myfile.pdf';
$extractor = FileTextExtractor::for_file($myFile);
$content = $extractor->getContent($myFile);
```
Extraction with `FileTextExtractable` extension applied:
$myFileObj = File::get()->First();
$content = $myFileObj->extractFileAsText();
```php
$myFileObj = File::get()->First();
$content = $myFileObj->getFileContent();
```
This content can also be embedded directly within a template.
```
$MyFile.FileContent
```

View File

@ -35,11 +35,14 @@ abstract class FileTextExtractor extends Object {
// Generate the sorted list of extractors on demand.
$classes = ClassInfo::subclassesFor("FileTextExtractor");
array_shift($classes);
foreach($classes as $class) $classes[$class] = Config::inst()->get($class, 'priority');
arsort($classes);
$classPriorities = array();
foreach($classes as $class) {
$classPriorities[$class] = Config::inst()->get($class, 'priority');
}
arsort($classPriorities);
// Save classes
$sortedClasses = array_keys($classes);
$sortedClasses = array_keys($classPriorities);
return self::$sorted_extractor_classes = $sortedClasses;
}

View File

@ -0,0 +1,104 @@
<?php
/**
* Enables text extraction of file content via the Tika Rest Server
*
* {@link http://tika.apache.org/1.7/gettingstarted.html}
*/
class TikaServerTextExtractor extends FileTextExtractor {

	/**
	 * Tika server is pretty efficient so use it immediately if available
	 *
	 * @var integer
	 * @config
	 */
	private static $priority = 80;

	/**
	 * Server endpoint, used when neither the SS_TIKA_ENDPOINT constant nor the
	 * SS_TIKA_ENDPOINT environment variable is set.
	 *
	 * @var string
	 * @config
	 */
	private static $server_endpoint;

	/**
	 * Lazily created REST client, see {@link getClient()}
	 *
	 * @var TikaRestClient
	 */
	protected $client = null;

	/**
	 * Cache of supported mime types, filled on the first call to {@link supportsMime()}
	 *
	 * @var array
	 */
	protected $supportedMimes = array();

	/**
	 * Get the REST client for the configured endpoint, creating it through the
	 * Injector on first use and reusing the same instance afterwards.
	 *
	 * @return TikaRestClient
	 */
	public function getClient() {
		if(!$this->client) {
			$this->client = Injector::inst()->createWithArgs(
				'TikaRestClient',
				array($this->getServerEndpoint())
			);
		}
		return $this->client;
	}

	/**
	 * Determine the url of the tika server.
	 *
	 * Lookup order: the SS_TIKA_ENDPOINT constant, then the SS_TIKA_ENDPOINT
	 * environment variable, then the `server_endpoint` config value.
	 *
	 * @return string Endpoint url, or an empty value if none is configured
	 */
	public function getServerEndpoint() {
		if(defined('SS_TIKA_ENDPOINT')) {
			return SS_TIKA_ENDPOINT;
		}

		// Read the environment once rather than once for the test and once for the result
		$environmentEndpoint = getenv('SS_TIKA_ENDPOINT');
		if($environmentEndpoint) return $environmentEndpoint;

		// Default to configured endpoint
		return $this->config()->server_endpoint;
	}

	/**
	 * Get the version of tika reported by the server. This simply delegates to
	 * the client, which yields 0 when the response holds no recognisable version.
	 *
	 * NOTE(review): the client casts the version to a float, so a release such
	 * as "1.10" would be read as 1.1 and fail the ">= 1.7" check in
	 * {@link isAvailable()} - confirm before relying on two-digit minor versions.
	 *
	 * @return float version of tika
	 */
	public function getVersion() {
		return $this->getClient()->getVersion();
	}

	/**
	 * Check that this extractor can be used. The checks short-circuit in order:
	 * an endpoint is configured, the server responds, and the reported version
	 * is at least 1.7.
	 *
	 * @return bool
	 */
	public function isAvailable() {
		return $this->getServerEndpoint() &&
			$this->getClient()->isAvailable() &&
			$this->getVersion() >= 1.7;
	}

	/**
	 * File extensions are never used to decide support for this extractor.
	 *
	 * @param string $extension
	 * @return bool Always false
	 */
	public function supportsExtension($extension) {
		// Determine support via mime type only
		return false;
	}

	/**
	 * Check whether the server lists the given mime type, either as a primary
	 * type (a key of the supported list) or as one of the aliases of an entry.
	 *
	 * @param string $mime
	 * @return bool
	 */
	public function supportsMime($mime) {
		if(!$this->supportedMimes) {
			$this->supportedMimes = $this->getClient()->getSupportedMimes();
		}
		$supported = $this->supportedMimes;

		// Check if supported (most common / quickest lookup)
		if(isset($supported[$mime])) return true;

		// Check aliases; strict comparison so only an exact string match counts
		foreach($supported as $info) {
			if(isset($info['alias']) && in_array($mime, $info['alias'], true)) return true;
		}

		return false;
	}

	/**
	 * Extract the content of a file by passing its path to the REST client.
	 *
	 * @param string $path Path of the file to extract
	 * @return string Text returned by the client
	 */
	public function getContent($path) {
		return $this->getClient()->tika($path);
	}
}

View File

@ -7,15 +7,6 @@
*/
class TikaTextExtractor extends FileTextExtractor {
/**
* Text extraction locale. Use {locale} as a placeholder for the current locale, {default}
* as the placeholder for the default locale
*
* @var string
* @config
*/
private static $locale = '{default}.utf-8';
/**
* Text extraction mode. Defaults to -t (plain text)
*

View File

@ -0,0 +1,82 @@
<?php
use GuzzleHttp\Client;
use GuzzleHttp\Exception\RequestException;
class TikaRestClient extends Client {

	/**
	 * Cache of the mime data returned by the server, see {@link getSupportedMimes()}
	 *
	 * @var array
	 */
	protected $mimes = array();

	/**
	 * Create a client which uses the given url as the 'base_url' of the
	 * underlying Guzzle client.
	 *
	 * @param string $url Url of the tika server
	 */
	public function __construct($url) {
		parent::__construct(array(
			'base_url' => $url
		));
	}

	/**
	 * Detect if the service is available
	 *
	 * Sends a GET request without an explicit url and expects a 200 status.
	 * A RequestException is treated as "not available".
	 *
	 * @return bool
	 */
	public function isAvailable() {
		try {
			// Loose comparison kept on purpose so a numeric-string status still
			// matches - TODO confirm the type returned by the installed Guzzle
			return $this->get()->getStatusCode() == 200;
		} catch (RequestException $ex) {
			return false;
		}
	}

	/**
	 * Get version code
	 *
	 * Requests the 'version' resource and parses "Apache Tika <version>" out
	 * of the response body.
	 *
	 * NOTE(review): the version is cast to a float, so "1.10" becomes 1.1.
	 * Changing that would alter the return type, so it is only flagged here.
	 *
	 * @return float Parsed version, or 0.0 if the request fails, the status is
	 * not 200, or the body holds no recognisable version
	 */
	public function getVersion() {
		try {
			$response = $this->get('version');
		} catch (RequestException $ex) {
			// Same policy as isAvailable(): a failed request means there is no
			// usable version, rather than an error for the caller to handle
			return 0.0;
		}

		if($response->getStatusCode() != 200) return 0.0;

		// Parse output
		$body = $response->getBody()->getContents();
		if(preg_match('/Apache Tika (?<version>[\.\d]+)/', $body, $matches)) {
			return (float)$matches['version'];
		}

		return 0.0;
	}

	/**
	 * Gets supported mime data. May include aliased mime types.
	 *
	 * The decoded JSON of the 'mime-types' resource is cached on this instance,
	 * so the request is only repeated while the cache is still empty.
	 *
	 * @return array
	 */
	public function getSupportedMimes() {
		if($this->mimes) return $this->mimes;

		$response = $this->get(
			'mime-types',
			array(
				'headers' => array("Accept" => "application/json")
			)
		);

		return $this->mimes = $response->json();
	}

	/**
	 * Extract text content from a given file
	 *
	 * The file is read completely into memory and sent as the body of a PUT
	 * request to the 'tika' resource, asking for a text/plain response.
	 *
	 * @param string $file Full filesystem path to a file to post
	 * @return string Content of the file extracted as plain text
	 */
	public function tika($file) {
		$response = $this->put(
			'tika',
			array(
				'body' => file_get_contents($file),
				'headers' => array("Accept" => "text/plain")
			)
		);

		return $response->getBody()->getContents();
	}
}

View File

@ -19,7 +19,7 @@
"php": ">=5.3.2",
"composer/installers": "*",
"silverstripe/framework": "~3.1",
"guzzle/http": "*"
"guzzlehttp/guzzle": "~4.0"
},
"require-dev": {
"phpunit/PHPUnit": "~3.7@stable"

View File

@ -7,7 +7,7 @@ class TikaTextExtractorTest extends SapphireTest {
function testExtraction() {
$extractor = new TikaTextExtractor();
if(!$extractor->isAvailable()) $this->markTestSkipped('tika not available');
if(!$extractor->isAvailable()) $this->markTestSkipped('tika cli not available');
// Check file
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
@ -20,4 +20,19 @@ class TikaTextExtractorTest extends SapphireTest {
$this->assertFalse($extractor->supportsMime('application/not-supported'));
}
}
/**
 * Extracts the pdf fixture through the tika rest server and checks the mime
 * types it reports as supported. Skipped when no server is available.
 */
function testServerExtraction() {
	$serverExtractor = new TikaServerTextExtractor();
	if(!$serverExtractor->isAvailable()) {
		$this->markTestSkipped('tika server not available');
	}

	// Extracted text of the fixture must contain the known sentence
	$fixturePath = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
	$this->assertContains(
		'This is a test file with a link',
		$serverExtractor->getContent($fixturePath)
	);

	// Mime types the server is expected to accept, followed by one it must reject
	foreach(array('application/pdf', 'text/html') as $acceptedMime) {
		$this->assertTrue($serverExtractor->supportsMime($acceptedMime));
	}
	$this->assertFalse($serverExtractor->supportsMime('application/not-supported'));
}
}