API Support tika server

This commit is contained in:
Damian Mooyman 2015-02-25 14:44:03 +13:00
parent 23d83b7d01
commit 1ad9e46727
9 changed files with 300 additions and 56 deletions

View File

@ -9,8 +9,9 @@ env:
- DB=MYSQL CORE_RELEASE=3
before_script:
- mkdir $HOME/bin
- mkdir -p $HOME/bin
- export PATH=$PATH:$HOME/bin
- export SS_TIKA_ENDPOINT="http://localhost:9998/"
- ./.travis/install_tika.sh
- sudo ./.travis/install_pdftotext.sh
- git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support
@ -18,4 +19,5 @@ before_script:
- cd ~/builds/ss
script:
- ($HOME/bin/tika-rest-server &) &> /dev/null
- vendor/bin/phpunit --verbose textextraction/tests/

View File

@ -1,6 +1,14 @@
#!/usr/bin/env bash
# Downloads the Tika 1.7 app and server jars into $HOME/bin and generates
# 'tika' and 'tika-rest-server' wrapper scripts that launch them with java.
mkdir $HOME/bin
wget 'http://www.us.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar"
## Install tika app
wget 'https://archive.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar"
# Generate the wrapper script. The single quotes keep $HOME and "$@" literal,
# so they are only expanded when the wrapper itself is executed.
# NOTE(review): '>>' appends, so running this script a second time leaves two
# copies of the wrapper body in the file - confirm whether '>' was intended.
echo -e '#!/usr/bin/env bash\nexec java -jar $HOME/bin/tika.jar "$@"' >> $HOME/bin/tika
chmod ug+x $HOME/bin/tika
# Smoke-test the freshly installed wrapper
$HOME/bin/tika --version
## Install tika server
wget 'https://archive.apache.org/dist/tika/tika-server-1.7.jar' -O "$HOME/bin/tika-rest-server.jar"
# Same wrapper approach as for the app jar (also written with '>>')
echo -e '#!/usr/bin/env bash\nexec java -jar $HOME/bin/tika-rest-server.jar "$@"' >> $HOME/bin/tika-rest-server
chmod ug+x $HOME/bin/tika-rest-server

View File

@ -124,7 +124,7 @@ each HTTP request is run synchronously.
### Tika
Support for Apache Tika (1.7 and above) is included for the standalone command line utility.
Support for Apache Tika (1.7 and above) is included. This can be run in one of two ways: Server or CLI.
See [the Apache Tika home page](http://tika.apache.org/1.7/index.html) for instructions on installing and
configuring this.
@ -132,15 +132,54 @@ configuring this.
This extension works best with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php)
installed to perform mime detection. Tika validates support via mime type rather than file extensions.
### Tika - CLI
Ensure that your machine has a 'tika' command available which will run the CLI script.
```bash
#!/bin/bash
exec java -jar /usr/local/Cellar/tika/1.7/libexec/tika-app-1.7.jar "$@"
```
### Tika Rest Server
Tika can also be run as a server.
You can configure your server endpoint either via the SS_TIKA_ENDPOINT define, or by setting the url via config.
```yaml
TikaServerTextExtractor:
server_endpoint: 'http://localhost:9998'
```
Alternatively this may be specified via the `SS_TIKA_ENDPOINT` directive in your `_ss_environment.php` file.
Then start up your server as below:
```bash
java -jar tika-server-1.7.jar --host=localhost --port=9998
```
## Usage
Manual extraction:
```php
$myFile = '/my/path/myfile.pdf';
$extractor = FileTextExtractor::for_file($myFile);
$content = $extractor->getContent($myFile);
```
Extraction with `FileTextExtractable` extension applied:
```php
$myFileObj = File::get()->First();
$content = $myFileObj->extractFileAsText();
$content = $myFileObj->getFileContent();
```
This content can also be embedded directly within a template.
```
$MyFile.FileContent
```

View File

@ -35,11 +35,14 @@ abstract class FileTextExtractor extends Object {
// Generate the sorted list of extractors on demand.
$classes = ClassInfo::subclassesFor("FileTextExtractor");
array_shift($classes);
foreach($classes as $class) $classes[$class] = Config::inst()->get($class, 'priority');
arsort($classes);
$classPriorities = array();
foreach($classes as $class) {
$classPriorities[$class] = Config::inst()->get($class, 'priority');
}
arsort($classPriorities);
// Save classes
$sortedClasses = array_keys($classes);
$sortedClasses = array_keys($classPriorities);
return self::$sorted_extractor_classes = $sortedClasses;
}

View File

@ -0,0 +1,104 @@
<?php
/**
 * Text extractor which delegates extraction to a running Tika Rest Server
 *
 * {@link http://tika.apache.org/1.7/gettingstarted.html}
 */
class TikaServerTextExtractor extends FileTextExtractor {

	/**
	 * Tika server is pretty efficient so use it immediately if available
	 *
	 * @var integer
	 * @config
	 */
	private static $priority = 80;

	/**
	 * Url of the tika server. Only consulted when neither the SS_TIKA_ENDPOINT
	 * constant nor the SS_TIKA_ENDPOINT environment variable is set.
	 *
	 * @var string
	 * @config
	 */
	private static $server_endpoint;

	/**
	 * Lazily created rest client
	 *
	 * @var TikaRestClient
	 */
	protected $client = null;

	/**
	 * Mime type data reported by the server, kept after the first non-empty lookup
	 *
	 * @var array
	 */
	protected $supportedMimes = array();

	/**
	 * Gets the rest client, building it through the Injector on first use
	 *
	 * @return TikaRestClient
	 */
	public function getClient() {
		if(!$this->client) {
			$this->client = Injector::inst()->createWithArgs(
				'TikaRestClient',
				array($this->getServerEndpoint())
			);
		}
		return $this->client;
	}

	/**
	 * Resolves the server url. Lookup order: SS_TIKA_ENDPOINT constant,
	 * SS_TIKA_ENDPOINT environment variable, then the server_endpoint config.
	 *
	 * @return string
	 */
	public function getServerEndpoint() {
		// A define takes precedence over everything else
		if(defined('SS_TIKA_ENDPOINT')) return SS_TIKA_ENDPOINT;

		// Next preference is the process environment
		$environmentEndpoint = getenv('SS_TIKA_ENDPOINT');
		if($environmentEndpoint) return $environmentEndpoint;

		// Default to configured endpoint
		return $this->config()->server_endpoint;
	}

	/**
	 * Get the version of tika as reported by the rest client
	 *
	 * @return float version of tika
	 */
	public function getVersion() {
		$client = $this->getClient();
		return $client->getVersion();
	}

	/**
	 * Usable only if an endpoint is known, the client reports the server as
	 * available, and the reported version is at least 1.7. Checks run in that
	 * order and stop at the first failure.
	 *
	 * @return bool
	 */
	public function isAvailable() {
		if(!$this->getServerEndpoint()) return false;
		if(!$this->getClient()->isAvailable()) return false;
		return $this->getVersion() >= 1.7;
	}

	/**
	 * File extensions are never used to decide support
	 *
	 * @param string $extension
	 * @return bool Always false
	 */
	public function supportsExtension($extension) {
		// Determine support via mime type only
		return false;
	}

	/**
	 * Checks the given mime type against the types (and their aliases) reported by the server
	 *
	 * @param string $mime
	 * @return bool
	 */
	public function supportsMime($mime) {
		// Populate the cache on demand
		if(!$this->supportedMimes) {
			$this->supportedMimes = $this->getClient()->getSupportedMimes();
		}
		$mimeData = $this->supportedMimes;

		// Direct key hit (most common / quickest lookup)
		if(isset($mimeData[$mime])) return true;

		// Otherwise scan each entry's alias list
		foreach($mimeData as $details) {
			if(isset($details['alias']) && in_array($mime, $details['alias'])) return true;
		}
		return false;
	}

	/**
	 * Extracts the text of the file at the given path via the rest client
	 *
	 * @param string $path
	 * @return string
	 */
	public function getContent($path) {
		return $this->getClient()->tika($path);
	}
}

View File

@ -7,15 +7,6 @@
*/
class TikaTextExtractor extends FileTextExtractor {
/**
* Text extraction locale. Use {locale} as a placeholder for the current locale, {default}
* as the placeholder for the default locale
*
* @var string
* @config
*/
private static $locale = '{default}.utf-8';
/**
* Text extraction mode. Defaults to -t (plain text)
*

View File

@ -0,0 +1,82 @@
<?php
use GuzzleHttp\Client;
use GuzzleHttp\Exception\RequestException;
/**
 * Thin Guzzle client for the endpoints of a Tika Rest Server used by text extraction
 */
class TikaRestClient extends Client {

	/**
	 * Cached result of the 'mime-types' lookup
	 *
	 * @var array
	 */
	protected $mimes = array();

	/**
	 * @param string $url Base url of the tika server, handed to the parent client as 'base_url'
	 */
	public function __construct($url) {
		parent::__construct(array(
			'base_url' => $url
		));
	}

	/**
	 * Detect if the service is available
	 *
	 * @return bool True if a GET against the base url answers with status 200;
	 * false on any other status or if the request raises a RequestException
	 */
	public function isAvailable() {
		try {
			return $this
				->get()
				->getStatusCode() == 200;
		} catch (RequestException $ex) {
			return false;
		}
	}

	/**
	 * Get version code
	 *
	 * @return float Version parsed from the 'version' endpoint, or 0.0 if the request
	 * fails or the response does not contain a recognisable version
	 */
	public function getVersion() {
		try {
			$response = $this->get('version');
		} catch (RequestException $ex) {
			// A failed request means no usable server; report "no version"
			// rather than letting the exception escape into availability checks
			return 0.0;
		}

		// Parse output
		if($response->getStatusCode() == 200 &&
			preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody()->getContents(), $matches)
		) {
			// NOTE(review): the float cast collapses a version such as "1.10" to 1.1.
			// Fixing that requires changing the return contract together with the
			// callers that compare this value numerically.
			return (float)$matches['version'];
		}

		return 0.0;
	}

	/**
	 * Gets supported mime data. May include aliased mime types.
	 *
	 * @return array
	 */
	public function getSupportedMimes() {
		if($this->mimes) return $this->mimes;

		$response = $this->get(
			'mime-types',
			array(
				'headers' => array("Accept" => "application/json")
			)
		);

		// Guarantee the documented array return even if the body decodes to a non-array
		$decoded = $response->json();
		return $this->mimes = is_array($decoded) ? $decoded : array();
	}

	/**
	 * Extract text content from a given file
	 *
	 * @param string $file Full filesystem path to a file to post
	 * @return string Content of the file extracted as plain text
	 */
	public function tika($file) {
		// The whole file is read into memory and sent as the PUT body
		$response = $this->put(
			'tika',
			array(
				'body' => file_get_contents($file),
				'headers' => array("Accept" => "text/plain")
			)
		);
		return $response->getBody()->getContents();
	}
}

View File

@ -19,7 +19,7 @@
"php": ">=5.3.2",
"composer/installers": "*",
"silverstripe/framework": "~3.1",
"guzzle/http": "*"
"guzzlehttp/guzzle": "~4.0"
},
"require-dev": {
"phpunit/PHPUnit": "~3.7@stable"

View File

@ -7,7 +7,22 @@ class TikaTextExtractorTest extends SapphireTest {
/**
 * Runs TikaTextExtractor against a fixture PDF and checks both the extracted
 * text and the extractor's mime type support. The test is marked as skipped
 * when the extractor reports that it is not available.
 */
function testExtraction() {
$extractor = new TikaTextExtractor();
// Skip rather than fail when the extractor reports it is unavailable
if(!$extractor->isAvailable()) $this->markTestSkipped('tika not available');
if(!$extractor->isAvailable()) $this->markTestSkipped('tika cli not available');
// Check file
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
$content = $extractor->getContent($file);
$this->assertContains('This is a test file with a link', $content);
// Check mime validation
$this->assertTrue($extractor->supportsMime('application/pdf'));
$this->assertTrue($extractor->supportsMime('text/html'));
$this->assertFalse($extractor->supportsMime('application/not-supported'));
}
function testServerExtraction() {
$extractor = new TikaServerTextExtractor();
if(!$extractor->isAvailable()) $this->markTestSkipped('tika server not available');
// Check file
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';