mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 11:06:00 +02:00
API Support tika server
This commit is contained in:
parent
23d83b7d01
commit
1ad9e46727
@ -9,8 +9,9 @@ env:
|
|||||||
- DB=MYSQL CORE_RELEASE=3
|
- DB=MYSQL CORE_RELEASE=3
|
||||||
|
|
||||||
before_script:
|
before_script:
|
||||||
- mkdir $HOME/bin
|
- mkdir -p $HOME/bin
|
||||||
- export PATH=$PATH:$HOME/bin
|
- export PATH=$PATH:$HOME/bin
|
||||||
|
- export SS_TIKA_ENDPOINT="http://localhost:9998/"
|
||||||
- ./.travis/install_tika.sh
|
- ./.travis/install_tika.sh
|
||||||
- sudo ./.travis/install_pdftotext.sh
|
- sudo ./.travis/install_pdftotext.sh
|
||||||
- git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support
|
- git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support
|
||||||
@ -18,4 +19,5 @@ before_script:
|
|||||||
- cd ~/builds/ss
|
- cd ~/builds/ss
|
||||||
|
|
||||||
script:
|
script:
|
||||||
|
- ($HOME/bin/tika-rest-server &) &> /dev/null
|
||||||
- vendor/bin/phpunit --verbose textextraction/tests/
|
- vendor/bin/phpunit --verbose textextraction/tests/
|
||||||
|
@ -1,6 +1,14 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
mkdir $HOME/bin
|
|
||||||
wget 'http://www.us.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar"
|
## Install tika app
|
||||||
|
wget 'https://archive.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar"
|
||||||
echo -e '#!/usr/bin/env bash\nexec java -jar $HOME/bin/tika.jar "$@"' >> $HOME/bin/tika
|
echo -e '#!/usr/bin/env bash\nexec java -jar $HOME/bin/tika.jar "$@"' >> $HOME/bin/tika
|
||||||
chmod ug+x $HOME/bin/tika
|
chmod ug+x $HOME/bin/tika
|
||||||
$HOME/bin/tika --version
|
$HOME/bin/tika --version
|
||||||
|
|
||||||
|
|
||||||
|
## Install tika server
# Download the Tika 1.7 REST server jar into the user's bin directory.
wget 'https://archive.apache.org/dist/tika/tika-server-1.7.jar' -O "$HOME/bin/tika-rest-server.jar"
# Write the launcher with '>' (truncate) rather than '>>' (append) so that re-running
# this installer cannot stack a second shebang/exec pair into the same wrapper file.
# The single quotes keep $HOME and "$@" literal, so they are expanded when the wrapper runs.
echo -e '#!/usr/bin/env bash\nexec java -jar $HOME/bin/tika-rest-server.jar "$@"' > "$HOME/bin/tika-rest-server"
chmod ug+x "$HOME/bin/tika-rest-server"
|
||||||
|
|
||||||
|
115
README.md
115
README.md
@ -36,13 +36,13 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
|
|||||||
The recommended installation is through [composer](http://getcomposer.org).
|
The recommended installation is through [composer](http://getcomposer.org).
|
||||||
Add the following to your `composer.json`:
|
Add the following to your `composer.json`:
|
||||||
|
|
||||||
```js
|
```js
|
||||||
{
|
{
|
||||||
"require": {
|
"require": {
|
||||||
"silverstripe/textextraction": "2.0.x-dev"
|
"silverstripe/textextraction": "2.0.x-dev"
|
||||||
}
|
|
||||||
}
|
}
|
||||||
```
|
}
|
||||||
|
```
|
||||||
|
|
||||||
The module depends on the [Guzzle HTTP Library](http://guzzlephp.org),
|
The module depends on the [Guzzle HTTP Library](http://guzzlephp.org),
|
||||||
which is automatically checked out by composer. Alternatively, install Guzzle
|
which is automatically checked out by composer. Alternatively, install Guzzle
|
||||||
@ -57,11 +57,11 @@ No configuration is required for that, unless you want to make
|
|||||||
the content available through your `DataObject` subclass.
|
the content available through your `DataObject` subclass.
|
||||||
In this case, add the following to `mysite/_config/config.yml`:
|
In this case, add the following to `mysite/_config/config.yml`:
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
File:
|
File:
|
||||||
extensions:
|
extensions:
|
||||||
- FileTextExtractable
|
- FileTextExtractable
|
||||||
```
|
```
|
||||||
|
|
||||||
### XPDF
|
### XPDF
|
||||||
|
|
||||||
@ -69,10 +69,10 @@ PDFs require special handling, for example through the [XPDF](http://www.foolabs
|
|||||||
commandline utility. Follow their installation instructions, its presence will be automatically
|
commandline utility. Follow their installation instructions, its presence will be automatically
|
||||||
detected. You can optionally set the binary path in `mysite/_config/config.yml`:
|
detected. You can optionally set the binary path in `mysite/_config/config.yml`:
|
||||||
|
|
||||||
```yml
|
```yml
|
||||||
PDFTextExtractor:
|
PDFTextExtractor:
|
||||||
binary_location: /my/path/pdftotext
|
binary_location: /my/path/pdftotext
|
||||||
```
|
```
|
||||||
|
|
||||||
### Apache Solr
|
### Apache Solr
|
||||||
|
|
||||||
@ -86,10 +86,10 @@ in your database driver, or even pass it back to Solr as part of a full index up
|
|||||||
|
|
||||||
In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`):
|
In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`):
|
||||||
|
|
||||||
```yml
|
```yml
|
||||||
SolrCellTextExtractor:
|
SolrCellTextExtractor:
|
||||||
base_url: 'http://localhost:8983/solr/update/extract'
|
base_url: 'http://localhost:8983/solr/update/extract'
|
||||||
```
|
```
|
||||||
|
|
||||||
Note that in case you're using multiple cores, you'll need to add the core name to the URL
|
Note that in case you're using multiple cores, you'll need to add the core name to the URL
|
||||||
(e.g. 'http://localhost:8983/solr/PageSolrIndex/update/extract').
|
(e.g. 'http://localhost:8983/solr/PageSolrIndex/update/extract').
|
||||||
@ -103,28 +103,28 @@ returns the contents, either by directly accessing `FileTextExtractable->extract
|
|||||||
or by writing your own method around `FileTextExtractor->getContent()` (see "Usage" below).
|
or by writing your own method around `FileTextExtractor->getContent()` (see "Usage" below).
|
||||||
The property should be listed in your `SolrIndex` subclass, e.g. as follows:
|
The property should be listed in your `SolrIndex` subclass, e.g. as follows:
|
||||||
|
|
||||||
```php
|
```php
|
||||||
class MyDocument extends DataObject {
|
class MyDocument extends DataObject {
|
||||||
static $db = array('Path' => 'Text');
|
static $db = array('Path' => 'Text');
|
||||||
function getContent() {
|
function getContent() {
|
||||||
$extractor = FileTextExtractor::for_file($this->Path);
|
$extractor = FileTextExtractor::for_file($this->Path);
|
||||||
return $extractor ? $extractor->getContent($this->Path) : null;
|
return $extractor ? $extractor->getContent($this->Path) : null;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
class MySolrIndex extends SolrIndex {
|
}
|
||||||
function init() {
|
class MySolrIndex extends SolrIndex {
|
||||||
$this->addClass('MyDocument');
|
function init() {
|
||||||
$this->addStoredField('Content', 'HTMLText');
|
$this->addClass('MyDocument');
|
||||||
}
|
$this->addStoredField('Content', 'HTMLText');
|
||||||
}
|
}
|
||||||
```
|
}
|
||||||
|
```
|
||||||
|
|
||||||
Note: This isn't a terribly efficient way to process large amounts of files, since
|
Note: This isn't a terribly efficient way to process large amounts of files, since
|
||||||
each HTTP request is run synchronously.
|
each HTTP request is run synchronously.
|
||||||
|
|
||||||
### Tika
|
### Tika
|
||||||
|
|
||||||
Support for Apache Tika (1.7 and above) is included for the standalone command line utility.
|
Support for Apache Tika (1.7 and above) is included. This can be run in one of two ways: Server or CLI.
|
||||||
|
|
||||||
See [the Apache Tika home page](http://tika.apache.org/1.7/index.html) for instructions on installing and
|
See [the Apache Tika home page](http://tika.apache.org/1.7/index.html) for instructions on installing and
|
||||||
configuring this.
|
configuring this.
|
||||||
@ -132,15 +132,54 @@ configuring this.
|
|||||||
This extension will best work with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php)
|
This extension will best work with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php)
|
||||||
installed to perform mime detection. Tika validates support via mime type rather than file extensions.
|
installed to perform mime detection. Tika validates support via mime type rather than file extensions.
|
||||||
|
|
||||||
|
### Tika - CLI
|
||||||
|
|
||||||
|
Ensure that your machine has a 'tika' command available which will run the CLI script.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
exec java -jar /usr/local/Cellar/tika/1.7/libexec/tika-app-1.7.jar "$@"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Tika Rest Server
|
||||||
|
|
||||||
|
Tika can also be run as a server.
|
||||||
|
|
||||||
|
You can configure your server endpoint either via the `SS_TIKA_ENDPOINT` define, or by setting the URL via config.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
TikaServerTextExtractor:
|
||||||
|
server_endpoint: 'http://localhost:9998'
|
||||||
|
```
|
||||||
|
|
||||||
|
Alternatively this may be specified via the `SS_TIKA_ENDPOINT` directive in your `_ss_environment.php` file.
|
||||||
|
|
||||||
|
|
||||||
|
Then start up your server as shown below:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
java -jar tika-server-1.7.jar --host=localhost --port=9998
|
||||||
|
```
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
Manual extraction:
|
Manual extraction:
|
||||||
|
|
||||||
$myFile = '/my/path/myfile.pdf';
|
```php
|
||||||
$extractor = FileTextExtractor::for_file($myFile);
|
$myFile = '/my/path/myfile.pdf';
|
||||||
$content = $extractor->getContent($myFile);
|
$extractor = FileTextExtractor::for_file($myFile);
|
||||||
|
$content = $extractor->getContent($myFile);
|
||||||
|
```
|
||||||
|
|
||||||
Extraction with `FileTextExtractable` extension applied:
|
Extraction with `FileTextExtractable` extension applied:
|
||||||
|
|
||||||
$myFileObj = File::get()->First();
|
```php
|
||||||
$content = $myFileObj->extractFileAsText();
|
$myFileObj = File::get()->First();
|
||||||
|
$content = $myFileObj->getFileContent();
|
||||||
|
```
|
||||||
|
|
||||||
|
This content can also be embedded directly within a template.
|
||||||
|
|
||||||
|
```
|
||||||
|
$MyFile.FileContent
|
||||||
|
```
|
||||||
|
@ -35,11 +35,14 @@ abstract class FileTextExtractor extends Object {
|
|||||||
// Generate the sorted list of extractors on demand.
|
// Generate the sorted list of extractors on demand.
|
||||||
$classes = ClassInfo::subclassesFor("FileTextExtractor");
|
$classes = ClassInfo::subclassesFor("FileTextExtractor");
|
||||||
array_shift($classes);
|
array_shift($classes);
|
||||||
foreach($classes as $class) $classes[$class] = Config::inst()->get($class, 'priority');
|
$classPriorities = array();
|
||||||
arsort($classes);
|
foreach($classes as $class) {
|
||||||
|
$classPriorities[$class] = Config::inst()->get($class, 'priority');
|
||||||
|
}
|
||||||
|
arsort($classPriorities);
|
||||||
|
|
||||||
// Save classes
|
// Save classes
|
||||||
$sortedClasses = array_keys($classes);
|
$sortedClasses = array_keys($classPriorities);
|
||||||
return self::$sorted_extractor_classes = $sortedClasses;
|
return self::$sorted_extractor_classes = $sortedClasses;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
104
code/extractors/TikaServerTextExtractor.php
Normal file
104
code/extractors/TikaServerTextExtractor.php
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enables text extraction of file content via the Tika Rest Server
|
||||||
|
*
|
||||||
|
* {@link http://tika.apache.org/1.7/gettingstarted.html}
|
||||||
|
*/
|
||||||
|
class TikaServerTextExtractor extends FileTextExtractor {

    /**
     * Tika server is pretty efficient so use it immediately if available
     *
     * @var integer
     * @config
     */
    private static $priority = 80;

    /**
     * Server endpoint, used when neither the SS_TIKA_ENDPOINT constant nor the
     * SS_TIKA_ENDPOINT environment variable is set (see getServerEndpoint()).
     *
     * @var string
     * @config
     */
    private static $server_endpoint;

    /**
     * Lazily created REST client (see getClient()).
     *
     * @var TikaRestClient
     */
    protected $client = null;

    /**
     * Returns the REST client, creating it through the Injector on first use
     * with the resolved server endpoint as its only constructor argument.
     *
     * @return TikaRestClient
     */
    public function getClient() {
        if(!$this->client) {
            $this->client = Injector::inst()->createWithArgs(
                'TikaRestClient',
                array($this->getServerEndpoint())
            );
        }
        return $this->client;
    }

    /**
     * Resolves the server endpoint. Lookup order:
     *  1. the SS_TIKA_ENDPOINT constant, if defined
     *  2. the SS_TIKA_ENDPOINT environment variable, if non-empty
     *  3. the `server_endpoint` config value of this class
     *
     * @return string|null Endpoint URL; whatever the config holds when nothing else is set
     */
    public function getServerEndpoint() {
        if(defined('SS_TIKA_ENDPOINT')) {
            return SS_TIKA_ENDPOINT;
        }

        // Read the environment once instead of calling getenv() twice
        $fromEnvironment = getenv('SS_TIKA_ENDPOINT');
        if($fromEnvironment) {
            return $fromEnvironment;
        }

        // Default to configured endpoint
        return $this->config()->server_endpoint;
    }

    /**
     * Get the version of tika reported by the client
     *
     * @return float version of tika
     */
    public function getVersion() {
        return $this->getClient()->getVersion();
    }

    /**
     * The extractor is usable when an endpoint is configured, the client reports
     * the server as available, and the reported version is at least 1.7.
     *
     * NOTE(review): the version is compared as a float. If the client derives it
     * with a (float) cast of the version string, "1.10" becomes 1.1 and would be
     * rejected here - confirm and consider a version_compare() based check.
     *
     * @return bool
     */
    public function isAvailable() {
        return $this->getServerEndpoint() &&
            $this->getClient()->isAvailable() &&
            $this->getVersion() >= 1.7;
    }

    /**
     * Always false: support is decided by mime type only (see supportsMime()).
     *
     * @param string $extension
     * @return bool
     */
    public function supportsExtension($extension) {
        // Determine support via mime type only
        return false;
    }

    /**
     * Cache of supported mime types
     *
     * @var array
     */
    protected $supportedMimes = array();

    /**
     * Checks whether the server lists the given mime type, either as a key of the
     * supported-mimes data or inside any entry's `alias` list.
     *
     * @param string $mime
     * @return bool
     */
    public function supportsMime($mime) {
        // Fetch from the client only while the local cache is still empty
        if(!$this->supportedMimes) {
            $this->supportedMimes = $this->getClient()->getSupportedMimes();
        }
        $supported = $this->supportedMimes;

        // Nothing usable came back from the client: treat as "not supported"
        // rather than iterating over a non-array value
        if(!is_array($supported)) {
            return false;
        }

        // Check if supported (most common / quickest lookup)
        if(isset($supported[$mime])) {
            return true;
        }

        // Check aliases
        foreach($supported as $info) {
            if(!is_array($info) || !isset($info['alias'])) {
                continue;
            }
            // Cast so a single scalar alias is handled like a one-element list;
            // strict comparison avoids loose-equality matches between unrelated values
            if(in_array($mime, (array)$info['alias'], true)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Extracts the text of the file at $path by handing it to the client's tika() call.
     *
     * @param string $path
     * @return string
     */
    public function getContent($path) {
        return $this->getClient()->tika($path);
    }

}
|
@ -7,15 +7,6 @@
|
|||||||
*/
|
*/
|
||||||
class TikaTextExtractor extends FileTextExtractor {
|
class TikaTextExtractor extends FileTextExtractor {
|
||||||
|
|
||||||
/**
|
|
||||||
* Text extraction locale. Use {locale} as a placeholder for the current locale, {default}
|
|
||||||
* as the placeholder for the default locale
|
|
||||||
*
|
|
||||||
* @var string
|
|
||||||
* @config
|
|
||||||
*/
|
|
||||||
private static $locale = '{default}.utf-8';
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Text extraction mode. Defaults to -t (plain text)
|
* Text extraction mode. Defaults to -t (plain text)
|
||||||
*
|
*
|
||||||
|
82
code/tika/TikaRestClient.php
Normal file
82
code/tika/TikaRestClient.php
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
use GuzzleHttp\Client;
|
||||||
|
use GuzzleHttp\Exception\RequestException;
|
||||||
|
|
||||||
|
/**
 * Guzzle client preconfigured with the base URL of a Tika REST server.
 */
class TikaRestClient extends Client {

    /**
     * @param string $url Base URL of the server; handed to the parent client as `base_url`
     */
    public function __construct($url) {
        parent::__construct(array(
            'base_url' => $url
        ));
    }

    /**
     * Detect if the service is available
     *
     * Issues a GET against the base URL; a request failure counts as "not available".
     *
     * @return bool
     */
    public function isAvailable() {
        try {
            // Loose comparison kept from the original.
            // NOTE(review): confirm whether getStatusCode() yields int or string
            // in the installed Guzzle release before tightening to ===.
            return $this->get()->getStatusCode() == 200;
        } catch (RequestException $ex) {
            return false;
        }
    }

    /**
     * Get version code
     *
     * Requests `version` and parses the number following "Apache Tika" from the body.
     * Returns 0.0 when the request fails, the status is not 200, or the body does
     * not match, so callers can treat 0.0 as "no usable server".
     *
     * NOTE(review): the (float) cast cannot distinguish "1.1" from "1.10"; callers
     * comparing against a minimum version should be aware of this.
     *
     * @return float
     */
    public function getVersion() {
        try {
            $response = $this->get('version');
        } catch (RequestException $ex) {
            // Same failure handling as isAvailable(): an unreachable server is not an error here
            return 0.0;
        }

        // Parse output
        if($response->getStatusCode() == 200 &&
            preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody()->getContents(), $matches)
        ) {
            return (float)$matches['version'];
        }

        return 0.0;
    }

    /**
     * Cache of the decoded `mime-types` response (see getSupportedMimes()).
     *
     * @var array
     */
    protected $mimes = array();

    /**
     * Gets supported mime data. May include aliased mime types.
     *
     * The decoded JSON response is cached on the instance; the request is only
     * repeated while the cache is empty.
     *
     * @return array
     */
    public function getSupportedMimes() {
        if($this->mimes) {
            return $this->mimes;
        }

        $response = $this->get(
            'mime-types',
            array(
                'headers' => array("Accept" => "application/json")
            )
        );
        return $this->mimes = $response->json();
    }

    /**
     * Extract text content from a given file
     *
     * @param string $file Full filesystem path to a file to post
     * @return string Content of the file extracted as plain text
     * @throws InvalidArgumentException If the file cannot be read
     */
    public function tika($file) {
        // Fail with a clear message instead of sending `false` as the request body
        $body = file_get_contents($file);
        if($body === false) {
            throw new InvalidArgumentException(
                sprintf('Unable to read file "%s" for text extraction', $file)
            );
        }

        $response = $this->put(
            'tika',
            array(
                'body' => $body,
                'headers' => array("Accept" => "text/plain")
            )
        );
        return $response->getBody()->getContents();
    }

}
|
@ -19,7 +19,7 @@
|
|||||||
"php": ">=5.3.2",
|
"php": ">=5.3.2",
|
||||||
"composer/installers": "*",
|
"composer/installers": "*",
|
||||||
"silverstripe/framework": "~3.1",
|
"silverstripe/framework": "~3.1",
|
||||||
"guzzle/http": "*"
|
"guzzlehttp/guzzle": "~4.0"
|
||||||
},
|
},
|
||||||
"require-dev": {
|
"require-dev": {
|
||||||
"phpunit/PHPUnit": "~3.7@stable"
|
"phpunit/PHPUnit": "~3.7@stable"
|
||||||
|
@ -7,7 +7,7 @@ class TikaTextExtractorTest extends SapphireTest {
|
|||||||
|
|
||||||
function testExtraction() {
|
function testExtraction() {
|
||||||
$extractor = new TikaTextExtractor();
|
$extractor = new TikaTextExtractor();
|
||||||
if(!$extractor->isAvailable()) $this->markTestSkipped('tika not available');
|
if(!$extractor->isAvailable()) $this->markTestSkipped('tika cli not available');
|
||||||
|
|
||||||
// Check file
|
// Check file
|
||||||
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
|
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
|
||||||
@ -20,4 +20,19 @@ class TikaTextExtractorTest extends SapphireTest {
|
|||||||
$this->assertFalse($extractor->supportsMime('application/not-supported'));
|
$this->assertFalse($extractor->supportsMime('application/not-supported'));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
function testServerExtraction() {
|
||||||
|
$extractor = new TikaServerTextExtractor();
|
||||||
|
if(!$extractor->isAvailable()) $this->markTestSkipped('tika server not available');
|
||||||
|
|
||||||
|
// Check file
|
||||||
|
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
|
||||||
|
$content = $extractor->getContent($file);
|
||||||
|
$this->assertContains('This is a test file with a link', $content);
|
||||||
|
|
||||||
|
// Check mime validation
|
||||||
|
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
||||||
|
$this->assertTrue($extractor->supportsMime('text/html'));
|
||||||
|
$this->assertFalse($extractor->supportsMime('application/not-supported'));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user