mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 11:06:00 +02:00
API Support tika server
This commit is contained in:
parent
23d83b7d01
commit
1ad9e46727
@ -9,8 +9,9 @@ env:
|
||||
- DB=MYSQL CORE_RELEASE=3
|
||||
|
||||
before_script:
|
||||
- mkdir $HOME/bin
|
||||
- mkdir -p $HOME/bin
|
||||
- export PATH=$PATH:$HOME/bin
|
||||
- export SS_TIKA_ENDPOINT="http://localhost:9998/"
|
||||
- ./.travis/install_tika.sh
|
||||
- sudo ./.travis/install_pdftotext.sh
|
||||
- git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support
|
||||
@ -18,4 +19,5 @@ before_script:
|
||||
- cd ~/builds/ss
|
||||
|
||||
script:
|
||||
- ($HOME/bin/tika-rest-server &) &> /dev/null
|
||||
- vendor/bin/phpunit --verbose textextraction/tests/
|
||||
|
@ -1,6 +1,14 @@
|
||||
#!/usr/bin/env bash
|
||||
mkdir $HOME/bin
|
||||
wget 'http://www.us.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar"
|
||||
|
||||
## Install tika app
|
||||
wget 'https://archive.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar"
|
||||
echo -e '#!/usr/bin/env bash\nexec java -jar $HOME/bin/tika.jar "$@"' >> $HOME/bin/tika
|
||||
chmod ug+x $HOME/bin/tika
|
||||
$HOME/bin/tika --version
|
||||
|
||||
|
||||
## Install tika server
|
||||
wget 'https://archive.apache.org/dist/tika/tika-server-1.7.jar' -O "$HOME/bin/tika-rest-server.jar"
|
||||
echo -e '#!/usr/bin/env bash\nexec java -jar $HOME/bin/tika-rest-server.jar "$@"' >> $HOME/bin/tika-rest-server
|
||||
chmod ug+x $HOME/bin/tika-rest-server
|
||||
|
||||
|
89
README.md
89
README.md
@ -36,13 +36,13 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
|
||||
The recommended installation is through [composer](http://getcomposer.org).
|
||||
Add the following to your `composer.json`:
|
||||
|
||||
```js
|
||||
{
|
||||
```js
|
||||
{
|
||||
"require": {
|
||||
"silverstripe/textextraction": "2.0.x-dev"
|
||||
}
|
||||
}
|
||||
```
|
||||
}
|
||||
```
|
||||
|
||||
The module depends on the [Guzzle HTTP Library](http://guzzlephp.org),
|
||||
which is automatically checked out by composer. Alternatively, install Guzzle
|
||||
@ -57,11 +57,11 @@ No configuration is required for that, unless you want to make
|
||||
the content available through your `DataObject` subclass.
|
||||
In this case, add the following to `mysite/_config/config.yml`:
|
||||
|
||||
```yaml
|
||||
File:
|
||||
```yaml
|
||||
File:
|
||||
extensions:
|
||||
- FileTextExtractable
|
||||
```
|
||||
```
|
||||
|
||||
### XPDF
|
||||
|
||||
@ -69,10 +69,10 @@ PDFs require special handling, for example through the [XPDF](http://www.foolabs
|
||||
command-line utility. Follow their installation instructions; its presence will be automatically
|
||||
detected. You can optionally set the binary path in `mysite/_config/config.yml`:
|
||||
|
||||
```yml
|
||||
PDFTextExtractor:
|
||||
```yml
|
||||
PDFTextExtractor:
|
||||
binary_location: /my/path/pdftotext
|
||||
```
|
||||
```
|
||||
|
||||
### Apache Solr
|
||||
|
||||
@ -86,10 +86,10 @@ in your database driver, or even pass it back to Solr as part of a full index up
|
||||
|
||||
In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`):
|
||||
|
||||
```yml
|
||||
SolrCellTextExtractor:
|
||||
```yml
|
||||
SolrCellTextExtractor:
|
||||
base_url: 'http://localhost:8983/solr/update/extract'
|
||||
```
|
||||
```
|
||||
|
||||
Note that in case you're using multiple cores, you'll need to add the core name to the URL
|
||||
(e.g. 'http://localhost:8983/solr/PageSolrIndex/update/extract').
|
||||
@ -103,28 +103,28 @@ returns the contents, either by directly accessing `FileTextExtractable->extract
|
||||
or by writing your own method around `FileTextExtractor->getContent()` (see "Usage" below).
|
||||
The property should be listed in your `SolrIndex` subclass, e.g. as follows:
|
||||
|
||||
```php
|
||||
class MyDocument extends DataObject {
|
||||
```php
|
||||
class MyDocument extends DataObject {
|
||||
static $db = array('Path' => 'Text');
|
||||
function getContent() {
|
||||
$extractor = FileTextExtractor::for_file($this->Path);
|
||||
return $extractor ? $extractor->getContent($this->Path) : null;
|
||||
}
|
||||
}
|
||||
class MySolrIndex extends SolrIndex {
|
||||
}
|
||||
class MySolrIndex extends SolrIndex {
|
||||
function init() {
|
||||
$this->addClass('MyDocument');
|
||||
$this->addStoredField('Content', 'HTMLText');
|
||||
}
|
||||
}
|
||||
```
|
||||
}
|
||||
```
|
||||
|
||||
Note: This isn't a terribly efficient way to process large amounts of files, since
|
||||
each HTTP request is run synchronously.
|
||||
|
||||
### Tika
|
||||
|
||||
Support for Apache Tika (1.7 and above) is included for the standalone command line utility.
|
||||
Support for Apache Tika (1.7 and above) is included. This can be run in one of two ways: Server or CLI.
|
||||
|
||||
See [the Apache Tika home page](http://tika.apache.org/1.7/index.html) for instructions on installing and
|
||||
configuring this.
|
||||
@ -132,15 +132,54 @@ configuring this.
|
||||
This extension will best work with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php)
|
||||
installed to perform mime detection. Tika validates support via mime type rather than file extensions.
|
||||
|
||||
### Tika - CLI
|
||||
|
||||
Ensure that your machine has a 'tika' command available which will run the CLI script.
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
exec java -jar /usr/local/Cellar/tika/1.7/libexec/tika-app-1.7.jar "$@"
|
||||
```
|
||||
|
||||
### Tika Rest Server
|
||||
|
||||
Tika can also be run as a server.
|
||||
|
||||
You can configure your server endpoint either via the `SS_TIKA_ENDPOINT` define or by setting the URL via config.
|
||||
|
||||
```yaml
|
||||
TikaServerTextExtractor:
|
||||
server_endpoint: 'http://localhost:9998'
|
||||
```
|
||||
|
||||
Alternatively this may be specified via the `SS_TIKA_ENDPOINT` directive in your `_ss_environment.php` file.
|
||||
|
||||
|
||||
Then start up your server as shown below:
|
||||
|
||||
```bash
|
||||
java -jar tika-server-1.7.jar --host=localhost --port=9998
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
Manual extraction:
|
||||
|
||||
$myFile = '/my/path/myfile.pdf';
|
||||
$extractor = FileTextExtractor::for_file($myFile);
|
||||
$content = $extractor->getContent($myFile);
|
||||
```php
|
||||
$myFile = '/my/path/myfile.pdf';
|
||||
$extractor = FileTextExtractor::for_file($myFile);
|
||||
$content = $extractor->getContent($myFile);
|
||||
```
|
||||
|
||||
Extraction with `FileTextExtractable` extension applied:
|
||||
|
||||
$myFileObj = File::get()->First();
|
||||
$content = $myFileObj->extractFileAsText();
|
||||
```php
|
||||
$myFileObj = File::get()->First();
|
||||
$content = $myFileObj->getFileContent();
|
||||
```
|
||||
|
||||
This content can also be embedded directly within a template.
|
||||
|
||||
```
|
||||
$MyFile.FileContent
|
||||
```
|
||||
|
@ -35,11 +35,14 @@ abstract class FileTextExtractor extends Object {
|
||||
// Generate the sorted list of extractors on demand.
|
||||
$classes = ClassInfo::subclassesFor("FileTextExtractor");
|
||||
array_shift($classes);
|
||||
foreach($classes as $class) $classes[$class] = Config::inst()->get($class, 'priority');
|
||||
arsort($classes);
|
||||
$classPriorities = array();
|
||||
foreach($classes as $class) {
|
||||
$classPriorities[$class] = Config::inst()->get($class, 'priority');
|
||||
}
|
||||
arsort($classPriorities);
|
||||
|
||||
// Save classes
|
||||
$sortedClasses = array_keys($classes);
|
||||
$sortedClasses = array_keys($classPriorities);
|
||||
return self::$sorted_extractor_classes = $sortedClasses;
|
||||
}
|
||||
|
||||
|
104
code/extractors/TikaServerTextExtractor.php
Normal file
104
code/extractors/TikaServerTextExtractor.php
Normal file
@ -0,0 +1,104 @@
|
||||
<?php
|
||||
|
||||
/**
 * Enables text extraction of file content via the Tika Rest Server.
 *
 * All HTTP traffic is delegated to a {@link TikaRestClient} created through
 * the Injector; this class only decides which endpoint to use, whether the
 * server is usable, and which mime types it can handle.
 *
 * {@link http://tika.apache.org/1.7/gettingstarted.html}
 */
class TikaServerTextExtractor extends FileTextExtractor {

    /**
     * Tika server is pretty efficient so use it immediately if available
     *
     * @var integer
     * @config
     */
    private static $priority = 80;

    /**
     * Server endpoint
     *
     * @var string
     * @config
     */
    private static $server_endpoint;

    /**
     * Lazily created client, see {@link getClient()}
     *
     * @var TikaRestClient
     */
    protected $client = null;

    /**
     * Cache of supported mime types
     *
     * @var array
     */
    protected $supportedMimes = array();

    /**
     * Get the rest client, creating it on first use with the endpoint
     * returned by {@link getServerEndpoint()}.
     *
     * @return TikaRestClient
     */
    public function getClient() {
        if(!$this->client) {
            $this->client = Injector::inst()->createWithArgs(
                'TikaRestClient',
                array($this->getServerEndpoint())
            );
        }

        return $this->client;
    }

    /**
     * Determine the server endpoint to use.
     *
     * Checked in order: the SS_TIKA_ENDPOINT constant, the SS_TIKA_ENDPOINT
     * environment variable, then the `server_endpoint` config value.
     *
     * @return string|null Endpoint url, or an empty value if none is set
     */
    public function getServerEndpoint() {
        if(defined('SS_TIKA_ENDPOINT')) {
            return SS_TIKA_ENDPOINT;
        }

        $fromEnvironment = getenv('SS_TIKA_ENDPOINT');
        if($fromEnvironment) {
            return $fromEnvironment;
        }

        // Default to configured endpoint
        return $this->config()->server_endpoint;
    }

    /**
     * Get the version of tika installed, or 0 if not installed
     *
     * @return float version of tika
     */
    public function getVersion() {
        return $this->getClient()->getVersion();
    }

    /**
     * Check that an endpoint is set, the server responds, and it reports a
     * version of at least 1.7.
     *
     * NOTE(review): the version is compared as a float, and the client casts
     * the reported version string to float, so a release such as "1.10"
     * would be read as 1.1 and rejected here. Consider a string based
     * version_compare() if newer releases need to be supported.
     *
     * @return bool
     */
    public function isAvailable() {
        // Cheapest checks first; the && chain stops before any HTTP request
        // is made when no endpoint is configured.
        return $this->getServerEndpoint()
            && $this->getClient()->isAvailable()
            && $this->getVersion() >= 1.7;
    }

    /**
     * File extensions are never used to claim support.
     *
     * @param string $extension
     * @return bool Always false
     */
    public function supportsExtension($extension) {
        // Determine support via mime type only
        return false;
    }

    /**
     * Check whether the server lists the given mime type, either as a
     * primary type or as an alias of one.
     *
     * @param string $mime
     * @return bool
     */
    public function supportsMime($mime) {
        if(!$this->supportedMimes) {
            $this->supportedMimes = $this->getClient()->getSupportedMimes();
        }
        $supported = $this->supportedMimes;

        // A failed or unparsable response must not be iterated
        if(!is_array($supported)) return false;

        // Check if supported (most common / quickest lookup)
        if(isset($supported[$mime])) return true;

        // Check aliases
        foreach($supported as $info) {
            if(isset($info['alias']) && in_array($mime, (array)$info['alias'], true)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Extract the content of a file by sending it to the server.
     *
     * @param string $path Path of the file, passed unchanged to the client
     * @return string Value returned by the client's tika() call
     */
    public function getContent($path) {
        return $this->getClient()->tika($path);
    }

}
|
@ -7,15 +7,6 @@
|
||||
*/
|
||||
class TikaTextExtractor extends FileTextExtractor {
|
||||
|
||||
/**
|
||||
* Text extraction locale. Use {locale} as a placeholder for the current locale, {default}
|
||||
* as the placeholder for the default locale
|
||||
*
|
||||
* @var string
|
||||
* @config
|
||||
*/
|
||||
private static $locale = '{default}.utf-8';
|
||||
|
||||
/**
|
||||
* Text extraction mode. Defaults to -t (plain text)
|
||||
*
|
||||
|
82
code/tika/TikaRestClient.php
Normal file
82
code/tika/TikaRestClient.php
Normal file
@ -0,0 +1,82 @@
|
||||
<?php
|
||||
|
||||
use GuzzleHttp\Client;
|
||||
use GuzzleHttp\Exception\RequestException;
|
||||
|
||||
/**
 * Thin Guzzle client for the Tika rest server.
 *
 * All request paths used here ('version', 'mime-types', 'tika') are relative
 * to the base url given to the constructor.
 */
class TikaRestClient extends Client {

    /**
     * Cached result of the 'mime-types' request
     *
     * @var array
     */
    protected $mimes = array();

    /**
     * @param string $url Base url of the server
     */
    public function __construct($url) {
        parent::__construct(array(
            'base_url' => $url
        ));
    }

    /**
     * Detect if the service is available
     *
     * @return bool True if a request to the base url answers with status 200,
     * false on any other status or if the request fails
     */
    public function isAvailable() {
        try {
            // Loose comparison retained so a numeric-string status code
            // still matches.
            return $this->get()->getStatusCode() == 200;
        } catch (RequestException $ex) {
            return false;
        }
    }

    /**
     * Get version code
     *
     * NOTE(review): the version string is cast to float, so "1.10" becomes
     * 1.1 and will compare lower than 1.7.
     *
     * @return float Parsed version, or 0.0 if the request fails or the
     * response does not contain a recognisable version
     */
    public function getVersion() {
        try {
            $response = $this->get('version');
        } catch (RequestException $ex) {
            // Treat a failed request as "no version", matching isAvailable()
            return 0.0;
        }

        // Parse output
        if($response->getStatusCode() == 200 &&
            preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody()->getContents(), $matches)
        ) {
            return (float)$matches['version'];
        }

        return 0.0;
    }

    /**
     * Gets supported mime data. May include aliased mime types.
     *
     * The decoded response is cached on the instance once it is non-empty.
     *
     * @return array
     */
    public function getSupportedMimes() {
        if($this->mimes) return $this->mimes;

        $response = $this->get(
            'mime-types',
            array(
                'headers' => array("Accept" => "application/json")
            )
        );

        return $this->mimes = $response->json();
    }

    /**
     * Extract text content from a given file
     *
     * The whole file is read into memory and sent as the body of a PUT
     * request to 'tika', asking for a text/plain response.
     *
     * @param string $file Full filesystem path to a file to post
     * @return string Content of the file extracted as plain text
     */
    public function tika($file) {
        $response = $this->put(
            'tika',
            array(
                'body' => file_get_contents($file),
                'headers' => array("Accept" => "text/plain")
            )
        );

        return $response->getBody()->getContents();
    }

}
|
@ -19,7 +19,7 @@
|
||||
"php": ">=5.3.2",
|
||||
"composer/installers": "*",
|
||||
"silverstripe/framework": "~3.1",
|
||||
"guzzle/http": "*"
|
||||
"guzzlehttp/guzzle": "~4.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpunit/PHPUnit": "~3.7@stable"
|
||||
|
@ -7,7 +7,22 @@ class TikaTextExtractorTest extends SapphireTest {
|
||||
|
||||
function testExtraction() {
|
||||
$extractor = new TikaTextExtractor();
|
||||
if(!$extractor->isAvailable()) $this->markTestSkipped('tika not available');
|
||||
if(!$extractor->isAvailable()) $this->markTestSkipped('tika cli not available');
|
||||
|
||||
// Check file
|
||||
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
|
||||
$content = $extractor->getContent($file);
|
||||
$this->assertContains('This is a test file with a link', $content);
|
||||
|
||||
// Check mime validation
|
||||
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
||||
$this->assertTrue($extractor->supportsMime('text/html'));
|
||||
$this->assertFalse($extractor->supportsMime('application/not-supported'));
|
||||
}
|
||||
|
||||
function testServerExtraction() {
|
||||
$extractor = new TikaServerTextExtractor();
|
||||
if(!$extractor->isAvailable()) $this->markTestSkipped('tika server not available');
|
||||
|
||||
// Check file
|
||||
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
|
||||
|
Loading…
Reference in New Issue
Block a user