Merge pull request #4 from tractorcow/pulls/tika-support

API Implement Tika support
This commit is contained in:
Ingo Schommer 2015-02-23 10:52:34 +13:00
commit 23d83b7d01
14 changed files with 400 additions and 83 deletions

View File

@ -2,17 +2,20 @@
language: php
php:
- 5.3
- 5.4
env:
- DB=MYSQL CORE_RELEASE=3.0
- DB=MYSQL CORE_RELEASE=3.1
- DB=PGSQL CORE_RELEASE=master
- DB=MYSQL CORE_RELEASE=3
before_script:
- mkdir $HOME/bin
- export PATH=$PATH:$HOME/bin
- ./.travis/install_tika.sh
- sudo ./.travis/install_pdftotext.sh
- git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support
- php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss
- cd ~/builds/ss
script:
- phpunit textextraction/tests/
- vendor/bin/phpunit --verbose textextraction/tests/

3
.travis/install_pdftotext.sh Executable file
View File

@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Installs the xpdf package, which the test build relies on for the
# `pdftotext` utility used by PDFTextExtractor.
# Must run with root privileges (the Travis config invokes it through sudo).

# Refresh the package index so the install below resolves current versions
apt-get update
# -y answers the confirmation prompt automatically for unattended CI runs
apt-get install -y xpdf

6
.travis/install_tika.sh Executable file
View File

@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Installs the Apache Tika 1.7 command line app into $HOME/bin and exposes it
# as a `tika` wrapper command, then prints the version as a smoke test.

# Stop at the first failing step instead of carrying on with a broken install
set -e

# -p: the directory may already exist (the Travis config creates it first)
mkdir -p "$HOME/bin"

# NOTE(review): Apache mirrors usually drop superseded releases; if this URL
# stops resolving, archive.apache.org is presumably the durable location.
wget 'http://www.us.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar"

# Write (not append) the wrapper so re-running the script stays idempotent.
# The quoted heredoc keeps $HOME and "$@" literal; they expand when the
# wrapper itself is executed.
cat > "$HOME/bin/tika" <<'EOF'
#!/usr/bin/env bash
exec java -jar "$HOME/bin/tika.jar" "$@"
EOF
chmod ug+x "$HOME/bin/tika"

# Smoke test: fails the script if java or the jar is unusable
"$HOME/bin/tika" --version

View File

@ -18,6 +18,7 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
* SilverStripe 3.1
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
* (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
* (optional) [Apache Tika](http://tika.apache.org/)
### Supported Formats
@ -28,6 +29,7 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
* CSV (Solr)
* RTF (Solr)
* EPub (Solr)
* Many others (Tika)
## Installation
@ -37,7 +39,7 @@ Add the following to your `composer.json`:
```js
{
"require": {
"silverstripe/textextraction": "*"
"silverstripe/textextraction": "2.0.x-dev"
}
}
```
@ -53,9 +55,13 @@ through PEAR and ensure its in your `include_path`.
By default, only extraction from HTML documents is supported.
No configuration is required for that, unless you want to make
the content available through your `DataObject` subclass.
In this case, add the following to `mysite/_config.php`:
In this case, add the following to `mysite/_config/config.yml`:
DataObject::add_extension('File', 'FileTextExtractable');
```yaml
File:
extensions:
- FileTextExtractable
```
### XPDF
@ -108,7 +114,7 @@ The property should be listed in your `SolrIndex` subclass, e.g. as follows:
class MySolrIndex extends SolrIndex {
function init() {
$this->addClass('MyDocument');
$this->addFulltextField('Content', 'HTMLText');
$this->addStoredField('Content', 'HTMLText');
}
}
```
@ -116,6 +122,16 @@ The property should be listed in your `SolrIndex` subclass, e.g. as follows:
Note: This isn't a terribly efficient way to process large amounts of files, since
each HTTP request is run synchronously.
### Tika
Support for Apache Tika (1.7 and above) is included for the standalone command line utility.
See [the Apache Tika home page](http://tika.apache.org/1.7/index.html) for instructions on installing and
configuring this.
This extension works best with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php)
installed, which is used to perform mime detection. Tika determines support by mime type rather than by file extension.
## Usage
Manual extraction:

View File

@ -15,16 +15,29 @@ class FileTextExtractable extends DataExtension {
'FileContentCache' => 'Text'
);
private static $casting = array(
'FileContent' => 'Text'
);
/**
* Helper function for template
*
* @return string
*/
public function getFileContent() {
    // Template accessor: delegates to the extraction routine with its defaults
    $text = $this->extractFileAsText();
    return $text;
}
/**
* Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text.
* The value is also cached into the File record itself.
*
* @param $forceParse If false, the file content is only parsed on demand. If true, the content parsing is forced, bypassing the
* cached version
* @return String
* @param boolean $disableCache If false, the file content is only parsed on demand.
* If true, the content parsing is forced, bypassing the cached version
* @return string
*/
function extractFileAsText($forceParse = false) {
if (!$forceParse && $this->owner->FileContentCache) return $this->owner->FileContentCache;
public function extractFileAsText($disableCache = false) {
if (!$disableCache && $this->owner->FileContentCache) return $this->owner->FileContentCache;
// Determine which extractor can process this file.
$extractor = FileTextExtractor::for_file($this->owner->FullPath);
@ -39,5 +52,3 @@ class FileTextExtractable extends DataExtension {
return $text;
}
}
?>

View File

@ -6,40 +6,89 @@
*
*/
abstract class FileTextExtractor extends Object {
/**
* Set priority from 0-100.
* The highest priority extractor for a given content type will be selected.
*
* @config
* @var int
* @var integer
*/
private static $priority = 50;
/**
* Cache of extractor class names, sorted by priority
*
* @var array
*/
protected static $sorted_extractor_classes = null;
/**
* @param String $path
* Gets the list of prioritised extractor classes
*
* @return array
*/
protected static function get_extractor_classes() {
    // Serve the memoised list when one has already been built
    if (self::$sorted_extractor_classes) return self::$sorted_extractor_classes;

    // Collect the candidate classes. The first entry is dropped
    // (presumably the abstract base class itself - confirm against ClassInfo).
    $classes = ClassInfo::subclassesFor("FileTextExtractor");
    array_shift($classes);

    // Build a separate class => priority map rather than writing back into
    // the array being iterated, so the result does not depend on how
    // subclassesFor() keys its return value.
    $priorities = array();
    foreach($classes as $class) {
        $priorities[$class] = Config::inst()->get($class, 'priority');
    }

    // Highest priority first, keeping the class names as keys
    arsort($priorities);

    // Save and return the ordered class names
    return self::$sorted_extractor_classes = array_keys($priorities);
}
/**
* Get the text file extractor for the given class
*
* @param string $class
* @return FileTextExtractor
*/
protected static function get_extractor($class) {
    // Resolve the extractor through the injector rather than instantiating directly
    $injector = Injector::inst();
    return $injector->get($class);
}
/**
* Attempt to detect mime type for given file
*
* @param string $path
* @return string Mime type if found
*/
protected static function get_mime($path) {
    // Mime detection is optional: without the fileinfo extension report nothing
    if(!class_exists('finfo')) return null;

    // finfo::file() raises a warning for a missing path, so check up front
    if(!$path || !is_file($path)) return null;

    // Check mime of file
    $finfo = new finfo(FILEINFO_MIME_TYPE);
    $mime = $finfo->file($path);

    // finfo::file() yields false on failure; normalise to null so that
    // "not found" is reported the same way on every path
    return $mime ? $mime : null;
}
/**
* @param string $path
* @return FileTextExtractor
*/
static function for_file($path) {
$extension = pathinfo($path, PATHINFO_EXTENSION);
$mime = self::get_mime($path);
foreach(self::get_extractor_classes() as $className) {
$extractor = self::get_extractor($className);
if (!self::$sorted_extractor_classes) {
// Generate the sorted list of extractors on demand.
$classes = ClassInfo::subclassesFor("FileTextExtractor");
array_shift($classes);
$sortedClasses = array();
foreach($classes as $class) $sortedClasses[$class] = Config::inst()->get($class, 'priority');
arsort($sortedClasses);
// Skip unavailable extractors
if(!$extractor->isAvailable()) continue;
self::$sorted_extractor_classes = $sortedClasses;
// Check extension
if($extension && $extractor->supportsExtension($extension)) {
return $extractor;
}
// Check mime
if($mime && $extractor->supportsMime($mime)) {
return $extractor;
}
foreach(self::$sorted_extractor_classes as $className => $priority) {
$formatter = new $className();
$matched = array_filter($formatter->supportedExtensions(), function($compare) use($extension) {
return (strtolower($compare) == strtolower($extension));
});
if($matched) return $formatter;
}
}
@ -49,21 +98,33 @@ abstract class FileTextExtractor extends Object {
*
* @return boolean
*/
abstract function isAvailable();
abstract public function isAvailable();
/**
* Return an array of content types that the extractor can handle.
* @return unknown_type
* Determine if this extractor supports the given extension.
* If support is determined by mime/type only, then this should return false.
*
* @param string $extension
* @return boolean
*/
abstract function supportedExtensions();
abstract public function supportsExtension($extension);
/**
* Determine if this extractor supports the given mime type.
* Will only be called if supportsExtension returns false.
*
* @param string $mime
* @return boolean
*/
abstract public function supportsMime($mime);
/**
* Given a file path, extract the contents as text.
*
* @param $path
* @return unknown_type
* @param string $path
* @return string
*/
abstract function getContent($path);
abstract public function getContent($path);
}
class FileTextExtractor_Exception extends Exception {}

View File

@ -7,16 +7,26 @@
*/
class HTMLTextExtractor extends FileTextExtractor {
function isAvailable() {
public function isAvailable() {
return true;
}
function supportedExtensions() {
return array("html", "htm", "xhtml");
public function supportsExtension($extension) {
return in_array(
strtolower($extension),
array("html", "htm", "xhtml")
);
}
public function supportsMime($mime) {
    // Case-insensitive match against the single supported mime type
    $normalised = strtolower($mime);
    if($normalised === 'text/html') {
        return true;
    }
    return false;
}
/**
* Lower priority because it's not the most clever HTML extraction. If there is something better, use it
*
* @config
* @var integer
*/
private static $priority = 10;
@ -25,10 +35,10 @@ class HTMLTextExtractor extends FileTextExtractor {
* combined with regular expressions to remove non-content tags like <style> or <script>,
* as well as adding line breaks after block tags.
*
* @param [type] $path [description]
* @return [type] [description]
* @param string $path
* @return string
*/
function getContent($path) {
public function getContent($path) {
$content = file_get_contents($path);
// Yes, yes, regex'ing HTML is evil.
// Since we don't care about well-formedness or markup here, it does the job.
@ -61,5 +71,3 @@ class HTMLTextExtractor extends FileTextExtractor {
return strip_tags($content);
}
}
?>

View File

@ -7,32 +7,64 @@
*/
class PDFTextExtractor extends FileTextExtractor {
function isAvailable() {
public function isAvailable() {
$bin = $this->bin('pdftotext');
return (file_exists($bin) && is_executable($bin));
}
function supportedExtensions() {
return array("pdf");
public function supportsExtension($extension) {
return strtolower($extension) === 'pdf';
}
public function supportsMime($mime) {
    // Mime types treated as PDF content
    $pdfTypes = array(
        'application/pdf',
        'application/x-pdf',
        'application/x-bzpdf',
        'application/x-gzpdf'
    );

    // Compare case-insensitively by lower-casing the candidate first
    $candidate = strtolower($mime);
    return in_array($candidate, $pdfTypes);
}
/**
* Accessor to get the location of the binary
* @param $prog
* @return unknown_type
*
* @param string $prog Name of binary
* @return string
*/
function bin($prog='') {
if ($this->stat('binary_location')) $path = $this->stat('binary_location'); // By static from _config.php
elseif (file_exists('/usr/bin/pdftotext')) $path = '/usr/bin'; // By searching common directories
elseif (file_exists('/usr/local/bin/pdftotext')) $path = '/usr/local/bin';
else $path = '.'; // Hope it's in path
protected function bin($prog = '') {
if ($this->config()->binary_location) {
// By config
$path = $this->config()->binary_location;
} elseif (file_exists('/usr/bin/pdftotext')) {
// By searching common directories
$path = '/usr/bin';
} elseif (file_exists('/usr/local/bin/pdftotext')) {
$path = '/usr/local/bin';
} else {
$path = '.'; // Hope it's in path
}
return ( $path ? $path . '/' : '' ) . $prog;
}
function getContent($path) {
public function getContent($path) {
if(!$path) return ""; // no file
exec(sprintf('%s "%s" - 2>&1', $this->bin('pdftotext'), $path), $content, $err);
$content = $this->getRawOutput($path);
return $this->cleanupLigatures($content);
}
/**
* Invoke pdftotext with the given path
*
* @param string $path
* @return string Output
* @throws FileTextExtractor_Exception
*/
protected function getRawOutput($path) {
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
if($err) {
throw new FileTextExtractor_Exception(sprintf(
'PDFTextExtractor->getContent() failed for %s: %s',
@ -42,6 +74,25 @@ class PDFTextExtractor extends FileTextExtractor {
}
return implode('', $content);
}
}
?>
/**
* Removes utf-8 ligatures.
*
* @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
*
* @param string $input
* @return string
*/
protected function cleanupLigatures($input) {
    // Keys are spelled as explicit UTF-8 byte sequences (U+FB00 - U+FB06)
    // so the mapping cannot be silently flattened to plain ASCII by an
    // editor or tool that normalises the source file, which would turn
    // every entry into a no-op replacement.
    $mapping = array(
        "\xEF\xAC\x80" => 'ff',  // U+FB00 LATIN SMALL LIGATURE FF
        "\xEF\xAC\x81" => 'fi',  // U+FB01 LATIN SMALL LIGATURE FI
        "\xEF\xAC\x82" => 'fl',  // U+FB02 LATIN SMALL LIGATURE FL
        "\xEF\xAC\x83" => 'ffi', // U+FB03 LATIN SMALL LIGATURE FFI
        "\xEF\xAC\x84" => 'ffl', // U+FB04 LATIN SMALL LIGATURE FFL
        "\xEF\xAC\x85" => 'ft',  // U+FB05 LATIN SMALL LIGATURE LONG S T
        "\xEF\xAC\x86" => 'st'   // U+FB06 LATIN SMALL LIGATURE ST
    );
    return str_replace(array_keys($mapping), array_values($mapping), $input);
}
}

View File

@ -13,8 +13,11 @@ use Guzzle\Http\Client;
class SolrCellTextExtractor extends FileTextExtractor {
/**
* Base URL to use for solr text extraction.
* E.g. http://localhost:8983/solr/update/extract
*
* @config
* @var [type]
* @var string
*/
private static $base_url;
@ -39,18 +42,22 @@ class SolrCellTextExtractor extends FileTextExtractor {
if(!$url) return false;
}
/**
* @see http://tika.apache.org/1.3/formats.html
* @return Array
*/
public function supportedExtensions() {
return array(
public function supportsExtension($extension) {
    // File extensions handed to the Solr extraction endpoint
    $handled = array(
        'pdf', 'doc', 'docx', 'xls', 'xlsx',
        'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
        'ppt', 'pptx', 'odp', 'fodp', 'csv'
    );

    // Extensions are compared case-insensitively
    $candidate = strtolower($extension);
    return in_array($candidate, $handled);
}
public function supportsMime($mime) {
// This extractor never matches by mime type: support is decided
// solely by supportsExtension(), so always decline here.
return false;
}
public function getContent($path) {
if (!$path) return ""; // no file

View File

@ -0,0 +1,103 @@
<?php
/**
 * Enables text extraction of file content via the Tika CLI
 *
 * Expects a `tika` command to be resolvable on the PATH of the PHP process.
 *
 * {@link http://tika.apache.org/1.7/gettingstarted.html}
 */
class TikaTextExtractor extends FileTextExtractor {

    /**
     * Text extraction locale. Use {locale} as a placeholder for the current locale, {default}
     * as the placeholder for the default locale
     *
     * NOTE(review): nothing inside this class reads this setting; confirm whether it is
     * consumed elsewhere or still needs to be wired into the command environment.
     *
     * @var string
     * @config
     */
    private static $locale = '{default}.utf-8';

    /**
     * Text extraction mode. Defaults to -t (plain text)
     *
     * @var string
     * @config
     */
    private static $output_mode = '-t';

    /**
     * Memoised output of `tika --list-supported-types`, or null when not yet fetched.
     * Every tika invocation starts a new java process, so the listing is only requested once
     * per extractor instance.
     *
     * @var string|null
     */
    protected $supportedTypesOutput = null;

    /**
     * Get the version of tika installed, or 0 if not installed
     *
     * @return string|int Version string as printed by tika (e.g. "1.7"), or 0 if it
     * could not be determined
     */
    public function getVersion() {
        $code = $this->runShell('tika --version', $stdout);

        // Parse output
        if(!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches)) {
            return $matches['version'];
        }

        return 0;
    }

    /**
     * Runs a shell command and captures its output.
     *
     * The command is passed to the shell as given: callers are responsible for escaping
     * any arguments (e.g. with escapeshellarg) before calling this method.
     *
     * @param string $command Full command including arguments
     * @param string &$stdout Standard output
     * @param string &$stderr Standard error
     * @param string $input Content to pass via standard input
     * @return int Exit code. 0 is success, 255 if the process could not be started
     */
    protected function runShell($command, &$stdout = '', &$stderr = '', $input = '') {
        $stdout = '';
        $stderr = '';

        // Spool stderr into a temporary file rather than a pipe. Reading stdout to EOF while
        // the child is blocked writing to a full stderr pipe would otherwise deadlock.
        // Fall back to a pipe if no temporary file can be created.
        $errorSpool = tmpfile();
        $descriptorSpecs = array(
            0 => array("pipe", "r"),
            1 => array("pipe", "w"),
            2 => $errorSpool ? $errorSpool : array("pipe", "w")
        );

        // Invoke command
        $pipes = array();
        $proc = proc_open($command, $descriptorSpecs, $pipes);
        if (!is_resource($proc)) {
            if($errorSpool) fclose($errorSpool);
            return 255;
        }

        // Send content as input. Input is written in full before any output is read, so this
        // is only suitable for small inputs.
        fwrite($pipes[0], $input);
        fclose($pipes[0]);

        // Get output
        $stdout = stream_get_contents($pipes[1]);
        fclose($pipes[1]);

        if($errorSpool) {
            // Wait for the process to finish so the spool is complete, then read it back
            $code = proc_close($proc);
            rewind($errorSpool);
            $stderr = stream_get_contents($errorSpool);
            fclose($errorSpool);
            return $code;
        }

        $stderr = stream_get_contents($pipes[2]);
        fclose($pipes[2]);

        // Get result
        return proc_close($proc);
    }

    /**
     * Extract the content of the given file using the configured output mode.
     *
     * @param string $path
     * @return string|null Extracted content, an empty string if no path was given,
     * or null if tika exited with an error
     */
    public function getContent($path) {
        // No file: same convention as the other extractors
        if(!$path) return "";

        $mode = $this->config()->output_mode;
        $command = sprintf('tika %s %s', $mode, escapeshellarg($path));
        $code = $this->runShell($command, $output);
        if($code == 0) return $output;

        return null;
    }

    /**
     * The extractor is usable whenever a tika version can be detected.
     *
     * @return boolean
     */
    public function isAvailable() {
        return $this->getVersion() > 0;
    }

    /**
     * @param string $extension
     * @return boolean Always false
     */
    public function supportsExtension($extension) {
        // Determine support via mime type only
        return false;
    }

    /**
     * Check whether tika lists the given mime type among its supported types.
     *
     * @param string $mime
     * @return boolean
     */
    public function supportsMime($mime) {
        // An empty mime would otherwise produce a pattern that matches anything
        if(!$mime) return false;

        // Get list of supported mime types, only keeping successful listings
        if($this->supportedTypesOutput === null) {
            $code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
            if($code) return false; // Error case
            $this->supportedTypesOutput = (string)$supportedTypes;
        }

        // Check if the mime type is inside the result. The type must be delimited by
        // whitespace (or the start / end of the output) on both sides: a plain word boundary
        // would let e.g. "application/x" match "application/x-pdf".
        $pattern = sprintf('/(?<!\S)%s(?!\S)/', preg_quote($mime, '/'));
        return (bool)preg_match($pattern, $this->supportedTypesOutput);
    }
}

View File

@ -18,7 +18,18 @@
"require": {
"php": ">=5.3.2",
"composer/installers": "*",
"silverstripe/framework": "~3.0",
"silverstripe/framework": "~3.1",
"guzzle/http": "*"
},
"require-dev": {
"phpunit/PHPUnit": "~3.7@stable"
},
"suggest": {
"ext-fileinfo": "Improved support for file mime detection"
},
"extra": {
"branch-alias": {
"dev-master": "2.0.x-dev"
}
}
}

View File

@ -5,6 +5,20 @@ class FileTextExtractableTest extends SapphireTest {
'File' => array('FileTextExtractable')
);
public function setUp() {
    parent::setUp();

    // Work on a nested config so the change can be discarded again,
    // and make sure that html is a valid extension
    $nestedConfig = Config::inst()->nest();
    $nestedConfig->update('File', 'allowed_extensions', array('html'));
}
public function tearDown() {
    // Drop the nested config before the parent cleans up
    Config::unnest();

    parent::tearDown();
}
function testExtractFileAsText() {
// Create a copy of the file, as it may be clobbered by the test
// ($file->extractFileAsText() calls $file->write)

View File

@ -0,0 +1,23 @@
<?php
/**
 * Tests the {@see TikaTextExtractor} class
 */
class TikaTextExtractorTest extends SapphireTest {

    /**
     * Extracts a fixture PDF and checks mime type validation.
     * Skipped when the extractor reports itself as unavailable.
     */
    function testExtraction() {
        $tika = new TikaTextExtractor();
        if(!$tika->isAvailable()) {
            $this->markTestSkipped('tika not available');
        }

        // Check file
        $fixturePath = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
        $extracted = $tika->getContent($fixturePath);
        $this->assertContains('This is a test file with a link', $extracted);

        // Check mime validation
        $pdfSupported = $tika->supportsMime('application/pdf');
        $this->assertTrue($pdfSupported);

        $htmlSupported = $tika->supportsMime('text/html');
        $this->assertTrue($htmlSupported);

        $bogusSupported = $tika->supportsMime('application/not-supported');
        $this->assertFalse($bogusSupported);
    }
}