Merge pull request #4 from tractorcow/pulls/tika-support

API Implement Tika support
This commit is contained in:
Ingo Schommer 2015-02-23 10:52:34 +13:00
commit 23d83b7d01
14 changed files with 400 additions and 83 deletions

View File

@ -2,17 +2,20 @@
language: php language: php
php: php:
- 5.3 - 5.4
env: env:
- DB=MYSQL CORE_RELEASE=3.0
- DB=MYSQL CORE_RELEASE=3.1 - DB=MYSQL CORE_RELEASE=3.1
- DB=PGSQL CORE_RELEASE=master - DB=MYSQL CORE_RELEASE=3
before_script: before_script:
- mkdir $HOME/bin
- export PATH=$PATH:$HOME/bin
- ./.travis/install_tika.sh
- sudo ./.travis/install_pdftotext.sh
- git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support - git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support
- php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss - php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss
- cd ~/builds/ss - cd ~/builds/ss
script: script:
- phpunit textextraction/tests/ - vendor/bin/phpunit --verbose textextraction/tests/

3
.travis/install_pdftotext.sh Executable file
View File

@ -0,0 +1,3 @@
#!/usr/bin/env bash
# CI helper: installs the xpdf package via apt. Needs root privileges.

# Refresh the package index first so the install resolves current versions
apt-get update

# --yes answers all prompts automatically (non-interactive build)
apt-get install --yes xpdf

6
.travis/install_tika.sh Executable file
View File

@ -0,0 +1,6 @@
#!/usr/bin/env bash
# CI helper: installs the Apache Tika command line app into $HOME/bin.
# Downloads the tika-app jar and writes a small `tika` wrapper script next to it
# that launches the jar with java, then smoke-tests the wrapper.

# Abort on the first failing step instead of carrying on with a broken install
set -e

# -p: do not fail when the directory already exists
mkdir -p "$HOME/bin"

wget 'http://www.us.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar"

# Overwrite (not append) so re-running the script never duplicates the wrapper body.
# The heredoc delimiter is quoted: $HOME and "$@" are expanded when the wrapper
# runs, not while this installer runs.
cat > "$HOME/bin/tika" <<'WRAPPER'
#!/usr/bin/env bash
exec java -jar "$HOME/bin/tika.jar" "$@"
WRAPPER
chmod ug+x "$HOME/bin/tika"

# Smoke test: a non-zero exit here fails the script
"$HOME/bin/tika" --version

View File

@ -18,6 +18,7 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
* SilverStripe 3.1 * SilverStripe 3.1
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility) * (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
* (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) * (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
* (optional) [Apache Tika](http://tika.apache.org/)
### Supported Formats ### Supported Formats
@ -28,6 +29,7 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
* CSV (Solr) * CSV (Solr)
* RTF (Solr) * RTF (Solr)
* EPub (Solr) * EPub (Solr)
* Many others (Tika)
## Installation ## Installation
@ -37,7 +39,7 @@ Add the following to your `composer.json`:
```js ```js
{ {
"require": { "require": {
"silverstripe/textextraction": "*" "silverstripe/textextraction": "2.0.x-dev"
} }
} }
``` ```
@ -53,9 +55,13 @@ through PEAR and ensure its in your `include_path`.
By default, only extraction from HTML documents is supported. By default, only extraction from HTML documents is supported.
No configuration is required for that, unless you want to make No configuration is required for that, unless you want to make
the content available through your `DataObject` subclass. the content available through your `DataObject` subclass.
In this case, add the following to `mysite/_config.php`: In this case, add the following to `mysite/_config/config.yml`:
DataObject::add_extension('File', 'FileTextExtractable'); ```yaml
File:
extensions:
- FileTextExtractable
```
### XPDF ### XPDF
@ -108,7 +114,7 @@ The property should be listed in your `SolrIndex` subclass, e.g. as follows:
class MySolrIndex extends SolrIndex { class MySolrIndex extends SolrIndex {
function init() { function init() {
$this->addClass('MyDocument'); $this->addClass('MyDocument');
$this->addFulltextField('Content', 'HTMLText'); $this->addStoredField('Content', 'HTMLText');
} }
} }
``` ```
@ -116,6 +122,16 @@ The property should be listed in your `SolrIndex` subclass, e.g. as follows:
Note: This isn't a terribly efficient way to process large amounts of files, since Note: This isn't a terribly efficient way to process large amounts of files, since
each HTTP request is run synchronously. each HTTP request is run synchronously.
### Tika
Support for Apache Tika (1.7 and above) is included for the standalone command line utility.
See [the Apache Tika home page](http://tika.apache.org/1.7/index.html) for instructions on installing and
configuring this.
This extension works best with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php)
installed, which is used to detect mime types. Tika determines support by mime type rather than by file extension.
## Usage ## Usage
Manual extraction: Manual extraction:

View File

@ -15,16 +15,29 @@ class FileTextExtractable extends DataExtension {
'FileContentCache' => 'Text' 'FileContentCache' => 'Text'
); );
private static $casting = array(
'FileContent' => 'Text'
);
/**
 * Template-facing accessor for the text content of this file.
 * Delegates to {@see extractFileAsText()} with its default arguments.
 *
 * @return string
 */
public function getFileContent() {
	$text = $this->extractFileAsText();
	return $text;
}
/** /**
* Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text. * Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text.
* The value is also cached into the File record itself. * The value is also cached into the File record itself.
* *
* @param $forceParse If false, the file content is only parsed on demand. If true, the content parsing is forced, bypassing the * @param boolean $disableCache If false, the file content is only parsed on demand.
* cached version * If true, the content parsing is forced, bypassing the cached version
* @return String * @return string
*/ */
function extractFileAsText($forceParse = false) { public function extractFileAsText($disableCache = false) {
if (!$forceParse && $this->owner->FileContentCache) return $this->owner->FileContentCache; if (!$disableCache && $this->owner->FileContentCache) return $this->owner->FileContentCache;
// Determine which extractor can process this file. // Determine which extractor can process this file.
$extractor = FileTextExtractor::for_file($this->owner->FullPath); $extractor = FileTextExtractor::for_file($this->owner->FullPath);
@ -39,5 +52,3 @@ class FileTextExtractable extends DataExtension {
return $text; return $text;
} }
} }
?>

View File

@ -6,40 +6,89 @@
* *
*/ */
abstract class FileTextExtractor extends Object { abstract class FileTextExtractor extends Object {
/** /**
* Set priority from 0-100. * Set priority from 0-100.
* The highest priority extractor for a given content type will be selected. * The highest priority extractor for a given content type will be selected.
* *
* @config * @config
* @var int * @var integer
*/ */
private static $priority = 50; private static $priority = 50;
/**
* Cache of extractor class names, sorted by priority
*
* @var array
*/
protected static $sorted_extractor_classes = null; protected static $sorted_extractor_classes = null;
/** /**
* @param String $path * Gets the list of prioritised extractor classes
*
* @return array
*/
protected static function get_extractor_classes() {
	// Check cache. null means "not built yet"; an empty list is a valid cached result
	if (self::$sorted_extractor_classes !== null) return self::$sorted_extractor_classes;

	// Generate the sorted list of extractors on demand.
	$classes = ClassInfo::subclassesFor("FileTextExtractor");
	// Drop the first entry (presumably the abstract base class itself - TODO confirm)
	array_shift($classes);

	// Build the class => priority map in a separate array, so the result does not
	// depend on how the list returned above happens to be keyed
	$priorities = array();
	foreach($classes as $class) {
		$priorities[$class] = Config::inst()->get($class, 'priority');
	}

	// Highest priority first, keeping the class names as keys
	arsort($priorities);

	// Save classes
	self::$sorted_extractor_classes = array_keys($priorities);
	return self::$sorted_extractor_classes;
}
/**
 * Resolve the extractor instance for the given class name through the Injector
 *
 * @param string $class Name of a FileTextExtractor subclass
 * @return FileTextExtractor
 */
protected static function get_extractor($class) {
	$injector = Injector::inst();
	return $injector->get($class);
}
/**
 * Attempt to detect mime type for given file
 *
 * @param string $path
 * @return string|null Mime type if found, or null if it could not be determined
 * (fileinfo extension missing, file missing/unreadable, or detection failed)
 */
protected static function get_mime($path) {
	if(!class_exists('finfo')) return null;

	// finfo::file() raises a warning for a missing or unreadable file, so bail out first
	if(!$path || !is_file($path) || !is_readable($path)) return null;

	// Check mime of file
	$finfo = new finfo(FILEINFO_MIME_TYPE);
	$mime = $finfo->file($path);

	// finfo::file() returns false on failure; normalise that to null
	return $mime ? $mime : null;
}
/**
* @param string $path
* @return FileTextExtractor * @return FileTextExtractor
*/ */
static function for_file($path) { static function for_file($path) {
$extension = pathinfo($path, PATHINFO_EXTENSION); $extension = pathinfo($path, PATHINFO_EXTENSION);
$mime = self::get_mime($path);
foreach(self::get_extractor_classes() as $className) {
$extractor = self::get_extractor($className);
if (!self::$sorted_extractor_classes) { // Skip unavailable extractors
// Generate the sorted list of extractors on demand. if(!$extractor->isAvailable()) continue;
$classes = ClassInfo::subclassesFor("FileTextExtractor");
array_shift($classes);
$sortedClasses = array();
foreach($classes as $class) $sortedClasses[$class] = Config::inst()->get($class, 'priority');
arsort($sortedClasses);
self::$sorted_extractor_classes = $sortedClasses; // Check extension
if($extension && $extractor->supportsExtension($extension)) {
return $extractor;
}
// Check mime
if($mime && $extractor->supportsMime($mime)) {
return $extractor;
} }
foreach(self::$sorted_extractor_classes as $className => $priority) {
$formatter = new $className();
$matched = array_filter($formatter->supportedExtensions(), function($compare) use($extension) {
return (strtolower($compare) == strtolower($extension));
});
if($matched) return $formatter;
} }
} }
@ -49,21 +98,33 @@ abstract class FileTextExtractor extends Object {
* *
* @return boolean * @return boolean
*/ */
abstract function isAvailable(); abstract public function isAvailable();
/** /**
* Return an array of content types that the extractor can handle. * Determine if this extractor supports the given extension.
* @return unknown_type * If support is determined by mime/type only, then this should return false.
*
* @param string $extension
* @return boolean
*/ */
abstract function supportedExtensions(); abstract public function supportsExtension($extension);
/**
* Determine if this extractor supports the given mime type.
* Will only be called if supportsExtension returns false.
*
* @param string $mime
* @return boolean
*/
abstract public function supportsMime($mime);
/** /**
* Given a file path, extract the contents as text. * Given a file path, extract the contents as text.
* *
* @param $path * @param string $path
* @return unknown_type * @return string
*/ */
abstract function getContent($path); abstract public function getContent($path);
} }
class FileTextExtractor_Exception extends Exception {} class FileTextExtractor_Exception extends Exception {}

View File

@ -7,16 +7,26 @@
*/ */
class HTMLTextExtractor extends FileTextExtractor { class HTMLTextExtractor extends FileTextExtractor {
function isAvailable() { public function isAvailable() {
return true; return true;
} }
function supportedExtensions() { public function supportsExtension($extension) {
return array("html", "htm", "xhtml"); return in_array(
strtolower($extension),
array("html", "htm", "xhtml")
);
}
public function supportsMime($mime) {
return strtolower($mime) === 'text/html';
} }
/** /**
* Lower priority because its not the most clever HTML extraction. If there is something better, use it * Lower priority because its not the most clever HTML extraction. If there is something better, use it
*
* @config
* @var integer
*/ */
private static $priority = 10; private static $priority = 10;
@ -25,10 +35,10 @@ class HTMLTextExtractor extends FileTextExtractor {
* combined with regular expressions to remove non-content tags like <style> or <script>, * combined with regular expressions to remove non-content tags like <style> or <script>,
* as well as adding line breaks after block tags. * as well as adding line breaks after block tags.
* *
* @param [type] $path [description] * @param string $path
* @return [type] [description] * @return string
*/ */
function getContent($path) { public function getContent($path) {
$content = file_get_contents($path); $content = file_get_contents($path);
// Yes, yes, regex'ing HTML is evil. // Yes, yes, regex'ing HTML is evil.
// Since we don't care about well-formedness or markup here, it does the job. // Since we don't care about well-formedness or markup here, it does the job.
@ -61,5 +71,3 @@ class HTMLTextExtractor extends FileTextExtractor {
return strip_tags($content); return strip_tags($content);
} }
} }
?>

View File

@ -7,32 +7,64 @@
*/ */
class PDFTextExtractor extends FileTextExtractor { class PDFTextExtractor extends FileTextExtractor {
function isAvailable() { public function isAvailable() {
$bin = $this->bin('pdftotext'); $bin = $this->bin('pdftotext');
return (file_exists($bin) && is_executable($bin)); return (file_exists($bin) && is_executable($bin));
} }
function supportedExtensions() { public function supportsExtension($extension) {
return array("pdf"); return strtolower($extension) === 'pdf';
}
public function supportsMime($mime) {
return in_array(
strtolower($mime),
array(
'application/pdf',
'application/x-pdf',
'application/x-bzpdf',
'application/x-gzpdf'
)
);
} }
/** /**
* Accessor to get the location of the binary * Accessor to get the location of the binary
* @param $prog *
* @return unknown_type * @param string $prog Name of binary
* @return string
*/ */
function bin($prog='') { protected function bin($prog = '') {
if ($this->stat('binary_location')) $path = $this->stat('binary_location'); // By static from _config.php if ($this->config()->binary_location) {
elseif (file_exists('/usr/bin/pdftotext')) $path = '/usr/bin'; // By searching common directories // By config
elseif (file_exists('/usr/local/bin/pdftotext')) $path = '/usr/local/bin'; $path = $this->config()->binary_location;
else $path = '.'; // Hope it's in path } elseif (file_exists('/usr/bin/pdftotext')) {
// By searching common directories
$path = '/usr/bin';
} elseif (file_exists('/usr/local/bin/pdftotext')) {
$path = '/usr/local/bin';
} else {
$path = '.'; // Hope it's in path
}
return ( $path ? $path . '/' : '' ) . $prog; return ( $path ? $path . '/' : '' ) . $prog;
} }
function getContent($path) { public function getContent($path) {
if(!$path) return ""; // no file if(!$path) return ""; // no file
exec(sprintf('%s "%s" - 2>&1', $this->bin('pdftotext'), $path), $content, $err); $content = $this->getRawOutput($path);
return $this->cleanupLigatures($content);
}
/**
* Invoke pdftotext with the given path
*
* @param string $path
* @return string Output
* @throws FileTextExtractor_Exception
*/
protected function getRawOutput($path) {
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
if($err) { if($err) {
throw new FileTextExtractor_Exception(sprintf( throw new FileTextExtractor_Exception(sprintf(
'PDFTextExtractor->getContent() failed for %s: %s', 'PDFTextExtractor->getContent() failed for %s: %s',
@ -42,6 +74,25 @@ class PDFTextExtractor extends FileTextExtractor {
} }
return implode('', $content); return implode('', $content);
} }
}
?> /**
* Removes utf-8 ligatures.
*
* @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
*
* @param string $input
* @return string
*/
protected function cleanupLigatures($input) {
	// Keys are the UTF-8 byte sequences of the Latin ligature code points
	// U+FB00..U+FB06. They are written as byte escapes so the mapping cannot be
	// silently flattened to plain ASCII (which would turn every entry into a
	// no-op) by editors or tools that normalise unicode text.
	$mapping = array(
		"\xEF\xAC\x80" => 'ff',  // U+FB00
		"\xEF\xAC\x81" => 'fi',  // U+FB01
		"\xEF\xAC\x82" => 'fl',  // U+FB02
		"\xEF\xAC\x83" => 'ffi', // U+FB03
		"\xEF\xAC\x84" => 'ffl', // U+FB04
		"\xEF\xAC\x85" => 'ft',  // U+FB05 (long s + t); replacement text kept as-is
		"\xEF\xAC\x86" => 'st'   // U+FB06
	);
	return str_replace(array_keys($mapping), array_values($mapping), $input);
}
}

View File

@ -13,8 +13,11 @@ use Guzzle\Http\Client;
class SolrCellTextExtractor extends FileTextExtractor { class SolrCellTextExtractor extends FileTextExtractor {
/** /**
* Base URL to use for solr text extraction.
* E.g. http://localhost:8983/solr/update/extract
*
* @config * @config
* @var [type] * @var string
*/ */
private static $base_url; private static $base_url;
@ -39,18 +42,22 @@ class SolrCellTextExtractor extends FileTextExtractor {
if(!$url) return false; if(!$url) return false;
} }
/** public function supportsExtension($extension) {
* @see http://tika.apache.org/1.3/formats.html return in_array(
* @return Array strtolower($extension),
*/ array(
public function supportedExtensions() {
return array(
'pdf', 'doc', 'docx', 'xls', 'xlsx', 'pdf', 'doc', 'docx', 'xls', 'xlsx',
'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods', 'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
'ppt', 'pptx', 'odp', 'fodp', 'csv' 'ppt', 'pptx', 'odp', 'fodp', 'csv'
)
); );
} }
public function supportsMime($mime) {
// Rely on supportsExtension
return false;
}
public function getContent($path) { public function getContent($path) {
if (!$path) return ""; // no file if (!$path) return ""; // no file

View File

@ -0,0 +1,103 @@
<?php
/**
 * Enables text extraction of file content via the Tika CLI
 *
 * All work is done by invoking a `tika` executable through the shell, so that
 * command must be resolvable on the PATH of the PHP process.
 *
 * {@link http://tika.apache.org/1.7/gettingstarted.html}
 */
class TikaTextExtractor extends FileTextExtractor {

	/**
	 * Text extraction locale. Use {locale} as a placeholder for the current locale, {default}
	 * as the placeholder for the default locale
	 *
	 * NOTE(review): this setting is not read by any method in this class.
	 *
	 * @var string
	 * @config
	 */
	private static $locale = '{default}.utf-8';

	/**
	 * Text extraction mode. Defaults to -t (plain text)
	 * Inserted verbatim (unescaped) into the tika command line.
	 *
	 * @var string
	 * @config
	 */
	private static $output_mode = '-t';

	/**
	 * Version string detected by a previous successful {@see getVersion()} call
	 *
	 * @var string|null
	 */
	protected $cachedVersion = null;

	/**
	 * Raw output of a previous successful `tika --list-supported-types` call
	 *
	 * @var string|null
	 */
	protected $cachedSupportedTypes = null;

	/**
	 * Get the version of tika installed, or 0 if not installed
	 *
	 * @return string|int Version string (e.g. "1.7") as reported by `tika --version`,
	 * or integer 0 if the command failed or its output was not recognised
	 */
	public function getVersion() {
		// Each invocation launches an external process, so remember a successful lookup
		if($this->cachedVersion !== null) return $this->cachedVersion;

		$code = $this->runShell('tika --version', $stdout);

		// Parse output
		if(!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches)) {
			$this->cachedVersion = $matches['version'];
			return $this->cachedVersion;
		}
		return 0;
	}

	/**
	 * Runs a shell command and captures its output.
	 *
	 * The command string is handed to proc_open() unchanged; this method does NOT
	 * escape anything, so callers must escape arguments themselves
	 * (e.g. with escapeshellarg()).
	 *
	 * @param string $command Full command including arguments
	 * @param string &$stdout Standard output
	 * @param string &$stderr Standard error
	 * @param string $input Content to pass via standard input
	 * @return int Exit code. 0 is success. 255 if the process could not be started
	 */
	protected function runShell($command, &$stdout = '', &$stderr = '', $input = '') {
		$descriptorSpecs = array(
			0 => array("pipe", "r"),
			1 => array("pipe", "w"),
			2 => array("pipe", "w")
		);

		// Invoke command
		$pipes = array();
		$proc = proc_open($command, $descriptorSpecs, $pipes);
		if (!is_resource($proc)) return 255;

		// Send content as input
		fwrite($pipes[0], $input);
		fclose($pipes[0]);

		// Get output
		// NOTE(review): stdout is drained completely before stderr is read. A child
		// that writes a large amount to stderr before closing stdout could block here;
		// confirm this is acceptable for the commands being run.
		$stdout = stream_get_contents($pipes[1]);
		fclose($pipes[1]);
		$stderr = stream_get_contents($pipes[2]);
		fclose($pipes[2]);

		// Get result
		return proc_close($proc);
	}

	/**
	 * Extract the content of the given file by running tika in the configured output mode
	 *
	 * @param string $path
	 * @return string|null Extracted content, an empty string if no path was given,
	 * or null if tika exited with a non-zero code
	 */
	public function getContent($path) {
		if(!$path) return ""; // no file

		$mode = $this->config()->output_mode;
		$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
		$code = $this->runShell($command, $output);
		if($code == 0) return $output;

		// Extraction failed; the failure is reported as "no content"
		return null;
	}

	/**
	 * Tika is considered available if `tika --version` reports a version above 0
	 *
	 * @return boolean
	 */
	public function isAvailable() {
		// Explicit version comparison rather than loose string-vs-int comparison
		return version_compare((string)$this->getVersion(), '0', '>');
	}

	public function supportsExtension($extension) {
		// Determine support via mime type only
		return false;
	}

	/**
	 * Check whether the given mime type appears in the output of
	 * `tika --list-supported-types`
	 *
	 * @param string $mime
	 * @return boolean
	 */
	public function supportsMime($mime) {
		// An empty mime would otherwise build a pattern that matches any output
		if(!is_string($mime) || $mime === '') return false;

		// Get list of supported mime types (remembered after the first successful call)
		if($this->cachedSupportedTypes === null) {
			$code = $this->runShell('tika --list-supported-types', $supportedTypes);
			if($code) return false; // Error case
			$this->cachedSupportedTypes = (string)$supportedTypes;
		}

		// Check if the mime type is inside the result as a complete token: it must not
		// be preceded or followed by another character that can be part of a mime type,
		// so "application/x" does not match "application/x-pdf". Mime types are
		// compared case-insensitively.
		$pattern = sprintf('/(?<![\w.+\/-])%s(?![\w.+\/-])/i', preg_quote($mime, '/'));
		return (bool)preg_match($pattern, $this->cachedSupportedTypes);
	}
}

View File

@ -18,7 +18,18 @@
"require": { "require": {
"php": ">=5.3.2", "php": ">=5.3.2",
"composer/installers": "*", "composer/installers": "*",
"silverstripe/framework": "~3.0", "silverstripe/framework": "~3.1",
"guzzle/http": "*" "guzzle/http": "*"
},
"require-dev": {
"phpunit/PHPUnit": "~3.7@stable"
},
"suggest": {
"ext-fileinfo": "Improved support for file mime detection"
},
"extra": {
"branch-alias": {
"dev-master": "2.0.x-dev"
}
} }
} }

View File

@ -5,6 +5,20 @@ class FileTextExtractableTest extends SapphireTest {
'File' => array('FileTextExtractable') 'File' => array('FileTextExtractable')
); );
public function setUp() {
parent::setUp();
// Ensure that html is a valid extension
Config::inst()
->nest()
->update('File', 'allowed_extensions', array('html'));
}
public function tearDown() {
Config::unnest();
parent::tearDown();
}
function testExtractFileAsText() { function testExtractFileAsText() {
// Create a copy of the file, as it may be clobbered by the test // Create a copy of the file, as it may be clobbered by the test
// ($file->extractFileAsText() calls $file->write) // ($file->extractFileAsText() calls $file->write)

View File

@ -0,0 +1,23 @@
<?php
/**
 * Tests the {@see TikaTextExtractor} class
 *
 * The test is skipped when the extractor reports that tika is not available.
 */
class TikaTextExtractorTest extends SapphireTest {

	public function testExtraction() {
		$tika = new TikaTextExtractor();
		if(!$tika->isAvailable()) {
			$this->markTestSkipped('tika not available');
		}

		// Text extraction from the PDF fixture
		$fixture = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
		$extracted = $tika->getContent($fixture);
		$this->assertContains('This is a test file with a link', $extracted);

		// Mime validation: two known types, one unknown type
		$this->assertTrue($tika->supportsMime('application/pdf'));
		$this->assertTrue($tika->supportsMime('text/html'));
		$this->assertFalse($tika->supportsMime('application/not-supported'));
	}
}