From 2977f85cb5b4f3c81c473a2819ce80bf8a49d37b Mon Sep 17 00:00:00 2001 From: Damian Mooyman Date: Wed, 18 Feb 2015 15:31:38 +1300 Subject: [PATCH] API Implement Tika support API Implement support for detection via mime-type as well as file extension API Implement FileContent property for safe usage in templates API instead of returning the list of extensions / mime types supported, support is determined on a per-file bases Marking dev-master as version 2.0 as this contains breaking changes --- .travis.yml | 23 +++-- .travis/install_pdftotext.sh | 3 + .travis/install_tika.sh | 6 ++ README.md | 24 ++++- code/extensions/FileTextExtractable.php | 25 +++-- code/extractors/FileTextExtractor.php | 109 +++++++++++++++++----- code/extractors/HTMLTextExtractor.php | 26 ++++-- code/extractors/PDFTextExtractor.php | 81 +++++++++++++--- code/extractors/SolrCellTextExtractor.php | 31 +++--- code/extractors/TikaTextExtractor.php | 103 ++++++++++++++++++++ composer.json | 13 ++- tests/FileTextExtractableTest.php | 14 +++ tests/PDFTextExtractorTest.php | 2 +- tests/TikaTextExtractorTest.php | 23 +++++ 14 files changed, 400 insertions(+), 83 deletions(-) create mode 100755 .travis/install_pdftotext.sh create mode 100755 .travis/install_tika.sh create mode 100644 code/extractors/TikaTextExtractor.php create mode 100644 tests/TikaTextExtractorTest.php diff --git a/.travis.yml b/.travis.yml index 5e99dc3..8f43db4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,18 +1,21 @@ # See https://github.com/silverstripe-labs/silverstripe-travis-support for setup details language: php -php: - - 5.3 +php: + - 5.4 env: - - DB=MYSQL CORE_RELEASE=3.0 - - DB=MYSQL CORE_RELEASE=3.1 - - DB=PGSQL CORE_RELEASE=master + - DB=MYSQL CORE_RELEASE=3.1 + - DB=MYSQL CORE_RELEASE=3 before_script: - - git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support - - php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss - - cd ~/builds/ss + - mkdir $HOME/bin + - 
export PATH=$PATH:$HOME/bin + - ./.travis/install_tika.sh + - sudo ./.travis/install_pdftotext.sh + - git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support + - php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss + - cd ~/builds/ss -script: - - phpunit textextraction/tests/ \ No newline at end of file +script: + - vendor/bin/phpunit --verbose textextraction/tests/ diff --git a/.travis/install_pdftotext.sh b/.travis/install_pdftotext.sh new file mode 100755 index 0000000..40e5d0d --- /dev/null +++ b/.travis/install_pdftotext.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash +apt-get update +apt-get install -y xpdf diff --git a/.travis/install_tika.sh b/.travis/install_tika.sh new file mode 100755 index 0000000..fc14055 --- /dev/null +++ b/.travis/install_tika.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +mkdir $HOME/bin +wget 'http://www.us.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar" +echo -e '#!/usr/bin/env bash\nexec java -jar $HOME/bin/tika.jar "$@"' >> $HOME/bin/tika +chmod ug+x $HOME/bin/tika +$HOME/bin/tika --version diff --git a/README.md b/README.md index 1d6306e..ee16c8f 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil * SilverStripe 3.1 * (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility) * (optional) [Apache Solr with ExtracingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) + * (optional) [Apache Tika](http://tika.apache.org/) ### Supported Formats @@ -28,6 +29,7 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil * CSV (Solr) * RTF (Solr) * EPub (Solr) + * Many others (Tika) ## Installation @@ -37,7 +39,7 @@ Add the following to your `composer.json`: ```js { "require": { - "silverstripe/textextraction": "*" + "silverstripe/textextraction": "2.0.x-dev" } } ``` @@ -53,9 +55,13 @@ through PEAR and ensure its in your `include_path`. 
By default, only extraction from HTML documents is supported. No configuration is required for that, unless you want to make the content available through your `DataObject` subclass. -In this case, add the following to `mysite/_config.php`: +In this case, add the following to `mysite/_config/config.yml`: - DataObject::add_extension('File', 'FileTextExtractable'); + ```yaml + File: + extensions: + - FileTextExtractable + ``` ### XPDF @@ -108,7 +114,7 @@ The property should be listed in your `SolrIndex` subclass, e.g. as follows: class MySolrIndex extends SolrIndex { function init() { $this->addClass('MyDocument'); - $this->addFulltextField('Content', 'HTMLText'); + $this->addStoredField('Content', 'HTMLText'); } } ``` @@ -116,6 +122,16 @@ The property should be listed in your `SolrIndex` subclass, e.g. as follows: Note: This isn't a terribly efficient way to process large amounts of files, since each HTTP request is run synchronously. +### Tika + +Support for Apache Tika (1.7 and above) is included for the standalone command line utility. + +See [the Apache Tika home page](http://tika.apache.org/1.7/index.html) for instructions on installing and +configuring this. + +This extension works best with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php) +installed to perform mime detection. Tika validates support via mime type rather than file extensions. 
+ ## Usage Manual extraction: diff --git a/code/extensions/FileTextExtractable.php b/code/extensions/FileTextExtractable.php index 39daa22..16a1756 100644 --- a/code/extensions/FileTextExtractable.php +++ b/code/extensions/FileTextExtractable.php @@ -15,16 +15,29 @@ class FileTextExtractable extends DataExtension { 'FileContentCache' => 'Text' ); + private static $casting = array( + 'FileContent' => 'Text' + ); + + /** + * Helper function for template + * + * @return string + */ + public function getFileContent() { + return $this->extractFileAsText(); + } + /** * Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text. * The value is also cached into the File record itself. * - * @param $forceParse If false, the file content is only parsed on demand. If true, the content parsing is forced, bypassing the - * cached version - * @return String + * @param boolean $disableCache If false, the file content is only parsed on demand. + * If true, the content parsing is forced, bypassing the cached version + * @return string */ - function extractFileAsText($forceParse = false) { - if (!$forceParse && $this->owner->FileContentCache) return $this->owner->FileContentCache; + public function extractFileAsText($disableCache = false) { + if (!$disableCache && $this->owner->FileContentCache) return $this->owner->FileContentCache; // Determine which extractor can process this file. $extractor = FileTextExtractor::for_file($this->owner->FullPath); @@ -39,5 +52,3 @@ class FileTextExtractable extends DataExtension { return $text; } } - -?> \ No newline at end of file diff --git a/code/extractors/FileTextExtractor.php b/code/extractors/FileTextExtractor.php index a43427f..576037b 100644 --- a/code/extractors/FileTextExtractor.php +++ b/code/extractors/FileTextExtractor.php @@ -6,40 +6,89 @@ * */ abstract class FileTextExtractor extends Object { + /** * Set priority from 0-100. 
* The highest priority extractor for a given content type will be selected. * * @config - * @var int + * @var integer */ private static $priority = 50; + /** + * Cache of extractor class names, sorted by priority + * + * @var array + */ protected static $sorted_extractor_classes = null; /** - * @param String $path + * Gets the list of prioritised extractor classes + * + * @return array + */ + protected static function get_extractor_classes() { + // Check cache + if (self::$sorted_extractor_classes) return self::$sorted_extractor_classes; + + // Generate the sorted list of extractors on demand. + $classes = ClassInfo::subclassesFor("FileTextExtractor"); + array_shift($classes); + foreach($classes as $class) $classes[$class] = Config::inst()->get($class, 'priority'); + arsort($classes); + + // Save classes + $sortedClasses = array_keys($classes); + return self::$sorted_extractor_classes = $sortedClasses; + } + + /** + * Get the text file extractor for the given class + * + * @param string $class + * @return FileTextExtractor + */ + protected static function get_extractor($class) { + return Injector::inst()->get($class); + } + + /** + * Attempt to detect mime type for given file + * + * @param string $path + * @return string Mime type if found + */ + protected static function get_mime($path) { + if(!class_exists('finfo')) return null; + + // Check mime of file + $finfo = new finfo(FILEINFO_MIME_TYPE); + return $finfo->file($path); + } + + /** + * @param string $path * @return FileTextExtractor */ static function for_file($path) { $extension = pathinfo($path, PATHINFO_EXTENSION); + $mime = self::get_mime($path); + foreach(self::get_extractor_classes() as $className) { + $extractor = self::get_extractor($className); - if (!self::$sorted_extractor_classes) { - // Generate the sorted list of extractors on demand. 
- $classes = ClassInfo::subclassesFor("FileTextExtractor"); - array_shift($classes); - $sortedClasses = array(); - foreach($classes as $class) $sortedClasses[$class] = Config::inst()->get($class, 'priority'); - arsort($sortedClasses); + // Skip unavailable extractors + if(!$extractor->isAvailable()) continue; - self::$sorted_extractor_classes = $sortedClasses; - } - foreach(self::$sorted_extractor_classes as $className => $priority) { - $formatter = new $className(); - $matched = array_filter($formatter->supportedExtensions(), function($compare) use($extension) { - return (strtolower($compare) == strtolower($extension)); - }); - if($matched) return $formatter; + // Check extension + if($extension && $extractor->supportsExtension($extension)) { + return $extractor; + } + + // Check mime + if($mime && $extractor->supportsMime($mime)) { + return $extractor; + } } } @@ -49,21 +98,33 @@ abstract class FileTextExtractor extends Object { * * @return boolean */ - abstract function isAvailable(); + abstract public function isAvailable(); /** - * Return an array of content types that the extractor can handle. - * @return unknown_type + * Determine if this extractor supports the given extension. + * If support is determined by mime type only, then this should return false. + * + * @param string $extension + * @return boolean */ - abstract function supportedExtensions(); + abstract public function supportsExtension($extension); + + /** + * Determine if this extractor supports the given mime type. + * Will only be called if supportsExtension returns false. + * + * @param string $mime + * @return boolean + */ + abstract public function supportsMime($mime); /** * Given a file path, extract the contents as text. 
* - * @param $path - * @return unknown_type + * @param string $path + * @return string */ - abstract function getContent($path); + abstract public function getContent($path); } class FileTextExtractor_Exception extends Exception {} \ No newline at end of file diff --git a/code/extractors/HTMLTextExtractor.php b/code/extractors/HTMLTextExtractor.php index 803d99e..a41fe23 100644 --- a/code/extractors/HTMLTextExtractor.php +++ b/code/extractors/HTMLTextExtractor.php @@ -7,16 +7,26 @@ */ class HTMLTextExtractor extends FileTextExtractor { - function isAvailable() { - return true; + public function isAvailable() { + return true; } - function supportedExtensions() { - return array("html", "htm", "xhtml"); + public function supportsExtension($extension) { + return in_array( + strtolower($extension), + array("html", "htm", "xhtml") + ); + } + + public function supportsMime($mime) { + return strtolower($mime) === 'text/html'; } /** * Lower priority because its not the most clever HTML extraction. If there is something better, use it + * + * @config + * @var integer */ private static $priority = 10; @@ -25,10 +35,10 @@ class HTMLTextExtractor extends FileTextExtractor { * combined with regular expressions to remove non-content tags like