API Implement Tika support

API Implement support for detection via mime-type as well as file extension. API Implement FileContent property for safe usage in templates. API Instead of returning the list of extensions / mime types supported, support is determined on a per-file basis. Marking dev-master as version 2.0 as this contains breaking changes.
2024-10-22 11:06:00 +02:00 · 2015-02-18 15:31:38 +13:00 · 2015-02-18 15:31:38 +13:00 · 2977f85cb5
commit 2977f85cb5
parent 526de4586c
14 changed files with 400 additions and 83 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -2,17 +2,20 @@

 language: php 
 php:
- - 5.3
+  - 5.4

 env:
- - DB=MYSQL CORE_RELEASE=3.0
  - DB=MYSQL CORE_RELEASE=3.1
- - DB=PGSQL CORE_RELEASE=master
+  - DB=MYSQL CORE_RELEASE=3

 before_script:
+  - mkdir $HOME/bin
+  - export PATH=$PATH:$HOME/bin
+  - ./.travis/install_tika.sh
+  - sudo ./.travis/install_pdftotext.sh
  - git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support
  - php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss
  - cd ~/builds/ss

 script:
- - phpunit textextraction/tests/
+  - vendor/bin/phpunit --verbose textextraction/tests/
--- a/.travis/install_pdftotext.sh
+++ b/.travis/install_pdftotext.sh
@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+apt-get update
+apt-get install -y xpdf
--- a/.travis/install_tika.sh
+++ b/.travis/install_tika.sh
@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+mkdir $HOME/bin
+wget 'http://www.us.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar"
+echo -e '#!/usr/bin/env bash\nexec java -jar $HOME/bin/tika.jar "$@"' >> $HOME/bin/tika
+chmod ug+x $HOME/bin/tika
+$HOME/bin/tika --version
--- a/README.md
+++ b/README.md
@ -18,6 +18,7 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
 * SilverStripe 3.1
 * (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
 * (optional) [Apache Solr with ExtracingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
+ * (optional) [Apache Tika](http://tika.apache.org/)

 ### Supported Formats

@ -28,6 +29,7 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
 * CSV (Solr)
 * RTF (Solr)
 * EPub (Solr)
+ * Many others (Tika)

 ## Installation

@ -37,7 +39,7 @@ Add the following to your `composer.json`:
 	```js
 	{
 		"require": {
-			"silverstripe/textextraction": "*"
+			"silverstripe/textextraction": "2.0.x-dev"
 		}
 	}
 	```
@ -53,9 +55,13 @@ through PEAR and ensure its in your `include_path`.
 By default, only extraction from HTML documents is supported.
 No configuration is required for that, unless you want to make
 the content available through your `DataObject` subclass.
-In this case, add the following to `mysite/_config.php`:
+In this case, add the following to `mysite/_config/config.yml`:

-	DataObject::add_extension('File', 'FileTextExtractable');
+	```yaml
+	File:
+	  extensions:
+	    - FileTextExtractable
+	```

 ### XPDF

@ -108,7 +114,7 @@ The property should be listed in your `SolrIndex` subclass, e.g. as follows:
 	class MySolrIndex extends SolrIndex {
 		function init() {
 			$this->addClass('MyDocument');
-			$this->addFulltextField('Content', 'HTMLText');
+			$this->addStoredField('Content', 'HTMLText');
 		}
 	}
 	```
@ -116,6 +122,16 @@ The property should be listed in your `SolrIndex` subclass, e.g. as follows:
 Note: This isn't a terribly efficient way to process large amounts of files, since 
 each HTTP request is run synchronously.

+### Tika
+
+Support for Apache Tika (1.7 and above) is included for the standalone command line utility.
+
+See [the Apache Tika home page](http://tika.apache.org/1.7/index.html) for instructions on installing and
+configuring this.
+
+This extension works best with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php)
+installed to perform mime detection. Tika validates support via mime type rather than file extensions.
+
 ## Usage

 Manual extraction:
--- a/code/extensions/FileTextExtractable.php
+++ b/code/extensions/FileTextExtractable.php
@ -15,16 +15,29 @@ class FileTextExtractable extends DataExtension {
 		'FileContentCache' => 'Text'
 	);

+	private static $casting = array(
+		'FileContent' => 'Text'
+	);
+
+	/**
+	 * Helper function for template
+	 *
+	 * @return string
+	 */
+	public function getFileContent() {
+		return $this->extractFileAsText();
+	}
+
 	/**
 	 * Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text.
 	 * The value is also cached into the File record itself.
 	 * 
-	 * @param $forceParse		If false, the file content is only parsed on demand. If true, the content parsing is forced, bypassing the
-	 * 	cached version
-	 * @return String
+	 * @param boolean $disableCache If false, the file content is only parsed on demand.
+	 * If true, the content parsing is forced, bypassing the cached version
+	 * @return string
 	 */
-	function extractFileAsText($forceParse = false) {
-		if (!$forceParse && $this->owner->FileContentCache) return $this->owner->FileContentCache;
+	public function extractFileAsText($disableCache = false) {
+		if (!$disableCache && $this->owner->FileContentCache) return $this->owner->FileContentCache;

 		// Determine which extractor can process this file.
 		$extractor = FileTextExtractor::for_file($this->owner->FullPath);
@ -39,5 +52,3 @@ class FileTextExtractable extends DataExtension {
 		return $text;
 	}
 }
-
-?>
--- a/code/extractors/FileTextExtractor.php
+++ b/code/extractors/FileTextExtractor.php
@ -6,40 +6,89 @@
 *
 */
 abstract class FileTextExtractor extends Object {
+
 	/**
 	 * Set priority from 0-100.
 	 * The highest priority extractor for a given content type will be selected.
 	 *
 	 * @config
-	 * @var int
+	 * @var integer
 	 */
 	private static $priority = 50;

+	/**
+	 * Cache of extractor class names, sorted by priority
+	 *
+	 * @var array
+	 */
 	protected static $sorted_extractor_classes = null;

 	/**
-	 * @param  String $path
+	 * Gets the list of prioritised extractor classes
+	 *
+	 * @return array
+	 */
+	protected static function get_extractor_classes() {
+		// Check cache
+		if (self::$sorted_extractor_classes) return self::$sorted_extractor_classes;
+		
+		// Generate the sorted list of extractors on demand.
+		$classes = ClassInfo::subclassesFor("FileTextExtractor");
+		array_shift($classes);
+		foreach($classes as $class) $classes[$class] = Config::inst()->get($class, 'priority');
+		arsort($classes);
+
+		// Save classes
+		$sortedClasses = array_keys($classes);
+		return self::$sorted_extractor_classes = $sortedClasses;
+	}
+
+	/**
+	 * Get the text file extractor for the given class
+	 *
+	 * @param string $class
+	 * @return FileTextExtractor
+	 */
+	protected static function get_extractor($class) {
+		return Injector::inst()->get($class);
+	}
+
+	/**
+	 * Attempt to detect mime type for given file
+	 *
+	 * @param string $path
+	 * @return string Mime type if found
+	 */
+	protected static function get_mime($path) {
+		if(!class_exists('finfo')) return null;
+
+		// Check mime of file
+		$finfo = new finfo(FILEINFO_MIME_TYPE);
+		return $finfo->file($path);
+	}
+
+	/**
+	 * @param string $path
 	 * @return FileTextExtractor
 	 */
 	static function for_file($path) {
 		$extension = pathinfo($path, PATHINFO_EXTENSION);
+		$mime = self::get_mime($path);
+		foreach(self::get_extractor_classes() as $className) {
+			$extractor = self::get_extractor($className);

-		if (!self::$sorted_extractor_classes) {
-			// Generate the sorted list of extractors on demand.
-			$classes = ClassInfo::subclassesFor("FileTextExtractor");
-			array_shift($classes);
-			$sortedClasses = array();
-			foreach($classes as $class) $sortedClasses[$class] = Config::inst()->get($class, 'priority');
-			arsort($sortedClasses);
+			// Skip unavailable extractors
+			if(!$extractor->isAvailable()) continue;

-			self::$sorted_extractor_classes = $sortedClasses;
+			// Check extension
+			if($extension && $extractor->supportsExtension($extension)) {
+				return $extractor;
+			}
+
+			// Check mime
+			if($mime && $extractor->supportsMime($mime)) {
+				return $extractor;
 			}
-		foreach(self::$sorted_extractor_classes as $className => $priority) {
-			$formatter = new $className();
-			$matched = array_filter($formatter->supportedExtensions(), function($compare) use($extension) {
-				return (strtolower($compare) == strtolower($extension));
-			});
-			if($matched) return $formatter;
 		}
 	}

@ -49,21 +98,33 @@ abstract class FileTextExtractor extends Object {
 	 * 
 	 * @return boolean
 	 */
-	abstract function isAvailable();
+	abstract public function isAvailable();

 	/**
-	 * Return an array of content types that the extractor can handle.
-	 * @return unknown_type
+	 * Determine if this extractor supports the given extension.
+	 * If support is determined by mime/type only, then this should return false.
+	 *
+	 * @param string $extension
+	 * @return boolean
 	 */
-	abstract function supportedExtensions();
+	abstract public function supportsExtension($extension);
+
+	/**
+	 * Determine if this extractor supports the given mime type.
+	 * Will only be called if the file has no extension or supportsExtension returns false.
+	 * 
+	 * @param string $mime
+	 * @return boolean
+	 */
+	abstract public function supportsMime($mime);

 	/**
 	 * Given a file path, extract the contents as text.
 	 * 
-	 * @param $path
-	 * @return unknown_type
+	 * @param string $path
+	 * @return string
 	 */
-	abstract function getContent($path);
+	abstract public function getContent($path);
 }

 class FileTextExtractor_Exception extends Exception {}
--- a/code/extractors/HTMLTextExtractor.php
+++ b/code/extractors/HTMLTextExtractor.php
@ -7,16 +7,26 @@
 */
 class HTMLTextExtractor extends FileTextExtractor {
 	
-	function isAvailable() {
+	public function isAvailable() {
 		return true;
 	}

-	function supportedExtensions() {
-		return array("html", "htm", "xhtml");
+	public function supportsExtension($extension) {
+		return in_array(
+			strtolower($extension),
+			array("html", "htm", "xhtml")
+		);
+	}
+
+	public function supportsMime($mime) {
+		return strtolower($mime) === 'text/html';
 	}

 	/**
 	 * Lower priority because its not the most clever HTML extraction. If there is something better, use it
+	 *
+	 * @config
+	 * @var integer
 	 */
 	private static $priority = 10;

@ -25,10 +35,10 @@ class HTMLTextExtractor extends FileTextExtractor {
 	 * combined with regular expressions to remove non-content tags like <style> or <script>,
 	 * as well as adding line breaks after block tags.
 	 * 
-	 * @param  [type] $path [description]
-	 * @return [type]       [description]
+	 * @param string $path
+	 * @return string
 	 */
-	function getContent($path) {
+	public function getContent($path) {
 		$content = file_get_contents($path);
 		// Yes, yes, regex'ing HTML is evil.
 		// Since we don't care about well-formedness or markup here, it does the job.
@ -61,5 +71,3 @@ class HTMLTextExtractor extends FileTextExtractor {
 		return strip_tags($content);
 	}
 }
-
-?>
--- a/code/extractors/PDFTextExtractor.php
+++ b/code/extractors/PDFTextExtractor.php
@ -7,32 +7,64 @@
 */
 class PDFTextExtractor extends FileTextExtractor {

-	function isAvailable() {
+	public function isAvailable() {
 		$bin = $this->bin('pdftotext');
 		return (file_exists($bin) && is_executable($bin));
 	}
 	
-	function supportedExtensions() {
-		return array("pdf");
+	public function supportsExtension($extension) {
+		return strtolower($extension) === 'pdf';
+	}
+
+	public function supportsMime($mime) {
+		return in_array(
+			strtolower($mime),
+			array(
+				'application/pdf',
+				'application/x-pdf',
+				'application/x-bzpdf',
+				'application/x-gzpdf'
+			)
+		);
 	}

 	/**
 	 * Accessor to get the location of the binary
-	 * @param $prog
-	 * @return unknown_type
+	 *
+	 * @param string $prog Name of binary
+	 * @return string
 	 */
-	function bin($prog='') {
-		if ($this->stat('binary_location')) $path = $this->stat('binary_location'); // By static from _config.php
-		elseif (file_exists('/usr/bin/pdftotext'))  $path = '/usr/bin';                      // By searching common directories
-		elseif (file_exists('/usr/local/bin/pdftotext')) $path = '/usr/local/bin';
-		else $path = '.'; // Hope it's in path
+	protected function bin($prog = '') {
+		if ($this->config()->binary_location) {
+			// By config
+			$path = $this->config()->binary_location;
+		} elseif (file_exists('/usr/bin/pdftotext')) {
+			// By searching common directories
+			$path = '/usr/bin';
+		} elseif (file_exists('/usr/local/bin/pdftotext')) {
+			$path = '/usr/local/bin';
+		} else {
+			$path = '.'; // Hope it's in path
+		}

 		return ( $path ? $path . '/' : '' ) . $prog;
 	}
 	
-	function getContent($path) {
-		if (!$path) return ""; // no file
-		exec(sprintf('%s "%s" - 2>&1', $this->bin('pdftotext'), $path), $content, $err);
+	public function getContent($path) {
+		if(!$path) return ""; // no file
+		$content = $this->getRawOutput($path);
+		return $this->cleanupLigatures($content);
+	}
+
+	/**
+	 * Invoke pdftotext with the given path
+	 *
+	 * @param string $path
+	 * @return string Output
+	 * @throws FileTextExtractor_Exception
+	 */
+	protected function getRawOutput($path) {
+		exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
 		if($err) {
 			throw new FileTextExtractor_Exception(sprintf(
 				'PDFTextExtractor->getContent() failed for %s: %s',
@ -42,6 +74,25 @@ class PDFTextExtractor extends FileTextExtractor {
 		}
 		return implode('', $content);
 	}
-}

-?>
+	/**
+	 * Removes utf-8 ligatures.
+	 *
+	 * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
+	 *
+	 * @param string $input
+	 * @return string
+	 */
+	protected function cleanupLigatures($input) {
+		$mapping = array(
+			'ﬀ' => 'ff',
+			'ﬁ' => 'fi',
+			'ﬂ' => 'fl',
+			'ﬃ' => 'ffi',
+			'ﬄ' => 'ffl',
+			'ﬅ' => 'ft',
+			'ﬆ' => 'st'
+		);
+		return str_replace(array_keys($mapping), array_values($mapping), $input);
+	}
+}
--- a/code/extractors/SolrCellTextExtractor.php
+++ b/code/extractors/SolrCellTextExtractor.php
@ -13,8 +13,11 @@ use Guzzle\Http\Client;
 class SolrCellTextExtractor extends FileTextExtractor {

 	/**
+	 * Base URL to use for solr text extraction.
+	 * E.g. http://localhost:8983/solr/update/extract
+	 *
 	 * @config
-	 * @var [type]
+	 * @var string
 	 */
 	private static $base_url;

@ -39,18 +42,22 @@ class SolrCellTextExtractor extends FileTextExtractor {
 		if(!$url) return false;
 	}

-	/**
-	 * @see  http://tika.apache.org/1.3/formats.html
-	 * @return Array
-	 */
-	public function supportedExtensions() {
-		return array(
+	public function supportsExtension($extension) {
+		return in_array(
+			strtolower($extension),
+			array(
 				'pdf', 'doc', 'docx', 'xls', 'xlsx',
 				'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
 				'ppt', 'pptx', 'odp', 'fodp', 'csv'
+			)
 		);
 	}

+	public function supportsMime($mime) {
+		// Rely on supportsExtension
+		return false;
+	}
+	
 	public function getContent($path) {
 		if (!$path) return ""; // no file
 		
--- a/code/extractors/TikaTextExtractor.php
+++ b/code/extractors/TikaTextExtractor.php
@ -0,0 +1,103 @@
+<?php
+
+/**
+ * Enables text extraction of file content via the Tika CLI
+ * 
+ * {@link http://tika.apache.org/1.7/gettingstarted.html}
+ */
+class TikaTextExtractor extends FileTextExtractor {
+
+	/**
+	 * Text extraction locale. Use {locale} as a placeholder for the current locale, {default}
+	 * as the placeholder for the default locale
+	 *
+	 * @var string
+	 * @config
+	 */
+	private static $locale = '{default}.utf-8';
+
+	/**
+	 * Text extraction mode. Defaults to -t (plain text)
+	 *
+	 * @var string
+	 * @config
+	 */
+	private static $output_mode = '-t';
+
+	/**
+	 * Get the version of tika installed, or 0 if not installed
+	 *
+	 * @return string|int Version string of tika as reported by `tika --version`, or 0 if not installed
+	 */
+	public function getVersion() {
+		$code = $this->runShell('tika --version', $stdout);
+
+		// Parse output
+		if(!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches)) {
+			return $matches['version'];
+		}
+
+		return 0;
+	}
+
+	/**
+	 * Runs an arbitrary shell command as given; the caller must safely escape any arguments beforehand
+	 *
+	 * @param string $command Full command including arguments
+	 * @param string &$stdout Standard output
+	 * @param string &$stderr Standard error
+	 * @param string $input Content to pass via standard input
+	 * @return int Exit code. 0 is success
+	 */
+	protected function runShell($command, &$stdout = '', &$stderr = '', $input = '') {
+		$descriptorSpecs = array(
+			0 => array("pipe", "r"),
+			1 => array("pipe", "w"),
+			2 => array("pipe", "w")
+		);
+		// Invoke command
+		$pipes = array();
+		$proc = proc_open($command, $descriptorSpecs, $pipes);
+		if (!is_resource($proc)) return 255;
+
+		// Send content as input
+		fwrite($pipes[0], $input);
+		fclose($pipes[0]);
+
+		// Get output
+		$stdout = stream_get_contents($pipes[1]);
+		fclose($pipes[1]);
+		$stderr = stream_get_contents($pipes[2]);
+		fclose($pipes[2]);
+
+		// Get result
+		return proc_close($proc);
+	}
+	
+	public function getContent($path) {
+		$mode = $this->config()->output_mode;
+		$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
+		$code = $this->runShell($command, $output);
+		if($code == 0) return $output;
+	}
+
+	public function isAvailable() {
+		return $this->getVersion() > 0;
+	}
+
+	public function supportsExtension($extension) {
+		// Determine support via mime type only
+		return false;
+	}
+
+	public function supportsMime($mime) {
+		// Get list of supported mime types
+		$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
+		if($code) return false; // Error case
+
+		// Check if the mime type is inside the result
+		$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
+		return (bool)preg_match($pattern, $supportedTypes);
+	}
+
+}
--- a/composer.json
+++ b/composer.json
@ -18,7 +18,18 @@
 	"require": {
 		"php": ">=5.3.2",
 		"composer/installers": "*",
-		"silverstripe/framework": "~3.0",
+		"silverstripe/framework": "~3.1",
 		"guzzle/http": "*"
+	},
+	"require-dev": {
+		"phpunit/PHPUnit": "~3.7@stable"
+	},
+	"suggest": {
+		"ext-fileinfo": "Improved support for file mime detection"
+	},
+	"extra": {
+		"branch-alias": {
+			"dev-master": "2.0.x-dev"
+		}
 	}
 }
--- a/tests/FileTextExtractableTest.php
+++ b/tests/FileTextExtractableTest.php
@ -5,6 +5,20 @@ class FileTextExtractableTest extends SapphireTest {
 		'File' => array('FileTextExtractable')
 	);

+	public function setUp() {
+		parent::setUp();
+
+		// Ensure that html is a valid extension
+		Config::inst()
+			->nest()
+			->update('File', 'allowed_extensions', array('html'));
+	}
+
+	public function tearDown() {
+		Config::unnest();
+		parent::tearDown();
+	}
+
 	function testExtractFileAsText() {
 		// Create a copy of the file, as it may be clobbered by the test
 		// ($file->extractFileAsText() calls $file->write)
--- a/tests/TikaTextExtractorTest.php
+++ b/tests/TikaTextExtractorTest.php
@ -0,0 +1,23 @@
+<?php
+
+/**
+ * Tests the {@see TikaTextExtractor} class
+ */
+class TikaTextExtractorTest extends SapphireTest {
+	
+	function testExtraction() {
+		$extractor = new TikaTextExtractor();
+		if(!$extractor->isAvailable()) $this->markTestSkipped('tika not available');
+
+		// Check file
+		$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
+		$content = $extractor->getContent($file);
+		$this->assertContains('This is a test file with a link', $content);
+
+		// Check mime validation
+		$this->assertTrue($extractor->supportsMime('application/pdf'));
+		$this->assertTrue($extractor->supportsMime('text/html'));
+		$this->assertFalse($extractor->supportsMime('application/not-supported'));
+	}
+
+}