API Implement Tika support

API Implement support for detection via mime-type as well as file extension. API Implement FileContent property for safe usage in templates. API Instead of returning the list of extensions / mime types supported, support is determined on a per-file basis. Marking dev-master as version 2.0 as this contains breaking changes.
2024-06-23 13:09:26 +02:00 · 2015-02-18 15:31:38 +13:00 · 2015-02-18 15:31:38 +13:00 · 2977f85cb5
commit 2977f85cb5
parent 526de4586c
14 changed files with 400 additions and 83 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -2,17 +2,20 @@
 language: php 
 php:
- - 5.3
+  - 5.4
 env:
 - DB=MYSQL CORE_RELEASE=3.0
  - DB=MYSQL CORE_RELEASE=3.1
- - DB=PGSQL CORE_RELEASE=master
+  - DB=MYSQL CORE_RELEASE=3
 before_script:
  - mkdir $HOME/bin
  - export PATH=$PATH:$HOME/bin
  - ./.travis/install_tika.sh
  - sudo ./.travis/install_pdftotext.sh
  - git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support
  - php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss
  - cd ~/builds/ss
 script:
- - phpunit textextraction/tests/
+  - vendor/bin/phpunit --verbose textextraction/tests/
--- a/.travis/install_pdftotext.sh
+++ b/.travis/install_pdftotext.sh
@ -0,0 +1,3 @@
 #!/usr/bin/env bash
 # Installs the xpdf package (the README lists XPDF as the source of the
 # `pdftotext` utility). Must run as root; .travis.yml invokes it via sudo.
 # Abort on the first failing command so a broken install is not silently ignored.
 set -e
 apt-get update
 apt-get install -y xpdf
--- a/.travis/install_tika.sh
+++ b/.travis/install_tika.sh
@ -0,0 +1,6 @@
 #!/usr/bin/env bash
 # Downloads the Tika 1.7 application jar into $HOME/bin and creates a small
 # `tika` wrapper script next to it that forwards all arguments to the jar.
 # Abort on the first failing command (e.g. a failed download).
 set -e
 # -p: the directory may already exist (.travis.yml creates it before this runs)
 mkdir -p "$HOME/bin"
 wget 'http://www.us.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar"
 # Overwrite (not append) so re-running never produces a wrapper with duplicated content.
 # $HOME is deliberately left unexpanded here; it is resolved when the wrapper runs.
 printf '%s\n' '#!/usr/bin/env bash' 'exec java -jar "$HOME/bin/tika.jar" "$@"' > "$HOME/bin/tika"
 chmod ug+x "$HOME/bin/tika"
 # Smoke test: fails the script if java or the jar is unusable
 "$HOME/bin/tika" --version
--- a/README.md
+++ b/README.md
@ -18,6 +18,7 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
 * SilverStripe 3.1
 * (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
  * (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
 * (optional) [Apache Tika](http://tika.apache.org/)
 ### Supported Formats
@ -28,6 +29,7 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
 * CSV (Solr)
 * RTF (Solr)
 * EPub (Solr)
 * Many others (Tika)
 ## Installation
@ -37,7 +39,7 @@ Add the following to your `composer.json`:
 	```js
 	{
 		"require": {
-			"silverstripe/textextraction": "*"
+			"silverstripe/textextraction": "2.0.x-dev"
 		}
 	}
 	```
@ -53,9 +55,13 @@ through PEAR and ensure its in your `include_path`.
 By default, only extraction from HTML documents is supported.
 No configuration is required for that, unless you want to make
 the content available through your `DataObject` subclass.
-In this case, add the following to `mysite/_config.php`:
+In this case, add the following to `mysite/_config/config.yml`:
-	DataObject::add_extension('File', 'FileTextExtractable');
+	```yaml
 	File:
 	  extensions:
 	    - FileTextExtractable
 	```
 ### XPDF
@ -108,7 +114,7 @@ The property should be listed in your `SolrIndex` subclass, e.g. as follows:
 	class MySolrIndex extends SolrIndex {
 		function init() {
 			$this->addClass('MyDocument');
-			$this->addFulltextField('Content', 'HTMLText');
+			$this->addStoredField('Content', 'HTMLText');
 		}
 	}
 	```
@ -116,6 +122,16 @@ The property should be listed in your `SolrIndex` subclass, e.g. as follows:
 Note: This isn't a terribly efficient way to process large amounts of files, since 
 each HTTP request is run synchronously.
 ### Tika
 Support for Apache Tika (1.7 and above) is included for the standalone command line utility.
 See [the Apache Tika home page](http://tika.apache.org/1.7/index.html) for instructions on installing and
 configuring this.
 This extension will work best with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php)
 installed to perform mime detection. Tika validates support via mime type rather than file extensions.
 ## Usage
 Manual extraction:
--- a/code/extensions/FileTextExtractable.php
+++ b/code/extensions/FileTextExtractable.php
@ -15,16 +15,29 @@ class FileTextExtractable extends DataExtension {
 		'FileContentCache' => 'Text'
 	);
 	private static $casting = array(
 		'FileContent' => 'Text'
 	);
 	/**
 	 * Helper function for template
 	 *
 	 * @return string
 	 */
 	public function getFileContent() {
 		return $this->extractFileAsText();
 	}
 	/**
 	 * Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text.
 	 * The value is also cached into the File record itself.
 	 * 
-	 * @param $forceParse		If false, the file content is only parsed on demand. If true, the content parsing is forced, bypassing the
+	 * @param boolean $disableCache If false, the file content is only parsed on demand.
-	 * 	cached version
+	 * If true, the content parsing is forced, bypassing the cached version
-	 * @return String
+	 * @return string
 	 */
-	function extractFileAsText($forceParse = false) {
+	public function extractFileAsText($disableCache = false) {
-		if (!$forceParse && $this->owner->FileContentCache) return $this->owner->FileContentCache;
+		if (!$disableCache && $this->owner->FileContentCache) return $this->owner->FileContentCache;
 		// Determine which extractor can process this file.
 		$extractor = FileTextExtractor::for_file($this->owner->FullPath);
@ -39,5 +52,3 @@ class FileTextExtractable extends DataExtension {
 		return $text;
 	}
 }
 ?>
--- a/code/extractors/FileTextExtractor.php
+++ b/code/extractors/FileTextExtractor.php
@ -6,40 +6,89 @@
 *
 */
 abstract class FileTextExtractor extends Object {
 	/**
 	 * Set priority from 0-100.
 	 * The highest priority extractor for a given content type will be selected.
 	 *
 	 * @config
-	 * @var int
+	 * @var integer
 	 */
 	private static $priority = 50;
 	/**
 	 * Cache of extractor class names, sorted by priority
 	 *
 	 * @var array
 	 */
 	protected static $sorted_extractor_classes = null;
 	/**
-	 * @param  String $path
+	 * Gets the list of prioritised extractor classes
 	 *
 	 * @return array
 	 */
 	protected static function get_extractor_classes() {
 		// Check cache. Compare against null (the initial value) so that an already
 		// computed result is reused even if it happens to be an empty list.
 		if (self::$sorted_extractor_classes !== null) return self::$sorted_extractor_classes;
 		// Generate the sorted list of extractors on demand.
 		$classes = ClassInfo::subclassesFor("FileTextExtractor");
 		// Drop the first entry - presumably the FileTextExtractor base class itself
 		// (TODO confirm ordering guaranteed by ClassInfo::subclassesFor)
 		array_shift($classes);
 		// Build the class => priority map in a separate array instead of writing
 		// into the array being iterated, so the result does not depend on how
 		// subclassesFor() keys its return value.
 		$priorities = array();
 		foreach($classes as $class) {
 			$priorities[$class] = Config::inst()->get($class, 'priority');
 		}
 		// Highest priority first, keeping the class names as keys
 		arsort($priorities);
 		// Save classes
 		return self::$sorted_extractor_classes = array_keys($priorities);
 	}
 	/**
 	 * Get the text file extractor for the given class
 	 *
 	 * @param string $class
 	 * @return FileTextExtractor
 	 */
 	protected static function get_extractor($class) {
 		// Resolve the extractor through the injector rather than instantiating it directly
 		$injector = Injector::inst();
 		$extractor = $injector->get($class);
 		return $extractor;
 	}
 	/**
 	 * Attempt to detect mime type for given file
 	 *
 	 * @param string $path
 	 * @return string|null Mime type if found, or null if the fileinfo extension is
 	 * not loaded, the file is missing/unreadable, or detection failed
 	 */
 	protected static function get_mime($path) {
 		if(!class_exists('finfo')) return null;
 		// finfo::file() emits a warning and returns false for a missing or
 		// unreadable file, so bail out before calling it.
 		if(!$path || !is_file($path) || !is_readable($path)) return null;
 		// Check mime of file
 		$finfo = new finfo(FILEINFO_MIME_TYPE);
 		$mime = $finfo->file($path);
 		// Normalise the failure value so callers only ever see string or null
 		return ($mime === false) ? null : $mime;
 	}
 	/**
 	 * @param string $path
 	 * @return FileTextExtractor
 	 */
 	static function for_file($path) {
 		$extension = pathinfo($path, PATHINFO_EXTENSION);
 		$mime = self::get_mime($path);
 		foreach(self::get_extractor_classes() as $className) {
 			$extractor = self::get_extractor($className);
-		if (!self::$sorted_extractor_classes) {
+			// Skip unavailable extractors
-			// Generate the sorted list of extractors on demand.
+			if(!$extractor->isAvailable()) continue;
 			$classes = ClassInfo::subclassesFor("FileTextExtractor");
 			array_shift($classes);
 			$sortedClasses = array();
 			foreach($classes as $class) $sortedClasses[$class] = Config::inst()->get($class, 'priority');
 			arsort($sortedClasses);
-			self::$sorted_extractor_classes = $sortedClasses;
+			// Check extension
 			if($extension && $extractor->supportsExtension($extension)) {
 				return $extractor;
 			}
 			// Check mime
 			if($mime && $extractor->supportsMime($mime)) {
 				return $extractor;
 			}
 		foreach(self::$sorted_extractor_classes as $className => $priority) {
 			$formatter = new $className();
 			$matched = array_filter($formatter->supportedExtensions(), function($compare) use($extension) {
 				return (strtolower($compare) == strtolower($extension));
 			});
 			if($matched) return $formatter;
 		}
 	}
@ -49,21 +98,33 @@ abstract class FileTextExtractor extends Object {
 	 * 
 	 * @return boolean
 	 */
-	abstract function isAvailable();
+	abstract public function isAvailable();
 	/**
-	 * Return an array of content types that the extractor can handle.
+	 * Determine if this extractor supports the given extension.
-	 * @return unknown_type
+	 * If support is determined by mime/type only, then this should return false.
 	 *
 	 * @param string $extension
 	 * @return boolean
 	 */
-	abstract function supportedExtensions();
+	abstract public function supportsExtension($extension);
 	/**
 	 * Determine if this extractor supports the given mime type.
 	 * Will only be called if supportsExtension returns false.
 	 * 
 	 * @param string $mime
 	 * @return boolean
 	 */
 	abstract public function supportsMime($mime);
 	/**
 	 * Given a file path, extract the contents as text.
 	 * 
-	 * @param $path
+	 * @param string $path
-	 * @return unknown_type
+	 * @return string
 	 */
-	abstract function getContent($path);
+	abstract public function getContent($path);
 }
 class FileTextExtractor_Exception extends Exception {}
--- a/code/extractors/HTMLTextExtractor.php
+++ b/code/extractors/HTMLTextExtractor.php
@ -7,16 +7,26 @@
 */
 class HTMLTextExtractor extends FileTextExtractor {
-	function isAvailable() {
+	public function isAvailable() {
 		return true;
 	}
-	function supportedExtensions() {
+	public function supportsExtension($extension) {
-		return array("html", "htm", "xhtml");
+		return in_array(
 			strtolower($extension),
 			array("html", "htm", "xhtml")
 		);
 	}
 	public function supportsMime($mime) {
 		return strtolower($mime) === 'text/html';
 	}
 	/**
 	 * Lower priority because it's not the most clever HTML extraction. If there is something better, use it
 	 *
 	 * @config
 	 * @var integer
 	 */
 	private static $priority = 10;
@ -25,10 +35,10 @@ class HTMLTextExtractor extends FileTextExtractor {
 	 * combined with regular expressions to remove non-content tags like <style> or <script>,
 	 * as well as adding line breaks after block tags.
 	 * 
-	 * @param  [type] $path [description]
+	 * @param string $path
-	 * @return [type]       [description]
+	 * @return string
 	 */
-	function getContent($path) {
+	public function getContent($path) {
 		$content = file_get_contents($path);
 		// Yes, yes, regex'ing HTML is evil.
 		// Since we don't care about well-formedness or markup here, it does the job.
@ -61,5 +71,3 @@ class HTMLTextExtractor extends FileTextExtractor {
 		return strip_tags($content);
 	}
 }
 ?>
--- a/code/extractors/PDFTextExtractor.php
+++ b/code/extractors/PDFTextExtractor.php
@ -7,32 +7,64 @@
 */
 class PDFTextExtractor extends FileTextExtractor {
-	function isAvailable() {
+	public function isAvailable() {
 		$bin = $this->bin('pdftotext');
 		return (file_exists($bin) && is_executable($bin));
 	}
-	function supportedExtensions() {
+	public function supportsExtension($extension) {
-		return array("pdf");
+		return strtolower($extension) === 'pdf';
 	}
 	/**
 	 * Checks the given mime type against the list of PDF mime types
 	 * recognised by this extractor. The comparison ignores letter case.
 	 *
 	 * @param string $mime
 	 * @return boolean
 	 */
 	public function supportsMime($mime) {
 		$pdfTypes = array(
 			'application/pdf',
 			'application/x-pdf',
 			'application/x-bzpdf',
 			'application/x-gzpdf'
 		);
 		$normalised = strtolower($mime);
 		foreach($pdfTypes as $candidate) {
 			if($normalised == $candidate) return true;
 		}
 		return false;
 	}
 	/**
 	 * Accessor to get the location of the binary
-	 * @param $prog
+	 *
-	 * @return unknown_type
+	 * @param string $prog Name of binary
 	 * @return string
 	 */
-	function bin($prog='') {
+	protected function bin($prog = '') {
-		if ($this->stat('binary_location')) $path = $this->stat('binary_location'); // By static from _config.php
+		if ($this->config()->binary_location) {
-		elseif (file_exists('/usr/bin/pdftotext'))  $path = '/usr/bin';                      // By searching common directories
+			// By config
-		elseif (file_exists('/usr/local/bin/pdftotext')) $path = '/usr/local/bin';
+			$path = $this->config()->binary_location;
-		else $path = '.'; // Hope it's in path
+		} elseif (file_exists('/usr/bin/pdftotext')) {
 			// By searching common directories
 			$path = '/usr/bin';
 		} elseif (file_exists('/usr/local/bin/pdftotext')) {
 			$path = '/usr/local/bin';
 		} else {
 			$path = '.'; // Hope it's in path
 		}
 		return ( $path ? $path . '/' : '' ) . $prog;
 	}
-	function getContent($path) {
+	public function getContent($path) {
-		if (!$path) return ""; // no file
+		if(!$path) return ""; // no file
-		exec(sprintf('%s "%s" - 2>&1', $this->bin('pdftotext'), $path), $content, $err);
+		$content = $this->getRawOutput($path);
 		return $this->cleanupLigatures($content);
 	}
 	/**
 	 * Invoke pdftotext with the given path
 	 *
 	 * @param string $path
 	 * @return string Output
 	 * @throws FileTextExtractor_Exception
 	 */
 	protected function getRawOutput($path) {
 		exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
 		if($err) {
 			throw new FileTextExtractor_Exception(sprintf(
 				'PDFTextExtractor->getContent() failed for %s: %s',
@ -42,6 +74,25 @@ class PDFTextExtractor extends FileTextExtractor {
 		}
 		return implode('', $content);
 	}
 }
-?>
+	/**
 	 * Removes utf-8 ligatures.
 	 *
 	 * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
 	 *
 	 * @param string $input
 	 * @return string
 	 */
 	protected function cleanupLigatures($input) {
 		// Maps each single-character Latin ligature (U+FB00 - U+FB06) to the
 		// plain letters it stands for.
 		$mapping = array(
 			'ﬀ' => 'ff',
 			'ﬁ' => 'fi',
 			'ﬂ' => 'fl',
 			'ﬃ' => 'ffi',
 			'ﬄ' => 'ffl',
 			// U+FB05 is the "long s + t" ligature: it only looks like "ft",
 			// the letters it encodes are "st"
 			'ﬅ' => 'st',
 			'ﬆ' => 'st'
 		);
 		return str_replace(array_keys($mapping), array_values($mapping), $input);
 	}
 }
--- a/code/extractors/SolrCellTextExtractor.php
+++ b/code/extractors/SolrCellTextExtractor.php
@ -13,8 +13,11 @@ use Guzzle\Http\Client;
 class SolrCellTextExtractor extends FileTextExtractor {
 	/**
 	 * Base URL to use for solr text extraction.
 	 * E.g. http://localhost:8983/solr/update/extract
 	 *
 	 * @config
-	 * @var [type]
+	 * @var string
 	 */
 	private static $base_url;
@ -39,18 +42,22 @@ class SolrCellTextExtractor extends FileTextExtractor {
 		if(!$url) return false;
 	}
-	/**
+	public function supportsExtension($extension) {
-	 * @see  http://tika.apache.org/1.3/formats.html
+		return in_array(
-	 * @return Array
+			strtolower($extension),
-	 */
+			array(
 	public function supportedExtensions() {
 		return array(
 				'pdf', 'doc', 'docx', 'xls', 'xlsx',
 				'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
 				'ppt', 'pptx', 'odp', 'fodp', 'csv'
 			)
 		);
 	}
 	public function supportsMime($mime) {
 		// Rely on supportsExtension
 		return false;
 	}
 	public function getContent($path) {
 		if (!$path) return ""; // no file
--- a/code/extractors/TikaTextExtractor.php
+++ b/code/extractors/TikaTextExtractor.php
@ -0,0 +1,103 @@
 <?php
 /**
 * Enables text extraction of file content via the Tika CLI
 * 
 * {@link http://tika.apache.org/1.7/gettingstarted.html}
 */
 class TikaTextExtractor extends FileTextExtractor {
 	/**
 	 * Text extraction locale. Use {locale} as a placeholder for the current locale, {default}
 	 * as the placeholder for the default locale
 	 *
 	 * NOTE(review): nothing in this class reads this setting yet - confirm whether
 	 * getContent() was meant to apply it when invoking tika.
 	 *
 	 * @var string
 	 * @config
 	 */
 	private static $locale = '{default}.utf-8';
 	/**
 	 * Text extraction mode. Defaults to -t (plain text)
 	 *
 	 * @var string
 	 * @config
 	 */
 	private static $output_mode = '-t';
 	/**
 	 * Memoised result of {@see getVersion()}; null until first looked up.
 	 * Each tika invocation spawns a new process, so the answer is kept per instance.
 	 *
 	 * @var string|int|null
 	 */
 	protected $cachedVersion = null;
 	/**
 	 * Memoised raw output of `tika --list-supported-types`; null until a
 	 * successful lookup has been made.
 	 *
 	 * @var string|null
 	 */
 	protected $cachedSupportedTypes = null;
 	/**
 	 * Get the version of tika installed, or 0 if not installed
 	 *
 	 * @return string|int Version string as reported by `tika --version` (e.g. "1.7"),
 	 * or integer 0 if tika could not be run or its output was not recognised
 	 */
 	public function getVersion() {
 		if($this->cachedVersion !== null) return $this->cachedVersion;
 		$code = $this->runShell('tika --version', $stdout);
 		// Parse output
 		$version = 0;
 		if(!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches)) {
 			$version = $matches['version'];
 		}
 		return $this->cachedVersion = $version;
 	}
 	/**
 	 * Runs a shell command and captures its output.
 	 * The command is passed to the shell as-is, so callers are responsible for
 	 * escaping any arguments (e.g. with escapeshellarg) before calling this.
 	 *
 	 * @param string $command Full command including arguments
 	 * @param string &$stdout Standard output
 	 * @param string &$stderr Standard error
 	 * @param string $input Content to pass via standard input
 	 * @return int Exit code. 0 is success, 255 if the process could not be started
 	 */
 	protected function runShell($command, &$stdout = '', &$stderr = '', $input = '') {
 		$stdout = '';
 		$stderr = '';
 		$descriptorSpecs = array(
 			0 => array("pipe", "r"),
 			1 => array("pipe", "w"),
 			2 => array("pipe", "w")
 		);
 		// Invoke command
 		$pipes = array();
 		$proc = proc_open($command, $descriptorSpecs, $pipes);
 		if (!is_resource($proc)) return 255;
 		// Send content as input
 		if((string)$input !== '') fwrite($pipes[0], $input);
 		fclose($pipes[0]);
 		// Drain stdout and stderr together. Reading one pipe to EOF before touching
 		// the other can deadlock: the child blocks once the unread pipe's buffer is
 		// full, while we block waiting for EOF on the pipe we are reading.
 		$buffers = array(1 => '', 2 => '');
 		$open = array(1 => $pipes[1], 2 => $pipes[2]);
 		while($open) {
 			$read = array_values($open);
 			$write = null;
 			$except = null;
 			if(stream_select($read, $write, $except, null) === false) break;
 			foreach($read as $stream) {
 				// Map the ready stream back to its descriptor number (1 or 2)
 				$index = array_search($stream, $open, true);
 				$chunk = fread($stream, 8192);
 				if($chunk !== false && $chunk !== '') {
 					$buffers[$index] .= $chunk;
 				}
 				if($chunk === false || feof($stream)) {
 					fclose($stream);
 					unset($open[$index]);
 				}
 			}
 		}
 		// Only non-empty if stream_select failed above
 		foreach($open as $stream) fclose($stream);
 		$stdout = $buffers[1];
 		$stderr = $buffers[2];
 		// Get result
 		return proc_close($proc);
 	}
 	/**
 	 * Extracts the content of the given file by running the tika CLI with the
 	 * configured output mode.
 	 *
 	 * @param string $path
 	 * @return string|null Output of tika, or null if tika exited with a non-zero code
 	 */
 	public function getContent($path) {
 		$mode = $this->config()->output_mode;
 		$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
 		$code = $this->runShell($command, $output);
 		if($code === 0) return $output;
 		return null;
 	}
 	/**
 	 * Tika is considered available if its CLI reports a version
 	 *
 	 * @return boolean
 	 */
 	public function isAvailable() {
 		// getVersion() yields a version string or integer 0; compare as versions
 		// rather than relying on loose string/number comparison
 		return version_compare((string)$this->getVersion(), '0', '>');
 	}
 	public function supportsExtension($extension) {
 		// Determine support via mime type only
 		return false;
 	}
 	/**
 	 * Checks whether the given mime type appears in the output of
 	 * `tika --list-supported-types`.
 	 *
 	 * @param string $mime
 	 * @return boolean
 	 */
 	public function supportsMime($mime) {
 		// An empty mime type would otherwise produce a pattern that matches anything
 		if(!$mime) return false;
 		// Get list of supported mime types (looked up once per instance)
 		if($this->cachedSupportedTypes === null) {
 			$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
 			if($code) return false; // Error case; not cached, so a later call retries
 			$this->cachedSupportedTypes = $supportedTypes;
 		}
 		// Check if the mime type is inside the result as a complete token.
 		// A plain \b boundary would let "application/xml" match "application/xml-dtd".
 		// Assumes type names in the tika output are delimited by whitespace - TODO confirm
 		$pattern = sprintf('/(?<!\S)%s(?!\S)/i', preg_quote($mime, '/'));
 		return (bool)preg_match($pattern, $this->cachedSupportedTypes);
 	}
 }
--- a/composer.json
+++ b/composer.json
@ -18,7 +18,18 @@
 	"require": {
 		"php": ">=5.3.2",
 		"composer/installers": "*",
-		"silverstripe/framework": "~3.0",
+		"silverstripe/framework": "~3.1",
 		"guzzle/http": "*"
 	},
 	"require-dev": {
 		"phpunit/PHPUnit": "~3.7@stable"
 	},
 	"suggest": {
 		"ext-fileinfo": "Improved support for file mime detection"
 	},
 	"extra": {
 		"branch-alias": {
 			"dev-master": "2.0.x-dev"
 		}
 	}
 }
--- a/tests/FileTextExtractableTest.php
+++ b/tests/FileTextExtractableTest.php
@ -5,6 +5,20 @@ class FileTextExtractableTest extends SapphireTest {
 		'File' => array('FileTextExtractable')
 	);
 	public function setUp() {
 		parent::setUp();
 		// Ensure that html is a valid extension
 		Config::inst()
 			->nest()
 			->update('File', 'allowed_extensions', array('html'));
 	}
 	public function tearDown() {
 		Config::unnest();
 		parent::tearDown();
 	}
 	function testExtractFileAsText() {
 		// Create a copy of the file, as it may be clobbered by the test
 		// ($file->extractFileAsText() calls $file->write)
--- a/tests/TikaTextExtractorTest.php
+++ b/tests/TikaTextExtractorTest.php
@ -0,0 +1,23 @@
 <?php
 /**
 * Tests the {@see TikaTextExtractor} class
 */
 class TikaTextExtractorTest extends SapphireTest {
 	/**
 	 * Creates an extractor, marking the calling test as skipped when the
 	 * tika CLI is not available on this machine.
 	 *
 	 * @return TikaTextExtractor
 	 */
 	protected function getAvailableExtractor() {
 		$extractor = new TikaTextExtractor();
 		if(!$extractor->isAvailable()) $this->markTestSkipped('tika not available');
 		return $extractor;
 	}
 	/**
 	 * Content of the PDF fixture can be extracted
 	 */
 	public function testExtraction() {
 		$extractor = $this->getAvailableExtractor();
 		// Check file
 		$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
 		$content = $extractor->getContent($file);
 		$this->assertContains('This is a test file with a link', $content);
 	}
 	/**
 	 * Mime validation is kept in its own test so that a content extraction
 	 * failure does not hide the result of these checks.
 	 */
 	public function testSupportsMime() {
 		$extractor = $this->getAvailableExtractor();
 		// Check mime validation
 		$this->assertTrue($extractor->supportsMime('application/pdf'));
 		$this->assertTrue($extractor->supportsMime('text/html'));
 		$this->assertFalse($extractor->supportsMime('application/not-supported'));
 	}
 }