commit ec0921c6d18dd502725c9a565b011bdeccb5f31b
Author: Ingo Schommer <ingo@silverstripe.com>
Date:   Wed Aug 22 17:52:08 2012 +0200

    Initial commit

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b1394eb
--- /dev/null
+++ b/README.md
@@ -0,0 +1,15 @@
+# Text Extraction Module
+
+## Overview
+
+
+Extracts the text content of files (currently HTML and PDF) so that it can be indexed or searched. Previously part of the [sphinx module](https://github.com/silverstripe/silverstripe-sphinx).
+
+## Usage
+
+
+
+## Requirements
+
+ * SilverStripe 3.0
+ * (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
\ No newline at end of file
diff --git a/_config.php b/_config.php
new file mode 100644
index 0000000..e69de29
diff --git a/code/extensions/FileTextExtractable.php b/code/extensions/FileTextExtractable.php
new file mode 100644
index 0000000..6b4d19e
--- /dev/null
+++ b/code/extensions/FileTextExtractable.php
@@ -0,0 +1,55 @@
+<?php
+
+/**
+ * Extension for File (or a File subclass) that exposes the textual content of the underlying file.
+ *
+ * The actual parsing is delegated to whichever FileTextExtractor subclass claims the file's extension.
+ * The extracted text is stored in the additional FileContentCache field and reused on later calls.
+ *
+ * NOTE(review): DataObjectDecorator appears to be the pre-3.0 name of this base class, while the README
+ * lists SilverStripe 3.0 as a requirement - confirm whether this should extend DataExtension instead.
+ *
+ * @author mstephens
+ */
+class FileTextExtractable extends DataObjectDecorator {
+
+	/**
+	 * Extra database field holding the most recently extracted text.
+	 *
+	 * @var array
+	 */
+	static $db = array(
+		'FileContentCache' => 'Text'
+	);
+
+	/**
+	 * Returns the text content of the owning file, parsing the file only when necessary.
+	 *
+	 * A non-empty extraction result is stored in FileContentCache and the owner is written.
+	 *
+	 * @param boolean $forceParse When true the cached text is ignored and the file is parsed again.
+	 * @return String|null The extracted text, or null when no extractor handles the file or nothing was extracted.
+	 */
+	function extractFileAsText($forceParse = false) {
+		$file = $this->owner;
+
+		// Cache hit: only consulted when the caller did not ask for a fresh parse.
+		if (!$forceParse) {
+			$cached = $file->FileContentCache;
+			if ($cached) return $cached;
+		}
+
+		// Ask the extractor registry for a handler; null means the file type is unsupported.
+		$extractor = FileTextExtractor::for_file($file);
+		$text = $extractor ? $extractor->getContent($file) : null;
+		if (!$text) return null;
+
+		// Persist the result so subsequent calls can skip the extraction.
+		$file->FileContentCache = $text;
+		$file->write();
+
+		return $text;
+	}
+}
+
+?>
\ No newline at end of file
diff --git a/code/extractors/FileTextExtractor.php b/code/extractors/FileTextExtractor.php
new file mode 100644
index 0000000..64c1b68
--- /dev/null
+++ b/code/extractors/FileTextExtractor.php
@@ -0,0 +1,83 @@
+<?php
+
+/**
+ * Base class for text extractors: each concrete subclass declares the file extensions it handles
+ * and knows how to turn such a file into plain text.
+ *
+ * Use {@link for_file()} to obtain the best matching extractor for a given file.
+ *
+ * @author mstephens
+ */
+abstract class FileTextExtractor extends Object {
+	/**
+	 * Set priority from 0-100.
+	 * The highest priority extractor for a given file extension will be selected.
+	 *
+	 * @var int
+	 */
+	public static $priority = 50;
+
+	/**
+	 * Map of extractor class name to priority, ordered by descending priority.
+	 * Stays null until the first call to {@link for_file()} builds it.
+	 *
+	 * @var array|null
+	 */
+	protected static $sorted_extractor_classes = null;
+
+	/**
+	 * Finds the highest-priority extractor that supports the extension of the given file.
+	 *
+	 * @param  DataObject $file Object exposing getExtension(), e.g. a File record
+	 * @return FileTextExtractor|null Extractor instance, or null when no extractor supports the extension
+	 */
+	static function for_file($file) {
+		$extension = strtolower($file->getExtension());
+
+		// Compare against null rather than testing truthiness: an empty list is a valid cached
+		// result and must not trigger a new class lookup on every call.
+		if (self::$sorted_extractor_classes === null) {
+			// Generate the sorted list of extractors on demand.
+			$classes = ClassInfo::subclassesFor("FileTextExtractor");
+			array_shift($classes); // drop the first entry, expected to be this base class - TODO confirm
+			$sortedClasses = array();
+			foreach($classes as $class) {
+				$sortedClasses[$class] = Object::get_static($class, 'priority');
+			}
+			arsort($sortedClasses);
+
+			self::$sorted_extractor_classes = $sortedClasses;
+		}
+
+		foreach(self::$sorted_extractor_classes as $className => $priority) {
+			$extractor = new $className();
+			// Strict comparison so an extension never matches through PHP's loose type juggling.
+			if(in_array($extension, $extractor->supportedExtensions(), true)) {
+				return $extractor;
+			}
+		}
+
+		// Make the "nothing found" result explicit for callers that test the return value.
+		return null;
+	}
+
+	/**
+	 * Return the file extensions that the extractor can handle.
+	 *
+	 * for_file() lower-cases the file's extension and compares strictly, so entries must be
+	 * lower-case strings (presumably without a leading dot - confirm against getExtension()).
+	 *
+	 * @return array
+	 */
+	abstract function supportedExtensions();
+
+	/**
+	 * Given a file object, extract the contents as text
+	 *
+	 * @param DataObject $file
+	 * @return String
+	 */
+	abstract function getContent($file);
+}
+
+?>
\ No newline at end of file
diff --git a/code/extractors/HTMLTextExtractor.php b/code/extractors/HTMLTextExtractor.php
new file mode 100644
index 0000000..9121311
--- /dev/null
+++ b/code/extractors/HTMLTextExtractor.php
@@ -0,0 +1,36 @@
+<?php
+
+/**
+ * Extractor for HTML documents that simply removes the markup with PHP's strip_tags().
+ * Good enough to feed an index, but the result is not meant to be nicely readable text.
+ *
+ * @author mstephens
+ */
+class HTMLTextExtractor extends FileTextExtractor {
+	/**
+	 * Deliberately low so that a smarter HTML extractor, if one is available, is preferred.
+	 *
+	 * @var int
+	 */
+	public static $priority = 10;
+
+	/**
+	 * @return array File extensions treated as HTML
+	 */
+	function supportedExtensions() {
+		return array("html", "htm", "xhtml");
+	}
+
+	/**
+	 * Reads the file from below the site's base folder and returns its content without tags.
+	 *
+	 * @param DataObject $file Record whose Filename is a path relative to the base folder
+	 * @return String
+	 */
+	function getContent($file) {
+		$absolutePath = Director::baseFolder() . "/" . $file->Filename;
+		return strip_tags(file_get_contents($absolutePath));
+	}
+}
+
+?>
\ No newline at end of file
diff --git a/code/extractors/PDFTextExtractor.php b/code/extractors/PDFTextExtractor.php
new file mode 100644
index 0000000..aab996c
--- /dev/null
+++ b/code/extractors/PDFTextExtractor.php
@@ -0,0 +1,64 @@
+<?php
+
+/**
+ * Text extractor that calls pdftotext to do the conversion.
+ *
+ * The directory of the binary is looked up by bin(); see there for the search order.
+ *
+ * @author mstephens
+ */
+class PDFTextExtractor extends FileTextExtractor {
+	/**
+	 * @return array File extensions handled by this extractor
+	 */
+	function supportedExtensions() {
+		return array("pdf");
+	}
+
+	/**
+	 * Accessor to get the location of the binary.
+	 *
+	 * Search order: the 'binary_location' static, then /usr/bin, then /usr/local/bin.
+	 * If none applies, the bare program name is returned.
+	 *
+	 * @param string $prog Name of the program, e.g. "pdftotext"
+	 * @return string Program name, prefixed with its directory when one was found
+	 */
+	function bin($prog='') {
+		$path = $this->stat('binary_location'); // By static from _config.php
+		if (!$path) {
+			// By searching common directories
+			$path = '';
+			foreach (array('/usr/bin', '/usr/local/bin') as $dir) {
+				if (file_exists($dir . '/pdftotext')) {
+					$path = $dir;
+					break;
+				}
+			}
+			// Still empty: leave the name unprefixed so the shell resolves it through PATH.
+			// A "./" prefix would restrict the lookup to the current working directory.
+		}
+
+		return ( $path ? $path . '/' : '' ) . $prog;
+	}
+
+	/**
+	 * Runs pdftotext on the file and returns what the command prints to standard output.
+	 *
+	 * @param DataObject $file Record whose Filename is a path relative to the base folder
+	 * @return String|null "" when the record has no file name, otherwise the result of shell_exec()
+	 */
+	function getContent($file) {
+		// Test the record's own value: the absolute path built below is never empty.
+		if (!$file->Filename) return ""; // no file
+		$filename = Director::baseFolder() . "/" . $file->Filename;
+
+		// File names may originate from uploads: quote each argument so shell metacharacters stay inert.
+		// NOTE(review): escapeshellarg() is locale-sensitive for non-ASCII names - confirm LC_CTYPE on the server.
+		$command = escapeshellarg($this->bin('pdftotext')) . ' ' . escapeshellarg($filename) . ' -';
+
+		return shell_exec($command);
+	}
+}
+
+?>
\ No newline at end of file