commit ec0921c6d18dd502725c9a565b011bdeccb5f31b Author: Ingo Schommer Date: Wed Aug 22 17:52:08 2012 +0200 Initial commit diff --git a/README.md b/README.md new file mode 100644 index 0000000..b1394eb --- /dev/null +++ b/README.md @@ -0,0 +1,15 @@ +# Text Extraction Module + +## Overview + + +Previously part of the [sphinx module](https://github.com/silverstripe/silverstripe-sphinx). + +## Usage + + + +## Requirements + + * SilverStripe 3.0 + * (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility) \ No newline at end of file diff --git a/_config.php b/_config.php new file mode 100644 index 0000000..e69de29 diff --git a/code/extensions/FileTextExtractable.php b/code/extensions/FileTextExtractable.php new file mode 100644 index 0000000..6b4d19e --- /dev/null +++ b/code/extensions/FileTextExtractable.php @@ -0,0 +1,43 @@ + 'Text' + ); + + /** + * Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text. + * The value is also cached into the File record itself. + * + * @param $forceParse If false, the file content is only parsed on demand. If true, the content parsing is forced, bypassing the + * cached version + * @return String + */ + function extractFileAsText($forceParse = false) { + if (!$forceParse && $this->owner->FileContentCache) return $this->owner->FileContentCache; + + // Determine which extractor can process this file. + $extractor = FileTextExtractor::for_file($this->owner); + if (!$extractor) return null; + + $text = $extractor->getContent($this->owner); + if (!$text) return null; + + $this->owner->FileContentCache = $text; + $this->owner->write(); + + return $text; + } +} + +?> \ No newline at end of file diff --git a/code/extractors/FileTextExtractor.php b/code/extractors/FileTextExtractor.php new file mode 100644 index 0000000..64c1b68 --- /dev/null +++ b/code/extractors/FileTextExtractor.php @@ -0,0 +1,58 @@ +getExtension()); + + if (!self::$sorted_extractor_classes) { + // Generate the sorted list of extractors on demand. + $classes = ClassInfo::subclassesFor("FileTextExtractor"); + array_shift($classes); + $sortedClasses = array(); + foreach($classes as $class) $sortedClasses[$class] = Object::get_static($class, 'priority'); + arsort($sortedClasses); + + self::$sorted_extractor_classes = $sortedClasses; + } + foreach(self::$sorted_extractor_classes as $className => $priority) { + $formatter = new $className(); + if(in_array($extension, $formatter->supportedExtensions())) { + return $formatter; + } + } + } + + /** + * Return an array of content types that the extractor can handle. + * @return unknown_type + */ + abstract function supportedExtensions(); + + /** + * Given a file object, extract the contents as text + * @param $file + * @return unknown_type + */ + abstract function getContent($file); +} + +?> \ No newline at end of file diff --git a/code/extractors/HTMLTextExtractor.php b/code/extractors/HTMLTextExtractor.php new file mode 100644 index 0000000..9121311 --- /dev/null +++ b/code/extractors/HTMLTextExtractor.php @@ -0,0 +1,26 @@ +Filename; + $content = file_get_contents($filename); + return strip_tags($content); + } +} + +?> \ No newline at end of file diff --git a/code/extractors/PDFTextExtractor.php b/code/extractors/PDFTextExtractor.php new file mode 100644 index 0000000..aab996c --- /dev/null +++ b/code/extractors/PDFTextExtractor.php @@ -0,0 +1,35 @@ +stat('binary_location')) $path = $this->stat('binary_location'); // By static from _config.php + elseif (file_exists('/usr/bin/pdftotext')) $path = '/usr/bin'; // By searching common directories + elseif (file_exists('/usr/local/bin/pdftotext')) $path = '/usr/local/bin'; + else $path = '.'; // Hope it's in path + + return ( $path ? $path . '/' : '' ) . $prog; + } + + function getContent($file) { + $filename = Director::baseFolder() . "/" . $file->Filename; + if (!$filename) return ""; // no file + $content = `{$this->bin('pdftotext')} "$filename" -`; + return $content; + } +} + +?> \ No newline at end of file