Initial commit

This commit is contained in:
Ingo Schommer 2012-08-22 17:52:08 +02:00
commit ec0921c6d1
6 changed files with 177 additions and 0 deletions

15
README.md Normal file
View File

@ -0,0 +1,15 @@
# Text Extraction Module
## Overview
Extracts plain text from the contents of files so it can be cached and indexed. This module was previously part of the [sphinx module](https://github.com/silverstripe/silverstripe-sphinx).
## Usage
## Requirements
* SilverStripe 3.0
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)

0
_config.php Normal file
View File

View File

@ -0,0 +1,43 @@
<?php
/**
 * Extension for File (or a File derivative) that makes the textual content of the file available.
 * The extraction itself is delegated to whichever FileTextExtractor is returned by
 * FileTextExtractor::for_file() for the owning record.
 *
 * Contributes one additional field, FileContentCache, which stores the extracted text and is
 * only filled in when the text is first requested.
 *
 * @author mstephens
 *
 */
class FileTextExtractable extends DataObjectDecorator {

	static $db = array(
		'FileContentCache' => 'Text'
	);

	/**
	 * Returns the text content of the owning file, or null when no extractor is available for it
	 * or the extractor produced nothing.
	 *
	 * A successful extraction is stored in the owner's FileContentCache field and the owner is
	 * written, so later calls can be answered from the cache.
	 *
	 * @param $forceParse If false (the default) a non-empty cached value is returned without parsing.
	 *                    If true the cache is ignored and the file is parsed again.
	 * @return String
	 */
	function extractFileAsText($forceParse = false) {
		$file = $this->owner;

		// Serve from the cache unless the caller explicitly asked for a fresh parse.
		$useCache = !$forceParse && $file->FileContentCache;
		if ($useCache) {
			return $file->FileContentCache;
		}

		// Ask the extractor registry for something that can handle this file.
		$handler = FileTextExtractor::for_file($file);
		if (!$handler) {
			return null;
		}

		$extracted = $handler->getContent($file);
		if (!$extracted) {
			return null;
		}

		// Remember the result on the record itself.
		$file->FileContentCache = $extracted;
		$file->write();

		return $extracted;
	}
}
?>

View File

@ -0,0 +1,58 @@
<?php
/**
 * Abstract base class for text extractors. Each concrete subclass declares the file extensions
 * it can handle and knows how to turn the content of such a file into plain text.
 *
 * Use {@link FileTextExtractor::for_file()} to obtain the extractor for a given file.
 *
 * @author mstephens
 *
 */
abstract class FileTextExtractor extends Object {

	/**
	 * Set priority from 0-100.
	 * The highest priority extractor for a given file extension will be selected.
	 *
	 * @var int
	 */
	public static $priority = 50;

	/**
	 * Map of extractor class name => priority, ordered by descending priority.
	 * Null until the first call to for_file() builds it.
	 *
	 * @var array|null
	 */
	protected static $sorted_extractor_classes = null;

	/**
	 * Finds the highest-priority extractor whose supported extensions include the
	 * (lower-cased) extension of the given file.
	 *
	 * @param DataObject $file Must provide getExtension()
	 * @return FileTextExtractor|null The matching extractor instance, or null if none matches
	 */
	static function for_file($file) {
		$extension = strtolower($file->getExtension());

		// Compare against null rather than testing truthiness: an empty list is a valid,
		// already-computed result and must not trigger a rescan on every call.
		if (self::$sorted_extractor_classes === null) {
			// Generate the sorted list of extractors on demand.
			$classes = ClassInfo::subclassesFor("FileTextExtractor");
			// Drop the first entry - presumably the base class itself, which is abstract
			// and cannot be instantiated.
			array_shift($classes);

			$sortedClasses = array();
			foreach($classes as $class) {
				$sortedClasses[$class] = Object::get_static($class, 'priority');
			}
			// Highest priority first, keeping the class name keys.
			arsort($sortedClasses);

			self::$sorted_extractor_classes = $sortedClasses;
		}

		foreach(self::$sorted_extractor_classes as $className => $priority) {
			$extractor = new $className();
			// Strict comparison avoids loose string/number matches between extensions.
			if(in_array($extension, $extractor->supportedExtensions(), true)) {
				return $extractor;
			}
		}

		// No extractor is registered for this extension.
		return null;
	}

	/**
	 * Return an array of lower-case file extensions (without the dot) that the extractor can handle.
	 * @return array
	 */
	abstract function supportedExtensions();

	/**
	 * Given a file object, extract the contents as text
	 * @param $file
	 * @return string
	 */
	abstract function getContent($file);
}
?>

View File

@ -0,0 +1,26 @@
<?php
/**
 * Extractor for HTML documents. The markup is removed with PHP's strip_tags(), which is
 * adequate for building a search index but does not produce nicely readable text.
 * @author mstephens
 *
 */
class HTMLTextExtractor extends FileTextExtractor {

	/**
	 * Deliberately low: this is a fairly naive way of extracting text from HTML, so any
	 * smarter extractor for the same extensions should take precedence.
	 * @var int
	 */
	public static $priority = 10;

	/**
	 * @return array File extensions handled by this extractor
	 */
	function supportedExtensions() {
		return array("html", "htm", "xhtml");
	}

	/**
	 * Reads the file from below the site's base folder and returns its content with tags stripped.
	 * @param $file Record whose Filename is relative to the base folder
	 * @return string
	 */
	function getContent($file) {
		$path = Director::baseFolder() . "/" . $file->Filename;
		return strip_tags(file_get_contents($path));
	}
}
?>

View File

@ -0,0 +1,35 @@
<?php
/**
 * Text extractor that calls the external pdftotext program to do the conversion.
 * @author mstephens
 *
 */
class PDFTextExtractor extends FileTextExtractor {

	/**
	 * @return array File extensions handled by this extractor
	 */
	function supportedExtensions() {
		return array("pdf");
	}

	/**
	 * Accessor to get the location of the binary.
	 *
	 * The directory is taken from the 'binary_location' static if it is set, otherwise from the
	 * first of /usr/bin and /usr/local/bin that contains pdftotext. If neither applies, the bare
	 * program name is returned so that the shell resolves it through PATH.
	 *
	 * Note that the directory probing always looks for pdftotext, regardless of $prog.
	 *
	 * @param $prog Name of the program to locate
	 * @return string Program name, prefixed with its directory when one was determined
	 */
	function bin($prog='') {
		$configured = $this->stat('binary_location');

		if ($configured) $path = $configured; // By static from _config.php
		elseif (file_exists('/usr/bin/pdftotext')) $path = '/usr/bin'; // By searching common directories
		elseif (file_exists('/usr/local/bin/pdftotext')) $path = '/usr/local/bin';
		else $path = ''; // Hope it's in path: no directory prefix, so the shell performs a PATH lookup

		return ( $path ? $path . '/' : '' ) . $prog;
	}

	/**
	 * Runs pdftotext on the given file and returns what it writes to standard output.
	 *
	 * @param $file Record whose Filename is relative to the site's base folder
	 * @return string Extracted text; "" if the record has no filename. Otherwise whatever
	 *                shell_exec() yields, which may be null if the command produced no output.
	 */
	function getContent($file) {
		// Test the record's own filename: the absolute path built below is never empty.
		if (!$file->Filename) return ""; // no file

		$filename = Director::baseFolder() . "/" . $file->Filename;

		// File names can come from uploads, so every argument is shell-escaped rather than
		// interpolated into the command line. The trailing "-" sends the text to stdout.
		$command = escapeshellarg($this->bin('pdftotext')) . ' ' . escapeshellarg($filename) . ' -';

		return shell_exec($command);
	}
}
?>