API Using paths instead of File objects in extractors

Makes coupling to File objects optional: extractors now take file paths,
and File integration is opt-in through the FileTextExtractable extension.
This commit is contained in:
Ingo Schommer 2012-08-22 18:25:12 +02:00
parent 7de717b0bd
commit 977c4e49c9
4 changed files with 22 additions and 15 deletions

View File

@ -27,10 +27,10 @@ class FileTextExtractable extends DataExtension {
if (!$forceParse && $this->owner->FileContentCache) return $this->owner->FileContentCache;
// Determine which extractor can process this file.
$extractor = FileTextExtractor::for_file($this->owner);
$extractor = FileTextExtractor::for_file($this->owner->FullPath);
if (!$extractor) return null;
$text = $extractor->getContent($this->owner);
$text = $extractor->getContent($this->owner->FullPath);
if (!$text) return null;
$this->owner->FileContentCache = $text;

View File

@ -17,11 +17,11 @@ abstract class FileTextExtractor extends Object {
protected static $sorted_extractor_classes = null;
/**
* @param DataObject $file
* @param String $path
* @return FileTextExtractor
*/
static function for_file($file) {
$extension = strtolower($file->getExtension());
static function for_file($path) {
$extension = pathinfo($path, PATHINFO_EXTENSION);
if (!self::$sorted_extractor_classes) {
// Generate the sorted list of extractors on demand.
@ -41,6 +41,14 @@ abstract class FileTextExtractor extends Object {
}
}
/**
* Reports whether this extractor can be used in the current environment,
* for example whether the binaries or libraries it relies on are available.
*
* @return boolean True when the extractor is usable, false otherwise.
*/
abstract function isAvailable();
/**
* Return an array of content types that the extractor can handle.
* @return array
@ -48,11 +56,12 @@ abstract class FileTextExtractor extends Object {
abstract function supportedExtensions();
/**
* Given a file object, extract the contents as text
* @param $file
* Given a file path, extract the contents as text.
*
* @param $path
* @return string The extracted text.
*/
abstract function getContent($file);
abstract function getContent($path);
}
?>

View File

@ -16,9 +16,8 @@ class HTMLTextExtractor extends FileTextExtractor {
*/
public static $priority = 10;
function getContent($file) {
$filename = Director::baseFolder() . "/" . $file->Filename;
$content = file_get_contents($filename);
/**
 * Read the file at the given path and return its contents with
 * markup tags removed via strip_tags().
 *
 * @param string $path Path of the file to read.
 * @return string File contents with tags stripped.
 */
function getContent($path) {
	return strip_tags(file_get_contents($path));
}
}

View File

@ -24,10 +24,9 @@ class PDFTextExtractor extends FileTextExtractor {
return ( $path ? $path . '/' : '' ) . $prog;
}
function getContent($file) {
$filename = Director::baseFolder() . "/" . $file->Filename;
if (!$filename) return ""; // no file
$content = `{$this->bin('pdftotext')} "$filename" -`;
/**
 * Extract the text of a PDF by running the `pdftotext` binary on it.
 *
 * @param string $path Filesystem path of the PDF to read.
 * @return string|null Captured output of the command (the shell_exec()
 *                     result), or "" when no path is given.
 */
function getContent($path) {
	if (!$path) return ""; // no file

	// Pass the path as one escaped shell argument so that quotes, spaces,
	// `$` or backticks in a file name cannot break or inject into the command.
	$command = $this->bin('pdftotext') . ' ' . escapeshellarg($path) . ' -';

	// The trailing "-" makes pdftotext write to stdout, which is captured here.
	return shell_exec($command);
}
}