mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 09:06:00 +00:00
API Using paths instead of File objects in extractors
Makes coupling to File objects optional, by choosing to use the FileTextExtractable extension.
This commit is contained in:
parent
7de717b0bd
commit
977c4e49c9
@ -27,10 +27,10 @@ class FileTextExtractable extends DataExtension {
|
||||
if (!$forceParse && $this->owner->FileContentCache) return $this->owner->FileContentCache;
|
||||
|
||||
// Determine which extractor can process this file.
|
||||
$extractor = FileTextExtractor::for_file($this->owner);
|
||||
$extractor = FileTextExtractor::for_file($this->owner->FullPath);
|
||||
if (!$extractor) return null;
|
||||
|
||||
$text = $extractor->getContent($this->owner);
|
||||
$text = $extractor->getContent($this->owner->FullPath);
|
||||
if (!$text) return null;
|
||||
|
||||
$this->owner->FileContentCache = $text;
|
||||
|
@ -17,11 +17,11 @@ abstract class FileTextExtractor extends Object {
|
||||
protected static $sorted_extractor_classes = null;
|
||||
|
||||
/**
|
||||
* @param DataObject $file
|
||||
* @param String $path
|
||||
* @return FileTextExtractor
|
||||
*/
|
||||
static function for_file($file) {
|
||||
$extension = strtolower($file->getExtension());
|
||||
static function for_file($path) {
|
||||
$extension = pathinfo($path, PATHINFO_EXTENSION);
|
||||
|
||||
if (!self::$sorted_extractor_classes) {
|
||||
// Generate the sorted list of extractors on demand.
|
||||
@ -41,6 +41,14 @@ abstract class FileTextExtractor extends Object {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the extractor is supported on the current environment,
|
||||
* for example if the correct binaries or libraries are available.
|
||||
*
|
||||
* @return boolean
|
||||
*/
|
||||
abstract function isAvailable();
|
||||
|
||||
/**
|
||||
* Return an array of content types that the extractor can handle.
|
||||
* @return array
|
||||
@ -48,11 +56,12 @@ abstract class FileTextExtractor extends Object {
|
||||
abstract function supportedExtensions();
|
||||
|
||||
/**
|
||||
* Given a file object, extract the contents as text
|
||||
* @param $file
|
||||
* Given a file path, extract the contents as text.
|
||||
*
|
||||
* @param string $path Path of the file to extract text from
|
||||
* @return string The extracted text
|
||||
*/
|
||||
abstract function getContent($file);
|
||||
abstract function getContent($path);
|
||||
}
|
||||
|
||||
?>
|
@ -16,9 +16,8 @@ class HTMLTextExtractor extends FileTextExtractor {
|
||||
*/
|
||||
public static $priority = 10;
|
||||
|
||||
function getContent($file) {
|
||||
$filename = Director::baseFolder() . "/" . $file->Filename;
|
||||
$content = file_get_contents($filename);
|
||||
/**
 * Read the file at the given path and return its contents with
 * HTML and PHP tags stripped.
 *
 * @param string $path Path of the file to read.
 * @return string The tag-free text, or an empty string when the file
 *                could not be read.
 */
function getContent($path) {
    $content = file_get_contents($path);

    // file_get_contents() returns false on failure. Handle that here
    // instead of relying on strip_tags() coercing false to "".
    if ($content === false) return "";

    return strip_tags($content);
}
|
||||
}
|
||||
|
@ -24,10 +24,9 @@ class PDFTextExtractor extends FileTextExtractor {
|
||||
return ( $path ? $path . '/' : '' ) . $prog;
|
||||
}
|
||||
|
||||
function getContent($file) {
|
||||
$filename = Director::baseFolder() . "/" . $file->Filename;
|
||||
if (!$filename) return ""; // no file
|
||||
$content = `{$this->bin('pdftotext')} "$filename" -`;
|
||||
/**
 * Extract the text of a PDF by running the `pdftotext` binary on it
 * and capturing what the command prints.
 *
 * @param string $path Path of the PDF file.
 * @return string|null The captured command output; an empty string when
 *                     no path is given. The shell call yields null when
 *                     the command produces no output.
 */
function getContent($path) {
    if (!$path) return ""; // no file

    // Escape the path so that quotes, `$` or backticks in a file name
    // cannot terminate the argument or inject extra shell commands.
    $command = $this->bin('pdftotext') . ' ' . escapeshellarg($path) . ' -';

    // shell_exec() is the function form of the backtick operator.
    $content = shell_exec($command);
    return $content;
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user