mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 11:06:00 +02:00
Initial commit
This commit is contained in:
commit
ec0921c6d1
15
README.md
Normal file
15
README.md
Normal file
@ -0,0 +1,15 @@
|
||||
# Text Extraction Module
|
||||
|
||||
## Overview
|
||||
|
||||
|
||||
Extracts plain text from file content (currently HTML and PDF files). Previously part of the [sphinx module](https://github.com/silverstripe/silverstripe-sphinx).
|
||||
|
||||
## Usage
|
||||
|
||||
|
||||
|
||||
## Requirements
|
||||
|
||||
* SilverStripe 3.0
|
||||
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
|
0
_config.php
Normal file
0
_config.php
Normal file
43
code/extensions/FileTextExtractable.php
Normal file
43
code/extensions/FileTextExtractable.php
Normal file
@ -0,0 +1,43 @@
|
||||
<?php
|
||||
|
||||
/**
 * Extension for File (or a File derivative) that makes the textual content of the file available.
 * The actual extraction is delegated to whichever FileTextExtractor subclass handles the file's type.
 *
 * Adds one extra field, FileContentCache, which holds the extracted text and is filled in lazily
 * the first time the text is requested.
 *
 * @author mstephens
 */
class FileTextExtractable extends DataObjectDecorator {

    public static $db = array(
        'FileContentCache' => 'Text'
    );

    /**
     * Returns the text content of the owning file, extracting it if necessary.
     *
     * A previously cached value is returned as-is unless $forceParse is set. Otherwise a matching
     * FileTextExtractor is looked up, asked for the content, and a non-empty result is stored in
     * FileContentCache and written back via the owner's write().
     *
     * @param boolean $forceParse If false, a cached value is used when present. If true, the cache is
     *                            bypassed and the file is parsed again.
     * @return String|null The extracted text, or null when no extractor handles the file or the
     *                     extractor produced no text.
     */
    function extractFileAsText($forceParse = false) {
        $record = $this->owner;

        // Serve from the cache unless the caller explicitly asked for a re-parse.
        if (!$forceParse) {
            $cached = $record->FileContentCache;
            if ($cached) {
                return $cached;
            }
        }

        // Determine which extractor can process this file, if any.
        $handler = FileTextExtractor::for_file($record);
        if (!$handler) {
            return null;
        }

        $extracted = $handler->getContent($record);
        if (!$extracted) {
            return null;
        }

        // Remember the result on the record so later calls can skip the extraction.
        $record->FileContentCache = $extracted;
        $record->write();

        return $extracted;
    }
}
|
||||
|
||||
?>
|
58
code/extractors/FileTextExtractor.php
Normal file
58
code/extractors/FileTextExtractor.php
Normal file
@ -0,0 +1,58 @@
|
||||
<?php
|
||||
|
||||
/**
 * Base class for text extractors. Each concrete subclass declares the file extensions it supports
 * and knows how to turn the content of such a file into plain text. Use for_file() to obtain the
 * extractor responsible for a given file.
 *
 * @author mstephens
 */
abstract class FileTextExtractor extends Object {

    /**
     * Set priority from 0-100.
     * The highest priority extractor for a given file extension will be selected.
     *
     * @var int
     */
    public static $priority = 50;

    /**
     * Map of extractor class name => priority, ordered by descending priority.
     * Stays null until for_file() builds it on first use.
     *
     * @var array|null
     */
    protected static $sorted_extractor_classes = null;

    /**
     * Finds the highest-priority extractor whose supportedExtensions() contains the
     * (lower-cased) extension of the given file.
     *
     * @param DataObject $file Object providing getExtension()
     * @return FileTextExtractor|null An extractor instance, or null if no extractor supports the file
     */
    public static function for_file($file) {
        $extension = strtolower($file->getExtension());

        // Compare against null rather than testing truthiness: an empty list is a valid,
        // already-computed result and must not trigger a rebuild on every call.
        if (self::$sorted_extractor_classes === null) {
            // Generate the sorted list of extractors on demand.
            $classes = ClassInfo::subclassesFor("FileTextExtractor");

            // Drop the first entry - presumably FileTextExtractor itself, on the assumption that
            // subclassesFor() lists the base class first (TODO confirm against the framework).
            array_shift($classes);

            $sortedClasses = array();
            foreach ($classes as $class) {
                // Abstract intermediate classes cannot be instantiated below, so leave them out.
                $reflection = new ReflectionClass($class);
                if ($reflection->isAbstract()) {
                    continue;
                }

                $sortedClasses[$class] = Object::get_static($class, 'priority');
            }

            // Highest priority first, keeping the class names as keys.
            arsort($sortedClasses);

            self::$sorted_extractor_classes = $sortedClasses;
        }

        foreach (self::$sorted_extractor_classes as $className => $priority) {
            $extractor = new $className();

            if (in_array($extension, $extractor->supportedExtensions())) {
                return $extractor;
            }
        }

        // No registered extractor handles this extension.
        return null;
    }

    /**
     * Return the file extensions that the extractor can handle. for_file() compares these against
     * the lower-cased extension of the file, so they should be given in lower case without a dot.
     *
     * @return array
     */
    abstract public function supportedExtensions();

    /**
     * Given a file object, extract the contents as text.
     *
     * @param DataObject $file
     * @return string
     */
    abstract public function getContent($file);
}
|
||||
|
||||
?>
|
26
code/extractors/HTMLTextExtractor.php
Normal file
26
code/extractors/HTMLTextExtractor.php
Normal file
@ -0,0 +1,26 @@
|
||||
<?php
|
||||
|
||||
/**
 * Text extractor that uses the php function strip_tags to get just the text.
 * OK for indexing, not the best for readable text.
 *
 * @author mstephens
 */
class HTMLTextExtractor extends FileTextExtractor {

    /**
     * Lower priority because it's not the most clever HTML extraction. If there is something better, use it.
     *
     * @var int
     */
    public static $priority = 10;

    /**
     * @return array File extensions handled by this extractor
     */
    function supportedExtensions() {
        return array("html", "htm", "xhtml");
    }

    /**
     * Reads the file from below the site's base folder and returns its content with all tags stripped.
     *
     * @param DataObject $file Object whose Filename is a path relative to Director::baseFolder()
     * @return string The tag-free content, or an empty string if the file cannot be read
     */
    function getContent($file) {
        $filename = Director::baseFolder() . "/" . $file->Filename;

        // A record may point at a file that is missing or unreadable on disk; treat that as
        // "no content" instead of raising a warning and passing false on to strip_tags().
        if (!is_file($filename) || !is_readable($filename)) {
            return "";
        }

        $content = file_get_contents($filename);
        if ($content === false) {
            return "";
        }

        return strip_tags($content);
    }
}
|
||||
|
||||
?>
|
35
code/extractors/PDFTextExtractor.php
Normal file
35
code/extractors/PDFTextExtractor.php
Normal file
@ -0,0 +1,35 @@
|
||||
<?php
|
||||
|
||||
/**
 * Text extractor that calls the pdftotext command line utility to do the conversion.
 *
 * @author mstephens
 */
class PDFTextExtractor extends FileTextExtractor {

    /**
     * @return array File extensions handled by this extractor
     */
    function supportedExtensions() {
        return array("pdf");
    }

    /**
     * Accessor to get the location of a binary.
     *
     * The directory is taken from the 'binary_location' static if set (e.g. from _config.php),
     * otherwise from the first common directory that contains pdftotext. If neither applies the
     * bare program name is returned, so that the shell resolves it through PATH.
     *
     * @param string $prog Name of the program, e.g. 'pdftotext'
     * @return string Program name, prefixed with its directory when one was determined
     */
    function bin($prog = '') {
        $configured = $this->stat('binary_location');

        if ($configured) {
            // By static from _config.php
            $path = $configured;
        } elseif (file_exists('/usr/bin/pdftotext')) {
            // By searching common directories
            $path = '/usr/bin';
        } elseif (file_exists('/usr/local/bin/pdftotext')) {
            $path = '/usr/local/bin';
        } else {
            // Hope it's in PATH: no directory prefix, so the shell performs the lookup.
            // (A '.' prefix would only ever find a binary in the current working directory.)
            $path = '';
        }

        return ($path ? $path . '/' : '') . $prog;
    }

    /**
     * Runs pdftotext on the file and returns what it prints to standard output.
     *
     * @param DataObject $file Object whose Filename is a path relative to Director::baseFolder()
     * @return string|null The extracted text; an empty string if the record has no file name;
     *                     whatever shell_exec() yields (null) if the command produced no output
     */
    function getContent($file) {
        // Check the record's own file name: the full path below is never empty.
        if (!$file->Filename) return ""; // no file

        $filename = Director::baseFolder() . "/" . $file->Filename;

        // File names can originate from uploads, so every argument is escaped before it reaches
        // the shell. NOTE(review): escapeshellarg() may drop non-ASCII bytes depending on the
        // locale - confirm that stored file names are ASCII or that LC_CTYPE is set accordingly.
        $command = escapeshellarg($this->bin('pdftotext')) . ' ' . escapeshellarg($filename) . ' -';

        // The trailing '-' makes pdftotext write the text to standard output.
        $content = shell_exec($command);

        return $content;
    }
}
|
||||
|
||||
?>
|
Loading…
Reference in New Issue
Block a user