mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 11:06:00 +02:00
Converted to PSR-2
This commit is contained in:
parent
80f61a21be
commit
8e14595f1a
@ -1,105 +1,112 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
interface FileTextCache {
|
interface FileTextCache
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Save extracted content for a given File entity
|
||||||
|
*
|
||||||
|
* @param File $file
|
||||||
|
* @param string $content
|
||||||
|
*/
|
||||||
|
public function save(File $file, $content);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Save extracted content for a given File entity
|
* Return any cached extracted content for a given file entity
|
||||||
*
|
*
|
||||||
* @param File $file
|
* @param File $file
|
||||||
* @param string $content
|
*/
|
||||||
*/
|
public function load(File $file);
|
||||||
public function save(File $file, $content);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return any cached extracted content for a given file entity
|
* Invalidate the cache for a given file.
|
||||||
*
|
* Invoked in onBeforeWrite on the file
|
||||||
* @param File $file
|
*
|
||||||
*/
|
* @param File $file
|
||||||
public function load(File $file);
|
*/
|
||||||
|
public function invalidate(File $file);
|
||||||
/**
|
|
||||||
* Invalidate the cache for a given file.
|
|
||||||
* Invoked in onBeforeWrite on the file
|
|
||||||
*
|
|
||||||
* @param File $file
|
|
||||||
*/
|
|
||||||
public function invalidate(File $file);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Caches the extracted content on the record for the file.
|
* Caches the extracted content on the record for the file.
|
||||||
* Limits the stored file content by default to avoid hitting query size limits.
|
* Limits the stored file content by default to avoid hitting query size limits.
|
||||||
*/
|
*/
|
||||||
class FileTextCache_Database implements FileTextCache {
|
class FileTextCache_Database implements FileTextCache
|
||||||
|
{
|
||||||
|
public function load(File $file)
|
||||||
|
{
|
||||||
|
return $file->FileContentCache;
|
||||||
|
}
|
||||||
|
|
||||||
public function load(File $file) {
|
public function save(File $file, $content)
|
||||||
return $file->FileContentCache;
|
{
|
||||||
}
|
$maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length');
|
||||||
|
$file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content;
|
||||||
public function save(File $file, $content) {
|
$file->write();
|
||||||
$maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length');
|
}
|
||||||
$file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content;
|
|
||||||
$file->write();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function invalidate(File $file) {
|
|
||||||
// To prevent writing to the cache from invalidating it
|
|
||||||
if(!$file->isChanged('FileContentCache')) {
|
|
||||||
$file->FileContentCache = '';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
public function invalidate(File $file)
|
||||||
|
{
|
||||||
|
// To prevent writing to the cache from invalidating it
|
||||||
|
if (!$file->isChanged('FileContentCache')) {
|
||||||
|
$file->FileContentCache = '';
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Uses SS_Cache with a lifetime to cache extracted content
|
* Uses SS_Cache with a lifetime to cache extracted content
|
||||||
*/
|
*/
|
||||||
class FileTextCache_SSCache implements FileTextCache, Flushable {
|
class FileTextCache_SSCache implements FileTextCache, Flushable
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Lifetime of cache in seconds
|
||||||
|
* Null is indefinite
|
||||||
|
*
|
||||||
|
* @var int|null
|
||||||
|
* @config
|
||||||
|
*/
|
||||||
|
private static $lifetime = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Lifetime of cache in seconds
|
* @return SS_Cache
|
||||||
* Null is indefinite
|
*/
|
||||||
*
|
protected static function get_cache()
|
||||||
* @var int|null
|
{
|
||||||
* @config
|
$lifetime = Config::inst()->get(__CLASS__, 'lifetime');
|
||||||
*/
|
$cache = SS_Cache::factory(__CLASS__);
|
||||||
private static $lifetime = null;
|
$cache->setLifetime($lifetime);
|
||||||
|
return $cache;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
protected function getKey(File $file)
|
||||||
* @return SS_Cache
|
{
|
||||||
*/
|
return md5($file->getFullPath());
|
||||||
protected static function get_cache() {
|
}
|
||||||
$lifetime = Config::inst()->get(__CLASS__, 'lifetime');
|
|
||||||
$cache = SS_Cache::factory(__CLASS__);
|
|
||||||
$cache->setLifetime($lifetime);
|
|
||||||
return $cache;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected function getKey(File $file) {
|
public function load(File $file)
|
||||||
return md5($file->getFullPath());
|
{
|
||||||
}
|
$key = $this->getKey($file);
|
||||||
|
$cache = self::get_cache();
|
||||||
|
return $cache->load($key);
|
||||||
|
}
|
||||||
|
|
||||||
public function load(File $file) {
|
public function save(File $file, $content)
|
||||||
$key = $this->getKey($file);
|
{
|
||||||
$cache = self::get_cache();
|
$key = $this->getKey($file);
|
||||||
return $cache->load($key);
|
$cache = self::get_cache();
|
||||||
}
|
return $cache->save($content, $key);
|
||||||
|
}
|
||||||
|
|
||||||
public function save(File $file, $content) {
|
public static function flush()
|
||||||
$key = $this->getKey($file);
|
{
|
||||||
$cache = self::get_cache();
|
$cache = self::get_cache();
|
||||||
return $cache->save($content, $key);
|
$cache->clean();
|
||||||
}
|
}
|
||||||
|
|
||||||
public static function flush() {
|
|
||||||
$cache = self::get_cache();
|
|
||||||
$cache->clean();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function invalidate(File $file) {
|
|
||||||
$key = $this->getKey($file);
|
|
||||||
$cache = self::get_cache();
|
|
||||||
return $cache->remove($key);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
public function invalidate(File $file)
|
||||||
|
{
|
||||||
|
$key = $this->getKey($file);
|
||||||
|
$cache = self::get_cache();
|
||||||
|
return $cache->remove($key);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -9,83 +9,88 @@
|
|||||||
* @author mstephens
|
* @author mstephens
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
class FileTextExtractable extends DataExtension {
|
class FileTextExtractable extends DataExtension
|
||||||
|
{
|
||||||
|
private static $db = array(
|
||||||
|
'FileContentCache' => 'Text'
|
||||||
|
);
|
||||||
|
|
||||||
private static $db = array(
|
private static $casting = array(
|
||||||
'FileContentCache' => 'Text'
|
'FileContent' => 'Text'
|
||||||
);
|
);
|
||||||
|
|
||||||
private static $casting = array(
|
private static $dependencies = array(
|
||||||
'FileContent' => 'Text'
|
'TextCache' => '%$FileTextCache'
|
||||||
);
|
);
|
||||||
|
|
||||||
private static $dependencies = array(
|
/**
|
||||||
'TextCache' => '%$FileTextCache'
|
* @var FileTextCache
|
||||||
);
|
*/
|
||||||
|
protected $fileTextCache = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @var FileTextCache
|
*
|
||||||
*/
|
* @param FileTextCache $cache
|
||||||
protected $fileTextCache = null;
|
*/
|
||||||
|
public function setTextCache(FileTextCache $cache)
|
||||||
|
{
|
||||||
|
$this->fileTextCache = $cache;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
* @return FileTextCache
|
||||||
* @param FileTextCache $cache
|
*/
|
||||||
*/
|
public function getTextCache()
|
||||||
public function setTextCache(FileTextCache $cache) {
|
{
|
||||||
$this->fileTextCache = $cache;
|
return $this->fileTextCache;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return FileTextCache
|
* Helper function for template
|
||||||
*/
|
*
|
||||||
public function getTextCache() {
|
* @return string
|
||||||
return $this->fileTextCache;
|
*/
|
||||||
}
|
public function getFileContent()
|
||||||
|
{
|
||||||
|
return $this->extractFileAsText();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Helper function for template
|
* Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text.
|
||||||
*
|
* The value is also cached into the File record itself.
|
||||||
* @return string
|
*
|
||||||
*/
|
* @param boolean $disableCache If false, the file content is only parsed on demand.
|
||||||
public function getFileContent() {
|
* If true, the content parsing is forced, bypassing the cached version
|
||||||
return $this->extractFileAsText();
|
* @return string
|
||||||
}
|
*/
|
||||||
|
public function extractFileAsText($disableCache = false)
|
||||||
|
{
|
||||||
|
if (!$disableCache) {
|
||||||
|
$text = $this->getTextCache()->load($this->owner);
|
||||||
|
if ($text) {
|
||||||
|
return $text;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
// Determine which extractor can process this file.
|
||||||
* Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text.
|
$extractor = FileTextExtractor::for_file($this->owner->FullPath);
|
||||||
* The value is also cached into the File record itself.
|
if (!$extractor) {
|
||||||
*
|
return null;
|
||||||
* @param boolean $disableCache If false, the file content is only parsed on demand.
|
}
|
||||||
* If true, the content parsing is forced, bypassing the cached version
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
public function extractFileAsText($disableCache = false) {
|
|
||||||
if (!$disableCache) {
|
|
||||||
$text = $this->getTextCache()->load($this->owner);
|
|
||||||
if($text) {
|
|
||||||
return $text;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Determine which extractor can process this file.
|
$text = $extractor->getContent($this->owner->FullPath);
|
||||||
$extractor = FileTextExtractor::for_file($this->owner->FullPath);
|
if (!$text) {
|
||||||
if (!$extractor) {
|
return null;
|
||||||
return null;
|
}
|
||||||
}
|
|
||||||
|
|
||||||
$text = $extractor->getContent($this->owner->FullPath);
|
$this->getTextCache()->save($this->owner, $text);
|
||||||
if (!$text) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
$this->getTextCache()->save($this->owner, $text);
|
return $text;
|
||||||
|
}
|
||||||
|
|
||||||
return $text;
|
public function onBeforeWrite()
|
||||||
}
|
{
|
||||||
|
// Clear cache before changing file
|
||||||
public function onBeforeWrite() {
|
$this->getTextCache()->invalidate($this->owner);
|
||||||
// Clear cache before changing file
|
}
|
||||||
$this->getTextCache()->invalidate($this->owner);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -5,131 +5,141 @@
|
|||||||
* @author mstephens
|
* @author mstephens
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
abstract class FileTextExtractor extends Object {
|
abstract class FileTextExtractor extends Object
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Set priority from 0-100.
|
||||||
|
* The highest priority extractor for a given content type will be selected.
|
||||||
|
*
|
||||||
|
* @config
|
||||||
|
* @var integer
|
||||||
|
*/
|
||||||
|
private static $priority = 50;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set priority from 0-100.
|
* Cache of extractor class names, sorted by priority
|
||||||
* The highest priority extractor for a given content type will be selected.
|
*
|
||||||
*
|
* @var array
|
||||||
* @config
|
*/
|
||||||
* @var integer
|
protected static $sorted_extractor_classes = null;
|
||||||
*/
|
|
||||||
private static $priority = 50;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Cache of extractor class names, sorted by priority
|
* Gets the list of prioritised extractor classes
|
||||||
*
|
*
|
||||||
* @var array
|
* @return array
|
||||||
*/
|
*/
|
||||||
protected static $sorted_extractor_classes = null;
|
protected static function get_extractor_classes()
|
||||||
|
{
|
||||||
|
// Check cache
|
||||||
|
if (self::$sorted_extractor_classes) {
|
||||||
|
return self::$sorted_extractor_classes;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
// Generate the sorted list of extractors on demand.
|
||||||
* Gets the list of prioritised extractor classes
|
$classes = ClassInfo::subclassesFor("FileTextExtractor");
|
||||||
*
|
array_shift($classes);
|
||||||
* @return array
|
$classPriorities = array();
|
||||||
*/
|
foreach ($classes as $class) {
|
||||||
protected static function get_extractor_classes() {
|
$classPriorities[$class] = Config::inst()->get($class, 'priority');
|
||||||
// Check cache
|
}
|
||||||
if (self::$sorted_extractor_classes) return self::$sorted_extractor_classes;
|
arsort($classPriorities);
|
||||||
|
|
||||||
// Generate the sorted list of extractors on demand.
|
// Save classes
|
||||||
$classes = ClassInfo::subclassesFor("FileTextExtractor");
|
$sortedClasses = array_keys($classPriorities);
|
||||||
array_shift($classes);
|
return self::$sorted_extractor_classes = $sortedClasses;
|
||||||
$classPriorities = array();
|
}
|
||||||
foreach($classes as $class) {
|
|
||||||
$classPriorities[$class] = Config::inst()->get($class, 'priority');
|
|
||||||
}
|
|
||||||
arsort($classPriorities);
|
|
||||||
|
|
||||||
// Save classes
|
/**
|
||||||
$sortedClasses = array_keys($classPriorities);
|
* Get the text file extractor for the given class
|
||||||
return self::$sorted_extractor_classes = $sortedClasses;
|
*
|
||||||
}
|
* @param string $class
|
||||||
|
* @return FileTextExtractor
|
||||||
|
*/
|
||||||
|
protected static function get_extractor($class)
|
||||||
|
{
|
||||||
|
return Injector::inst()->get($class);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the text file extractor for the given class
|
* Attempt to detect mime type for given file
|
||||||
*
|
*
|
||||||
* @param string $class
|
* @param string $path
|
||||||
* @return FileTextExtractor
|
* @return string Mime type if found
|
||||||
*/
|
*/
|
||||||
protected static function get_extractor($class) {
|
protected static function get_mime($path)
|
||||||
return Injector::inst()->get($class);
|
{
|
||||||
}
|
$file = new Symfony\Component\HttpFoundation\File\File($path);
|
||||||
|
|
||||||
/**
|
return $file->getMimeType();
|
||||||
* Attempt to detect mime type for given file
|
}
|
||||||
*
|
|
||||||
* @param string $path
|
|
||||||
* @return string Mime type if found
|
|
||||||
*/
|
|
||||||
protected static function get_mime($path) {
|
|
||||||
$file = new Symfony\Component\HttpFoundation\File\File($path);
|
|
||||||
|
|
||||||
return $file->getMimeType();
|
/**
|
||||||
}
|
* @param string $path
|
||||||
|
* @return FileTextExtractor|null
|
||||||
|
*/
|
||||||
|
public static function for_file($path)
|
||||||
|
{
|
||||||
|
if (!file_exists($path) || is_dir($path)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
$extension = pathinfo($path, PATHINFO_EXTENSION);
|
||||||
* @param string $path
|
$mime = self::get_mime($path);
|
||||||
* @return FileTextExtractor|null
|
foreach (self::get_extractor_classes() as $className) {
|
||||||
*/
|
$extractor = self::get_extractor($className);
|
||||||
static function for_file($path) {
|
|
||||||
if(!file_exists($path) || is_dir($path)) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
$extension = pathinfo($path, PATHINFO_EXTENSION);
|
// Skip unavailable extractors
|
||||||
$mime = self::get_mime($path);
|
if (!$extractor->isAvailable()) {
|
||||||
foreach(self::get_extractor_classes() as $className) {
|
continue;
|
||||||
$extractor = self::get_extractor($className);
|
}
|
||||||
|
|
||||||
// Skip unavailable extractors
|
// Check extension
|
||||||
if(!$extractor->isAvailable()) continue;
|
if ($extension && $extractor->supportsExtension($extension)) {
|
||||||
|
return $extractor;
|
||||||
|
}
|
||||||
|
|
||||||
// Check extension
|
// Check mime
|
||||||
if($extension && $extractor->supportsExtension($extension)) {
|
if ($mime && $extractor->supportsMime($mime)) {
|
||||||
return $extractor;
|
return $extractor;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Check mime
|
/**
|
||||||
if($mime && $extractor->supportsMime($mime)) {
|
* Checks if the extractor is supported on the current environment,
|
||||||
return $extractor;
|
* for example if the correct binaries or libraries are available.
|
||||||
}
|
*
|
||||||
}
|
* @return boolean
|
||||||
}
|
*/
|
||||||
|
abstract public function isAvailable();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks if the extractor is supported on the current environment,
|
* Determine if this extractor supports the given extension.
|
||||||
* for example if the correct binaries or libraries are available.
|
* If support is determined by mime/type only, then this should return false.
|
||||||
*
|
*
|
||||||
* @return boolean
|
* @param string $extension
|
||||||
*/
|
* @return boolean
|
||||||
abstract public function isAvailable();
|
*/
|
||||||
|
abstract public function supportsExtension($extension);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Determine if this extractor supports the given extension.
|
* Determine if this extractor supports the given mime type.
|
||||||
* If support is determined by mime/type only, then this should return false.
|
* Will only be called if supportsExtension returns false.
|
||||||
*
|
*
|
||||||
* @param string $extension
|
* @param string $mime
|
||||||
* @return boolean
|
* @return boolean
|
||||||
*/
|
*/
|
||||||
abstract public function supportsExtension($extension);
|
abstract public function supportsMime($mime);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Determine if this extractor supports the given mime type.
|
* Given a file path, extract the contents as text.
|
||||||
* Will only be called if supportsExtension returns false.
|
*
|
||||||
*
|
* @param string $path
|
||||||
* @param string $mime
|
* @return string
|
||||||
* @return boolean
|
*/
|
||||||
*/
|
abstract public function getContent($path);
|
||||||
abstract public function supportsMime($mime);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Given a file path, extract the contents as text.
|
|
||||||
*
|
|
||||||
* @param string $path
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
abstract public function getContent($path);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
class FileTextExtractor_Exception extends Exception {}
|
class FileTextExtractor_Exception extends Exception
|
||||||
|
{
|
||||||
|
}
|
||||||
|
@ -5,69 +5,73 @@
|
|||||||
* @author mstephens
|
* @author mstephens
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
class HTMLTextExtractor extends FileTextExtractor {
|
class HTMLTextExtractor extends FileTextExtractor
|
||||||
|
{
|
||||||
|
public function isAvailable()
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
public function isAvailable() {
|
public function supportsExtension($extension)
|
||||||
return true;
|
{
|
||||||
}
|
return in_array(
|
||||||
|
strtolower($extension),
|
||||||
|
array("html", "htm", "xhtml")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
public function supportsExtension($extension) {
|
public function supportsMime($mime)
|
||||||
return in_array(
|
{
|
||||||
strtolower($extension),
|
return strtolower($mime) === 'text/html';
|
||||||
array("html", "htm", "xhtml")
|
}
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function supportsMime($mime) {
|
/**
|
||||||
return strtolower($mime) === 'text/html';
|
* Lower priority because it's not the most clever HTML extraction. If there is something better, use it
|
||||||
}
|
*
|
||||||
|
* @config
|
||||||
|
* @var integer
|
||||||
|
*/
|
||||||
|
private static $priority = 10;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Lower priority because it's not the most clever HTML extraction. If there is something better, use it
|
* Extracts text content by using strip_tags()
|
||||||
*
|
* combined with regular expressions to remove non-content tags like <style> or <script>,
|
||||||
* @config
|
* as well as adding line breaks after block tags.
|
||||||
* @var integer
|
*
|
||||||
*/
|
* @param string $path
|
||||||
private static $priority = 10;
|
* @return string
|
||||||
|
*/
|
||||||
/**
|
public function getContent($path)
|
||||||
* Extracts text content by using strip_tags()
|
{
|
||||||
* combined with regular expressions to remove non-content tags like <style> or <script>,
|
$content = file_get_contents($path);
|
||||||
* as well as adding line breaks after block tags.
|
// Yes, yes, regex'ing HTML is evil.
|
||||||
*
|
// Since we don't care about well-formedness or markup here, it does the job.
|
||||||
* @param string $path
|
$content = preg_replace(
|
||||||
* @return string
|
array(
|
||||||
*/
|
// Remove invisible content
|
||||||
public function getContent($path) {
|
'@<head[^>]*?>.*?</head>@siu',
|
||||||
$content = file_get_contents($path);
|
'@<style[^>]*?>.*?</style>@siu',
|
||||||
// Yes, yes, regex'ing HTML is evil.
|
'@<script[^>]*?.*?</script>@siu',
|
||||||
// Since we don't care about well-formedness or markup here, it does the job.
|
'@<object[^>]*?.*?</object>@siu',
|
||||||
$content = preg_replace(
|
'@<embed[^>]*?.*?</embed>@siu',
|
||||||
array(
|
'@<applet[^>]*?.*?</applet>@siu',
|
||||||
// Remove invisible content
|
'@<noframes[^>]*?.*?</noframes>@siu',
|
||||||
'@<head[^>]*?>.*?</head>@siu',
|
'@<noscript[^>]*?.*?</noscript>@siu',
|
||||||
'@<style[^>]*?>.*?</style>@siu',
|
'@<noembed[^>]*?.*?</noembed>@siu',
|
||||||
'@<script[^>]*?.*?</script>@siu',
|
// Add line breaks before and after blocks
|
||||||
'@<object[^>]*?.*?</object>@siu',
|
'@</?((address)|(blockquote)|(center)|(del))@iu',
|
||||||
'@<embed[^>]*?.*?</embed>@siu',
|
'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
|
||||||
'@<applet[^>]*?.*?</applet>@siu',
|
'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
|
||||||
'@<noframes[^>]*?.*?</noframes>@siu',
|
'@</?((table)|(th)|(td)|(caption))@iu',
|
||||||
'@<noscript[^>]*?.*?</noscript>@siu',
|
'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
|
||||||
'@<noembed[^>]*?.*?</noembed>@siu',
|
'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
|
||||||
// Add line breaks before and after blocks
|
'@</?((frameset)|(frame)|(iframe))@iu',
|
||||||
'@</?((address)|(blockquote)|(center)|(del))@iu',
|
),
|
||||||
'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
|
array(
|
||||||
'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
|
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0",
|
||||||
'@</?((table)|(th)|(td)|(caption))@iu',
|
),
|
||||||
'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
|
$content
|
||||||
'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
|
);
|
||||||
'@</?((frameset)|(frame)|(iframe))@iu',
|
return strip_tags($content);
|
||||||
),
|
}
|
||||||
array(
|
|
||||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',"$0", "$0", "$0", "$0", "$0", "$0","$0", "$0",
|
|
||||||
),
|
|
||||||
$content
|
|
||||||
);
|
|
||||||
return strip_tags($content);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -5,94 +5,103 @@
|
|||||||
* @author mstephens
|
* @author mstephens
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
class PDFTextExtractor extends FileTextExtractor {
|
class PDFTextExtractor extends FileTextExtractor
|
||||||
|
{
|
||||||
|
public function isAvailable()
|
||||||
|
{
|
||||||
|
$bin = $this->bin('pdftotext');
|
||||||
|
return (file_exists($bin) && is_executable($bin));
|
||||||
|
}
|
||||||
|
|
||||||
public function isAvailable() {
|
public function supportsExtension($extension)
|
||||||
$bin = $this->bin('pdftotext');
|
{
|
||||||
return (file_exists($bin) && is_executable($bin));
|
return strtolower($extension) === 'pdf';
|
||||||
}
|
}
|
||||||
|
|
||||||
public function supportsExtension($extension) {
|
public function supportsMime($mime)
|
||||||
return strtolower($extension) === 'pdf';
|
{
|
||||||
}
|
return in_array(
|
||||||
|
strtolower($mime),
|
||||||
|
array(
|
||||||
|
'application/pdf',
|
||||||
|
'application/x-pdf',
|
||||||
|
'application/x-bzpdf',
|
||||||
|
'application/x-gzpdf'
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
public function supportsMime($mime) {
|
/**
|
||||||
return in_array(
|
* Accessor to get the location of the binary
|
||||||
strtolower($mime),
|
*
|
||||||
array(
|
* @param string $prog Name of binary
|
||||||
'application/pdf',
|
* @return string
|
||||||
'application/x-pdf',
|
*/
|
||||||
'application/x-bzpdf',
|
protected function bin($prog = '')
|
||||||
'application/x-gzpdf'
|
{
|
||||||
)
|
if ($this->config()->binary_location) {
|
||||||
);
|
// By config
|
||||||
}
|
$path = $this->config()->binary_location;
|
||||||
|
} elseif (file_exists('/usr/bin/pdftotext')) {
|
||||||
|
// By searching common directories
|
||||||
|
$path = '/usr/bin';
|
||||||
|
} elseif (file_exists('/usr/local/bin/pdftotext')) {
|
||||||
|
$path = '/usr/local/bin';
|
||||||
|
} else {
|
||||||
|
$path = '.'; // Hope it's in path
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
return ($path ? $path . '/' : '') . $prog;
|
||||||
* Accessor to get the location of the binary
|
}
|
||||||
*
|
|
||||||
* @param string $prog Name of binary
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
protected function bin($prog = '') {
|
|
||||||
if ($this->config()->binary_location) {
|
|
||||||
// By config
|
|
||||||
$path = $this->config()->binary_location;
|
|
||||||
} elseif (file_exists('/usr/bin/pdftotext')) {
|
|
||||||
// By searching common directories
|
|
||||||
$path = '/usr/bin';
|
|
||||||
} elseif (file_exists('/usr/local/bin/pdftotext')) {
|
|
||||||
$path = '/usr/local/bin';
|
|
||||||
} else {
|
|
||||||
$path = '.'; // Hope it's in path
|
|
||||||
}
|
|
||||||
|
|
||||||
return ( $path ? $path . '/' : '' ) . $prog;
|
public function getContent($path)
|
||||||
}
|
{
|
||||||
|
if (!$path) {
|
||||||
|
return "";
|
||||||
|
} // no file
|
||||||
|
$content = $this->getRawOutput($path);
|
||||||
|
return $this->cleanupLigatures($content);
|
||||||
|
}
|
||||||
|
|
||||||
public function getContent($path) {
|
/**
|
||||||
if(!$path) return ""; // no file
|
* Invoke pdftotext with the given path
|
||||||
$content = $this->getRawOutput($path);
|
*
|
||||||
return $this->cleanupLigatures($content);
|
* @param string $path
|
||||||
}
|
* @return string Output
|
||||||
|
* @throws FileTextExtractor_Exception
|
||||||
|
*/
|
||||||
|
protected function getRawOutput($path)
|
||||||
|
{
|
||||||
|
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
|
||||||
|
if ($err) {
|
||||||
|
throw new FileTextExtractor_Exception(sprintf(
|
||||||
|
'PDFTextExtractor->getContent() failed for %s: %s',
|
||||||
|
$path,
|
||||||
|
implode('', $err)
|
||||||
|
));
|
||||||
|
}
|
||||||
|
return implode('', $content);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Invoke pdftotext with the given path
|
* Removes utf-8 ligatures.
|
||||||
*
|
*
|
||||||
* @param string $path
|
* @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
|
||||||
* @return string Output
|
*
|
||||||
* @throws FileTextExtractor_Exception
|
* @param string $input
|
||||||
*/
|
* @return string
|
||||||
protected function getRawOutput($path) {
|
*/
|
||||||
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
|
protected function cleanupLigatures($input)
|
||||||
if($err) {
|
{
|
||||||
throw new FileTextExtractor_Exception(sprintf(
|
$mapping = array(
|
||||||
'PDFTextExtractor->getContent() failed for %s: %s',
|
'ff' => 'ff',
|
||||||
$path,
|
'fi' => 'fi',
|
||||||
implode('', $err)
|
'fl' => 'fl',
|
||||||
));
|
'ffi' => 'ffi',
|
||||||
}
|
'ffl' => 'ffl',
|
||||||
return implode('', $content);
|
'ſt' => 'ft',
|
||||||
}
|
'st' => 'st'
|
||||||
|
);
|
||||||
/**
|
return str_replace(array_keys($mapping), array_values($mapping), $input);
|
||||||
* Removes utf-8 ligatures.
|
}
|
||||||
*
|
|
||||||
* @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
|
|
||||||
*
|
|
||||||
* @param string $input
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
protected function cleanupLigatures($input) {
|
|
||||||
$mapping = array(
|
|
||||||
'ff' => 'ff',
|
|
||||||
'fi' => 'fi',
|
|
||||||
'fl' => 'fl',
|
|
||||||
'ffi' => 'ffi',
|
|
||||||
'ffl' => 'ffl',
|
|
||||||
'ſt' => 'ft',
|
|
||||||
'st' => 'st'
|
|
||||||
);
|
|
||||||
return str_replace(array_keys($mapping), array_values($mapping), $input);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -10,83 +10,93 @@ use Guzzle\Http\Client;
|
|||||||
* @author ischommer
|
* @author ischommer
|
||||||
* @see http://wiki.apache.org/solr/ExtractingRequestHandler
|
* @see http://wiki.apache.org/solr/ExtractingRequestHandler
|
||||||
*/
|
*/
|
||||||
class SolrCellTextExtractor extends FileTextExtractor {
|
class SolrCellTextExtractor extends FileTextExtractor
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Base URL to use for solr text extraction.
|
||||||
|
* E.g. http://localhost:8983/solr/update/extract
|
||||||
|
*
|
||||||
|
* @config
|
||||||
|
* @var string
|
||||||
|
*/
|
||||||
|
private static $base_url;
|
||||||
|
|
||||||
/**
|
private static $priority = 75;
|
||||||
* Base URL to use for solr text extraction.
|
|
||||||
* E.g. http://localhost:8983/solr/update/extract
|
|
||||||
*
|
|
||||||
* @config
|
|
||||||
* @var string
|
|
||||||
*/
|
|
||||||
private static $base_url;
|
|
||||||
|
|
||||||
private static $priority = 75;
|
protected $httpClient;
|
||||||
|
|
||||||
protected $httpClient;
|
public function getHttpClient()
|
||||||
|
{
|
||||||
|
if (!$this->config()->get('base_url')) {
|
||||||
|
throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
|
||||||
|
}
|
||||||
|
if (!$this->httpClient) {
|
||||||
|
$this->httpClient = new Client($this->config()->get('base_url'));
|
||||||
|
}
|
||||||
|
return $this->httpClient;
|
||||||
|
}
|
||||||
|
|
||||||
public function getHttpClient() {
|
public function setHttpClient($client)
|
||||||
if(!$this->config()->get('base_url')) {
|
{
|
||||||
throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
|
$this->httpClient = $client;
|
||||||
}
|
}
|
||||||
if(!$this->httpClient) $this->httpClient = new Client($this->config()->get('base_url'));
|
|
||||||
return $this->httpClient;
|
|
||||||
}
|
|
||||||
|
|
||||||
public function setHttpClient($client) {
|
public function isAvailable()
|
||||||
$this->httpClient = $client;
|
{
|
||||||
}
|
$url = $this->config()->get('base_url');
|
||||||
|
return (boolean) $url;
|
||||||
|
}
|
||||||
|
|
||||||
public function isAvailable() {
|
public function supportsExtension($extension)
|
||||||
$url = $this->config()->get('base_url');
|
{
|
||||||
return (boolean) $url;
|
return in_array(
|
||||||
}
|
strtolower($extension),
|
||||||
|
array(
|
||||||
|
'pdf', 'doc', 'docx', 'xls', 'xlsx',
|
||||||
|
'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
|
||||||
|
'ppt', 'pptx', 'odp', 'fodp', 'csv'
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
public function supportsExtension($extension) {
|
public function supportsMime($mime)
|
||||||
return in_array(
|
{
|
||||||
strtolower($extension),
|
// Rely on supportsExtension
|
||||||
array(
|
return false;
|
||||||
'pdf', 'doc', 'docx', 'xls', 'xlsx',
|
}
|
||||||
'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
|
|
||||||
'ppt', 'pptx', 'odp', 'fodp', 'csv'
|
|
||||||
)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function supportsMime($mime) {
|
public function getContent($path)
|
||||||
// Rely on supportsExtension
|
{
|
||||||
return false;
|
if (!$path) {
|
||||||
}
|
return "";
|
||||||
|
} // no file
|
||||||
|
|
||||||
public function getContent($path) {
|
$fileName = basename($path);
|
||||||
if (!$path) return ""; // no file
|
$client = $this->getHttpClient();
|
||||||
|
try {
|
||||||
|
$request = $client
|
||||||
|
->post()
|
||||||
|
->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text'))
|
||||||
|
->addPostFiles(array('myfile' => $path));
|
||||||
|
$response = $request->send();
|
||||||
|
} catch (InvalidArgumentException $e) {
|
||||||
|
SS_Log::log(
|
||||||
|
sprintf(
|
||||||
|
'Error extracting text from "%s" (message: %s)',
|
||||||
|
$path,
|
||||||
|
$e->getMessage()
|
||||||
|
),
|
||||||
|
SS_Log::NOTICE
|
||||||
|
);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
// Use preg match to avoid SimpleXML running out of memory on large text nodes
|
||||||
|
preg_match(
|
||||||
|
sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName)),
|
||||||
|
(string)$response->getBody(),
|
||||||
|
$matches
|
||||||
|
);
|
||||||
|
|
||||||
$fileName = basename($path);
|
return $matches ? $matches[1] : null;
|
||||||
$client = $this->getHttpClient();
|
}
|
||||||
try {
|
|
||||||
$request = $client
|
|
||||||
->post()
|
|
||||||
->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text'))
|
|
||||||
->addPostFiles(array('myfile' => $path));
|
|
||||||
$response = $request->send();
|
|
||||||
} catch(InvalidArgumentException $e) {
|
|
||||||
SS_Log::log(
|
|
||||||
sprintf(
|
|
||||||
'Error extracting text from "%s" (message: %s)',
|
|
||||||
$path,
|
|
||||||
$e->getMessage()
|
|
||||||
),
|
|
||||||
SS_Log::NOTICE
|
|
||||||
);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
// Use preg match to avoid SimpleXML running out of memory on large text nodes
|
|
||||||
preg_match(
|
|
||||||
sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName)),
|
|
||||||
(string)$response->getBody(),
|
|
||||||
$matches
|
|
||||||
);
|
|
||||||
|
|
||||||
return $matches ? $matches[1] : null;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -5,100 +5,112 @@
|
|||||||
*
|
*
|
||||||
* {@link http://tika.apache.org/1.7/gettingstarted.html}
|
* {@link http://tika.apache.org/1.7/gettingstarted.html}
|
||||||
*/
|
*/
|
||||||
class TikaServerTextExtractor extends FileTextExtractor {
|
class TikaServerTextExtractor extends FileTextExtractor
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Tika server is pretty efficient so use it immediately if available
|
||||||
|
*
|
||||||
|
* @var integer
|
||||||
|
* @config
|
||||||
|
*/
|
||||||
|
private static $priority = 80;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tika server is pretty efficient so use it immediately if available
|
* Server endpoint
|
||||||
*
|
*
|
||||||
* @var integer
|
* @var string
|
||||||
* @config
|
* @config
|
||||||
*/
|
*/
|
||||||
private static $priority = 80;
|
private static $server_endpoint;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Server endpoint
|
* @var TikaRestClient
|
||||||
*
|
*/
|
||||||
* @var string
|
protected $client = null;
|
||||||
* @config
|
|
||||||
*/
|
|
||||||
private static $server_endpoint;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @var TikaRestClient
|
* @return TikaRestClient
|
||||||
*/
|
*/
|
||||||
protected $client = null;
|
public function getClient()
|
||||||
|
{
|
||||||
|
return $this->client ?:
|
||||||
|
($this->client =
|
||||||
|
Injector::inst()->createWithArgs(
|
||||||
|
'TikaRestClient',
|
||||||
|
array($this->getServerEndpoint())
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
public function getServerEndpoint()
|
||||||
* @return TikaRestClient
|
{
|
||||||
*/
|
if (defined('SS_TIKA_ENDPOINT')) {
|
||||||
public function getClient() {
|
return SS_TIKA_ENDPOINT;
|
||||||
return $this->client ?:
|
}
|
||||||
($this->client =
|
|
||||||
Injector::inst()->createWithArgs(
|
|
||||||
'TikaRestClient',
|
|
||||||
array($this->getServerEndpoint())
|
|
||||||
)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function getServerEndpoint() {
|
if (getenv('SS_TIKA_ENDPOINT')) {
|
||||||
if(defined('SS_TIKA_ENDPOINT')) {
|
return getenv('SS_TIKA_ENDPOINT');
|
||||||
return SS_TIKA_ENDPOINT;
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if(getenv('SS_TIKA_ENDPOINT')) return getenv('SS_TIKA_ENDPOINT');
|
// Default to configured endpoint
|
||||||
|
return $this->config()->server_endpoint;
|
||||||
|
}
|
||||||
|
|
||||||
// Default to configured endpoint
|
/**
|
||||||
return $this->config()->server_endpoint;
|
* Get the version of tika installed, or 0 if not installed
|
||||||
}
|
*
|
||||||
|
* @return float version of tika
|
||||||
|
*/
|
||||||
|
public function getVersion()
|
||||||
|
{
|
||||||
|
return $this
|
||||||
|
->getClient()
|
||||||
|
->getVersion();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
public function isAvailable()
|
||||||
* Get the version of tika installed, or 0 if not installed
|
{
|
||||||
*
|
return $this->getServerEndpoint() &&
|
||||||
* @return float version of tika
|
$this->getClient()->isAvailable() &&
|
||||||
*/
|
$this->getVersion() >= 1.7;
|
||||||
public function getVersion() {
|
}
|
||||||
return $this
|
|
||||||
->getClient()
|
|
||||||
->getVersion();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function isAvailable() {
|
public function supportsExtension($extension)
|
||||||
return $this->getServerEndpoint() &&
|
{
|
||||||
$this->getClient()->isAvailable() &&
|
// Determine support via mime type only
|
||||||
$this->getVersion() >= 1.7;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function supportsExtension($extension) {
|
|
||||||
// Determine support via mime type only
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Cache of supported mime types
|
* Cache of supported mime types
|
||||||
*
|
*
|
||||||
* @var array
|
* @var array
|
||||||
*/
|
*/
|
||||||
protected $supportedMimes = array();
|
protected $supportedMimes = array();
|
||||||
|
|
||||||
public function supportsMime($mime) {
|
public function supportsMime($mime)
|
||||||
$supported = $this->supportedMimes ?:
|
{
|
||||||
($this->supportedMimes = $this->getClient()->getSupportedMimes());
|
$supported = $this->supportedMimes ?:
|
||||||
|
($this->supportedMimes = $this->getClient()->getSupportedMimes());
|
||||||
|
|
||||||
// Check if supported (most common / quickest lookup)
|
// Check if supported (most common / quickest lookup)
|
||||||
if(isset($supported[$mime])) return true;
|
if (isset($supported[$mime])) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
// Check aliases
|
// Check aliases
|
||||||
foreach($supported as $info) {
|
foreach ($supported as $info) {
|
||||||
if(isset($info['alias']) && in_array($mime, $info['alias'])) return true;
|
if (isset($info['alias']) && in_array($mime, $info['alias'])) {
|
||||||
}
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getContent($path) {
|
|
||||||
return $this->getClient()->tika($path);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
public function getContent($path)
|
||||||
|
{
|
||||||
|
return $this->getClient()->tika($path);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -5,90 +5,101 @@
|
|||||||
*
|
*
|
||||||
* {@link http://tika.apache.org/1.7/gettingstarted.html}
|
* {@link http://tika.apache.org/1.7/gettingstarted.html}
|
||||||
*/
|
*/
|
||||||
class TikaTextExtractor extends FileTextExtractor {
|
class TikaTextExtractor extends FileTextExtractor
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Text extraction mode. Defaults to -t (plain text)
|
||||||
|
*
|
||||||
|
* @var string
|
||||||
|
* @config
|
||||||
|
*/
|
||||||
|
private static $output_mode = '-t';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Text extraction mode. Defaults to -t (plain text)
|
* Get the version of tika installed, or 0 if not installed
|
||||||
*
|
*
|
||||||
* @var string
|
* @return float version of tika
|
||||||
* @config
|
*/
|
||||||
*/
|
public function getVersion()
|
||||||
private static $output_mode = '-t';
|
{
|
||||||
|
$code = $this->runShell('tika --version', $stdout);
|
||||||
|
|
||||||
/**
|
// Parse output
|
||||||
* Get the version of tika installed, or 0 if not installed
|
if (!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches)) {
|
||||||
*
|
return $matches['version'];
|
||||||
* @return float version of tika
|
}
|
||||||
*/
|
|
||||||
public function getVersion() {
|
|
||||||
$code = $this->runShell('tika --version', $stdout);
|
|
||||||
|
|
||||||
// Parse output
|
return 0;
|
||||||
if(!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches)) {
|
}
|
||||||
return $matches['version'];
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
/**
|
||||||
}
|
* Runs an arbitrary and safely escaped shell command
|
||||||
|
*
|
||||||
|
* @param string $command Full command including arguments
|
||||||
|
* @param string &$stdout Standard output
|
||||||
|
* @param string &$stderr Standard error
|
||||||
|
* @param string $input Content to pass via standard input
|
||||||
|
* @return int Exit code. 0 is success
|
||||||
|
*/
|
||||||
|
protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
|
||||||
|
{
|
||||||
|
$descriptorSpecs = array(
|
||||||
|
0 => array("pipe", "r"),
|
||||||
|
1 => array("pipe", "w"),
|
||||||
|
2 => array("pipe", "w")
|
||||||
|
);
|
||||||
|
// Invoke command
|
||||||
|
$pipes = array();
|
||||||
|
$proc = proc_open($command, $descriptorSpecs, $pipes);
|
||||||
|
if (!is_resource($proc)) {
|
||||||
|
return 255;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
// Send content as input
|
||||||
* Runs an arbitrary and safely escaped shell command
|
fwrite($pipes[0], $input);
|
||||||
*
|
fclose($pipes[0]);
|
||||||
* @param string $command Full command including arguments
|
|
||||||
* @param string &$stdout Standard output
|
|
||||||
* @param string &$stderr Standard error
|
|
||||||
* @param string $input Content to pass via standard input
|
|
||||||
* @return int Exit code. 0 is success
|
|
||||||
*/
|
|
||||||
protected function runShell($command, &$stdout = '', &$stderr = '', $input = '') {
|
|
||||||
$descriptorSpecs = array(
|
|
||||||
0 => array("pipe", "r"),
|
|
||||||
1 => array("pipe", "w"),
|
|
||||||
2 => array("pipe", "w")
|
|
||||||
);
|
|
||||||
// Invoke command
|
|
||||||
$pipes = array();
|
|
||||||
$proc = proc_open($command, $descriptorSpecs, $pipes);
|
|
||||||
if (!is_resource($proc)) return 255;
|
|
||||||
|
|
||||||
// Send content as input
|
// Get output
|
||||||
fwrite($pipes[0], $input);
|
$stdout = stream_get_contents($pipes[1]);
|
||||||
fclose($pipes[0]);
|
fclose($pipes[1]);
|
||||||
|
$stderr = stream_get_contents($pipes[2]);
|
||||||
|
fclose($pipes[2]);
|
||||||
|
|
||||||
// Get output
|
// Get result
|
||||||
$stdout = stream_get_contents($pipes[1]);
|
return proc_close($proc);
|
||||||
fclose($pipes[1]);
|
}
|
||||||
$stderr = stream_get_contents($pipes[2]);
|
|
||||||
fclose($pipes[2]);
|
|
||||||
|
|
||||||
// Get result
|
public function getContent($path)
|
||||||
return proc_close($proc);
|
{
|
||||||
}
|
$mode = $this->config()->output_mode;
|
||||||
|
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
|
||||||
|
$code = $this->runShell($command, $output);
|
||||||
|
if ($code == 0) {
|
||||||
|
return $output;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public function getContent($path) {
|
public function isAvailable()
|
||||||
$mode = $this->config()->output_mode;
|
{
|
||||||
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
|
return $this->getVersion() > 0;
|
||||||
$code = $this->runShell($command, $output);
|
}
|
||||||
if($code == 0) return $output;
|
|
||||||
}
|
|
||||||
|
|
||||||
public function isAvailable() {
|
public function supportsExtension($extension)
|
||||||
return $this->getVersion() > 0;
|
{
|
||||||
}
|
// Determine support via mime type only
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
public function supportsExtension($extension) {
|
public function supportsMime($mime)
|
||||||
// Determine support via mime type only
|
{
|
||||||
return false;
|
// Get list of supported mime types
|
||||||
}
|
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
|
||||||
|
if ($code) {
|
||||||
public function supportsMime($mime) {
|
return false;
|
||||||
// Get list of supported mime types
|
} // Error case
|
||||||
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
|
|
||||||
if($code) return false; // Error case
|
|
||||||
|
|
||||||
// Check if the mime type is inside the result
|
|
||||||
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
|
|
||||||
return (bool)preg_match($pattern, $supportedTypes);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
// Check if the mime type is inside the result
|
||||||
|
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
|
||||||
|
return (bool)preg_match($pattern, $supportedTypes);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,92 +3,97 @@
|
|||||||
use Guzzle\Http\Client;
|
use Guzzle\Http\Client;
|
||||||
use Guzzle\Http\Exception\RequestException;
|
use Guzzle\Http\Exception\RequestException;
|
||||||
|
|
||||||
class TikaRestClient extends Client {
|
class TikaRestClient extends Client
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Detect if the service is available
|
||||||
|
*
|
||||||
|
* @return bool
|
||||||
|
*/
|
||||||
|
public function isAvailable()
|
||||||
|
{
|
||||||
|
try {
|
||||||
|
return $this
|
||||||
|
->get()->send()
|
||||||
|
->getStatusCode() == 200;
|
||||||
|
} catch (RequestException $ex) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Detect if the service is available
|
* Get version code
|
||||||
*
|
*
|
||||||
* @return bool
|
* @return float
|
||||||
*/
|
*/
|
||||||
public function isAvailable() {
|
public function getVersion()
|
||||||
try {
|
{
|
||||||
return $this
|
$response = $this->get('version')->send();
|
||||||
->get()->send()
|
// Parse output
|
||||||
->getStatusCode() == 200;
|
if ($response->getStatusCode() == 200 &&
|
||||||
} catch (RequestException $ex) {
|
preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)
|
||||||
return false;
|
) {
|
||||||
}
|
return (float)$matches['version'];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
return 0.0;
|
||||||
* Get version code
|
}
|
||||||
*
|
|
||||||
* @return float
|
|
||||||
*/
|
|
||||||
public function getVersion() {
|
|
||||||
$response = $this->get('version')->send();
|
|
||||||
// Parse output
|
|
||||||
if($response->getStatusCode() == 200 &&
|
|
||||||
preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)
|
|
||||||
) {
|
|
||||||
return (float)$matches['version'];
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0.0;
|
protected $mimes = array();
|
||||||
}
|
|
||||||
|
|
||||||
protected $mimes = array();
|
/**
|
||||||
|
* Gets supported mime data. May include aliased mime types.
|
||||||
|
*
|
||||||
|
* @return array
|
||||||
|
*/
|
||||||
|
public function getSupportedMimes()
|
||||||
|
{
|
||||||
|
if ($this->mimes) {
|
||||||
|
return $this->mimes;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
$response = $this->get(
|
||||||
* Gets supported mime data. May include aliased mime types.
|
'mime-types',
|
||||||
*
|
array('Accept' => 'application/json')
|
||||||
* @return array
|
)->send();
|
||||||
*/
|
|
||||||
public function getSupportedMimes() {
|
|
||||||
if($this->mimes) return $this->mimes;
|
|
||||||
|
|
||||||
$response = $this->get(
|
return $this->mimes = $response->json();
|
||||||
'mime-types',
|
}
|
||||||
array('Accept' => 'application/json')
|
|
||||||
)->send();
|
|
||||||
|
|
||||||
return $this->mimes = $response->json();
|
/**
|
||||||
}
|
* Extract text content from a given file.
|
||||||
|
* Logs a notice-level error if the document can't be parsed.
|
||||||
|
*
|
||||||
|
* @param string $file Full filesystem path to a file to post
|
||||||
|
* @return string Content of the file extracted as plain text
|
||||||
|
*/
|
||||||
|
public function tika($file)
|
||||||
|
{
|
||||||
|
$text = null;
|
||||||
|
try {
|
||||||
|
$response = $this->put(
|
||||||
|
'tika',
|
||||||
|
array('Accept' => 'text/plain'),
|
||||||
|
file_get_contents($file)
|
||||||
|
)->send();
|
||||||
|
$text = $response->getBody(true);
|
||||||
|
} catch (RequestException $e) {
|
||||||
|
$msg = sprintf(
|
||||||
|
'TikaRestClient was not able to process %s. Response: %s %s.',
|
||||||
|
$file,
|
||||||
|
$e->getResponse()->getStatusCode(),
|
||||||
|
$e->getResponse()->getReasonPhrase()
|
||||||
|
);
|
||||||
|
|
||||||
/**
|
// Only available if tika-server was started with --includeStack
|
||||||
* Extract text content from a given file.
|
$body = $e->getResponse()->getBody(true);
|
||||||
* Logs a notice-level error if the document can't be parsed.
|
if ($body) {
|
||||||
*
|
$msg .= ' Body: ' . $body;
|
||||||
* @param string $file Full filesystem path to a file to post
|
}
|
||||||
* @return string Content of the file extracted as plain text
|
|
||||||
*/
|
|
||||||
public function tika($file) {
|
|
||||||
$text = null;
|
|
||||||
try {
|
|
||||||
$response = $this->put(
|
|
||||||
'tika',
|
|
||||||
array('Accept' => 'text/plain'),
|
|
||||||
file_get_contents($file)
|
|
||||||
)->send();
|
|
||||||
$text = $response->getBody(true);
|
|
||||||
} catch(RequestException $e) {
|
|
||||||
$msg = sprintf(
|
|
||||||
'TikaRestClient was not able to process %s. Response: %s %s.',
|
|
||||||
$file,
|
|
||||||
$e->getResponse()->getStatusCode(),
|
|
||||||
$e->getResponse()->getReasonPhrase()
|
|
||||||
);
|
|
||||||
|
|
||||||
// Only available if tika-server was started with --includeStack
|
SS_Log::log($msg, SS_Log::NOTICE);
|
||||||
$body = $e->getResponse()->getBody(true);
|
}
|
||||||
if($body) {
|
|
||||||
$msg .= ' Body: ' . $body;
|
|
||||||
}
|
|
||||||
|
|
||||||
SS_Log::log($msg, SS_Log::NOTICE);
|
|
||||||
}
|
|
||||||
|
|
||||||
return $text;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
return $text;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,17 +1,17 @@
|
|||||||
<?php
|
<?php
|
||||||
class FileTextCacheDatabaseTest extends SapphireTest {
|
class FileTextCacheDatabaseTest extends SapphireTest
|
||||||
|
{
|
||||||
|
public function testTruncatesByMaxLength()
|
||||||
|
{
|
||||||
|
Config::nest();
|
||||||
|
|
||||||
public function testTruncatesByMaxLength() {
|
Config::inst()->update('FileTextCache_Database', 'max_content_length', 5);
|
||||||
Config::nest();
|
$cache = new FileTextCache_Database();
|
||||||
|
$file = $this->getMock('File', array('write'));
|
||||||
Config::inst()->update('FileTextCache_Database', 'max_content_length', 5);
|
$content = '0123456789';
|
||||||
$cache = new FileTextCache_Database();
|
$cache->save($file, $content);
|
||||||
$file = $this->getMock('File', array('write'));
|
$this->assertEquals($cache->load($file), '01234');
|
||||||
$content = '0123456789';
|
|
||||||
$cache->save($file, $content);
|
|
||||||
$this->assertEquals($cache->load($file), '01234');
|
|
||||||
|
|
||||||
Config::unnest();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
Config::unnest();
|
||||||
|
}
|
||||||
}
|
}
|
@ -1,43 +1,46 @@
|
|||||||
<?php
|
<?php
|
||||||
class FileTextExtractableTest extends SapphireTest {
|
class FileTextExtractableTest extends SapphireTest
|
||||||
|
{
|
||||||
|
protected $requiredExtensions = array(
|
||||||
|
'File' => array('FileTextExtractable')
|
||||||
|
);
|
||||||
|
|
||||||
protected $requiredExtensions = array(
|
public function setUp()
|
||||||
'File' => array('FileTextExtractable')
|
{
|
||||||
);
|
parent::setUp();
|
||||||
|
|
||||||
public function setUp() {
|
// Ensure that html is a valid extension
|
||||||
parent::setUp();
|
Config::inst()
|
||||||
|
->nest()
|
||||||
|
->update('File', 'allowed_extensions', array('html'));
|
||||||
|
}
|
||||||
|
|
||||||
// Ensure that html is a valid extension
|
public function tearDown()
|
||||||
Config::inst()
|
{
|
||||||
->nest()
|
Config::unnest();
|
||||||
->update('File', 'allowed_extensions', array('html'));
|
parent::tearDown();
|
||||||
}
|
}
|
||||||
|
|
||||||
public function tearDown() {
|
public function testExtractFileAsText()
|
||||||
Config::unnest();
|
{
|
||||||
parent::tearDown();
|
// Create a copy of the file, as it may be clobbered by the test
|
||||||
}
|
// ($file->extractFileAsText() calls $file->write)
|
||||||
|
copy(BASE_PATH.'/textextraction/tests/fixtures/test1.html', BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
|
||||||
|
|
||||||
function testExtractFileAsText() {
|
// Use HTML, since the extractor is always available
|
||||||
// Create a copy of the file, as it may be clobbered by the test
|
$file = new File(array(
|
||||||
// ($file->extractFileAsText() calls $file->write)
|
'Name' => 'test1-copy.html',
|
||||||
copy(BASE_PATH.'/textextraction/tests/fixtures/test1.html',BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
|
'Filename' => 'textextraction/tests/fixtures/test1-copy.html'
|
||||||
|
));
|
||||||
// Use HTML, since the extractor is always available
|
$file->write();
|
||||||
$file = new File(array(
|
|
||||||
'Name' => 'test1-copy.html',
|
|
||||||
'Filename' => 'textextraction/tests/fixtures/test1-copy.html'
|
|
||||||
));
|
|
||||||
$file->write();
|
|
||||||
|
|
||||||
$content = $file->extractFileAsText();
|
|
||||||
$this->assertContains('Test Headline', $content);
|
|
||||||
$this->assertContains('Test Text', $content);
|
|
||||||
$this->assertEquals($content, $file->FileContentCache);
|
|
||||||
|
|
||||||
if(file_exists(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html')) unlink(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
|
|
||||||
}
|
|
||||||
|
|
||||||
|
$content = $file->extractFileAsText();
|
||||||
|
$this->assertContains('Test Headline', $content);
|
||||||
|
$this->assertContains('Test Text', $content);
|
||||||
|
$this->assertEquals($content, $file->FileContentCache);
|
||||||
|
|
||||||
|
if (file_exists(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html')) {
|
||||||
|
unlink(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
@ -1,14 +1,14 @@
|
|||||||
<?php
|
<?php
|
||||||
class HTMLTextExtractorTest extends SapphireTest {
|
class HTMLTextExtractorTest extends SapphireTest
|
||||||
|
{
|
||||||
function testExtraction() {
|
public function testExtraction()
|
||||||
$extractor = new HTMLTextExtractor();
|
{
|
||||||
|
$extractor = new HTMLTextExtractor();
|
||||||
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html');
|
|
||||||
$this->assertContains('Test Headline', $content);
|
|
||||||
$this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
|
|
||||||
$this->assertNotContains('Test Style', $content, 'Strips non-content style tags');
|
|
||||||
$this->assertNotContains('Test Script', $content, 'Strips non-content script tags');
|
|
||||||
}
|
|
||||||
|
|
||||||
|
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html');
|
||||||
|
$this->assertContains('Test Headline', $content);
|
||||||
|
$this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
|
||||||
|
$this->assertNotContains('Test Style', $content, 'Strips non-content style tags');
|
||||||
|
$this->assertNotContains('Test Script', $content, 'Strips non-content script tags');
|
||||||
|
}
|
||||||
}
|
}
|
@ -1,12 +1,14 @@
|
|||||||
<?php
|
<?php
|
||||||
class PDFTextExtractorTest extends SapphireTest {
|
class PDFTextExtractorTest extends SapphireTest
|
||||||
|
{
|
||||||
function testExtraction() {
|
public function testExtraction()
|
||||||
$extractor = new PDFTextExtractor();
|
{
|
||||||
if(!$extractor->isAvailable()) $this->markTestSkipped('pdftotext not available');
|
$extractor = new PDFTextExtractor();
|
||||||
|
if (!$extractor->isAvailable()) {
|
||||||
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf');
|
$this->markTestSkipped('pdftotext not available');
|
||||||
$this->assertContains('This is a test file with a link', $content);
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf');
|
||||||
|
$this->assertContains('This is a test file with a link', $content);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,36 +3,41 @@
|
|||||||
/**
|
/**
|
||||||
* Tests the {@see TikaTextExtractor} class
|
* Tests the {@see TikaTextExtractor} class
|
||||||
*/
|
*/
|
||||||
class TikaTextExtractorTest extends SapphireTest {
|
class TikaTextExtractorTest extends SapphireTest
|
||||||
|
{
|
||||||
|
public function testExtraction()
|
||||||
|
{
|
||||||
|
$extractor = new TikaTextExtractor();
|
||||||
|
if (!$extractor->isAvailable()) {
|
||||||
|
$this->markTestSkipped('tika cli not available');
|
||||||
|
}
|
||||||
|
|
||||||
function testExtraction() {
|
// Check file
|
||||||
$extractor = new TikaTextExtractor();
|
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
|
||||||
if(!$extractor->isAvailable()) $this->markTestSkipped('tika cli not available');
|
$content = $extractor->getContent($file);
|
||||||
|
$this->assertContains('This is a test file with a link', $content);
|
||||||
|
|
||||||
// Check file
|
// Check mime validation
|
||||||
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
|
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
||||||
$content = $extractor->getContent($file);
|
$this->assertTrue($extractor->supportsMime('text/html'));
|
||||||
$this->assertContains('This is a test file with a link', $content);
|
$this->assertFalse($extractor->supportsMime('application/not-supported'));
|
||||||
|
}
|
||||||
|
|
||||||
// Check mime validation
|
public function testServerExtraction()
|
||||||
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
{
|
||||||
$this->assertTrue($extractor->supportsMime('text/html'));
|
$extractor = new TikaServerTextExtractor();
|
||||||
$this->assertFalse($extractor->supportsMime('application/not-supported'));
|
if (!$extractor->isAvailable()) {
|
||||||
}
|
$this->markTestSkipped('tika server not available');
|
||||||
|
}
|
||||||
|
|
||||||
function testServerExtraction() {
|
// Check file
|
||||||
$extractor = new TikaServerTextExtractor();
|
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
|
||||||
if(!$extractor->isAvailable()) $this->markTestSkipped('tika server not available');
|
$content = $extractor->getContent($file);
|
||||||
|
$this->assertContains('This is a test file with a link', $content);
|
||||||
// Check file
|
|
||||||
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
|
|
||||||
$content = $extractor->getContent($file);
|
|
||||||
$this->assertContains('This is a test file with a link', $content);
|
|
||||||
|
|
||||||
// Check mime validation
|
|
||||||
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
|
||||||
$this->assertTrue($extractor->supportsMime('text/html'));
|
|
||||||
$this->assertFalse($extractor->supportsMime('application/not-supported'));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
// Check mime validation
|
||||||
|
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
||||||
|
$this->assertTrue($extractor->supportsMime('text/html'));
|
||||||
|
$this->assertFalse($extractor->supportsMime('application/not-supported'));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user