Merge pull request #21 from helpfulrobot/convert-to-psr-2

Converted to PSR-2
This commit is contained in:
Daniel Hensby 2015-11-18 23:30:07 +00:00
commit ebfa07dc5f
14 changed files with 892 additions and 809 deletions

View File

@ -1,105 +1,112 @@
<?php <?php
interface FileTextCache { interface FileTextCache
{
/**
* Save extracted content for a given File entity
*
* @param File $file
* @param string $content
*/
public function save(File $file, $content);
/** /**
* Save extracted content for a given File entity * Return any cached extracted content for a given file entity
* *
* @param File $file * @param File $file
* @param string $content */
*/ public function load(File $file);
public function save(File $file, $content);
/** /**
* Return any cached extracted content for a given file entity * Invalidate the cache for a given file.
* * Invoked in onBeforeWrite on the file
* @param File $file *
*/ * @param File $file
public function load(File $file); */
public function invalidate(File $file);
/**
* Invalidate the cache for a given file.
* Invoked in onBeforeWrite on the file
*
* @param File $file
*/
public function invalidate(File $file);
} }
/** /**
* Caches the extracted content on the record for the file. * Caches the extracted content on the record for the file.
* Limits the stored file content by default to avoid hitting query size limits. * Limits the stored file content by default to avoid hitting query size limits.
*/ */
class FileTextCache_Database implements FileTextCache { class FileTextCache_Database implements FileTextCache
{
public function load(File $file) { public function load(File $file)
return $file->FileContentCache; {
} return $file->FileContentCache;
}
public function save(File $file, $content) { public function save(File $file, $content)
$maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length'); {
$file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content; $maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length');
$file->write(); $file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content;
} $file->write();
}
public function invalidate(File $file) {
// To prevent writing to the cache from invalidating it
if(!$file->isChanged('FileContentCache')) {
$file->FileContentCache = '';
}
}
public function invalidate(File $file)
{
// To prevent writing to the cache from invalidating it
if (!$file->isChanged('FileContentCache')) {
$file->FileContentCache = '';
}
}
} }
/** /**
* Uses SS_Cache with a lifetime to cache extracted content * Uses SS_Cache with a lifetime to cache extracted content
*/ */
class FileTextCache_SSCache implements FileTextCache, Flushable { class FileTextCache_SSCache implements FileTextCache, Flushable
{
/**
* Lifetime of cache in seconds
* Null is indefinite
*
* @var int|null
* @config
*/
private static $lifetime = null;
/** /**
* Lifetime of cache in seconds * @return SS_Cache
* Null is indefinite */
* protected static function get_cache()
* @var int|null {
* @config $lifetime = Config::inst()->get(__CLASS__, 'lifetime');
*/ $cache = SS_Cache::factory(__CLASS__);
private static $lifetime = null; $cache->setLifetime($lifetime);
return $cache;
}
/** protected function getKey(File $file)
* @return SS_Cache {
*/ return md5($file->getFullPath());
protected static function get_cache() { }
$lifetime = Config::inst()->get(__CLASS__, 'lifetime');
$cache = SS_Cache::factory(__CLASS__);
$cache->setLifetime($lifetime);
return $cache;
}
protected function getKey(File $file) { public function load(File $file)
return md5($file->getFullPath()); {
} $key = $this->getKey($file);
$cache = self::get_cache();
return $cache->load($key);
}
public function load(File $file) { public function save(File $file, $content)
$key = $this->getKey($file); {
$cache = self::get_cache(); $key = $this->getKey($file);
return $cache->load($key); $cache = self::get_cache();
} return $cache->save($content, $key);
}
public function save(File $file, $content) { public static function flush()
$key = $this->getKey($file); {
$cache = self::get_cache(); $cache = self::get_cache();
return $cache->save($content, $key); $cache->clean();
} }
public static function flush() {
$cache = self::get_cache();
$cache->clean();
}
public function invalidate(File $file) {
$key = $this->getKey($file);
$cache = self::get_cache();
return $cache->remove($key);
}
public function invalidate(File $file)
{
$key = $this->getKey($file);
$cache = self::get_cache();
return $cache->remove($key);
}
} }

View File

@ -9,83 +9,88 @@
* @author mstephens * @author mstephens
* *
*/ */
class FileTextExtractable extends DataExtension { class FileTextExtractable extends DataExtension
{
private static $db = array( private static $db = array(
'FileContentCache' => 'Text' 'FileContentCache' => 'Text'
); );
private static $casting = array( private static $casting = array(
'FileContent' => 'Text' 'FileContent' => 'Text'
); );
private static $dependencies = array( private static $dependencies = array(
'TextCache' => '%$FileTextCache' 'TextCache' => '%$FileTextCache'
); );
/** /**
* @var FileTextCache * @var FileTextCache
*/ */
protected $fileTextCache = null; protected $fileTextCache = null;
/** /**
* *
* @param FileTextCache $cache * @param FileTextCache $cache
*/ */
public function setTextCache(FileTextCache $cache) { public function setTextCache(FileTextCache $cache)
$this->fileTextCache = $cache; {
} $this->fileTextCache = $cache;
}
/** /**
* @return FileTextCache * @return FileTextCache
*/ */
public function getTextCache() { public function getTextCache()
return $this->fileTextCache; {
} return $this->fileTextCache;
}
/** /**
* Helper function for template * Helper function for template
* *
* @return string * @return string
*/ */
public function getFileContent() { public function getFileContent()
return $this->extractFileAsText(); {
} return $this->extractFileAsText();
}
/** /**
* Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text. * Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text.
* The value is also cached into the File record itself. * The value is also cached into the File record itself.
* *
* @param boolean $disableCache If false, the file content is only parsed on demand. * @param boolean $disableCache If false, the file content is only parsed on demand.
* If true, the content parsing is forced, bypassing the cached version * If true, the content parsing is forced, bypassing the cached version
* @return string * @return string
*/ */
public function extractFileAsText($disableCache = false) { public function extractFileAsText($disableCache = false)
if (!$disableCache) { {
$text = $this->getTextCache()->load($this->owner); if (!$disableCache) {
if($text) { $text = $this->getTextCache()->load($this->owner);
return $text; if ($text) {
} return $text;
} }
}
// Determine which extractor can process this file. // Determine which extractor can process this file.
$extractor = FileTextExtractor::for_file($this->owner->FullPath); $extractor = FileTextExtractor::for_file($this->owner->FullPath);
if (!$extractor) { if (!$extractor) {
return null; return null;
} }
$text = $extractor->getContent($this->owner->FullPath); $text = $extractor->getContent($this->owner->FullPath);
if (!$text) { if (!$text) {
return null; return null;
} }
$this->getTextCache()->save($this->owner, $text); $this->getTextCache()->save($this->owner, $text);
return $text; return $text;
} }
public function onBeforeWrite() { public function onBeforeWrite()
// Clear cache before changing file {
$this->getTextCache()->invalidate($this->owner); // Clear cache before changing file
} $this->getTextCache()->invalidate($this->owner);
}
} }

View File

@ -5,131 +5,141 @@
* @author mstephens * @author mstephens
* *
*/ */
abstract class FileTextExtractor extends Object { abstract class FileTextExtractor extends Object
{
/**
* Set priority from 0-100.
* The highest priority extractor for a given content type will be selected.
*
* @config
* @var integer
*/
private static $priority = 50;
/** /**
* Set priority from 0-100. * Cache of extractor class names, sorted by priority
* The highest priority extractor for a given content type will be selected. *
* * @var array
* @config */
* @var integer protected static $sorted_extractor_classes = null;
*/
private static $priority = 50;
/** /**
* Cache of extractor class names, sorted by priority * Gets the list of prioritised extractor classes
* *
* @var array * @return array
*/ */
protected static $sorted_extractor_classes = null; protected static function get_extractor_classes()
{
// Check cache
if (self::$sorted_extractor_classes) {
return self::$sorted_extractor_classes;
}
// Generate the sorted list of extractors on demand.
$classes = ClassInfo::subclassesFor("FileTextExtractor");
array_shift($classes);
$classPriorities = array();
foreach ($classes as $class) {
$classPriorities[$class] = Config::inst()->get($class, 'priority');
}
arsort($classPriorities);
/** // Save classes
* Gets the list of prioritised extractor classes $sortedClasses = array_keys($classPriorities);
* return self::$sorted_extractor_classes = $sortedClasses;
* @return array }
*/
protected static function get_extractor_classes() {
// Check cache
if (self::$sorted_extractor_classes) return self::$sorted_extractor_classes;
// Generate the sorted list of extractors on demand.
$classes = ClassInfo::subclassesFor("FileTextExtractor");
array_shift($classes);
$classPriorities = array();
foreach($classes as $class) {
$classPriorities[$class] = Config::inst()->get($class, 'priority');
}
arsort($classPriorities);
// Save classes /**
$sortedClasses = array_keys($classPriorities); * Get the text file extractor for the given class
return self::$sorted_extractor_classes = $sortedClasses; *
} * @param string $class
* @return FileTextExtractor
*/
protected static function get_extractor($class)
{
return Injector::inst()->get($class);
}
/** /**
* Get the text file extractor for the given class * Attempt to detect mime type for given file
* *
* @param string $class * @param string $path
* @return FileTextExtractor * @return string Mime type if found
*/ */
protected static function get_extractor($class) { protected static function get_mime($path)
return Injector::inst()->get($class); {
} $file = new Symfony\Component\HttpFoundation\File\File($path);
/** return $file->getMimeType();
* Attempt to detect mime type for given file }
*
* @param string $path
* @return string Mime type if found
*/
protected static function get_mime($path) {
$file = new Symfony\Component\HttpFoundation\File\File($path);
return $file->getMimeType(); /**
} * @param string $path
* @return FileTextExtractor|null
*/
public static function for_file($path)
{
if (!file_exists($path) || is_dir($path)) {
return;
}
/** $extension = pathinfo($path, PATHINFO_EXTENSION);
* @param string $path $mime = self::get_mime($path);
* @return FileTextExtractor|null foreach (self::get_extractor_classes() as $className) {
*/ $extractor = self::get_extractor($className);
static function for_file($path) {
if(!file_exists($path) || is_dir($path)) {
return;
}
$extension = pathinfo($path, PATHINFO_EXTENSION); // Skip unavailable extractors
$mime = self::get_mime($path); if (!$extractor->isAvailable()) {
foreach(self::get_extractor_classes() as $className) { continue;
$extractor = self::get_extractor($className); }
// Skip unavailable extractors // Check extension
if(!$extractor->isAvailable()) continue; if ($extension && $extractor->supportsExtension($extension)) {
return $extractor;
}
// Check extension // Check mime
if($extension && $extractor->supportsExtension($extension)) { if ($mime && $extractor->supportsMime($mime)) {
return $extractor; return $extractor;
} }
}
}
// Check mime /**
if($mime && $extractor->supportsMime($mime)) { * Checks if the extractor is supported on the current environment,
return $extractor; * for example if the correct binaries or libraries are available.
} *
} * @return boolean
} */
abstract public function isAvailable();
/** /**
* Checks if the extractor is supported on the current environment, * Determine if this extractor supports the given extension.
* for example if the correct binaries or libraries are available. * If support is determined by mime/type only, then this should return false.
* *
* @return boolean * @param string $extension
*/ * @return boolean
abstract public function isAvailable(); */
abstract public function supportsExtension($extension);
/** /**
* Determine if this extractor supports the given extension. * Determine if this extractor suports the given mime type.
* If support is determined by mime/type only, then this should return false. * Will only be called if supportsExtension returns false.
* *
* @param string $extension * @param string $mime
* @return boolean * @return boolean
*/ */
abstract public function supportsExtension($extension); abstract public function supportsMime($mime);
/** /**
* Determine if this extractor suports the given mime type. * Given a file path, extract the contents as text.
* Will only be called if supportsExtension returns false. *
* * @param string $path
* @param string $mime * @return string
* @return boolean */
*/ abstract public function getContent($path);
abstract public function supportsMime($mime);
/**
* Given a file path, extract the contents as text.
*
* @param string $path
* @return string
*/
abstract public function getContent($path);
} }
class FileTextExtractor_Exception extends Exception {} class FileTextExtractor_Exception extends Exception
{
}

View File

@ -5,69 +5,73 @@
* @author mstephens * @author mstephens
* *
*/ */
class HTMLTextExtractor extends FileTextExtractor { class HTMLTextExtractor extends FileTextExtractor
{
public function isAvailable() { public function isAvailable()
return true; {
} return true;
}
public function supportsExtension($extension) { public function supportsExtension($extension)
return in_array( {
strtolower($extension), return in_array(
array("html", "htm", "xhtml") strtolower($extension),
); array("html", "htm", "xhtml")
} );
}
public function supportsMime($mime) { public function supportsMime($mime)
return strtolower($mime) === 'text/html'; {
} return strtolower($mime) === 'text/html';
}
/** /**
* Lower priority because its not the most clever HTML extraction. If there is something better, use it * Lower priority because its not the most clever HTML extraction. If there is something better, use it
* *
* @config * @config
* @var integer * @var integer
*/ */
private static $priority = 10; private static $priority = 10;
/** /**
* Extracts content from regex, by using strip_tags() * Extracts content from regex, by using strip_tags()
* combined with regular expressions to remove non-content tags like <style> or <script>, * combined with regular expressions to remove non-content tags like <style> or <script>,
* as well as adding line breaks after block tags. * as well as adding line breaks after block tags.
* *
* @param string $path * @param string $path
* @return string * @return string
*/ */
public function getContent($path) { public function getContent($path)
$content = file_get_contents($path); {
// Yes, yes, regex'ing HTML is evil. $content = file_get_contents($path);
// Since we don't care about well-formedness or markup here, it does the job. // Yes, yes, regex'ing HTML is evil.
$content = preg_replace( // Since we don't care about well-formedness or markup here, it does the job.
array( $content = preg_replace(
// Remove invisible content array(
'@<head[^>]*?>.*?</head>@siu', // Remove invisible content
'@<style[^>]*?>.*?</style>@siu', '@<head[^>]*?>.*?</head>@siu',
'@<script[^>]*?.*?</script>@siu', '@<style[^>]*?>.*?</style>@siu',
'@<object[^>]*?.*?</object>@siu', '@<script[^>]*?.*?</script>@siu',
'@<embed[^>]*?.*?</embed>@siu', '@<object[^>]*?.*?</object>@siu',
'@<applet[^>]*?.*?</applet>@siu', '@<embed[^>]*?.*?</embed>@siu',
'@<noframes[^>]*?.*?</noframes>@siu', '@<applet[^>]*?.*?</applet>@siu',
'@<noscript[^>]*?.*?</noscript>@siu', '@<noframes[^>]*?.*?</noframes>@siu',
'@<noembed[^>]*?.*?</noembed>@siu', '@<noscript[^>]*?.*?</noscript>@siu',
// Add line breaks before and after blocks '@<noembed[^>]*?.*?</noembed>@siu',
'@</?((address)|(blockquote)|(center)|(del))@iu', // Add line breaks before and after blocks
'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu', '@</?((address)|(blockquote)|(center)|(del))@iu',
'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu', '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
'@</?((table)|(th)|(td)|(caption))@iu', '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu', '@</?((table)|(th)|(td)|(caption))@iu',
'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu', '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
'@</?((frameset)|(frame)|(iframe))@iu', '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
), '@</?((frameset)|(frame)|(iframe))@iu',
array( ),
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',"$0", "$0", "$0", "$0", "$0", "$0","$0", "$0", array(
), ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0",
$content ),
); $content
return strip_tags($content); );
} return strip_tags($content);
}
} }

View File

@ -5,94 +5,103 @@
* @author mstephens * @author mstephens
* *
*/ */
class PDFTextExtractor extends FileTextExtractor { class PDFTextExtractor extends FileTextExtractor
{
public function isAvailable()
{
$bin = $this->bin('pdftotext');
return (file_exists($bin) && is_executable($bin));
}
public function supportsExtension($extension)
{
return strtolower($extension) === 'pdf';
}
public function isAvailable() { public function supportsMime($mime)
$bin = $this->bin('pdftotext'); {
return (file_exists($bin) && is_executable($bin)); return in_array(
} strtolower($mime),
array(
public function supportsExtension($extension) { 'application/pdf',
return strtolower($extension) === 'pdf'; 'application/x-pdf',
} 'application/x-bzpdf',
'application/x-gzpdf'
)
);
}
public function supportsMime($mime) { /**
return in_array( * Accessor to get the location of the binary
strtolower($mime), *
array( * @param string $prog Name of binary
'application/pdf', * @return string
'application/x-pdf', */
'application/x-bzpdf', protected function bin($prog = '')
'application/x-gzpdf' {
) if ($this->config()->binary_location) {
); // By config
} $path = $this->config()->binary_location;
} elseif (file_exists('/usr/bin/pdftotext')) {
// By searching common directories
$path = '/usr/bin';
} elseif (file_exists('/usr/local/bin/pdftotext')) {
$path = '/usr/local/bin';
} else {
$path = '.'; // Hope it's in path
}
/** return ($path ? $path . '/' : '') . $prog;
* Accessor to get the location of the binary }
*
* @param string $prog Name of binary public function getContent($path)
* @return string {
*/ if (!$path) {
protected function bin($prog = '') { return "";
if ($this->config()->binary_location) { } // no file
// By config $content = $this->getRawOutput($path);
$path = $this->config()->binary_location; return $this->cleanupLigatures($content);
} elseif (file_exists('/usr/bin/pdftotext')) { }
// By searching common directories
$path = '/usr/bin';
} elseif (file_exists('/usr/local/bin/pdftotext')) {
$path = '/usr/local/bin';
} else {
$path = '.'; // Hope it's in path
}
return ( $path ? $path . '/' : '' ) . $prog; /**
} * Invoke pdftotext with the given path
*
public function getContent($path) { * @param string $path
if(!$path) return ""; // no file * @return string Output
$content = $this->getRawOutput($path); * @throws FileTextExtractor_Exception
return $this->cleanupLigatures($content); */
} protected function getRawOutput($path)
{
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
if ($err) {
throw new FileTextExtractor_Exception(sprintf(
'PDFTextExtractor->getContent() failed for %s: %s',
$path,
implode('', $err)
));
}
return implode('', $content);
}
/** /**
* Invoke pdftotext with the given path * Removes utf-8 ligatures.
* *
* @param string $path * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
* @return string Output *
* @throws FileTextExtractor_Exception * @param string $input
*/ * @return string
protected function getRawOutput($path) { */
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err); protected function cleanupLigatures($input)
if($err) { {
throw new FileTextExtractor_Exception(sprintf( $mapping = array(
'PDFTextExtractor->getContent() failed for %s: %s', 'ff' => 'ff',
$path, 'fi' => 'fi',
implode('', $err) 'fl' => 'fl',
)); 'ffi' => 'ffi',
} 'ffl' => 'ffl',
return implode('', $content); 'ſt' => 'ft',
} 'st' => 'st'
);
/** return str_replace(array_keys($mapping), array_values($mapping), $input);
* Removes utf-8 ligatures. }
*
* @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
*
* @param string $input
* @return string
*/
protected function cleanupLigatures($input) {
$mapping = array(
'ff' => 'ff',
'fi' => 'fi',
'fl' => 'fl',
'ffi' => 'ffi',
'ffl' => 'ffl',
'ſt' => 'ft',
'st' => 'st'
);
return str_replace(array_keys($mapping), array_values($mapping), $input);
}
} }

View File

@ -10,83 +10,93 @@ use Guzzle\Http\Client;
* @author ischommer * @author ischommer
* @see http://wiki.apache.org/solr/ExtractingRequestHandler * @see http://wiki.apache.org/solr/ExtractingRequestHandler
*/ */
class SolrCellTextExtractor extends FileTextExtractor { class SolrCellTextExtractor extends FileTextExtractor
{
/**
* Base URL to use for solr text extraction.
* E.g. http://localhost:8983/solr/update/extract
*
* @config
* @var string
*/
private static $base_url;
/** private static $priority = 75;
* Base URL to use for solr text extraction.
* E.g. http://localhost:8983/solr/update/extract
*
* @config
* @var string
*/
private static $base_url;
private static $priority = 75; protected $httpClient;
protected $httpClient; public function getHttpClient()
{
if (!$this->config()->get('base_url')) {
throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
}
if (!$this->httpClient) {
$this->httpClient = new Client($this->config()->get('base_url'));
}
return $this->httpClient;
}
public function getHttpClient() { public function setHttpClient($client)
if(!$this->config()->get('base_url')) { {
throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified'); $this->httpClient = $client;
} }
if(!$this->httpClient) $this->httpClient = new Client($this->config()->get('base_url'));
return $this->httpClient;
}
public function setHttpClient($client) { public function isAvailable()
$this->httpClient = $client; {
} $url = $this->config()->get('base_url');
return (boolean) $url;
}
public function isAvailable() { public function supportsExtension($extension)
$url = $this->config()->get('base_url'); {
return (boolean) $url; return in_array(
} strtolower($extension),
array(
'pdf', 'doc', 'docx', 'xls', 'xlsx',
'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
'ppt', 'pptx', 'odp', 'fodp', 'csv'
)
);
}
public function supportsExtension($extension) { public function supportsMime($mime)
return in_array( {
strtolower($extension), // Rely on supportsExtension
array( return false;
'pdf', 'doc', 'docx', 'xls', 'xlsx', }
'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
'ppt', 'pptx', 'odp', 'fodp', 'csv' public function getContent($path)
) {
); if (!$path) {
} return "";
} // no file
public function supportsMime($mime) { $fileName = basename($path);
// Rely on supportsExtension $client = $this->getHttpClient();
return false; try {
} $request = $client
->post()
public function getContent($path) { ->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text'))
if (!$path) return ""; // no file ->addPostFiles(array('myfile' => $path));
$response = $request->send();
$fileName = basename($path); } catch (InvalidArgumentException $e) {
$client = $this->getHttpClient(); SS_Log::log(
try { sprintf(
$request = $client 'Error extracting text from "%s" (message: %s)',
->post() $path,
->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text')) $e->getMessage()
->addPostFiles(array('myfile' => $path)); ),
$response = $request->send(); SS_Log::NOTICE
} catch(InvalidArgumentException $e) { );
SS_Log::log( return null;
sprintf( }
'Error extracting text from "%s" (message: %s)', // Use preg match to avoid SimpleXML running out of memory on large text nodes
$path, preg_match(
$e->getMessage() sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName)),
), (string)$response->getBody(),
SS_Log::NOTICE $matches
); );
return null;
}
// Use preg match to avoid SimpleXML running out of memory on large text nodes
preg_match(
sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName)),
(string)$response->getBody(),
$matches
);
return $matches ? $matches[1] : null; return $matches ? $matches[1] : null;
} }
} }

View File

@ -5,100 +5,112 @@
* *
* {@link http://tika.apache.org/1.7/gettingstarted.html} * {@link http://tika.apache.org/1.7/gettingstarted.html}
*/ */
class TikaServerTextExtractor extends FileTextExtractor { class TikaServerTextExtractor extends FileTextExtractor
{
/**
* Tika server is pretty efficient so use it immediately if available
*
* @var integer
* @config
*/
private static $priority = 80;
/** /**
* Tika server is pretty efficient so use it immediately if available * Server endpoint
* *
* @var integer * @var string
* @config * @config
*/ */
private static $priority = 80; private static $server_endpoint;
/** /**
* Server endpoint * @var TikaRestClient
* */
* @var string protected $client = null;
* @config
*/
private static $server_endpoint;
/** /**
* @var TikaRestClient * @return TikaRestClient
*/ */
protected $client = null; public function getClient()
{
return $this->client ?:
($this->client =
Injector::inst()->createWithArgs(
'TikaRestClient',
array($this->getServerEndpoint())
)
);
}
/** public function getServerEndpoint()
* @return TikaRestClient {
*/ if (defined('SS_TIKA_ENDPOINT')) {
public function getClient() { return SS_TIKA_ENDPOINT;
return $this->client ?: }
($this->client =
Injector::inst()->createWithArgs(
'TikaRestClient',
array($this->getServerEndpoint())
)
);
}
public function getServerEndpoint() { if (getenv('SS_TIKA_ENDPOINT')) {
if(defined('SS_TIKA_ENDPOINT')) { return getenv('SS_TIKA_ENDPOINT');
return SS_TIKA_ENDPOINT; }
}
if(getenv('SS_TIKA_ENDPOINT')) return getenv('SS_TIKA_ENDPOINT'); // Default to configured endpoint
return $this->config()->server_endpoint;
}
// Default to configured endpoint /**
return $this->config()->server_endpoint; * Get the version of tika installed, or 0 if not installed
} *
* @return float version of tika
*/
public function getVersion()
{
return $this
->getClient()
->getVersion();
}
/** public function isAvailable()
* Get the version of tika installed, or 0 if not installed {
* return $this->getServerEndpoint() &&
* @return float version of tika $this->getClient()->isAvailable() &&
*/ $this->getVersion() >= 1.7;
public function getVersion() { }
return $this
->getClient()
->getVersion();
}
public function isAvailable() { public function supportsExtension($extension)
return $this->getServerEndpoint() && {
$this->getClient()->isAvailable() && // Determine support via mime type only
$this->getVersion() >= 1.7; return false;
} }
public function supportsExtension($extension) {
// Determine support via mime type only
return false;
}
/** /**
* Cache of supported mime types * Cache of supported mime types
* *
* @var array * @var array
*/ */
protected $supportedMimes = array(); protected $supportedMimes = array();
public function supportsMime($mime) { public function supportsMime($mime)
$supported = $this->supportedMimes ?: {
($this->supportedMimes = $this->getClient()->getSupportedMimes()); $supported = $this->supportedMimes ?:
($this->supportedMimes = $this->getClient()->getSupportedMimes());
// Check if supported (most common / quickest lookup) // Check if supported (most common / quickest lookup)
if(isset($supported[$mime])) return true; if (isset($supported[$mime])) {
return true;
}
// Check aliases // Check aliases
foreach($supported as $info) { foreach ($supported as $info) {
if(isset($info['alias']) && in_array($mime, $info['alias'])) return true; if (isset($info['alias']) && in_array($mime, $info['alias'])) {
} return true;
}
}
return false; return false;
} }
public function getContent($path) {
return $this->getClient()->tika($path);
}
public function getContent($path)
{
return $this->getClient()->tika($path);
}
} }

View File

@ -5,90 +5,101 @@
* *
* {@link http://tika.apache.org/1.7/gettingstarted.html} * {@link http://tika.apache.org/1.7/gettingstarted.html}
*/ */
class TikaTextExtractor extends FileTextExtractor { class TikaTextExtractor extends FileTextExtractor
{
/**
* Text extraction mode. Defaults to -t (plain text)
*
* @var string
* @config
*/
private static $output_mode = '-t';
/** /**
* Text extraction mode. Defaults to -t (plain text) * Get the version of tika installed, or 0 if not installed
* *
* @var string * @return float version of tika
* @config */
*/ public function getVersion()
private static $output_mode = '-t'; {
$code = $this->runShell('tika --version', $stdout);
/** // Parse output
* Get the version of tika installed, or 0 if not installed if (!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches)) {
* return $matches['version'];
* @return float version of tika }
*/
public function getVersion() {
$code = $this->runShell('tika --version', $stdout);
// Parse output return 0;
if(!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches)) { }
return $matches['version'];
}
return 0; /**
} * Runs an arbitrary and safely escaped shell command
*
* @param string $command Full command including arguments
* @param string &$stdout Standand output
* @param string &$stderr Standard error
* @param string $input Content to pass via standard input
* @return int Exit code. 0 is success
*/
protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
{
$descriptorSpecs = array(
0 => array("pipe", "r"),
1 => array("pipe", "w"),
2 => array("pipe", "w")
);
// Invoke command
$pipes = array();
$proc = proc_open($command, $descriptorSpecs, $pipes);
if (!is_resource($proc)) {
return 255;
}
/** // Send content as input
* Runs an arbitrary and safely escaped shell command fwrite($pipes[0], $input);
* fclose($pipes[0]);
* @param string $command Full command including arguments
* @param string &$stdout Standand output
* @param string &$stderr Standard error
* @param string $input Content to pass via standard input
* @return int Exit code. 0 is success
*/
protected function runShell($command, &$stdout = '', &$stderr = '', $input = '') {
$descriptorSpecs = array(
0 => array("pipe", "r"),
1 => array("pipe", "w"),
2 => array("pipe", "w")
);
// Invoke command
$pipes = array();
$proc = proc_open($command, $descriptorSpecs, $pipes);
if (!is_resource($proc)) return 255;
// Send content as input // Get output
fwrite($pipes[0], $input); $stdout = stream_get_contents($pipes[1]);
fclose($pipes[0]); fclose($pipes[1]);
$stderr = stream_get_contents($pipes[2]);
fclose($pipes[2]);
// Get output // Get result
$stdout = stream_get_contents($pipes[1]); return proc_close($proc);
fclose($pipes[1]); }
$stderr = stream_get_contents($pipes[2]);
fclose($pipes[2]); public function getContent($path)
{
$mode = $this->config()->output_mode;
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
$code = $this->runShell($command, $output);
if ($code == 0) {
return $output;
}
}
// Get result public function isAvailable()
return proc_close($proc); {
} return $this->getVersion() > 0;
}
public function getContent($path) {
$mode = $this->config()->output_mode;
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
$code = $this->runShell($command, $output);
if($code == 0) return $output;
}
public function isAvailable() { public function supportsExtension($extension)
return $this->getVersion() > 0; {
} // Determine support via mime type only
return false;
}
public function supportsExtension($extension) { public function supportsMime($mime)
// Determine support via mime type only {
return false; // Get list of supported mime types
} $code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
if ($code) {
public function supportsMime($mime) { return false;
// Get list of supported mime types } // Error case
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
if($code) return false; // Error case
// Check if the mime type is inside the result
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
return (bool)preg_match($pattern, $supportedTypes);
}
// Check if the mime type is inside the result
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
return (bool)preg_match($pattern, $supportedTypes);
}
} }

View File

@ -3,92 +3,97 @@
use Guzzle\Http\Client; use Guzzle\Http\Client;
use Guzzle\Http\Exception\RequestException; use Guzzle\Http\Exception\RequestException;
class TikaRestClient extends Client { class TikaRestClient extends Client
{
/**
* Detect if the service is available
*
* @return bool
*/
public function isAvailable()
{
try {
return $this
->get()->send()
->getStatusCode() == 200;
} catch (RequestException $ex) {
return false;
}
}
/** /**
* Detect if the service is available * Get version code
* *
* @return bool * @return float
*/ */
public function isAvailable() { public function getVersion()
try { {
return $this $response = $this->get('version')->send();
->get()->send() // Parse output
->getStatusCode() == 200; if ($response->getStatusCode() == 200 &&
} catch (RequestException $ex) { preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)
return false; ) {
} return (float)$matches['version'];
} }
/** return 0.0;
* Get version code }
*
* @return float
*/
public function getVersion() {
$response = $this->get('version')->send();
// Parse output
if($response->getStatusCode() == 200 &&
preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)
) {
return (float)$matches['version'];
}
return 0.0; protected $mimes = array();
}
protected $mimes = array(); /**
* Gets supported mime data. May include aliased mime types.
*
* @return array
*/
public function getSupportedMimes()
{
if ($this->mimes) {
return $this->mimes;
}
/** $response = $this->get(
* Gets supported mime data. May include aliased mime types. 'mime-types',
* array('Accept' => 'application/json')
* @return array )->send();
*/
public function getSupportedMimes() {
if($this->mimes) return $this->mimes;
$response = $this->get( return $this->mimes = $response->json();
'mime-types', }
array('Accept' => 'application/json')
)->send();
return $this->mimes = $response->json(); /**
} * Extract text content from a given file.
* Logs a notice-level error if the document can't be parsed.
*
* @param string $file Full filesystem path to a file to post
* @return string Content of the file extracted as plain text
*/
public function tika($file)
{
$text = null;
try {
$response = $this->put(
'tika',
array('Accept' => 'text/plain'),
file_get_contents($file)
)->send();
$text = $response->getBody(true);
} catch (RequestException $e) {
$msg = sprintf(
'TikaRestClient was not able to process %s. Response: %s %s.',
$file,
$e->getResponse()->getStatusCode(),
$e->getResponse()->getReasonPhrase()
);
/** // Only available if tika-server was started with --includeStack
* Extract text content from a given file. $body = $e->getResponse()->getBody(true);
* Logs a notice-level error if the document can't be parsed. if ($body) {
* $msg .= ' Body: ' . $body;
* @param string $file Full filesystem path to a file to post }
* @return string Content of the file extracted as plain text
*/
public function tika($file) {
$text = null;
try {
$response = $this->put(
'tika',
array('Accept' => 'text/plain'),
file_get_contents($file)
)->send();
$text = $response->getBody(true);
} catch(RequestException $e) {
$msg = sprintf(
'TikaRestClient was not able to process %s. Response: %s %s.',
$file,
$e->getResponse()->getStatusCode(),
$e->getResponse()->getReasonPhrase()
);
// Only available if tika-server was started with --includeStack
$body = $e->getResponse()->getBody(true);
if($body) {
$msg .= ' Body: ' . $body;
}
SS_Log::log($msg, SS_Log::NOTICE);
}
return $text;
}
SS_Log::log($msg, SS_Log::NOTICE);
}
return $text;
}
} }

View File

@ -1,17 +1,17 @@
<?php <?php
class FileTextCacheDatabaseTest extends SapphireTest { class FileTextCacheDatabaseTest extends SapphireTest
{
public function testTruncatesByMaxLength() { public function testTruncatesByMaxLength()
Config::nest(); {
Config::nest();
Config::inst()->update('FileTextCache_Database', 'max_content_length', 5);
$cache = new FileTextCache_Database(); Config::inst()->update('FileTextCache_Database', 'max_content_length', 5);
$file = $this->getMock('File', array('write')); $cache = new FileTextCache_Database();
$content = '0123456789'; $file = $this->getMock('File', array('write'));
$cache->save($file, $content); $content = '0123456789';
$this->assertEquals($cache->load($file), '01234'); $cache->save($file, $content);
$this->assertEquals($cache->load($file), '01234');
Config::unnest(); Config::unnest();
} }
}
}

View File

@ -1,43 +1,46 @@
<?php <?php
class FileTextExtractableTest extends SapphireTest { class FileTextExtractableTest extends SapphireTest
{
protected $requiredExtensions = array(
'File' => array('FileTextExtractable')
);
protected $requiredExtensions = array( public function setUp()
'File' => array('FileTextExtractable') {
); parent::setUp();
public function setUp() { // Ensure that html is a valid extension
parent::setUp(); Config::inst()
->nest()
->update('File', 'allowed_extensions', array('html'));
}
// Ensure that html is a valid extension public function tearDown()
Config::inst() {
->nest() Config::unnest();
->update('File', 'allowed_extensions', array('html')); parent::tearDown();
} }
public function tearDown() { public function testExtractFileAsText()
Config::unnest(); {
parent::tearDown(); // Create a copy of the file, as it may be clobbered by the test
} // ($file->extractFileAsText() calls $file->write)
copy(BASE_PATH.'/textextraction/tests/fixtures/test1.html', BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
// Use HTML, since the extractor is always available
$file = new File(array(
'Name' => 'test1-copy.html',
'Filename' => 'textextraction/tests/fixtures/test1-copy.html'
));
$file->write();
$content = $file->extractFileAsText();
$this->assertContains('Test Headline', $content);
$this->assertContains('Test Text', $content);
$this->assertEquals($content, $file->FileContentCache);
function testExtractFileAsText() { if (file_exists(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html')) {
// Create a copy of the file, as it may be clobbered by the test unlink(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
// ($file->extractFileAsText() calls $file->write) }
copy(BASE_PATH.'/textextraction/tests/fixtures/test1.html',BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html'); }
}
// Use HTML, since the extractor is always available
$file = new File(array(
'Name' => 'test1-copy.html',
'Filename' => 'textextraction/tests/fixtures/test1-copy.html'
));
$file->write();
$content = $file->extractFileAsText();
$this->assertContains('Test Headline', $content);
$this->assertContains('Test Text', $content);
$this->assertEquals($content, $file->FileContentCache);
if(file_exists(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html')) unlink(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
}
}

View File

@ -1,14 +1,14 @@
<?php <?php
class HTMLTextExtractorTest extends SapphireTest { class HTMLTextExtractorTest extends SapphireTest
{
function testExtraction() { public function testExtraction()
$extractor = new HTMLTextExtractor(); {
$extractor = new HTMLTextExtractor();
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html'); $content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html');
$this->assertContains('Test Headline', $content); $this->assertContains('Test Headline', $content);
$this->assertNotContains('Test Comment', $content, 'Strips HTML comments'); $this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
$this->assertNotContains('Test Style', $content, 'Strips non-content style tags'); $this->assertNotContains('Test Style', $content, 'Strips non-content style tags');
$this->assertNotContains('Test Script', $content, 'Strips non-content script tags'); $this->assertNotContains('Test Script', $content, 'Strips non-content script tags');
} }
}
}

View File

@ -1,12 +1,14 @@
<?php <?php
class PDFTextExtractorTest extends SapphireTest { class PDFTextExtractorTest extends SapphireTest
{
function testExtraction() { public function testExtraction()
$extractor = new PDFTextExtractor(); {
if(!$extractor->isAvailable()) $this->markTestSkipped('pdftotext not available'); $extractor = new PDFTextExtractor();
if (!$extractor->isAvailable()) {
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'); $this->markTestSkipped('pdftotext not available');
$this->assertContains('This is a test file with a link', $content); }
}
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf');
$this->assertContains('This is a test file with a link', $content);
}
} }

View File

@ -3,36 +3,41 @@
/** /**
* Tests the {@see TikaTextExtractor} class * Tests the {@see TikaTextExtractor} class
*/ */
class TikaTextExtractorTest extends SapphireTest { class TikaTextExtractorTest extends SapphireTest
{
function testExtraction() { public function testExtraction()
$extractor = new TikaTextExtractor(); {
if(!$extractor->isAvailable()) $this->markTestSkipped('tika cli not available'); $extractor = new TikaTextExtractor();
if (!$extractor->isAvailable()) {
$this->markTestSkipped('tika cli not available');
}
// Check file // Check file
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'; $file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
$content = $extractor->getContent($file); $content = $extractor->getContent($file);
$this->assertContains('This is a test file with a link', $content); $this->assertContains('This is a test file with a link', $content);
// Check mime validation // Check mime validation
$this->assertTrue($extractor->supportsMime('application/pdf')); $this->assertTrue($extractor->supportsMime('application/pdf'));
$this->assertTrue($extractor->supportsMime('text/html')); $this->assertTrue($extractor->supportsMime('text/html'));
$this->assertFalse($extractor->supportsMime('application/not-supported')); $this->assertFalse($extractor->supportsMime('application/not-supported'));
} }
function testServerExtraction() { public function testServerExtraction()
$extractor = new TikaServerTextExtractor(); {
if(!$extractor->isAvailable()) $this->markTestSkipped('tika server not available'); $extractor = new TikaServerTextExtractor();
if (!$extractor->isAvailable()) {
$this->markTestSkipped('tika server not available');
}
// Check file // Check file
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'; $file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
$content = $extractor->getContent($file); $content = $extractor->getContent($file);
$this->assertContains('This is a test file with a link', $content); $this->assertContains('This is a test file with a link', $content);
// Check mime validation
$this->assertTrue($extractor->supportsMime('application/pdf'));
$this->assertTrue($extractor->supportsMime('text/html'));
$this->assertFalse($extractor->supportsMime('application/not-supported'));
}
// Check mime validation
$this->assertTrue($extractor->supportsMime('application/pdf'));
$this->assertTrue($extractor->supportsMime('text/html'));
$this->assertFalse($extractor->supportsMime('application/not-supported'));
}
} }