diff --git a/code/extensions/FileTextCache.php b/code/extensions/FileTextCache.php index cd1d046..385d848 100644 --- a/code/extensions/FileTextCache.php +++ b/code/extensions/FileTextCache.php @@ -1,105 +1,112 @@ FileContentCache; - } +class FileTextCache_Database implements FileTextCache +{ + public function load(File $file) + { + return $file->FileContentCache; + } - public function save(File $file, $content) { - $maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length'); - $file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content; - $file->write(); - } - - public function invalidate(File $file) { - // To prevent writing to the cache from invalidating it - if(!$file->isChanged('FileContentCache')) { - $file->FileContentCache = ''; - } - } + public function save(File $file, $content) + { + $maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length'); + $file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content; + $file->write(); + } + public function invalidate(File $file) + { + // To prevent writing to the cache from invalidating it + if (!$file->isChanged('FileContentCache')) { + $file->FileContentCache = ''; + } + } } /** * Uses SS_Cache with a lifetime to cache extracted content */ -class FileTextCache_SSCache implements FileTextCache, Flushable { +class FileTextCache_SSCache implements FileTextCache, Flushable +{ + /** + * Lifetime of cache in seconds + * Null is indefinite + * + * @var int|null + * @config + */ + private static $lifetime = null; - /** - * Lifetime of cache in seconds - * Null is indefinite - * - * @var int|null - * @config - */ - private static $lifetime = null; + /** + * @return SS_Cache + */ + protected static function get_cache() + { + $lifetime = Config::inst()->get(__CLASS__, 'lifetime'); + $cache = SS_Cache::factory(__CLASS__); + $cache->setLifetime($lifetime); + return $cache; + } - /** - * @return SS_Cache - */ - protected static function get_cache() { - $lifetime = Config::inst()->get(__CLASS__, 'lifetime'); - $cache = SS_Cache::factory(__CLASS__); - $cache->setLifetime($lifetime); - return $cache; - } + protected function getKey(File $file) + { + return md5($file->getFullPath()); + } - protected function getKey(File $file) { - return md5($file->getFullPath()); - } + public function load(File $file) + { + $key = $this->getKey($file); + $cache = self::get_cache(); + return $cache->load($key); + } - public function load(File $file) { - $key = $this->getKey($file); - $cache = self::get_cache(); - return $cache->load($key); - } + public function save(File $file, $content) + { + $key = $this->getKey($file); + $cache = self::get_cache(); + return $cache->save($content, $key); + } - public function save(File $file, $content) { - $key = $this->getKey($file); - $cache = self::get_cache(); - return $cache->save($content, $key); - } - - public static function flush() { - $cache = self::get_cache(); - $cache->clean(); - } - - public function invalidate(File $file) { - $key = $this->getKey($file); - $cache = self::get_cache(); - return $cache->remove($key); - } + public static function flush() + { + $cache = self::get_cache(); + $cache->clean(); + } + public function invalidate(File $file) + { + $key = $this->getKey($file); + $cache = self::get_cache(); + return $cache->remove($key); + } } diff --git a/code/extensions/FileTextExtractable.php b/code/extensions/FileTextExtractable.php index 16110e3..f39d407 100644 --- a/code/extensions/FileTextExtractable.php +++ b/code/extensions/FileTextExtractable.php @@ -9,83 +9,88 @@ * @author mstephens * */ -class FileTextExtractable extends DataExtension { - - private static $db = array( - 'FileContentCache' => 'Text' - ); +class FileTextExtractable extends DataExtension +{ + private static $db = array( + 'FileContentCache' => 'Text' + ); - private static $casting = array( - 'FileContent' => 'Text' - ); + private static $casting = array( + 'FileContent' => 'Text' + ); - private static $dependencies = array( - 'TextCache' => '%$FileTextCache' - ); + private static $dependencies = array( + 'TextCache' => '%$FileTextCache' + ); - /** - * @var FileTextCache - */ - protected $fileTextCache = null; + /** + * @var FileTextCache + */ + protected $fileTextCache = null; - /** - * - * @param FileTextCache $cache - */ - public function setTextCache(FileTextCache $cache) { - $this->fileTextCache = $cache; - } + /** + * + * @param FileTextCache $cache + */ + public function setTextCache(FileTextCache $cache) + { + $this->fileTextCache = $cache; + } - /** - * @return FileTextCache - */ - public function getTextCache() { - return $this->fileTextCache; - } + /** + * @return FileTextCache + */ + public function getTextCache() + { + return $this->fileTextCache; + } - /** - * Helper function for template - * - * @return string - */ - public function getFileContent() { - return $this->extractFileAsText(); - } + /** + * Helper function for template + * + * @return string + */ + public function getFileContent() + { + return $this->extractFileAsText(); + } - /** - * Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text. - * The value is also cached into the File record itself. - * - * @param boolean $disableCache If false, the file content is only parsed on demand. - * If true, the content parsing is forced, bypassing the cached version - * @return string - */ - public function extractFileAsText($disableCache = false) { - if (!$disableCache) { - $text = $this->getTextCache()->load($this->owner); - if($text) { - return $text; - } - } + /** + * Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text. + * The value is also cached into the File record itself. + * + * @param boolean $disableCache If false, the file content is only parsed on demand. + * If true, the content parsing is forced, bypassing the cached version + * @return string + */ + public function extractFileAsText($disableCache = false) + { + if (!$disableCache) { + $text = $this->getTextCache()->load($this->owner); + if ($text) { + return $text; + } + } - // Determine which extractor can process this file. - $extractor = FileTextExtractor::for_file($this->owner->FullPath); - if (!$extractor) { - return null; - } + // Determine which extractor can process this file. + $extractor = FileTextExtractor::for_file($this->owner->FullPath); + if (!$extractor) { + return null; + } - $text = $extractor->getContent($this->owner->FullPath); - if (!$text) { - return null; - } + $text = $extractor->getContent($this->owner->FullPath); + if (!$text) { + return null; + } - $this->getTextCache()->save($this->owner, $text); + $this->getTextCache()->save($this->owner, $text); - return $text; - } + return $text; + } - public function onBeforeWrite() { - // Clear cache before changing file - $this->getTextCache()->invalidate($this->owner); - } + public function onBeforeWrite() + { + // Clear cache before changing file + $this->getTextCache()->invalidate($this->owner); + } } diff --git a/code/extractors/FileTextExtractor.php b/code/extractors/FileTextExtractor.php index b22e98e..cc7c176 100644 --- a/code/extractors/FileTextExtractor.php +++ b/code/extractors/FileTextExtractor.php @@ -5,131 +5,141 @@ * @author mstephens * */ -abstract class FileTextExtractor extends Object { +abstract class FileTextExtractor extends Object +{ + /** + * Set priority from 0-100. + * The highest priority extractor for a given content type will be selected. + * + * @config + * @var integer + */ + private static $priority = 50; - /** - * Set priority from 0-100. - * The highest priority extractor for a given content type will be selected. - * - * @config - * @var integer - */ - private static $priority = 50; + /** + * Cache of extractor class names, sorted by priority + * + * @var array + */ + protected static $sorted_extractor_classes = null; - /** - * Cache of extractor class names, sorted by priority - * - * @var array - */ - protected static $sorted_extractor_classes = null; + /** + * Gets the list of prioritised extractor classes + * + * @return array + */ + protected static function get_extractor_classes() + { + // Check cache + if (self::$sorted_extractor_classes) { + return self::$sorted_extractor_classes; + } + + // Generate the sorted list of extractors on demand. + $classes = ClassInfo::subclassesFor("FileTextExtractor"); + array_shift($classes); + $classPriorities = array(); + foreach ($classes as $class) { + $classPriorities[$class] = Config::inst()->get($class, 'priority'); + } + arsort($classPriorities); - /** - * Gets the list of prioritised extractor classes - * - * @return array - */ - protected static function get_extractor_classes() { - // Check cache - if (self::$sorted_extractor_classes) return self::$sorted_extractor_classes; - - // Generate the sorted list of extractors on demand. - $classes = ClassInfo::subclassesFor("FileTextExtractor"); - array_shift($classes); - $classPriorities = array(); - foreach($classes as $class) { - $classPriorities[$class] = Config::inst()->get($class, 'priority'); - } - arsort($classPriorities); + // Save classes + $sortedClasses = array_keys($classPriorities); + return self::$sorted_extractor_classes = $sortedClasses; + } - // Save classes - $sortedClasses = array_keys($classPriorities); - return self::$sorted_extractor_classes = $sortedClasses; - } + /** + * Get the text file extractor for the given class + * + * @param string $class + * @return FileTextExtractor + */ + protected static function get_extractor($class) + { + return Injector::inst()->get($class); + } - /** - * Get the text file extractor for the given class - * - * @param string $class - * @return FileTextExtractor - */ - protected static function get_extractor($class) { - return Injector::inst()->get($class); - } + /** + * Attempt to detect mime type for given file + * + * @param string $path + * @return string Mime type if found + */ + protected static function get_mime($path) + { + $file = new Symfony\Component\HttpFoundation\File\File($path); - /** - * Attempt to detect mime type for given file - * - * @param string $path - * @return string Mime type if found - */ - protected static function get_mime($path) { - $file = new Symfony\Component\HttpFoundation\File\File($path); + return $file->getMimeType(); + } - return $file->getMimeType(); - } + /** + * @param string $path + * @return FileTextExtractor|null + */ + public static function for_file($path) + { + if (!file_exists($path) || is_dir($path)) { + return; + } - /** - * @param string $path - * @return FileTextExtractor|null - */ - static function for_file($path) { - if(!file_exists($path) || is_dir($path)) { - return; - } + $extension = pathinfo($path, PATHINFO_EXTENSION); + $mime = self::get_mime($path); + foreach (self::get_extractor_classes() as $className) { + $extractor = self::get_extractor($className); - $extension = pathinfo($path, PATHINFO_EXTENSION); - $mime = self::get_mime($path); - foreach(self::get_extractor_classes() as $className) { - $extractor = self::get_extractor($className); + // Skip unavailable extractors + if (!$extractor->isAvailable()) { + continue; + } - // Skip unavailable extractors - if(!$extractor->isAvailable()) continue; + // Check extension + if ($extension && $extractor->supportsExtension($extension)) { + return $extractor; + } - // Check extension - if($extension && $extractor->supportsExtension($extension)) { - return $extractor; - } + // Check mime + if ($mime && $extractor->supportsMime($mime)) { + return $extractor; + } + } + } - // Check mime - if($mime && $extractor->supportsMime($mime)) { - return $extractor; - } - } - } + /** + * Checks if the extractor is supported on the current environment, + * for example if the correct binaries or libraries are available. + * + * @return boolean + */ + abstract public function isAvailable(); - /** - * Checks if the extractor is supported on the current environment, - * for example if the correct binaries or libraries are available. - * - * @return boolean - */ - abstract public function isAvailable(); + /** + * Determine if this extractor supports the given extension. + * If support is determined by mime/type only, then this should return false. + * + * @param string $extension + * @return boolean + */ + abstract public function supportsExtension($extension); - /** - * Determine if this extractor supports the given extension. - * If support is determined by mime/type only, then this should return false. - * - * @param string $extension - * @return boolean - */ - abstract public function supportsExtension($extension); + /** + * Determine if this extractor suports the given mime type. + * Will only be called if supportsExtension returns false. + * + * @param string $mime + * @return boolean + */ + abstract public function supportsMime($mime); - /** - * Determine if this extractor suports the given mime type. - * Will only be called if supportsExtension returns false. - * - * @param string $mime - * @return boolean - */ - abstract public function supportsMime($mime); - - /** - * Given a file path, extract the contents as text. - * - * @param string $path - * @return string - */ - abstract public function getContent($path); + /** + * Given a file path, extract the contents as text. + * + * @param string $path + * @return string + */ + abstract public function getContent($path); } -class FileTextExtractor_Exception extends Exception {} \ No newline at end of file +class FileTextExtractor_Exception extends Exception +{ +} diff --git a/code/extractors/HTMLTextExtractor.php b/code/extractors/HTMLTextExtractor.php index a41fe23..810f473 100644 --- a/code/extractors/HTMLTextExtractor.php +++ b/code/extractors/HTMLTextExtractor.php @@ -5,69 +5,73 @@ * @author mstephens * */ -class HTMLTextExtractor extends FileTextExtractor { - - public function isAvailable() { - return true; - } +class HTMLTextExtractor extends FileTextExtractor +{ + public function isAvailable() + { + return true; + } - public function supportsExtension($extension) { - return in_array( - strtolower($extension), - array("html", "htm", "xhtml") - ); - } + public function supportsExtension($extension) + { + return in_array( + strtolower($extension), + array("html", "htm", "xhtml") + ); + } - public function supportsMime($mime) { - return strtolower($mime) === 'text/html'; - } + public function supportsMime($mime) + { + return strtolower($mime) === 'text/html'; + } - /** - * Lower priority because its not the most clever HTML extraction. If there is something better, use it - * - * @config - * @var integer - */ - private static $priority = 10; + /** + * Lower priority because its not the most clever HTML extraction. If there is something better, use it + * + * @config + * @var integer + */ + private static $priority = 10; - /** - * Extracts content from regex, by using strip_tags() - * combined with regular expressions to remove non-content tags like @siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - // Add line breaks before and after blocks - '@ or @siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + // Add line breaks before and after blocks + '@bin('pdftotext'); + return (file_exists($bin) && is_executable($bin)); + } + + public function supportsExtension($extension) + { + return strtolower($extension) === 'pdf'; + } - public function isAvailable() { - $bin = $this->bin('pdftotext'); - return (file_exists($bin) && is_executable($bin)); - } - - public function supportsExtension($extension) { - return strtolower($extension) === 'pdf'; - } + public function supportsMime($mime) + { + return in_array( + strtolower($mime), + array( + 'application/pdf', + 'application/x-pdf', + 'application/x-bzpdf', + 'application/x-gzpdf' + ) + ); + } - public function supportsMime($mime) { - return in_array( - strtolower($mime), - array( - 'application/pdf', - 'application/x-pdf', - 'application/x-bzpdf', - 'application/x-gzpdf' - ) - ); - } + /** + * Accessor to get the location of the binary + * + * @param string $prog Name of binary + * @return string + */ + protected function bin($prog = '') + { + if ($this->config()->binary_location) { + // By config + $path = $this->config()->binary_location; + } elseif (file_exists('/usr/bin/pdftotext')) { + // By searching common directories + $path = '/usr/bin'; + } elseif (file_exists('/usr/local/bin/pdftotext')) { + $path = '/usr/local/bin'; + } else { + $path = '.'; // Hope it's in path + } - /** - * Accessor to get the location of the binary - * - * @param string $prog Name of binary - * @return string - */ - protected function bin($prog = '') { - if ($this->config()->binary_location) { - // By config - $path = $this->config()->binary_location; - } elseif (file_exists('/usr/bin/pdftotext')) { - // By searching common directories - $path = '/usr/bin'; - } elseif (file_exists('/usr/local/bin/pdftotext')) { - $path = '/usr/local/bin'; - } else { - $path = '.'; // Hope it's in path - } + return ($path ? $path . '/' : '') . $prog; + } + + public function getContent($path) + { + if (!$path) { + return ""; + } // no file + $content = $this->getRawOutput($path); + return $this->cleanupLigatures($content); + } - return ( $path ? $path . '/' : '' ) . $prog; - } - - public function getContent($path) { - if(!$path) return ""; // no file - $content = $this->getRawOutput($path); - return $this->cleanupLigatures($content); - } + /** + * Invoke pdftotext with the given path + * + * @param string $path + * @return string Output + * @throws FileTextExtractor_Exception + */ + protected function getRawOutput($path) + { + exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err); + if ($err) { + throw new FileTextExtractor_Exception(sprintf( + 'PDFTextExtractor->getContent() failed for %s: %s', + $path, + implode('', $err) + )); + } + return implode('', $content); + } - /** - * Invoke pdftotext with the given path - * - * @param string $path - * @return string Output - * @throws FileTextExtractor_Exception - */ - protected function getRawOutput($path) { - exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err); - if($err) { - throw new FileTextExtractor_Exception(sprintf( - 'PDFTextExtractor->getContent() failed for %s: %s', - $path, - implode('', $err) - )); - } - return implode('', $content); - } - - /** - * Removes utf-8 ligatures. - * - * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting - * - * @param string $input - * @return string - */ - protected function cleanupLigatures($input) { - $mapping = array( - 'ff' => 'ff', - 'fi' => 'fi', - 'fl' => 'fl', - 'ffi' => 'ffi', - 'ffl' => 'ffl', - 'ſt' => 'ft', - 'st' => 'st' - ); - return str_replace(array_keys($mapping), array_values($mapping), $input); - } + /** + * Removes utf-8 ligatures. + * + * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting + * + * @param string $input + * @return string + */ + protected function cleanupLigatures($input) + { + $mapping = array( + 'ff' => 'ff', + 'fi' => 'fi', + 'fl' => 'fl', + 'ffi' => 'ffi', + 'ffl' => 'ffl', + 'ſt' => 'ft', + 'st' => 'st' + ); + return str_replace(array_keys($mapping), array_values($mapping), $input); + } } diff --git a/code/extractors/SolrCellTextExtractor.php b/code/extractors/SolrCellTextExtractor.php index 29d26f9..6e14543 100644 --- a/code/extractors/SolrCellTextExtractor.php +++ b/code/extractors/SolrCellTextExtractor.php @@ -10,83 +10,93 @@ use Guzzle\Http\Client; * @author ischommer * @see http://wiki.apache.org/solr/ExtractingRequestHandler */ -class SolrCellTextExtractor extends FileTextExtractor { +class SolrCellTextExtractor extends FileTextExtractor +{ + /** + * Base URL to use for solr text extraction. + * E.g. http://localhost:8983/solr/update/extract + * + * @config + * @var string + */ + private static $base_url; - /** - * Base URL to use for solr text extraction. - * E.g. http://localhost:8983/solr/update/extract - * - * @config - * @var string - */ - private static $base_url; + private static $priority = 75; - private static $priority = 75; + protected $httpClient; - protected $httpClient; + public function getHttpClient() + { + if (!$this->config()->get('base_url')) { + throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified'); + } + if (!$this->httpClient) { + $this->httpClient = new Client($this->config()->get('base_url')); + } + return $this->httpClient; + } - public function getHttpClient() { - if(!$this->config()->get('base_url')) { - throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified'); - } - if(!$this->httpClient) $this->httpClient = new Client($this->config()->get('base_url')); - return $this->httpClient; - } + public function setHttpClient($client) + { + $this->httpClient = $client; + } - public function setHttpClient($client) { - $this->httpClient = $client; - } + public function isAvailable() + { + $url = $this->config()->get('base_url'); + return (boolean) $url; + } - public function isAvailable() { - $url = $this->config()->get('base_url'); - return (boolean) $url; - } + public function supportsExtension($extension) + { + return in_array( + strtolower($extension), + array( + 'pdf', 'doc', 'docx', 'xls', 'xlsx', + 'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods', + 'ppt', 'pptx', 'odp', 'fodp', 'csv' + ) + ); + } - public function supportsExtension($extension) { - return in_array( - strtolower($extension), - array( - 'pdf', 'doc', 'docx', 'xls', 'xlsx', - 'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods', - 'ppt', 'pptx', 'odp', 'fodp', 'csv' - ) - ); - } + public function supportsMime($mime) + { + // Rely on supportsExtension + return false; + } + + public function getContent($path) + { + if (!$path) { + return ""; + } // no file - public function supportsMime($mime) { - // Rely on supportsExtension - return false; - } - - public function getContent($path) { - if (!$path) return ""; // no file - - $fileName = basename($path); - $client = $this->getHttpClient(); - try { - $request = $client - ->post() - ->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text')) - ->addPostFiles(array('myfile' => $path)); - $response = $request->send(); - } catch(InvalidArgumentException $e) { - SS_Log::log( - sprintf( - 'Error extracting text from "%s" (message: %s)', - $path, - $e->getMessage() - ), - SS_Log::NOTICE - ); - return null; - } - // Use preg match to avoid SimpleXML running out of memory on large text nodes - preg_match( - sprintf('/\(.*?)\<\/str\>/s', preg_quote($fileName)), - (string)$response->getBody(), - $matches - ); + $fileName = basename($path); + $client = $this->getHttpClient(); + try { + $request = $client + ->post() + ->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text')) + ->addPostFiles(array('myfile' => $path)); + $response = $request->send(); + } catch (InvalidArgumentException $e) { + SS_Log::log( + sprintf( + 'Error extracting text from "%s" (message: %s)', + $path, + $e->getMessage() + ), + SS_Log::NOTICE + ); + return null; + } + // Use preg match to avoid SimpleXML running out of memory on large text nodes + preg_match( + sprintf('/\(.*?)\<\/str\>/s', preg_quote($fileName)), + (string)$response->getBody(), + $matches + ); - return $matches ? $matches[1] : null; - } + return $matches ? $matches[1] : null; + } } diff --git a/code/extractors/TikaServerTextExtractor.php b/code/extractors/TikaServerTextExtractor.php index b4a74e5..5371206 100644 --- a/code/extractors/TikaServerTextExtractor.php +++ b/code/extractors/TikaServerTextExtractor.php @@ -5,100 +5,112 @@ * * {@link http://tika.apache.org/1.7/gettingstarted.html} */ -class TikaServerTextExtractor extends FileTextExtractor { +class TikaServerTextExtractor extends FileTextExtractor +{ + /** + * Tika server is pretty efficient so use it immediately if available + * + * @var integer + * @config + */ + private static $priority = 80; - /** - * Tika server is pretty efficient so use it immediately if available - * - * @var integer - * @config - */ - private static $priority = 80; + /** + * Server endpoint + * + * @var string + * @config + */ + private static $server_endpoint; - /** - * Server endpoint - * - * @var string - * @config - */ - private static $server_endpoint; + /** + * @var TikaRestClient + */ + protected $client = null; - /** - * @var TikaRestClient - */ - protected $client = null; + /** + * @return TikaRestClient + */ + public function getClient() + { + return $this->client ?: + ($this->client = + Injector::inst()->createWithArgs( + 'TikaRestClient', + array($this->getServerEndpoint()) + ) + ); + } - /** - * @return TikaRestClient - */ - public function getClient() { - return $this->client ?: - ($this->client = - Injector::inst()->createWithArgs( - 'TikaRestClient', - array($this->getServerEndpoint()) - ) - ); - } + public function getServerEndpoint() + { + if (defined('SS_TIKA_ENDPOINT')) { + return SS_TIKA_ENDPOINT; + } - public function getServerEndpoint() { - if(defined('SS_TIKA_ENDPOINT')) { - return SS_TIKA_ENDPOINT; - } + if (getenv('SS_TIKA_ENDPOINT')) { + return getenv('SS_TIKA_ENDPOINT'); + } - if(getenv('SS_TIKA_ENDPOINT')) return getenv('SS_TIKA_ENDPOINT'); + // Default to configured endpoint + return $this->config()->server_endpoint; + } - // Default to configured endpoint - return $this->config()->server_endpoint; - } + /** + * Get the version of tika installed, or 0 if not installed + * + * @return float version of tika + */ + public function getVersion() + { + return $this + ->getClient() + ->getVersion(); + } - /** - * Get the version of tika installed, or 0 if not installed - * - * @return float version of tika - */ - public function getVersion() { - return $this - ->getClient() - ->getVersion(); - } + public function isAvailable() + { + return $this->getServerEndpoint() && + $this->getClient()->isAvailable() && + $this->getVersion() >= 1.7; + } - public function isAvailable() { - return $this->getServerEndpoint() && - $this->getClient()->isAvailable() && - $this->getVersion() >= 1.7; - } - - public function supportsExtension($extension) { - // Determine support via mime type only - return false; - } + public function supportsExtension($extension) + { + // Determine support via mime type only + return false; + } - /** - * Cache of supported mime types - * - * @var array - */ - protected $supportedMimes = array(); + /** + * Cache of supported mime types + * + * @var array + */ + protected $supportedMimes = array(); - public function supportsMime($mime) { - $supported = $this->supportedMimes ?: - ($this->supportedMimes = $this->getClient()->getSupportedMimes()); + public function supportsMime($mime) + { + $supported = $this->supportedMimes ?: + ($this->supportedMimes = $this->getClient()->getSupportedMimes()); - // Check if supported (most common / quickest lookup) - if(isset($supported[$mime])) return true; + // Check if supported (most common / quickest lookup) + if (isset($supported[$mime])) { + return true; + } - // Check aliases - foreach($supported as $info) { - if(isset($info['alias']) && in_array($mime, $info['alias'])) return true; - } + // Check aliases + foreach ($supported as $info) { + if (isset($info['alias']) && in_array($mime, $info['alias'])) { + return true; + } + } - return false; - } - - public function getContent($path) { - return $this->getClient()->tika($path); - } + return false; + } + public function getContent($path) + { + return $this->getClient()->tika($path); + } } diff --git a/code/extractors/TikaTextExtractor.php b/code/extractors/TikaTextExtractor.php index 871bcca..0150058 100644 --- a/code/extractors/TikaTextExtractor.php +++ b/code/extractors/TikaTextExtractor.php @@ -5,90 +5,101 @@ * * {@link http://tika.apache.org/1.7/gettingstarted.html} */ -class TikaTextExtractor extends FileTextExtractor { +class TikaTextExtractor extends FileTextExtractor +{ + /** + * Text extraction mode. Defaults to -t (plain text) + * + * @var string + * @config + */ + private static $output_mode = '-t'; - /** - * Text extraction mode. Defaults to -t (plain text) - * - * @var string - * @config - */ - private static $output_mode = '-t'; + /** + * Get the version of tika installed, or 0 if not installed + * + * @return float version of tika + */ + public function getVersion() + { + $code = $this->runShell('tika --version', $stdout); - /** - * Get the version of tika installed, or 0 if not installed - * - * @return float version of tika - */ - public function getVersion() { - $code = $this->runShell('tika --version', $stdout); + // Parse output + if (!$code && preg_match('/Apache Tika (?[\.\d]+)/', $stdout, $matches)) { + return $matches['version']; + } - // Parse output - if(!$code && preg_match('/Apache Tika (?[\.\d]+)/', $stdout, $matches)) { - return $matches['version']; - } + return 0; + } - return 0; - } + /** + * Runs an arbitrary and safely escaped shell command + * + * @param string $command Full command including arguments + * @param string &$stdout Standand output + * @param string &$stderr Standard error + * @param string $input Content to pass via standard input + * @return int Exit code. 0 is success + */ + protected function runShell($command, &$stdout = '', &$stderr = '', $input = '') + { + $descriptorSpecs = array( + 0 => array("pipe", "r"), + 1 => array("pipe", "w"), + 2 => array("pipe", "w") + ); + // Invoke command + $pipes = array(); + $proc = proc_open($command, $descriptorSpecs, $pipes); + if (!is_resource($proc)) { + return 255; + } - /** - * Runs an arbitrary and safely escaped shell command - * - * @param string $command Full command including arguments - * @param string &$stdout Standand output - * @param string &$stderr Standard error - * @param string $input Content to pass via standard input - * @return int Exit code. 0 is success - */ - protected function runShell($command, &$stdout = '', &$stderr = '', $input = '') { - $descriptorSpecs = array( - 0 => array("pipe", "r"), - 1 => array("pipe", "w"), - 2 => array("pipe", "w") - ); - // Invoke command - $pipes = array(); - $proc = proc_open($command, $descriptorSpecs, $pipes); - if (!is_resource($proc)) return 255; + // Send content as input + fwrite($pipes[0], $input); + fclose($pipes[0]); - // Send content as input - fwrite($pipes[0], $input); - fclose($pipes[0]); + // Get output + $stdout = stream_get_contents($pipes[1]); + fclose($pipes[1]); + $stderr = stream_get_contents($pipes[2]); + fclose($pipes[2]); - // Get output - $stdout = stream_get_contents($pipes[1]); - fclose($pipes[1]); - $stderr = stream_get_contents($pipes[2]); - fclose($pipes[2]); + // Get result + return proc_close($proc); + } + + public function getContent($path) + { + $mode = $this->config()->output_mode; + $command = sprintf('tika %s %s', $mode, escapeshellarg($path)); + $code = $this->runShell($command, $output); + if ($code == 0) { + return $output; + } + } - // Get result - return proc_close($proc); - } - - public function getContent($path) { - $mode = $this->config()->output_mode; - $command = sprintf('tika %s %s', $mode, escapeshellarg($path)); - $code = $this->runShell($command, $output); - if($code == 0) return $output; - } + public function isAvailable() + { + return $this->getVersion() > 0; + } - public function isAvailable() { - return $this->getVersion() > 0; - } + public function supportsExtension($extension) + { + // Determine support via mime type only + return false; + } - public function supportsExtension($extension) { - // Determine support via mime type only - return false; - } - - public function supportsMime($mime) { - // Get list of supported mime types - $code = $this->runShell('tika --list-supported-types', $supportedTypes, $error); - if($code) return false; // Error case - - // Check if the mime type is inside the result - $pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/')); - return (bool)preg_match($pattern, $supportedTypes); - } + public function supportsMime($mime) + { + // Get list of supported mime types + $code = $this->runShell('tika --list-supported-types', $supportedTypes, $error); + if ($code) { + return false; + } // Error case + // Check if the mime type is inside the result + $pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/')); + return (bool)preg_match($pattern, $supportedTypes); + } } diff --git a/code/tika/TikaRestClient.php b/code/tika/TikaRestClient.php index 6773c32..f5561af 100644 --- a/code/tika/TikaRestClient.php +++ b/code/tika/TikaRestClient.php @@ -3,92 +3,97 @@ use Guzzle\Http\Client; use Guzzle\Http\Exception\RequestException; -class TikaRestClient extends Client { +class TikaRestClient extends Client +{ + /** + * Detect if the service is available + * + * @return bool + */ + public function isAvailable() + { + try { + return $this + ->get()->send() + ->getStatusCode() == 200; + } catch (RequestException $ex) { + return false; + } + } - /** - * Detect if the service is available - * - * @return bool - */ - public function isAvailable() { - try { - return $this - ->get()->send() - ->getStatusCode() == 200; - } catch (RequestException $ex) { - return false; - } - } + /** + * Get version code + * + * @return float + */ + public function getVersion() + { + $response = $this->get('version')->send(); + // Parse output + if ($response->getStatusCode() == 200 && + preg_match('/Apache Tika (?[\.\d]+)/', $response->getBody(), $matches) + ) { + return (float)$matches['version']; + } - /** - * Get version code - * - * @return float - */ - public function getVersion() { - $response = $this->get('version')->send(); - // Parse output - if($response->getStatusCode() == 200 && - preg_match('/Apache Tika (?[\.\d]+)/', $response->getBody(), $matches) - ) { - return (float)$matches['version']; - } + return 0.0; + } - return 0.0; - } + protected $mimes = array(); - protected $mimes = array(); + /** + * Gets supported mime data. May include aliased mime types. + * + * @return array + */ + public function getSupportedMimes() + { + if ($this->mimes) { + return $this->mimes; + } - /** - * Gets supported mime data. May include aliased mime types. - * - * @return array - */ - public function getSupportedMimes() { - if($this->mimes) return $this->mimes; + $response = $this->get( + 'mime-types', + array('Accept' => 'application/json') + )->send(); - $response = $this->get( - 'mime-types', - array('Accept' => 'application/json') - )->send(); + return $this->mimes = $response->json(); + } - return $this->mimes = $response->json(); - } + /** + * Extract text content from a given file. + * Logs a notice-level error if the document can't be parsed. + * + * @param string $file Full filesystem path to a file to post + * @return string Content of the file extracted as plain text + */ + public function tika($file) + { + $text = null; + try { + $response = $this->put( + 'tika', + array('Accept' => 'text/plain'), + file_get_contents($file) + )->send(); + $text = $response->getBody(true); + } catch (RequestException $e) { + $msg = sprintf( + 'TikaRestClient was not able to process %s. Response: %s %s.', + $file, + $e->getResponse()->getStatusCode(), + $e->getResponse()->getReasonPhrase() + ); - /** - * Extract text content from a given file. - * Logs a notice-level error if the document can't be parsed. - * - * @param string $file Full filesystem path to a file to post - * @return string Content of the file extracted as plain text - */ - public function tika($file) { - $text = null; - try { - $response = $this->put( - 'tika', - array('Accept' => 'text/plain'), - file_get_contents($file) - )->send(); - $text = $response->getBody(true); - } catch(RequestException $e) { - $msg = sprintf( - 'TikaRestClient was not able to process %s. Response: %s %s.', - $file, - $e->getResponse()->getStatusCode(), - $e->getResponse()->getReasonPhrase() - ); - - // Only available if tika-server was started with --includeStack - $body = $e->getResponse()->getBody(true); - if($body) { - $msg .= ' Body: ' . $body; - } - - SS_Log::log($msg, SS_Log::NOTICE); - } - - return $text; - } + // Only available if tika-server was started with --includeStack + $body = $e->getResponse()->getBody(true); + if ($body) { + $msg .= ' Body: ' . $body; + } + SS_Log::log($msg, SS_Log::NOTICE); + } + + return $text; + } } diff --git a/tests/FileTextCacheDatabaseTest.php b/tests/FileTextCacheDatabaseTest.php index d0caf60..6b8d784 100644 --- a/tests/FileTextCacheDatabaseTest.php +++ b/tests/FileTextCacheDatabaseTest.php @@ -1,17 +1,17 @@ update('FileTextCache_Database', 'max_content_length', 5); - $cache = new FileTextCache_Database(); - $file = $this->getMock('File', array('write')); - $content = '0123456789'; - $cache->save($file, $content); - $this->assertEquals($cache->load($file), '01234'); +class FileTextCacheDatabaseTest extends SapphireTest +{ + public function testTruncatesByMaxLength() + { + Config::nest(); + + Config::inst()->update('FileTextCache_Database', 'max_content_length', 5); + $cache = new FileTextCache_Database(); + $file = $this->getMock('File', array('write')); + $content = '0123456789'; + $cache->save($file, $content); + $this->assertEquals($cache->load($file), '01234'); - Config::unnest(); - } - -} \ No newline at end of file + Config::unnest(); + } +} diff --git a/tests/FileTextExtractableTest.php b/tests/FileTextExtractableTest.php index 6c2d788..166b1ee 100644 --- a/tests/FileTextExtractableTest.php +++ b/tests/FileTextExtractableTest.php @@ -1,43 +1,46 @@ array('FileTextExtractable') + ); - protected $requiredExtensions = array( - 'File' => array('FileTextExtractable') - ); + public function setUp() + { + parent::setUp(); - public function setUp() { - parent::setUp(); + // Ensure that html is a valid extension + Config::inst() + ->nest() + ->update('File', 'allowed_extensions', array('html')); + } - // Ensure that html is a valid extension - Config::inst() - ->nest() - ->update('File', 'allowed_extensions', array('html')); - } + public function tearDown() + { + Config::unnest(); + parent::tearDown(); + } - public function tearDown() { - Config::unnest(); - parent::tearDown(); - } + public function testExtractFileAsText() + { + // Create a copy of the file, as it may be clobbered by the test + // ($file->extractFileAsText() calls $file->write) + copy(BASE_PATH.'/textextraction/tests/fixtures/test1.html', BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html'); + + // Use HTML, since the extractor is always available + $file = new File(array( + 'Name' => 'test1-copy.html', + 'Filename' => 'textextraction/tests/fixtures/test1-copy.html' + )); + $file->write(); + + $content = $file->extractFileAsText(); + $this->assertContains('Test Headline', $content); + $this->assertContains('Test Text', $content); + $this->assertEquals($content, $file->FileContentCache); - function testExtractFileAsText() { - // Create a copy of the file, as it may be clobbered by the test - // ($file->extractFileAsText() calls $file->write) - copy(BASE_PATH.'/textextraction/tests/fixtures/test1.html',BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html'); - - // Use HTML, since the extractor is always available - $file = new File(array( - 'Name' => 'test1-copy.html', - 'Filename' => 'textextraction/tests/fixtures/test1-copy.html' - )); - $file->write(); - - $content = $file->extractFileAsText(); - $this->assertContains('Test Headline', $content); - $this->assertContains('Test Text', $content); - $this->assertEquals($content, $file->FileContentCache); - - if(file_exists(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html')) unlink(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html'); - } - - -} \ No newline at end of file + if (file_exists(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html')) { + unlink(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html'); + } + } +} diff --git a/tests/HTMLTextExtractorTest.php b/tests/HTMLTextExtractorTest.php index a1f2429..8ff8b0b 100644 --- a/tests/HTMLTextExtractorTest.php +++ b/tests/HTMLTextExtractorTest.php @@ -1,14 +1,14 @@ getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html'); - $this->assertContains('Test Headline', $content); - $this->assertNotContains('Test Comment', $content, 'Strips HTML comments'); - $this->assertNotContains('Test Style', $content, 'Strips non-content style tags'); - $this->assertNotContains('Test Script', $content, 'Strips non-content script tags'); - } - -} \ No newline at end of file + $content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html'); + $this->assertContains('Test Headline', $content); + $this->assertNotContains('Test Comment', $content, 'Strips HTML comments'); + $this->assertNotContains('Test Style', $content, 'Strips non-content style tags'); + $this->assertNotContains('Test Script', $content, 'Strips non-content script tags'); + } +} diff --git a/tests/PDFTextExtractorTest.php b/tests/PDFTextExtractorTest.php index 21b0a73..b99ff06 100644 --- a/tests/PDFTextExtractorTest.php +++ b/tests/PDFTextExtractorTest.php @@ -1,12 +1,14 @@ isAvailable()) $this->markTestSkipped('pdftotext not available'); - - $content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'); - $this->assertContains('This is a test file with a link', $content); - } +class PDFTextExtractorTest extends SapphireTest +{ + public function testExtraction() + { + $extractor = new PDFTextExtractor(); + if (!$extractor->isAvailable()) { + $this->markTestSkipped('pdftotext not available'); + } + $content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'); + $this->assertContains('This is a test file with a link', $content); + } } diff --git a/tests/TikaTextExtractorTest.php b/tests/TikaTextExtractorTest.php index a35c636..0342dcf 100644 --- a/tests/TikaTextExtractorTest.php +++ b/tests/TikaTextExtractorTest.php @@ -3,36 +3,41 @@ /** * Tests the {@see TikaTextExtractor} class */ -class TikaTextExtractorTest extends SapphireTest { - - function testExtraction() { - $extractor = new TikaTextExtractor(); - if(!$extractor->isAvailable()) $this->markTestSkipped('tika cli not available'); +class TikaTextExtractorTest extends SapphireTest +{ + public function testExtraction() + { + $extractor = new TikaTextExtractor(); + if (!$extractor->isAvailable()) { + $this->markTestSkipped('tika cli not available'); + } - // Check file - $file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'; - $content = $extractor->getContent($file); - $this->assertContains('This is a test file with a link', $content); + // Check file + $file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'; + $content = $extractor->getContent($file); + $this->assertContains('This is a test file with a link', $content); - // Check mime validation - $this->assertTrue($extractor->supportsMime('application/pdf')); - $this->assertTrue($extractor->supportsMime('text/html')); - $this->assertFalse($extractor->supportsMime('application/not-supported')); - } + // Check mime validation + $this->assertTrue($extractor->supportsMime('application/pdf')); + $this->assertTrue($extractor->supportsMime('text/html')); + $this->assertFalse($extractor->supportsMime('application/not-supported')); + } - function testServerExtraction() { - $extractor = new TikaServerTextExtractor(); - if(!$extractor->isAvailable()) $this->markTestSkipped('tika server not available'); + public function testServerExtraction() + { + $extractor = new TikaServerTextExtractor(); + if (!$extractor->isAvailable()) { + $this->markTestSkipped('tika server not available'); + } - // Check file - $file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'; - $content = $extractor->getContent($file); - $this->assertContains('This is a test file with a link', $content); - - // Check mime validation - $this->assertTrue($extractor->supportsMime('application/pdf')); - $this->assertTrue($extractor->supportsMime('text/html')); - $this->assertFalse($extractor->supportsMime('application/not-supported')); - } + // Check file + $file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'; + $content = $extractor->getContent($file); + $this->assertContains('This is a test file with a link', $content); + // Check mime validation + $this->assertTrue($extractor->supportsMime('application/pdf')); + $this->assertTrue($extractor->supportsMime('text/html')); + $this->assertFalse($extractor->supportsMime('application/not-supported')); + } }