diff --git a/_config/cache.yml b/_config/cache.yml new file mode 100644 index 0000000..ff793b2 --- /dev/null +++ b/_config/cache.yml @@ -0,0 +1,11 @@ +--- +Name: textextractioncache +After: + - '#corecache' +--- + +SilverStripe\Core\Injector\Injector: + Psr\SimpleCache\CacheInterface.FileTextCache_Cache: + factory: SilverStripe\Core\Cache\CacheFactory + constructor: + namespace: 'FileTextCache_Cache' \ No newline at end of file diff --git a/_config/config.yml b/_config/config.yml deleted file mode 100644 index bed07e8..0000000 --- a/_config/config.yml +++ /dev/null @@ -1,11 +0,0 @@ ---- -Name: textextraction ---- -Injector: - FileTextCache: FileTextCache_Database - -#SolrCellTextExtractor: -# base_url: 'http://localhost:8983/solr/update/extract' - -FileTextCache_Database: - max_content_length: 500000 diff --git a/code/extensions/FileTextCache.php b/code/extensions/FileTextCache.php deleted file mode 100644 index 385d848..0000000 --- a/code/extensions/FileTextCache.php +++ /dev/null @@ -1,112 +0,0 @@ -FileContentCache; - } - - public function save(File $file, $content) - { - $maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length'); - $file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content; - $file->write(); - } - - public function invalidate(File $file) - { - // To prevent writing to the cache from invalidating it - if (!$file->isChanged('FileContentCache')) { - $file->FileContentCache = ''; - } - } -} - -/** - * Uses SS_Cache with a lifetime to cache extracted content - */ -class FileTextCache_SSCache implements FileTextCache, Flushable -{ - /** - * Lifetime of cache in seconds - * Null is indefinite - * - * @var int|null - * @config - */ - private static $lifetime = null; - - /** - * @return SS_Cache - */ - protected static function get_cache() - { - $lifetime = Config::inst()->get(__CLASS__, 'lifetime'); - $cache = SS_Cache::factory(__CLASS__); - $cache->setLifetime($lifetime); - return $cache; - } - - protected function getKey(File $file) - { - return md5($file->getFullPath()); - } - - public function load(File $file) - { - $key = $this->getKey($file); - $cache = self::get_cache(); - return $cache->load($key); - } - - public function save(File $file, $content) - { - $key = $this->getKey($file); - $cache = self::get_cache(); - return $cache->save($content, $key); - } - - public static function flush() - { - $cache = self::get_cache(); - $cache->clean(); - } - - public function invalidate(File $file) - { - $key = $this->getKey($file); - $cache = self::get_cache(); - return $cache->remove($key); - } -} diff --git a/code/extractors/HTMLTextExtractor.php b/code/extractors/HTMLTextExtractor.php deleted file mode 100644 index 810f473..0000000 --- a/code/extractors/HTMLTextExtractor.php +++ /dev/null @@ -1,77 +0,0 @@ - or @siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - // Add line breaks before and after blocks - '@=5.3.2", + "php": ">=5.6", "composer/installers": "*", - "silverstripe/framework": "^3.1", - "guzzle/guzzle": "^3.9", + "silverstripe/framework": "4.0.x-dev", + "guzzlehttp/guzzle": "~3.8.1", "symfony/event-dispatcher": "^2.6.0@stable", - "symfony/http-foundation": "^2.6.0" + "symfony/http-foundation": "^2.6.0", + "silverstripe/assets": "^1" }, "require-dev": { - "phpunit/phpunit": "^3.7" + "phpunit/phpunit": "~5.0" }, "suggest": { "ext-fileinfo": "Improved support for file mime detection" diff --git a/src/Exception/FileTextExtractor_Exception.php b/src/Exception/FileTextExtractor_Exception.php new file mode 100644 index 0000000..4fa1038 --- /dev/null +++ b/src/Exception/FileTextExtractor_Exception.php @@ -0,0 +1,9 @@ +get($for); + } + + /** + * + * @param File $file + * @return string + */ + protected function getKey(File $file) + { + return md5($file->getFilename()); + } + + /** + * + * @param File $file + * @return type + */ + public function load(File $file) + { + $key = $this->getKey($file); + $cache = self::get_cache(); + + return $cache->get($key); + } + + /** + * @param File $file + * @param string $content + * @return string + */ + public function save(File $file, $content) + { + $lifetime = Config::inst()->get(__CLASS__, 'lifetime'); + $lifetime = $lifetime ?: 3600; + $key = $this->getKey($file); + $cache = self::get_cache(); + + return $cache->set($key, $content, $lifetime); + } + + /** + * @return void + */ + public static function flush() + { + $cache = self::get_cache(); + $cache->clear(); + } + + /** + * Alias for $this->flush() + * + * @return void + */ + public static function clear() + { + $cache = self::get_cache(); + $cache->clear(); + } + + /** + * + * @param File $file + * @return type + */ + public function invalidate(File $file) + { + $key = $this->getKey($file); + $cache = self::get_cache(); + + return $cache->delete($key); + } +} diff --git a/src/Extension/FieldTextCache_Database.php b/src/Extension/FieldTextCache_Database.php new file mode 100644 index 0000000..a96ff60 --- /dev/null +++ b/src/Extension/FieldTextCache_Database.php @@ -0,0 +1,47 @@ +FileContentCache; + } + + /** + * @param File $file + * @param mixed $content + */ + public function save(File $file, $content) + { + $maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length'); + $file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content; + $file->write(); + } + + /** + * @param File $file + * @return void + */ + public function invalidate(File $file) + { + // To prevent writing to the cache from invalidating it + if (!$file->isChanged('FileContentCache')) { + $file->FileContentCache = ''; + } + } +} diff --git a/src/Extension/FileTextCache.php b/src/Extension/FileTextCache.php new file mode 100644 index 0000000..d0ccd70 --- /dev/null +++ b/src/Extension/FileTextCache.php @@ -0,0 +1,31 @@ + 'Text' ); + /** + * + * @var array + * @config + */ private static $casting = array( 'FileContent' => 'Text' ); + /** + * + * @var array + * @config + */ private static $dependencies = array( - 'TextCache' => '%$FileTextCache' + 'TextCache' => '%$SilverStripe\TextExtraction\Extension\FileTextCache_Cache' ); /** @@ -30,7 +51,8 @@ class FileTextExtractable extends DataExtension /** * - * @param FileTextCache $cache + * @param FileTextCache $cache + * @return void */ public function setTextCache(FileTextCache $cache) { @@ -58,10 +80,11 @@ class FileTextExtractable extends DataExtension /** * Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text. * The value is also cached into the File record itself. - * + * * @param boolean $disableCache If false, the file content is only parsed on demand. - * If true, the content parsing is forced, bypassing the cached version - * @return string + * If true, the content parsing is forced, bypassing + * the cached version + * @return mixed string | null */ public function extractFileAsText($disableCache = false) { @@ -73,23 +96,27 @@ class FileTextExtractable extends DataExtension } // Determine which extractor can process this file. - $extractor = FileTextExtractor::for_file($this->owner->FullPath); + $path = Director::baseFolder() . '/' . $this->owner->getFilename(); + $extractor = FileTextExtractor::for_file($path); if (!$extractor) { return null; } - $text = $extractor->getContent($this->owner->FullPath); + $text = $extractor->getContent($path); if (!$text) { return null; } if (!$disableCache) { - $this->getTextCache()->save($this->owner, $text); + $this->getTextCache()->save($this->owner, $text); } return $text; } + /** + * @return void + */ public function onBeforeWrite() { // Clear cache before changing file diff --git a/code/extractors/FileTextExtractor.php b/src/Extractor/FileTextExtractor.php similarity index 91% rename from code/extractors/FileTextExtractor.php rename to src/Extractor/FileTextExtractor.php index cc7c176..115d679 100644 --- a/code/extractors/FileTextExtractor.php +++ b/src/Extractor/FileTextExtractor.php @@ -1,12 +1,19 @@ get($class, 'priority'); } @@ -74,8 +82,8 @@ abstract class FileTextExtractor extends Object } /** - * @param string $path - * @return FileTextExtractor|null + * @param string $path + * @return mixed FileTextExtractor | null */ public static function for_file($path) { @@ -85,6 +93,7 @@ abstract class FileTextExtractor extends Object $extension = pathinfo($path, PATHINFO_EXTENSION); $mime = self::get_mime($path); + foreach (self::get_extractor_classes() as $className) { $extractor = self::get_extractor($className); @@ -108,7 +117,7 @@ abstract class FileTextExtractor extends Object /** * Checks if the extractor is supported on the current environment, * for example if the correct binaries or libraries are available. - * + * * @return boolean */ abstract public function isAvailable(); @@ -125,7 +134,7 @@ abstract class FileTextExtractor extends Object /** * Determine if this extractor suports the given mime type. * Will only be called if supportsExtension returns false. - * + * * @param string $mime * @return boolean */ @@ -133,13 +142,9 @@ abstract class FileTextExtractor extends Object /** * Given a file path, extract the contents as text. - * + * * @param string $path * @return string */ abstract public function getContent($path); } - -class FileTextExtractor_Exception extends Exception -{ -} diff --git a/src/Extractor/HTMLTextExtractor.php b/src/Extractor/HTMLTextExtractor.php new file mode 100644 index 0000000..ace1400 --- /dev/null +++ b/src/Extractor/HTMLTextExtractor.php @@ -0,0 +1,94 @@ + or @siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + // Add line breaks before and after blocks + '@isAvailable()) { + if (!$this->isAvailable()) { throw new FileTextExtractor_Exception("getRawOutput called on unavailable extractor"); } exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err); @@ -108,11 +113,10 @@ class PDFTextExtractor extends FileTextExtractor $err = $content; } throw new FileTextExtractor_Exception(sprintf( - 'PDFTextExtractor->getContent() failed for %s: %s', - $path, - implode(PHP_EOL, $err) + 'PDFTextExtractor->getContent() failed for %s: %s', $path, implode(PHP_EOL, $err) )); } + return implode(PHP_EOL, $content); } @@ -135,6 +139,8 @@ class PDFTextExtractor extends FileTextExtractor 'ſt' => 'ft', 'st' => 'st' ); + return str_replace(array_keys($mapping), array_values($mapping), $input); } + } diff --git a/code/extractors/SolrCellTextExtractor.php b/src/Extractor/SolrCellTextExtractor.php similarity index 70% rename from code/extractors/SolrCellTextExtractor.php rename to src/Extractor/SolrCellTextExtractor.php index 2590153..27852f1 100644 --- a/code/extractors/SolrCellTextExtractor.php +++ b/src/Extractor/SolrCellTextExtractor.php @@ -1,12 +1,18 @@ config()->get('base_url')) { @@ -33,20 +53,35 @@ class SolrCellTextExtractor extends FileTextExtractor if (!$this->httpClient) { $this->httpClient = new Client($this->config()->get('base_url')); } + return $this->httpClient; } + /** + * + * @param Guzzle\Http\Client $client + * @return void + */ public function setHttpClient($client) { $this->httpClient = $client; } + /** + * @return string + */ public function isAvailable() { $url = $this->config()->get('base_url'); + return (boolean) $url; } + /** + * + * @param string $extension + * @return boolean + */ public function supportsExtension($extension) { return in_array( @@ -59,12 +94,22 @@ class SolrCellTextExtractor extends FileTextExtractor ); } + /** + * + * @param string $mime + * @return boolean + */ public function supportsMime($mime) { // Rely on supportsExtension return false; } - + + /** + * + * @param string $path + * @return string + */ public function getContent($path) { if (!$path) { @@ -73,6 +118,7 @@ class SolrCellTextExtractor extends FileTextExtractor $fileName = basename($path); $client = $this->getHttpClient(); + try { $request = $client ->post() @@ -80,27 +126,30 @@ class SolrCellTextExtractor extends FileTextExtractor ->addPostFiles(array('myfile' => $path)); $response = $request->send(); } catch (InvalidArgumentException $e) { - SS_Log::log( - sprintf( + $msg = sprintf( 'Error extracting text from "%s" (message: %s)', $path, $e->getMessage() - ), - SS_Log::NOTICE - ); + ); + Injector::inst()->get(LoggerInterface::class)->notice($msg); + return null; } catch (Guzzle\Http\Exception\ServerErrorResponseException $e) { - //catch other errors that Tika can throw vai Guzzle but are not caught and break Solr search query in some cases. - SS_Log::log( - sprintf( + // Catch other errors that Tika can throw vai Guzzle but are not caught and break Solr search query in some cases. + $msg = sprintf( 'Tika server error attempting to extract from "%s" (message: %s)', $path, $e->getMessage() - ), - SS_Log::NOTICE - ); + ); + + Injector::inst()->get(LoggerInterface::class)->notice($msg); + return null; } + + // Just initialise it, it doesn't take miuch. + $matches = []; + // Use preg match to avoid SimpleXML running out of memory on large text nodes preg_match( sprintf('/\(.*?)\<\/str\>/s', preg_quote($fileName)), diff --git a/code/extractors/TikaServerTextExtractor.php b/src/Extractor/TikaServerTextExtractor.php similarity index 79% rename from code/extractors/TikaServerTextExtractor.php rename to src/Extractor/TikaServerTextExtractor.php index 5e42bc9..2ae38e8 100644 --- a/code/extractors/TikaServerTextExtractor.php +++ b/src/Extractor/TikaServerTextExtractor.php @@ -1,5 +1,12 @@ client ?: ($this->client = Injector::inst()->createWithArgs( - 'TikaRestClient', + TikaRestClient::class, array($this->getServerEndpoint()) ) ); } + /** + * @return string + */ public function getServerEndpoint() { - if (defined('SS_TIKA_ENDPOINT')) { - return SS_TIKA_ENDPOINT; - } - - if (getenv('SS_TIKA_ENDPOINT')) { - return getenv('SS_TIKA_ENDPOINT'); + if ($endpoint = Environment::getEnv('SS_TIKA_ENDPOINT')) { + return $endpoint; } // Default to configured endpoint @@ -68,6 +74,9 @@ class TikaServerTextExtractor extends FileTextExtractor ->getVersion(); } + /** + * @return boolean + */ public function isAvailable() { return $this->getServerEndpoint() && @@ -75,6 +84,11 @@ class TikaServerTextExtractor extends FileTextExtractor version_compare($this->getVersion(), '1.7.0') >= 0; } + /** + * + * @param string $extension + * @return boolean + */ public function supportsExtension($extension) { // Determine support via mime type only @@ -89,6 +103,11 @@ class TikaServerTextExtractor extends FileTextExtractor */ protected $supportedMimes = array(); + /** + * + * @param string $mime + * @return boolean + */ public function supportsMime($mime) { $supported = $this->supportedMimes ?: diff --git a/code/extractors/TikaTextExtractor.php b/src/Extractor/TikaTextExtractor.php similarity index 78% rename from code/extractors/TikaTextExtractor.php rename to src/Extractor/TikaTextExtractor.php index 0150058..0d4b18f 100644 --- a/code/extractors/TikaTextExtractor.php +++ b/src/Extractor/TikaTextExtractor.php @@ -1,8 +1,12 @@ config()->output_mode; $command = sprintf('tika %s %s', $mode, escapeshellarg($path)); $code = $this->runShell($command, $output); + if ($code == 0) { return $output; } } + /** + * + * @return boolean + */ public function isAvailable() { return $this->getVersion() > 0; } + /** + * + * @return boolean + */ public function supportsExtension($extension) { // Determine support via mime type only return false; } + + /** + * + * @param string $mime + * @return boolean + */ public function supportsMime($mime) { // Get list of supported mime types $code = $this->runShell('tika --list-supported-types', $supportedTypes, $error); + if ($code) { return false; } // Error case // Check if the mime type is inside the result $pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/')); + return (bool)preg_match($pattern, $supportedTypes); } } diff --git a/code/tika/TikaRestClient.php b/src/Rest/TikaRestClient.php similarity index 80% rename from code/tika/TikaRestClient.php rename to src/Rest/TikaRestClient.php index 4ae6242..764dcac 100644 --- a/code/tika/TikaRestClient.php +++ b/src/Rest/TikaRestClient.php @@ -1,7 +1,12 @@ options = array( - 'username' => SS_TIKA_USERNAME, - 'password' => SS_TIKA_PASSWORD, + 'username' => Environment::getEnv('SS_TIKA_USERNAME'), + 'password' => $psswd, ); } + parent::__construct($baseUrl, $config); } @@ -39,11 +52,14 @@ class TikaRestClient extends Client $result = $this->get(null); $result->setAuth($this->options['username'], $this->options['password']); $result->send(); + if ($result->getResponse()->getStatusCode() == 200) { return true; } } catch (RequestException $ex) { - SS_Log::log(sprintf("Tika unavailable - %s", $ex->getMessage()), SS_Log::ERR); + $msg = sprintf("Tika unavailable - %s", $ex->getMessage()); + Injector::inst()->get(LoggerInterface::class)->error($msg); + return false; } } @@ -59,12 +75,14 @@ class TikaRestClient extends Client $response->setAuth($this->options['username'], $this->options['password']); $response->send(); $version = 0.0; + // Parse output if ($response->getResponse()->getStatusCode() == 200 && preg_match('/Apache Tika (?[\.\d]+)/', $response->getResponse()->getBody(), $matches) ) { $version = (float)$matches['version']; } + return $version; } @@ -78,12 +96,14 @@ class TikaRestClient extends Client if ($this->mimes) { return $this->mimes; } + $response = $this->get( 'mime-types', array('Accept' => 'application/json') ); $response->setAuth($this->options['username'], $this->options['password']); $response->send(); + return $this->mimes = $response->getResponse()->json(); } @@ -91,7 +111,7 @@ class TikaRestClient extends Client * Extract text content from a given file. * Logs a notice-level error if the document can't be parsed. * - * @param string $file Full filesystem path to a file to post + * @param string $file Full filesystem path to a file to post * @return string Content of the file extracted as plain text */ public function tika($file) @@ -118,8 +138,10 @@ class TikaRestClient extends Client if ($body) { $msg .= ' Body: ' . $body; } - SS_Log::log($msg, SS_Log::NOTICE); + + Injector::inst()->get(LoggerInterface::class)->notice($msg); } + return $text; } } diff --git a/tests/FileTextCacheDatabaseTest.php b/tests/FileTextCacheDatabaseTest.php index 6b8d784..e300c19 100644 --- a/tests/FileTextCacheDatabaseTest.php +++ b/tests/FileTextCacheDatabaseTest.php @@ -1,10 +1,16 @@ update('FileTextCache_Database', 'max_content_length', 5); $cache = new FileTextCache_Database(); $file = $this->getMock('File', array('write'));