diff --git a/.upgrade.yml b/.upgrade.yml new file mode 100644 index 0000000..5d5dd4f --- /dev/null +++ b/.upgrade.yml @@ -0,0 +1,14 @@ +mappings: + FileTextExtractable: SilverStripe\TextExtraction\Extension\FileTextExtractable + FileTextCache: SilverStripe\TextExtraction\Cache\FileTextCache + FileTextCache_Cache: SilverStripe\TextExtraction\Cache\FileTextCache\Cache + FileTextCache_Database: SilverStripe\TextExtraction\Cache\FileTextCache\Database + FileTextExtractor: SilverStripe\TextExtraction\Extractor\FileTextExtractor + FileTextExtractor_Exception: SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception + HTMLTextExtractor: SilverStripe\TextExtraction\Extractor\HTMLTextExtractor + PDFTextExtractor: SilverStripe\TextExtraction\Extractor\PDFTextExtractor + SolrCellTextExtractor: SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor + TikaServerTextExtractor: SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor + TikaTextExtractor: SilverStripe\TextExtraction\Extractor\TikaTextExtractor + TikaRestClient: SilverStripe\TextExtraction\Rest\TikaRestClient + diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 5db972b..0000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,12 +0,0 @@ -# Changelog - -All notable changes to this project will be documented in this file. - -This project adheres to [Semantic Versioning](http://semver.org/). - - -## [2.0.1] -Using Symfony mime type detection - -## [2.0.0] -Clarified Tika docs diff --git a/_config.php b/_config.php deleted file mode 100644 index e69de29..0000000 diff --git a/src/Extension/FileTextCache.php b/src/Cache/FileTextCache.php similarity index 92% rename from src/Extension/FileTextCache.php rename to src/Cache/FileTextCache.php index d0ccd70..3586d78 100644 --- a/src/Extension/FileTextCache.php +++ b/src/Cache/FileTextCache.php @@ -1,6 +1,6 @@ get(__CLASS__, 'lifetime'); - $lifetime = $lifetime ?: 3600; + $lifetime = $this->config()->get('lifetime') ?: 3600; $key = $this->getKey($file); $cache = self::get_cache(); @@ -94,7 +95,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable /** * * @param File $file - * @return type + * @return bool */ public function invalidate(File $file) { diff --git a/src/Extension/FieldTextCache_Database.php b/src/Cache/FileTextCache/Database.php similarity index 67% rename from src/Extension/FieldTextCache_Database.php rename to src/Cache/FileTextCache/Database.php index a96ff60..1379ee0 100644 --- a/src/Extension/FieldTextCache_Database.php +++ b/src/Cache/FileTextCache/Database.php @@ -1,17 +1,25 @@ get('FileTextCache_Database', 'max_content_length'); + $maxLength = $this->config()->get('max_content_length'); $file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content; $file->write(); } diff --git a/src/Exception/FileTextExtractor_Exception.php b/src/Exception/FileTextExtractor_Exception.php deleted file mode 100644 index 4fa1038..0000000 --- a/src/Exception/FileTextExtractor_Exception.php +++ /dev/null @@ -1,9 +0,0 @@ - 'Text' - ); + ]; /** * * @var array * @config */ - private static $casting = array( + private static $casting = [ 'FileContent' => 'Text' - ); + ]; /** * * @var array * @config */ - private static $dependencies = array( - 'TextCache' => '%$SilverStripe\TextExtraction\Extension\FileTextCache_Cache' - ); + private static $dependencies = [ + 'TextCache' => FileTextCache\Cache::class, + ]; /** * @var FileTextCache @@ -52,11 +53,12 @@ class FileTextExtractable extends DataExtension /** * * @param FileTextCache $cache - * @return void + * @return $this */ public function setTextCache(FileTextCache $cache) { $this->fileTextCache = $cache; + return $this; } /** @@ -84,7 +86,7 @@ class FileTextExtractable extends DataExtension * @param boolean $disableCache If false, the file content is only parsed on demand. * If true, the content parsing is forced, bypassing * the cached version - * @return mixed string | null + * @return string|null */ public function extractFileAsText($disableCache = false) { diff --git a/src/Extractor/FileTextExtractor.php b/src/Extractor/FileTextExtractor.php index 115d679..fd3cf5c 100644 --- a/src/Extractor/FileTextExtractor.php +++ b/src/Extractor/FileTextExtractor.php @@ -2,17 +2,18 @@ namespace SilverStripe\TextExtraction\Extractor; -use SilverStripe\Core\Config\Config, - SilverStripe\Core\Injector\Injector, - SilverStripe\Core\ClassInfo; +use SilverStripe\Core\ClassInfo; +use SilverStripe\Core\Config\Config; +use SilverStripe\Core\Config\Configurable; +use SilverStripe\Core\Injector\Injector; /** * A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents. * @author mstephens - * */ abstract class FileTextExtractor { + use Configurable; /** * Set priority from 0-100. @@ -45,7 +46,7 @@ abstract class FileTextExtractor // Generate the sorted list of extractors on demand. $classes = ClassInfo::subclassesFor(__CLASS__); array_shift($classes); - $classPriorities = array(); + $classPriorities = []; foreach ($classes as $class) { $classPriorities[$class] = Config::inst()->get($class, 'priority'); @@ -76,19 +77,19 @@ abstract class FileTextExtractor */ protected static function get_mime($path) { - $file = new Symfony\Component\HttpFoundation\File\File($path); + $file = new \Symfony\Component\HttpFoundation\File\File($path); return $file->getMimeType(); } /** * @param string $path - * @return mixed FileTextExtractor | null + * @return FileTextExtractor|null */ public static function for_file($path) { if (!file_exists($path) || is_dir($path)) { - return; + return null; } $extension = pathinfo($path, PATHINFO_EXTENSION); @@ -132,7 +133,7 @@ abstract class FileTextExtractor abstract public function supportsExtension($extension); /** - * Determine if this extractor suports the given mime type. + * Determine if this extractor supports the given mime type. * Will only be called if supportsExtension returns false. * * @param string $mime diff --git a/src/Extractor/FileTextExtractor/Exception.php b/src/Extractor/FileTextExtractor/Exception.php new file mode 100644 index 0000000..1a54a80 --- /dev/null +++ b/src/Extractor/FileTextExtractor/Exception.php @@ -0,0 +1,7 @@ + or @siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - // Add line breaks before and after blocks - '@]*?>.*?@siu', + '@]*?>.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + // Add line breaks before and after blocks + '@config()->binary_location) { - $locations = array($location); + if ($location = $this->config()->get('binary_location')) { + $locations = [$location]; } else { - $locations = $this->config()->search_binary_locations; + $locations = $this->config()->get('search_binary_locations'); } // Find program in each path @@ -88,8 +86,9 @@ class PDFTextExtractor extends FileTextExtractor public function getContent($path) { if (!$path) { - return ""; - } // no file + // no file + return ''; + } $content = $this->getRawOutput($path); return $this->cleanupLigatures($content); } @@ -99,12 +98,12 @@ class PDFTextExtractor extends FileTextExtractor * * @param string $path * @return string Output - * @throws FileTextExtractor_Exception + * @throws Exception */ protected function getRawOutput($path) { if (!$this->isAvailable()) { - throw new FileTextExtractor_Exception("getRawOutput called on unavailable extractor"); + throw new Exception("getRawOutput called on unavailable extractor"); } exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err); if ($err) { @@ -112,8 +111,11 @@ class PDFTextExtractor extends FileTextExtractor // For Windows compatibility $err = $content; } - throw new FileTextExtractor_Exception(sprintf( - 'PDFTextExtractor->getContent() failed for %s: %s', $path, implode(PHP_EOL, $err) + + throw new Exception(sprintf( + 'PDFTextExtractor->getContent() failed for %s: %s', + $path, + implode(PHP_EOL, $err) )); } @@ -130,7 +132,7 @@ class PDFTextExtractor extends FileTextExtractor */ protected function cleanupLigatures($input) { - $mapping = array( + $mapping = [ 'ff' => 'ff', 'fi' => 'fi', 'fl' => 'fl', @@ -138,7 +140,7 @@ class PDFTextExtractor extends FileTextExtractor 'ffl' => 'ffl', 'ſt' => 'ft', 'st' => 'st' - ); + ]; return str_replace(array_keys($mapping), array_values($mapping), $input); } diff --git a/src/Extractor/SolrCellTextExtractor.php b/src/Extractor/SolrCellTextExtractor.php index b2f149e..0e26b74 100644 --- a/src/Extractor/SolrCellTextExtractor.php +++ b/src/Extractor/SolrCellTextExtractor.php @@ -2,9 +2,11 @@ namespace SilverStripe\TextExtraction\Extractor; -use SilverStripe\TextExtraction\Extractor\FileTextExtractor, - GuzzleHttp\Client, - Psr\Log\LoggerInterface; +use Exception; +use GuzzleHttp\Client; +use InvalidArgumentException; +use Psr\Log\LoggerInterface; +use SilverStripe\Core\Injector\Injector; /** * Text extractor that calls an Apache Solr instance @@ -18,7 +20,7 @@ use SilverStripe\TextExtraction\Extractor\FileTextExtractor, class SolrCellTextExtractor extends FileTextExtractor { /** - * Base URL to use for solr text extraction. + * Base URL to use for Solr text extraction. * E.g. http://localhost:8983/solr/update/extract * * @config @@ -27,43 +29,36 @@ class SolrCellTextExtractor extends FileTextExtractor private static $base_url; /** - * * @var int * @config */ private static $priority = 75; /** - * - * @var GuzzleHttp\Client + * @var Client */ protected $httpClient; /** - * - * @return GuzzleHttp\Client - * @throws InvalidArgumentException + * @return Client */ public function getHttpClient() { - if (!$this->config()->get('base_url')) { - throw new \InvalidArgumentException('SolrCellTextExtractor.base_url not specified'); - } if (!$this->httpClient) { - $this->httpClient = new Client($this->config()->get('base_url')); + $this->httpClient = new Client(); } return $this->httpClient; } /** - * - * @param GuzzleHttp\Client $client - * @return void + * @param Client $client + * @return $this */ - public function setHttpClient($client) + public function setHttpClient(Client $client) { $this->httpClient = $client; + return $this; } /** @@ -73,30 +68,28 @@ class SolrCellTextExtractor extends FileTextExtractor { $url = $this->config()->get('base_url'); - return (boolean) $url; + return (bool) $url; } /** - * * @param string $extension - * @return boolean + * @return bool */ public function supportsExtension($extension) { return in_array( strtolower($extension), - array( + [ 'pdf', 'doc', 'docx', 'xls', 'xlsx', 'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods', 'ppt', 'pptx', 'odp', 'fodp', 'csv' - ) + ] ); } /** - * * @param string $mime - * @return boolean + * @return bool */ public function supportsMime($mime) { @@ -105,48 +98,55 @@ class SolrCellTextExtractor extends FileTextExtractor } /** - * - * @param string $path + * @param string $path * @return string + * @throws InvalidArgumentException */ public function getContent($path) { if (!$path) { - return ""; - } // no file + // no file + return ''; + } $fileName = basename($path); $client = $this->getHttpClient(); + // Get and validate base URL + $baseUrl = $this->config()->get('base_url'); + if (!$this->config()->get('base_url')) { + throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified'); + } + try { $request = $client - ->post() - ->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text')) - ->addPostFiles(array('myfile' => $path)); + ->post($baseUrl) + ->addPostFields(['extractOnly' => 'true', 'extractFormat' => 'text']) + ->addPostFiles(['myfile' => $path]); $response = $request->send(); - } catch (\InvalidArgumentException $e) { + } catch (InvalidArgumentException $e) { $msg = sprintf( - 'Error extracting text from "%s" (message: %s)', - $path, - $e->getMessage() - ); + 'Error extracting text from "%s" (message: %s)', + $path, + $e->getMessage() + ); Injector::inst()->get(LoggerInterface::class)->notice($msg); return null; - } catch (\Exception $e) { + } catch (Exception $e) { // Catch other errors that Tika can throw vai Guzzle but are not caught and break Solr search query in some cases. $msg = sprintf( - 'Tika server error attempting to extract from "%s" (message: %s)', - $path, - $e->getMessage() - ); + 'Tika server error attempting to extract from "%s" (message: %s)', + $path, + $e->getMessage() + ); Injector::inst()->get(LoggerInterface::class)->notice($msg); return null; } - // Just initialise it, it doesn't take miuch. + // Just initialise it, it doesn't take much. $matches = []; // Use preg match to avoid SimpleXML running out of memory on large text nodes diff --git a/src/Extractor/TikaServerTextExtractor.php b/src/Extractor/TikaServerTextExtractor.php index 2ae38e8..1c477ec 100644 --- a/src/Extractor/TikaServerTextExtractor.php +++ b/src/Extractor/TikaServerTextExtractor.php @@ -2,10 +2,9 @@ namespace SilverStripe\TextExtraction\Extractor; -use SilverStripe\TextExtraction\Extractor\FileTextExtractor, - SilverStripe\Core\Injector\Injector, - SilverStripe\Core\Environment, - SilverStripe\TextExtraction\Rest\TikaRestClient; +use SilverStripe\Core\Environment; +use SilverStripe\Core\Injector\Injector; +use SilverStripe\TextExtraction\Rest\TikaRestClient; /** * Enables text extraction of file content via the Tika Rest Server @@ -35,18 +34,25 @@ class TikaServerTextExtractor extends FileTextExtractor */ protected $client = null; + /** + * Cache of supported mime types + * + * @var array + */ + protected $supportedMimes = []; + /** * @return TikaRestClient */ public function getClient() { - return $this->client ?: - ($this->client = - Injector::inst()->createWithArgs( - TikaRestClient::class, - array($this->getServerEndpoint()) - ) + if (!$this->client) { + $this->client = Injector::inst()->createWithArgs( + TikaRestClient::class, + [$this->getServerEndpoint()] ); + } + return $this->client; } /** @@ -59,19 +65,17 @@ class TikaServerTextExtractor extends FileTextExtractor } // Default to configured endpoint - return $this->config()->server_endpoint; + return $this->config()->get('server_endpoint'); } /** - * Get the version of tika installed, or 0 if not installed + * Get the version of Tika installed, or 0 if not installed * - * @return float version of tika + * @return float version of Tika */ public function getVersion() { - return $this - ->getClient() - ->getVersion(); + return $this->getClient()->getVersion(); } /** @@ -79,13 +83,12 @@ class TikaServerTextExtractor extends FileTextExtractor */ public function isAvailable() { - return $this->getServerEndpoint() && - $this->getClient()->isAvailable() && - version_compare($this->getVersion(), '1.7.0') >= 0; + return $this->getServerEndpoint() + && $this->getClient()->isAvailable() + && version_compare($this->getVersion(), '1.7.0') >= 0; } /** - * * @param string $extension * @return boolean */ @@ -95,31 +98,23 @@ class TikaServerTextExtractor extends FileTextExtractor return false; } - /** - * Cache of supported mime types - * - * @var array - */ - protected $supportedMimes = array(); - - /** - * * @param string $mime * @return boolean */ public function supportsMime($mime) { - $supported = $this->supportedMimes ?: - ($this->supportedMimes = $this->getClient()->getSupportedMimes()); + if (!$this->supportedMimes) { + $this->supportedMimes = $this->getClient()->getSupportedMimes(); + } // Check if supported (most common / quickest lookup) - if (isset($supported[$mime])) { + if (isset($this->supportedMimes[$mime])) { return true; } // Check aliases - foreach ($supported as $info) { + foreach ($this->supportedMimes as $info) { if (isset($info['alias']) && in_array($mime, $info['alias'])) { return true; } diff --git a/src/Extractor/TikaTextExtractor.php b/src/Extractor/TikaTextExtractor.php index 0d4b18f..a6a21bf 100644 --- a/src/Extractor/TikaTextExtractor.php +++ b/src/Extractor/TikaTextExtractor.php @@ -2,8 +2,6 @@ namespace SilverStripe\TextExtraction\Extractor; -use SilverStripe\TextExtraction\Extractor\FileTextExtractor; - /** * Enables text extraction of file content via the Tika CLI * @@ -47,13 +45,13 @@ class TikaTextExtractor extends FileTextExtractor */ protected function runShell($command, &$stdout = '', &$stderr = '', $input = '') { - $descriptorSpecs = array( - 0 => array("pipe", "r"), - 1 => array("pipe", "w"), - 2 => array("pipe", "w") - ); + $descriptorSpecs = [ + 0 => ["pipe", "r"], + 1 => ["pipe", "w"], + 2 => ["pipe", "w"] + ]; // Invoke command - $pipes = array(); + $pipes = []; $proc = proc_open($command, $descriptorSpecs, $pipes); if (!is_resource($proc)) { @@ -75,7 +73,6 @@ class TikaTextExtractor extends FileTextExtractor } /** - * * @param string $path * @return string */ @@ -91,8 +88,7 @@ class TikaTextExtractor extends FileTextExtractor } /** - * - * @return boolean + * @return bool */ public function isAvailable() { @@ -100,8 +96,7 @@ class TikaTextExtractor extends FileTextExtractor } /** - * - * @return boolean + * @return bool */ public function supportsExtension($extension) { @@ -111,9 +106,8 @@ class TikaTextExtractor extends FileTextExtractor /** - * * @param string $mime - * @return boolean + * @return bool */ public function supportsMime($mime) { @@ -121,8 +115,9 @@ class TikaTextExtractor extends FileTextExtractor $code = $this->runShell('tika --list-supported-types', $supportedTypes, $error); if ($code) { + // Error case return false; - } // Error case + } // Check if the mime type is inside the result $pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/')); diff --git a/src/Rest/TikaRestClient.php b/src/Rest/TikaRestClient.php index 8473f67..cd8b5c7 100644 --- a/src/Rest/TikaRestClient.php +++ b/src/Rest/TikaRestClient.php @@ -2,11 +2,11 @@ namespace SilverStripe\TextExtraction\Rest; -use GuzzleHttp\Client, - GuzzleHttp\Exception\RequestException, - SilverStripe\Core\Environment, - Psr\Log\LoggerInterface, - SilverStripe\Core\Injector\Injector; +use GuzzleHttp\Client; +use GuzzleHttp\Exception\RequestException; +use Psr\Log\LoggerInterface; +use SilverStripe\Core\Environment; +use SilverStripe\Core\Injector\Injector; class TikaRestClient extends Client { @@ -15,12 +15,12 @@ class TikaRestClient extends Client * * @var array */ - protected $options = array('username' => null, 'password' => null); + protected $options = ['username' => null, 'password' => null]; /** * @var array */ - protected $mimes = array(); + protected $mimes = []; /** * @@ -29,16 +29,16 @@ class TikaRestClient extends Client */ public function __construct($baseUrl = '', $config = null) { - $psswd = Environment::getEnv('SS_TIKA_PASSWORD'); + $password = Environment::getEnv('SS_TIKA_PASSWORD'); - if (!empty($psswd)) { - $this->options = array( + if (!empty($password)) { + $this->options = [ 'username' => Environment::getEnv('SS_TIKA_USERNAME'), - 'password' => $psswd, - ); + 'password' => $password, + ]; } - parent::__construct($baseUrl, $config); + parent::__construct($config); } /** @@ -120,7 +120,7 @@ class TikaRestClient extends Client try { $response = $this->put( 'tika', - array('Accept' => 'text/plain'), + ['Accept' => 'text/plain'], file_get_contents($file) ); $response->setAuth($this->options['username'], $this->options['password']);