From 300941c9e89d7c6fe4f116b39f32220ca6698a47 Mon Sep 17 00:00:00 2001 From: Robbie Averill Date: Tue, 3 Jul 2018 10:47:56 +1200 Subject: [PATCH 01/16] Update readme badges and requirements for SilverStripe 4 --- README.md | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index ad98b1f..dbf66a4 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,9 @@ # Text extraction module -[![Build Status](https://secure.travis-ci.org/silverstripe/silverstripe-textextraction.png)](http://travis-ci.org/silverstripe/silverstripe-textextraction) +[![Build Status](https://travis-ci.org/silverstripe/silverstripe-textextraction.svg?branch=master)](https://travis-ci.org/silverstripe/silverstripe-textextraction) +[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction/?branch=master) +[![codecov](https://codecov.io/gh/silverstripe/silverstripe-textextraction/branch/master/graph/badge.svg)](https://codecov.io/gh/silverstripe/silverstripe-textextraction) [![SilverStripe supported module](https://img.shields.io/badge/silverstripe-supported-0071C4.svg)](https://www.silverstripe.org/software/addons/silverstripe-commercially-supported-module-list/) -[![Code Quality](http://img.shields.io/scrutinizer/g/silverstripe/silverstripe-textextraction.svg?style=flat)](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction) -[![Version](http://img.shields.io/packagist/v/silverstripe/textextraction.svg?style=flat)](https://packagist.org/packages/silverstripe/silverstripe-textextraction) -[![License](http://img.shields.io/packagist/l/silverstripe/textextraction.svg?style=flat)](license.md) - Provides a text extraction API for file content, that can hook into different extractor engines based on availability and the parsed file format. The output returned is always a string of the file content. 
@@ -26,14 +24,14 @@ The module supports text extraction on the following file formats: ## Requirements - * SilverStripe ^3.1 + * SilverStripe ^4.0 * (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility) * (optional) [Apache Solr with ExtracingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) * (optional) [Apache Tika](http://tika.apache.org/) ## Installation -```js +``` composer require silverstripe/textextraction ``` From f1bacd2aa98f236e6241d33d29dddb67eb1ebf24 Mon Sep 17 00:00:00 2001 From: Robbie Averill Date: Tue, 3 Jul 2018 10:48:02 +1200 Subject: [PATCH 02/16] Bump license year --- license.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/license.md b/license.md index 8794670..30758eb 100644 --- a/license.md +++ b/license.md @@ -1,4 +1,4 @@ -Copyright (c) 2017, SilverStripe Limited +Copyright (c) 2018, SilverStripe Limited All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: From 66c9db8c0dcba60ad34518490fcde47ea5788715 Mon Sep 17 00:00:00 2001 From: Robbie Averill Date: Tue, 3 Jul 2018 11:23:27 +1200 Subject: [PATCH 03/16] API Update namespaces for FileTextCache and add upgrader mapping --- .upgrade.yml | 14 +++ CHANGELOG.md | 12 --- _config.php | 0 src/{Extension => Cache}/FileTextCache.php | 2 +- .../FileTextCache/Cache.php} | 25 +++--- .../FileTextCache/Database.php} | 20 +++-- src/Exception/FileTextExtractor_Exception.php | 9 -- src/Extension/FileTextExtractable.php | 26 +++--- src/Extractor/FileTextExtractor.php | 19 ++-- src/Extractor/FileTextExtractor/Exception.php | 7 ++ src/Extractor/HTMLTextExtractor.php | 70 +++++++-------- src/Extractor/PDFTextExtractor.php | 48 ++++++----- src/Extractor/SolrCellTextExtractor.php | 86 +++++++++---------- src/Extractor/TikaServerTextExtractor.php | 61 ++++++------- src/Extractor/TikaTextExtractor.php | 27 +++--- src/Rest/TikaRestClient.php | 28 
+++--- 16 files changed, 225 insertions(+), 229 deletions(-) create mode 100644 .upgrade.yml delete mode 100644 CHANGELOG.md delete mode 100644 _config.php rename src/{Extension => Cache}/FileTextCache.php (92%) rename src/{Extension/FieldTextCache_Cache.php => Cache/FileTextCache/Cache.php} (77%) rename src/{Extension/FieldTextCache_Database.php => Cache/FileTextCache/Database.php} (67%) delete mode 100644 src/Exception/FileTextExtractor_Exception.php create mode 100644 src/Extractor/FileTextExtractor/Exception.php diff --git a/.upgrade.yml b/.upgrade.yml new file mode 100644 index 0000000..5d5dd4f --- /dev/null +++ b/.upgrade.yml @@ -0,0 +1,14 @@ +mappings: + FileTextExtractable: SilverStripe\TextExtraction\Extension\FileTextExtractable + FileTextCache: SilverStripe\TextExtraction\Cache\FileTextCache + FileTextCache_Cache: SilverStripe\TextExtraction\Cache\FileTextCache\Cache + FileTextCache_Database: SilverStripe\TextExtraction\Cache\FileTextCache\Database + FileTextExtractor: SilverStripe\TextExtraction\Extractor\FileTextExtractor + FileTextExtractor_Exception: SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception + HTMLTextExtractor: SilverStripe\TextExtraction\Extractor\HTMLTextExtractor + PDFTextExtractor: SilverStripe\TextExtraction\Extractor\PDFTextExtractor + SolrCellTextExtractor: SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor + TikaServerTextExtractor: SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor + TikaTextExtractor: SilverStripe\TextExtraction\Extractor\TikaTextExtractor + TikaRestClient: SilverStripe\TextExtraction\Rest\TikaRestClient + diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 5db972b..0000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,12 +0,0 @@ -# Changelog - -All notable changes to this project will be documented in this file. - -This project adheres to [Semantic Versioning](http://semver.org/). 
- - -## [2.0.1] -Using Symfony mime type detection - -## [2.0.0] -Clarified Tika docs diff --git a/_config.php b/_config.php deleted file mode 100644 index e69de29..0000000 diff --git a/src/Extension/FileTextCache.php b/src/Cache/FileTextCache.php similarity index 92% rename from src/Extension/FileTextCache.php rename to src/Cache/FileTextCache.php index d0ccd70..3586d78 100644 --- a/src/Extension/FileTextCache.php +++ b/src/Cache/FileTextCache.php @@ -1,6 +1,6 @@ get(__CLASS__, 'lifetime'); - $lifetime = $lifetime ?: 3600; + $lifetime = $this->config()->get('lifetime') ?: 3600; $key = $this->getKey($file); $cache = self::get_cache(); @@ -94,7 +95,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable /** * * @param File $file - * @return type + * @return bool */ public function invalidate(File $file) { diff --git a/src/Extension/FieldTextCache_Database.php b/src/Cache/FileTextCache/Database.php similarity index 67% rename from src/Extension/FieldTextCache_Database.php rename to src/Cache/FileTextCache/Database.php index a96ff60..1379ee0 100644 --- a/src/Extension/FieldTextCache_Database.php +++ b/src/Cache/FileTextCache/Database.php @@ -1,17 +1,25 @@ get('FileTextCache_Database', 'max_content_length'); + $maxLength = $this->config()->get('max_content_length'); $file->FileContentCache = ($maxLength) ? 
substr($content, 0, $maxLength) : $content; $file->write(); } diff --git a/src/Exception/FileTextExtractor_Exception.php b/src/Exception/FileTextExtractor_Exception.php deleted file mode 100644 index 4fa1038..0000000 --- a/src/Exception/FileTextExtractor_Exception.php +++ /dev/null @@ -1,9 +0,0 @@ - 'Text' - ); + ]; /** * * @var array * @config */ - private static $casting = array( + private static $casting = [ 'FileContent' => 'Text' - ); + ]; /** * * @var array * @config */ - private static $dependencies = array( - 'TextCache' => '%$SilverStripe\TextExtraction\Extension\FileTextCache_Cache' - ); + private static $dependencies = [ + 'TextCache' => FileTextCache\Cache::class, + ]; /** * @var FileTextCache @@ -52,11 +53,12 @@ class FileTextExtractable extends DataExtension /** * * @param FileTextCache $cache - * @return void + * @return $this */ public function setTextCache(FileTextCache $cache) { $this->fileTextCache = $cache; + return $this; } /** @@ -84,7 +86,7 @@ class FileTextExtractable extends DataExtension * @param boolean $disableCache If false, the file content is only parsed on demand. * If true, the content parsing is forced, bypassing * the cached version - * @return mixed string | null + * @return string|null */ public function extractFileAsText($disableCache = false) { diff --git a/src/Extractor/FileTextExtractor.php b/src/Extractor/FileTextExtractor.php index 115d679..fd3cf5c 100644 --- a/src/Extractor/FileTextExtractor.php +++ b/src/Extractor/FileTextExtractor.php @@ -2,17 +2,18 @@ namespace SilverStripe\TextExtraction\Extractor; -use SilverStripe\Core\Config\Config, - SilverStripe\Core\Injector\Injector, - SilverStripe\Core\ClassInfo; +use SilverStripe\Core\ClassInfo; +use SilverStripe\Core\Config\Config; +use SilverStripe\Core\Config\Configurable; +use SilverStripe\Core\Injector\Injector; /** * A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents. 
* @author mstephens - * */ abstract class FileTextExtractor { + use Configurable; /** * Set priority from 0-100. @@ -45,7 +46,7 @@ abstract class FileTextExtractor // Generate the sorted list of extractors on demand. $classes = ClassInfo::subclassesFor(__CLASS__); array_shift($classes); - $classPriorities = array(); + $classPriorities = []; foreach ($classes as $class) { $classPriorities[$class] = Config::inst()->get($class, 'priority'); @@ -76,19 +77,19 @@ abstract class FileTextExtractor */ protected static function get_mime($path) { - $file = new Symfony\Component\HttpFoundation\File\File($path); + $file = new \Symfony\Component\HttpFoundation\File\File($path); return $file->getMimeType(); } /** * @param string $path - * @return mixed FileTextExtractor | null + * @return FileTextExtractor|null */ public static function for_file($path) { if (!file_exists($path) || is_dir($path)) { - return; + return null; } $extension = pathinfo($path, PATHINFO_EXTENSION); @@ -132,7 +133,7 @@ abstract class FileTextExtractor abstract public function supportsExtension($extension); /** - * Determine if this extractor suports the given mime type. + * Determine if this extractor supports the given mime type. * Will only be called if supportsExtension returns false. 
* * @param string $mime diff --git a/src/Extractor/FileTextExtractor/Exception.php b/src/Extractor/FileTextExtractor/Exception.php new file mode 100644 index 0000000..1a54a80 --- /dev/null +++ b/src/Extractor/FileTextExtractor/Exception.php @@ -0,0 +1,7 @@ + or @siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - // Add line breaks before and after blocks - '@]*?>.*?@siu', + '@]*?>.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + // Add line breaks before and after blocks + '@config()->binary_location) { - $locations = array($location); + if ($location = $this->config()->get('binary_location')) { + $locations = [$location]; } else { - $locations = $this->config()->search_binary_locations; + $locations = $this->config()->get('search_binary_locations'); } // Find program in each path @@ -88,8 +86,9 @@ class PDFTextExtractor extends FileTextExtractor public function getContent($path) { if (!$path) { - return ""; - } // no file + // no file + return ''; + } $content = $this->getRawOutput($path); return $this->cleanupLigatures($content); } @@ -99,12 +98,12 @@ class PDFTextExtractor extends FileTextExtractor * * @param string $path * @return string Output - * @throws FileTextExtractor_Exception + * @throws Exception */ protected function getRawOutput($path) { if (!$this->isAvailable()) { - throw new FileTextExtractor_Exception("getRawOutput called on unavailable extractor"); + throw new Exception("getRawOutput called on unavailable extractor"); } exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err); if ($err) { @@ -112,8 +111,11 @@ class PDFTextExtractor extends FileTextExtractor // For Windows compatibility $err = $content; } - throw new FileTextExtractor_Exception(sprintf( - 'PDFTextExtractor->getContent() failed for %s: %s', $path, implode(PHP_EOL, $err) + + throw new Exception(sprintf( + 
'PDFTextExtractor->getContent() failed for %s: %s', + $path, + implode(PHP_EOL, $err) )); } @@ -130,7 +132,7 @@ class PDFTextExtractor extends FileTextExtractor */ protected function cleanupLigatures($input) { - $mapping = array( + $mapping = [ 'ff' => 'ff', 'fi' => 'fi', 'fl' => 'fl', @@ -138,7 +140,7 @@ class PDFTextExtractor extends FileTextExtractor 'ffl' => 'ffl', 'ſt' => 'ft', 'st' => 'st' - ); + ]; return str_replace(array_keys($mapping), array_values($mapping), $input); } diff --git a/src/Extractor/SolrCellTextExtractor.php b/src/Extractor/SolrCellTextExtractor.php index b2f149e..0e26b74 100644 --- a/src/Extractor/SolrCellTextExtractor.php +++ b/src/Extractor/SolrCellTextExtractor.php @@ -2,9 +2,11 @@ namespace SilverStripe\TextExtraction\Extractor; -use SilverStripe\TextExtraction\Extractor\FileTextExtractor, - GuzzleHttp\Client, - Psr\Log\LoggerInterface; +use Exception; +use GuzzleHttp\Client; +use InvalidArgumentException; +use Psr\Log\LoggerInterface; +use SilverStripe\Core\Injector\Injector; /** * Text extractor that calls an Apache Solr instance @@ -18,7 +20,7 @@ use SilverStripe\TextExtraction\Extractor\FileTextExtractor, class SolrCellTextExtractor extends FileTextExtractor { /** - * Base URL to use for solr text extraction. + * Base URL to use for Solr text extraction. * E.g. 
http://localhost:8983/solr/update/extract * * @config @@ -27,43 +29,36 @@ class SolrCellTextExtractor extends FileTextExtractor private static $base_url; /** - * * @var int * @config */ private static $priority = 75; /** - * - * @var GuzzleHttp\Client + * @var Client */ protected $httpClient; /** - * - * @return GuzzleHttp\Client - * @throws InvalidArgumentException + * @return Client */ public function getHttpClient() { - if (!$this->config()->get('base_url')) { - throw new \InvalidArgumentException('SolrCellTextExtractor.base_url not specified'); - } if (!$this->httpClient) { - $this->httpClient = new Client($this->config()->get('base_url')); + $this->httpClient = new Client(); } return $this->httpClient; } /** - * - * @param GuzzleHttp\Client $client - * @return void + * @param Client $client + * @return $this */ - public function setHttpClient($client) + public function setHttpClient(Client $client) { $this->httpClient = $client; + return $this; } /** @@ -73,30 +68,28 @@ class SolrCellTextExtractor extends FileTextExtractor { $url = $this->config()->get('base_url'); - return (boolean) $url; + return (bool) $url; } /** - * * @param string $extension - * @return boolean + * @return bool */ public function supportsExtension($extension) { return in_array( strtolower($extension), - array( + [ 'pdf', 'doc', 'docx', 'xls', 'xlsx', 'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods', 'ppt', 'pptx', 'odp', 'fodp', 'csv' - ) + ] ); } /** - * * @param string $mime - * @return boolean + * @return bool */ public function supportsMime($mime) { @@ -105,48 +98,55 @@ class SolrCellTextExtractor extends FileTextExtractor } /** - * - * @param string $path + * @param string $path * @return string + * @throws InvalidArgumentException */ public function getContent($path) { if (!$path) { - return ""; - } // no file + // no file + return ''; + } $fileName = basename($path); $client = $this->getHttpClient(); + // Get and validate base URL + $baseUrl = $this->config()->get('base_url'); + if 
(!$this->config()->get('base_url')) { + throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified'); + } + try { $request = $client - ->post() - ->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text')) - ->addPostFiles(array('myfile' => $path)); + ->post($baseUrl) + ->addPostFields(['extractOnly' => 'true', 'extractFormat' => 'text']) + ->addPostFiles(['myfile' => $path]); $response = $request->send(); - } catch (\InvalidArgumentException $e) { + } catch (InvalidArgumentException $e) { $msg = sprintf( - 'Error extracting text from "%s" (message: %s)', - $path, - $e->getMessage() - ); + 'Error extracting text from "%s" (message: %s)', + $path, + $e->getMessage() + ); Injector::inst()->get(LoggerInterface::class)->notice($msg); return null; - } catch (\Exception $e) { + } catch (Exception $e) { // Catch other errors that Tika can throw vai Guzzle but are not caught and break Solr search query in some cases. $msg = sprintf( - 'Tika server error attempting to extract from "%s" (message: %s)', - $path, - $e->getMessage() - ); + 'Tika server error attempting to extract from "%s" (message: %s)', + $path, + $e->getMessage() + ); Injector::inst()->get(LoggerInterface::class)->notice($msg); return null; } - // Just initialise it, it doesn't take miuch. + // Just initialise it, it doesn't take much. 
$matches = []; // Use preg match to avoid SimpleXML running out of memory on large text nodes diff --git a/src/Extractor/TikaServerTextExtractor.php b/src/Extractor/TikaServerTextExtractor.php index 2ae38e8..1c477ec 100644 --- a/src/Extractor/TikaServerTextExtractor.php +++ b/src/Extractor/TikaServerTextExtractor.php @@ -2,10 +2,9 @@ namespace SilverStripe\TextExtraction\Extractor; -use SilverStripe\TextExtraction\Extractor\FileTextExtractor, - SilverStripe\Core\Injector\Injector, - SilverStripe\Core\Environment, - SilverStripe\TextExtraction\Rest\TikaRestClient; +use SilverStripe\Core\Environment; +use SilverStripe\Core\Injector\Injector; +use SilverStripe\TextExtraction\Rest\TikaRestClient; /** * Enables text extraction of file content via the Tika Rest Server @@ -35,18 +34,25 @@ class TikaServerTextExtractor extends FileTextExtractor */ protected $client = null; + /** + * Cache of supported mime types + * + * @var array + */ + protected $supportedMimes = []; + /** * @return TikaRestClient */ public function getClient() { - return $this->client ?: - ($this->client = - Injector::inst()->createWithArgs( - TikaRestClient::class, - array($this->getServerEndpoint()) - ) + if (!$this->client) { + $this->client = Injector::inst()->createWithArgs( + TikaRestClient::class, + [$this->getServerEndpoint()] ); + } + return $this->client; } /** @@ -59,19 +65,17 @@ class TikaServerTextExtractor extends FileTextExtractor } // Default to configured endpoint - return $this->config()->server_endpoint; + return $this->config()->get('server_endpoint'); } /** - * Get the version of tika installed, or 0 if not installed + * Get the version of Tika installed, or 0 if not installed * - * @return float version of tika + * @return float version of Tika */ public function getVersion() { - return $this - ->getClient() - ->getVersion(); + return $this->getClient()->getVersion(); } /** @@ -79,13 +83,12 @@ class TikaServerTextExtractor extends FileTextExtractor */ public function isAvailable() 
{ - return $this->getServerEndpoint() && - $this->getClient()->isAvailable() && - version_compare($this->getVersion(), '1.7.0') >= 0; + return $this->getServerEndpoint() + && $this->getClient()->isAvailable() + && version_compare($this->getVersion(), '1.7.0') >= 0; } /** - * * @param string $extension * @return boolean */ @@ -95,31 +98,23 @@ class TikaServerTextExtractor extends FileTextExtractor return false; } - /** - * Cache of supported mime types - * - * @var array - */ - protected $supportedMimes = array(); - - /** - * * @param string $mime * @return boolean */ public function supportsMime($mime) { - $supported = $this->supportedMimes ?: - ($this->supportedMimes = $this->getClient()->getSupportedMimes()); + if (!$this->supportedMimes) { + $this->supportedMimes = $this->getClient()->getSupportedMimes(); + } // Check if supported (most common / quickest lookup) - if (isset($supported[$mime])) { + if (isset($this->supportedMimes[$mime])) { return true; } // Check aliases - foreach ($supported as $info) { + foreach ($this->supportedMimes as $info) { if (isset($info['alias']) && in_array($mime, $info['alias'])) { return true; } diff --git a/src/Extractor/TikaTextExtractor.php b/src/Extractor/TikaTextExtractor.php index 0d4b18f..a6a21bf 100644 --- a/src/Extractor/TikaTextExtractor.php +++ b/src/Extractor/TikaTextExtractor.php @@ -2,8 +2,6 @@ namespace SilverStripe\TextExtraction\Extractor; -use SilverStripe\TextExtraction\Extractor\FileTextExtractor; - /** * Enables text extraction of file content via the Tika CLI * @@ -47,13 +45,13 @@ class TikaTextExtractor extends FileTextExtractor */ protected function runShell($command, &$stdout = '', &$stderr = '', $input = '') { - $descriptorSpecs = array( - 0 => array("pipe", "r"), - 1 => array("pipe", "w"), - 2 => array("pipe", "w") - ); + $descriptorSpecs = [ + 0 => ["pipe", "r"], + 1 => ["pipe", "w"], + 2 => ["pipe", "w"] + ]; // Invoke command - $pipes = array(); + $pipes = []; $proc = proc_open($command, 
$descriptorSpecs, $pipes); if (!is_resource($proc)) { @@ -75,7 +73,6 @@ class TikaTextExtractor extends FileTextExtractor } /** - * * @param string $path * @return string */ @@ -91,8 +88,7 @@ class TikaTextExtractor extends FileTextExtractor } /** - * - * @return boolean + * @return bool */ public function isAvailable() { @@ -100,8 +96,7 @@ class TikaTextExtractor extends FileTextExtractor } /** - * - * @return boolean + * @return bool */ public function supportsExtension($extension) { @@ -111,9 +106,8 @@ class TikaTextExtractor extends FileTextExtractor /** - * * @param string $mime - * @return boolean + * @return bool */ public function supportsMime($mime) { @@ -121,8 +115,9 @@ class TikaTextExtractor extends FileTextExtractor $code = $this->runShell('tika --list-supported-types', $supportedTypes, $error); if ($code) { + // Error case return false; - } // Error case + } // Check if the mime type is inside the result $pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/')); diff --git a/src/Rest/TikaRestClient.php b/src/Rest/TikaRestClient.php index 8473f67..cd8b5c7 100644 --- a/src/Rest/TikaRestClient.php +++ b/src/Rest/TikaRestClient.php @@ -2,11 +2,11 @@ namespace SilverStripe\TextExtraction\Rest; -use GuzzleHttp\Client, - GuzzleHttp\Exception\RequestException, - SilverStripe\Core\Environment, - Psr\Log\LoggerInterface, - SilverStripe\Core\Injector\Injector; +use GuzzleHttp\Client; +use GuzzleHttp\Exception\RequestException; +use Psr\Log\LoggerInterface; +use SilverStripe\Core\Environment; +use SilverStripe\Core\Injector\Injector; class TikaRestClient extends Client { @@ -15,12 +15,12 @@ class TikaRestClient extends Client * * @var array */ - protected $options = array('username' => null, 'password' => null); + protected $options = ['username' => null, 'password' => null]; /** * @var array */ - protected $mimes = array(); + protected $mimes = []; /** * @@ -29,16 +29,16 @@ class TikaRestClient extends Client */ public function __construct($baseUrl = '', $config 
= null) { - $psswd = Environment::getEnv('SS_TIKA_PASSWORD'); + $password = Environment::getEnv('SS_TIKA_PASSWORD'); - if (!empty($psswd)) { - $this->options = array( + if (!empty($password)) { + $this->options = [ 'username' => Environment::getEnv('SS_TIKA_USERNAME'), - 'password' => $psswd, - ); + 'password' => $password, + ]; } - parent::__construct($baseUrl, $config); + parent::__construct($config); } /** @@ -120,7 +120,7 @@ class TikaRestClient extends Client try { $response = $this->put( 'tika', - array('Accept' => 'text/plain'), + ['Accept' => 'text/plain'], file_get_contents($file) ); $response->setAuth($this->options['username'], $this->options['password']); From fe5148e67810f371435ef60da2049ec9fe1bbc82 Mon Sep 17 00:00:00 2001 From: Robbie Averill Date: Tue, 3 Jul 2018 11:35:24 +1200 Subject: [PATCH 04/16] API Add namespaces to tests and update SapphireTest implementation --- tests/FileTextCacheDatabaseTest.php | 20 ++++++------ tests/FileTextExtractableTest.php | 49 ++++++++++++++++------------- tests/HTMLTextExtractorTest.php | 9 +++++- tests/PDFTextExtractorTest.php | 15 ++++++--- tests/TikaTextExtractorTest.php | 10 ++++-- 5 files changed, 64 insertions(+), 39 deletions(-) diff --git a/tests/FileTextCacheDatabaseTest.php b/tests/FileTextCacheDatabaseTest.php index e300c19..e7fb242 100644 --- a/tests/FileTextCacheDatabaseTest.php +++ b/tests/FileTextCacheDatabaseTest.php @@ -1,23 +1,23 @@ set(Database::class, 'max_content_length', 5); - Config::inst()->update('FileTextCache_Database', 'max_content_length', 5); - $cache = new FileTextCache_Database(); - $file = $this->getMock('File', array('write')); + $cache = new Database(); + $file = $this->getMockBuilder(File::class)->setMethods(['write'])->getMock(); $content = '0123456789'; $cache->save($file, $content); - $this->assertEquals($cache->load($file), '01234'); - Config::unnest(); + $this->assertEquals($cache->load($file), '01234'); } } diff --git a/tests/FileTextExtractableTest.php 
b/tests/FileTextExtractableTest.php index 166b1ee..4a1edf2 100644 --- a/tests/FileTextExtractableTest.php +++ b/tests/FileTextExtractableTest.php @@ -1,46 +1,53 @@ array('FileTextExtractable') - ); + protected $usesDatabase = true; - public function setUp() + protected static $required_extensions = [ + File::class => [ + FileTextExtractable::class, + ], + ]; + + protected function setUp() { parent::setUp(); // Ensure that html is a valid extension - Config::inst() - ->nest() - ->update('File', 'allowed_extensions', array('html')); - } - - public function tearDown() - { - Config::unnest(); - parent::tearDown(); + Config::modify()->merge(File::class, 'allowed_extensions', ['html']); } public function testExtractFileAsText() { // Create a copy of the file, as it may be clobbered by the test // ($file->extractFileAsText() calls $file->write) - copy(BASE_PATH.'/textextraction/tests/fixtures/test1.html', BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html'); - + copy( + dirname(__FILE__) . '/fixtures/test1.html', + dirname(__FILE__) . '/fixtures/test1-copy.html' + ); + // Use HTML, since the extractor is always available - $file = new File(array( + $file = new File([ 'Name' => 'test1-copy.html', - 'Filename' => 'textextraction/tests/fixtures/test1-copy.html' - )); + 'Filename' => dirname(__FILE__) . '/fixtures/test1-copy.html' + ]); $file->write(); - + $content = $file->extractFileAsText(); $this->assertContains('Test Headline', $content); $this->assertContains('Test Text', $content); $this->assertEquals($content, $file->FileContentCache); - if (file_exists(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html')) { - unlink(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html'); + if (file_exists(dirname(__FILE__) . '/fixtures/test1-copy.html')) { + unlink(dirname(__FILE__) . 
'/fixtures/test1-copy.html'); } } } diff --git a/tests/HTMLTextExtractorTest.php b/tests/HTMLTextExtractorTest.php index 8ff8b0b..59b8018 100644 --- a/tests/HTMLTextExtractorTest.php +++ b/tests/HTMLTextExtractorTest.php @@ -1,11 +1,18 @@ getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html'); + $content = $extractor->getContent(dirname(__FILE__) . '/fixtures/test1.html'); + $this->assertContains('Test Headline', $content); $this->assertNotContains('Test Comment', $content, 'Strips HTML comments'); $this->assertNotContains('Test Style', $content, 'Strips non-content style tags'); diff --git a/tests/PDFTextExtractorTest.php b/tests/PDFTextExtractorTest.php index 96ad1ec..dfbffcd 100644 --- a/tests/PDFTextExtractorTest.php +++ b/tests/PDFTextExtractorTest.php @@ -1,17 +1,22 @@ isAvailable()) { - $this->setExpectedException( - 'FileTextExtractor_Exception', - 'getRawOutput called on unavailable extractor' - ); + $this->expectException(Exception::class); + $this->expectExceptionMessage('getRawOutput called on unavailable extractor'); } - $content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'); + $content = $extractor->getContent(dirname(__FILE__) . '/fixtures/test1.pdf'); $this->assertContains('This is a test file with a link', $content); } } diff --git a/tests/TikaTextExtractorTest.php b/tests/TikaTextExtractorTest.php index 0342dcf..6c8e801 100644 --- a/tests/TikaTextExtractorTest.php +++ b/tests/TikaTextExtractorTest.php @@ -1,5 +1,11 @@ getContent($file); $this->assertContains('This is a test file with a link', $content); @@ -31,7 +37,7 @@ class TikaTextExtractorTest extends SapphireTest } // Check file - $file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'; + $file = dirname(__FILE__) . 
'/fixtures/test1.pdf'; $content = $extractor->getContent($file); $this->assertContains('This is a test file with a link', $content); From 8d295ada9c9d799fb42a1afd55c7c1931c71c96c Mon Sep 17 00:00:00 2001 From: Robbie Averill Date: Tue, 3 Jul 2018 11:35:52 +1200 Subject: [PATCH 05/16] Add phpunit/phpcs configuration and update Travis configuration --- .travis.yml | 48 ++++++++++++++++++++++++++++-------------------- codecov.yml | 3 +++ phpcs.xml.dist | 11 +++++++++++ phpunit.xml.dist | 14 ++++++++++++++ 4 files changed, 56 insertions(+), 20 deletions(-) create mode 100644 codecov.yml create mode 100644 phpcs.xml.dist create mode 100644 phpunit.xml.dist diff --git a/.travis.yml b/.travis.yml index 0b96f17..986569e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,39 +1,47 @@ -# See https://github.com/silverstripe/silverstripe-travis-support for setup details language: php -sudo: false - addons: apt: packages: - poppler-utils +env: + global: + - COMPOSER_ROOT_VERSION=3.x-dev + - SS_TIKA_ENDPOINT="http://localhost:9998/" + matrix: include: - - php: 5.4 - env: DB=PGSQL CORE_RELEASE=3.2 - - php: 5.5 - env: DB=PGSQL CORE_RELEASE=3.3 - php: 5.6 - env: DB=PGSQL CORE_RELEASE=3.4 - - php: 5.6 - env: DB=MYSQL CORE_RELEASE=3.5 + env: DB=MYSQL RECIPE_VERSION=1.0.x-dev PHPCS_TEST=1 PHPUNIT_TEST=1 - php: 7.0 - env: DB=MYSQL CORE_RELEASE=3.6 + env: DB=MYSQL RECIPE_VERSION=1.1.x-dev PHPUNIT_TEST=1 - php: 7.1 - env: DB=MYSQL CORE_RELEASE=3 + env: DB=PGSQL RECIPE_VERSION=4.2.x-dev PHPUNIT_COVERAGE_TEST=1 + - php: 7.2 + env: DB=MYSQL RECIPE_VERSION=4.x-dev PHPUNIT_TEST=1 before_script: - - composer self-update || true + # Init PHP + - phpenv rehash + - phpenv config-rm xdebug.ini + + # Configure Tika bin - mkdir -p $HOME/bin - export PATH=$PATH:$HOME/bin - - export SS_TIKA_ENDPOINT="http://localhost:9998/" - ./.travis/install_tika.sh - - git clone git://github.com/silverstripe/silverstripe-travis-support.git ~/travis-support - - php ~/travis-support/travis_setup.php --source `pwd` 
--target ~/builds/ss - - cd ~/builds/ss - - composer install + - ($HOME/bin/tika-rest-server &) &> /dev/null + + # Install composer dependencies + - composer validate + - composer require --no-update silverstripe/recipe-core "$RECIPE_VERSION" + - if [[ $DB == PGSQL ]]; then composer require --no-update silverstripe/postgresql 2.1.x-dev; fi + - composer install --prefer-dist --no-interaction --no-progress --no-suggest --optimize-autoloader --verbose --profile script: - - ($HOME/bin/tika-rest-server &) &> /dev/null - - vendor/bin/phpunit --verbose textextraction/tests/ + - if [[ $PHPUNIT_TEST ]]; then vendor/bin/phpunit; fi + - if [[ $PHPUNIT_COVERAGE_TEST ]]; then phpdbg -qrr vendor/bin/phpunit --coverage-clover=coverage.xml; fi + - if [[ $PHPCS_TEST ]]; then vendor/bin/phpcs src/ tests/; fi + +after_success: + - if [[ $PHPUNIT_COVERAGE_TEST ]]; then bash <(curl -s https://codecov.io/bash) -f coverage.xml; fi diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000..9e7c838 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,3 @@ +comment: false +codecov: + branch: master diff --git a/phpcs.xml.dist b/phpcs.xml.dist new file mode 100644 index 0000000..a2a2e85 --- /dev/null +++ b/phpcs.xml.dist @@ -0,0 +1,11 @@ + + + CodeSniffer ruleset for SilverStripe coding conventions. 
+ + + + + + + + diff --git a/phpunit.xml.dist b/phpunit.xml.dist new file mode 100644 index 0000000..ff569fc --- /dev/null +++ b/phpunit.xml.dist @@ -0,0 +1,14 @@ + + + tests/php + + + + + src/ + + tests/ + + + + From e2404fc9047aaf3352b84ba5da42c68e3ac410b9 Mon Sep 17 00:00:00 2001 From: Robbie Averill Date: Tue, 3 Jul 2018 11:36:04 +1200 Subject: [PATCH 06/16] Update gitattributes and Scrutinizer configuration --- .gitattributes | 1 + .scrutinizer.yml | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.gitattributes b/.gitattributes index 475f5f2..89eb187 100644 --- a/.gitattributes +++ b/.gitattributes @@ -4,3 +4,4 @@ /.gitignore export-ignore /.travis.yml export-ignore /.scrutinizer.yml export-ignore +/codecov.yml export-ignore diff --git a/.scrutinizer.yml b/.scrutinizer.yml index 61b0c9f..a22afca 100644 --- a/.scrutinizer.yml +++ b/.scrutinizer.yml @@ -1,9 +1,13 @@ inherit: true checks: - php: - code_rating: true - duplication: true + php: true + +build: + nodes: + analysis: + tests: + override: [php-scrutinizer-run] filter: - paths: [code/*, tests/*] + paths: [src/*, tests/*] From 8bd019b2aa9f6979aa251945007273dfdb7622cf Mon Sep 17 00:00:00 2001 From: Robbie Averill Date: Tue, 3 Jul 2018 11:37:38 +1200 Subject: [PATCH 07/16] Update codebase to ensure relative PSR-2 compliance --- phpcs.xml.dist | 3 +-- src/Extension/FileTextExtractable.php | 4 ++-- src/Extractor/HTMLTextExtractor.php | 5 +++-- src/Extractor/PDFTextExtractor.php | 1 - src/Extractor/SolrCellTextExtractor.php | 3 ++- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/phpcs.xml.dist b/phpcs.xml.dist index a2a2e85..a504558 100644 --- a/phpcs.xml.dist +++ b/phpcs.xml.dist @@ -5,7 +5,6 @@ - - + diff --git a/src/Extension/FileTextExtractable.php b/src/Extension/FileTextExtractable.php index 8347486..b5f7896 100644 --- a/src/Extension/FileTextExtractable.php +++ b/src/Extension/FileTextExtractable.php @@ -80,8 +80,8 @@ class FileTextExtractable extends 
DataExtension } /** - * Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text. - * The value is also cached into the File record itself. + * Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and + * returns the text. The value is also cached into the File record itself. * * @param boolean $disableCache If false, the file content is only parsed on demand. * If true, the content parsing is forced, bypassing diff --git a/src/Extractor/HTMLTextExtractor.php b/src/Extractor/HTMLTextExtractor.php index df05917..78c8440 100644 --- a/src/Extractor/HTMLTextExtractor.php +++ b/src/Extractor/HTMLTextExtractor.php @@ -3,7 +3,9 @@ namespace SilverStripe\TextExtraction\Extractor; /** - * Text extractor that uses php function strip_tags to get just the text. OK for indexing, not the best for readable text. + * Text extractor that uses php function strip_tags to get just the text. OK for indexing, not + * the best for readable text. 
+ * * @author mstephens */ class HTMLTextExtractor extends FileTextExtractor @@ -82,5 +84,4 @@ class HTMLTextExtractor extends FileTextExtractor return strip_tags($content); } - } diff --git a/src/Extractor/PDFTextExtractor.php b/src/Extractor/PDFTextExtractor.php index fb2d793..b927900 100644 --- a/src/Extractor/PDFTextExtractor.php +++ b/src/Extractor/PDFTextExtractor.php @@ -144,5 +144,4 @@ class PDFTextExtractor extends FileTextExtractor return str_replace(array_keys($mapping), array_values($mapping), $input); } - } diff --git a/src/Extractor/SolrCellTextExtractor.php b/src/Extractor/SolrCellTextExtractor.php index 0e26b74..a19282f 100644 --- a/src/Extractor/SolrCellTextExtractor.php +++ b/src/Extractor/SolrCellTextExtractor.php @@ -134,7 +134,8 @@ class SolrCellTextExtractor extends FileTextExtractor return null; } catch (Exception $e) { - // Catch other errors that Tika can throw vai Guzzle but are not caught and break Solr search query in some cases. + // Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search + // query in some cases.
$msg = sprintf( 'Tika server error attempting to extract from "%s" (message: %s)', $path, From edb02e91897bf44ebe5150d28f8c3c07272e25d9 Mon Sep 17 00:00:00 2001 From: Robbie Averill Date: Tue, 3 Jul 2018 15:55:02 +1200 Subject: [PATCH 08/16] API FileTextExtractable::getContent now takes a File instance instead of a path --- _config/cache.yml | 3 +- _config/config.yml | 10 +++++ src/Extension/FileTextExtractable.php | 20 ++++----- src/Extractor/FileTextExtractor.php | 51 +++++++++++++++++++---- src/Extractor/HTMLTextExtractor.php | 9 ++-- src/Extractor/PDFTextExtractor.php | 16 ++++--- src/Extractor/SolrCellTextExtractor.php | 12 +++--- src/Extractor/TikaServerTextExtractor.php | 6 ++- src/Extractor/TikaTextExtractor.php | 11 +++-- tests/FileTextExtractableTest.php | 35 +++++++++------- tests/HTMLTextExtractorTest.php | 17 +++++++- tests/PDFTextExtractorTest.php | 9 +++- 12 files changed, 138 insertions(+), 61 deletions(-) create mode 100644 _config/config.yml diff --git a/_config/cache.yml b/_config/cache.yml index ff793b2..2f82c29 100644 --- a/_config/cache.yml +++ b/_config/cache.yml @@ -3,9 +3,8 @@ Name: textextractioncache After: - '#corecache' --- - SilverStripe\Core\Injector\Injector: Psr\SimpleCache\CacheInterface.FileTextCache_Cache: factory: SilverStripe\Core\Cache\CacheFactory constructor: - namespace: 'FileTextCache_Cache' \ No newline at end of file + namespace: 'FileTextCache_Cache' diff --git a/_config/config.yml b/_config/config.yml new file mode 100644 index 0000000..0a0982d --- /dev/null +++ b/_config/config.yml @@ -0,0 +1,10 @@ +--- +Name: textextractionconfig +--- +SilverStripe\Core\Injector\Injector: + # Define default FileTextCache implementation + SilverStripe\TextExtraction\Cache\FileTextCache: + class: SilverStripe\TextExtraction\Cache\FileTextCache\Database + +SilverStripe\TextExtraction\Cache\FileTextCache\Database: + max_content_length: 500000 diff --git a/src/Extension/FileTextExtractable.php b/src/Extension/FileTextExtractable.php index 
b5f7896..fedccbc 100644 --- a/src/Extension/FileTextExtractable.php +++ b/src/Extension/FileTextExtractable.php @@ -2,7 +2,7 @@ namespace SilverStripe\TextExtraction\Extension; -use SilverStripe\Control\Director; +use SilverStripe\Assets\File; use SilverStripe\ORM\DataExtension; use SilverStripe\TextExtraction\Cache\FileTextCache; use SilverStripe\TextExtraction\Extractor\FileTextExtractor; @@ -14,12 +14,10 @@ use SilverStripe\TextExtraction\Extractor\FileTextExtractor; * Adds an additional property which is the cached contents, which is populated on demand. * * @author mstephens - * */ class FileTextExtractable extends DataExtension { /** - * * @var array * @config */ @@ -28,7 +26,6 @@ class FileTextExtractable extends DataExtension ]; /** - * * @var array * @config */ @@ -37,12 +34,11 @@ class FileTextExtractable extends DataExtension ]; /** - * * @var array * @config */ private static $dependencies = [ - 'TextCache' => FileTextCache\Cache::class, + 'TextCache' => '%$' . FileTextCache::class, ]; /** @@ -51,7 +47,6 @@ class FileTextExtractable extends DataExtension protected $fileTextCache = null; /** - * * @param FileTextCache $cache * @return $this */ @@ -90,27 +85,28 @@ class FileTextExtractable extends DataExtension */ public function extractFileAsText($disableCache = false) { + /** @var File $file */ + $file = $this->owner; if (!$disableCache) { - $text = $this->getTextCache()->load($this->owner); + $text = $this->getTextCache()->load($file); if ($text) { return $text; } } // Determine which extractor can process this file. - $path = Director::baseFolder() . '/' . 
$this->owner->getFilename(); - $extractor = FileTextExtractor::for_file($path); + $extractor = FileTextExtractor::for_file($file); if (!$extractor) { return null; } - $text = $extractor->getContent($path); + $text = $extractor->getContent($file); if (!$text) { return null; } if (!$disableCache) { - $this->getTextCache()->save($this->owner, $text); + $this->getTextCache()->save($file, $text); } return $text; diff --git a/src/Extractor/FileTextExtractor.php b/src/Extractor/FileTextExtractor.php index fd3cf5c..57a82ef 100644 --- a/src/Extractor/FileTextExtractor.php +++ b/src/Extractor/FileTextExtractor.php @@ -2,10 +2,12 @@ namespace SilverStripe\TextExtraction\Extractor; +use SilverStripe\Assets\File; use SilverStripe\Core\ClassInfo; use SilverStripe\Core\Config\Config; use SilverStripe\Core\Config\Configurable; use SilverStripe\Core\Injector\Injector; +use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception; /** * A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents. 
@@ -83,17 +85,19 @@ abstract class FileTextExtractor } /** - * @param string $path + * Given a File object, decide which extractor instance to use to handle it + * + * @param File $file * @return FileTextExtractor|null */ - public static function for_file($path) + public static function for_file(File $file) { - if (!file_exists($path) || is_dir($path)) { + if (!$file) { return null; } - $extension = pathinfo($path, PATHINFO_EXTENSION); - $mime = self::get_mime($path); + $extension = $file->getExtension(); + $mime = $file->getMimeType(); foreach (self::get_extractor_classes() as $className) { $extractor = self::get_extractor($className); @@ -115,6 +119,37 @@ abstract class FileTextExtractor } } + /** + * Some text extractors (like pdftotext) may require a physical file to read from, so write the current + * file contents to a temp file and return its path + * + * @param File $file + * @return string + * @throws Exception + */ + protected function getPathFromFile(File $file) + { + $path = tempnam(TEMP_PATH, 'pdftextextractor_'); + if (false === $path) { + throw new Exception(static::class . '->getPathFromFile() could not allocate temporary file name'); + } + + // Remove the empty placeholder created by tempnam() so it is not orphaned when an extension is appended + unlink($path); + + // Append extension to temp file if one is set + if ($file->getExtension()) { + $path .= '.' . $file->getExtension(); + } + + $bytesWritten = file_put_contents($path, $file->getStream()); + if (false === $bytesWritten) { + throw new Exception(static::class . '->getPathFromFile() failed to write temporary file'); + } + + return $path; + } + /** * Checks if the extractor is supported on the current environment, * for example if the correct binaries or libraries are available. @@ -142,10 +177,10 @@ abstract class FileTextExtractor abstract public function supportsMime($mime); /** - * Given a file path, extract the contents as text. + * Given a File instance, extract the contents as text.
* - * @param string $path + * @param File $file * @return string */ - abstract public function getContent($path); + abstract public function getContent(File $file); } diff --git a/src/Extractor/HTMLTextExtractor.php b/src/Extractor/HTMLTextExtractor.php index 78c8440..d1b56b9 100644 --- a/src/Extractor/HTMLTextExtractor.php +++ b/src/Extractor/HTMLTextExtractor.php @@ -2,6 +2,8 @@ namespace SilverStripe\TextExtraction\Extractor; +use SilverStripe\Assets\File; + /** * Text extractor that uses php function strip_tags to get just the text. OK for indexing, not * the best for readable text. @@ -49,12 +51,13 @@ class HTMLTextExtractor extends FileTextExtractor * combined with regular expressions to remove non-content tags like