API Update namespaces for FileTextCache and add upgrader mapping

This commit is contained in:
Robbie Averill 2018-07-03 11:23:27 +12:00
parent f1bacd2aa9
commit 66c9db8c0d
16 changed files with 225 additions and 229 deletions

14
.upgrade.yml Normal file
View File

@ -0,0 +1,14 @@
# Upgrader mappings: legacy (un-namespaced) class name => fully qualified namespaced class name.
mappings:
  FileTextExtractable: SilverStripe\TextExtraction\Extension\FileTextExtractable
  FileTextCache: SilverStripe\TextExtraction\Cache\FileTextCache
  FileTextCache_Cache: SilverStripe\TextExtraction\Cache\FileTextCache\Cache
  FileTextCache_Database: SilverStripe\TextExtraction\Cache\FileTextCache\Database
  FileTextExtractor: SilverStripe\TextExtraction\Extractor\FileTextExtractor
  FileTextExtractor_Exception: SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception
  HTMLTextExtractor: SilverStripe\TextExtraction\Extractor\HTMLTextExtractor
  PDFTextExtractor: SilverStripe\TextExtraction\Extractor\PDFTextExtractor
  SolrCellTextExtractor: SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor
  TikaServerTextExtractor: SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor
  TikaTextExtractor: SilverStripe\TextExtraction\Extractor\TikaTextExtractor
  TikaRestClient: SilverStripe\TextExtraction\Rest\TikaRestClient

View File

@ -1,12 +0,0 @@
# Changelog
All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).
## [2.0.1]
Using Symfony mime type detection
## [2.0.0]
Clarified Tika docs

View File

View File

@ -1,6 +1,6 @@
<?php <?php
namespace SilverStripe\TextExtraction\Extension; namespace SilverStripe\TextExtraction\Cache;
use SilverStripe\Assets\File; use SilverStripe\Assets\File;

View File

@ -1,19 +1,21 @@
<?php <?php
namespace SilverStripe\TextExtraction\Extension; namespace SilverStripe\TextExtraction\Cache\FileTextCache;
use SilverStripe\Assets\File, use Psr\SimpleCache\CacheInterface;
SilverStripe\Core\Config\Config, use SilverStripe\Assets\File;
SilverStripe\TextExtraction\Extension\FileTextCache, use SilverStripe\Core\Config\Configurable;
SilverStripe\Core\Flushable, use SilverStripe\Core\Flushable;
Psr\SimpleCache\CacheInterface, use SilverStripe\Core\Injector\Injector;
SilverStripe\Core\Injector\Injector; use SilverStripe\TextExtraction\Cache\FileTextCache;
/** /**
* Uses SS_Cache with a lifetime to cache extracted content * Uses SS_Cache with a lifetime to cache extracted content
*/ */
class FileTextCache_Cache implements FileTextCache, Flushable class Cache implements FileTextCache, Flushable
{ {
use Configurable;
/** /**
* Lifetime of cache in seconds * Lifetime of cache in seconds
* Null is indefinite * Null is indefinite
@ -46,7 +48,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable
/** /**
* *
* @param File $file * @param File $file
* @return type * @return mixed
*/ */
public function load(File $file) public function load(File $file)
{ {
@ -63,8 +65,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable
*/ */
public function save(File $file, $content) public function save(File $file, $content)
{ {
$lifetime = Config::inst()->get(__CLASS__, 'lifetime'); $lifetime = $this->config()->get('lifetime') ?: 3600;
$lifetime = $lifetime ?: 3600;
$key = $this->getKey($file); $key = $this->getKey($file);
$cache = self::get_cache(); $cache = self::get_cache();
@ -94,7 +95,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable
/** /**
* *
* @param File $file * @param File $file
* @return type * @return bool
*/ */
public function invalidate(File $file) public function invalidate(File $file)
{ {

View File

@ -1,17 +1,25 @@
<?php <?php
namespace SilverStripe\TextExtraction\Extension; namespace SilverStripe\TextExtraction\Cache\FileTextCache;
use SilverStripe\Assets\File, use SilverStripe\Assets\File;
SilverStripe\Core\Config\Config, use SilverStripe\Core\Config\Configurable;
SilverStripe\TextExtraction\Extension\FileTextCache; use SilverStripe\TextExtraction\Cache\FileTextCache;
/** /**
* Caches the extracted content on the record for the file. * Caches the extracted content on the record for the file.
* Limits the stored file content by default to avoid hitting query size limits. * Limits the stored file content by default to avoid hitting query size limits.
*/ */
class FileTextCache_Database implements FileTextCache class Database implements FileTextCache
{ {
use Configurable;
/**
* @config
* @var int
*/
private static $max_content_length = null;
/** /**
* *
* @param File $file * @param File $file
@ -28,7 +36,7 @@ class FileTextCache_Database implements FileTextCache
*/ */
public function save(File $file, $content) public function save(File $file, $content)
{ {
$maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length'); $maxLength = $this->config()->get('max_content_length');
$file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content; $file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content;
$file->write(); $file->write();
} }

View File

@ -1,9 +0,0 @@
<?php
namespace SilverStripe\TextExtraction\Exception;
use \Exception;
class FileTextExtractor_Exception extends Exception
{
}

View File

@ -2,9 +2,10 @@
namespace SilverStripe\TextExtraction\Extension; namespace SilverStripe\TextExtraction\Extension;
use SilverStripe\ORM\DataExtension, use SilverStripe\Control\Director;
SilverStripe\TextExtraction\Extension\FileTextCache, use SilverStripe\ORM\DataExtension;
SilverStripe\Control\Director; use SilverStripe\TextExtraction\Cache\FileTextCache;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/** /**
* Decorate File or a File derivative to enable text extraction from the file content. Uses a set of subclasses of * Decorate File or a File derivative to enable text extraction from the file content. Uses a set of subclasses of
@ -22,27 +23,27 @@ class FileTextExtractable extends DataExtension
* @var array * @var array
* @config * @config
*/ */
private static $db = array( private static $db = [
'FileContentCache' => 'Text' 'FileContentCache' => 'Text'
); ];
/** /**
* *
* @var array * @var array
* @config * @config
*/ */
private static $casting = array( private static $casting = [
'FileContent' => 'Text' 'FileContent' => 'Text'
); ];
/** /**
* *
* @var array * @var array
* @config * @config
*/ */
private static $dependencies = array( private static $dependencies = [
'TextCache' => '%$SilverStripe\TextExtraction\Extension\FileTextCache_Cache' 'TextCache' => FileTextCache\Cache::class,
); ];
/** /**
* @var FileTextCache * @var FileTextCache
@ -52,11 +53,12 @@ class FileTextExtractable extends DataExtension
/** /**
* *
* @param FileTextCache $cache * @param FileTextCache $cache
* @return void * @return $this
*/ */
public function setTextCache(FileTextCache $cache) public function setTextCache(FileTextCache $cache)
{ {
$this->fileTextCache = $cache; $this->fileTextCache = $cache;
return $this;
} }
/** /**
@ -84,7 +86,7 @@ class FileTextExtractable extends DataExtension
* @param boolean $disableCache If false, the file content is only parsed on demand. * @param boolean $disableCache If false, the file content is only parsed on demand.
* If true, the content parsing is forced, bypassing * If true, the content parsing is forced, bypassing
* the cached version * the cached version
* @return mixed string | null * @return string|null
*/ */
public function extractFileAsText($disableCache = false) public function extractFileAsText($disableCache = false)
{ {

View File

@ -2,17 +2,18 @@
namespace SilverStripe\TextExtraction\Extractor; namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Core\Config\Config, use SilverStripe\Core\ClassInfo;
SilverStripe\Core\Injector\Injector, use SilverStripe\Core\Config\Config;
SilverStripe\Core\ClassInfo; use SilverStripe\Core\Config\Configurable;
use SilverStripe\Core\Injector\Injector;
/** /**
* A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents. * A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents.
* @author mstephens * @author mstephens
*
*/ */
abstract class FileTextExtractor abstract class FileTextExtractor
{ {
use Configurable;
/** /**
* Set priority from 0-100. * Set priority from 0-100.
@ -45,7 +46,7 @@ abstract class FileTextExtractor
// Generate the sorted list of extractors on demand. // Generate the sorted list of extractors on demand.
$classes = ClassInfo::subclassesFor(__CLASS__); $classes = ClassInfo::subclassesFor(__CLASS__);
array_shift($classes); array_shift($classes);
$classPriorities = array(); $classPriorities = [];
foreach ($classes as $class) { foreach ($classes as $class) {
$classPriorities[$class] = Config::inst()->get($class, 'priority'); $classPriorities[$class] = Config::inst()->get($class, 'priority');
@ -76,19 +77,19 @@ abstract class FileTextExtractor
*/ */
protected static function get_mime($path) protected static function get_mime($path)
{ {
$file = new Symfony\Component\HttpFoundation\File\File($path); $file = new \Symfony\Component\HttpFoundation\File\File($path);
return $file->getMimeType(); return $file->getMimeType();
} }
/** /**
* @param string $path * @param string $path
* @return mixed FileTextExtractor | null * @return FileTextExtractor|null
*/ */
public static function for_file($path) public static function for_file($path)
{ {
if (!file_exists($path) || is_dir($path)) { if (!file_exists($path) || is_dir($path)) {
return; return null;
} }
$extension = pathinfo($path, PATHINFO_EXTENSION); $extension = pathinfo($path, PATHINFO_EXTENSION);
@ -132,7 +133,7 @@ abstract class FileTextExtractor
abstract public function supportsExtension($extension); abstract public function supportsExtension($extension);
/** /**
* Determine if this extractor suports the given mime type. * Determine if this extractor supports the given mime type.
* Will only be called if supportsExtension returns false. * Will only be called if supportsExtension returns false.
* *
* @param string $mime * @param string $mime

View File

@ -0,0 +1,7 @@
<?php
namespace SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/**
 * Exception type declared in the FileTextExtractor namespace of this module.
 *
 * Adds no behaviour of its own on top of the built-in \Exception; it exists so
 * that extractor failures can be thrown and caught under a module-specific type.
 */
class Exception extends \Exception
{
}

View File

@ -2,18 +2,21 @@
namespace SilverStripe\TextExtraction\Extractor; namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/** /**
* Text extractor that uses php function strip_tags to get just the text. OK for indexing, not the best for readable text. * Text extractor that uses php function strip_tags to get just the text. OK for indexing, not the best for readable text.
* @author mstephens * @author mstephens
*
*/ */
class HTMLTextExtractor extends FileTextExtractor class HTMLTextExtractor extends FileTextExtractor
{ {
/**
* Lower priority because it's not the most clever HTML extraction. If there is something better, use it
*
* @config
* @var integer
*/
private static $priority = 10;
/** /**
*
* @return boolean * @return boolean
*/ */
public function isAvailable() public function isAvailable()
@ -22,19 +25,15 @@ class HTMLTextExtractor extends FileTextExtractor
} }
/** /**
*
* @param string $extension * @param string $extension
* @return boolean * @return boolean
*/ */
public function supportsExtension($extension) public function supportsExtension($extension)
{ {
return in_array( return in_array(strtolower($extension), ["html", "htm", "xhtml"]);
strtolower($extension), array("html", "htm", "xhtml")
);
} }
/** /**
*
* @param string $mime * @param string $mime
* @return boolean * @return boolean
*/ */
@ -43,14 +42,6 @@ class HTMLTextExtractor extends FileTextExtractor
return strtolower($mime) === 'text/html'; return strtolower($mime) === 'text/html';
} }
/**
* Lower priority because it's not the most clever HTML extraction. If there is something better, use it
*
* @config
* @var integer
*/
private static $priority = 10;
/** /**
* Extracts content from regex, by using strip_tags() * Extracts content from regex, by using strip_tags()
* combined with regular expressions to remove non-content tags like <style> or <script>, * combined with regular expressions to remove non-content tags like <style> or <script>,
@ -65,29 +56,30 @@ class HTMLTextExtractor extends FileTextExtractor
// Yes, yes, regex'ing HTML is evil. // Yes, yes, regex'ing HTML is evil.
// Since we don't care about well-formedness or markup here, it does the job. // Since we don't care about well-formedness or markup here, it does the job.
$content = preg_replace( $content = preg_replace(
array( [
// Remove invisible content // Remove invisible content
'@<head[^>]*?>.*?</head>@siu', '@<head[^>]*?>.*?</head>@siu',
'@<style[^>]*?>.*?</style>@siu', '@<style[^>]*?>.*?</style>@siu',
'@<script[^>]*?.*?</script>@siu', '@<script[^>]*?.*?</script>@siu',
'@<object[^>]*?.*?</object>@siu', '@<object[^>]*?.*?</object>@siu',
'@<embed[^>]*?.*?</embed>@siu', '@<embed[^>]*?.*?</embed>@siu',
'@<applet[^>]*?.*?</applet>@siu', '@<applet[^>]*?.*?</applet>@siu',
'@<noframes[^>]*?.*?</noframes>@siu', '@<noframes[^>]*?.*?</noframes>@siu',
'@<noscript[^>]*?.*?</noscript>@siu', '@<noscript[^>]*?.*?</noscript>@siu',
'@<noembed[^>]*?.*?</noembed>@siu', '@<noembed[^>]*?.*?</noembed>@siu',
// Add line breaks before and after blocks // Add line breaks before and after blocks
'@</?((address)|(blockquote)|(center)|(del))@iu', '@</?((address)|(blockquote)|(center)|(del))@iu',
'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu', '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu', '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
'@</?((table)|(th)|(td)|(caption))@iu', '@</?((table)|(th)|(td)|(caption))@iu',
'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu', '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu', '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
'@</?((frameset)|(frame)|(iframe))@iu', '@</?((frameset)|(frame)|(iframe))@iu',
), array( ],
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0", [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0"],
), $content $content
); );
return strip_tags($content); return strip_tags($content);
} }

View File

@ -2,17 +2,14 @@
namespace SilverStripe\TextExtraction\Extractor; namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor, use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
SilverStripe\TextExtraction\Exception\FileTextExtractor_Exception;
/** /**
* Text extractor that calls pdftotext to do the conversion. * Text extractor that calls pdftotext to do the conversion.
* @author mstephens * @author mstephens
*
*/ */
class PDFTextExtractor extends FileTextExtractor class PDFTextExtractor extends FileTextExtractor
{ {
/** /**
* Set to bin path this extractor can execute * Set to bin path this extractor can execute
* *
@ -27,10 +24,10 @@ class PDFTextExtractor extends FileTextExtractor
* @config * @config
* @var array * @var array
*/ */
private static $search_binary_locations = array( private static $search_binary_locations = [
'/usr/bin', '/usr/bin',
'/usr/local/bin', '/usr/local/bin',
); ];
public function isAvailable() public function isAvailable()
{ {
@ -46,12 +43,13 @@ class PDFTextExtractor extends FileTextExtractor
public function supportsMime($mime) public function supportsMime($mime)
{ {
return in_array( return in_array(
strtolower($mime), array( strtolower($mime),
'application/pdf', [
'application/x-pdf', 'application/pdf',
'application/x-bzpdf', 'application/x-pdf',
'application/x-gzpdf' 'application/x-bzpdf',
) 'application/x-gzpdf'
]
); );
} }
@ -64,10 +62,10 @@ class PDFTextExtractor extends FileTextExtractor
protected function bin($program = '') protected function bin($program = '')
{ {
// Get list of allowed search paths // Get list of allowed search paths
if ($location = $this->config()->binary_location) { if ($location = $this->config()->get('binary_location')) {
$locations = array($location); $locations = [$location];
} else { } else {
$locations = $this->config()->search_binary_locations; $locations = $this->config()->get('search_binary_locations');
} }
// Find program in each path // Find program in each path
@ -88,8 +86,9 @@ class PDFTextExtractor extends FileTextExtractor
public function getContent($path) public function getContent($path)
{ {
if (!$path) { if (!$path) {
return ""; // no file
} // no file return '';
}
$content = $this->getRawOutput($path); $content = $this->getRawOutput($path);
return $this->cleanupLigatures($content); return $this->cleanupLigatures($content);
} }
@ -99,12 +98,12 @@ class PDFTextExtractor extends FileTextExtractor
* *
* @param string $path * @param string $path
* @return string Output * @return string Output
* @throws FileTextExtractor_Exception * @throws Exception
*/ */
protected function getRawOutput($path) protected function getRawOutput($path)
{ {
if (!$this->isAvailable()) { if (!$this->isAvailable()) {
throw new FileTextExtractor_Exception("getRawOutput called on unavailable extractor"); throw new Exception("getRawOutput called on unavailable extractor");
} }
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err); exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
if ($err) { if ($err) {
@ -112,8 +111,11 @@ class PDFTextExtractor extends FileTextExtractor
// For Windows compatibility // For Windows compatibility
$err = $content; $err = $content;
} }
throw new FileTextExtractor_Exception(sprintf(
'PDFTextExtractor->getContent() failed for %s: %s', $path, implode(PHP_EOL, $err) throw new Exception(sprintf(
'PDFTextExtractor->getContent() failed for %s: %s',
$path,
implode(PHP_EOL, $err)
)); ));
} }
@ -130,7 +132,7 @@ class PDFTextExtractor extends FileTextExtractor
*/ */
protected function cleanupLigatures($input) protected function cleanupLigatures($input)
{ {
$mapping = array( $mapping = [
'ff' => 'ff', 'ff' => 'ff',
'fi' => 'fi', 'fi' => 'fi',
'fl' => 'fl', 'fl' => 'fl',
@ -138,7 +140,7 @@ class PDFTextExtractor extends FileTextExtractor
'ffl' => 'ffl', 'ffl' => 'ffl',
'ſt' => 'ft', 'ſt' => 'ft',
'st' => 'st' 'st' => 'st'
); ];
return str_replace(array_keys($mapping), array_values($mapping), $input); return str_replace(array_keys($mapping), array_values($mapping), $input);
} }

View File

@ -2,9 +2,11 @@
namespace SilverStripe\TextExtraction\Extractor; namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor, use Exception;
GuzzleHttp\Client, use GuzzleHttp\Client;
Psr\Log\LoggerInterface; use InvalidArgumentException;
use Psr\Log\LoggerInterface;
use SilverStripe\Core\Injector\Injector;
/** /**
* Text extractor that calls an Apache Solr instance * Text extractor that calls an Apache Solr instance
@ -18,7 +20,7 @@ use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
class SolrCellTextExtractor extends FileTextExtractor class SolrCellTextExtractor extends FileTextExtractor
{ {
/** /**
* Base URL to use for solr text extraction. * Base URL to use for Solr text extraction.
* E.g. http://localhost:8983/solr/update/extract * E.g. http://localhost:8983/solr/update/extract
* *
* @config * @config
@ -27,43 +29,36 @@ class SolrCellTextExtractor extends FileTextExtractor
private static $base_url; private static $base_url;
/** /**
*
* @var int * @var int
* @config * @config
*/ */
private static $priority = 75; private static $priority = 75;
/** /**
* * @var Client
* @var GuzzleHttp\Client
*/ */
protected $httpClient; protected $httpClient;
/** /**
* * @return Client
* @return GuzzleHttp\Client
* @throws InvalidArgumentException
*/ */
public function getHttpClient() public function getHttpClient()
{ {
if (!$this->config()->get('base_url')) {
throw new \InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
}
if (!$this->httpClient) { if (!$this->httpClient) {
$this->httpClient = new Client($this->config()->get('base_url')); $this->httpClient = new Client();
} }
return $this->httpClient; return $this->httpClient;
} }
/** /**
* * @param Client $client
* @param GuzzleHttp\Client $client * @return $this
* @return void
*/ */
public function setHttpClient($client) public function setHttpClient(Client $client)
{ {
$this->httpClient = $client; $this->httpClient = $client;
return $this;
} }
/** /**
@ -73,30 +68,28 @@ class SolrCellTextExtractor extends FileTextExtractor
{ {
$url = $this->config()->get('base_url'); $url = $this->config()->get('base_url');
return (boolean) $url; return (bool) $url;
} }
/** /**
*
* @param string $extension * @param string $extension
* @return boolean * @return bool
*/ */
public function supportsExtension($extension) public function supportsExtension($extension)
{ {
return in_array( return in_array(
strtolower($extension), strtolower($extension),
array( [
'pdf', 'doc', 'docx', 'xls', 'xlsx', 'pdf', 'doc', 'docx', 'xls', 'xlsx',
'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods', 'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
'ppt', 'pptx', 'odp', 'fodp', 'csv' 'ppt', 'pptx', 'odp', 'fodp', 'csv'
) ]
); );
} }
/** /**
*
* @param string $mime * @param string $mime
* @return boolean * @return bool
*/ */
public function supportsMime($mime) public function supportsMime($mime)
{ {
@ -105,48 +98,55 @@ class SolrCellTextExtractor extends FileTextExtractor
} }
/** /**
* * @param string $path
* @param string $path
* @return string * @return string
* @throws InvalidArgumentException
*/ */
public function getContent($path) public function getContent($path)
{ {
if (!$path) { if (!$path) {
return ""; // no file
} // no file return '';
}
$fileName = basename($path); $fileName = basename($path);
$client = $this->getHttpClient(); $client = $this->getHttpClient();
// Get and validate base URL
$baseUrl = $this->config()->get('base_url');
if (!$this->config()->get('base_url')) {
throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
}
try { try {
$request = $client $request = $client
->post() ->post($baseUrl)
->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text')) ->addPostFields(['extractOnly' => 'true', 'extractFormat' => 'text'])
->addPostFiles(array('myfile' => $path)); ->addPostFiles(['myfile' => $path]);
$response = $request->send(); $response = $request->send();
} catch (\InvalidArgumentException $e) { } catch (InvalidArgumentException $e) {
$msg = sprintf( $msg = sprintf(
'Error extracting text from "%s" (message: %s)', 'Error extracting text from "%s" (message: %s)',
$path, $path,
$e->getMessage() $e->getMessage()
); );
Injector::inst()->get(LoggerInterface::class)->notice($msg); Injector::inst()->get(LoggerInterface::class)->notice($msg);
return null; return null;
} catch (\Exception $e) { } catch (Exception $e) {
// Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search query in some cases. // Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search query in some cases.
$msg = sprintf( $msg = sprintf(
'Tika server error attempting to extract from "%s" (message: %s)', 'Tika server error attempting to extract from "%s" (message: %s)',
$path, $path,
$e->getMessage() $e->getMessage()
); );
Injector::inst()->get(LoggerInterface::class)->notice($msg); Injector::inst()->get(LoggerInterface::class)->notice($msg);
return null; return null;
} }
// Just initialise it, it doesn't take miuch. // Just initialise it, it doesn't take much.
$matches = []; $matches = [];
// Use preg match to avoid SimpleXML running out of memory on large text nodes // Use preg match to avoid SimpleXML running out of memory on large text nodes

View File

@ -2,10 +2,9 @@
namespace SilverStripe\TextExtraction\Extractor; namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor, use SilverStripe\Core\Environment;
SilverStripe\Core\Injector\Injector, use SilverStripe\Core\Injector\Injector;
SilverStripe\Core\Environment, use SilverStripe\TextExtraction\Rest\TikaRestClient;
SilverStripe\TextExtraction\Rest\TikaRestClient;
/** /**
* Enables text extraction of file content via the Tika Rest Server * Enables text extraction of file content via the Tika Rest Server
@ -35,18 +34,25 @@ class TikaServerTextExtractor extends FileTextExtractor
*/ */
protected $client = null; protected $client = null;
/**
* Cache of supported mime types
*
* @var array
*/
protected $supportedMimes = [];
/** /**
* @return TikaRestClient * @return TikaRestClient
*/ */
public function getClient() public function getClient()
{ {
return $this->client ?: if (!$this->client) {
($this->client = $this->client = Injector::inst()->createWithArgs(
Injector::inst()->createWithArgs( TikaRestClient::class,
TikaRestClient::class, [$this->getServerEndpoint()]
array($this->getServerEndpoint())
)
); );
}
return $this->client;
} }
/** /**
@ -59,19 +65,17 @@ class TikaServerTextExtractor extends FileTextExtractor
} }
// Default to configured endpoint // Default to configured endpoint
return $this->config()->server_endpoint; return $this->config()->get('server_endpoint');
} }
/** /**
* Get the version of tika installed, or 0 if not installed * Get the version of Tika installed, or 0 if not installed
* *
* @return float version of tika * @return float version of Tika
*/ */
public function getVersion() public function getVersion()
{ {
return $this return $this->getClient()->getVersion();
->getClient()
->getVersion();
} }
/** /**
@ -79,13 +83,12 @@ class TikaServerTextExtractor extends FileTextExtractor
*/ */
public function isAvailable() public function isAvailable()
{ {
return $this->getServerEndpoint() && return $this->getServerEndpoint()
$this->getClient()->isAvailable() && && $this->getClient()->isAvailable()
version_compare($this->getVersion(), '1.7.0') >= 0; && version_compare($this->getVersion(), '1.7.0') >= 0;
} }
/** /**
*
* @param string $extension * @param string $extension
* @return boolean * @return boolean
*/ */
@ -95,31 +98,23 @@ class TikaServerTextExtractor extends FileTextExtractor
return false; return false;
} }
/** /**
* Cache of supported mime types
*
* @var array
*/
protected $supportedMimes = array();
/**
*
* @param string $mime * @param string $mime
* @return boolean * @return boolean
*/ */
public function supportsMime($mime) public function supportsMime($mime)
{ {
$supported = $this->supportedMimes ?: if (!$this->supportedMimes) {
($this->supportedMimes = $this->getClient()->getSupportedMimes()); $this->supportedMimes = $this->getClient()->getSupportedMimes();
}
// Check if supported (most common / quickest lookup) // Check if supported (most common / quickest lookup)
if (isset($supported[$mime])) { if (isset($this->supportedMimes[$mime])) {
return true; return true;
} }
// Check aliases // Check aliases
foreach ($supported as $info) { foreach ($this->supportedMimes as $info) {
if (isset($info['alias']) && in_array($mime, $info['alias'])) { if (isset($info['alias']) && in_array($mime, $info['alias'])) {
return true; return true;
} }

View File

@ -2,8 +2,6 @@
namespace SilverStripe\TextExtraction\Extractor; namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/** /**
* Enables text extraction of file content via the Tika CLI * Enables text extraction of file content via the Tika CLI
* *
@ -47,13 +45,13 @@ class TikaTextExtractor extends FileTextExtractor
*/ */
protected function runShell($command, &$stdout = '', &$stderr = '', $input = '') protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
{ {
$descriptorSpecs = array( $descriptorSpecs = [
0 => array("pipe", "r"), 0 => ["pipe", "r"],
1 => array("pipe", "w"), 1 => ["pipe", "w"],
2 => array("pipe", "w") 2 => ["pipe", "w"]
); ];
// Invoke command // Invoke command
$pipes = array(); $pipes = [];
$proc = proc_open($command, $descriptorSpecs, $pipes); $proc = proc_open($command, $descriptorSpecs, $pipes);
if (!is_resource($proc)) { if (!is_resource($proc)) {
@ -75,7 +73,6 @@ class TikaTextExtractor extends FileTextExtractor
} }
/** /**
*
* @param string $path * @param string $path
* @return string * @return string
*/ */
@ -91,8 +88,7 @@ class TikaTextExtractor extends FileTextExtractor
} }
/** /**
* * @return bool
* @return boolean
*/ */
public function isAvailable() public function isAvailable()
{ {
@ -100,8 +96,7 @@ class TikaTextExtractor extends FileTextExtractor
} }
/** /**
* * @return bool
* @return boolean
*/ */
public function supportsExtension($extension) public function supportsExtension($extension)
{ {
@ -111,9 +106,8 @@ class TikaTextExtractor extends FileTextExtractor
/** /**
*
* @param string $mime * @param string $mime
* @return boolean * @return bool
*/ */
public function supportsMime($mime) public function supportsMime($mime)
{ {
@ -121,8 +115,9 @@ class TikaTextExtractor extends FileTextExtractor
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error); $code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
if ($code) { if ($code) {
// Error case
return false; return false;
} // Error case }
// Check if the mime type is inside the result // Check if the mime type is inside the result
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/')); $pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));

View File

@ -2,11 +2,11 @@
namespace SilverStripe\TextExtraction\Rest; namespace SilverStripe\TextExtraction\Rest;
use GuzzleHttp\Client, use GuzzleHttp\Client;
GuzzleHttp\Exception\RequestException, use GuzzleHttp\Exception\RequestException;
SilverStripe\Core\Environment, use Psr\Log\LoggerInterface;
Psr\Log\LoggerInterface, use SilverStripe\Core\Environment;
SilverStripe\Core\Injector\Injector; use SilverStripe\Core\Injector\Injector;
class TikaRestClient extends Client class TikaRestClient extends Client
{ {
@ -15,12 +15,12 @@ class TikaRestClient extends Client
* *
* @var array * @var array
*/ */
protected $options = array('username' => null, 'password' => null); protected $options = ['username' => null, 'password' => null];
/** /**
* @var array * @var array
*/ */
protected $mimes = array(); protected $mimes = [];
/** /**
* *
@ -29,16 +29,16 @@ class TikaRestClient extends Client
*/ */
public function __construct($baseUrl = '', $config = null) public function __construct($baseUrl = '', $config = null)
{ {
$psswd = Environment::getEnv('SS_TIKA_PASSWORD'); $password = Environment::getEnv('SS_TIKA_PASSWORD');
if (!empty($psswd)) { if (!empty($password)) {
$this->options = array( $this->options = [
'username' => Environment::getEnv('SS_TIKA_USERNAME'), 'username' => Environment::getEnv('SS_TIKA_USERNAME'),
'password' => $psswd, 'password' => $password,
); ];
} }
parent::__construct($baseUrl, $config); parent::__construct($config);
} }
/** /**
@ -120,7 +120,7 @@ class TikaRestClient extends Client
try { try {
$response = $this->put( $response = $this->put(
'tika', 'tika',
array('Accept' => 'text/plain'), ['Accept' => 'text/plain'],
file_get_contents($file) file_get_contents($file)
); );
$response->setAuth($this->options['username'], $this->options['password']); $response->setAuth($this->options['username'], $this->options['password']);