API Update namespaces for FileTextCache and add upgrader mapping

This commit is contained in:
Robbie Averill 2018-07-03 11:23:27 +12:00
parent f1bacd2aa9
commit 66c9db8c0d
16 changed files with 225 additions and 229 deletions

14
.upgrade.yml Normal file
View File

@ -0,0 +1,14 @@
# Class name mappings for the SilverStripe upgrader: each key is a legacy
# (un-namespaced) class name, each value is its fully qualified replacement.
mappings:
  FileTextExtractable: SilverStripe\TextExtraction\Extension\FileTextExtractable
  FileTextCache: SilverStripe\TextExtraction\Cache\FileTextCache
  FileTextCache_Cache: SilverStripe\TextExtraction\Cache\FileTextCache\Cache
  FileTextCache_Database: SilverStripe\TextExtraction\Cache\FileTextCache\Database
  FileTextExtractor: SilverStripe\TextExtraction\Extractor\FileTextExtractor
  FileTextExtractor_Exception: SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception
  HTMLTextExtractor: SilverStripe\TextExtraction\Extractor\HTMLTextExtractor
  PDFTextExtractor: SilverStripe\TextExtraction\Extractor\PDFTextExtractor
  SolrCellTextExtractor: SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor
  TikaServerTextExtractor: SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor
  TikaTextExtractor: SilverStripe\TextExtraction\Extractor\TikaTextExtractor
  TikaRestClient: SilverStripe\TextExtraction\Rest\TikaRestClient

View File

@ -1,12 +0,0 @@
# Changelog
All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).
## [2.0.1]
Using Symfony mime type detection
## [2.0.0]
Clarified Tika docs

View File

View File

@ -1,6 +1,6 @@
<?php
namespace SilverStripe\TextExtraction\Extension;
namespace SilverStripe\TextExtraction\Cache;
use SilverStripe\Assets\File;

View File

@ -1,19 +1,21 @@
<?php
namespace SilverStripe\TextExtraction\Extension;
namespace SilverStripe\TextExtraction\Cache\FileTextCache;
use SilverStripe\Assets\File,
SilverStripe\Core\Config\Config,
SilverStripe\TextExtraction\Extension\FileTextCache,
SilverStripe\Core\Flushable,
Psr\SimpleCache\CacheInterface,
SilverStripe\Core\Injector\Injector;
use Psr\SimpleCache\CacheInterface;
use SilverStripe\Assets\File;
use SilverStripe\Core\Config\Configurable;
use SilverStripe\Core\Flushable;
use SilverStripe\Core\Injector\Injector;
use SilverStripe\TextExtraction\Cache\FileTextCache;
/**
* Uses a PSR-16 cache (Psr\SimpleCache\CacheInterface) with a lifetime to cache extracted content
*/
class FileTextCache_Cache implements FileTextCache, Flushable
class Cache implements FileTextCache, Flushable
{
use Configurable;
/**
* Lifetime of cache in seconds
* Null is indefinite
@ -46,7 +48,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable
/**
*
* @param File $file
* @return type
* @return mixed
*/
public function load(File $file)
{
@ -63,8 +65,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable
*/
public function save(File $file, $content)
{
$lifetime = Config::inst()->get(__CLASS__, 'lifetime');
$lifetime = $lifetime ?: 3600;
$lifetime = $this->config()->get('lifetime') ?: 3600;
$key = $this->getKey($file);
$cache = self::get_cache();
@ -94,7 +95,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable
/**
*
* @param File $file
* @return type
* @return bool
*/
public function invalidate(File $file)
{

View File

@ -1,17 +1,25 @@
<?php
namespace SilverStripe\TextExtraction\Extension;
namespace SilverStripe\TextExtraction\Cache\FileTextCache;
use SilverStripe\Assets\File,
SilverStripe\Core\Config\Config,
SilverStripe\TextExtraction\Extension\FileTextCache;
use SilverStripe\Assets\File;
use SilverStripe\Core\Config\Configurable;
use SilverStripe\TextExtraction\Cache\FileTextCache;
/**
* Caches the extracted content on the record for the file.
* Limits the stored file content by default to avoid hitting query size limits.
*/
class FileTextCache_Database implements FileTextCache
class Database implements FileTextCache
{
use Configurable;
/**
* @config
* @var int
*/
private static $max_content_length = null;
/**
*
* @param File $file
@ -28,7 +36,7 @@ class FileTextCache_Database implements FileTextCache
*/
public function save(File $file, $content)
{
$maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length');
$maxLength = $this->config()->get('max_content_length');
$file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content;
$file->write();
}

View File

@ -1,9 +0,0 @@
<?php
namespace SilverStripe\TextExtraction\Exception;
use \Exception;
class FileTextExtractor_Exception extends Exception
{
}

View File

@ -2,9 +2,10 @@
namespace SilverStripe\TextExtraction\Extension;
use SilverStripe\ORM\DataExtension,
SilverStripe\TextExtraction\Extension\FileTextCache,
SilverStripe\Control\Director;
use SilverStripe\Control\Director;
use SilverStripe\ORM\DataExtension;
use SilverStripe\TextExtraction\Cache\FileTextCache;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/**
* Decorate File or a File derivative to enable text extraction from the file content. Uses a set of subclasses of
@ -22,27 +23,27 @@ class FileTextExtractable extends DataExtension
* @var array
* @config
*/
private static $db = array(
private static $db = [
'FileContentCache' => 'Text'
);
];
/**
*
* @var array
* @config
*/
private static $casting = array(
private static $casting = [
'FileContent' => 'Text'
);
];
/**
*
* @var array
* @config
*/
private static $dependencies = array(
'TextCache' => '%$SilverStripe\TextExtraction\Extension\FileTextCache_Cache'
);
private static $dependencies = [
'TextCache' => FileTextCache\Cache::class,
];
/**
* @var FileTextCache
@ -52,11 +53,12 @@ class FileTextExtractable extends DataExtension
/**
*
* @param FileTextCache $cache
* @return void
* @return $this
*/
public function setTextCache(FileTextCache $cache)
{
$this->fileTextCache = $cache;
return $this;
}
/**
@ -84,7 +86,7 @@ class FileTextExtractable extends DataExtension
* @param boolean $disableCache If false, the file content is only parsed on demand.
* If true, the content parsing is forced, bypassing
* the cached version
* @return mixed string | null
* @return string|null
*/
public function extractFileAsText($disableCache = false)
{

View File

@ -2,17 +2,18 @@
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Core\Config\Config,
SilverStripe\Core\Injector\Injector,
SilverStripe\Core\ClassInfo;
use SilverStripe\Core\ClassInfo;
use SilverStripe\Core\Config\Config;
use SilverStripe\Core\Config\Configurable;
use SilverStripe\Core\Injector\Injector;
/**
* A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents.
* @author mstephens
*
*/
abstract class FileTextExtractor
{
use Configurable;
/**
* Set priority from 0-100.
@ -45,7 +46,7 @@ abstract class FileTextExtractor
// Generate the sorted list of extractors on demand.
$classes = ClassInfo::subclassesFor(__CLASS__);
array_shift($classes);
$classPriorities = array();
$classPriorities = [];
foreach ($classes as $class) {
$classPriorities[$class] = Config::inst()->get($class, 'priority');
@ -76,19 +77,19 @@ abstract class FileTextExtractor
*/
protected static function get_mime($path)
{
$file = new Symfony\Component\HttpFoundation\File\File($path);
$file = new \Symfony\Component\HttpFoundation\File\File($path);
return $file->getMimeType();
}
/**
* @param string $path
* @return mixed FileTextExtractor | null
* @return FileTextExtractor|null
*/
public static function for_file($path)
{
if (!file_exists($path) || is_dir($path)) {
return;
return null;
}
$extension = pathinfo($path, PATHINFO_EXTENSION);
@ -132,7 +133,7 @@ abstract class FileTextExtractor
abstract public function supportsExtension($extension);
/**
* Determine if this extractor suports the given mime type.
* Determine if this extractor supports the given mime type.
* Will only be called if supportsExtension returns false.
*
* @param string $mime

View File

@ -0,0 +1,7 @@
<?php
namespace SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/**
 * Exception type raised by text extractors when extraction cannot be performed.
 *
 * PDFTextExtractor throws it when getRawOutput() is called on an unavailable
 * extractor, and when the pdftotext command reports an error.
 *
 * This is the namespaced replacement for the former FileTextExtractor_Exception class.
 */
class Exception extends \Exception
{
}

View File

@ -2,18 +2,21 @@
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/**
* Text extractor that uses php function strip_tags to get just the text. OK for indexing, not the best for readable text.
* @author mstephens
*
*/
class HTMLTextExtractor extends FileTextExtractor
{
/**
* Lower priority because it's not the most clever HTML extraction. If there is something better, use it.
*
* @config
* @var integer
*/
private static $priority = 10;
/**
*
* @return boolean
*/
public function isAvailable()
@ -22,19 +25,15 @@ class HTMLTextExtractor extends FileTextExtractor
}
/**
*
* @param string $extension
* @return bool
*/
public function supportsExtension($extension)
{
return in_array(
strtolower($extension), array("html", "htm", "xhtml")
);
return in_array(strtolower($extension), ["html", "htm", "xhtml"]);
}
/**
*
* @param string $mime
* @return bool
*/
@ -43,14 +42,6 @@ class HTMLTextExtractor extends FileTextExtractor
return strtolower($mime) === 'text/html';
}
/**
* Lower priority because its not the most clever HTML extraction. If there is something better, use it
*
* @config
* @var integer
*/
private static $priority = 10;
/**
* Extracts content from regex, by using strip_tags()
* combined with regular expressions to remove non-content tags like <style> or <script>,
@ -65,29 +56,30 @@ class HTMLTextExtractor extends FileTextExtractor
// Yes, yes, regex'ing HTML is evil.
// Since we don't care about well-formedness or markup here, it does the job.
$content = preg_replace(
array(
// Remove invisible content
'@<head[^>]*?>.*?</head>@siu',
'@<style[^>]*?>.*?</style>@siu',
'@<script[^>]*?.*?</script>@siu',
'@<object[^>]*?.*?</object>@siu',
'@<embed[^>]*?.*?</embed>@siu',
'@<applet[^>]*?.*?</applet>@siu',
'@<noframes[^>]*?.*?</noframes>@siu',
'@<noscript[^>]*?.*?</noscript>@siu',
'@<noembed[^>]*?.*?</noembed>@siu',
// Add line breaks before and after blocks
'@</?((address)|(blockquote)|(center)|(del))@iu',
'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
'@</?((table)|(th)|(td)|(caption))@iu',
'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
'@</?((frameset)|(frame)|(iframe))@iu',
), array(
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0",
), $content
[
// Remove invisible content
'@<head[^>]*?>.*?</head>@siu',
'@<style[^>]*?>.*?</style>@siu',
'@<script[^>]*?.*?</script>@siu',
'@<object[^>]*?.*?</object>@siu',
'@<embed[^>]*?.*?</embed>@siu',
'@<applet[^>]*?.*?</applet>@siu',
'@<noframes[^>]*?.*?</noframes>@siu',
'@<noscript[^>]*?.*?</noscript>@siu',
'@<noembed[^>]*?.*?</noembed>@siu',
// Add line breaks before and after blocks
'@</?((address)|(blockquote)|(center)|(del))@iu',
'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
'@</?((table)|(th)|(td)|(caption))@iu',
'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
'@</?((frameset)|(frame)|(iframe))@iu',
],
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0"],
$content
);
return strip_tags($content);
}

View File

@ -2,17 +2,14 @@
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
SilverStripe\TextExtraction\Exception\FileTextExtractor_Exception;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
/**
* Text extractor that calls pdftotext to do the conversion.
* @author mstephens
*
*/
class PDFTextExtractor extends FileTextExtractor
{
/**
* Set to bin path this extractor can execute
*
@ -27,10 +24,10 @@ class PDFTextExtractor extends FileTextExtractor
* @config
* @var array
*/
private static $search_binary_locations = array(
private static $search_binary_locations = [
'/usr/bin',
'/usr/local/bin',
);
];
public function isAvailable()
{
@ -46,12 +43,13 @@ class PDFTextExtractor extends FileTextExtractor
public function supportsMime($mime)
{
return in_array(
strtolower($mime), array(
'application/pdf',
'application/x-pdf',
'application/x-bzpdf',
'application/x-gzpdf'
)
strtolower($mime),
[
'application/pdf',
'application/x-pdf',
'application/x-bzpdf',
'application/x-gzpdf'
]
);
}
@ -64,10 +62,10 @@ class PDFTextExtractor extends FileTextExtractor
protected function bin($program = '')
{
// Get list of allowed search paths
if ($location = $this->config()->binary_location) {
$locations = array($location);
if ($location = $this->config()->get('binary_location')) {
$locations = [$location];
} else {
$locations = $this->config()->search_binary_locations;
$locations = $this->config()->get('search_binary_locations');
}
// Find program in each path
@ -88,8 +86,9 @@ class PDFTextExtractor extends FileTextExtractor
public function getContent($path)
{
if (!$path) {
return "";
} // no file
// no file
return '';
}
$content = $this->getRawOutput($path);
return $this->cleanupLigatures($content);
}
@ -99,12 +98,12 @@ class PDFTextExtractor extends FileTextExtractor
*
* @param string $path
* @return string Output
* @throws FileTextExtractor_Exception
* @throws Exception
*/
protected function getRawOutput($path)
{
if (!$this->isAvailable()) {
throw new FileTextExtractor_Exception("getRawOutput called on unavailable extractor");
throw new Exception("getRawOutput called on unavailable extractor");
}
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
if ($err) {
@ -112,8 +111,11 @@ class PDFTextExtractor extends FileTextExtractor
// For Windows compatibility
$err = $content;
}
throw new FileTextExtractor_Exception(sprintf(
'PDFTextExtractor->getContent() failed for %s: %s', $path, implode(PHP_EOL, $err)
throw new Exception(sprintf(
'PDFTextExtractor->getContent() failed for %s: %s',
$path,
implode(PHP_EOL, $err)
));
}
@ -130,7 +132,7 @@ class PDFTextExtractor extends FileTextExtractor
*/
protected function cleanupLigatures($input)
{
$mapping = array(
$mapping = [
'ff' => 'ff',
'fi' => 'fi',
'fl' => 'fl',
@ -138,7 +140,7 @@ class PDFTextExtractor extends FileTextExtractor
'ffl' => 'ffl',
'ſt' => 'ft',
'st' => 'st'
);
];
return str_replace(array_keys($mapping), array_values($mapping), $input);
}

View File

@ -2,9 +2,11 @@
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
GuzzleHttp\Client,
Psr\Log\LoggerInterface;
use Exception;
use GuzzleHttp\Client;
use InvalidArgumentException;
use Psr\Log\LoggerInterface;
use SilverStripe\Core\Injector\Injector;
/**
* Text extractor that calls an Apache Solr instance
@ -18,7 +20,7 @@ use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
class SolrCellTextExtractor extends FileTextExtractor
{
/**
* Base URL to use for solr text extraction.
* Base URL to use for Solr text extraction.
* E.g. http://localhost:8983/solr/update/extract
*
* @config
@ -27,43 +29,36 @@ class SolrCellTextExtractor extends FileTextExtractor
private static $base_url;
/**
*
* @var int
* @config
*/
private static $priority = 75;
/**
*
* @var GuzzleHttp\Client
* @var Client
*/
protected $httpClient;
/**
*
* @return GuzzleHttp\Client
* @throws InvalidArgumentException
* @return Client
*/
public function getHttpClient()
{
if (!$this->config()->get('base_url')) {
throw new \InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
}
if (!$this->httpClient) {
$this->httpClient = new Client($this->config()->get('base_url'));
$this->httpClient = new Client();
}
return $this->httpClient;
}
/**
*
* @param GuzzleHttp\Client $client
* @return void
* @param Client $client
* @return $this
*/
public function setHttpClient($client)
public function setHttpClient(Client $client)
{
$this->httpClient = $client;
return $this;
}
/**
@ -73,30 +68,28 @@ class SolrCellTextExtractor extends FileTextExtractor
{
$url = $this->config()->get('base_url');
return (boolean) $url;
return (bool) $url;
}
/**
*
* @param string $extension
* @return boolean
* @return bool
*/
public function supportsExtension($extension)
{
return in_array(
strtolower($extension),
array(
[
'pdf', 'doc', 'docx', 'xls', 'xlsx',
'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
'ppt', 'pptx', 'odp', 'fodp', 'csv'
)
]
);
}
/**
*
* @param string $mime
* @return boolean
* @return bool
*/
public function supportsMime($mime)
{
@ -105,48 +98,55 @@ class SolrCellTextExtractor extends FileTextExtractor
}
/**
*
* @param string $path
* @param string $path
* @return string
* @throws InvalidArgumentException
*/
public function getContent($path)
{
if (!$path) {
return "";
} // no file
// no file
return '';
}
$fileName = basename($path);
$client = $this->getHttpClient();
// Get and validate base URL
$baseUrl = $this->config()->get('base_url');
if (!$this->config()->get('base_url')) {
throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
}
try {
$request = $client
->post()
->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text'))
->addPostFiles(array('myfile' => $path));
->post($baseUrl)
->addPostFields(['extractOnly' => 'true', 'extractFormat' => 'text'])
->addPostFiles(['myfile' => $path]);
$response = $request->send();
} catch (\InvalidArgumentException $e) {
} catch (InvalidArgumentException $e) {
$msg = sprintf(
'Error extracting text from "%s" (message: %s)',
$path,
$e->getMessage()
);
'Error extracting text from "%s" (message: %s)',
$path,
$e->getMessage()
);
Injector::inst()->get(LoggerInterface::class)->notice($msg);
return null;
} catch (\Exception $e) {
} catch (Exception $e) {
// Catch other errors that Tika can throw via Guzzle which would otherwise go uncaught and break the Solr search query in some cases.
$msg = sprintf(
'Tika server error attempting to extract from "%s" (message: %s)',
$path,
$e->getMessage()
);
'Tika server error attempting to extract from "%s" (message: %s)',
$path,
$e->getMessage()
);
Injector::inst()->get(LoggerInterface::class)->notice($msg);
return null;
}
// Just initialise it, it doesn't take miuch.
// Just initialise it, it doesn't take much.
$matches = [];
// Use preg match to avoid SimpleXML running out of memory on large text nodes

View File

@ -2,10 +2,9 @@
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
SilverStripe\Core\Injector\Injector,
SilverStripe\Core\Environment,
SilverStripe\TextExtraction\Rest\TikaRestClient;
use SilverStripe\Core\Environment;
use SilverStripe\Core\Injector\Injector;
use SilverStripe\TextExtraction\Rest\TikaRestClient;
/**
* Enables text extraction of file content via the Tika Rest Server
@ -35,18 +34,25 @@ class TikaServerTextExtractor extends FileTextExtractor
*/
protected $client = null;
/**
* Cache of supported mime types
*
* @var array
*/
protected $supportedMimes = [];
/**
* @return TikaRestClient
*/
public function getClient()
{
return $this->client ?:
($this->client =
Injector::inst()->createWithArgs(
TikaRestClient::class,
array($this->getServerEndpoint())
)
if (!$this->client) {
$this->client = Injector::inst()->createWithArgs(
TikaRestClient::class,
[$this->getServerEndpoint()]
);
}
return $this->client;
}
/**
@ -59,19 +65,17 @@ class TikaServerTextExtractor extends FileTextExtractor
}
// Default to configured endpoint
return $this->config()->server_endpoint;
return $this->config()->get('server_endpoint');
}
/**
* Get the version of tika installed, or 0 if not installed
* Get the version of Tika installed, or 0 if not installed
*
* @return float version of tika
* @return float version of Tika
*/
public function getVersion()
{
return $this
->getClient()
->getVersion();
return $this->getClient()->getVersion();
}
/**
@ -79,13 +83,12 @@ class TikaServerTextExtractor extends FileTextExtractor
*/
public function isAvailable()
{
return $this->getServerEndpoint() &&
$this->getClient()->isAvailable() &&
version_compare($this->getVersion(), '1.7.0') >= 0;
return $this->getServerEndpoint()
&& $this->getClient()->isAvailable()
&& version_compare($this->getVersion(), '1.7.0') >= 0;
}
/**
*
* @param string $extension
* @return boolean
*/
@ -95,31 +98,23 @@ class TikaServerTextExtractor extends FileTextExtractor
return false;
}
/**
* Cache of supported mime types
*
* @var array
*/
protected $supportedMimes = array();
/**
*
* @param string $mime
* @return boolean
*/
public function supportsMime($mime)
{
$supported = $this->supportedMimes ?:
($this->supportedMimes = $this->getClient()->getSupportedMimes());
if (!$this->supportedMimes) {
$this->supportedMimes = $this->getClient()->getSupportedMimes();
}
// Check if supported (most common / quickest lookup)
if (isset($supported[$mime])) {
if (isset($this->supportedMimes[$mime])) {
return true;
}
// Check aliases
foreach ($supported as $info) {
foreach ($this->supportedMimes as $info) {
if (isset($info['alias']) && in_array($mime, $info['alias'])) {
return true;
}

View File

@ -2,8 +2,6 @@
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/**
* Enables text extraction of file content via the Tika CLI
*
@ -47,13 +45,13 @@ class TikaTextExtractor extends FileTextExtractor
*/
protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
{
$descriptorSpecs = array(
0 => array("pipe", "r"),
1 => array("pipe", "w"),
2 => array("pipe", "w")
);
$descriptorSpecs = [
0 => ["pipe", "r"],
1 => ["pipe", "w"],
2 => ["pipe", "w"]
];
// Invoke command
$pipes = array();
$pipes = [];
$proc = proc_open($command, $descriptorSpecs, $pipes);
if (!is_resource($proc)) {
@ -75,7 +73,6 @@ class TikaTextExtractor extends FileTextExtractor
}
/**
*
* @param string $path
* @return string
*/
@ -91,8 +88,7 @@ class TikaTextExtractor extends FileTextExtractor
}
/**
*
* @return boolean
* @return bool
*/
public function isAvailable()
{
@ -100,8 +96,7 @@ class TikaTextExtractor extends FileTextExtractor
}
/**
*
* @return boolean
* @return bool
*/
public function supportsExtension($extension)
{
@ -111,9 +106,8 @@ class TikaTextExtractor extends FileTextExtractor
/**
*
* @param string $mime
* @return boolean
* @return bool
*/
public function supportsMime($mime)
{
@ -121,8 +115,9 @@ class TikaTextExtractor extends FileTextExtractor
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
if ($code) {
// Error case
return false;
} // Error case
}
// Check if the mime type is inside the result
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));

View File

@ -2,11 +2,11 @@
namespace SilverStripe\TextExtraction\Rest;
use GuzzleHttp\Client,
GuzzleHttp\Exception\RequestException,
SilverStripe\Core\Environment,
Psr\Log\LoggerInterface,
SilverStripe\Core\Injector\Injector;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\RequestException;
use Psr\Log\LoggerInterface;
use SilverStripe\Core\Environment;
use SilverStripe\Core\Injector\Injector;
class TikaRestClient extends Client
{
@ -15,12 +15,12 @@ class TikaRestClient extends Client
*
* @var array
*/
protected $options = array('username' => null, 'password' => null);
protected $options = ['username' => null, 'password' => null];
/**
* @var array
*/
protected $mimes = array();
protected $mimes = [];
/**
*
@ -29,16 +29,16 @@ class TikaRestClient extends Client
*/
public function __construct($baseUrl = '', $config = null)
{
    // Pick up optional HTTP auth credentials from the environment. They are only
    // stored when a password is set; otherwise the null defaults are kept.
    $password = Environment::getEnv('SS_TIKA_PASSWORD');

    if (!empty($password)) {
        $this->options = [
            'username' => Environment::getEnv('SS_TIKA_USERNAME'),
            'password' => $password,
        ];
    }

    // The parent client is constructed from a single options array, so the
    // default null must be normalised to an array before it is passed on.
    $config = is_array($config) ? $config : [];

    // Keep the endpoint the caller supplied: without it, relative requests such
    // as put('tika', ...) have no server to resolve against.
    // NOTE(review): assumes Guzzle 6+, where the base URL is given via the
    // 'base_uri' option - confirm against the installed Guzzle version.
    if (!empty($baseUrl) && !isset($config['base_uri'])) {
        $config['base_uri'] = $baseUrl;
    }

    parent::__construct($config);
}
/**
@ -120,7 +120,7 @@ class TikaRestClient extends Client
try {
$response = $this->put(
'tika',
array('Accept' => 'text/plain'),
['Accept' => 'text/plain'],
file_get_contents($file)
);
$response->setAuth($this->options['username'], $this->options['password']);