FIX: First-pass SS4 compatibility.

- Added namespaces, use statements
- Added missing docblocks etc
- Uses SS4's new Cache system
- Uses proper environment vars
- Cannot instantiate 'FileTextCache' (interface) as a service. This can be configured through YML, so default to FileTextCache_Cache
- Modded YML config to make it run.
- Fixes to allow TIKA to actually get file contents.
- Addresses issues raised by @robbieaverill
- Rebased against github.com/silverstripe/silverstripe-textextraction:master
- Replaced `SS_Log` with Monolog.
This commit is contained in:
Russell Michell 2017-12-21 10:24:39 +13:00
parent 875e608d0f
commit f341010d7a
18 changed files with 541 additions and 281 deletions

11
_config/cache.yml Normal file
View File

@ -0,0 +1,11 @@
---
Name: textextractioncache
After:
  - '#corecache'
---
# Registers a dedicated cache service for extracted file text.
# FileTextCache_Cache fetches it from the Injector under the service name
# "Psr\SimpleCache\CacheInterface.FileTextCache_Cache".
SilverStripe\Core\Injector\Injector:
  Psr\SimpleCache\CacheInterface.FileTextCache_Cache:
    factory: SilverStripe\Core\Cache\CacheFactory
    constructor:
      namespace: 'FileTextCache_Cache'

View File

@ -1,11 +0,0 @@
---
Name: textextraction
---
# Default cache implementation used to store extracted file text.
Injector:
  FileTextCache: FileTextCache_Database
#SolrCellTextExtractor:
#  base_url: 'http://localhost:8983/solr/update/extract'
# Upper bound applied by FileTextCache_Database before content is stored.
FileTextCache_Database:
  max_content_length: 500000

View File

@ -1,112 +0,0 @@
<?php
/**
* Contract for a store that keeps the text extracted from a File,
* so that the extraction does not have to be repeated on every request.
*/
interface FileTextCache
{
/**
* Save extracted content for a given File entity
*
* @param File $file File the content was extracted from
* @param string $content Extracted text to store
*/
public function save(File $file, $content);
/**
* Return any cached extracted content for a given file entity
*
* @param File $file File whose cached content is requested
* @return mixed Cached content; what is returned when nothing is cached depends on the implementation
*/
public function load(File $file);
/**
* Invalidate the cache for a given file.
* Invoked in onBeforeWrite on the file
*
* @param File $file File whose cached content should be discarded
*/
public function invalidate(File $file);
}
/**
 * Keeps the extracted text on the File record itself, in its FileContentCache field.
 * When a max_content_length is configured the text is cut to that many bytes before
 * being stored, to avoid hitting query size limits.
 */
class FileTextCache_Database implements FileTextCache
{
    /**
     * Read the cached text straight off the record.
     *
     * @param File $file
     * @return mixed Current value of the record's FileContentCache field
     */
    public function load(File $file)
    {
        return $file->FileContentCache;
    }

    /**
     * Put the (possibly truncated) text on the record and write the record.
     *
     * @param File $file
     * @param string $content
     */
    public function save(File $file, $content)
    {
        $limit = Config::inst()->get('FileTextCache_Database', 'max_content_length');
        if ($limit) {
            // substr() counts bytes, so the limit is a byte limit
            $content = substr($content, 0, $limit);
        }

        $file->FileContentCache = $content;
        $file->write();
    }

    /**
     * Blank the cached text, unless the cached text is the very thing being changed.
     *
     * @param File $file
     */
    public function invalidate(File $file)
    {
        // A pending change to FileContentCache means save() is writing the cache;
        // wiping it here would throw that new content away.
        if ($file->isChanged('FileContentCache')) {
            return;
        }

        $file->FileContentCache = '';
    }
}
/**
 * Keeps extracted content in an SS_Cache backend, with a configurable lifetime.
 */
class FileTextCache_SSCache implements FileTextCache, Flushable
{
    /**
     * Lifetime of cache in seconds
     * Null is indefinite
     *
     * @var int|null
     * @config
     */
    private static $lifetime = null;

    /**
     * Build the cache for this class and apply the configured lifetime to it.
     *
     * @return mixed Cache object produced by SS_Cache::factory()
     */
    protected static function get_cache()
    {
        $configuredLifetime = Config::inst()->get(__CLASS__, 'lifetime');

        $backend = SS_Cache::factory(__CLASS__);
        $backend->setLifetime($configuredLifetime);

        return $backend;
    }

    /**
     * Cache key for a file: the md5 hash of its full path.
     *
     * @param File $file
     * @return string
     */
    protected function getKey(File $file)
    {
        $fullPath = $file->getFullPath();

        return md5($fullPath);
    }

    /**
     * Fetch the cached content for a file.
     *
     * @param File $file
     * @return mixed Whatever the cache backend's load() returns for the file's key
     */
    public function load(File $file)
    {
        $cacheKey = $this->getKey($file);

        return self::get_cache()->load($cacheKey);
    }

    /**
     * Store extracted content under the file's key.
     *
     * @param File $file
     * @param string $content
     * @return mixed Whatever the cache backend's save() returns
     */
    public function save(File $file, $content)
    {
        $cacheKey = $this->getKey($file);

        return self::get_cache()->save($content, $cacheKey);
    }

    /**
     * Flushable hook: clean the cache backend.
     */
    public static function flush()
    {
        self::get_cache()->clean();
    }

    /**
     * Remove the cached content for a single file.
     *
     * @param File $file
     * @return mixed Whatever the cache backend's remove() returns
     */
    public function invalidate(File $file)
    {
        $cacheKey = $this->getKey($file);

        return self::get_cache()->remove($cacheKey);
    }
}

View File

@ -1,77 +0,0 @@
<?php
/**
 * Text extractor that uses php function strip_tags to get just the text. OK for indexing, not the best for readable text.
 *
 * @author mstephens
 */
class HTMLTextExtractor extends FileTextExtractor
{
    /**
     * Lower priority because its not the most clever HTML extraction. If there is something better, use it
     *
     * @config
     * @var integer
     */
    private static $priority = 10;

    /**
     * Needs nothing beyond PHP itself, so it is always available.
     *
     * @return boolean
     */
    public function isAvailable()
    {
        return true;
    }

    /**
     * @param string $extension File extension, matched case-insensitively
     * @return boolean True for html, htm and xhtml
     */
    public function supportsExtension($extension)
    {
        return in_array(
            strtolower($extension),
            array('html', 'htm', 'xhtml'),
            true
        );
    }

    /**
     * @param string $mime Mime type, matched case-insensitively
     * @return boolean True for text/html only
     */
    public function supportsMime($mime)
    {
        return strtolower($mime) === 'text/html';
    }

    /**
     * Extracts content by using strip_tags() combined with regular expressions
     * to remove non-content tags like <style> or <script>, as well as adding
     * a line break in front of block-level tags so that text from adjacent
     * blocks does not run together once the tags are stripped.
     *
     * @param string $path
     * @return string Extracted text; empty string if the file cannot be read or processed
     */
    public function getContent($path)
    {
        $content = file_get_contents($path);
        if ($content === false) {
            return '';
        }

        // Yes, yes, regex'ing HTML is evil.
        // Since we don't care about well-formedness or markup here, it does the job.

        // Invisible content: the whole element, including its body, becomes a single space
        $invisiblePatterns = array(
            '@<head[^>]*?>.*?</head>@siu',
            '@<style[^>]*?>.*?</style>@siu',
            '@<script[^>]*?.*?</script>@siu',
            '@<object[^>]*?.*?</object>@siu',
            '@<embed[^>]*?.*?</embed>@siu',
            '@<applet[^>]*?.*?</applet>@siu',
            '@<noframes[^>]*?.*?</noframes>@siu',
            '@<noscript[^>]*?.*?</noscript>@siu',
            '@<noembed[^>]*?.*?</noembed>@siu',
        );

        // Block-level tags: the tag is kept (strip_tags removes it later) but gets a line break in front
        $blockPatterns = array(
            '@</?((address)|(blockquote)|(center)|(del))@iu',
            '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
            '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
            '@</?((table)|(th)|(td)|(caption))@iu',
            '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
            '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
            '@</?((frameset)|(frame)|(iframe))@iu',
        );

        $content = preg_replace($invisiblePatterns, ' ', $content);
        if ($content !== null) {
            // "\n$0" re-inserts the matched tag opening, prefixed with a line break.
            // The previous replacement was a bare "$0", which left the markup unchanged.
            $content = preg_replace($blockPatterns, "\n\$0", $content);
        }

        // preg_replace() yields null on failure (e.g. input that is not valid UTF-8 under the "u" modifier)
        if ($content === null) {
            return '';
        }

        return strip_tags($content);
    }
}

View File

@ -16,15 +16,16 @@
}
],
"require": {
"php": ">=5.3.2",
"php": ">=5.6",
"composer/installers": "*",
"silverstripe/framework": "^3.1",
"guzzle/guzzle": "^3.9",
"silverstripe/framework": "4.0.x-dev",
"guzzlehttp/guzzle": "~3.8.1",
"symfony/event-dispatcher": "^2.6.0@stable",
"symfony/http-foundation": "^2.6.0"
"symfony/http-foundation": "^2.6.0",
"silverstripe/assets": "^1"
},
"require-dev": {
"phpunit/phpunit": "^3.7"
"phpunit/phpunit": "~5.0"
},
"suggest": {
"ext-fileinfo": "Improved support for file mime detection"

View File

@ -0,0 +1,9 @@
<?php
namespace SilverStripe\TextExtraction\Exception;
use \Exception;
/**
* Exception type used by the text extractors, e.g. thrown when an extractor
* is invoked while unavailable or when the underlying conversion fails.
*/
class FileTextExtractor_Exception extends Exception
{
}

View File

@ -0,0 +1,106 @@
<?php
namespace SilverStripe\TextExtraction\Extension;
use SilverStripe\Assets\File,
SilverStripe\Core\Config\Config,
SilverStripe\TextExtraction\Extension\FileTextCache,
SilverStripe\Core\Flushable,
Psr\SimpleCache\CacheInterface,
SilverStripe\Core\Injector\Injector;
/**
 * Keeps extracted file content in a cache service obtained from the Injector
 * (registered as "Psr\SimpleCache\CacheInterface.FileTextCache_Cache"),
 * storing every entry with a lifetime.
 */
class FileTextCache_Cache implements FileTextCache, Flushable
{
    /**
     * Lifetime of cache entries in seconds.
     * When left empty (null or 0), save() falls back to 3600 seconds.
     *
     * @var int|null
     * @config
     */
    private static $lifetime = null;

    /**
     * Fetch the cache service dedicated to extracted text.
     *
     * @return CacheInterface
     */
    protected static function get_cache()
    {
        $serviceName = CacheInterface::class . '.' . 'FileTextCache_Cache';

        return Injector::inst()->get($serviceName);
    }

    /**
     * Cache key for a file: the md5 hash of its filename.
     *
     * @param File $file
     * @return string
     */
    protected function getKey(File $file)
    {
        $filename = $file->getFilename();

        return md5($filename);
    }

    /**
     * Fetch the cached content for a file.
     *
     * @param File $file
     * @return mixed Whatever the cache's get() returns for the file's key
     */
    public function load(File $file)
    {
        $cacheKey = $this->getKey($file);

        return self::get_cache()->get($cacheKey);
    }

    /**
     * Store extracted content under the file's key, using the configured
     * lifetime or 3600 seconds when none is configured.
     *
     * @param File $file
     * @param string $content
     * @return mixed Whatever the cache's set() returns
     */
    public function save(File $file, $content)
    {
        $ttl = Config::inst()->get(__CLASS__, 'lifetime');
        if (!$ttl) {
            $ttl = 3600;
        }

        $cacheKey = $this->getKey($file);

        return self::get_cache()->set($cacheKey, $content, $ttl);
    }

    /**
     * Flushable hook: empty the whole cache.
     *
     * @return void
     */
    public static function flush()
    {
        self::get_cache()->clear();
    }

    /**
     * Alias for flush()
     *
     * @return void
     */
    public static function clear()
    {
        self::flush();
    }

    /**
     * Remove the cached content for a single file.
     *
     * @param File $file
     * @return mixed Whatever the cache's delete() returns
     */
    public function invalidate(File $file)
    {
        $cacheKey = $this->getKey($file);

        return self::get_cache()->delete($cacheKey);
    }
}

View File

@ -0,0 +1,47 @@
<?php
namespace SilverStripe\TextExtraction\Extension;
use SilverStripe\Assets\File,
SilverStripe\Core\Config\Config,
SilverStripe\TextExtraction\Extension\FileTextCache;
/**
 * Caches the extracted content on the record for the file.
 * When a max_content_length is configured, the stored content is cut to that
 * many bytes to avoid hitting query size limits.
 */
class FileTextCache_Database implements FileTextCache
{
    /**
     * Read the cached text straight off the record.
     *
     * @param File $file
     * @return mixed Current value of the record's FileContentCache field
     */
    public function load(File $file)
    {
        return $file->FileContentCache;
    }

    /**
     * Put the (possibly truncated) text on the record and write the record.
     *
     * @param File $file
     * @param mixed $content
     */
    public function save(File $file, $content)
    {
        $maxLength = $this->getMaxContentLength();
        $file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content;
        $file->write();
    }

    /**
     * Blank the cached text, unless the cached text is the very thing being changed.
     *
     * @param File $file
     * @return void
     */
    public function invalidate(File $file)
    {
        // To prevent writing to the cache from invalidating it
        if (!$file->isChanged('FileContentCache')) {
            $file->FileContentCache = '';
        }
    }

    /**
     * Look up the configured max_content_length.
     *
     * This class is namespaced, so configuration set against the class is keyed by
     * its fully qualified name. That key is read first; the unqualified
     * 'FileTextCache_Database' key is kept as a fallback so existing configuration
     * and callers that still use the old name keep working.
     *
     * @return mixed Configured limit, or null when neither key is set
     */
    private function getMaxContentLength()
    {
        $config = Config::inst();

        $maxLength = $config->get(__CLASS__, 'max_content_length');
        if ($maxLength === null) {
            $maxLength = $config->get('FileTextCache_Database', 'max_content_length');
        }

        return $maxLength;
    }
}

View File

@ -0,0 +1,31 @@
<?php
namespace SilverStripe\TextExtraction\Extension;
use SilverStripe\Assets\File;
/**
* Contract for a store that keeps the text extracted from a File,
* so that the extraction does not have to be repeated on every request.
*/
interface FileTextCache
{
/**
* Save extracted content for a given File entity
*
* @param File $file File the content was extracted from
* @param string $content Extracted text to store
*/
public function save(File $file, $content);
/**
* Return any cached extracted content for a given file entity
*
* @param File $file File whose cached content is requested
* @return mixed Cached content; what is returned when nothing is cached depends on the implementation
*/
public function load(File $file);
/**
* Invalidate the cache for a given file.
* Invoked in onBeforeWrite on the file
*
* @param File $file File whose cached content should be discarded
*/
public function invalidate(File $file);
}

View File

@ -1,9 +1,15 @@
<?php
namespace SilverStripe\TextExtraction\Extension;
use SilverStripe\ORM\DataExtension,
SilverStripe\TextExtraction\Extension\FileTextCache,
SilverStripe\Control\Director;
/**
* Decorate File or a File derivative to enable text extraction from the file content. Uses a set of subclasses of
* FileTextExtractor to do the extraction based on the content type of the file.
*
*
* Adds an additional property which is the cached contents, which is populated on demand.
*
* @author mstephens
@ -11,16 +17,31 @@
*/
class FileTextExtractable extends DataExtension
{
/**
*
* @var array
* @config
*/
private static $db = array(
'FileContentCache' => 'Text'
);
/**
*
* @var array
* @config
*/
private static $casting = array(
'FileContent' => 'Text'
);
/**
*
* @var array
* @config
*/
private static $dependencies = array(
'TextCache' => '%$FileTextCache'
'TextCache' => '%$SilverStripe\TextExtraction\Extension\FileTextCache_Cache'
);
/**
@ -30,7 +51,8 @@ class FileTextExtractable extends DataExtension
/**
*
* @param FileTextCache $cache
* @param FileTextCache $cache
* @return void
*/
public function setTextCache(FileTextCache $cache)
{
@ -58,10 +80,11 @@ class FileTextExtractable extends DataExtension
/**
* Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text.
* The value is also cached into the File record itself.
*
*
* @param boolean $disableCache If false, the file content is only parsed on demand.
* If true, the content parsing is forced, bypassing the cached version
* @return string
* If true, the content parsing is forced, bypassing
* the cached version
* @return mixed string | null
*/
public function extractFileAsText($disableCache = false)
{
@ -73,23 +96,27 @@ class FileTextExtractable extends DataExtension
}
// Determine which extractor can process this file.
$extractor = FileTextExtractor::for_file($this->owner->FullPath);
$path = Director::baseFolder() . '/' . $this->owner->getFilename();
$extractor = FileTextExtractor::for_file($path);
if (!$extractor) {
return null;
}
$text = $extractor->getContent($this->owner->FullPath);
$text = $extractor->getContent($path);
if (!$text) {
return null;
}
if (!$disableCache) {
$this->getTextCache()->save($this->owner, $text);
$this->getTextCache()->save($this->owner, $text);
}
return $text;
}
/**
* @return void
*/
public function onBeforeWrite()
{
// Clear cache before changing file

View File

@ -1,12 +1,19 @@
<?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Core\Config\Config,
SilverStripe\Core\Injector\Injector,
SilverStripe\Core\ClassInfo;
/**
* A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents.
* @author mstephens
*
*/
abstract class FileTextExtractor extends Object
abstract class FileTextExtractor
{
/**
* Set priority from 0-100.
* The highest priority extractor for a given content type will be selected.
@ -34,11 +41,12 @@ abstract class FileTextExtractor extends Object
if (self::$sorted_extractor_classes) {
return self::$sorted_extractor_classes;
}
// Generate the sorted list of extractors on demand.
$classes = ClassInfo::subclassesFor("FileTextExtractor");
$classes = ClassInfo::subclassesFor(__CLASS__);
array_shift($classes);
$classPriorities = array();
foreach ($classes as $class) {
$classPriorities[$class] = Config::inst()->get($class, 'priority');
}
@ -74,8 +82,8 @@ abstract class FileTextExtractor extends Object
}
/**
* @param string $path
* @return FileTextExtractor|null
* @param string $path
* @return mixed FileTextExtractor | null
*/
public static function for_file($path)
{
@ -85,6 +93,7 @@ abstract class FileTextExtractor extends Object
$extension = pathinfo($path, PATHINFO_EXTENSION);
$mime = self::get_mime($path);
foreach (self::get_extractor_classes() as $className) {
$extractor = self::get_extractor($className);
@ -108,7 +117,7 @@ abstract class FileTextExtractor extends Object
/**
* Checks if the extractor is supported on the current environment,
* for example if the correct binaries or libraries are available.
*
*
* @return boolean
*/
abstract public function isAvailable();
@ -125,7 +134,7 @@ abstract class FileTextExtractor extends Object
/**
* Determine if this extractor supports the given mime type.
* Will only be called if supportsExtension returns false.
*
*
* @param string $mime
* @return boolean
*/
@ -133,13 +142,9 @@ abstract class FileTextExtractor extends Object
/**
* Given a file path, extract the contents as text.
*
*
* @param string $path
* @return string
*/
abstract public function getContent($path);
}
class FileTextExtractor_Exception extends Exception
{
}

View File

@ -0,0 +1,94 @@
<?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/**
 * Text extractor that uses php function strip_tags to get just the text. OK for indexing, not the best for readable text.
 *
 * @author mstephens
 */
class HTMLTextExtractor extends FileTextExtractor
{
    /**
     * Lower priority because its not the most clever HTML extraction. If there is something better, use it
     *
     * @config
     * @var integer
     */
    private static $priority = 10;

    /**
     * Needs nothing beyond PHP itself, so it is always available.
     *
     * @return boolean
     */
    public function isAvailable()
    {
        return true;
    }

    /**
     * @param string $extension File extension, matched case-insensitively
     * @return boolean True for html, htm and xhtml
     */
    public function supportsExtension($extension)
    {
        return in_array(
            strtolower($extension),
            array('html', 'htm', 'xhtml'),
            true
        );
    }

    /**
     * @param string $mime Mime type, matched case-insensitively
     * @return boolean True for text/html only
     */
    public function supportsMime($mime)
    {
        return strtolower($mime) === 'text/html';
    }

    /**
     * Extracts content by using strip_tags() combined with regular expressions
     * to remove non-content tags like <style> or <script>, as well as adding
     * a line break in front of block-level tags so that text from adjacent
     * blocks does not run together once the tags are stripped.
     *
     * @param string $path
     * @return string Extracted text; empty string if the file cannot be read or processed
     */
    public function getContent($path)
    {
        $content = file_get_contents($path);
        if ($content === false) {
            return '';
        }

        // Yes, yes, regex'ing HTML is evil.
        // Since we don't care about well-formedness or markup here, it does the job.

        // Invisible content: the whole element, including its body, becomes a single space
        $invisiblePatterns = array(
            '@<head[^>]*?>.*?</head>@siu',
            '@<style[^>]*?>.*?</style>@siu',
            '@<script[^>]*?.*?</script>@siu',
            '@<object[^>]*?.*?</object>@siu',
            '@<embed[^>]*?.*?</embed>@siu',
            '@<applet[^>]*?.*?</applet>@siu',
            '@<noframes[^>]*?.*?</noframes>@siu',
            '@<noscript[^>]*?.*?</noscript>@siu',
            '@<noembed[^>]*?.*?</noembed>@siu',
        );

        // Block-level tags: the tag is kept (strip_tags removes it later) but gets a line break in front
        $blockPatterns = array(
            '@</?((address)|(blockquote)|(center)|(del))@iu',
            '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
            '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
            '@</?((table)|(th)|(td)|(caption))@iu',
            '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
            '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
            '@</?((frameset)|(frame)|(iframe))@iu',
        );

        $content = preg_replace($invisiblePatterns, ' ', $content);
        if ($content !== null) {
            // "\n$0" re-inserts the matched tag opening, prefixed with a line break.
            // The previous replacement was a bare "$0", which left the markup unchanged.
            $content = preg_replace($blockPatterns, "\n\$0", $content);
        }

        // preg_replace() yields null on failure (e.g. input that is not valid UTF-8 under the "u" modifier)
        if ($content === null) {
            return '';
        }

        return strip_tags($content);
    }
}

View File

@ -1,5 +1,10 @@
<?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
SilverStripe\TextExtraction\Exception\FileTextExtractor_Exception;
/**
* Text extractor that calls pdftotext to do the conversion.
* @author mstephens
@ -7,6 +12,7 @@
*/
class PDFTextExtractor extends FileTextExtractor
{
/**
* Set to bin path this extractor can execute
*
@ -40,13 +46,12 @@ class PDFTextExtractor extends FileTextExtractor
public function supportsMime($mime)
{
return in_array(
strtolower($mime),
array(
'application/pdf',
'application/x-pdf',
'application/x-bzpdf',
'application/x-gzpdf'
)
strtolower($mime), array(
'application/pdf',
'application/x-pdf',
'application/x-bzpdf',
'application/x-gzpdf'
)
);
}
@ -66,16 +71,16 @@ class PDFTextExtractor extends FileTextExtractor
}
// Find program in each path
foreach($locations as $location) {
foreach ($locations as $location) {
$path = "{$location}/{$program}";
if(file_exists($path)) {
if (file_exists($path)) {
return $path;
}
if (file_exists($path.'.exe')) {
return $path.'.exe';
if (file_exists($path . '.exe')) {
return $path . '.exe';
}
}
// Not found
return null;
}
@ -92,13 +97,13 @@ class PDFTextExtractor extends FileTextExtractor
/**
* Invoke pdftotext with the given path
*
* @param string $path
* @param string $path
* @return string Output
* @throws FileTextExtractor_Exception
*/
protected function getRawOutput($path)
{
if(!$this->isAvailable()) {
if (!$this->isAvailable()) {
throw new FileTextExtractor_Exception("getRawOutput called on unavailable extractor");
}
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
@ -108,11 +113,10 @@ class PDFTextExtractor extends FileTextExtractor
$err = $content;
}
throw new FileTextExtractor_Exception(sprintf(
'PDFTextExtractor->getContent() failed for %s: %s',
$path,
implode(PHP_EOL, $err)
'PDFTextExtractor->getContent() failed for %s: %s', $path, implode(PHP_EOL, $err)
));
}
return implode(PHP_EOL, $content);
}
@ -135,6 +139,8 @@ class PDFTextExtractor extends FileTextExtractor
'ſt' => 'ft',
'st' => 'st'
);
return str_replace(array_keys($mapping), array_values($mapping), $input);
}
}

View File

@ -1,12 +1,18 @@
<?php
use Guzzle\Http\Client;
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
Guzzle\Http\Client,
\InvalidArgumentException,
Psr\Log\LoggerInterface;
/**
* Text extractor that calls an Apache Solr instance
* and extracts content via the "ExtractingRequestHandler" endpoint.
* Does not alter the Solr index itself, but uses it purely
* for its file parsing abilities.
*
*
* @author ischommer
* @see http://wiki.apache.org/solr/ExtractingRequestHandler
*/
@ -21,10 +27,24 @@ class SolrCellTextExtractor extends FileTextExtractor
*/
private static $base_url;
/**
*
* @var int
* @config
*/
private static $priority = 75;
/**
*
* @var Guzzle\Http\Client
*/
protected $httpClient;
/**
*
* @return Guzzle\Http\Client
* @throws InvalidArgumentException
*/
public function getHttpClient()
{
if (!$this->config()->get('base_url')) {
@ -33,20 +53,35 @@ class SolrCellTextExtractor extends FileTextExtractor
if (!$this->httpClient) {
$this->httpClient = new Client($this->config()->get('base_url'));
}
return $this->httpClient;
}
/**
*
* @param Guzzle\Http\Client $client
* @return void
*/
public function setHttpClient($client)
{
$this->httpClient = $client;
}
/**
* @return string
*/
public function isAvailable()
{
$url = $this->config()->get('base_url');
return (boolean) $url;
}
/**
*
* @param string $extension
* @return boolean
*/
public function supportsExtension($extension)
{
return in_array(
@ -59,12 +94,22 @@ class SolrCellTextExtractor extends FileTextExtractor
);
}
/**
*
* @param string $mime
* @return boolean
*/
public function supportsMime($mime)
{
// Rely on supportsExtension
return false;
}
/**
*
* @param string $path
* @return string
*/
public function getContent($path)
{
if (!$path) {
@ -73,6 +118,7 @@ class SolrCellTextExtractor extends FileTextExtractor
$fileName = basename($path);
$client = $this->getHttpClient();
try {
$request = $client
->post()
@ -80,27 +126,30 @@ class SolrCellTextExtractor extends FileTextExtractor
->addPostFiles(array('myfile' => $path));
$response = $request->send();
} catch (InvalidArgumentException $e) {
SS_Log::log(
sprintf(
$msg = sprintf(
'Error extracting text from "%s" (message: %s)',
$path,
$e->getMessage()
),
SS_Log::NOTICE
);
);
Injector::inst()->get(LoggerInterface::class)->notice($msg);
return null;
} catch (Guzzle\Http\Exception\ServerErrorResponseException $e) {
//catch other errors that Tika can throw vai Guzzle but are not caught and break Solr search query in some cases.
SS_Log::log(
sprintf(
// Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search query in some cases.
$msg = sprintf(
'Tika server error attempting to extract from "%s" (message: %s)',
$path,
$e->getMessage()
),
SS_Log::NOTICE
);
);
Injector::inst()->get(LoggerInterface::class)->notice($msg);
return null;
}
// Just initialise it, it doesn't take much.
$matches = [];
// Use preg match to avoid SimpleXML running out of memory on large text nodes
preg_match(
sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName)),

View File

@ -1,5 +1,12 @@
<?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
SilverStripe\Core\Injector\Injector,
SilverStripe\Core\Environment,
SilverStripe\TextExtraction\Rest\TikaRestClient;
/**
* Enables text extraction of file content via the Tika Rest Server
*
@ -36,20 +43,19 @@ class TikaServerTextExtractor extends FileTextExtractor
return $this->client ?:
($this->client =
Injector::inst()->createWithArgs(
'TikaRestClient',
TikaRestClient::class,
array($this->getServerEndpoint())
)
);
}
/**
* @return string
*/
public function getServerEndpoint()
{
if (defined('SS_TIKA_ENDPOINT')) {
return SS_TIKA_ENDPOINT;
}
if (getenv('SS_TIKA_ENDPOINT')) {
return getenv('SS_TIKA_ENDPOINT');
if ($endpoint = Environment::getEnv('SS_TIKA_ENDPOINT')) {
return $endpoint;
}
// Default to configured endpoint
@ -68,6 +74,9 @@ class TikaServerTextExtractor extends FileTextExtractor
->getVersion();
}
/**
* @return boolean
*/
public function isAvailable()
{
return $this->getServerEndpoint() &&
@ -75,6 +84,11 @@ class TikaServerTextExtractor extends FileTextExtractor
version_compare($this->getVersion(), '1.7.0') >= 0;
}
/**
*
* @param string $extension
* @return boolean
*/
public function supportsExtension($extension)
{
// Determine support via mime type only
@ -89,6 +103,11 @@ class TikaServerTextExtractor extends FileTextExtractor
*/
protected $supportedMimes = array();
/**
*
* @param string $mime
* @return boolean
*/
public function supportsMime($mime)
{
$supported = $this->supportedMimes ?:

View File

@ -1,8 +1,12 @@
<?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/**
* Enables text extraction of file content via the Tika CLI
*
*
* {@link http://tika.apache.org/1.7/gettingstarted.html}
*/
class TikaTextExtractor extends FileTextExtractor
@ -18,7 +22,7 @@ class TikaTextExtractor extends FileTextExtractor
/**
* Get the version of tika installed, or 0 if not installed
*
* @return float version of tika
* @return mixed float | int The version of tika
*/
public function getVersion()
{
@ -35,11 +39,11 @@ class TikaTextExtractor extends FileTextExtractor
/**
* Runs an arbitrary and safely escaped shell command
*
* @param string $command Full command including arguments
* @param string &$stdout Standand output
* @param string &$stderr Standard error
* @param string $input Content to pass via standard input
* @return int Exit code. 0 is success
* @param string $command Full command including arguments
* @param string &$stdout Standard output
* @param string &$stderr Standard error
* @param string $input Content to pass via standard input
* @return int Exit code. 0 is success
*/
protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
{
@ -51,6 +55,7 @@ class TikaTextExtractor extends FileTextExtractor
// Invoke command
$pipes = array();
$proc = proc_open($command, $descriptorSpecs, $pipes);
if (!is_resource($proc)) {
return 255;
}
@ -68,38 +73,60 @@ class TikaTextExtractor extends FileTextExtractor
// Get result
return proc_close($proc);
}
/**
*
* @param string $path
* @return string
*/
public function getContent($path)
{
$mode = $this->config()->output_mode;
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
$code = $this->runShell($command, $output);
if ($code == 0) {
return $output;
}
}
/**
*
* @return boolean
*/
public function isAvailable()
{
return $this->getVersion() > 0;
}
/**
*
* @return boolean
*/
public function supportsExtension($extension)
{
// Determine support via mime type only
return false;
}
/**
*
* @param string $mime
* @return boolean
*/
public function supportsMime($mime)
{
// Get list of supported mime types
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
if ($code) {
return false;
} // Error case
// Check if the mime type is inside the result
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
return (bool)preg_match($pattern, $supportedTypes);
}
}

View File

@ -1,7 +1,12 @@
<?php
use Guzzle\Http\Client;
use Guzzle\Http\Exception\RequestException;
namespace SilverStripe\TextExtraction\Rest;
use Guzzle\Http\Client,
Guzzle\Http\Exception\RequestException,
SilverStripe\Core\Environment,
Psr\Log\LoggerInterface,
SilverStripe\Core\Injector\Injector;
class TikaRestClient extends Client
{
@ -17,14 +22,22 @@ class TikaRestClient extends Client
*/
protected $mimes = array();
/**
*
* @param string $baseUrl
* @param array $config
*/
public function __construct($baseUrl = '', $config = null)
{
if (defined('SS_TIKA_USERNAME') && defined('SS_TIKA_PASSWORD')) {
$psswd = Environment::getEnv('SS_TIKA_PASSWORD');
if (!empty($psswd)) {
$this->options = array(
'username' => SS_TIKA_USERNAME,
'password' => SS_TIKA_PASSWORD,
'username' => Environment::getEnv('SS_TIKA_USERNAME'),
'password' => $psswd,
);
}
parent::__construct($baseUrl, $config);
}
@ -39,11 +52,14 @@ class TikaRestClient extends Client
$result = $this->get(null);
$result->setAuth($this->options['username'], $this->options['password']);
$result->send();
if ($result->getResponse()->getStatusCode() == 200) {
return true;
}
} catch (RequestException $ex) {
SS_Log::log(sprintf("Tika unavailable - %s", $ex->getMessage()), SS_Log::ERR);
$msg = sprintf("Tika unavailable - %s", $ex->getMessage());
Injector::inst()->get(LoggerInterface::class)->error($msg);
return false;
}
}
@ -59,12 +75,14 @@ class TikaRestClient extends Client
$response->setAuth($this->options['username'], $this->options['password']);
$response->send();
$version = 0.0;
// Parse output
if ($response->getResponse()->getStatusCode() == 200 &&
preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getResponse()->getBody(), $matches)
) {
$version = (float)$matches['version'];
}
return $version;
}
@ -78,12 +96,14 @@ class TikaRestClient extends Client
if ($this->mimes) {
return $this->mimes;
}
$response = $this->get(
'mime-types',
array('Accept' => 'application/json')
);
$response->setAuth($this->options['username'], $this->options['password']);
$response->send();
return $this->mimes = $response->getResponse()->json();
}
@ -91,7 +111,7 @@ class TikaRestClient extends Client
* Extract text content from a given file.
* Logs a notice-level error if the document can't be parsed.
*
* @param string $file Full filesystem path to a file to post
* @param string $file Full filesystem path to a file to post
* @return string Content of the file extracted as plain text
*/
public function tika($file)
@ -118,8 +138,10 @@ class TikaRestClient extends Client
if ($body) {
$msg .= ' Body: ' . $body;
}
SS_Log::log($msg, SS_Log::NOTICE);
Injector::inst()->get(LoggerInterface::class)->notice($msg);
}
return $text;
}
}

View File

@ -1,10 +1,16 @@
<?php
use SilverStripe\TextExtraction\Extension\FileTextCache,
SilverStripe\TextExtraction\Extension\FileTextCache_Database,
SilverStripe\Dev\SapphireTest,
SilverStripe\Core\Config\Config;
class FileTextCacheDatabaseTest extends SapphireTest
{
public function testTruncatesByMaxLength()
{
Config::nest();
Config::inst()->update('FileTextCache_Database', 'max_content_length', 5);
$cache = new FileTextCache_Database();
$file = $this->getMock('File', array('write'));