FIX: First-pass SS4 compatibility.

- Added namespaces, use statements
- Added missing docblocks etc
- Uses SS4's new Cache system
- Uses proper environment vars
- Cannot instantiate 'FileTextCache' (interface) as a service. This can be configured through YML, so default to FileTextCache_Cache
- Modded YML config to make it run.
- Fixes to allow TIKA to actually get file contents.
- Addresses issues raised by @robbieaverill
- Rebased against github.com/silverstripe/silverstripe-textextraction:master
- Replaced `SS_Log` with Monolog.
This commit is contained in:
Russell Michell 2017-12-21 10:24:39 +13:00
parent 875e608d0f
commit f341010d7a
18 changed files with 541 additions and 281 deletions

11
_config/cache.yml Normal file
View File

@ -0,0 +1,11 @@
---
Name: textextractioncache
After:
- '#corecache'
---
# Registers a dedicated cache service for extracted file text.
# FileTextCache_Cache::get_cache() fetches this service from the Injector by the
# name "Psr\SimpleCache\CacheInterface.FileTextCache_Cache"; CacheFactory builds
# it with the constructor arguments listed below.
SilverStripe\Core\Injector\Injector:
Psr\SimpleCache\CacheInterface.FileTextCache_Cache:
factory: SilverStripe\Core\Cache\CacheFactory
constructor:
namespace: 'FileTextCache_Cache'

View File

@ -1,11 +0,0 @@
---
Name: textextraction
---
# Binds the "FileTextCache" service name to the database-backed implementation.
Injector:
FileTextCache: FileTextCache_Database
#SolrCellTextExtractor:
# base_url: 'http://localhost:8983/solr/update/extract'
# Maximum length FileTextCache_Database::save() keeps of the extracted content
# (it truncates with substr() when this value is set).
FileTextCache_Database:
max_content_length: 500000

View File

@ -1,112 +0,0 @@
<?php
interface FileTextCache
{
    /**
     * Fetch whatever extracted content is currently cached for the file.
     *
     * @param File $file
     */
    public function load(File $file);

    /**
     * Store extracted content against the file.
     *
     * @param File $file
     * @param string $content
     */
    public function save(File $file, $content);

    /**
     * Discard the cached content held for the file.
     * Invoked in onBeforeWrite on the file.
     *
     * @param File $file
     */
    public function invalidate(File $file);
}
/**
 * Keeps the extracted content on the File record itself (FileContentCache field).
 * The stored text is cut to the configured max_content_length, when one is set,
 * to avoid hitting query size limits.
 */
class FileTextCache_Database implements FileTextCache
{
    public function load(File $file)
    {
        return $file->FileContentCache;
    }

    public function save(File $file, $content)
    {
        $limit = Config::inst()->get('FileTextCache_Database', 'max_content_length');
        if ($limit) {
            $content = substr($content, 0, $limit);
        }

        $file->FileContentCache = $content;
        $file->write();
    }

    public function invalidate(File $file)
    {
        // A pending change to FileContentCache means the cache itself is being
        // written; clearing it here would throw that write away.
        if ($file->isChanged('FileContentCache')) {
            return;
        }

        $file->FileContentCache = '';
    }
}
/**
 * Cache implementation backed by SS_Cache, with a configurable entry lifetime.
 */
class FileTextCache_SSCache implements FileTextCache, Flushable
{
    /**
     * Lifetime of cache in seconds
     * Null is indefinite
     *
     * @var int|null
     * @config
     */
    private static $lifetime = null;

    /**
     * Obtain the SS_Cache backend for this class with the configured lifetime applied.
     *
     * @return SS_Cache
     */
    protected static function get_cache()
    {
        $seconds = Config::inst()->get(__CLASS__, 'lifetime');
        $backend = SS_Cache::factory(__CLASS__);
        $backend->setLifetime($seconds);

        return $backend;
    }

    /**
     * Cache key for a file: the md5 hash of its full path.
     */
    protected function getKey(File $file)
    {
        $fullPath = $file->getFullPath();

        return md5($fullPath);
    }

    public function load(File $file)
    {
        $cacheKey = $this->getKey($file);

        return self::get_cache()->load($cacheKey);
    }

    public function save(File $file, $content)
    {
        $cacheKey = $this->getKey($file);

        return self::get_cache()->save($content, $cacheKey);
    }

    public static function flush()
    {
        self::get_cache()->clean();
    }

    public function invalidate(File $file)
    {
        $cacheKey = $this->getKey($file);

        return self::get_cache()->remove($cacheKey);
    }
}

View File

@ -1,77 +0,0 @@
<?php
/**
 * Text extractor that uses php function strip_tags to get just the text. OK for indexing, not the best for readable text.
 * @author mstephens
 *
 */
class HTMLTextExtractor extends FileTextExtractor
{
    /**
     * Lower priority because its not the most clever HTML extraction. If there is something better, use it
     *
     * @config
     * @var integer
     */
    private static $priority = 10;

    /**
     * This extractor only relies on built-in PHP functions, so it always reports itself as available.
     *
     * @return boolean
     */
    public function isAvailable()
    {
        return true;
    }

    /**
     * @param string $extension File extension without the dot
     * @return boolean True for html, htm and xhtml (compared case-insensitively)
     */
    public function supportsExtension($extension)
    {
        return in_array(
            strtolower($extension),
            array("html", "htm", "xhtml"),
            true
        );
    }

    /**
     * @param string $mime
     * @return boolean True only for text/html (compared case-insensitively)
     */
    public function supportsMime($mime)
    {
        return strtolower($mime) === 'text/html';
    }

    /**
     * Extracts content from regex, by using strip_tags()
     * combined with regular expressions to remove non-content tags like <style> or <script>,
     * as well as adding line breaks around block tags.
     *
     * @param string $path Path of the HTML file to read
     * @return string Text with all tags removed; empty string if the file is empty or unreadable
     */
    public function getContent($path)
    {
        $content = file_get_contents($path);
        if ($content === false || $content === '') {
            return '';
        }

        // Yes, yes, regex'ing HTML is evil.
        // Since we don't care about well-formedness or markup here, it does the job.

        // Elements whose content is never visible text: each match becomes a single space.
        $invisible = array(
            '@<head[^>]*?>.*?</head>@siu',
            '@<style[^>]*?>.*?</style>@siu',
            '@<script[^>]*?.*?</script>@siu',
            '@<object[^>]*?.*?</object>@siu',
            '@<embed[^>]*?.*?</embed>@siu',
            '@<applet[^>]*?.*?</applet>@siu',
            '@<noframes[^>]*?.*?</noframes>@siu',
            '@<noscript[^>]*?.*?</noscript>@siu',
            '@<noembed[^>]*?.*?</noembed>@siu',
        );

        // Start of an opening or closing block-level tag: a line break is put in front of it
        // so that text from adjacent blocks is not glued together once the tags are stripped.
        $blocks = array(
            '@</?((address)|(blockquote)|(center)|(del))@iu',
            '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
            '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
            '@</?((table)|(th)|(td)|(caption))@iu',
            '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
            '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
            '@</?((frameset)|(frame)|(iframe))@iu',
        );

        // Build the replacement list from the pattern lists so the two can never get out of step.
        $patterns = array_merge($invisible, $blocks);
        $replacements = array_merge(
            array_fill(0, count($invisible), ' '),
            array_fill(0, count($blocks), "\n\$0")
        );

        $stripped = preg_replace($patterns, $replacements, $content);
        if ($stripped === null) {
            // preg_replace() returns null on error (for instance when the "u" modifier meets
            // input that is not valid UTF-8). Fall back to the raw markup instead of losing everything.
            $stripped = $content;
        }

        return strip_tags($stripped);
    }
}

View File

@ -16,15 +16,16 @@
} }
], ],
"require": { "require": {
"php": ">=5.3.2", "php": ">=5.6",
"composer/installers": "*", "composer/installers": "*",
"silverstripe/framework": "^3.1", "silverstripe/framework": "4.0.x-dev",
"guzzle/guzzle": "^3.9", "guzzlehttp/guzzle": "~3.8.1",
"symfony/event-dispatcher": "^2.6.0@stable", "symfony/event-dispatcher": "^2.6.0@stable",
"symfony/http-foundation": "^2.6.0" "symfony/http-foundation": "^2.6.0",
"silverstripe/assets": "^1"
}, },
"require-dev": { "require-dev": {
"phpunit/phpunit": "^3.7" "phpunit/phpunit": "~5.0"
}, },
"suggest": { "suggest": {
"ext-fileinfo": "Improved support for file mime detection" "ext-fileinfo": "Improved support for file mime detection"

View File

@ -0,0 +1,9 @@
<?php
namespace SilverStripe\TextExtraction\Exception;
use \Exception;
/**
 * Exception thrown by text extractors when extraction fails
 * (for example by PDFTextExtractor::getContent()).
 */
class FileTextExtractor_Exception extends Exception
{
}

View File

@ -0,0 +1,106 @@
<?php
namespace SilverStripe\TextExtraction\Extension;
use SilverStripe\Assets\File,
SilverStripe\Core\Config\Config,
SilverStripe\TextExtraction\Extension\FileTextCache,
SilverStripe\Core\Flushable,
Psr\SimpleCache\CacheInterface,
SilverStripe\Core\Injector\Injector;
/**
 * Caches extracted content in the cache service registered with the Injector
 * under the name "Psr\SimpleCache\CacheInterface.FileTextCache_Cache".
 */
class FileTextCache_Cache implements FileTextCache, Flushable
{
    /**
     * Lifetime of cache entries in seconds.
     * When left unset (null) or otherwise empty, save() falls back to 3600 seconds.
     *
     * @var int|null
     * @config
     */
    private static $lifetime = null;

    /**
     * Fetch the cache service from the Injector.
     *
     * @return CacheInterface
     */
    protected static function get_cache()
    {
        $for = sprintf('%s.%s', CacheInterface::class, 'FileTextCache_Cache');

        return Injector::inst()->get($for);
    }

    /**
     * Build the cache key for a file: the md5 hash of its filename.
     *
     * @param File $file
     * @return string
     */
    protected function getKey(File $file)
    {
        return md5($file->getFilename());
    }

    /**
     * Return the cached content for the given file.
     *
     * @param File $file
     * @return mixed Whatever the cache returns for the file's key (per PSR-16, null when there is no entry)
     */
    public function load(File $file)
    {
        $key = $this->getKey($file);

        return self::get_cache()->get($key);
    }

    /**
     * Cache the extracted content for the given file, using the configured
     * lifetime or 3600 seconds when none is configured.
     *
     * @param File $file
     * @param string $content
     * @return mixed Result of the cache's set() call (a success flag under PSR-16)
     */
    public function save(File $file, $content)
    {
        $lifetime = Config::inst()->get(__CLASS__, 'lifetime') ?: 3600;
        $key = $this->getKey($file);

        return self::get_cache()->set($key, $content, $lifetime);
    }

    /**
     * Wipe every entry held by this cache.
     *
     * @return void
     */
    public static function flush()
    {
        self::get_cache()->clear();
    }

    /**
     * Alias for self::flush()
     *
     * @return void
     */
    public static function clear()
    {
        // Delegate rather than duplicate, so the two can never drift apart.
        self::flush();
    }

    /**
     * Remove the cached content for the given file.
     *
     * @param File $file
     * @return mixed Result of the cache's delete() call (a success flag under PSR-16)
     */
    public function invalidate(File $file)
    {
        $key = $this->getKey($file);

        return self::get_cache()->delete($key);
    }
}

View File

@ -0,0 +1,47 @@
<?php
namespace SilverStripe\TextExtraction\Extension;
use SilverStripe\Assets\File,
SilverStripe\Core\Config\Config,
SilverStripe\TextExtraction\Extension\FileTextCache;
/**
 * Caches the extracted content on the record for the file.
 * Limits the stored file content by default to avoid hitting query size limits.
 */
class FileTextCache_Database implements FileTextCache
{
    /**
     * Return the content cached on the file record.
     *
     * @param File $file
     * @return mixed Value of the record's FileContentCache field
     */
    public function load(File $file)
    {
        return $file->FileContentCache;
    }

    /**
     * Store the content on the file record and write the record.
     * When max_content_length is configured for this class, the content is
     * truncated to that length first.
     *
     * @param File $file
     * @param string $content
     * @return void
     */
    public function save(File $file, $content)
    {
        // The class is namespaced, so its config must be looked up under the fully
        // qualified name; the bare 'FileTextCache_Database' string no longer names this class.
        $maxLength = Config::inst()->get(__CLASS__, 'max_content_length');
        $file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content;
        $file->write();
    }

    /**
     * Blank the cached content on the record (the record is not written here).
     *
     * @param File $file
     * @return void
     */
    public function invalidate(File $file)
    {
        // To prevent writing to the cache from invalidating it
        if (!$file->isChanged('FileContentCache')) {
            $file->FileContentCache = '';
        }
    }
}

View File

@ -0,0 +1,31 @@
<?php
namespace SilverStripe\TextExtraction\Extension;
use SilverStripe\Assets\File;
interface FileTextCache
{
    /**
     * Fetch whatever extracted content is currently cached for the file.
     *
     * @param File $file
     */
    public function load(File $file);

    /**
     * Store extracted content against the file.
     *
     * @param File $file
     * @param string $content
     */
    public function save(File $file, $content);

    /**
     * Discard the cached content held for the file.
     * Invoked in onBeforeWrite on the file.
     *
     * @param File $file
     */
    public function invalidate(File $file);
}

View File

@ -1,5 +1,11 @@
<?php <?php
namespace SilverStripe\TextExtraction\Extension;
use SilverStripe\ORM\DataExtension,
SilverStripe\TextExtraction\Extension\FileTextCache,
SilverStripe\Control\Director;
/** /**
* Decorate File or a File derivative to enable text extraction from the file content. Uses a set of subclasses of * Decorate File or a File derivative to enable text extraction from the file content. Uses a set of subclasses of
* FileTextExtractor to do the extraction based on the content type of the file. * FileTextExtractor to do the extraction based on the content type of the file.
@ -11,16 +17,31 @@
*/ */
class FileTextExtractable extends DataExtension class FileTextExtractable extends DataExtension
{ {
/**
*
* @var array
* @config
*/
private static $db = array( private static $db = array(
'FileContentCache' => 'Text' 'FileContentCache' => 'Text'
); );
/**
*
* @var array
* @config
*/
private static $casting = array( private static $casting = array(
'FileContent' => 'Text' 'FileContent' => 'Text'
); );
/**
*
* @var array
* @config
*/
private static $dependencies = array( private static $dependencies = array(
'TextCache' => '%$FileTextCache' 'TextCache' => '%$SilverStripe\TextExtraction\Extension\FileTextCache_Cache'
); );
/** /**
@ -31,6 +52,7 @@ class FileTextExtractable extends DataExtension
/** /**
* *
* @param FileTextCache $cache * @param FileTextCache $cache
* @return void
*/ */
public function setTextCache(FileTextCache $cache) public function setTextCache(FileTextCache $cache)
{ {
@ -60,8 +82,9 @@ class FileTextExtractable extends DataExtension
* The value is also cached into the File record itself. * The value is also cached into the File record itself.
* *
* @param boolean $disableCache If false, the file content is only parsed on demand. * @param boolean $disableCache If false, the file content is only parsed on demand.
* If true, the content parsing is forced, bypassing the cached version * If true, the content parsing is forced, bypassing
* @return string * the cached version
* @return mixed string | null
*/ */
public function extractFileAsText($disableCache = false) public function extractFileAsText($disableCache = false)
{ {
@ -73,12 +96,13 @@ class FileTextExtractable extends DataExtension
} }
// Determine which extractor can process this file. // Determine which extractor can process this file.
$extractor = FileTextExtractor::for_file($this->owner->FullPath); $path = Director::baseFolder() . '/' . $this->owner->getFilename();
$extractor = FileTextExtractor::for_file($path);
if (!$extractor) { if (!$extractor) {
return null; return null;
} }
$text = $extractor->getContent($this->owner->FullPath); $text = $extractor->getContent($path);
if (!$text) { if (!$text) {
return null; return null;
} }
@ -90,6 +114,9 @@ class FileTextExtractable extends DataExtension
return $text; return $text;
} }
/**
* @return void
*/
public function onBeforeWrite() public function onBeforeWrite()
{ {
// Clear cache before changing file // Clear cache before changing file

View File

@ -1,12 +1,19 @@
<?php <?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Core\Config\Config,
SilverStripe\Core\Injector\Injector,
SilverStripe\Core\ClassInfo;
/** /**
* A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents. * A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents.
* @author mstephens * @author mstephens
* *
*/ */
abstract class FileTextExtractor extends Object abstract class FileTextExtractor
{ {
/** /**
* Set priority from 0-100. * Set priority from 0-100.
* The highest priority extractor for a given content type will be selected. * The highest priority extractor for a given content type will be selected.
@ -36,9 +43,10 @@ abstract class FileTextExtractor extends Object
} }
// Generate the sorted list of extractors on demand. // Generate the sorted list of extractors on demand.
$classes = ClassInfo::subclassesFor("FileTextExtractor"); $classes = ClassInfo::subclassesFor(__CLASS__);
array_shift($classes); array_shift($classes);
$classPriorities = array(); $classPriorities = array();
foreach ($classes as $class) { foreach ($classes as $class) {
$classPriorities[$class] = Config::inst()->get($class, 'priority'); $classPriorities[$class] = Config::inst()->get($class, 'priority');
} }
@ -75,7 +83,7 @@ abstract class FileTextExtractor extends Object
/** /**
* @param string $path * @param string $path
* @return FileTextExtractor|null * @return mixed FileTextExtractor | null
*/ */
public static function for_file($path) public static function for_file($path)
{ {
@ -85,6 +93,7 @@ abstract class FileTextExtractor extends Object
$extension = pathinfo($path, PATHINFO_EXTENSION); $extension = pathinfo($path, PATHINFO_EXTENSION);
$mime = self::get_mime($path); $mime = self::get_mime($path);
foreach (self::get_extractor_classes() as $className) { foreach (self::get_extractor_classes() as $className) {
$extractor = self::get_extractor($className); $extractor = self::get_extractor($className);
@ -139,7 +148,3 @@ abstract class FileTextExtractor extends Object
*/ */
abstract public function getContent($path); abstract public function getContent($path);
} }
class FileTextExtractor_Exception extends Exception
{
}

View File

@ -0,0 +1,94 @@
<?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/**
* Text extractor that uses php function strip_tags to get just the text. OK for indexing, not the best for readable text.
* @author mstephens
*
*/
class HTMLTextExtractor extends FileTextExtractor
{
/**
*
* @return boolean
*/
public function isAvailable()
{
return true;
}
/**
*
* @param string $extension
* @return array
*/
public function supportsExtension($extension)
{
return in_array(
strtolower($extension), array("html", "htm", "xhtml")
);
}
/**
*
* @param string $mime
* @return string
*/
public function supportsMime($mime)
{
return strtolower($mime) === 'text/html';
}
/**
* Lower priority because its not the most clever HTML extraction. If there is something better, use it
*
* @config
* @var integer
*/
private static $priority = 10;
/**
* Extracts content from regex, by using strip_tags()
* combined with regular expressions to remove non-content tags like <style> or <script>,
* as well as adding line breaks after block tags.
*
* @param string $path
* @return string
*/
public function getContent($path)
{
$content = file_get_contents($path);
// Yes, yes, regex'ing HTML is evil.
// Since we don't care about well-formedness or markup here, it does the job.
$content = preg_replace(
array(
// Remove invisible content
'@<head[^>]*?>.*?</head>@siu',
'@<style[^>]*?>.*?</style>@siu',
'@<script[^>]*?.*?</script>@siu',
'@<object[^>]*?.*?</object>@siu',
'@<embed[^>]*?.*?</embed>@siu',
'@<applet[^>]*?.*?</applet>@siu',
'@<noframes[^>]*?.*?</noframes>@siu',
'@<noscript[^>]*?.*?</noscript>@siu',
'@<noembed[^>]*?.*?</noembed>@siu',
// Add line breaks before and after blocks
'@</?((address)|(blockquote)|(center)|(del))@iu',
'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
'@</?((table)|(th)|(td)|(caption))@iu',
'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
'@</?((frameset)|(frame)|(iframe))@iu',
), array(
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0",
), $content
);
return strip_tags($content);
}
}

View File

@ -1,5 +1,10 @@
<?php <?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
SilverStripe\TextExtraction\Exception\FileTextExtractor_Exception;
/** /**
* Text extractor that calls pdftotext to do the conversion. * Text extractor that calls pdftotext to do the conversion.
* @author mstephens * @author mstephens
@ -7,6 +12,7 @@
*/ */
class PDFTextExtractor extends FileTextExtractor class PDFTextExtractor extends FileTextExtractor
{ {
/** /**
* Set to bin path this extractor can execute * Set to bin path this extractor can execute
* *
@ -40,8 +46,7 @@ class PDFTextExtractor extends FileTextExtractor
public function supportsMime($mime) public function supportsMime($mime)
{ {
return in_array( return in_array(
strtolower($mime), strtolower($mime), array(
array(
'application/pdf', 'application/pdf',
'application/x-pdf', 'application/x-pdf',
'application/x-bzpdf', 'application/x-bzpdf',
@ -108,11 +113,10 @@ class PDFTextExtractor extends FileTextExtractor
$err = $content; $err = $content;
} }
throw new FileTextExtractor_Exception(sprintf( throw new FileTextExtractor_Exception(sprintf(
'PDFTextExtractor->getContent() failed for %s: %s', 'PDFTextExtractor->getContent() failed for %s: %s', $path, implode(PHP_EOL, $err)
$path,
implode(PHP_EOL, $err)
)); ));
} }
return implode(PHP_EOL, $content); return implode(PHP_EOL, $content);
} }
@ -135,6 +139,8 @@ class PDFTextExtractor extends FileTextExtractor
'ſt' => 'ft', 'ſt' => 'ft',
'st' => 'st' 'st' => 'st'
); );
return str_replace(array_keys($mapping), array_values($mapping), $input); return str_replace(array_keys($mapping), array_values($mapping), $input);
} }
} }

View File

@ -1,5 +1,11 @@
<?php <?php
use Guzzle\Http\Client;
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
    SilverStripe\Core\Injector\Injector,
    Guzzle\Http\Client,
    \InvalidArgumentException,
    Psr\Log\LoggerInterface;
/** /**
* Text extractor that calls an Apache Solr instance * Text extractor that calls an Apache Solr instance
@ -21,10 +27,24 @@ class SolrCellTextExtractor extends FileTextExtractor
*/ */
private static $base_url; private static $base_url;
/**
*
* @var int
* @config
*/
private static $priority = 75; private static $priority = 75;
/**
*
* @var Guzzle\Http\Client
*/
protected $httpClient; protected $httpClient;
/**
*
* @return Guzzle\Http\Client
* @throws InvalidArgumentException
*/
public function getHttpClient() public function getHttpClient()
{ {
if (!$this->config()->get('base_url')) { if (!$this->config()->get('base_url')) {
@ -33,20 +53,35 @@ class SolrCellTextExtractor extends FileTextExtractor
if (!$this->httpClient) { if (!$this->httpClient) {
$this->httpClient = new Client($this->config()->get('base_url')); $this->httpClient = new Client($this->config()->get('base_url'));
} }
return $this->httpClient; return $this->httpClient;
} }
/**
*
* @param Guzzle\Http\Client $client
* @return void
*/
public function setHttpClient($client) public function setHttpClient($client)
{ {
$this->httpClient = $client; $this->httpClient = $client;
} }
/**
* @return string
*/
public function isAvailable() public function isAvailable()
{ {
$url = $this->config()->get('base_url'); $url = $this->config()->get('base_url');
return (boolean) $url; return (boolean) $url;
} }
/**
*
* @param string $extension
* @return boolean
*/
public function supportsExtension($extension) public function supportsExtension($extension)
{ {
return in_array( return in_array(
@ -59,12 +94,22 @@ class SolrCellTextExtractor extends FileTextExtractor
); );
} }
/**
*
* @param string $mime
* @return boolean
*/
public function supportsMime($mime) public function supportsMime($mime)
{ {
// Rely on supportsExtension // Rely on supportsExtension
return false; return false;
} }
/**
*
* @param string $path
* @return string
*/
public function getContent($path) public function getContent($path)
{ {
if (!$path) { if (!$path) {
@ -73,6 +118,7 @@ class SolrCellTextExtractor extends FileTextExtractor
$fileName = basename($path); $fileName = basename($path);
$client = $this->getHttpClient(); $client = $this->getHttpClient();
try { try {
$request = $client $request = $client
->post() ->post()
@ -80,27 +126,30 @@ class SolrCellTextExtractor extends FileTextExtractor
->addPostFiles(array('myfile' => $path)); ->addPostFiles(array('myfile' => $path));
$response = $request->send(); $response = $request->send();
} catch (InvalidArgumentException $e) { } catch (InvalidArgumentException $e) {
SS_Log::log( $msg = sprintf(
sprintf(
'Error extracting text from "%s" (message: %s)', 'Error extracting text from "%s" (message: %s)',
$path, $path,
$e->getMessage() $e->getMessage()
),
SS_Log::NOTICE
); );
Injector::inst()->get(LoggerInterface::class)->notice($msg);
return null; return null;
} catch (Guzzle\Http\Exception\ServerErrorResponseException $e) { } catch (Guzzle\Http\Exception\ServerErrorResponseException $e) {
//catch other errors that Tika can throw vai Guzzle but are not caught and break Solr search query in some cases. // Catch other errors that Tika can throw vai Guzzle but are not caught and break Solr search query in some cases.
SS_Log::log( $msg = sprintf(
sprintf(
'Tika server error attempting to extract from "%s" (message: %s)', 'Tika server error attempting to extract from "%s" (message: %s)',
$path, $path,
$e->getMessage() $e->getMessage()
),
SS_Log::NOTICE
); );
Injector::inst()->get(LoggerInterface::class)->notice($msg);
return null; return null;
} }
// Just initialise it, it doesn't take much.
$matches = [];
// Use preg match to avoid SimpleXML running out of memory on large text nodes // Use preg match to avoid SimpleXML running out of memory on large text nodes
preg_match( preg_match(
sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName)), sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName)),

View File

@ -1,5 +1,12 @@
<?php <?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
SilverStripe\Core\Injector\Injector,
SilverStripe\Core\Environment,
SilverStripe\TextExtraction\Rest\TikaRestClient;
/** /**
* Enables text extraction of file content via the Tika Rest Server * Enables text extraction of file content via the Tika Rest Server
* *
@ -36,20 +43,19 @@ class TikaServerTextExtractor extends FileTextExtractor
return $this->client ?: return $this->client ?:
($this->client = ($this->client =
Injector::inst()->createWithArgs( Injector::inst()->createWithArgs(
'TikaRestClient', TikaRestClient::class,
array($this->getServerEndpoint()) array($this->getServerEndpoint())
) )
); );
} }
/**
* @return string
*/
public function getServerEndpoint() public function getServerEndpoint()
{ {
if (defined('SS_TIKA_ENDPOINT')) { if ($endpoint = Environment::getEnv('SS_TIKA_ENDPOINT')) {
return SS_TIKA_ENDPOINT; return $endpoint;
}
if (getenv('SS_TIKA_ENDPOINT')) {
return getenv('SS_TIKA_ENDPOINT');
} }
// Default to configured endpoint // Default to configured endpoint
@ -68,6 +74,9 @@ class TikaServerTextExtractor extends FileTextExtractor
->getVersion(); ->getVersion();
} }
/**
* @return boolean
*/
public function isAvailable() public function isAvailable()
{ {
return $this->getServerEndpoint() && return $this->getServerEndpoint() &&
@ -75,6 +84,11 @@ class TikaServerTextExtractor extends FileTextExtractor
version_compare($this->getVersion(), '1.7.0') >= 0; version_compare($this->getVersion(), '1.7.0') >= 0;
} }
/**
*
* @param string $extension
* @return boolean
*/
public function supportsExtension($extension) public function supportsExtension($extension)
{ {
// Determine support via mime type only // Determine support via mime type only
@ -89,6 +103,11 @@ class TikaServerTextExtractor extends FileTextExtractor
*/ */
protected $supportedMimes = array(); protected $supportedMimes = array();
/**
*
* @param string $mime
* @return boolean
*/
public function supportsMime($mime) public function supportsMime($mime)
{ {
$supported = $this->supportedMimes ?: $supported = $this->supportedMimes ?:

View File

@ -1,5 +1,9 @@
<?php <?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/** /**
* Enables text extraction of file content via the Tika CLI * Enables text extraction of file content via the Tika CLI
* *
@ -18,7 +22,7 @@ class TikaTextExtractor extends FileTextExtractor
/** /**
* Get the version of tika installed, or 0 if not installed * Get the version of tika installed, or 0 if not installed
* *
* @return float version of tika * @return mixed float | int The version of tika
*/ */
public function getVersion() public function getVersion()
{ {
@ -51,6 +55,7 @@ class TikaTextExtractor extends FileTextExtractor
// Invoke command // Invoke command
$pipes = array(); $pipes = array();
$proc = proc_open($command, $descriptorSpecs, $pipes); $proc = proc_open($command, $descriptorSpecs, $pipes);
if (!is_resource($proc)) { if (!is_resource($proc)) {
return 255; return 255;
} }
@ -69,37 +74,59 @@ class TikaTextExtractor extends FileTextExtractor
return proc_close($proc); return proc_close($proc);
} }
/**
*
* @param string $path
* @return string
*/
public function getContent($path) public function getContent($path)
{ {
$mode = $this->config()->output_mode; $mode = $this->config()->output_mode;
$command = sprintf('tika %s %s', $mode, escapeshellarg($path)); $command = sprintf('tika %s %s', $mode, escapeshellarg($path));
$code = $this->runShell($command, $output); $code = $this->runShell($command, $output);
if ($code == 0) { if ($code == 0) {
return $output; return $output;
} }
} }
/**
*
* @return boolean
*/
public function isAvailable() public function isAvailable()
{ {
return $this->getVersion() > 0; return $this->getVersion() > 0;
} }
/**
*
* @return boolean
*/
public function supportsExtension($extension) public function supportsExtension($extension)
{ {
// Determine support via mime type only // Determine support via mime type only
return false; return false;
} }
/**
*
* @param string $mime
* @return boolean
*/
public function supportsMime($mime) public function supportsMime($mime)
{ {
// Get list of supported mime types // Get list of supported mime types
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error); $code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
if ($code) { if ($code) {
return false; return false;
} // Error case } // Error case
// Check if the mime type is inside the result // Check if the mime type is inside the result
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/')); $pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
return (bool)preg_match($pattern, $supportedTypes); return (bool)preg_match($pattern, $supportedTypes);
} }
} }

View File

@ -1,7 +1,12 @@
<?php <?php
use Guzzle\Http\Client; namespace SilverStripe\TextExtraction\Rest;
use Guzzle\Http\Exception\RequestException;
use Guzzle\Http\Client,
Guzzle\Http\Exception\RequestException,
SilverStripe\Core\Environment,
Psr\Log\LoggerInterface,
SilverStripe\Core\Injector\Injector;
class TikaRestClient extends Client class TikaRestClient extends Client
{ {
@ -17,14 +22,22 @@ class TikaRestClient extends Client
*/ */
protected $mimes = array(); protected $mimes = array();
/**
*
* @param string $baseUrl
* @param array $config
*/
public function __construct($baseUrl = '', $config = null) public function __construct($baseUrl = '', $config = null)
{ {
if (defined('SS_TIKA_USERNAME') && defined('SS_TIKA_PASSWORD')) { $psswd = Environment::getEnv('SS_TIKA_PASSWORD');
if (!empty($psswd)) {
$this->options = array( $this->options = array(
'username' => SS_TIKA_USERNAME, 'username' => Environment::getEnv('SS_TIKA_USERNAME'),
'password' => SS_TIKA_PASSWORD, 'password' => $psswd,
); );
} }
parent::__construct($baseUrl, $config); parent::__construct($baseUrl, $config);
} }
@ -39,11 +52,14 @@ class TikaRestClient extends Client
$result = $this->get(null); $result = $this->get(null);
$result->setAuth($this->options['username'], $this->options['password']); $result->setAuth($this->options['username'], $this->options['password']);
$result->send(); $result->send();
if ($result->getResponse()->getStatusCode() == 200) { if ($result->getResponse()->getStatusCode() == 200) {
return true; return true;
} }
} catch (RequestException $ex) { } catch (RequestException $ex) {
SS_Log::log(sprintf("Tika unavailable - %s", $ex->getMessage()), SS_Log::ERR); $msg = sprintf("Tika unavailable - %s", $ex->getMessage());
Injector::inst()->get(LoggerInterface::class)->error($msg);
return false; return false;
} }
} }
@ -59,12 +75,14 @@ class TikaRestClient extends Client
$response->setAuth($this->options['username'], $this->options['password']); $response->setAuth($this->options['username'], $this->options['password']);
$response->send(); $response->send();
$version = 0.0; $version = 0.0;
// Parse output // Parse output
if ($response->getResponse()->getStatusCode() == 200 && if ($response->getResponse()->getStatusCode() == 200 &&
preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getResponse()->getBody(), $matches) preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getResponse()->getBody(), $matches)
) { ) {
$version = (float)$matches['version']; $version = (float)$matches['version'];
} }
return $version; return $version;
} }
@ -78,12 +96,14 @@ class TikaRestClient extends Client
if ($this->mimes) { if ($this->mimes) {
return $this->mimes; return $this->mimes;
} }
$response = $this->get( $response = $this->get(
'mime-types', 'mime-types',
array('Accept' => 'application/json') array('Accept' => 'application/json')
); );
$response->setAuth($this->options['username'], $this->options['password']); $response->setAuth($this->options['username'], $this->options['password']);
$response->send(); $response->send();
return $this->mimes = $response->getResponse()->json(); return $this->mimes = $response->getResponse()->json();
} }
@ -118,8 +138,10 @@ class TikaRestClient extends Client
if ($body) { if ($body) {
$msg .= ' Body: ' . $body; $msg .= ' Body: ' . $body;
} }
SS_Log::log($msg, SS_Log::NOTICE);
Injector::inst()->get(LoggerInterface::class)->notice($msg);
} }
return $text; return $text;
} }
} }

View File

@ -1,4 +1,10 @@
<?php <?php
use SilverStripe\TextExtraction\Extension\FileTextCache,
SilverStripe\TextExtraction\Extension\FileTextCache_Database,
SilverStripe\Dev\SapphireTest,
SilverStripe\Core\Config\Config;
class FileTextCacheDatabaseTest extends SapphireTest class FileTextCacheDatabaseTest extends SapphireTest
{ {
public function testTruncatesByMaxLength() public function testTruncatesByMaxLength()