mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 11:06:00 +02:00
API Update namespaces for FileTextCache and add upgrader mapping
This commit is contained in:
parent
f1bacd2aa9
commit
66c9db8c0d
14
.upgrade.yml
Normal file
14
.upgrade.yml
Normal file
@ -0,0 +1,14 @@
|
||||
mappings:
|
||||
FileTextExtractable: SilverStripe\TextExtraction\Extension\FileTextExtractable
|
||||
FileTextCache: SilverStripe\TextExtraction\Cache\FileTextCache
|
||||
FileTextCache_Cache: SilverStripe\TextExtraction\Cache\FileTextCache\Cache
|
||||
FileTextCache_Database: SilverStripe\TextExtraction\Cache\FileTextCache\Database
|
||||
FileTextExtractor: SilverStripe\TextExtraction\Extractor\FileTextExtractor
|
||||
FileTextExtractor_Exception: SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception
|
||||
HTMLTextExtractor: SilverStripe\TextExtraction\Extractor\HTMLTextExtractor
|
||||
PDFTextExtractor: SilverStripe\TextExtraction\Extractor\PDFTextExtractor
|
||||
SolrCellTextExtractor: SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor
|
||||
TikaServerTextExtractor: SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor
|
||||
TikaTextExtractor: SilverStripe\TextExtraction\Extractor\TikaTextExtractor
|
||||
TikaRestClient: SilverStripe\TextExtraction\Rest\TikaRestClient
|
||||
|
12
CHANGELOG.md
12
CHANGELOG.md
@ -1,12 +0,0 @@
|
||||
# Changelog
|
||||
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
This project adheres to [Semantic Versioning](http://semver.org/).
|
||||
|
||||
|
||||
## [2.0.1]
|
||||
Using Symfony mime type detection
|
||||
|
||||
## [2.0.0]
|
||||
Clarified Tika docs
|
@ -1,6 +1,6 @@
|
||||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extension;
|
||||
namespace SilverStripe\TextExtraction\Cache;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
|
@ -1,19 +1,21 @@
|
||||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extension;
|
||||
namespace SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||
|
||||
use SilverStripe\Assets\File,
|
||||
SilverStripe\Core\Config\Config,
|
||||
SilverStripe\TextExtraction\Extension\FileTextCache,
|
||||
SilverStripe\Core\Flushable,
|
||||
Psr\SimpleCache\CacheInterface,
|
||||
SilverStripe\Core\Injector\Injector;
|
||||
use Psr\SimpleCache\CacheInterface;
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Core\Config\Configurable;
|
||||
use SilverStripe\Core\Flushable;
|
||||
use SilverStripe\Core\Injector\Injector;
|
||||
use SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||
|
||||
/**
|
||||
* Uses SS_Cache with a lifetime to cache extracted content
|
||||
*/
|
||||
class FileTextCache_Cache implements FileTextCache, Flushable
|
||||
class Cache implements FileTextCache, Flushable
|
||||
{
|
||||
use Configurable;
|
||||
|
||||
/**
|
||||
* Lifetime of cache in seconds
|
||||
* Null is indefinite
|
||||
@ -46,7 +48,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable
|
||||
/**
|
||||
*
|
||||
* @param File $file
|
||||
* @return type
|
||||
* @return mixed
|
||||
*/
|
||||
public function load(File $file)
|
||||
{
|
||||
@ -63,8 +65,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable
|
||||
*/
|
||||
public function save(File $file, $content)
|
||||
{
|
||||
$lifetime = Config::inst()->get(__CLASS__, 'lifetime');
|
||||
$lifetime = $lifetime ?: 3600;
|
||||
$lifetime = $this->config()->get('lifetime') ?: 3600;
|
||||
$key = $this->getKey($file);
|
||||
$cache = self::get_cache();
|
||||
|
||||
@ -94,7 +95,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable
|
||||
/**
|
||||
*
|
||||
* @param File $file
|
||||
* @return type
|
||||
* @return bool
|
||||
*/
|
||||
public function invalidate(File $file)
|
||||
{
|
@ -1,17 +1,25 @@
|
||||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extension;
|
||||
namespace SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||
|
||||
use SilverStripe\Assets\File,
|
||||
SilverStripe\Core\Config\Config,
|
||||
SilverStripe\TextExtraction\Extension\FileTextCache;
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Core\Config\Configurable;
|
||||
use SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||
|
||||
/**
|
||||
* Caches the extracted content on the record for the file.
|
||||
* Limits the stored file content by default to avoid hitting query size limits.
|
||||
*/
|
||||
class FileTextCache_Database implements FileTextCache
|
||||
class Database implements FileTextCache
|
||||
{
|
||||
use Configurable;
|
||||
|
||||
/**
|
||||
* @config
|
||||
* @var int
|
||||
*/
|
||||
private static $max_content_length = null;
|
||||
|
||||
/**
|
||||
*
|
||||
* @param File $file
|
||||
@ -28,7 +36,7 @@ class FileTextCache_Database implements FileTextCache
|
||||
*/
|
||||
public function save(File $file, $content)
|
||||
{
|
||||
$maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length');
|
||||
$maxLength = $this->config()->get('max_content_length');
|
||||
$file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content;
|
||||
$file->write();
|
||||
}
|
@ -1,9 +0,0 @@
|
||||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Exception;
|
||||
|
||||
use \Exception;
|
||||
|
||||
class FileTextExtractor_Exception extends Exception
|
||||
{
|
||||
}
|
@ -2,9 +2,10 @@
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extension;
|
||||
|
||||
use SilverStripe\ORM\DataExtension,
|
||||
SilverStripe\TextExtraction\Extension\FileTextCache,
|
||||
SilverStripe\Control\Director;
|
||||
use SilverStripe\Control\Director;
|
||||
use SilverStripe\ORM\DataExtension;
|
||||
use SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
||||
|
||||
/**
|
||||
* Decorate File or a File derivative to enable text extraction from the file content. Uses a set of subclasses of
|
||||
@ -22,27 +23,27 @@ class FileTextExtractable extends DataExtension
|
||||
* @var array
|
||||
* @config
|
||||
*/
|
||||
private static $db = array(
|
||||
private static $db = [
|
||||
'FileContentCache' => 'Text'
|
||||
);
|
||||
];
|
||||
|
||||
/**
|
||||
*
|
||||
* @var array
|
||||
* @config
|
||||
*/
|
||||
private static $casting = array(
|
||||
private static $casting = [
|
||||
'FileContent' => 'Text'
|
||||
);
|
||||
];
|
||||
|
||||
/**
|
||||
*
|
||||
* @var array
|
||||
* @config
|
||||
*/
|
||||
private static $dependencies = array(
|
||||
'TextCache' => '%$SilverStripe\TextExtraction\Extension\FileTextCache_Cache'
|
||||
);
|
||||
private static $dependencies = [
|
||||
'TextCache' => FileTextCache\Cache::class,
|
||||
];
|
||||
|
||||
/**
|
||||
* @var FileTextCache
|
||||
@ -52,11 +53,12 @@ class FileTextExtractable extends DataExtension
|
||||
/**
|
||||
*
|
||||
* @param FileTextCache $cache
|
||||
* @return void
|
||||
* @return $this
|
||||
*/
|
||||
public function setTextCache(FileTextCache $cache)
|
||||
{
|
||||
$this->fileTextCache = $cache;
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -84,7 +86,7 @@ class FileTextExtractable extends DataExtension
|
||||
* @param boolean $disableCache If false, the file content is only parsed on demand.
|
||||
* If true, the content parsing is forced, bypassing
|
||||
* the cached version
|
||||
* @return mixed string | null
|
||||
* @return string|null
|
||||
*/
|
||||
public function extractFileAsText($disableCache = false)
|
||||
{
|
||||
|
@ -2,17 +2,18 @@
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use SilverStripe\Core\Config\Config,
|
||||
SilverStripe\Core\Injector\Injector,
|
||||
SilverStripe\Core\ClassInfo;
|
||||
use SilverStripe\Core\ClassInfo;
|
||||
use SilverStripe\Core\Config\Config;
|
||||
use SilverStripe\Core\Config\Configurable;
|
||||
use SilverStripe\Core\Injector\Injector;
|
||||
|
||||
/**
|
||||
* A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents.
|
||||
* @author mstephens
|
||||
*
|
||||
*/
|
||||
abstract class FileTextExtractor
|
||||
{
|
||||
use Configurable;
|
||||
|
||||
/**
|
||||
* Set priority from 0-100.
|
||||
@ -45,7 +46,7 @@ abstract class FileTextExtractor
|
||||
// Generate the sorted list of extractors on demand.
|
||||
$classes = ClassInfo::subclassesFor(__CLASS__);
|
||||
array_shift($classes);
|
||||
$classPriorities = array();
|
||||
$classPriorities = [];
|
||||
|
||||
foreach ($classes as $class) {
|
||||
$classPriorities[$class] = Config::inst()->get($class, 'priority');
|
||||
@ -76,19 +77,19 @@ abstract class FileTextExtractor
|
||||
*/
|
||||
protected static function get_mime($path)
|
||||
{
|
||||
$file = new Symfony\Component\HttpFoundation\File\File($path);
|
||||
$file = new \Symfony\Component\HttpFoundation\File\File($path);
|
||||
|
||||
return $file->getMimeType();
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string $path
|
||||
* @return mixed FileTextExtractor | null
|
||||
* @return FileTextExtractor|null
|
||||
*/
|
||||
public static function for_file($path)
|
||||
{
|
||||
if (!file_exists($path) || is_dir($path)) {
|
||||
return;
|
||||
return null;
|
||||
}
|
||||
|
||||
$extension = pathinfo($path, PATHINFO_EXTENSION);
|
||||
@ -132,7 +133,7 @@ abstract class FileTextExtractor
|
||||
abstract public function supportsExtension($extension);
|
||||
|
||||
/**
|
||||
* Determine if this extractor suports the given mime type.
|
||||
* Determine if this extractor supports the given mime type.
|
||||
* Will only be called if supportsExtension returns false.
|
||||
*
|
||||
* @param string $mime
|
||||
|
7
src/Extractor/FileTextExtractor/Exception.php
Normal file
7
src/Extractor/FileTextExtractor/Exception.php
Normal file
@ -0,0 +1,7 @@
|
||||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
||||
|
||||
class Exception extends \Exception
|
||||
{
|
||||
}
|
@ -2,18 +2,21 @@
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
||||
|
||||
/**
|
||||
* Text extractor that uses the PHP function strip_tags() to get just the text. OK for indexing, not the best for readable text.
|
||||
* @author mstephens
|
||||
*
|
||||
*/
|
||||
class HTMLTextExtractor extends FileTextExtractor
|
||||
{
|
||||
/**
|
||||
* Lower priority because it's not the most clever HTML extraction. If there is something better, use it
|
||||
*
|
||||
* @config
|
||||
* @var integer
|
||||
*/
|
||||
private static $priority = 10;
|
||||
|
||||
/**
|
||||
*
|
||||
* @return boolean
|
||||
*/
|
||||
public function isAvailable()
|
||||
@ -22,19 +25,15 @@ class HTMLTextExtractor extends FileTextExtractor
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param string $extension
|
||||
* @return array
|
||||
*/
|
||||
public function supportsExtension($extension)
|
||||
{
|
||||
return in_array(
|
||||
strtolower($extension), array("html", "htm", "xhtml")
|
||||
);
|
||||
return in_array(strtolower($extension), ["html", "htm", "xhtml"]);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param string $mime
|
||||
* @return string
|
||||
*/
|
||||
@ -43,14 +42,6 @@ class HTMLTextExtractor extends FileTextExtractor
|
||||
return strtolower($mime) === 'text/html';
|
||||
}
|
||||
|
||||
/**
|
||||
* Lower priority because its not the most clever HTML extraction. If there is something better, use it
|
||||
*
|
||||
* @config
|
||||
* @var integer
|
||||
*/
|
||||
private static $priority = 10;
|
||||
|
||||
/**
|
||||
* Extracts content from regex, by using strip_tags()
|
||||
* combined with regular expressions to remove non-content tags like <style> or <script>,
|
||||
@ -65,29 +56,30 @@ class HTMLTextExtractor extends FileTextExtractor
|
||||
// Yes, yes, regex'ing HTML is evil.
|
||||
// Since we don't care about well-formedness or markup here, it does the job.
|
||||
$content = preg_replace(
|
||||
array(
|
||||
// Remove invisible content
|
||||
'@<head[^>]*?>.*?</head>@siu',
|
||||
'@<style[^>]*?>.*?</style>@siu',
|
||||
'@<script[^>]*?.*?</script>@siu',
|
||||
'@<object[^>]*?.*?</object>@siu',
|
||||
'@<embed[^>]*?.*?</embed>@siu',
|
||||
'@<applet[^>]*?.*?</applet>@siu',
|
||||
'@<noframes[^>]*?.*?</noframes>@siu',
|
||||
'@<noscript[^>]*?.*?</noscript>@siu',
|
||||
'@<noembed[^>]*?.*?</noembed>@siu',
|
||||
// Add line breaks before and after blocks
|
||||
'@</?((address)|(blockquote)|(center)|(del))@iu',
|
||||
'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
|
||||
'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
|
||||
'@</?((table)|(th)|(td)|(caption))@iu',
|
||||
'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
|
||||
'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
|
||||
'@</?((frameset)|(frame)|(iframe))@iu',
|
||||
), array(
|
||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0",
|
||||
), $content
|
||||
[
|
||||
// Remove invisible content
|
||||
'@<head[^>]*?>.*?</head>@siu',
|
||||
'@<style[^>]*?>.*?</style>@siu',
|
||||
'@<script[^>]*?.*?</script>@siu',
|
||||
'@<object[^>]*?.*?</object>@siu',
|
||||
'@<embed[^>]*?.*?</embed>@siu',
|
||||
'@<applet[^>]*?.*?</applet>@siu',
|
||||
'@<noframes[^>]*?.*?</noframes>@siu',
|
||||
'@<noscript[^>]*?.*?</noscript>@siu',
|
||||
'@<noembed[^>]*?.*?</noembed>@siu',
|
||||
// Add line breaks before and after blocks
|
||||
'@</?((address)|(blockquote)|(center)|(del))@iu',
|
||||
'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
|
||||
'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
|
||||
'@</?((table)|(th)|(td)|(caption))@iu',
|
||||
'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
|
||||
'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
|
||||
'@</?((frameset)|(frame)|(iframe))@iu',
|
||||
],
|
||||
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0"],
|
||||
$content
|
||||
);
|
||||
|
||||
return strip_tags($content);
|
||||
}
|
||||
|
||||
|
@ -2,17 +2,14 @@
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
|
||||
SilverStripe\TextExtraction\Exception\FileTextExtractor_Exception;
|
||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
|
||||
|
||||
/**
|
||||
* Text extractor that calls pdftotext to do the conversion.
|
||||
* @author mstephens
|
||||
*
|
||||
*/
|
||||
class PDFTextExtractor extends FileTextExtractor
|
||||
{
|
||||
|
||||
/**
|
||||
* Set to bin path this extractor can execute
|
||||
*
|
||||
@ -27,10 +24,10 @@ class PDFTextExtractor extends FileTextExtractor
|
||||
* @config
|
||||
* @var array
|
||||
*/
|
||||
private static $search_binary_locations = array(
|
||||
private static $search_binary_locations = [
|
||||
'/usr/bin',
|
||||
'/usr/local/bin',
|
||||
);
|
||||
];
|
||||
|
||||
public function isAvailable()
|
||||
{
|
||||
@ -46,12 +43,13 @@ class PDFTextExtractor extends FileTextExtractor
|
||||
public function supportsMime($mime)
|
||||
{
|
||||
return in_array(
|
||||
strtolower($mime), array(
|
||||
'application/pdf',
|
||||
'application/x-pdf',
|
||||
'application/x-bzpdf',
|
||||
'application/x-gzpdf'
|
||||
)
|
||||
strtolower($mime),
|
||||
[
|
||||
'application/pdf',
|
||||
'application/x-pdf',
|
||||
'application/x-bzpdf',
|
||||
'application/x-gzpdf'
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
@ -64,10 +62,10 @@ class PDFTextExtractor extends FileTextExtractor
|
||||
protected function bin($program = '')
|
||||
{
|
||||
// Get list of allowed search paths
|
||||
if ($location = $this->config()->binary_location) {
|
||||
$locations = array($location);
|
||||
if ($location = $this->config()->get('binary_location')) {
|
||||
$locations = [$location];
|
||||
} else {
|
||||
$locations = $this->config()->search_binary_locations;
|
||||
$locations = $this->config()->get('search_binary_locations');
|
||||
}
|
||||
|
||||
// Find program in each path
|
||||
@ -88,8 +86,9 @@ class PDFTextExtractor extends FileTextExtractor
|
||||
public function getContent($path)
|
||||
{
|
||||
if (!$path) {
|
||||
return "";
|
||||
} // no file
|
||||
// no file
|
||||
return '';
|
||||
}
|
||||
$content = $this->getRawOutput($path);
|
||||
return $this->cleanupLigatures($content);
|
||||
}
|
||||
@ -99,12 +98,12 @@ class PDFTextExtractor extends FileTextExtractor
|
||||
*
|
||||
* @param string $path
|
||||
* @return string Output
|
||||
* @throws FileTextExtractor_Exception
|
||||
* @throws Exception
|
||||
*/
|
||||
protected function getRawOutput($path)
|
||||
{
|
||||
if (!$this->isAvailable()) {
|
||||
throw new FileTextExtractor_Exception("getRawOutput called on unavailable extractor");
|
||||
throw new Exception("getRawOutput called on unavailable extractor");
|
||||
}
|
||||
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
|
||||
if ($err) {
|
||||
@ -112,8 +111,11 @@ class PDFTextExtractor extends FileTextExtractor
|
||||
// For Windows compatibility
|
||||
$err = $content;
|
||||
}
|
||||
throw new FileTextExtractor_Exception(sprintf(
|
||||
'PDFTextExtractor->getContent() failed for %s: %s', $path, implode(PHP_EOL, $err)
|
||||
|
||||
throw new Exception(sprintf(
|
||||
'PDFTextExtractor->getContent() failed for %s: %s',
|
||||
$path,
|
||||
implode(PHP_EOL, $err)
|
||||
));
|
||||
}
|
||||
|
||||
@ -130,7 +132,7 @@ class PDFTextExtractor extends FileTextExtractor
|
||||
*/
|
||||
protected function cleanupLigatures($input)
|
||||
{
|
||||
$mapping = array(
|
||||
$mapping = [
|
||||
'ff' => 'ff',
|
||||
'fi' => 'fi',
|
||||
'fl' => 'fl',
|
||||
@ -138,7 +140,7 @@ class PDFTextExtractor extends FileTextExtractor
|
||||
'ffl' => 'ffl',
|
||||
'ſt' => 'ft',
|
||||
'st' => 'st'
|
||||
);
|
||||
];
|
||||
|
||||
return str_replace(array_keys($mapping), array_values($mapping), $input);
|
||||
}
|
||||
|
@ -2,9 +2,11 @@
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
|
||||
GuzzleHttp\Client,
|
||||
Psr\Log\LoggerInterface;
|
||||
use Exception;
|
||||
use GuzzleHttp\Client;
|
||||
use InvalidArgumentException;
|
||||
use Psr\Log\LoggerInterface;
|
||||
use SilverStripe\Core\Injector\Injector;
|
||||
|
||||
/**
|
||||
* Text extractor that calls an Apache Solr instance
|
||||
@ -18,7 +20,7 @@ use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
|
||||
class SolrCellTextExtractor extends FileTextExtractor
|
||||
{
|
||||
/**
|
||||
* Base URL to use for solr text extraction.
|
||||
* Base URL to use for Solr text extraction.
|
||||
* E.g. http://localhost:8983/solr/update/extract
|
||||
*
|
||||
* @config
|
||||
@ -27,43 +29,36 @@ class SolrCellTextExtractor extends FileTextExtractor
|
||||
private static $base_url;
|
||||
|
||||
/**
|
||||
*
|
||||
* @var int
|
||||
* @config
|
||||
*/
|
||||
private static $priority = 75;
|
||||
|
||||
/**
|
||||
*
|
||||
* @var GuzzleHttp\Client
|
||||
* @var Client
|
||||
*/
|
||||
protected $httpClient;
|
||||
|
||||
/**
|
||||
*
|
||||
* @return GuzzleHttp\Client
|
||||
* @throws InvalidArgumentException
|
||||
* @return Client
|
||||
*/
|
||||
public function getHttpClient()
|
||||
{
|
||||
if (!$this->config()->get('base_url')) {
|
||||
throw new \InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
|
||||
}
|
||||
if (!$this->httpClient) {
|
||||
$this->httpClient = new Client($this->config()->get('base_url'));
|
||||
$this->httpClient = new Client();
|
||||
}
|
||||
|
||||
return $this->httpClient;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param GuzzleHttp\Client $client
|
||||
* @return void
|
||||
* @param Client $client
|
||||
* @return $this
|
||||
*/
|
||||
public function setHttpClient($client)
|
||||
public function setHttpClient(Client $client)
|
||||
{
|
||||
$this->httpClient = $client;
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -73,30 +68,28 @@ class SolrCellTextExtractor extends FileTextExtractor
|
||||
{
|
||||
$url = $this->config()->get('base_url');
|
||||
|
||||
return (boolean) $url;
|
||||
return (bool) $url;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param string $extension
|
||||
* @return boolean
|
||||
* @return bool
|
||||
*/
|
||||
public function supportsExtension($extension)
|
||||
{
|
||||
return in_array(
|
||||
strtolower($extension),
|
||||
array(
|
||||
[
|
||||
'pdf', 'doc', 'docx', 'xls', 'xlsx',
|
||||
'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
|
||||
'ppt', 'pptx', 'odp', 'fodp', 'csv'
|
||||
)
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param string $mime
|
||||
* @return boolean
|
||||
* @return bool
|
||||
*/
|
||||
public function supportsMime($mime)
|
||||
{
|
||||
@ -105,48 +98,55 @@ class SolrCellTextExtractor extends FileTextExtractor
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param string $path
|
||||
* @param string $path
|
||||
* @return string
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function getContent($path)
|
||||
{
|
||||
if (!$path) {
|
||||
return "";
|
||||
} // no file
|
||||
// no file
|
||||
return '';
|
||||
}
|
||||
|
||||
$fileName = basename($path);
|
||||
$client = $this->getHttpClient();
|
||||
|
||||
// Get and validate base URL
|
||||
$baseUrl = $this->config()->get('base_url');
|
||||
if (!$this->config()->get('base_url')) {
|
||||
throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
|
||||
}
|
||||
|
||||
try {
|
||||
$request = $client
|
||||
->post()
|
||||
->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text'))
|
||||
->addPostFiles(array('myfile' => $path));
|
||||
->post($baseUrl)
|
||||
->addPostFields(['extractOnly' => 'true', 'extractFormat' => 'text'])
|
||||
->addPostFiles(['myfile' => $path]);
|
||||
$response = $request->send();
|
||||
} catch (\InvalidArgumentException $e) {
|
||||
} catch (InvalidArgumentException $e) {
|
||||
$msg = sprintf(
|
||||
'Error extracting text from "%s" (message: %s)',
|
||||
$path,
|
||||
$e->getMessage()
|
||||
);
|
||||
'Error extracting text from "%s" (message: %s)',
|
||||
$path,
|
||||
$e->getMessage()
|
||||
);
|
||||
Injector::inst()->get(LoggerInterface::class)->notice($msg);
|
||||
|
||||
return null;
|
||||
} catch (\Exception $e) {
|
||||
} catch (Exception $e) {
|
||||
// Catch other errors that Tika can throw via Guzzle which are otherwise not caught and break the Solr search query in some cases.
|
||||
$msg = sprintf(
|
||||
'Tika server error attempting to extract from "%s" (message: %s)',
|
||||
$path,
|
||||
$e->getMessage()
|
||||
);
|
||||
'Tika server error attempting to extract from "%s" (message: %s)',
|
||||
$path,
|
||||
$e->getMessage()
|
||||
);
|
||||
|
||||
Injector::inst()->get(LoggerInterface::class)->notice($msg);
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
// Just initialise it, it doesn't take miuch.
|
||||
// Just initialise it, it doesn't take much.
|
||||
$matches = [];
|
||||
|
||||
// Use preg match to avoid SimpleXML running out of memory on large text nodes
|
||||
|
@ -2,10 +2,9 @@
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
|
||||
SilverStripe\Core\Injector\Injector,
|
||||
SilverStripe\Core\Environment,
|
||||
SilverStripe\TextExtraction\Rest\TikaRestClient;
|
||||
use SilverStripe\Core\Environment;
|
||||
use SilverStripe\Core\Injector\Injector;
|
||||
use SilverStripe\TextExtraction\Rest\TikaRestClient;
|
||||
|
||||
/**
|
||||
* Enables text extraction of file content via the Tika Rest Server
|
||||
@ -35,18 +34,25 @@ class TikaServerTextExtractor extends FileTextExtractor
|
||||
*/
|
||||
protected $client = null;
|
||||
|
||||
/**
|
||||
* Cache of supported mime types
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
protected $supportedMimes = [];
|
||||
|
||||
/**
|
||||
* @return TikaRestClient
|
||||
*/
|
||||
public function getClient()
|
||||
{
|
||||
return $this->client ?:
|
||||
($this->client =
|
||||
Injector::inst()->createWithArgs(
|
||||
TikaRestClient::class,
|
||||
array($this->getServerEndpoint())
|
||||
)
|
||||
if (!$this->client) {
|
||||
$this->client = Injector::inst()->createWithArgs(
|
||||
TikaRestClient::class,
|
||||
[$this->getServerEndpoint()]
|
||||
);
|
||||
}
|
||||
return $this->client;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -59,19 +65,17 @@ class TikaServerTextExtractor extends FileTextExtractor
|
||||
}
|
||||
|
||||
// Default to configured endpoint
|
||||
return $this->config()->server_endpoint;
|
||||
return $this->config()->get('server_endpoint');
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the version of tika installed, or 0 if not installed
|
||||
* Get the version of Tika installed, or 0 if not installed
|
||||
*
|
||||
* @return float version of tika
|
||||
* @return float version of Tika
|
||||
*/
|
||||
public function getVersion()
|
||||
{
|
||||
return $this
|
||||
->getClient()
|
||||
->getVersion();
|
||||
return $this->getClient()->getVersion();
|
||||
}
|
||||
|
||||
/**
|
||||
@ -79,13 +83,12 @@ class TikaServerTextExtractor extends FileTextExtractor
|
||||
*/
|
||||
public function isAvailable()
|
||||
{
|
||||
return $this->getServerEndpoint() &&
|
||||
$this->getClient()->isAvailable() &&
|
||||
version_compare($this->getVersion(), '1.7.0') >= 0;
|
||||
return $this->getServerEndpoint()
|
||||
&& $this->getClient()->isAvailable()
|
||||
&& version_compare($this->getVersion(), '1.7.0') >= 0;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param string $extension
|
||||
* @return boolean
|
||||
*/
|
||||
@ -95,31 +98,23 @@ class TikaServerTextExtractor extends FileTextExtractor
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Cache of supported mime types
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
protected $supportedMimes = array();
|
||||
|
||||
/**
|
||||
*
|
||||
* @param string $mime
|
||||
* @return boolean
|
||||
*/
|
||||
public function supportsMime($mime)
|
||||
{
|
||||
$supported = $this->supportedMimes ?:
|
||||
($this->supportedMimes = $this->getClient()->getSupportedMimes());
|
||||
if (!$this->supportedMimes) {
|
||||
$this->supportedMimes = $this->getClient()->getSupportedMimes();
|
||||
}
|
||||
|
||||
// Check if supported (most common / quickest lookup)
|
||||
if (isset($supported[$mime])) {
|
||||
if (isset($this->supportedMimes[$mime])) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check aliases
|
||||
foreach ($supported as $info) {
|
||||
foreach ($this->supportedMimes as $info) {
|
||||
if (isset($info['alias']) && in_array($mime, $info['alias'])) {
|
||||
return true;
|
||||
}
|
||||
|
@ -2,8 +2,6 @@
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
||||
|
||||
/**
|
||||
* Enables text extraction of file content via the Tika CLI
|
||||
*
|
||||
@ -47,13 +45,13 @@ class TikaTextExtractor extends FileTextExtractor
|
||||
*/
|
||||
protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
|
||||
{
|
||||
$descriptorSpecs = array(
|
||||
0 => array("pipe", "r"),
|
||||
1 => array("pipe", "w"),
|
||||
2 => array("pipe", "w")
|
||||
);
|
||||
$descriptorSpecs = [
|
||||
0 => ["pipe", "r"],
|
||||
1 => ["pipe", "w"],
|
||||
2 => ["pipe", "w"]
|
||||
];
|
||||
// Invoke command
|
||||
$pipes = array();
|
||||
$pipes = [];
|
||||
$proc = proc_open($command, $descriptorSpecs, $pipes);
|
||||
|
||||
if (!is_resource($proc)) {
|
||||
@ -75,7 +73,6 @@ class TikaTextExtractor extends FileTextExtractor
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param string $path
|
||||
* @return string
|
||||
*/
|
||||
@ -91,8 +88,7 @@ class TikaTextExtractor extends FileTextExtractor
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return boolean
|
||||
* @return bool
|
||||
*/
|
||||
public function isAvailable()
|
||||
{
|
||||
@ -100,8 +96,7 @@ class TikaTextExtractor extends FileTextExtractor
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return boolean
|
||||
* @return bool
|
||||
*/
|
||||
public function supportsExtension($extension)
|
||||
{
|
||||
@ -111,9 +106,8 @@ class TikaTextExtractor extends FileTextExtractor
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @param string $mime
|
||||
* @return boolean
|
||||
* @return bool
|
||||
*/
|
||||
public function supportsMime($mime)
|
||||
{
|
||||
@ -121,8 +115,9 @@ class TikaTextExtractor extends FileTextExtractor
|
||||
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
|
||||
|
||||
if ($code) {
|
||||
// Error case
|
||||
return false;
|
||||
} // Error case
|
||||
}
|
||||
|
||||
// Check if the mime type is inside the result
|
||||
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
|
||||
|
@ -2,11 +2,11 @@
|
||||
|
||||
namespace SilverStripe\TextExtraction\Rest;
|
||||
|
||||
use GuzzleHttp\Client,
|
||||
GuzzleHttp\Exception\RequestException,
|
||||
SilverStripe\Core\Environment,
|
||||
Psr\Log\LoggerInterface,
|
||||
SilverStripe\Core\Injector\Injector;
|
||||
use GuzzleHttp\Client;
|
||||
use GuzzleHttp\Exception\RequestException;
|
||||
use Psr\Log\LoggerInterface;
|
||||
use SilverStripe\Core\Environment;
|
||||
use SilverStripe\Core\Injector\Injector;
|
||||
|
||||
class TikaRestClient extends Client
|
||||
{
|
||||
@ -15,12 +15,12 @@ class TikaRestClient extends Client
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
protected $options = array('username' => null, 'password' => null);
|
||||
protected $options = ['username' => null, 'password' => null];
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
protected $mimes = array();
|
||||
protected $mimes = [];
|
||||
|
||||
/**
|
||||
*
|
||||
@ -29,16 +29,16 @@ class TikaRestClient extends Client
|
||||
*/
|
||||
public function __construct($baseUrl = '', $config = null)
|
||||
{
|
||||
$psswd = Environment::getEnv('SS_TIKA_PASSWORD');
|
||||
$password = Environment::getEnv('SS_TIKA_PASSWORD');
|
||||
|
||||
if (!empty($psswd)) {
|
||||
$this->options = array(
|
||||
if (!empty($password)) {
|
||||
$this->options = [
|
||||
'username' => Environment::getEnv('SS_TIKA_USERNAME'),
|
||||
'password' => $psswd,
|
||||
);
|
||||
'password' => $password,
|
||||
];
|
||||
}
|
||||
|
||||
parent::__construct($baseUrl, $config);
|
||||
parent::__construct($config);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -120,7 +120,7 @@ class TikaRestClient extends Client
|
||||
try {
|
||||
$response = $this->put(
|
||||
'tika',
|
||||
array('Accept' => 'text/plain'),
|
||||
['Accept' => 'text/plain'],
|
||||
file_get_contents($file)
|
||||
);
|
||||
$response->setAuth($this->options['username'], $this->options['password']);
|
||||
|
Loading…
Reference in New Issue
Block a user