mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 09:06:00 +00:00
FIX: First-pass SS4 compatibility.
- Added namespaces, use statements - Added missing docblocks etc - Uses SS4's new Cache system - Uses proper environment vars - Cannot instantiate 'FileTextCache' (interface) as a service. This can be configured through YML, so default to FileTextCache_Cache - Modded YML config to make it run. - Fixes to allow TIKA to actually get file contents. - Addresses issues raised by @robbieaverill - Rebased against github.com/silverstripe/silverstripe-textextraction:master - Replaced `SS_Log` with Monolog.
This commit is contained in:
parent
875e608d0f
commit
f341010d7a
11
_config/cache.yml
Normal file
11
_config/cache.yml
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
---
|
||||||
|
Name: textextractioncache
|
||||||
|
After:
|
||||||
|
- '#corecache'
|
||||||
|
---
|
||||||
|
|
||||||
|
SilverStripe\Core\Injector\Injector:
|
||||||
|
Psr\SimpleCache\CacheInterface.FileTextCache_Cache:
|
||||||
|
factory: SilverStripe\Core\Cache\CacheFactory
|
||||||
|
constructor:
|
||||||
|
namespace: 'FileTextCache_Cache'
|
@ -1,11 +0,0 @@
|
|||||||
---
|
|
||||||
Name: textextraction
|
|
||||||
---
|
|
||||||
Injector:
|
|
||||||
FileTextCache: FileTextCache_Database
|
|
||||||
|
|
||||||
#SolrCellTextExtractor:
|
|
||||||
# base_url: 'http://localhost:8983/solr/update/extract'
|
|
||||||
|
|
||||||
FileTextCache_Database:
|
|
||||||
max_content_length: 500000
|
|
@ -1,112 +0,0 @@
|
|||||||
<?php
|
|
||||||
|
|
||||||
interface FileTextCache
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
* Save extracted content for a given File entity
|
|
||||||
*
|
|
||||||
* @param File $file
|
|
||||||
* @param string $content
|
|
||||||
*/
|
|
||||||
public function save(File $file, $content);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Return any cached extracted content for a given file entity
|
|
||||||
*
|
|
||||||
* @param File $file
|
|
||||||
*/
|
|
||||||
public function load(File $file);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Invalidate the cache for a given file.
|
|
||||||
* Invoked in onBeforeWrite on the file
|
|
||||||
*
|
|
||||||
* @param File $file
|
|
||||||
*/
|
|
||||||
public function invalidate(File $file);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Caches the extracted content on the record for the file.
|
|
||||||
* Limits the stored file content by default to avoid hitting query size limits.
|
|
||||||
*/
|
|
||||||
class FileTextCache_Database implements FileTextCache
|
|
||||||
{
|
|
||||||
public function load(File $file)
|
|
||||||
{
|
|
||||||
return $file->FileContentCache;
|
|
||||||
}
|
|
||||||
|
|
||||||
public function save(File $file, $content)
|
|
||||||
{
|
|
||||||
$maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length');
|
|
||||||
$file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content;
|
|
||||||
$file->write();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function invalidate(File $file)
|
|
||||||
{
|
|
||||||
// To prevent writing to the cache from invalidating it
|
|
||||||
if (!$file->isChanged('FileContentCache')) {
|
|
||||||
$file->FileContentCache = '';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Uses SS_Cache with a lifetime to cache extracted content
|
|
||||||
*/
|
|
||||||
class FileTextCache_SSCache implements FileTextCache, Flushable
|
|
||||||
{
|
|
||||||
/**
|
|
||||||
* Lifetime of cache in seconds
|
|
||||||
* Null is indefinite
|
|
||||||
*
|
|
||||||
* @var int|null
|
|
||||||
* @config
|
|
||||||
*/
|
|
||||||
private static $lifetime = null;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @return SS_Cache
|
|
||||||
*/
|
|
||||||
protected static function get_cache()
|
|
||||||
{
|
|
||||||
$lifetime = Config::inst()->get(__CLASS__, 'lifetime');
|
|
||||||
$cache = SS_Cache::factory(__CLASS__);
|
|
||||||
$cache->setLifetime($lifetime);
|
|
||||||
return $cache;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected function getKey(File $file)
|
|
||||||
{
|
|
||||||
return md5($file->getFullPath());
|
|
||||||
}
|
|
||||||
|
|
||||||
public function load(File $file)
|
|
||||||
{
|
|
||||||
$key = $this->getKey($file);
|
|
||||||
$cache = self::get_cache();
|
|
||||||
return $cache->load($key);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function save(File $file, $content)
|
|
||||||
{
|
|
||||||
$key = $this->getKey($file);
|
|
||||||
$cache = self::get_cache();
|
|
||||||
return $cache->save($content, $key);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static function flush()
|
|
||||||
{
|
|
||||||
$cache = self::get_cache();
|
|
||||||
$cache->clean();
|
|
||||||
}
|
|
||||||
|
|
||||||
public function invalidate(File $file)
|
|
||||||
{
|
|
||||||
$key = $this->getKey($file);
|
|
||||||
$cache = self::get_cache();
|
|
||||||
return $cache->remove($key);
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,77 +0,0 @@
|
|||||||
<?php
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Text extractor that uses php function strip_tags to get just the text. OK for indexing, not the best for readable text.
|
|
||||||
* @author mstephens
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
class HTMLTextExtractor extends FileTextExtractor
|
|
||||||
{
|
|
||||||
public function isAvailable()
|
|
||||||
{
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
public function supportsExtension($extension)
|
|
||||||
{
|
|
||||||
return in_array(
|
|
||||||
strtolower($extension),
|
|
||||||
array("html", "htm", "xhtml")
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
public function supportsMime($mime)
|
|
||||||
{
|
|
||||||
return strtolower($mime) === 'text/html';
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Lower priority because its not the most clever HTML extraction. If there is something better, use it
|
|
||||||
*
|
|
||||||
* @config
|
|
||||||
* @var integer
|
|
||||||
*/
|
|
||||||
private static $priority = 10;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Extracts content from regex, by using strip_tags()
|
|
||||||
* combined with regular expressions to remove non-content tags like <style> or <script>,
|
|
||||||
* as well as adding line breaks after block tags.
|
|
||||||
*
|
|
||||||
* @param string $path
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
public function getContent($path)
|
|
||||||
{
|
|
||||||
$content = file_get_contents($path);
|
|
||||||
// Yes, yes, regex'ing HTML is evil.
|
|
||||||
// Since we don't care about well-formedness or markup here, it does the job.
|
|
||||||
$content = preg_replace(
|
|
||||||
array(
|
|
||||||
// Remove invisible content
|
|
||||||
'@<head[^>]*?>.*?</head>@siu',
|
|
||||||
'@<style[^>]*?>.*?</style>@siu',
|
|
||||||
'@<script[^>]*?.*?</script>@siu',
|
|
||||||
'@<object[^>]*?.*?</object>@siu',
|
|
||||||
'@<embed[^>]*?.*?</embed>@siu',
|
|
||||||
'@<applet[^>]*?.*?</applet>@siu',
|
|
||||||
'@<noframes[^>]*?.*?</noframes>@siu',
|
|
||||||
'@<noscript[^>]*?.*?</noscript>@siu',
|
|
||||||
'@<noembed[^>]*?.*?</noembed>@siu',
|
|
||||||
// Add line breaks before and after blocks
|
|
||||||
'@</?((address)|(blockquote)|(center)|(del))@iu',
|
|
||||||
'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
|
|
||||||
'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
|
|
||||||
'@</?((table)|(th)|(td)|(caption))@iu',
|
|
||||||
'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
|
|
||||||
'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
|
|
||||||
'@</?((frameset)|(frame)|(iframe))@iu',
|
|
||||||
),
|
|
||||||
array(
|
|
||||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0",
|
|
||||||
),
|
|
||||||
$content
|
|
||||||
);
|
|
||||||
return strip_tags($content);
|
|
||||||
}
|
|
||||||
}
|
|
@ -16,15 +16,16 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"require": {
|
"require": {
|
||||||
"php": ">=5.3.2",
|
"php": ">=5.6",
|
||||||
"composer/installers": "*",
|
"composer/installers": "*",
|
||||||
"silverstripe/framework": "^3.1",
|
"silverstripe/framework": "4.0.x-dev",
|
||||||
"guzzle/guzzle": "^3.9",
|
"guzzlehttp/guzzle": "~3.8.1",
|
||||||
"symfony/event-dispatcher": "^2.6.0@stable",
|
"symfony/event-dispatcher": "^2.6.0@stable",
|
||||||
"symfony/http-foundation": "^2.6.0"
|
"symfony/http-foundation": "^2.6.0",
|
||||||
|
"silverstripe/assets": "^1"
|
||||||
},
|
},
|
||||||
"require-dev": {
|
"require-dev": {
|
||||||
"phpunit/phpunit": "^3.7"
|
"phpunit/phpunit": "~5.0"
|
||||||
},
|
},
|
||||||
"suggest": {
|
"suggest": {
|
||||||
"ext-fileinfo": "Improved support for file mime detection"
|
"ext-fileinfo": "Improved support for file mime detection"
|
||||||
|
9
src/Exception/FileTextExtractor_Exception.php
Normal file
9
src/Exception/FileTextExtractor_Exception.php
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace SilverStripe\TextExtraction\Exception;
|
||||||
|
|
||||||
|
use \Exception;
|
||||||
|
|
||||||
|
/**
 * Exception type used by the text extractors in this module to report a
 * failed extraction. It adds nothing to the base Exception; it exists so
 * callers can catch extraction failures specifically.
 */
class FileTextExtractor_Exception extends Exception
{
}
|
106
src/Extension/FieldTextCache_Cache.php
Normal file
106
src/Extension/FieldTextCache_Cache.php
Normal file
@ -0,0 +1,106 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace SilverStripe\TextExtraction\Extension;
|
||||||
|
|
||||||
|
use SilverStripe\Assets\File,
|
||||||
|
SilverStripe\Core\Config\Config,
|
||||||
|
SilverStripe\TextExtraction\Extension\FileTextCache,
|
||||||
|
SilverStripe\Core\Flushable,
|
||||||
|
Psr\SimpleCache\CacheInterface,
|
||||||
|
SilverStripe\Core\Injector\Injector;
|
||||||
|
|
||||||
|
/**
 * Caches extracted file content in a PSR-16 cache that is resolved through
 * the Injector.
 *
 * Entries are keyed by an md5 hash of the file's filename and are written
 * with the lifetime configured on this class.
 */
class FileTextCache_Cache implements FileTextCache, Flushable
{
    /**
     * Lifetime of cache entries in seconds.
     *
     * When this is left unset (or is otherwise falsy), save() falls back to
     * one hour (3600 seconds).
     *
     * @var int|null
     * @config
     */
    private static $lifetime = null;

    /**
     * Resolve the cache service registered under
     * "Psr\SimpleCache\CacheInterface.FileTextCache_Cache".
     *
     * @return CacheInterface
     */
    protected static function get_cache()
    {
        $for = sprintf('%s.%s', CacheInterface::class, 'FileTextCache_Cache');

        return Injector::inst()->get($for);
    }

    /**
     * Build the cache key for a file: the md5 hash of its filename.
     *
     * @param File $file
     * @return string
     */
    protected function getKey(File $file)
    {
        return md5($file->getFilename());
    }

    /**
     * Return the cached content for a file.
     *
     * @param File $file
     * @return mixed Whatever the cache holds for the file's key; PSR-16 caches return null on a miss
     */
    public function load(File $file)
    {
        return self::get_cache()->get($this->getKey($file));
    }

    /**
     * Store extracted content for a file using the configured lifetime.
     *
     * @param File $file
     * @param string $content
     * @return bool Result of the underlying cache's set() call
     */
    public function save(File $file, $content)
    {
        // A missing or falsy "lifetime" config falls back to one hour.
        $lifetime = Config::inst()->get(__CLASS__, 'lifetime') ?: 3600;

        return self::get_cache()->set($this->getKey($file), $content, $lifetime);
    }

    /**
     * Remove every entry from the cache.
     *
     * @return void
     */
    public static function flush()
    {
        self::get_cache()->clear();
    }

    /**
     * Alias for self::flush().
     *
     * @return void
     */
    public static function clear()
    {
        self::flush();
    }

    /**
     * Drop the cached content for a single file.
     *
     * @param File $file
     * @return bool Result of the underlying cache's delete() call
     */
    public function invalidate(File $file)
    {
        return self::get_cache()->delete($this->getKey($file));
    }
}
|
47
src/Extension/FieldTextCache_Database.php
Normal file
47
src/Extension/FieldTextCache_Database.php
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace SilverStripe\TextExtraction\Extension;
|
||||||
|
|
||||||
|
use SilverStripe\Assets\File,
|
||||||
|
SilverStripe\Core\Config\Config,
|
||||||
|
SilverStripe\TextExtraction\Extension\FileTextCache;
|
||||||
|
|
||||||
|
/**
 * Caches the extracted content on the record for the file, in its
 * FileContentCache field.
 *
 * The stored content can be capped with the "max_content_length" config
 * value to avoid hitting query size limits.
 */
class FileTextCache_Database implements FileTextCache
{
    /**
     * Return the text cached on the file record.
     *
     * @param File $file
     * @return string|null The value of the record's FileContentCache field
     */
    public function load(File $file)
    {
        return $file->FileContentCache;
    }

    /**
     * Store extracted content on the file record and write the record.
     *
     * @param File $file
     * @param string $content
     * @return void
     */
    public function save(File $file, $content)
    {
        // Look the limit up under this class's real (namespaced) name; the
        // bare string 'FileTextCache_Database' no longer identifies the class.
        $maxLength = Config::inst()->get(__CLASS__, 'max_content_length');

        // substr() counts bytes, so the limit is a byte budget rather than a
        // character count. A falsy limit stores the content untouched.
        $file->FileContentCache = $maxLength ? substr($content, 0, $maxLength) : $content;
        $file->write();
    }

    /**
     * Blank the cached content on the record. The record is not written here.
     *
     * @param File $file
     * @return void
     */
    public function invalidate(File $file)
    {
        // To prevent writing to the cache from invalidating it
        if ($file->isChanged('FileContentCache')) {
            return;
        }

        $file->FileContentCache = '';
    }
}
|
31
src/Extension/FileTextCache.php
Normal file
31
src/Extension/FileTextCache.php
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace SilverStripe\TextExtraction\Extension;
|
||||||
|
|
||||||
|
use SilverStripe\Assets\File;
|
||||||
|
|
||||||
|
interface FileTextCache
{
    /**
     * Store the content extracted from the given File entity.
     *
     * @param File $file
     * @param string $content
     */
    public function save(File $file, $content);

    /**
     * Fetch whatever extracted content is cached for the given File entity.
     *
     * @param File $file
     */
    public function load(File $file);

    /**
     * Discard the cached content for the given file.
     * Invoked in onBeforeWrite on the file.
     *
     * @param File $file
     */
    public function invalidate(File $file);
}
|
@ -1,5 +1,11 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
|
namespace SilverStripe\TextExtraction\Extension;
|
||||||
|
|
||||||
|
use SilverStripe\ORM\DataExtension,
|
||||||
|
SilverStripe\TextExtraction\Extension\FileTextCache,
|
||||||
|
SilverStripe\Control\Director;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Decorate File or a File derivative to enable text extraction from the file content. Uses a set of subclasses of
|
* Decorate File or a File derivative to enable text extraction from the file content. Uses a set of subclasses of
|
||||||
* FileTextExtractor to do the extraction based on the content type of the file.
|
* FileTextExtractor to do the extraction based on the content type of the file.
|
||||||
@ -11,16 +17,31 @@
|
|||||||
*/
|
*/
|
||||||
class FileTextExtractable extends DataExtension
|
class FileTextExtractable extends DataExtension
|
||||||
{
|
{
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @var array
|
||||||
|
* @config
|
||||||
|
*/
|
||||||
private static $db = array(
|
private static $db = array(
|
||||||
'FileContentCache' => 'Text'
|
'FileContentCache' => 'Text'
|
||||||
);
|
);
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @var array
|
||||||
|
* @config
|
||||||
|
*/
|
||||||
private static $casting = array(
|
private static $casting = array(
|
||||||
'FileContent' => 'Text'
|
'FileContent' => 'Text'
|
||||||
);
|
);
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @var array
|
||||||
|
* @config
|
||||||
|
*/
|
||||||
private static $dependencies = array(
|
private static $dependencies = array(
|
||||||
'TextCache' => '%$FileTextCache'
|
'TextCache' => '%$SilverStripe\TextExtraction\Extension\FileTextCache_Cache'
|
||||||
);
|
);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -31,6 +52,7 @@ class FileTextExtractable extends DataExtension
|
|||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @param FileTextCache $cache
|
* @param FileTextCache $cache
|
||||||
|
* @return void
|
||||||
*/
|
*/
|
||||||
public function setTextCache(FileTextCache $cache)
|
public function setTextCache(FileTextCache $cache)
|
||||||
{
|
{
|
||||||
@ -60,8 +82,9 @@ class FileTextExtractable extends DataExtension
|
|||||||
* The value is also cached into the File record itself.
|
* The value is also cached into the File record itself.
|
||||||
*
|
*
|
||||||
* @param boolean $disableCache If false, the file content is only parsed on demand.
|
* @param boolean $disableCache If false, the file content is only parsed on demand.
|
||||||
* If true, the content parsing is forced, bypassing the cached version
|
* If true, the content parsing is forced, bypassing
|
||||||
* @return string
|
* the cached version
|
||||||
|
* @return mixed string | null
|
||||||
*/
|
*/
|
||||||
public function extractFileAsText($disableCache = false)
|
public function extractFileAsText($disableCache = false)
|
||||||
{
|
{
|
||||||
@ -73,12 +96,13 @@ class FileTextExtractable extends DataExtension
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Determine which extractor can process this file.
|
// Determine which extractor can process this file.
|
||||||
$extractor = FileTextExtractor::for_file($this->owner->FullPath);
|
$path = Director::baseFolder() . '/' . $this->owner->getFilename();
|
||||||
|
$extractor = FileTextExtractor::for_file($path);
|
||||||
if (!$extractor) {
|
if (!$extractor) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
$text = $extractor->getContent($this->owner->FullPath);
|
$text = $extractor->getContent($path);
|
||||||
if (!$text) {
|
if (!$text) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@ -90,6 +114,9 @@ class FileTextExtractable extends DataExtension
|
|||||||
return $text;
|
return $text;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return void
|
||||||
|
*/
|
||||||
public function onBeforeWrite()
|
public function onBeforeWrite()
|
||||||
{
|
{
|
||||||
// Clear cache before changing file
|
// Clear cache before changing file
|
@ -1,12 +1,19 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
|
namespace SilverStripe\TextExtraction\Extractor;
|
||||||
|
|
||||||
|
use SilverStripe\Core\Config\Config,
|
||||||
|
SilverStripe\Core\Injector\Injector,
|
||||||
|
SilverStripe\Core\ClassInfo;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents.
|
* A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents.
|
||||||
* @author mstephens
|
* @author mstephens
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
abstract class FileTextExtractor extends Object
|
abstract class FileTextExtractor
|
||||||
{
|
{
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set priority from 0-100.
|
* Set priority from 0-100.
|
||||||
* The highest priority extractor for a given content type will be selected.
|
* The highest priority extractor for a given content type will be selected.
|
||||||
@ -36,9 +43,10 @@ abstract class FileTextExtractor extends Object
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Generate the sorted list of extractors on demand.
|
// Generate the sorted list of extractors on demand.
|
||||||
$classes = ClassInfo::subclassesFor("FileTextExtractor");
|
$classes = ClassInfo::subclassesFor(__CLASS__);
|
||||||
array_shift($classes);
|
array_shift($classes);
|
||||||
$classPriorities = array();
|
$classPriorities = array();
|
||||||
|
|
||||||
foreach ($classes as $class) {
|
foreach ($classes as $class) {
|
||||||
$classPriorities[$class] = Config::inst()->get($class, 'priority');
|
$classPriorities[$class] = Config::inst()->get($class, 'priority');
|
||||||
}
|
}
|
||||||
@ -75,7 +83,7 @@ abstract class FileTextExtractor extends Object
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* @param string $path
|
* @param string $path
|
||||||
* @return FileTextExtractor|null
|
* @return mixed FileTextExtractor | null
|
||||||
*/
|
*/
|
||||||
public static function for_file($path)
|
public static function for_file($path)
|
||||||
{
|
{
|
||||||
@ -85,6 +93,7 @@ abstract class FileTextExtractor extends Object
|
|||||||
|
|
||||||
$extension = pathinfo($path, PATHINFO_EXTENSION);
|
$extension = pathinfo($path, PATHINFO_EXTENSION);
|
||||||
$mime = self::get_mime($path);
|
$mime = self::get_mime($path);
|
||||||
|
|
||||||
foreach (self::get_extractor_classes() as $className) {
|
foreach (self::get_extractor_classes() as $className) {
|
||||||
$extractor = self::get_extractor($className);
|
$extractor = self::get_extractor($className);
|
||||||
|
|
||||||
@ -139,7 +148,3 @@ abstract class FileTextExtractor extends Object
|
|||||||
*/
|
*/
|
||||||
abstract public function getContent($path);
|
abstract public function getContent($path);
|
||||||
}
|
}
|
||||||
|
|
||||||
class FileTextExtractor_Exception extends Exception
|
|
||||||
{
|
|
||||||
}
|
|
94
src/Extractor/HTMLTextExtractor.php
Normal file
94
src/Extractor/HTMLTextExtractor.php
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace SilverStripe\TextExtraction\Extractor;
|
||||||
|
|
||||||
|
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
||||||
|
|
||||||
|
/**
 * Text extractor that uses the PHP function strip_tags() to get just the text.
 * OK for indexing, not the best for readable text.
 *
 * @author mstephens
 */
class HTMLTextExtractor extends FileTextExtractor
{
    /**
     * Lower priority because it's not the most clever HTML extraction. If there is something better, use it.
     *
     * @config
     * @var integer
     */
    private static $priority = 10;

    /**
     * This extractor only relies on built-in PHP functions.
     *
     * @return boolean Always true
     */
    public function isAvailable()
    {
        return true;
    }

    /**
     * @param string $extension File extension without the leading dot
     * @return boolean True for html, htm and xhtml (case-insensitive)
     */
    public function supportsExtension($extension)
    {
        return in_array(strtolower($extension), array('html', 'htm', 'xhtml'), true);
    }

    /**
     * @param string $mime
     * @return boolean True for text/html (case-insensitive)
     */
    public function supportsMime($mime)
    {
        return strtolower($mime) === 'text/html';
    }

    /**
     * Extracts content by using strip_tags() combined with regular expressions
     * that remove non-content elements like <style> or <script> and put a
     * line break in front of block-level tags, so that text from adjacent
     * blocks does not run together once the tags are stripped.
     *
     * @param string $path
     * @return string Extracted text; an empty string if the file is empty or could not be read
     */
    public function getContent($path)
    {
        $content = file_get_contents($path);
        if ($content === false || $content === '') {
            return '';
        }

        // Yes, yes, regex'ing HTML is evil.
        // Since we don't care about well-formedness or markup here, it does the job.
        $prepared = $this->replaceMarkup($content, true);

        if ($prepared === null) {
            // preg_replace() returns null on error, e.g. when the "u" modifier
            // meets a subject that is not valid UTF-8. The patterns are plain
            // ASCII, so retry byte-wise instead of discarding the document.
            $prepared = $this->replaceMarkup($content, false);
        }

        if ($prepared === null) {
            // Still failing: fall back to stripping tags from the raw markup.
            $prepared = $content;
        }

        return strip_tags($prepared);
    }

    /**
     * Blank out invisible elements and prefix block-level tags with a line break.
     *
     * @param string $content Raw markup
     * @param boolean $unicode Whether to add the PCRE "u" modifier to every pattern
     * @return string|null Rewritten markup, or null if preg_replace() reported an error
     */
    private function replaceMarkup($content, $unicode)
    {
        $modifier = $unicode ? 'u' : '';

        // Remove invisible content
        $invisible = array(
            '@<head[^>]*?>.*?</head>@si',
            '@<style[^>]*?>.*?</style>@si',
            '@<script[^>]*?.*?</script>@si',
            '@<object[^>]*?.*?</object>@si',
            '@<embed[^>]*?.*?</embed>@si',
            '@<applet[^>]*?.*?</applet>@si',
            '@<noframes[^>]*?.*?</noframes>@si',
            '@<noscript[^>]*?.*?</noscript>@si',
            '@<noembed[^>]*?.*?</noembed>@si',
        );

        // Add line breaks before opening and closing block tags
        $blocks = array(
            '@</?((address)|(blockquote)|(center)|(del))@i',
            '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@i',
            '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@i',
            '@</?((table)|(th)|(td)|(caption))@i',
            '@</?((form)|(button)|(fieldset)|(legend)|(input))@i',
            '@</?((label)|(select)|(optgroup)|(option)|(textarea))@i',
            '@</?((frameset)|(frame)|(iframe))@i',
        );

        $patterns = array();
        $replacements = array();

        foreach ($invisible as $pattern) {
            $patterns[] = $pattern . $modifier;
            $replacements[] = ' ';
        }

        foreach ($blocks as $pattern) {
            $patterns[] = $pattern . $modifier;
            // "$0" alone would put the tag back unchanged; the newline is the point.
            $replacements[] = "\n\$0";
        }

        return preg_replace($patterns, $replacements, $content);
    }
}
|
@ -1,5 +1,10 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
|
namespace SilverStripe\TextExtraction\Extractor;
|
||||||
|
|
||||||
|
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
|
||||||
|
SilverStripe\TextExtraction\Exception\FileTextExtractor_Exception;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Text extractor that calls pdftotext to do the conversion.
|
* Text extractor that calls pdftotext to do the conversion.
|
||||||
* @author mstephens
|
* @author mstephens
|
||||||
@ -7,6 +12,7 @@
|
|||||||
*/
|
*/
|
||||||
class PDFTextExtractor extends FileTextExtractor
|
class PDFTextExtractor extends FileTextExtractor
|
||||||
{
|
{
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set to bin path this extractor can execute
|
* Set to bin path this extractor can execute
|
||||||
*
|
*
|
||||||
@ -40,8 +46,7 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
public function supportsMime($mime)
|
public function supportsMime($mime)
|
||||||
{
|
{
|
||||||
return in_array(
|
return in_array(
|
||||||
strtolower($mime),
|
strtolower($mime), array(
|
||||||
array(
|
|
||||||
'application/pdf',
|
'application/pdf',
|
||||||
'application/x-pdf',
|
'application/x-pdf',
|
||||||
'application/x-bzpdf',
|
'application/x-bzpdf',
|
||||||
@ -108,11 +113,10 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
$err = $content;
|
$err = $content;
|
||||||
}
|
}
|
||||||
throw new FileTextExtractor_Exception(sprintf(
|
throw new FileTextExtractor_Exception(sprintf(
|
||||||
'PDFTextExtractor->getContent() failed for %s: %s',
|
'PDFTextExtractor->getContent() failed for %s: %s', $path, implode(PHP_EOL, $err)
|
||||||
$path,
|
|
||||||
implode(PHP_EOL, $err)
|
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
return implode(PHP_EOL, $content);
|
return implode(PHP_EOL, $content);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -135,6 +139,8 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
'ſt' => 'ft',
|
'ſt' => 'ft',
|
||||||
'st' => 'st'
|
'st' => 'st'
|
||||||
);
|
);
|
||||||
|
|
||||||
return str_replace(array_keys($mapping), array_values($mapping), $input);
|
return str_replace(array_keys($mapping), array_values($mapping), $input);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
@ -1,5 +1,11 @@
|
|||||||
<?php
|
<?php
|
||||||
use Guzzle\Http\Client;
|
|
||||||
|
namespace SilverStripe\TextExtraction\Extractor;
|
||||||
|
|
||||||
|
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
|
||||||
|
Guzzle\Http\Client,
|
||||||
|
\InvalidArgumentException,
|
||||||
|
Psr\Log\LoggerInterface;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Text extractor that calls an Apache Solr instance
|
* Text extractor that calls an Apache Solr instance
|
||||||
@ -21,10 +27,24 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
*/
|
*/
|
||||||
private static $base_url;
|
private static $base_url;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @var int
|
||||||
|
* @config
|
||||||
|
*/
|
||||||
private static $priority = 75;
|
private static $priority = 75;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @var Guzzle\Http\Client
|
||||||
|
*/
|
||||||
protected $httpClient;
|
protected $httpClient;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return Guzzle\Http\Client
|
||||||
|
* @throws InvalidArgumentException
|
||||||
|
*/
|
||||||
public function getHttpClient()
|
public function getHttpClient()
|
||||||
{
|
{
|
||||||
if (!$this->config()->get('base_url')) {
|
if (!$this->config()->get('base_url')) {
|
||||||
@ -33,20 +53,35 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
if (!$this->httpClient) {
|
if (!$this->httpClient) {
|
||||||
$this->httpClient = new Client($this->config()->get('base_url'));
|
$this->httpClient = new Client($this->config()->get('base_url'));
|
||||||
}
|
}
|
||||||
|
|
||||||
return $this->httpClient;
|
return $this->httpClient;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param Guzzle\Http\Client $client
|
||||||
|
* @return void
|
||||||
|
*/
|
||||||
public function setHttpClient($client)
|
public function setHttpClient($client)
|
||||||
{
|
{
|
||||||
$this->httpClient = $client;
|
$this->httpClient = $client;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
public function isAvailable()
|
public function isAvailable()
|
||||||
{
|
{
|
||||||
$url = $this->config()->get('base_url');
|
$url = $this->config()->get('base_url');
|
||||||
|
|
||||||
return (boolean) $url;
|
return (boolean) $url;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param string $extension
|
||||||
|
* @return boolean
|
||||||
|
*/
|
||||||
public function supportsExtension($extension)
|
public function supportsExtension($extension)
|
||||||
{
|
{
|
||||||
return in_array(
|
return in_array(
|
||||||
@ -59,12 +94,22 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param string $mime
|
||||||
|
* @return boolean
|
||||||
|
*/
|
||||||
public function supportsMime($mime)
|
public function supportsMime($mime)
|
||||||
{
|
{
|
||||||
// Rely on supportsExtension
|
// Rely on supportsExtension
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param string $path
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
public function getContent($path)
|
public function getContent($path)
|
||||||
{
|
{
|
||||||
if (!$path) {
|
if (!$path) {
|
||||||
@ -73,6 +118,7 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
|
|
||||||
$fileName = basename($path);
|
$fileName = basename($path);
|
||||||
$client = $this->getHttpClient();
|
$client = $this->getHttpClient();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
$request = $client
|
$request = $client
|
||||||
->post()
|
->post()
|
||||||
@ -80,27 +126,30 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
->addPostFiles(array('myfile' => $path));
|
->addPostFiles(array('myfile' => $path));
|
||||||
$response = $request->send();
|
$response = $request->send();
|
||||||
} catch (InvalidArgumentException $e) {
|
} catch (InvalidArgumentException $e) {
|
||||||
SS_Log::log(
|
$msg = sprintf(
|
||||||
sprintf(
|
|
||||||
'Error extracting text from "%s" (message: %s)',
|
'Error extracting text from "%s" (message: %s)',
|
||||||
$path,
|
$path,
|
||||||
$e->getMessage()
|
$e->getMessage()
|
||||||
),
|
|
||||||
SS_Log::NOTICE
|
|
||||||
);
|
);
|
||||||
|
Injector::inst()->get(LoggerInterface::class)->notice($msg);
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
} catch (Guzzle\Http\Exception\ServerErrorResponseException $e) {
|
} catch (Guzzle\Http\Exception\ServerErrorResponseException $e) {
|
||||||
//catch other errors that Tika can throw vai Guzzle but are not caught and break Solr search query in some cases.
|
// Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search query in some cases.
|
||||||
SS_Log::log(
|
$msg = sprintf(
|
||||||
sprintf(
|
|
||||||
'Tika server error attempting to extract from "%s" (message: %s)',
|
'Tika server error attempting to extract from "%s" (message: %s)',
|
||||||
$path,
|
$path,
|
||||||
$e->getMessage()
|
$e->getMessage()
|
||||||
),
|
|
||||||
SS_Log::NOTICE
|
|
||||||
);
|
);
|
||||||
|
|
||||||
|
Injector::inst()->get(LoggerInterface::class)->notice($msg);
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Just initialise it, it doesn't take much.
|
||||||
|
$matches = [];
|
||||||
|
|
||||||
// Use preg match to avoid SimpleXML running out of memory on large text nodes
|
// Use preg match to avoid SimpleXML running out of memory on large text nodes
|
||||||
preg_match(
|
preg_match(
|
||||||
sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName)),
|
sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName)),
|
@ -1,5 +1,12 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
|
namespace SilverStripe\TextExtraction\Extractor;
|
||||||
|
|
||||||
|
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
|
||||||
|
SilverStripe\Core\Injector\Injector,
|
||||||
|
SilverStripe\Core\Environment,
|
||||||
|
SilverStripe\TextExtraction\Rest\TikaRestClient;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Enables text extraction of file content via the Tika Rest Server
|
* Enables text extraction of file content via the Tika Rest Server
|
||||||
*
|
*
|
||||||
@ -36,20 +43,19 @@ class TikaServerTextExtractor extends FileTextExtractor
|
|||||||
return $this->client ?:
|
return $this->client ?:
|
||||||
($this->client =
|
($this->client =
|
||||||
Injector::inst()->createWithArgs(
|
Injector::inst()->createWithArgs(
|
||||||
'TikaRestClient',
|
TikaRestClient::class,
|
||||||
array($this->getServerEndpoint())
|
array($this->getServerEndpoint())
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
public function getServerEndpoint()
|
public function getServerEndpoint()
|
||||||
{
|
{
|
||||||
if (defined('SS_TIKA_ENDPOINT')) {
|
if ($endpoint = Environment::getEnv('SS_TIKA_ENDPOINT')) {
|
||||||
return SS_TIKA_ENDPOINT;
|
return $endpoint;
|
||||||
}
|
|
||||||
|
|
||||||
if (getenv('SS_TIKA_ENDPOINT')) {
|
|
||||||
return getenv('SS_TIKA_ENDPOINT');
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Default to configured endpoint
|
// Default to configured endpoint
|
||||||
@ -68,6 +74,9 @@ class TikaServerTextExtractor extends FileTextExtractor
|
|||||||
->getVersion();
|
->getVersion();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return boolean
|
||||||
|
*/
|
||||||
public function isAvailable()
|
public function isAvailable()
|
||||||
{
|
{
|
||||||
return $this->getServerEndpoint() &&
|
return $this->getServerEndpoint() &&
|
||||||
@ -75,6 +84,11 @@ class TikaServerTextExtractor extends FileTextExtractor
|
|||||||
version_compare($this->getVersion(), '1.7.0') >= 0;
|
version_compare($this->getVersion(), '1.7.0') >= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param string $extension
|
||||||
|
* @return boolean
|
||||||
|
*/
|
||||||
public function supportsExtension($extension)
|
public function supportsExtension($extension)
|
||||||
{
|
{
|
||||||
// Determine support via mime type only
|
// Determine support via mime type only
|
||||||
@ -89,6 +103,11 @@ class TikaServerTextExtractor extends FileTextExtractor
|
|||||||
*/
|
*/
|
||||||
protected $supportedMimes = array();
|
protected $supportedMimes = array();
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param string $mime
|
||||||
|
* @return boolean
|
||||||
|
*/
|
||||||
public function supportsMime($mime)
|
public function supportsMime($mime)
|
||||||
{
|
{
|
||||||
$supported = $this->supportedMimes ?:
|
$supported = $this->supportedMimes ?:
|
@ -1,5 +1,9 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
|
namespace SilverStripe\TextExtraction\Extractor;
|
||||||
|
|
||||||
|
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Enables text extraction of file content via the Tika CLI
|
* Enables text extraction of file content via the Tika CLI
|
||||||
*
|
*
|
||||||
@ -18,7 +22,7 @@ class TikaTextExtractor extends FileTextExtractor
|
|||||||
/**
|
/**
|
||||||
* Get the version of tika installed, or 0 if not installed
|
* Get the version of tika installed, or 0 if not installed
|
||||||
*
|
*
|
||||||
* @return float version of tika
|
* @return float|int The version of tika
|
||||||
*/
|
*/
|
||||||
public function getVersion()
|
public function getVersion()
|
||||||
{
|
{
|
||||||
@ -51,6 +55,7 @@ class TikaTextExtractor extends FileTextExtractor
|
|||||||
// Invoke command
|
// Invoke command
|
||||||
$pipes = array();
|
$pipes = array();
|
||||||
$proc = proc_open($command, $descriptorSpecs, $pipes);
|
$proc = proc_open($command, $descriptorSpecs, $pipes);
|
||||||
|
|
||||||
if (!is_resource($proc)) {
|
if (!is_resource($proc)) {
|
||||||
return 255;
|
return 255;
|
||||||
}
|
}
|
||||||
@ -69,37 +74,59 @@ class TikaTextExtractor extends FileTextExtractor
|
|||||||
return proc_close($proc);
|
return proc_close($proc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param string $path
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
public function getContent($path)
|
public function getContent($path)
|
||||||
{
|
{
|
||||||
$mode = $this->config()->output_mode;
|
$mode = $this->config()->output_mode;
|
||||||
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
|
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
|
||||||
$code = $this->runShell($command, $output);
|
$code = $this->runShell($command, $output);
|
||||||
|
|
||||||
if ($code == 0) {
|
if ($code == 0) {
|
||||||
return $output;
|
return $output;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return boolean
|
||||||
|
*/
|
||||||
public function isAvailable()
|
public function isAvailable()
|
||||||
{
|
{
|
||||||
return $this->getVersion() > 0;
|
return $this->getVersion() > 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return boolean
|
||||||
|
*/
|
||||||
public function supportsExtension($extension)
|
public function supportsExtension($extension)
|
||||||
{
|
{
|
||||||
// Determine support via mime type only
|
// Determine support via mime type only
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param string $mime
|
||||||
|
* @return boolean
|
||||||
|
*/
|
||||||
public function supportsMime($mime)
|
public function supportsMime($mime)
|
||||||
{
|
{
|
||||||
// Get list of supported mime types
|
// Get list of supported mime types
|
||||||
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
|
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
|
||||||
|
|
||||||
if ($code) {
|
if ($code) {
|
||||||
return false;
|
return false;
|
||||||
} // Error case
|
} // Error case
|
||||||
|
|
||||||
// Check if the mime type is inside the result
|
// Check if the mime type is inside the result
|
||||||
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
|
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
|
||||||
|
|
||||||
return (bool)preg_match($pattern, $supportedTypes);
|
return (bool)preg_match($pattern, $supportedTypes);
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -1,7 +1,12 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
use Guzzle\Http\Client;
|
namespace SilverStripe\TextExtraction\Rest;
|
||||||
use Guzzle\Http\Exception\RequestException;
|
|
||||||
|
use Guzzle\Http\Client,
|
||||||
|
Guzzle\Http\Exception\RequestException,
|
||||||
|
SilverStripe\Core\Environment,
|
||||||
|
Psr\Log\LoggerInterface,
|
||||||
|
SilverStripe\Core\Injector\Injector;
|
||||||
|
|
||||||
class TikaRestClient extends Client
|
class TikaRestClient extends Client
|
||||||
{
|
{
|
||||||
@ -17,14 +22,22 @@ class TikaRestClient extends Client
|
|||||||
*/
|
*/
|
||||||
protected $mimes = array();
|
protected $mimes = array();
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param string $baseUrl
|
||||||
|
* @param array $config
|
||||||
|
*/
|
||||||
public function __construct($baseUrl = '', $config = null)
|
public function __construct($baseUrl = '', $config = null)
|
||||||
{
|
{
|
||||||
if (defined('SS_TIKA_USERNAME') && defined('SS_TIKA_PASSWORD')) {
|
$psswd = Environment::getEnv('SS_TIKA_PASSWORD');
|
||||||
|
|
||||||
|
if (!empty($psswd)) {
|
||||||
$this->options = array(
|
$this->options = array(
|
||||||
'username' => SS_TIKA_USERNAME,
|
'username' => Environment::getEnv('SS_TIKA_USERNAME'),
|
||||||
'password' => SS_TIKA_PASSWORD,
|
'password' => $psswd,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
parent::__construct($baseUrl, $config);
|
parent::__construct($baseUrl, $config);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -39,11 +52,14 @@ class TikaRestClient extends Client
|
|||||||
$result = $this->get(null);
|
$result = $this->get(null);
|
||||||
$result->setAuth($this->options['username'], $this->options['password']);
|
$result->setAuth($this->options['username'], $this->options['password']);
|
||||||
$result->send();
|
$result->send();
|
||||||
|
|
||||||
if ($result->getResponse()->getStatusCode() == 200) {
|
if ($result->getResponse()->getStatusCode() == 200) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} catch (RequestException $ex) {
|
} catch (RequestException $ex) {
|
||||||
SS_Log::log(sprintf("Tika unavailable - %s", $ex->getMessage()), SS_Log::ERR);
|
$msg = sprintf("Tika unavailable - %s", $ex->getMessage());
|
||||||
|
Injector::inst()->get(LoggerInterface::class)->error($msg);
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -59,12 +75,14 @@ class TikaRestClient extends Client
|
|||||||
$response->setAuth($this->options['username'], $this->options['password']);
|
$response->setAuth($this->options['username'], $this->options['password']);
|
||||||
$response->send();
|
$response->send();
|
||||||
$version = 0.0;
|
$version = 0.0;
|
||||||
|
|
||||||
// Parse output
|
// Parse output
|
||||||
if ($response->getResponse()->getStatusCode() == 200 &&
|
if ($response->getResponse()->getStatusCode() == 200 &&
|
||||||
preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getResponse()->getBody(), $matches)
|
preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getResponse()->getBody(), $matches)
|
||||||
) {
|
) {
|
||||||
$version = (float)$matches['version'];
|
$version = (float)$matches['version'];
|
||||||
}
|
}
|
||||||
|
|
||||||
return $version;
|
return $version;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -78,12 +96,14 @@ class TikaRestClient extends Client
|
|||||||
if ($this->mimes) {
|
if ($this->mimes) {
|
||||||
return $this->mimes;
|
return $this->mimes;
|
||||||
}
|
}
|
||||||
|
|
||||||
$response = $this->get(
|
$response = $this->get(
|
||||||
'mime-types',
|
'mime-types',
|
||||||
array('Accept' => 'application/json')
|
array('Accept' => 'application/json')
|
||||||
);
|
);
|
||||||
$response->setAuth($this->options['username'], $this->options['password']);
|
$response->setAuth($this->options['username'], $this->options['password']);
|
||||||
$response->send();
|
$response->send();
|
||||||
|
|
||||||
return $this->mimes = $response->getResponse()->json();
|
return $this->mimes = $response->getResponse()->json();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -118,8 +138,10 @@ class TikaRestClient extends Client
|
|||||||
if ($body) {
|
if ($body) {
|
||||||
$msg .= ' Body: ' . $body;
|
$msg .= ' Body: ' . $body;
|
||||||
}
|
}
|
||||||
SS_Log::log($msg, SS_Log::NOTICE);
|
|
||||||
|
Injector::inst()->get(LoggerInterface::class)->notice($msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
return $text;
|
return $text;
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -1,4 +1,10 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
|
use SilverStripe\TextExtraction\Extension\FileTextCache,
|
||||||
|
SilverStripe\TextExtraction\Extension\FileTextCache_Database,
|
||||||
|
SilverStripe\Dev\SapphireTest,
|
||||||
|
SilverStripe\Core\Config\Config;
|
||||||
|
|
||||||
class FileTextCacheDatabaseTest extends SapphireTest
|
class FileTextCacheDatabaseTest extends SapphireTest
|
||||||
{
|
{
|
||||||
public function testTruncatesByMaxLength()
|
public function testTruncatesByMaxLength()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user