diff --git a/_config/cache.yml b/_config/cache.yml
new file mode 100644
index 0000000..ff793b2
--- /dev/null
+++ b/_config/cache.yml
@@ -0,0 +1,11 @@
+---
+Name: textextractioncache
+After:
+ - '#corecache'
+---
+
+SilverStripe\Core\Injector\Injector:
+ Psr\SimpleCache\CacheInterface.FileTextCache_Cache:
+ factory: SilverStripe\Core\Cache\CacheFactory
+ constructor:
+ namespace: 'FileTextCache_Cache'
\ No newline at end of file
diff --git a/_config/config.yml b/_config/config.yml
deleted file mode 100644
index bed07e8..0000000
--- a/_config/config.yml
+++ /dev/null
@@ -1,11 +0,0 @@
----
-Name: textextraction
----
-Injector:
- FileTextCache: FileTextCache_Database
-
-#SolrCellTextExtractor:
-# base_url: 'http://localhost:8983/solr/update/extract'
-
-FileTextCache_Database:
- max_content_length: 500000
diff --git a/code/extensions/FileTextCache.php b/code/extensions/FileTextCache.php
deleted file mode 100644
index 385d848..0000000
--- a/code/extensions/FileTextCache.php
+++ /dev/null
@@ -1,112 +0,0 @@
-FileContentCache;
- }
-
- public function save(File $file, $content)
- {
- $maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length');
- $file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content;
- $file->write();
- }
-
- public function invalidate(File $file)
- {
- // To prevent writing to the cache from invalidating it
- if (!$file->isChanged('FileContentCache')) {
- $file->FileContentCache = '';
- }
- }
-}
-
-/**
- * Uses SS_Cache with a lifetime to cache extracted content
- */
-class FileTextCache_SSCache implements FileTextCache, Flushable
-{
- /**
- * Lifetime of cache in seconds
- * Null is indefinite
- *
- * @var int|null
- * @config
- */
- private static $lifetime = null;
-
- /**
- * @return SS_Cache
- */
- protected static function get_cache()
- {
- $lifetime = Config::inst()->get(__CLASS__, 'lifetime');
- $cache = SS_Cache::factory(__CLASS__);
- $cache->setLifetime($lifetime);
- return $cache;
- }
-
- protected function getKey(File $file)
- {
- return md5($file->getFullPath());
- }
-
- public function load(File $file)
- {
- $key = $this->getKey($file);
- $cache = self::get_cache();
- return $cache->load($key);
- }
-
- public function save(File $file, $content)
- {
- $key = $this->getKey($file);
- $cache = self::get_cache();
- return $cache->save($content, $key);
- }
-
- public static function flush()
- {
- $cache = self::get_cache();
- $cache->clean();
- }
-
- public function invalidate(File $file)
- {
- $key = $this->getKey($file);
- $cache = self::get_cache();
- return $cache->remove($key);
- }
-}
diff --git a/code/extractors/HTMLTextExtractor.php b/code/extractors/HTMLTextExtractor.php
deleted file mode 100644
index 810f473..0000000
--- a/code/extractors/HTMLTextExtractor.php
+++ /dev/null
@@ -1,77 +0,0 @@
- or @siu',
- '@@siu',
- '@@siu',
- '@@siu',
- '@]*?.*?@siu',
- '@@siu',
- '@]*?.*?@siu',
- // Add line breaks before and after blocks
- '@?((address)|(blockquote)|(center)|(del))@iu',
- '@?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
- '@?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
- '@?((table)|(th)|(td)|(caption))@iu',
- '@?((form)|(button)|(fieldset)|(legend)|(input))@iu',
- '@?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
- '@?((frameset)|(frame)|(iframe))@iu',
- ),
- array(
- ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0",
- ),
- $content
- );
- return strip_tags($content);
- }
-}
diff --git a/composer.json b/composer.json
index 1fa622e..c75f866 100644
--- a/composer.json
+++ b/composer.json
@@ -16,15 +16,16 @@
}
],
"require": {
- "php": ">=5.3.2",
+ "php": ">=5.6",
"composer/installers": "*",
- "silverstripe/framework": "^3.1",
- "guzzle/guzzle": "^3.9",
+ "silverstripe/framework": "4.0.x-dev",
+ "guzzlehttp/guzzle": "~3.8.1",
"symfony/event-dispatcher": "^2.6.0@stable",
- "symfony/http-foundation": "^2.6.0"
+ "symfony/http-foundation": "^2.6.0",
+ "silverstripe/assets": "^1"
},
"require-dev": {
- "phpunit/phpunit": "^3.7"
+ "phpunit/phpunit": "~5.0"
},
"suggest": {
"ext-fileinfo": "Improved support for file mime detection"
diff --git a/src/Exception/FileTextExtractor_Exception.php b/src/Exception/FileTextExtractor_Exception.php
new file mode 100644
index 0000000..4fa1038
--- /dev/null
+++ b/src/Exception/FileTextExtractor_Exception.php
@@ -0,0 +1,9 @@
+get($for);
+ }
+
+    /**
+     * Derives the cache key for a file.
+     *
+     * The key is the MD5 hex digest of whatever $file->getFilename() returns,
+     * so it consists solely of hexadecimal characters.
+     *
+     * @param File $file File whose extracted text is cached
+     * @return string MD5 hex digest used as the cache key
+     */
+    protected function getKey(File $file)
+    {
+        $filename = $file->getFilename();
+
+        return md5($filename);
+    }
+
+    /**
+     * Fetches the cached text for a file.
+     *
+     * @param File $file File whose cached text is requested
+     * @return mixed Value returned by the cache's get() for this file's key
+     *               (presumably null on a miss, per PSR-16 - confirm against the backend)
+     */
+    public function load(File $file)
+    {
+        return self::get_cache()->get($this->getKey($file));
+    }
+
+    /**
+     * Stores extracted text for a file in the cache.
+     *
+     * The time-to-live is read from this class's 'lifetime' config value; any
+     * falsy value (null, 0, '') falls back to 3600 (presumably seconds, i.e.
+     * one hour - confirm against the cache backend).
+     *
+     * @param File   $file    File the text was extracted from
+     * @param string $content Extracted text to cache
+     * @return mixed Result of the cache's set() call (presumably a success
+     *               boolean, per PSR-16 - confirm against the backend)
+     */
+    public function save(File $file, $content)
+    {
+        $configured = Config::inst()->get(__CLASS__, 'lifetime');
+        $ttl = $configured ? $configured : 3600;
+
+        return self::get_cache()->set($this->getKey($file), $content, $ttl);
+    }
+
+    /**
+     * Removes every entry from the cache returned by get_cache().
+     *
+     * NOTE(review): presumably invoked by the framework's flush mechanism -
+     * confirm that the class still implements Flushable.
+     *
+     * @return void
+     */
+    public static function flush()
+    {
+        self::get_cache()->clear();
+    }
+
+    /**
+     * Does the same thing as flush(): removes every entry from the cache
+     * returned by get_cache().
+     *
+     * @return void
+     */
+    public static function clear()
+    {
+        self::get_cache()->clear();
+    }
+
+    /**
+     * Drops the cached text for a single file.
+     *
+     * @param File $file File whose cache entry should be removed
+     * @return mixed Result of the cache's delete() call (presumably a success
+     *               boolean, per PSR-16 - confirm against the backend)
+     */
+    public function invalidate(File $file)
+    {
+        return self::get_cache()->delete($this->getKey($file));
+    }
+}
diff --git a/src/Extension/FieldTextCache_Database.php b/src/Extension/FieldTextCache_Database.php
new file mode 100644
index 0000000..a96ff60
--- /dev/null
+++ b/src/Extension/FieldTextCache_Database.php
@@ -0,0 +1,47 @@
+FileContentCache;
+ }
+
+    /**
+     * Stores extracted text on the file record and writes the record.
+     *
+     * When the 'max_content_length' config value is truthy the text is cut to
+     * at most that many bytes before being stored; otherwise it is stored
+     * unchanged.
+     *
+     * NOTE(review): the config lookup uses the unqualified string
+     * 'FileTextCache_Database' - confirm this still matches the class's
+     * config key now that the file lives under src/.
+     *
+     * @param File  $file    Record that receives the text in FileContentCache
+     * @param mixed $content Extracted text
+     * @return void
+     */
+    public function save(File $file, $content)
+    {
+        $maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length');
+
+        if ($maxLength) {
+            // substr() cuts on a raw byte offset and can split a multi-byte
+            // UTF-8 character, leaving an invalid sequence at the end of the
+            // stored text. mb_strcut() honours the same byte limit but backs
+            // off to the previous character boundary.
+            $content = function_exists('mb_strcut')
+                ? mb_strcut($content, 0, $maxLength, 'UTF-8')
+                : substr($content, 0, $maxLength);
+        }
+
+        $file->FileContentCache = $content;
+        $file->write();
+    }
+
+    /**
+     * Blanks the cached text on the file record, unless the cache field is
+     * itself the thing being changed.
+     *
+     * The record is only modified in memory here; this method does not call
+     * write().
+     *
+     * @param File $file Record whose FileContentCache field is reset
+     * @return void
+     */
+    public function invalidate(File $file)
+    {
+        // A pending change to FileContentCache means the cache is being
+        // written right now; wiping it would discard that write
+        if ($file->isChanged('FileContentCache')) {
+            return;
+        }
+
+        $file->FileContentCache = '';
+    }
+}
diff --git a/src/Extension/FileTextCache.php b/src/Extension/FileTextCache.php
new file mode 100644
index 0000000..d0ccd70
--- /dev/null
+++ b/src/Extension/FileTextCache.php
@@ -0,0 +1,31 @@
+ 'Text'
);
+ /**
+ *
+ * @var array
+ * @config
+ */
private static $casting = array(
'FileContent' => 'Text'
);
+ /**
+ *
+ * @var array
+ * @config
+ */
private static $dependencies = array(
- 'TextCache' => '%$FileTextCache'
+ 'TextCache' => '%$SilverStripe\TextExtraction\Extension\FileTextCache_Cache'
);
/**
@@ -30,7 +51,8 @@ class FileTextExtractable extends DataExtension
/**
*
- * @param FileTextCache $cache
+ * @param FileTextCache $cache
+ * @return void
*/
public function setTextCache(FileTextCache $cache)
{
@@ -58,10 +80,11 @@ class FileTextExtractable extends DataExtension
/**
* Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text.
* The value is also cached into the File record itself.
- *
+ *
* @param boolean $disableCache If false, the file content is only parsed on demand.
- * If true, the content parsing is forced, bypassing the cached version
- * @return string
+ * If true, the content parsing is forced, bypassing
+ * the cached version
+ * @return string|null
*/
public function extractFileAsText($disableCache = false)
{
@@ -73,23 +96,27 @@ class FileTextExtractable extends DataExtension
}
// Determine which extractor can process this file.
- $extractor = FileTextExtractor::for_file($this->owner->FullPath);
+ $path = Director::baseFolder() . '/' . $this->owner->getFilename();
+ $extractor = FileTextExtractor::for_file($path);
if (!$extractor) {
return null;
}
- $text = $extractor->getContent($this->owner->FullPath);
+ $text = $extractor->getContent($path);
if (!$text) {
return null;
}
if (!$disableCache) {
- $this->getTextCache()->save($this->owner, $text);
+ $this->getTextCache()->save($this->owner, $text);
}
return $text;
}
+ /**
+ * @return void
+ */
public function onBeforeWrite()
{
// Clear cache before changing file
diff --git a/code/extractors/FileTextExtractor.php b/src/Extractor/FileTextExtractor.php
similarity index 91%
rename from code/extractors/FileTextExtractor.php
rename to src/Extractor/FileTextExtractor.php
index cc7c176..115d679 100644
--- a/code/extractors/FileTextExtractor.php
+++ b/src/Extractor/FileTextExtractor.php
@@ -1,12 +1,19 @@
get($class, 'priority');
}
@@ -74,8 +82,8 @@ abstract class FileTextExtractor extends Object
}
/**
- * @param string $path
- * @return FileTextExtractor|null
+ * @param string $path
+ * @return FileTextExtractor|null
*/
public static function for_file($path)
{
@@ -85,6 +93,7 @@ abstract class FileTextExtractor extends Object
$extension = pathinfo($path, PATHINFO_EXTENSION);
$mime = self::get_mime($path);
+
foreach (self::get_extractor_classes() as $className) {
$extractor = self::get_extractor($className);
@@ -108,7 +117,7 @@ abstract class FileTextExtractor extends Object
/**
* Checks if the extractor is supported on the current environment,
* for example if the correct binaries or libraries are available.
- *
+ *
* @return boolean
*/
abstract public function isAvailable();
@@ -125,7 +134,7 @@ abstract class FileTextExtractor extends Object
/**
* Determine if this extractor suports the given mime type.
* Will only be called if supportsExtension returns false.
- *
+ *
* @param string $mime
* @return boolean
*/
@@ -133,13 +142,9 @@ abstract class FileTextExtractor extends Object
/**
* Given a file path, extract the contents as text.
- *
+ *
* @param string $path
* @return string
*/
abstract public function getContent($path);
}
-
-class FileTextExtractor_Exception extends Exception
-{
-}
diff --git a/src/Extractor/HTMLTextExtractor.php b/src/Extractor/HTMLTextExtractor.php
new file mode 100644
index 0000000..ace1400
--- /dev/null
+++ b/src/Extractor/HTMLTextExtractor.php
@@ -0,0 +1,94 @@
+ or @siu',
+ '@@siu',
+ '@@siu',
+ '@@siu',
+ '@]*?.*?@siu',
+ '@@siu',
+ '@]*?.*?@siu',
+ // Add line breaks before and after blocks
+ '@?((address)|(blockquote)|(center)|(del))@iu',
+ '@?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
+ '@?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
+ '@?((table)|(th)|(td)|(caption))@iu',
+ '@?((form)|(button)|(fieldset)|(legend)|(input))@iu',
+ '@?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
+ '@?((frameset)|(frame)|(iframe))@iu',
+ ), array(
+ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0",
+ ), $content
+ );
+ return strip_tags($content);
+ }
+
+}
diff --git a/code/extractors/PDFTextExtractor.php b/src/Extractor/PDFTextExtractor.php
similarity index 80%
rename from code/extractors/PDFTextExtractor.php
rename to src/Extractor/PDFTextExtractor.php
index 09608e8..6894be1 100644
--- a/code/extractors/PDFTextExtractor.php
+++ b/src/Extractor/PDFTextExtractor.php
@@ -1,5 +1,10 @@
isAvailable()) {
+ if (!$this->isAvailable()) {
throw new FileTextExtractor_Exception("getRawOutput called on unavailable extractor");
}
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
@@ -108,11 +113,10 @@ class PDFTextExtractor extends FileTextExtractor
$err = $content;
}
throw new FileTextExtractor_Exception(sprintf(
- 'PDFTextExtractor->getContent() failed for %s: %s',
- $path,
- implode(PHP_EOL, $err)
+ 'PDFTextExtractor->getContent() failed for %s: %s', $path, implode(PHP_EOL, $err)
));
}
+
return implode(PHP_EOL, $content);
}
@@ -135,6 +139,8 @@ class PDFTextExtractor extends FileTextExtractor
'ſt' => 'ft',
'st' => 'st'
);
+
return str_replace(array_keys($mapping), array_values($mapping), $input);
}
+
}
diff --git a/code/extractors/SolrCellTextExtractor.php b/src/Extractor/SolrCellTextExtractor.php
similarity index 70%
rename from code/extractors/SolrCellTextExtractor.php
rename to src/Extractor/SolrCellTextExtractor.php
index 2590153..27852f1 100644
--- a/code/extractors/SolrCellTextExtractor.php
+++ b/src/Extractor/SolrCellTextExtractor.php
@@ -1,12 +1,18 @@
config()->get('base_url')) {
@@ -33,20 +53,35 @@ class SolrCellTextExtractor extends FileTextExtractor
if (!$this->httpClient) {
$this->httpClient = new Client($this->config()->get('base_url'));
}
+
return $this->httpClient;
}
+ /**
+ *
+ * @param Guzzle\Http\Client $client
+ * @return void
+ */
public function setHttpClient($client)
{
$this->httpClient = $client;
}
+ /**
+ * @return boolean
+ */
public function isAvailable()
{
$url = $this->config()->get('base_url');
+
return (boolean) $url;
}
+ /**
+ *
+ * @param string $extension
+ * @return boolean
+ */
public function supportsExtension($extension)
{
return in_array(
@@ -59,12 +94,22 @@ class SolrCellTextExtractor extends FileTextExtractor
);
}
+ /**
+ *
+ * @param string $mime
+ * @return boolean
+ */
public function supportsMime($mime)
{
// Rely on supportsExtension
return false;
}
-
+
+ /**
+ *
+ * @param string $path
+ * @return string
+ */
public function getContent($path)
{
if (!$path) {
@@ -73,6 +118,7 @@ class SolrCellTextExtractor extends FileTextExtractor
$fileName = basename($path);
$client = $this->getHttpClient();
+
try {
$request = $client
->post()
@@ -80,27 +126,30 @@ class SolrCellTextExtractor extends FileTextExtractor
->addPostFiles(array('myfile' => $path));
$response = $request->send();
} catch (InvalidArgumentException $e) {
- SS_Log::log(
- sprintf(
+ $msg = sprintf(
'Error extracting text from "%s" (message: %s)',
$path,
$e->getMessage()
- ),
- SS_Log::NOTICE
- );
+ );
+ Injector::inst()->get(LoggerInterface::class)->notice($msg);
+
return null;
} catch (Guzzle\Http\Exception\ServerErrorResponseException $e) {
- //catch other errors that Tika can throw vai Guzzle but are not caught and break Solr search query in some cases.
- SS_Log::log(
- sprintf(
+ // Catch other errors that Tika can throw via Guzzle which would otherwise go uncaught and break the Solr search query in some cases.
+ $msg = sprintf(
'Tika server error attempting to extract from "%s" (message: %s)',
$path,
$e->getMessage()
- ),
- SS_Log::NOTICE
- );
+ );
+
+ Injector::inst()->get(LoggerInterface::class)->notice($msg);
+
return null;
}
+
+ // Just initialise it, it doesn't take much.
+ $matches = [];
+
// Use preg match to avoid SimpleXML running out of memory on large text nodes
preg_match(
sprintf('/\(.*?)\<\/str\>/s', preg_quote($fileName)),
diff --git a/code/extractors/TikaServerTextExtractor.php b/src/Extractor/TikaServerTextExtractor.php
similarity index 79%
rename from code/extractors/TikaServerTextExtractor.php
rename to src/Extractor/TikaServerTextExtractor.php
index 5e42bc9..2ae38e8 100644
--- a/code/extractors/TikaServerTextExtractor.php
+++ b/src/Extractor/TikaServerTextExtractor.php
@@ -1,5 +1,12 @@
client ?:
($this->client =
Injector::inst()->createWithArgs(
- 'TikaRestClient',
+ TikaRestClient::class,
array($this->getServerEndpoint())
)
);
}
+ /**
+ * @return string
+ */
public function getServerEndpoint()
{
- if (defined('SS_TIKA_ENDPOINT')) {
- return SS_TIKA_ENDPOINT;
- }
-
- if (getenv('SS_TIKA_ENDPOINT')) {
- return getenv('SS_TIKA_ENDPOINT');
+ if ($endpoint = Environment::getEnv('SS_TIKA_ENDPOINT')) {
+ return $endpoint;
}
// Default to configured endpoint
@@ -68,6 +74,9 @@ class TikaServerTextExtractor extends FileTextExtractor
->getVersion();
}
+ /**
+ * @return boolean
+ */
public function isAvailable()
{
return $this->getServerEndpoint() &&
@@ -75,6 +84,11 @@ class TikaServerTextExtractor extends FileTextExtractor
version_compare($this->getVersion(), '1.7.0') >= 0;
}
+ /**
+ *
+ * @param string $extension
+ * @return boolean
+ */
public function supportsExtension($extension)
{
// Determine support via mime type only
@@ -89,6 +103,11 @@ class TikaServerTextExtractor extends FileTextExtractor
*/
protected $supportedMimes = array();
+ /**
+ *
+ * @param string $mime
+ * @return boolean
+ */
public function supportsMime($mime)
{
$supported = $this->supportedMimes ?:
diff --git a/code/extractors/TikaTextExtractor.php b/src/Extractor/TikaTextExtractor.php
similarity index 78%
rename from code/extractors/TikaTextExtractor.php
rename to src/Extractor/TikaTextExtractor.php
index 0150058..0d4b18f 100644
--- a/code/extractors/TikaTextExtractor.php
+++ b/src/Extractor/TikaTextExtractor.php
@@ -1,8 +1,12 @@
config()->output_mode;
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
$code = $this->runShell($command, $output);
+
if ($code == 0) {
return $output;
}
}
+ /**
+ *
+ * @return boolean
+ */
public function isAvailable()
{
return $this->getVersion() > 0;
}
+ /**
+ *
+ * @return boolean
+ */
public function supportsExtension($extension)
{
// Determine support via mime type only
return false;
}
+
+ /**
+ *
+ * @param string $mime
+ * @return boolean
+ */
public function supportsMime($mime)
{
// Get list of supported mime types
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
+
if ($code) {
return false;
} // Error case
// Check if the mime type is inside the result
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
+
return (bool)preg_match($pattern, $supportedTypes);
}
}
diff --git a/code/tika/TikaRestClient.php b/src/Rest/TikaRestClient.php
similarity index 80%
rename from code/tika/TikaRestClient.php
rename to src/Rest/TikaRestClient.php
index 4ae6242..764dcac 100644
--- a/code/tika/TikaRestClient.php
+++ b/src/Rest/TikaRestClient.php
@@ -1,7 +1,12 @@
options = array(
- 'username' => SS_TIKA_USERNAME,
- 'password' => SS_TIKA_PASSWORD,
+ 'username' => Environment::getEnv('SS_TIKA_USERNAME'),
+ 'password' => $psswd,
);
}
+
parent::__construct($baseUrl, $config);
}
@@ -39,11 +52,14 @@ class TikaRestClient extends Client
$result = $this->get(null);
$result->setAuth($this->options['username'], $this->options['password']);
$result->send();
+
if ($result->getResponse()->getStatusCode() == 200) {
return true;
}
} catch (RequestException $ex) {
- SS_Log::log(sprintf("Tika unavailable - %s", $ex->getMessage()), SS_Log::ERR);
+ $msg = sprintf("Tika unavailable - %s", $ex->getMessage());
+ Injector::inst()->get(LoggerInterface::class)->error($msg);
+
return false;
}
}
@@ -59,12 +75,14 @@ class TikaRestClient extends Client
$response->setAuth($this->options['username'], $this->options['password']);
$response->send();
$version = 0.0;
+
// Parse output
if ($response->getResponse()->getStatusCode() == 200 &&
preg_match('/Apache Tika (?[\.\d]+)/', $response->getResponse()->getBody(), $matches)
) {
$version = (float)$matches['version'];
}
+
return $version;
}
@@ -78,12 +96,14 @@ class TikaRestClient extends Client
if ($this->mimes) {
return $this->mimes;
}
+
$response = $this->get(
'mime-types',
array('Accept' => 'application/json')
);
$response->setAuth($this->options['username'], $this->options['password']);
$response->send();
+
return $this->mimes = $response->getResponse()->json();
}
@@ -91,7 +111,7 @@ class TikaRestClient extends Client
* Extract text content from a given file.
* Logs a notice-level error if the document can't be parsed.
*
- * @param string $file Full filesystem path to a file to post
+ * @param string $file Full filesystem path to a file to post
* @return string Content of the file extracted as plain text
*/
public function tika($file)
@@ -118,8 +138,10 @@ class TikaRestClient extends Client
if ($body) {
$msg .= ' Body: ' . $body;
}
- SS_Log::log($msg, SS_Log::NOTICE);
+
+ Injector::inst()->get(LoggerInterface::class)->notice($msg);
}
+
return $text;
}
}
diff --git a/tests/FileTextCacheDatabaseTest.php b/tests/FileTextCacheDatabaseTest.php
index 6b8d784..e300c19 100644
--- a/tests/FileTextCacheDatabaseTest.php
+++ b/tests/FileTextCacheDatabaseTest.php
@@ -1,10 +1,16 @@
update('FileTextCache_Database', 'max_content_length', 5);
$cache = new FileTextCache_Database();
$file = $this->getMock('File', array('write'));