Merge pull request #21 from helpfulrobot/convert-to-psr-2

Converted to PSR-2
This commit is contained in:
Daniel Hensby 2015-11-18 23:30:07 +00:00
commit ebfa07dc5f
14 changed files with 892 additions and 809 deletions

View File

@ -1,7 +1,7 @@
<?php <?php
interface FileTextCache { interface FileTextCache
{
/** /**
* Save extracted content for a given File entity * Save extracted content for a given File entity
* *
@ -30,32 +30,34 @@ interface FileTextCache {
* Caches the extracted content on the record for the file. * Caches the extracted content on the record for the file.
* Limits the stored file content by default to avoid hitting query size limits. * Limits the stored file content by default to avoid hitting query size limits.
*/ */
class FileTextCache_Database implements FileTextCache { class FileTextCache_Database implements FileTextCache
{
public function load(File $file) { public function load(File $file)
{
return $file->FileContentCache; return $file->FileContentCache;
} }
public function save(File $file, $content) { public function save(File $file, $content)
{
$maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length'); $maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length');
$file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content; $file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content;
$file->write(); $file->write();
} }
public function invalidate(File $file) { public function invalidate(File $file)
{
// To prevent writing to the cache from invalidating it // To prevent writing to the cache from invalidating it
if(!$file->isChanged('FileContentCache')) { if (!$file->isChanged('FileContentCache')) {
$file->FileContentCache = ''; $file->FileContentCache = '';
} }
} }
} }
/** /**
* Uses SS_Cache with a lifetime to cache extracted content * Uses SS_Cache with a lifetime to cache extracted content
*/ */
class FileTextCache_SSCache implements FileTextCache, Flushable { class FileTextCache_SSCache implements FileTextCache, Flushable
{
/** /**
* Lifetime of cache in seconds * Lifetime of cache in seconds
* Null is indefinite * Null is indefinite
@ -68,38 +70,43 @@ class FileTextCache_SSCache implements FileTextCache, Flushable {
/** /**
* @return SS_Cache * @return SS_Cache
*/ */
protected static function get_cache() { protected static function get_cache()
{
$lifetime = Config::inst()->get(__CLASS__, 'lifetime'); $lifetime = Config::inst()->get(__CLASS__, 'lifetime');
$cache = SS_Cache::factory(__CLASS__); $cache = SS_Cache::factory(__CLASS__);
$cache->setLifetime($lifetime); $cache->setLifetime($lifetime);
return $cache; return $cache;
} }
protected function getKey(File $file) { protected function getKey(File $file)
{
return md5($file->getFullPath()); return md5($file->getFullPath());
} }
public function load(File $file) { public function load(File $file)
{
$key = $this->getKey($file); $key = $this->getKey($file);
$cache = self::get_cache(); $cache = self::get_cache();
return $cache->load($key); return $cache->load($key);
} }
public function save(File $file, $content) { public function save(File $file, $content)
{
$key = $this->getKey($file); $key = $this->getKey($file);
$cache = self::get_cache(); $cache = self::get_cache();
return $cache->save($content, $key); return $cache->save($content, $key);
} }
public static function flush() { public static function flush()
{
$cache = self::get_cache(); $cache = self::get_cache();
$cache->clean(); $cache->clean();
} }
public function invalidate(File $file) { public function invalidate(File $file)
{
$key = $this->getKey($file); $key = $this->getKey($file);
$cache = self::get_cache(); $cache = self::get_cache();
return $cache->remove($key); return $cache->remove($key);
} }
} }

View File

@ -9,8 +9,8 @@
* @author mstephens * @author mstephens
* *
*/ */
class FileTextExtractable extends DataExtension { class FileTextExtractable extends DataExtension
{
private static $db = array( private static $db = array(
'FileContentCache' => 'Text' 'FileContentCache' => 'Text'
); );
@ -32,14 +32,16 @@ class FileTextExtractable extends DataExtension {
* *
* @param FileTextCache $cache * @param FileTextCache $cache
*/ */
public function setTextCache(FileTextCache $cache) { public function setTextCache(FileTextCache $cache)
{
$this->fileTextCache = $cache; $this->fileTextCache = $cache;
} }
/** /**
* @return FileTextCache * @return FileTextCache
*/ */
public function getTextCache() { public function getTextCache()
{
return $this->fileTextCache; return $this->fileTextCache;
} }
@ -48,7 +50,8 @@ class FileTextExtractable extends DataExtension {
* *
* @return string * @return string
*/ */
public function getFileContent() { public function getFileContent()
{
return $this->extractFileAsText(); return $this->extractFileAsText();
} }
@ -60,10 +63,11 @@ class FileTextExtractable extends DataExtension {
* If true, the content parsing is forced, bypassing the cached version * If true, the content parsing is forced, bypassing the cached version
* @return string * @return string
*/ */
public function extractFileAsText($disableCache = false) { public function extractFileAsText($disableCache = false)
{
if (!$disableCache) { if (!$disableCache) {
$text = $this->getTextCache()->load($this->owner); $text = $this->getTextCache()->load($this->owner);
if($text) { if ($text) {
return $text; return $text;
} }
} }
@ -84,7 +88,8 @@ class FileTextExtractable extends DataExtension {
return $text; return $text;
} }
public function onBeforeWrite() { public function onBeforeWrite()
{
// Clear cache before changing file // Clear cache before changing file
$this->getTextCache()->invalidate($this->owner); $this->getTextCache()->invalidate($this->owner);
} }

View File

@ -5,8 +5,8 @@
* @author mstephens * @author mstephens
* *
*/ */
abstract class FileTextExtractor extends Object { abstract class FileTextExtractor extends Object
{
/** /**
* Set priority from 0-100. * Set priority from 0-100.
* The highest priority extractor for a given content type will be selected. * The highest priority extractor for a given content type will be selected.
@ -28,15 +28,18 @@ abstract class FileTextExtractor extends Object {
* *
* @return array * @return array
*/ */
protected static function get_extractor_classes() { protected static function get_extractor_classes()
{
// Check cache // Check cache
if (self::$sorted_extractor_classes) return self::$sorted_extractor_classes; if (self::$sorted_extractor_classes) {
return self::$sorted_extractor_classes;
}
// Generate the sorted list of extractors on demand. // Generate the sorted list of extractors on demand.
$classes = ClassInfo::subclassesFor("FileTextExtractor"); $classes = ClassInfo::subclassesFor("FileTextExtractor");
array_shift($classes); array_shift($classes);
$classPriorities = array(); $classPriorities = array();
foreach($classes as $class) { foreach ($classes as $class) {
$classPriorities[$class] = Config::inst()->get($class, 'priority'); $classPriorities[$class] = Config::inst()->get($class, 'priority');
} }
arsort($classPriorities); arsort($classPriorities);
@ -52,7 +55,8 @@ abstract class FileTextExtractor extends Object {
* @param string $class * @param string $class
* @return FileTextExtractor * @return FileTextExtractor
*/ */
protected static function get_extractor($class) { protected static function get_extractor($class)
{
return Injector::inst()->get($class); return Injector::inst()->get($class);
} }
@ -62,7 +66,8 @@ abstract class FileTextExtractor extends Object {
* @param string $path * @param string $path
* @return string Mime type if found * @return string Mime type if found
*/ */
protected static function get_mime($path) { protected static function get_mime($path)
{
$file = new Symfony\Component\HttpFoundation\File\File($path); $file = new Symfony\Component\HttpFoundation\File\File($path);
return $file->getMimeType(); return $file->getMimeType();
@ -72,26 +77,29 @@ abstract class FileTextExtractor extends Object {
* @param string $path * @param string $path
* @return FileTextExtractor|null * @return FileTextExtractor|null
*/ */
static function for_file($path) { public static function for_file($path)
if(!file_exists($path) || is_dir($path)) { {
if (!file_exists($path) || is_dir($path)) {
return; return;
} }
$extension = pathinfo($path, PATHINFO_EXTENSION); $extension = pathinfo($path, PATHINFO_EXTENSION);
$mime = self::get_mime($path); $mime = self::get_mime($path);
foreach(self::get_extractor_classes() as $className) { foreach (self::get_extractor_classes() as $className) {
$extractor = self::get_extractor($className); $extractor = self::get_extractor($className);
// Skip unavailable extractors // Skip unavailable extractors
if(!$extractor->isAvailable()) continue; if (!$extractor->isAvailable()) {
continue;
}
// Check extension // Check extension
if($extension && $extractor->supportsExtension($extension)) { if ($extension && $extractor->supportsExtension($extension)) {
return $extractor; return $extractor;
} }
// Check mime // Check mime
if($mime && $extractor->supportsMime($mime)) { if ($mime && $extractor->supportsMime($mime)) {
return $extractor; return $extractor;
} }
} }
@ -132,4 +140,6 @@ abstract class FileTextExtractor extends Object {
abstract public function getContent($path); abstract public function getContent($path);
} }
class FileTextExtractor_Exception extends Exception {} class FileTextExtractor_Exception extends Exception
{
}

View File

@ -5,20 +5,23 @@
* @author mstephens * @author mstephens
* *
*/ */
class HTMLTextExtractor extends FileTextExtractor { class HTMLTextExtractor extends FileTextExtractor
{
public function isAvailable() { public function isAvailable()
{
return true; return true;
} }
public function supportsExtension($extension) { public function supportsExtension($extension)
{
return in_array( return in_array(
strtolower($extension), strtolower($extension),
array("html", "htm", "xhtml") array("html", "htm", "xhtml")
); );
} }
public function supportsMime($mime) { public function supportsMime($mime)
{
return strtolower($mime) === 'text/html'; return strtolower($mime) === 'text/html';
} }
@ -38,7 +41,8 @@ class HTMLTextExtractor extends FileTextExtractor {
* @param string $path * @param string $path
* @return string * @return string
*/ */
public function getContent($path) { public function getContent($path)
{
$content = file_get_contents($path); $content = file_get_contents($path);
// Yes, yes, regex'ing HTML is evil. // Yes, yes, regex'ing HTML is evil.
// Since we don't care about well-formedness or markup here, it does the job. // Since we don't care about well-formedness or markup here, it does the job.
@ -64,7 +68,7 @@ class HTMLTextExtractor extends FileTextExtractor {
'@</?((frameset)|(frame)|(iframe))@iu', '@</?((frameset)|(frame)|(iframe))@iu',
), ),
array( array(
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',"$0", "$0", "$0", "$0", "$0", "$0","$0", "$0", ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0",
), ),
$content $content
); );

View File

@ -5,18 +5,21 @@
* @author mstephens * @author mstephens
* *
*/ */
class PDFTextExtractor extends FileTextExtractor { class PDFTextExtractor extends FileTextExtractor
{
public function isAvailable() { public function isAvailable()
{
$bin = $this->bin('pdftotext'); $bin = $this->bin('pdftotext');
return (file_exists($bin) && is_executable($bin)); return (file_exists($bin) && is_executable($bin));
} }
public function supportsExtension($extension) { public function supportsExtension($extension)
{
return strtolower($extension) === 'pdf'; return strtolower($extension) === 'pdf';
} }
public function supportsMime($mime) { public function supportsMime($mime)
{
return in_array( return in_array(
strtolower($mime), strtolower($mime),
array( array(
@ -34,7 +37,8 @@ class PDFTextExtractor extends FileTextExtractor {
* @param string $prog Name of binary * @param string $prog Name of binary
* @return string * @return string
*/ */
protected function bin($prog = '') { protected function bin($prog = '')
{
if ($this->config()->binary_location) { if ($this->config()->binary_location) {
// By config // By config
$path = $this->config()->binary_location; $path = $this->config()->binary_location;
@ -47,11 +51,14 @@ class PDFTextExtractor extends FileTextExtractor {
$path = '.'; // Hope it's in path $path = '.'; // Hope it's in path
} }
return ( $path ? $path . '/' : '' ) . $prog; return ($path ? $path . '/' : '') . $prog;
} }
public function getContent($path) { public function getContent($path)
if(!$path) return ""; // no file {
if (!$path) {
return "";
} // no file
$content = $this->getRawOutput($path); $content = $this->getRawOutput($path);
return $this->cleanupLigatures($content); return $this->cleanupLigatures($content);
} }
@ -63,9 +70,10 @@ class PDFTextExtractor extends FileTextExtractor {
* @return string Output * @return string Output
* @throws FileTextExtractor_Exception * @throws FileTextExtractor_Exception
*/ */
protected function getRawOutput($path) { protected function getRawOutput($path)
{
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err); exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
if($err) { if ($err) {
throw new FileTextExtractor_Exception(sprintf( throw new FileTextExtractor_Exception(sprintf(
'PDFTextExtractor->getContent() failed for %s: %s', 'PDFTextExtractor->getContent() failed for %s: %s',
$path, $path,
@ -83,7 +91,8 @@ class PDFTextExtractor extends FileTextExtractor {
* @param string $input * @param string $input
* @return string * @return string
*/ */
protected function cleanupLigatures($input) { protected function cleanupLigatures($input)
{
$mapping = array( $mapping = array(
'ﬀ' => 'ff', 'ﬀ' => 'ff',
'ﬁ' => 'fi', 'ﬁ' => 'fi',

View File

@ -10,8 +10,8 @@ use Guzzle\Http\Client;
* @author ischommer * @author ischommer
* @see http://wiki.apache.org/solr/ExtractingRequestHandler * @see http://wiki.apache.org/solr/ExtractingRequestHandler
*/ */
class SolrCellTextExtractor extends FileTextExtractor { class SolrCellTextExtractor extends FileTextExtractor
{
/** /**
* Base URL to use for solr text extraction. * Base URL to use for solr text extraction.
* E.g. http://localhost:8983/solr/update/extract * E.g. http://localhost:8983/solr/update/extract
@ -25,24 +25,30 @@ class SolrCellTextExtractor extends FileTextExtractor {
protected $httpClient; protected $httpClient;
public function getHttpClient() { public function getHttpClient()
if(!$this->config()->get('base_url')) { {
if (!$this->config()->get('base_url')) {
throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified'); throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
} }
if(!$this->httpClient) $this->httpClient = new Client($this->config()->get('base_url')); if (!$this->httpClient) {
$this->httpClient = new Client($this->config()->get('base_url'));
}
return $this->httpClient; return $this->httpClient;
} }
public function setHttpClient($client) { public function setHttpClient($client)
{
$this->httpClient = $client; $this->httpClient = $client;
} }
public function isAvailable() { public function isAvailable()
{
$url = $this->config()->get('base_url'); $url = $this->config()->get('base_url');
return (boolean) $url; return (boolean) $url;
} }
public function supportsExtension($extension) { public function supportsExtension($extension)
{
return in_array( return in_array(
strtolower($extension), strtolower($extension),
array( array(
@ -53,13 +59,17 @@ class SolrCellTextExtractor extends FileTextExtractor {
); );
} }
public function supportsMime($mime) { public function supportsMime($mime)
{
// Rely on supportsExtension // Rely on supportsExtension
return false; return false;
} }
public function getContent($path) { public function getContent($path)
if (!$path) return ""; // no file {
if (!$path) {
return "";
} // no file
$fileName = basename($path); $fileName = basename($path);
$client = $this->getHttpClient(); $client = $this->getHttpClient();
@ -69,7 +79,7 @@ class SolrCellTextExtractor extends FileTextExtractor {
->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text')) ->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text'))
->addPostFiles(array('myfile' => $path)); ->addPostFiles(array('myfile' => $path));
$response = $request->send(); $response = $request->send();
} catch(InvalidArgumentException $e) { } catch (InvalidArgumentException $e) {
SS_Log::log( SS_Log::log(
sprintf( sprintf(
'Error extracting text from "%s" (message: %s)', 'Error extracting text from "%s" (message: %s)',

View File

@ -5,8 +5,8 @@
* *
* {@link http://tika.apache.org/1.7/gettingstarted.html} * {@link http://tika.apache.org/1.7/gettingstarted.html}
*/ */
class TikaServerTextExtractor extends FileTextExtractor { class TikaServerTextExtractor extends FileTextExtractor
{
/** /**
* Tika server is pretty efficient so use it immediately if available * Tika server is pretty efficient so use it immediately if available
* *
@ -31,7 +31,8 @@ class TikaServerTextExtractor extends FileTextExtractor {
/** /**
* @return TikaRestClient * @return TikaRestClient
*/ */
public function getClient() { public function getClient()
{
return $this->client ?: return $this->client ?:
($this->client = ($this->client =
Injector::inst()->createWithArgs( Injector::inst()->createWithArgs(
@ -41,12 +42,15 @@ class TikaServerTextExtractor extends FileTextExtractor {
); );
} }
public function getServerEndpoint() { public function getServerEndpoint()
if(defined('SS_TIKA_ENDPOINT')) { {
if (defined('SS_TIKA_ENDPOINT')) {
return SS_TIKA_ENDPOINT; return SS_TIKA_ENDPOINT;
} }
if(getenv('SS_TIKA_ENDPOINT')) return getenv('SS_TIKA_ENDPOINT'); if (getenv('SS_TIKA_ENDPOINT')) {
return getenv('SS_TIKA_ENDPOINT');
}
// Default to configured endpoint // Default to configured endpoint
return $this->config()->server_endpoint; return $this->config()->server_endpoint;
@ -57,19 +61,22 @@ class TikaServerTextExtractor extends FileTextExtractor {
* *
* @return float version of tika * @return float version of tika
*/ */
public function getVersion() { public function getVersion()
{
return $this return $this
->getClient() ->getClient()
->getVersion(); ->getVersion();
} }
public function isAvailable() { public function isAvailable()
{
return $this->getServerEndpoint() && return $this->getServerEndpoint() &&
$this->getClient()->isAvailable() && $this->getClient()->isAvailable() &&
$this->getVersion() >= 1.7; $this->getVersion() >= 1.7;
} }
public function supportsExtension($extension) { public function supportsExtension($extension)
{
// Determine support via mime type only // Determine support via mime type only
return false; return false;
} }
@ -82,23 +89,28 @@ class TikaServerTextExtractor extends FileTextExtractor {
*/ */
protected $supportedMimes = array(); protected $supportedMimes = array();
public function supportsMime($mime) { public function supportsMime($mime)
{
$supported = $this->supportedMimes ?: $supported = $this->supportedMimes ?:
($this->supportedMimes = $this->getClient()->getSupportedMimes()); ($this->supportedMimes = $this->getClient()->getSupportedMimes());
// Check if supported (most common / quickest lookup) // Check if supported (most common / quickest lookup)
if(isset($supported[$mime])) return true; if (isset($supported[$mime])) {
return true;
}
// Check aliases // Check aliases
foreach($supported as $info) { foreach ($supported as $info) {
if(isset($info['alias']) && in_array($mime, $info['alias'])) return true; if (isset($info['alias']) && in_array($mime, $info['alias'])) {
return true;
}
} }
return false; return false;
} }
public function getContent($path) { public function getContent($path)
{
return $this->getClient()->tika($path); return $this->getClient()->tika($path);
} }
} }

View File

@ -5,8 +5,8 @@
* *
* {@link http://tika.apache.org/1.7/gettingstarted.html} * {@link http://tika.apache.org/1.7/gettingstarted.html}
*/ */
class TikaTextExtractor extends FileTextExtractor { class TikaTextExtractor extends FileTextExtractor
{
/** /**
* Text extraction mode. Defaults to -t (plain text) * Text extraction mode. Defaults to -t (plain text)
* *
@ -20,11 +20,12 @@ class TikaTextExtractor extends FileTextExtractor {
* *
* @return float version of tika * @return float version of tika
*/ */
public function getVersion() { public function getVersion()
{
$code = $this->runShell('tika --version', $stdout); $code = $this->runShell('tika --version', $stdout);
// Parse output // Parse output
if(!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches)) { if (!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches)) {
return $matches['version']; return $matches['version'];
} }
@ -40,7 +41,8 @@ class TikaTextExtractor extends FileTextExtractor {
* @param string $input Content to pass via standard input * @param string $input Content to pass via standard input
* @return int Exit code. 0 is success * @return int Exit code. 0 is success
*/ */
protected function runShell($command, &$stdout = '', &$stderr = '', $input = '') { protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
{
$descriptorSpecs = array( $descriptorSpecs = array(
0 => array("pipe", "r"), 0 => array("pipe", "r"),
1 => array("pipe", "w"), 1 => array("pipe", "w"),
@ -49,7 +51,9 @@ class TikaTextExtractor extends FileTextExtractor {
// Invoke command // Invoke command
$pipes = array(); $pipes = array();
$proc = proc_open($command, $descriptorSpecs, $pipes); $proc = proc_open($command, $descriptorSpecs, $pipes);
if (!is_resource($proc)) return 255; if (!is_resource($proc)) {
return 255;
}
// Send content as input // Send content as input
fwrite($pipes[0], $input); fwrite($pipes[0], $input);
@ -65,30 +69,37 @@ class TikaTextExtractor extends FileTextExtractor {
return proc_close($proc); return proc_close($proc);
} }
public function getContent($path) { public function getContent($path)
{
$mode = $this->config()->output_mode; $mode = $this->config()->output_mode;
$command = sprintf('tika %s %s', $mode, escapeshellarg($path)); $command = sprintf('tika %s %s', $mode, escapeshellarg($path));
$code = $this->runShell($command, $output); $code = $this->runShell($command, $output);
if($code == 0) return $output; if ($code == 0) {
return $output;
}
} }
public function isAvailable() { public function isAvailable()
{
return $this->getVersion() > 0; return $this->getVersion() > 0;
} }
public function supportsExtension($extension) { public function supportsExtension($extension)
{
// Determine support via mime type only // Determine support via mime type only
return false; return false;
} }
public function supportsMime($mime) { public function supportsMime($mime)
{
// Get list of supported mime types // Get list of supported mime types
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error); $code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
if($code) return false; // Error case if ($code) {
return false;
} // Error case
// Check if the mime type is inside the result // Check if the mime type is inside the result
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/')); $pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
return (bool)preg_match($pattern, $supportedTypes); return (bool)preg_match($pattern, $supportedTypes);
} }
} }

View File

@ -3,14 +3,15 @@
use Guzzle\Http\Client; use Guzzle\Http\Client;
use Guzzle\Http\Exception\RequestException; use Guzzle\Http\Exception\RequestException;
class TikaRestClient extends Client { class TikaRestClient extends Client
{
/** /**
* Detect if the service is available * Detect if the service is available
* *
* @return bool * @return bool
*/ */
public function isAvailable() { public function isAvailable()
{
try { try {
return $this return $this
->get()->send() ->get()->send()
@ -25,10 +26,11 @@ class TikaRestClient extends Client {
* *
* @return float * @return float
*/ */
public function getVersion() { public function getVersion()
{
$response = $this->get('version')->send(); $response = $this->get('version')->send();
// Parse output // Parse output
if($response->getStatusCode() == 200 && if ($response->getStatusCode() == 200 &&
preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches) preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)
) { ) {
return (float)$matches['version']; return (float)$matches['version'];
@ -44,8 +46,11 @@ class TikaRestClient extends Client {
* *
* @return array * @return array
*/ */
public function getSupportedMimes() { public function getSupportedMimes()
if($this->mimes) return $this->mimes; {
if ($this->mimes) {
return $this->mimes;
}
$response = $this->get( $response = $this->get(
'mime-types', 'mime-types',
@ -62,7 +67,8 @@ class TikaRestClient extends Client {
* @param string $file Full filesystem path to a file to post * @param string $file Full filesystem path to a file to post
* @return string Content of the file extracted as plain text * @return string Content of the file extracted as plain text
*/ */
public function tika($file) { public function tika($file)
{
$text = null; $text = null;
try { try {
$response = $this->put( $response = $this->put(
@ -71,7 +77,7 @@ class TikaRestClient extends Client {
file_get_contents($file) file_get_contents($file)
)->send(); )->send();
$text = $response->getBody(true); $text = $response->getBody(true);
} catch(RequestException $e) { } catch (RequestException $e) {
$msg = sprintf( $msg = sprintf(
'TikaRestClient was not able to process %s. Response: %s %s.', 'TikaRestClient was not able to process %s. Response: %s %s.',
$file, $file,
@ -81,7 +87,7 @@ class TikaRestClient extends Client {
// Only available if tika-server was started with --includeStack // Only available if tika-server was started with --includeStack
$body = $e->getResponse()->getBody(true); $body = $e->getResponse()->getBody(true);
if($body) { if ($body) {
$msg .= ' Body: ' . $body; $msg .= ' Body: ' . $body;
} }
@ -90,5 +96,4 @@ class TikaRestClient extends Client {
return $text; return $text;
} }
} }

View File

@ -1,7 +1,8 @@
<?php <?php
class FileTextCacheDatabaseTest extends SapphireTest { class FileTextCacheDatabaseTest extends SapphireTest
{
public function testTruncatesByMaxLength() { public function testTruncatesByMaxLength()
{
Config::nest(); Config::nest();
Config::inst()->update('FileTextCache_Database', 'max_content_length', 5); Config::inst()->update('FileTextCache_Database', 'max_content_length', 5);
@ -13,5 +14,4 @@ class FileTextCacheDatabaseTest extends SapphireTest {
Config::unnest(); Config::unnest();
} }
} }

View File

@ -1,11 +1,12 @@
<?php <?php
class FileTextExtractableTest extends SapphireTest { class FileTextExtractableTest extends SapphireTest
{
protected $requiredExtensions = array( protected $requiredExtensions = array(
'File' => array('FileTextExtractable') 'File' => array('FileTextExtractable')
); );
public function setUp() { public function setUp()
{
parent::setUp(); parent::setUp();
// Ensure that html is a valid extension // Ensure that html is a valid extension
@ -14,15 +15,17 @@ class FileTextExtractableTest extends SapphireTest {
->update('File', 'allowed_extensions', array('html')); ->update('File', 'allowed_extensions', array('html'));
} }
public function tearDown() { public function tearDown()
{
Config::unnest(); Config::unnest();
parent::tearDown(); parent::tearDown();
} }
function testExtractFileAsText() { public function testExtractFileAsText()
{
// Create a copy of the file, as it may be clobbered by the test // Create a copy of the file, as it may be clobbered by the test
// ($file->extractFileAsText() calls $file->write) // ($file->extractFileAsText() calls $file->write)
copy(BASE_PATH.'/textextraction/tests/fixtures/test1.html',BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html'); copy(BASE_PATH.'/textextraction/tests/fixtures/test1.html', BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
// Use HTML, since the extractor is always available // Use HTML, since the extractor is always available
$file = new File(array( $file = new File(array(
@ -36,8 +39,8 @@ class FileTextExtractableTest extends SapphireTest {
$this->assertContains('Test Text', $content); $this->assertContains('Test Text', $content);
$this->assertEquals($content, $file->FileContentCache); $this->assertEquals($content, $file->FileContentCache);
if(file_exists(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html')) unlink(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html'); if (file_exists(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html')) {
unlink(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
}
} }
} }

View File

@ -1,7 +1,8 @@
<?php <?php
class HTMLTextExtractorTest extends SapphireTest { class HTMLTextExtractorTest extends SapphireTest
{
function testExtraction() { public function testExtraction()
{
$extractor = new HTMLTextExtractor(); $extractor = new HTMLTextExtractor();
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html'); $content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html');
@ -10,5 +11,4 @@ class HTMLTextExtractorTest extends SapphireTest {
$this->assertNotContains('Test Style', $content, 'Strips non-content style tags'); $this->assertNotContains('Test Style', $content, 'Strips non-content style tags');
$this->assertNotContains('Test Script', $content, 'Strips non-content script tags'); $this->assertNotContains('Test Script', $content, 'Strips non-content script tags');
} }
} }

View File

@ -1,12 +1,14 @@
<?php <?php
class PDFTextExtractorTest extends SapphireTest { class PDFTextExtractorTest extends SapphireTest
{
function testExtraction() { public function testExtraction()
{
$extractor = new PDFTextExtractor(); $extractor = new PDFTextExtractor();
if(!$extractor->isAvailable()) $this->markTestSkipped('pdftotext not available'); if (!$extractor->isAvailable()) {
$this->markTestSkipped('pdftotext not available');
}
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'); $content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf');
$this->assertContains('This is a test file with a link', $content); $this->assertContains('This is a test file with a link', $content);
} }
} }

View File

@ -3,11 +3,14 @@
/** /**
* Tests the {@see TikaTextExtractor} class * Tests the {@see TikaTextExtractor} class
*/ */
class TikaTextExtractorTest extends SapphireTest { class TikaTextExtractorTest extends SapphireTest
{
function testExtraction() { public function testExtraction()
{
$extractor = new TikaTextExtractor(); $extractor = new TikaTextExtractor();
if(!$extractor->isAvailable()) $this->markTestSkipped('tika cli not available'); if (!$extractor->isAvailable()) {
$this->markTestSkipped('tika cli not available');
}
// Check file // Check file
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'; $file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
@ -20,9 +23,12 @@ class TikaTextExtractorTest extends SapphireTest {
$this->assertFalse($extractor->supportsMime('application/not-supported')); $this->assertFalse($extractor->supportsMime('application/not-supported'));
} }
function testServerExtraction() { public function testServerExtraction()
{
$extractor = new TikaServerTextExtractor(); $extractor = new TikaServerTextExtractor();
if(!$extractor->isAvailable()) $this->markTestSkipped('tika server not available'); if (!$extractor->isAvailable()) {
$this->markTestSkipped('tika server not available');
}
// Check file // Check file
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'; $file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
@ -34,5 +40,4 @@ class TikaTextExtractorTest extends SapphireTest {
$this->assertTrue($extractor->supportsMime('text/html')); $this->assertTrue($extractor->supportsMime('text/html'));
$this->assertFalse($extractor->supportsMime('application/not-supported')); $this->assertFalse($extractor->supportsMime('application/not-supported'));
} }
} }