diff --git a/code/extensions/FileTextCache.php b/code/extensions/FileTextCache.php
index cd1d046..385d848 100644
--- a/code/extensions/FileTextCache.php
+++ b/code/extensions/FileTextCache.php
@@ -1,105 +1,112 @@
FileContentCache;
- }
+class FileTextCache_Database implements FileTextCache
+{
+ /**
+  * Read the cached text straight from the file record.
+  *
+  * @param File $file Record whose FileContentCache field is read
+  * @return mixed Current value of the FileContentCache field
+  */
+ public function load(File $file)
+ {
+     $cachedText = $file->FileContentCache;
+
+     return $cachedText;
+ }
- public function save(File $file, $content) {
- $maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length');
- $file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content;
- $file->write();
- }
-
- public function invalidate(File $file) {
- // To prevent writing to the cache from invalidating it
- if(!$file->isChanged('FileContentCache')) {
- $file->FileContentCache = '';
- }
- }
+ /**
+  * Store extracted text on the file record and write the record.
+  *
+  * When the max_content_length config value is set, the text is truncated to
+  * at most that many bytes. Where the mbstring extension is available the cut
+  * is moved back to a character boundary, so a multi-byte UTF-8 character is
+  * never left half-written at the end of the stored value.
+  * NOTE(review): assumes extracted content is UTF-8 - confirm against extractors.
+  *
+  * @param File $file Record to store the text on
+  * @param string $content Extracted text
+  */
+ public function save(File $file, $content)
+ {
+     $maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length');
+     if ($maxLength) {
+         // mb_strcut() counts bytes like substr() but backs up to the start of a character
+         $content = function_exists('mb_strcut')
+             ? mb_strcut($content, 0, $maxLength, 'UTF-8')
+             : substr($content, 0, $maxLength);
+     }
+
+     $file->FileContentCache = $content;
+     $file->write();
+ }
+ /**
+  * Blank the cached text on the record, unless the cache field itself is
+  * what has changed on the record.
+  *
+  * @param File $file
+  */
+ public function invalidate(File $file)
+ {
+     // To prevent writing to the cache from invalidating it
+     if ($file->isChanged('FileContentCache')) {
+         return;
+     }
+
+     $file->FileContentCache = '';
+ }
}
/**
* Uses SS_Cache with a lifetime to cache extracted content
*/
-class FileTextCache_SSCache implements FileTextCache, Flushable {
+class FileTextCache_SSCache implements FileTextCache, Flushable
+{
+ /**
+ * Lifetime of cache in seconds
+ * Null is indefinite
+ *
+ * @var int|null
+ * @config
+ */
+ private static $lifetime = null;
- /**
- * Lifetime of cache in seconds
- * Null is indefinite
- *
- * @var int|null
- * @config
- */
- private static $lifetime = null;
+ /**
+  * Obtain the cache object for this class from SS_Cache::factory() and
+  * apply the configured 'lifetime' value to it.
+  *
+  * @return SS_Cache
+  */
+ protected static function get_cache()
+ {
+     $configuredLifetime = Config::inst()->get(__CLASS__, 'lifetime');
+
+     $backend = SS_Cache::factory(__CLASS__);
+     $backend->setLifetime($configuredLifetime);
+
+     return $backend;
+ }
- /**
- * @return SS_Cache
- */
- protected static function get_cache() {
- $lifetime = Config::inst()->get(__CLASS__, 'lifetime');
- $cache = SS_Cache::factory(__CLASS__);
- $cache->setLifetime($lifetime);
- return $cache;
- }
+ /**
+  * Derive the cache key for a file: the MD5 hex digest of its full path.
+  *
+  * @param File $file
+  * @return string
+  */
+ protected function getKey(File $file)
+ {
+     $fullPath = $file->getFullPath();
+
+     return md5($fullPath);
+ }
- protected function getKey(File $file) {
- return md5($file->getFullPath());
- }
+ /**
+  * Look up the cached text for a file by its path-derived key.
+  *
+  * @param File $file
+  * @return mixed Whatever the cache returns for the key
+  */
+ public function load(File $file)
+ {
+     $cacheKey = $this->getKey($file);
+
+     return self::get_cache()->load($cacheKey);
+ }
- public function load(File $file) {
- $key = $this->getKey($file);
- $cache = self::get_cache();
- return $cache->load($key);
- }
+ /**
+  * Store extracted text in the cache under the file's path-derived key.
+  *
+  * @param File $file
+  * @param string $content Extracted text
+  * @return mixed Result of the cache's save() call
+  */
+ public function save(File $file, $content)
+ {
+     $cacheKey = $this->getKey($file);
+
+     return self::get_cache()->save($content, $cacheKey);
+ }
- public function save(File $file, $content) {
- $key = $this->getKey($file);
- $cache = self::get_cache();
- return $cache->save($content, $key);
- }
-
- public static function flush() {
- $cache = self::get_cache();
- $cache->clean();
- }
-
- public function invalidate(File $file) {
- $key = $this->getKey($file);
- $cache = self::get_cache();
- return $cache->remove($key);
- }
+ /**
+  * Flushable hook: calls clean() on this class's cache.
+  */
+ public static function flush()
+ {
+     self::get_cache()->clean();
+ }
+ /**
+  * Remove the cache entry stored under the file's path-derived key.
+  *
+  * @param File $file
+  * @return mixed Result of the cache's remove() call
+  */
+ public function invalidate(File $file)
+ {
+     $cacheKey = $this->getKey($file);
+
+     return self::get_cache()->remove($cacheKey);
+ }
}
diff --git a/code/extensions/FileTextExtractable.php b/code/extensions/FileTextExtractable.php
index 16110e3..f39d407 100644
--- a/code/extensions/FileTextExtractable.php
+++ b/code/extensions/FileTextExtractable.php
@@ -9,83 +9,88 @@
* @author mstephens
*
*/
-class FileTextExtractable extends DataExtension {
-
- private static $db = array(
- 'FileContentCache' => 'Text'
- );
+class FileTextExtractable extends DataExtension
+{
+ private static $db = array(
+ 'FileContentCache' => 'Text'
+ );
- private static $casting = array(
- 'FileContent' => 'Text'
- );
+ private static $casting = array(
+ 'FileContent' => 'Text'
+ );
- private static $dependencies = array(
- 'TextCache' => '%$FileTextCache'
- );
+ private static $dependencies = array(
+ 'TextCache' => '%$FileTextCache'
+ );
- /**
- * @var FileTextCache
- */
- protected $fileTextCache = null;
+ /**
+ * @var FileTextCache
+ */
+ protected $fileTextCache = null;
- /**
- *
- * @param FileTextCache $cache
- */
- public function setTextCache(FileTextCache $cache) {
- $this->fileTextCache = $cache;
- }
+ /**
+ * Assign the cache object this extension uses to load, save and
+ * invalidate extracted text.
+ *
+ * @param FileTextCache $cache
+ */
+ public function setTextCache(FileTextCache $cache)
+ {
+ $this->fileTextCache = $cache;
+ }
- /**
- * @return FileTextCache
- */
- public function getTextCache() {
- return $this->fileTextCache;
- }
+ /**
+ * Get the cache object previously assigned through setTextCache().
+ *
+ * @return FileTextCache Null until a cache has been assigned
+ */
+ public function getTextCache()
+ {
+ return $this->fileTextCache;
+ }
- /**
- * Helper function for template
- *
- * @return string
- */
- public function getFileContent() {
- return $this->extractFileAsText();
- }
+ /**
+ * Helper function for template.
+ * Delegates to extractFileAsText() with its default arguments.
+ *
+ * @return string
+ */
+ public function getFileContent()
+ {
+ return $this->extractFileAsText();
+ }
- /**
- * Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text.
- * The value is also cached into the File record itself.
- *
- * @param boolean $disableCache If false, the file content is only parsed on demand.
- * If true, the content parsing is forced, bypassing the cached version
- * @return string
- */
- public function extractFileAsText($disableCache = false) {
- if (!$disableCache) {
- $text = $this->getTextCache()->load($this->owner);
- if($text) {
- return $text;
- }
- }
+ /**
+ * Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text.
+ * The value is also cached into the File record itself.
+ *
+ * @param boolean $disableCache If false, the file content is only parsed on demand.
+ * If true, the content parsing is forced, bypassing the cached version
+ * @return string
+ */
+ public function extractFileAsText($disableCache = false)
+ {
+ if (!$disableCache) {
+ $text = $this->getTextCache()->load($this->owner);
+ if ($text) {
+ return $text;
+ }
+ }
- // Determine which extractor can process this file.
- $extractor = FileTextExtractor::for_file($this->owner->FullPath);
- if (!$extractor) {
- return null;
- }
+ // Determine which extractor can process this file.
+ $extractor = FileTextExtractor::for_file($this->owner->FullPath);
+ if (!$extractor) {
+ return null;
+ }
- $text = $extractor->getContent($this->owner->FullPath);
- if (!$text) {
- return null;
- }
+ $text = $extractor->getContent($this->owner->FullPath);
+ if (!$text) {
+ return null;
+ }
- $this->getTextCache()->save($this->owner, $text);
+ $this->getTextCache()->save($this->owner, $text);
- return $text;
- }
+ return $text;
+ }
- public function onBeforeWrite() {
- // Clear cache before changing file
- $this->getTextCache()->invalidate($this->owner);
- }
+ /**
+  * Invalidate the cached text of the owning record before it is written.
+  */
+ public function onBeforeWrite()
+ {
+     // Clear cache before changing file
+     $textCache = $this->getTextCache();
+     $textCache->invalidate($this->owner);
+ }
}
diff --git a/code/extractors/FileTextExtractor.php b/code/extractors/FileTextExtractor.php
index b22e98e..cc7c176 100644
--- a/code/extractors/FileTextExtractor.php
+++ b/code/extractors/FileTextExtractor.php
@@ -5,131 +5,141 @@
* @author mstephens
*
*/
-abstract class FileTextExtractor extends Object {
+abstract class FileTextExtractor extends Object
+{
+ /**
+ * Set priority from 0-100.
+ * The highest priority extractor for a given content type will be selected.
+ *
+ * @config
+ * @var integer
+ */
+ private static $priority = 50;
- /**
- * Set priority from 0-100.
- * The highest priority extractor for a given content type will be selected.
- *
- * @config
- * @var integer
- */
- private static $priority = 50;
+ /**
+ * Cache of extractor class names, sorted by priority
+ *
+ * @var array
+ */
+ protected static $sorted_extractor_classes = null;
- /**
- * Cache of extractor class names, sorted by priority
- *
- * @var array
- */
- protected static $sorted_extractor_classes = null;
+ /**
+ * Gets the list of prioritised extractor classes
+ *
+ * @return array
+ */
+ protected static function get_extractor_classes()
+ {
+ // Check cache
+ if (self::$sorted_extractor_classes) {
+ return self::$sorted_extractor_classes;
+ }
+
+ // Generate the sorted list of extractors on demand.
+ $classes = ClassInfo::subclassesFor("FileTextExtractor");
+ array_shift($classes);
+ $classPriorities = array();
+ foreach ($classes as $class) {
+ $classPriorities[$class] = Config::inst()->get($class, 'priority');
+ }
+ arsort($classPriorities);
- /**
- * Gets the list of prioritised extractor classes
- *
- * @return array
- */
- protected static function get_extractor_classes() {
- // Check cache
- if (self::$sorted_extractor_classes) return self::$sorted_extractor_classes;
-
- // Generate the sorted list of extractors on demand.
- $classes = ClassInfo::subclassesFor("FileTextExtractor");
- array_shift($classes);
- $classPriorities = array();
- foreach($classes as $class) {
- $classPriorities[$class] = Config::inst()->get($class, 'priority');
- }
- arsort($classPriorities);
+ // Save classes
+ $sortedClasses = array_keys($classPriorities);
+ return self::$sorted_extractor_classes = $sortedClasses;
+ }
- // Save classes
- $sortedClasses = array_keys($classPriorities);
- return self::$sorted_extractor_classes = $sortedClasses;
- }
+ /**
+  * Resolve the extractor instance for a class name through the Injector.
+  *
+  * @param string $class Extractor class name
+  * @return FileTextExtractor
+  */
+ protected static function get_extractor($class)
+ {
+     $injector = Injector::inst();
+
+     return $injector->get($class);
+ }
- /**
- * Get the text file extractor for the given class
- *
- * @param string $class
- * @return FileTextExtractor
- */
- protected static function get_extractor($class) {
- return Injector::inst()->get($class);
- }
+ /**
+ * Attempt to detect mime type for given file
+ *
+ * @param string $path
+ * @return string Mime type if found
+ */
+ protected static function get_mime($path)
+ {
+ $file = new Symfony\Component\HttpFoundation\File\File($path);
- /**
- * Attempt to detect mime type for given file
- *
- * @param string $path
- * @return string Mime type if found
- */
- protected static function get_mime($path) {
- $file = new Symfony\Component\HttpFoundation\File\File($path);
+ return $file->getMimeType();
+ }
- return $file->getMimeType();
- }
+ /**
+ * @param string $path
+ * @return FileTextExtractor|null
+ */
+ public static function for_file($path)
+ {
+ if (!file_exists($path) || is_dir($path)) {
+ return;
+ }
- /**
- * @param string $path
- * @return FileTextExtractor|null
- */
- static function for_file($path) {
- if(!file_exists($path) || is_dir($path)) {
- return;
- }
+ $extension = pathinfo($path, PATHINFO_EXTENSION);
+ $mime = self::get_mime($path);
+ foreach (self::get_extractor_classes() as $className) {
+ $extractor = self::get_extractor($className);
- $extension = pathinfo($path, PATHINFO_EXTENSION);
- $mime = self::get_mime($path);
- foreach(self::get_extractor_classes() as $className) {
- $extractor = self::get_extractor($className);
+ // Skip unavailable extractors
+ if (!$extractor->isAvailable()) {
+ continue;
+ }
- // Skip unavailable extractors
- if(!$extractor->isAvailable()) continue;
+ // Check extension
+ if ($extension && $extractor->supportsExtension($extension)) {
+ return $extractor;
+ }
- // Check extension
- if($extension && $extractor->supportsExtension($extension)) {
- return $extractor;
- }
+ // Check mime
+ if ($mime && $extractor->supportsMime($mime)) {
+ return $extractor;
+ }
+ }
+ }
- // Check mime
- if($mime && $extractor->supportsMime($mime)) {
- return $extractor;
- }
- }
- }
+ /**
+ * Checks if the extractor is supported on the current environment,
+ * for example if the correct binaries or libraries are available.
+ *
+ * @return boolean
+ */
+ abstract public function isAvailable();
- /**
- * Checks if the extractor is supported on the current environment,
- * for example if the correct binaries or libraries are available.
- *
- * @return boolean
- */
- abstract public function isAvailable();
+ /**
+ * Determine if this extractor supports the given extension.
+ * If support is determined by mime/type only, then this should return false.
+ *
+ * @param string $extension
+ * @return boolean
+ */
+ abstract public function supportsExtension($extension);
- /**
- * Determine if this extractor supports the given extension.
- * If support is determined by mime/type only, then this should return false.
- *
- * @param string $extension
- * @return boolean
- */
- abstract public function supportsExtension($extension);
+ /**
+ * Determine if this extractor supports the given mime type.
+ * Will only be called if supportsExtension returns false.
+ *
+ * @param string $mime
+ * @return boolean
+ */
+ abstract public function supportsMime($mime);
- /**
- * Determine if this extractor suports the given mime type.
- * Will only be called if supportsExtension returns false.
- *
- * @param string $mime
- * @return boolean
- */
- abstract public function supportsMime($mime);
-
- /**
- * Given a file path, extract the contents as text.
- *
- * @param string $path
- * @return string
- */
- abstract public function getContent($path);
+ /**
+ * Given a file path, extract the contents as text.
+ *
+ * @param string $path
+ * @return string
+ */
+ abstract public function getContent($path);
}
-class FileTextExtractor_Exception extends Exception {}
\ No newline at end of file
+class FileTextExtractor_Exception extends Exception
+{
+}
diff --git a/code/extractors/HTMLTextExtractor.php b/code/extractors/HTMLTextExtractor.php
index a41fe23..810f473 100644
--- a/code/extractors/HTMLTextExtractor.php
+++ b/code/extractors/HTMLTextExtractor.php
@@ -5,69 +5,73 @@
* @author mstephens
*
*/
-class HTMLTextExtractor extends FileTextExtractor {
-
- public function isAvailable() {
- return true;
- }
+class HTMLTextExtractor extends FileTextExtractor
+{
+ /**
+ * This extractor always reports itself as available.
+ *
+ * @return boolean Always true
+ */
+ public function isAvailable()
+ {
+ return true;
+ }
- public function supportsExtension($extension) {
- return in_array(
- strtolower($extension),
- array("html", "htm", "xhtml")
- );
- }
+ /**
+  * Case-insensitive check of the extension against html, htm and xhtml.
+  *
+  * @param string $extension
+  * @return boolean
+  */
+ public function supportsExtension($extension)
+ {
+     $htmlExtensions = array("html", "htm", "xhtml");
+     $normalised = strtolower($extension);
+
+     return in_array($normalised, $htmlExtensions);
+ }
- public function supportsMime($mime) {
- return strtolower($mime) === 'text/html';
- }
+ /**
+  * Case-insensitive check for the text/html mime type.
+  *
+  * @param string $mime
+  * @return boolean
+  */
+ public function supportsMime($mime)
+ {
+     $normalised = strtolower($mime);
+
+     return $normalised === 'text/html';
+ }
- /**
- * Lower priority because its not the most clever HTML extraction. If there is something better, use it
- *
- * @config
- * @var integer
- */
- private static $priority = 10;
+ /**
+ * Lower priority because it's not the most clever HTML extraction. If there is something better, use it.
+ *
+ * @config
+ * @var integer
+ */
+ private static $priority = 10;
- /**
- * Extracts content from regex, by using strip_tags()
- * combined with regular expressions to remove non-content tags like @siu',
- '@@siu',
- '@@siu',
- '@@siu',
- '@@siu',
- '@]*?.*?@siu',
- '@@siu',
- '@]*?.*?@siu',
- // Add line breaks before and after blocks
- '@?((address)|(blockquote)|(center)|(del))@iu',
- '@?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
- '@?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
- '@?((table)|(th)|(td)|(caption))@iu',
- '@?((form)|(button)|(fieldset)|(legend)|(input))@iu',
- '@?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
- '@?((frameset)|(frame)|(iframe))@iu',
- ),
- array(
- ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',"$0", "$0", "$0", "$0", "$0", "$0","$0", "$0",
- ),
- $content
- );
- return strip_tags($content);
- }
+ /**
+ * Extracts content from regex, by using strip_tags()
+ * combined with regular expressions to remove non-content tags like @siu',
+ '@@siu',
+ '@@siu',
+ '@@siu',
+ '@@siu',
+ '@]*?.*?@siu',
+ '@@siu',
+ '@]*?.*?@siu',
+ // Add line breaks before and after blocks
+ '@?((address)|(blockquote)|(center)|(del))@iu',
+ '@?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
+ '@?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
+ '@?((table)|(th)|(td)|(caption))@iu',
+ '@?((form)|(button)|(fieldset)|(legend)|(input))@iu',
+ '@?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
+ '@?((frameset)|(frame)|(iframe))@iu',
+ ),
+ array(
+ ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0",
+ ),
+ $content
+ );
+ return strip_tags($content);
+ }
}
diff --git a/code/extractors/PDFTextExtractor.php b/code/extractors/PDFTextExtractor.php
index b64558c..2e96f35 100644
--- a/code/extractors/PDFTextExtractor.php
+++ b/code/extractors/PDFTextExtractor.php
@@ -5,94 +5,103 @@
* @author mstephens
*
*/
-class PDFTextExtractor extends FileTextExtractor {
+class PDFTextExtractor extends FileTextExtractor
+{
+ /**
+  * Available when the resolved pdftotext path exists and is executable.
+  *
+  * @return boolean
+  */
+ public function isAvailable()
+ {
+     $binary = $this->bin('pdftotext');
+     if (!file_exists($binary)) {
+         return false;
+     }
+
+     return is_executable($binary);
+ }
+
+ /**
+  * Case-insensitive check for the pdf extension.
+  *
+  * @param string $extension
+  * @return boolean
+  */
+ public function supportsExtension($extension)
+ {
+     $normalised = strtolower($extension);
+
+     return $normalised === 'pdf';
+ }
- public function isAvailable() {
- $bin = $this->bin('pdftotext');
- return (file_exists($bin) && is_executable($bin));
- }
-
- public function supportsExtension($extension) {
- return strtolower($extension) === 'pdf';
- }
+ /**
+  * Case-insensitive check of the mime type against the known PDF mime types.
+  *
+  * @param string $mime
+  * @return boolean
+  */
+ public function supportsMime($mime)
+ {
+     $pdfMimes = array(
+         'application/pdf',
+         'application/x-pdf',
+         'application/x-bzpdf',
+         'application/x-gzpdf'
+     );
+
+     return in_array(strtolower($mime), $pdfMimes);
+ }
- public function supportsMime($mime) {
- return in_array(
- strtolower($mime),
- array(
- 'application/pdf',
- 'application/x-pdf',
- 'application/x-bzpdf',
- 'application/x-gzpdf'
- )
- );
- }
+ /**
+ * Accessor to get the location of the binary
+ *
+ * @param string $prog Name of binary
+ * @return string
+ */
+ protected function bin($prog = '')
+ {
+ if ($this->config()->binary_location) {
+ // By config
+ $path = $this->config()->binary_location;
+ } elseif (file_exists('/usr/bin/pdftotext')) {
+ // By searching common directories
+ $path = '/usr/bin';
+ } elseif (file_exists('/usr/local/bin/pdftotext')) {
+ $path = '/usr/local/bin';
+ } else {
+ $path = '.'; // Hope it's in path
+ }
- /**
- * Accessor to get the location of the binary
- *
- * @param string $prog Name of binary
- * @return string
- */
- protected function bin($prog = '') {
- if ($this->config()->binary_location) {
- // By config
- $path = $this->config()->binary_location;
- } elseif (file_exists('/usr/bin/pdftotext')) {
- // By searching common directories
- $path = '/usr/bin';
- } elseif (file_exists('/usr/local/bin/pdftotext')) {
- $path = '/usr/local/bin';
- } else {
- $path = '.'; // Hope it's in path
- }
+ return ($path ? $path . '/' : '') . $prog;
+ }
+
+ /**
+  * Extract text from the PDF at the given path and replace ligatures in it.
+  *
+  * @param string $path
+  * @return string Empty string when no path is given
+  */
+ public function getContent($path)
+ {
+     // no file
+     if (!$path) {
+         return "";
+     }
+
+     $rawText = $this->getRawOutput($path);
+
+     return $this->cleanupLigatures($rawText);
+ }
- return ( $path ? $path . '/' : '' ) . $prog;
- }
-
- public function getContent($path) {
- if(!$path) return ""; // no file
- $content = $this->getRawOutput($path);
- return $this->cleanupLigatures($content);
- }
+ /**
+  * Invoke pdftotext with the given path
+  *
+  * stderr is redirected into stdout (2>&1), so on failure the captured
+  * output lines are what carry the error text of the command.
+  *
+  * @param string $path
+  * @return string Output lines joined with PHP_EOL
+  * @throws FileTextExtractor_Exception When the command exits with a non-zero status
+  */
+ protected function getRawOutput($path)
+ {
+     $command = sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path));
+     exec($command, $output, $exitCode);
+     if ($exitCode) {
+         // exec()'s third argument is the integer exit status, not an array of
+         // lines, so the message has to be built from the captured output
+         throw new FileTextExtractor_Exception(sprintf(
+             'PDFTextExtractor->getContent() failed for %s: %s',
+             $path,
+             implode(PHP_EOL, $output)
+         ));
+     }
+
+     // exec() drops the line ending of every captured line; put one back so the
+     // last word of a line does not fuse with the first word of the next
+     return implode(PHP_EOL, $output);
+ }
- /**
- * Invoke pdftotext with the given path
- *
- * @param string $path
- * @return string Output
- * @throws FileTextExtractor_Exception
- */
- protected function getRawOutput($path) {
- exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
- if($err) {
- throw new FileTextExtractor_Exception(sprintf(
- 'PDFTextExtractor->getContent() failed for %s: %s',
- $path,
- implode('', $err)
- ));
- }
- return implode('', $content);
- }
-
- /**
- * Removes utf-8 ligatures.
- *
- * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
- *
- * @param string $input
- * @return string
- */
- protected function cleanupLigatures($input) {
- $mapping = array(
- 'ff' => 'ff',
- 'fi' => 'fi',
- 'fl' => 'fl',
- 'ffi' => 'ffi',
- 'ffl' => 'ffl',
- 'ſt' => 'ft',
- 'st' => 'st'
- );
- return str_replace(array_keys($mapping), array_values($mapping), $input);
- }
+ /**
+  * Replaces the UTF-8 typographic ligatures U+FB00 to U+FB06 with plain letters.
+  *
+  * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
+  *
+  * @param string $input
+  * @return string
+  */
+ protected function cleanupLigatures($input)
+ {
+     // Keys are spelled as explicit UTF-8 byte sequences: typed as literal
+     // ligature characters they can be folded into their ASCII look-alikes by
+     // Unicode normalisation, which turns an entry into a no-op ('ff' => 'ff')
+     $mapping = array(
+         "\xEF\xAC\x80" => 'ff',  // U+FB00
+         "\xEF\xAC\x81" => 'fi',  // U+FB01
+         "\xEF\xAC\x82" => 'fl',  // U+FB02
+         "\xEF\xAC\x83" => 'ffi', // U+FB03
+         "\xEF\xAC\x84" => 'ffl', // U+FB04
+         "\xEF\xAC\x85" => 'ft',  // U+FB05 (long s + t)
+         "\xEF\xAC\x86" => 'st'   // U+FB06
+     );
+
+     return str_replace(array_keys($mapping), array_values($mapping), $input);
+ }
}
diff --git a/code/extractors/SolrCellTextExtractor.php b/code/extractors/SolrCellTextExtractor.php
index 29d26f9..6e14543 100644
--- a/code/extractors/SolrCellTextExtractor.php
+++ b/code/extractors/SolrCellTextExtractor.php
@@ -10,83 +10,93 @@ use Guzzle\Http\Client;
* @author ischommer
* @see http://wiki.apache.org/solr/ExtractingRequestHandler
*/
-class SolrCellTextExtractor extends FileTextExtractor {
+class SolrCellTextExtractor extends FileTextExtractor
+{
+ /**
+ * Base URL to use for solr text extraction.
+ * E.g. http://localhost:8983/solr/update/extract
+ *
+ * @config
+ * @var string
+ */
+ private static $base_url;
- /**
- * Base URL to use for solr text extraction.
- * E.g. http://localhost:8983/solr/update/extract
- *
- * @config
- * @var string
- */
- private static $base_url;
+ private static $priority = 75;
- private static $priority = 75;
+ protected $httpClient;
- protected $httpClient;
+ /**
+  * Get the HTTP client, creating one for the configured base_url on first use.
+  *
+  * @return Client
+  * @throws InvalidArgumentException When base_url is not configured
+  */
+ public function getHttpClient()
+ {
+     $baseUrl = $this->config()->get('base_url');
+     if (!$baseUrl) {
+         throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
+     }
+
+     if ($this->httpClient) {
+         return $this->httpClient;
+     }
+
+     return $this->httpClient = new Client($baseUrl);
+ }
- public function getHttpClient() {
- if(!$this->config()->get('base_url')) {
- throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
- }
- if(!$this->httpClient) $this->httpClient = new Client($this->config()->get('base_url'));
- return $this->httpClient;
- }
+ /**
+ * Replace the HTTP client returned by getHttpClient().
+ *
+ * @param mixed $client Stored as-is; no type is enforced here
+ */
+ public function setHttpClient($client)
+ {
+ $this->httpClient = $client;
+ }
- public function setHttpClient($client) {
- $this->httpClient = $client;
- }
+ /**
+  * Available whenever a base_url is configured.
+  *
+  * @return boolean
+  */
+ public function isAvailable()
+ {
+     return (bool) $this->config()->get('base_url');
+ }
- public function isAvailable() {
- $url = $this->config()->get('base_url');
- return (boolean) $url;
- }
+ /**
+  * Case-insensitive check of the extension against the handled document types.
+  *
+  * @param string $extension
+  * @return boolean
+  */
+ public function supportsExtension($extension)
+ {
+     $handled = array(
+         'pdf', 'doc', 'docx', 'xls', 'xlsx',
+         'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
+         'ppt', 'pptx', 'odp', 'fodp', 'csv'
+     );
+
+     return in_array(strtolower($extension), $handled);
+ }
- public function supportsExtension($extension) {
- return in_array(
- strtolower($extension),
- array(
- 'pdf', 'doc', 'docx', 'xls', 'xlsx',
- 'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
- 'ppt', 'pptx', 'odp', 'fodp', 'csv'
- )
- );
- }
+ /**
+ * Mime types are never matched by this extractor.
+ *
+ * @param string $mime Ignored
+ * @return boolean Always false
+ */
+ public function supportsMime($mime)
+ {
+ // Rely on supportsExtension
+ return false;
+ }
+
+ public function getContent($path)
+ {
+ if (!$path) {
+ return "";
+ } // no file
- public function supportsMime($mime) {
- // Rely on supportsExtension
- return false;
- }
-
- public function getContent($path) {
- if (!$path) return ""; // no file
-
- $fileName = basename($path);
- $client = $this->getHttpClient();
- try {
- $request = $client
- ->post()
- ->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text'))
- ->addPostFiles(array('myfile' => $path));
- $response = $request->send();
- } catch(InvalidArgumentException $e) {
- SS_Log::log(
- sprintf(
- 'Error extracting text from "%s" (message: %s)',
- $path,
- $e->getMessage()
- ),
- SS_Log::NOTICE
- );
- return null;
- }
- // Use preg match to avoid SimpleXML running out of memory on large text nodes
- preg_match(
- sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName)),
- (string)$response->getBody(),
- $matches
- );
+ $fileName = basename($path);
+ $client = $this->getHttpClient();
+ try {
+ $request = $client
+ ->post()
+ ->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text'))
+ ->addPostFiles(array('myfile' => $path));
+ $response = $request->send();
+ } catch (InvalidArgumentException $e) {
+ SS_Log::log(
+ sprintf(
+ 'Error extracting text from "%s" (message: %s)',
+ $path,
+ $e->getMessage()
+ ),
+ SS_Log::NOTICE
+ );
+ return null;
+ }
+ // Use preg match to avoid SimpleXML running out of memory on large text nodes
+ preg_match(
+ sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName)),
+ (string)$response->getBody(),
+ $matches
+ );
- return $matches ? $matches[1] : null;
- }
+ return $matches ? $matches[1] : null;
+ }
}
diff --git a/code/extractors/TikaServerTextExtractor.php b/code/extractors/TikaServerTextExtractor.php
index b4a74e5..5371206 100644
--- a/code/extractors/TikaServerTextExtractor.php
+++ b/code/extractors/TikaServerTextExtractor.php
@@ -5,100 +5,112 @@
*
* {@link http://tika.apache.org/1.7/gettingstarted.html}
*/
-class TikaServerTextExtractor extends FileTextExtractor {
+class TikaServerTextExtractor extends FileTextExtractor
+{
+ /**
+ * Tika server is pretty efficient so use it immediately if available
+ *
+ * @var integer
+ * @config
+ */
+ private static $priority = 80;
- /**
- * Tika server is pretty efficient so use it immediately if available
- *
- * @var integer
- * @config
- */
- private static $priority = 80;
+ /**
+ * Server endpoint
+ *
+ * @var string
+ * @config
+ */
+ private static $server_endpoint;
- /**
- * Server endpoint
- *
- * @var string
- * @config
- */
- private static $server_endpoint;
+ /**
+ * @var TikaRestClient
+ */
+ protected $client = null;
- /**
- * @var TikaRestClient
- */
- protected $client = null;
+ /**
+  * Get the REST client, creating it through the Injector for the server
+  * endpoint on first use and reusing it afterwards.
+  *
+  * @return TikaRestClient
+  */
+ public function getClient()
+ {
+     if (!$this->client) {
+         $this->client = Injector::inst()->createWithArgs(
+             'TikaRestClient',
+             array($this->getServerEndpoint())
+         );
+     }
+
+     return $this->client;
+ }
- /**
- * @return TikaRestClient
- */
- public function getClient() {
- return $this->client ?:
- ($this->client =
- Injector::inst()->createWithArgs(
- 'TikaRestClient',
- array($this->getServerEndpoint())
- )
- );
- }
+ public function getServerEndpoint()
+ {
+ if (defined('SS_TIKA_ENDPOINT')) {
+ return SS_TIKA_ENDPOINT;
+ }
- public function getServerEndpoint() {
- if(defined('SS_TIKA_ENDPOINT')) {
- return SS_TIKA_ENDPOINT;
- }
+ if (getenv('SS_TIKA_ENDPOINT')) {
+ return getenv('SS_TIKA_ENDPOINT');
+ }
- if(getenv('SS_TIKA_ENDPOINT')) return getenv('SS_TIKA_ENDPOINT');
+ // Default to configured endpoint
+ return $this->config()->server_endpoint;
+ }
- // Default to configured endpoint
- return $this->config()->server_endpoint;
- }
+ /**
+  * Ask the REST client for the version of tika it reports.
+  *
+  * @return float version of tika
+  */
+ public function getVersion()
+ {
+     $client = $this->getClient();
+
+     return $client->getVersion();
+ }
- /**
- * Get the version of tika installed, or 0 if not installed
- *
- * @return float version of tika
- */
- public function getVersion() {
- return $this
- ->getClient()
- ->getVersion();
- }
+ /**
+  * Available when an endpoint is set, the client reports the service as
+  * available, and the reported version is at least 1.7.
+  *
+  * @return boolean
+  */
+ public function isAvailable()
+ {
+     if (!$this->getServerEndpoint()) {
+         return false;
+     }
+     if (!$this->getClient()->isAvailable()) {
+         return false;
+     }
+
+     return $this->getVersion() >= 1.7;
+ }
- public function isAvailable() {
- return $this->getServerEndpoint() &&
- $this->getClient()->isAvailable() &&
- $this->getVersion() >= 1.7;
- }
-
- public function supportsExtension($extension) {
- // Determine support via mime type only
- return false;
- }
+ /**
+ * File extensions are never matched by this extractor.
+ *
+ * @param string $extension Ignored
+ * @return boolean Always false
+ */
+ public function supportsExtension($extension)
+ {
+ // Determine support via mime type only
+ return false;
+ }
- /**
- * Cache of supported mime types
- *
- * @var array
- */
- protected $supportedMimes = array();
+ /**
+ * Cache of supported mime types
+ *
+ * @var array
+ */
+ protected $supportedMimes = array();
- public function supportsMime($mime) {
- $supported = $this->supportedMimes ?:
- ($this->supportedMimes = $this->getClient()->getSupportedMimes());
+ public function supportsMime($mime)
+ {
+ $supported = $this->supportedMimes ?:
+ ($this->supportedMimes = $this->getClient()->getSupportedMimes());
- // Check if supported (most common / quickest lookup)
- if(isset($supported[$mime])) return true;
+ // Check if supported (most common / quickest lookup)
+ if (isset($supported[$mime])) {
+ return true;
+ }
- // Check aliases
- foreach($supported as $info) {
- if(isset($info['alias']) && in_array($mime, $info['alias'])) return true;
- }
+ // Check aliases
+ foreach ($supported as $info) {
+ if (isset($info['alias']) && in_array($mime, $info['alias'])) {
+ return true;
+ }
+ }
- return false;
- }
-
- public function getContent($path) {
- return $this->getClient()->tika($path);
- }
+ return false;
+ }
+ /**
+  * Hand the path to the REST client's tika() call and return its result.
+  *
+  * @param string $path
+  * @return mixed Result of TikaRestClient::tika()
+  */
+ public function getContent($path)
+ {
+     $client = $this->getClient();
+
+     return $client->tika($path);
+ }
}
diff --git a/code/extractors/TikaTextExtractor.php b/code/extractors/TikaTextExtractor.php
index 871bcca..0150058 100644
--- a/code/extractors/TikaTextExtractor.php
+++ b/code/extractors/TikaTextExtractor.php
@@ -5,90 +5,101 @@
*
* {@link http://tika.apache.org/1.7/gettingstarted.html}
*/
-class TikaTextExtractor extends FileTextExtractor {
+class TikaTextExtractor extends FileTextExtractor
+{
+ /**
+ * Text extraction mode. Defaults to -t (plain text)
+ *
+ * @var string
+ * @config
+ */
+ private static $output_mode = '-t';
- /**
- * Text extraction mode. Defaults to -t (plain text)
- *
- * @var string
- * @config
- */
- private static $output_mode = '-t';
+ /**
+ * Get the version of tika installed, or 0 if not installed
+ *
+ * @return float version of tika
+ */
+ public function getVersion()
+ {
+ $code = $this->runShell('tika --version', $stdout);
- /**
- * Get the version of tika installed, or 0 if not installed
- *
- * @return float version of tika
- */
- public function getVersion() {
- $code = $this->runShell('tika --version', $stdout);
+ // Parse output
+ if (!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches)) {
+ return $matches['version'];
+ }
- // Parse output
- if(!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches)) {
- return $matches['version'];
- }
+ return 0;
+ }
- return 0;
- }
+ /**
+ * Runs an arbitrary and safely escaped shell command
+ *
+ * @param string $command Full command including arguments
+ * @param string &$stdout Standard output
+ * @param string &$stderr Standard error
+ * @param string $input Content to pass via standard input
+ * @return int Exit code. 0 is success
+ */
+ protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
+ {
+ $descriptorSpecs = array(
+ 0 => array("pipe", "r"),
+ 1 => array("pipe", "w"),
+ 2 => array("pipe", "w")
+ );
+ // Invoke command
+ $pipes = array();
+ $proc = proc_open($command, $descriptorSpecs, $pipes);
+ if (!is_resource($proc)) {
+ return 255;
+ }
- /**
- * Runs an arbitrary and safely escaped shell command
- *
- * @param string $command Full command including arguments
- * @param string &$stdout Standand output
- * @param string &$stderr Standard error
- * @param string $input Content to pass via standard input
- * @return int Exit code. 0 is success
- */
- protected function runShell($command, &$stdout = '', &$stderr = '', $input = '') {
- $descriptorSpecs = array(
- 0 => array("pipe", "r"),
- 1 => array("pipe", "w"),
- 2 => array("pipe", "w")
- );
- // Invoke command
- $pipes = array();
- $proc = proc_open($command, $descriptorSpecs, $pipes);
- if (!is_resource($proc)) return 255;
+ // Send content as input
+ fwrite($pipes[0], $input);
+ fclose($pipes[0]);
- // Send content as input
- fwrite($pipes[0], $input);
- fclose($pipes[0]);
+ // Get output
+ $stdout = stream_get_contents($pipes[1]);
+ fclose($pipes[1]);
+ $stderr = stream_get_contents($pipes[2]);
+ fclose($pipes[2]);
- // Get output
- $stdout = stream_get_contents($pipes[1]);
- fclose($pipes[1]);
- $stderr = stream_get_contents($pipes[2]);
- fclose($pipes[2]);
+ // Get result
+ return proc_close($proc);
+ }
+
+ /**
+  * Run the tika command on the path using the configured output mode.
+  *
+  * @param string $path
+  * @return string|null Standard output of the command, or null on a non-zero exit code
+  */
+ public function getContent($path)
+ {
+     $command = sprintf('tika %s %s', $this->config()->output_mode, escapeshellarg($path));
+     $exitCode = $this->runShell($command, $stdout);
+     if ($exitCode != 0) {
+         return null;
+     }
+
+     return $stdout;
+ }
- // Get result
- return proc_close($proc);
- }
-
- public function getContent($path) {
- $mode = $this->config()->output_mode;
- $command = sprintf('tika %s %s', $mode, escapeshellarg($path));
- $code = $this->runShell($command, $output);
- if($code == 0) return $output;
- }
+ /**
+  * Available when getVersion() reports a version greater than 0.
+  *
+  * @return boolean
+  */
+ public function isAvailable()
+ {
+     $version = $this->getVersion();
+
+     return $version > 0;
+ }
- public function isAvailable() {
- return $this->getVersion() > 0;
- }
+ /**
+ * File extensions are never matched by this extractor.
+ *
+ * @param string $extension Ignored
+ * @return boolean Always false
+ */
+ public function supportsExtension($extension)
+ {
+ // Determine support via mime type only
+ return false;
+ }
- public function supportsExtension($extension) {
- // Determine support via mime type only
- return false;
- }
-
- public function supportsMime($mime) {
- // Get list of supported mime types
- $code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
- if($code) return false; // Error case
-
- // Check if the mime type is inside the result
- $pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
- return (bool)preg_match($pattern, $supportedTypes);
- }
+ /**
+  * Check whether the mime type appears as a whole word in the output of
+  * 'tika --list-supported-types'.
+  *
+  * @param string $mime
+  * @return boolean False when the command fails or the type is not listed
+  */
+ public function supportsMime($mime)
+ {
+     // Get list of supported mime types
+     $exitCode = $this->runShell('tika --list-supported-types', $typeListing, $stderr);
+     if ($exitCode) {
+         // Error case
+         return false;
+     }
+
+     // Check if the mime type is inside the result
+     $quotedMime = preg_quote($mime, '/');
+
+     return (bool)preg_match(sprintf('/\b(%s)\b/', $quotedMime), $typeListing);
+ }
}
diff --git a/code/tika/TikaRestClient.php b/code/tika/TikaRestClient.php
index 6773c32..f5561af 100644
--- a/code/tika/TikaRestClient.php
+++ b/code/tika/TikaRestClient.php
@@ -3,92 +3,97 @@
use Guzzle\Http\Client;
use Guzzle\Http\Exception\RequestException;
-class TikaRestClient extends Client {
+class TikaRestClient extends Client
+{
+ /**
+ * Detect if the service is available
+ *
+ * @return bool
+ */
+ public function isAvailable()
+ {
+ try {
+ return $this
+ ->get()->send()
+ ->getStatusCode() == 200;
+ } catch (RequestException $ex) {
+ return false;
+ }
+ }
- /**
- * Detect if the service is available
- *
- * @return bool
- */
- public function isAvailable() {
- try {
- return $this
- ->get()->send()
- ->getStatusCode() == 200;
- } catch (RequestException $ex) {
- return false;
- }
- }
+ /**
+ * Get version code
+ *
+ * @return float
+ */
+ public function getVersion()
+ {
+ $response = $this->get('version')->send();
+ // Parse output
+ if ($response->getStatusCode() == 200 &&
+ preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)
+ ) {
+ return (float)$matches['version'];
+ }
- /**
- * Get version code
- *
- * @return float
- */
- public function getVersion() {
- $response = $this->get('version')->send();
- // Parse output
- if($response->getStatusCode() == 200 &&
- preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)
- ) {
- return (float)$matches['version'];
- }
+ return 0.0;
+ }
- return 0.0;
- }
+ protected $mimes = array();
- protected $mimes = array();
+ /**
+ * Gets supported mime data. May include aliased mime types.
+ *
+ * @return array
+ */
+ public function getSupportedMimes()
+ {
+ if ($this->mimes) {
+ return $this->mimes;
+ }
- /**
- * Gets supported mime data. May include aliased mime types.
- *
- * @return array
- */
- public function getSupportedMimes() {
- if($this->mimes) return $this->mimes;
+ $response = $this->get(
+ 'mime-types',
+ array('Accept' => 'application/json')
+ )->send();
- $response = $this->get(
- 'mime-types',
- array('Accept' => 'application/json')
- )->send();
+ return $this->mimes = $response->json();
+ }
- return $this->mimes = $response->json();
- }
+ /**
+ * Extract text content from a given file.
+ * Logs a notice-level error if the document can't be parsed.
+ *
+ * @param string $file Full filesystem path to a file to post
+ * @return string Content of the file extracted as plain text
+ */
+ public function tika($file)
+ {
+ $text = null;
+ try {
+ $response = $this->put(
+ 'tika',
+ array('Accept' => 'text/plain'),
+ file_get_contents($file)
+ )->send();
+ $text = $response->getBody(true);
+ } catch (RequestException $e) {
+ $msg = sprintf(
+ 'TikaRestClient was not able to process %s. Response: %s %s.',
+ $file,
+ $e->getResponse()->getStatusCode(),
+ $e->getResponse()->getReasonPhrase()
+ );
- /**
- * Extract text content from a given file.
- * Logs a notice-level error if the document can't be parsed.
- *
- * @param string $file Full filesystem path to a file to post
- * @return string Content of the file extracted as plain text
- */
- public function tika($file) {
- $text = null;
- try {
- $response = $this->put(
- 'tika',
- array('Accept' => 'text/plain'),
- file_get_contents($file)
- )->send();
- $text = $response->getBody(true);
- } catch(RequestException $e) {
- $msg = sprintf(
- 'TikaRestClient was not able to process %s. Response: %s %s.',
- $file,
- $e->getResponse()->getStatusCode(),
- $e->getResponse()->getReasonPhrase()
- );
-
- // Only available if tika-server was started with --includeStack
- $body = $e->getResponse()->getBody(true);
- if($body) {
- $msg .= ' Body: ' . $body;
- }
-
- SS_Log::log($msg, SS_Log::NOTICE);
- }
-
- return $text;
- }
+ // Only available if tika-server was started with --includeStack
+ $body = $e->getResponse()->getBody(true);
+ if ($body) {
+ $msg .= ' Body: ' . $body;
+ }
+ SS_Log::log($msg, SS_Log::NOTICE);
+ }
+
+ return $text;
+ }
}
diff --git a/tests/FileTextCacheDatabaseTest.php b/tests/FileTextCacheDatabaseTest.php
index d0caf60..6b8d784 100644
--- a/tests/FileTextCacheDatabaseTest.php
+++ b/tests/FileTextCacheDatabaseTest.php
@@ -1,17 +1,17 @@
update('FileTextCache_Database', 'max_content_length', 5);
- $cache = new FileTextCache_Database();
- $file = $this->getMock('File', array('write'));
- $content = '0123456789';
- $cache->save($file, $content);
- $this->assertEquals($cache->load($file), '01234');
+class FileTextCacheDatabaseTest extends SapphireTest
+{
+ public function testTruncatesByMaxLength()
+ {
+ Config::nest();
+
+ Config::inst()->update('FileTextCache_Database', 'max_content_length', 5);
+ $cache = new FileTextCache_Database();
+ $file = $this->getMock('File', array('write'));
+ $content = '0123456789';
+ $cache->save($file, $content);
+ $this->assertEquals($cache->load($file), '01234');
- Config::unnest();
- }
-
-}
\ No newline at end of file
+ Config::unnest();
+ }
+}
diff --git a/tests/FileTextExtractableTest.php b/tests/FileTextExtractableTest.php
index 6c2d788..166b1ee 100644
--- a/tests/FileTextExtractableTest.php
+++ b/tests/FileTextExtractableTest.php
@@ -1,43 +1,46 @@
array('FileTextExtractable')
+ );
- protected $requiredExtensions = array(
- 'File' => array('FileTextExtractable')
- );
+ public function setUp()
+ {
+ parent::setUp();
- public function setUp() {
- parent::setUp();
+ // Ensure that html is a valid extension
+ Config::inst()
+ ->nest()
+ ->update('File', 'allowed_extensions', array('html'));
+ }
- // Ensure that html is a valid extension
- Config::inst()
- ->nest()
- ->update('File', 'allowed_extensions', array('html'));
- }
+ public function tearDown()
+ {
+ Config::unnest();
+ parent::tearDown();
+ }
- public function tearDown() {
- Config::unnest();
- parent::tearDown();
- }
+ public function testExtractFileAsText()
+ {
+ // Create a copy of the file, as it may be clobbered by the test
+ // ($file->extractFileAsText() calls $file->write)
+ copy(BASE_PATH.'/textextraction/tests/fixtures/test1.html', BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
+
+ // Use HTML, since the extractor is always available
+ $file = new File(array(
+ 'Name' => 'test1-copy.html',
+ 'Filename' => 'textextraction/tests/fixtures/test1-copy.html'
+ ));
+ $file->write();
+
+ $content = $file->extractFileAsText();
+ $this->assertContains('Test Headline', $content);
+ $this->assertContains('Test Text', $content);
+ $this->assertEquals($content, $file->FileContentCache);
- function testExtractFileAsText() {
- // Create a copy of the file, as it may be clobbered by the test
- // ($file->extractFileAsText() calls $file->write)
- copy(BASE_PATH.'/textextraction/tests/fixtures/test1.html',BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
-
- // Use HTML, since the extractor is always available
- $file = new File(array(
- 'Name' => 'test1-copy.html',
- 'Filename' => 'textextraction/tests/fixtures/test1-copy.html'
- ));
- $file->write();
-
- $content = $file->extractFileAsText();
- $this->assertContains('Test Headline', $content);
- $this->assertContains('Test Text', $content);
- $this->assertEquals($content, $file->FileContentCache);
-
- if(file_exists(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html')) unlink(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
- }
-
-
-}
\ No newline at end of file
+ if (file_exists(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html')) {
+ unlink(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
+ }
+ }
+}
diff --git a/tests/HTMLTextExtractorTest.php b/tests/HTMLTextExtractorTest.php
index a1f2429..8ff8b0b 100644
--- a/tests/HTMLTextExtractorTest.php
+++ b/tests/HTMLTextExtractorTest.php
@@ -1,14 +1,14 @@
getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html');
- $this->assertContains('Test Headline', $content);
- $this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
- $this->assertNotContains('Test Style', $content, 'Strips non-content style tags');
- $this->assertNotContains('Test Script', $content, 'Strips non-content script tags');
- }
-
-}
\ No newline at end of file
+ $content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html');
+ $this->assertContains('Test Headline', $content);
+ $this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
+ $this->assertNotContains('Test Style', $content, 'Strips non-content style tags');
+ $this->assertNotContains('Test Script', $content, 'Strips non-content script tags');
+ }
+}
diff --git a/tests/PDFTextExtractorTest.php b/tests/PDFTextExtractorTest.php
index 21b0a73..b99ff06 100644
--- a/tests/PDFTextExtractorTest.php
+++ b/tests/PDFTextExtractorTest.php
@@ -1,12 +1,14 @@
isAvailable()) $this->markTestSkipped('pdftotext not available');
-
- $content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf');
- $this->assertContains('This is a test file with a link', $content);
- }
+class PDFTextExtractorTest extends SapphireTest
+{
+ public function testExtraction()
+ {
+ $extractor = new PDFTextExtractor();
+ if (!$extractor->isAvailable()) {
+ $this->markTestSkipped('pdftotext not available');
+ }
+ $content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf');
+ $this->assertContains('This is a test file with a link', $content);
+ }
}
diff --git a/tests/TikaTextExtractorTest.php b/tests/TikaTextExtractorTest.php
index a35c636..0342dcf 100644
--- a/tests/TikaTextExtractorTest.php
+++ b/tests/TikaTextExtractorTest.php
@@ -3,36 +3,41 @@
/**
* Tests the {@see TikaTextExtractor} class
*/
-class TikaTextExtractorTest extends SapphireTest {
-
- function testExtraction() {
- $extractor = new TikaTextExtractor();
- if(!$extractor->isAvailable()) $this->markTestSkipped('tika cli not available');
+class TikaTextExtractorTest extends SapphireTest
+{
+ public function testExtraction()
+ {
+ $extractor = new TikaTextExtractor();
+ if (!$extractor->isAvailable()) {
+ $this->markTestSkipped('tika cli not available');
+ }
- // Check file
- $file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
- $content = $extractor->getContent($file);
- $this->assertContains('This is a test file with a link', $content);
+ // Check file
+ $file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
+ $content = $extractor->getContent($file);
+ $this->assertContains('This is a test file with a link', $content);
- // Check mime validation
- $this->assertTrue($extractor->supportsMime('application/pdf'));
- $this->assertTrue($extractor->supportsMime('text/html'));
- $this->assertFalse($extractor->supportsMime('application/not-supported'));
- }
+ // Check mime validation
+ $this->assertTrue($extractor->supportsMime('application/pdf'));
+ $this->assertTrue($extractor->supportsMime('text/html'));
+ $this->assertFalse($extractor->supportsMime('application/not-supported'));
+ }
- function testServerExtraction() {
- $extractor = new TikaServerTextExtractor();
- if(!$extractor->isAvailable()) $this->markTestSkipped('tika server not available');
+ public function testServerExtraction()
+ {
+ $extractor = new TikaServerTextExtractor();
+ if (!$extractor->isAvailable()) {
+ $this->markTestSkipped('tika server not available');
+ }
- // Check file
- $file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
- $content = $extractor->getContent($file);
- $this->assertContains('This is a test file with a link', $content);
-
- // Check mime validation
- $this->assertTrue($extractor->supportsMime('application/pdf'));
- $this->assertTrue($extractor->supportsMime('text/html'));
- $this->assertFalse($extractor->supportsMime('application/not-supported'));
- }
+ // Check file
+ $file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
+ $content = $extractor->getContent($file);
+ $this->assertContains('This is a test file with a link', $content);
+ // Check mime validation
+ $this->assertTrue($extractor->supportsMime('application/pdf'));
+ $this->assertTrue($extractor->supportsMime('text/html'));
+ $this->assertFalse($extractor->supportsMime('application/not-supported'));
+ }
}