From edb02e91897bf44ebe5150d28f8c3c07272e25d9 Mon Sep 17 00:00:00 2001 From: Robbie Averill Date: Tue, 3 Jul 2018 15:55:02 +1200 Subject: [PATCH] API FileTextExtractable::getContent now takes a File instance instead of a path --- _config/cache.yml | 3 +- _config/config.yml | 10 +++++ src/Extension/FileTextExtractable.php | 20 ++++----- src/Extractor/FileTextExtractor.php | 51 +++++++++++++++++++---- src/Extractor/HTMLTextExtractor.php | 9 ++-- src/Extractor/PDFTextExtractor.php | 16 ++++--- src/Extractor/SolrCellTextExtractor.php | 12 +++--- src/Extractor/TikaServerTextExtractor.php | 6 ++- src/Extractor/TikaTextExtractor.php | 11 +++-- tests/FileTextExtractableTest.php | 35 +++++++++------- tests/HTMLTextExtractorTest.php | 17 +++++++- tests/PDFTextExtractorTest.php | 9 +++- 12 files changed, 138 insertions(+), 61 deletions(-) create mode 100644 _config/config.yml diff --git a/_config/cache.yml b/_config/cache.yml index ff793b2..2f82c29 100644 --- a/_config/cache.yml +++ b/_config/cache.yml @@ -3,9 +3,8 @@ Name: textextractioncache After: - '#corecache' --- - SilverStripe\Core\Injector\Injector: Psr\SimpleCache\CacheInterface.FileTextCache_Cache: factory: SilverStripe\Core\Cache\CacheFactory constructor: - namespace: 'FileTextCache_Cache' \ No newline at end of file + namespace: 'FileTextCache_Cache' diff --git a/_config/config.yml b/_config/config.yml new file mode 100644 index 0000000..0a0982d --- /dev/null +++ b/_config/config.yml @@ -0,0 +1,10 @@ +--- +Name: textextractionconfig +--- +SilverStripe\Core\Injector\Injector: + # Define default FileTextCache implementation + SilverStripe\TextExtraction\Cache\FileTextCache: + class: SilverStripe\TextExtraction\Cache\FileTextCache\Database + +SilverStripe\TextExtraction\Cache\FileTextCache\Database: + max_content_length: 500000 diff --git a/src/Extension/FileTextExtractable.php b/src/Extension/FileTextExtractable.php index b5f7896..fedccbc 100644 --- a/src/Extension/FileTextExtractable.php +++ b/src/Extension/FileTextExtractable.php @@ -2,7 +2,7 @@ namespace SilverStripe\TextExtraction\Extension; -use SilverStripe\Control\Director; +use SilverStripe\Assets\File; use SilverStripe\ORM\DataExtension; use SilverStripe\TextExtraction\Cache\FileTextCache; use SilverStripe\TextExtraction\Extractor\FileTextExtractor; @@ -14,12 +14,10 @@ use SilverStripe\TextExtraction\Extractor\FileTextExtractor; * Adds an additional property which is the cached contents, which is populated on demand. * * @author mstephens - * */ class FileTextExtractable extends DataExtension { /** - * * @var array * @config */ @@ -28,7 +26,6 @@ class FileTextExtractable extends DataExtension ]; /** - * * @var array * @config */ @@ -37,12 +34,11 @@ class FileTextExtractable extends DataExtension ]; /** - * * @var array * @config */ private static $dependencies = [ - 'TextCache' => FileTextCache\Cache::class, + 'TextCache' => '%$' . FileTextCache::class, ]; /** @@ -51,7 +47,6 @@ class FileTextExtractable extends DataExtension protected $fileTextCache = null; /** - * * @param FileTextCache $cache * @return $this */ @@ -90,27 +85,28 @@ class FileTextExtractable extends DataExtension */ public function extractFileAsText($disableCache = false) { + /** @var File $file */ + $file = $this->owner; if (!$disableCache) { - $text = $this->getTextCache()->load($this->owner); + $text = $this->getTextCache()->load($file); if ($text) { return $text; } } // Determine which extractor can process this file. - $path = Director::baseFolder() . '/' . $this->owner->getFilename(); - $extractor = FileTextExtractor::for_file($path); + $extractor = FileTextExtractor::for_file($file); if (!$extractor) { return null; } - $text = $extractor->getContent($path); + $text = $extractor->getContent($file); if (!$text) { return null; } if (!$disableCache) { - $this->getTextCache()->save($this->owner, $text); + $this->getTextCache()->save($file, $text); } return $text; diff --git a/src/Extractor/FileTextExtractor.php b/src/Extractor/FileTextExtractor.php index fd3cf5c..57a82ef 100644 --- a/src/Extractor/FileTextExtractor.php +++ b/src/Extractor/FileTextExtractor.php @@ -2,10 +2,12 @@ namespace SilverStripe\TextExtraction\Extractor; +use SilverStripe\Assets\File; use SilverStripe\Core\ClassInfo; use SilverStripe\Core\Config\Config; use SilverStripe\Core\Config\Configurable; use SilverStripe\Core\Injector\Injector; +use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception; /** * A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents. @@ -83,17 +85,19 @@ abstract class FileTextExtractor } /** - * @param string $path + * Given a File object, decide which extractor instance to use to handle it + * + * @param File $file * @return FileTextExtractor|null */ - public static function for_file($path) + public static function for_file(File $file) { - if (!file_exists($path) || is_dir($path)) { + if (!$file) { return null; } - $extension = pathinfo($path, PATHINFO_EXTENSION); - $mime = self::get_mime($path); + $extension = $file->getExtension(); + $mime = $file->getMimeType(); foreach (self::get_extractor_classes() as $className) { $extractor = self::get_extractor($className); @@ -115,6 +119,37 @@ abstract class FileTextExtractor } } + /** + * Some text extractors (like pdftotext) may require a physical file to read from, so write the current + * file contents to a temp file and return its path + * + * @param File $file + * @return string + * @throws Exception + */ + protected function getPathFromFile(File $file) + { + $path = tempnam(TEMP_PATH, 'pdftextextractor_'); + if (false === $path) { + throw new Exception(static::class . '->getPathFromFile() could not allocate temporary file name'); + } + + // Append extension to temp file if one is set + if ($file->getExtension()) { + $path .= '.' . $file->getExtension(); + } + + // Remove any existing temp files with this name + unlink($path); + + $bytesWritten = file_put_contents($path, $file->getStream()); + if (false === $bytesWritten) { + throw new Exception(static::class . '->getPathFromFile() failed to write temporary file'); + } + + return $path; + } + /** * Checks if the extractor is supported on the current environment, * for example if the correct binaries or libraries are available. @@ -142,10 +177,10 @@ abstract class FileTextExtractor abstract public function supportsMime($mime); /** - * Given a file path, extract the contents as text. + * Given a File instance, extract the contents as text. * - * @param string $path + * @param File $file * @return string */ - abstract public function getContent($path); + abstract public function getContent(File $file); } diff --git a/src/Extractor/HTMLTextExtractor.php b/src/Extractor/HTMLTextExtractor.php index 78c8440..d1b56b9 100644 --- a/src/Extractor/HTMLTextExtractor.php +++ b/src/Extractor/HTMLTextExtractor.php @@ -2,6 +2,8 @@ namespace SilverStripe\TextExtraction\Extractor; +use SilverStripe\Assets\File; + /** * Text extractor that uses php function strip_tags to get just the text. OK for indexing, not * the best for readable text. @@ -49,12 +51,13 @@ class HTMLTextExtractor extends FileTextExtractor * combined with regular expressions to remove non-content tags like