API FileTextExtractor::getContent now takes a File instance instead of a path

This commit is contained in:
Robbie Averill 2018-07-03 15:55:02 +12:00
parent 8bd019b2aa
commit edb02e9189
12 changed files with 138 additions and 61 deletions

View File

@ -3,9 +3,8 @@ Name: textextractioncache
After: After:
- '#corecache' - '#corecache'
--- ---
SilverStripe\Core\Injector\Injector: SilverStripe\Core\Injector\Injector:
Psr\SimpleCache\CacheInterface.FileTextCache_Cache: Psr\SimpleCache\CacheInterface.FileTextCache_Cache:
factory: SilverStripe\Core\Cache\CacheFactory factory: SilverStripe\Core\Cache\CacheFactory
constructor: constructor:
namespace: 'FileTextCache_Cache' namespace: 'FileTextCache_Cache'

10
_config/config.yml Normal file
View File

@ -0,0 +1,10 @@
---
Name: textextractionconfig
---
SilverStripe\Core\Injector\Injector:
# Define default FileTextCache implementation
SilverStripe\TextExtraction\Cache\FileTextCache:
class: SilverStripe\TextExtraction\Cache\FileTextCache\Database
SilverStripe\TextExtraction\Cache\FileTextCache\Database:
max_content_length: 500000

View File

@ -2,7 +2,7 @@
namespace SilverStripe\TextExtraction\Extension; namespace SilverStripe\TextExtraction\Extension;
use SilverStripe\Control\Director; use SilverStripe\Assets\File;
use SilverStripe\ORM\DataExtension; use SilverStripe\ORM\DataExtension;
use SilverStripe\TextExtraction\Cache\FileTextCache; use SilverStripe\TextExtraction\Cache\FileTextCache;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor; use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
@ -14,12 +14,10 @@ use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
* Adds an additional property which is the cached contents, which is populated on demand. * Adds an additional property which is the cached contents, which is populated on demand.
* *
* @author mstephens * @author mstephens
*
*/ */
class FileTextExtractable extends DataExtension class FileTextExtractable extends DataExtension
{ {
/** /**
*
* @var array * @var array
* @config * @config
*/ */
@ -28,7 +26,6 @@ class FileTextExtractable extends DataExtension
]; ];
/** /**
*
* @var array * @var array
* @config * @config
*/ */
@ -37,12 +34,11 @@ class FileTextExtractable extends DataExtension
]; ];
/** /**
*
* @var array * @var array
* @config * @config
*/ */
private static $dependencies = [ private static $dependencies = [
'TextCache' => FileTextCache\Cache::class, 'TextCache' => '%$' . FileTextCache::class,
]; ];
/** /**
@ -51,7 +47,6 @@ class FileTextExtractable extends DataExtension
protected $fileTextCache = null; protected $fileTextCache = null;
/** /**
*
* @param FileTextCache $cache * @param FileTextCache $cache
* @return $this * @return $this
*/ */
@ -90,27 +85,28 @@ class FileTextExtractable extends DataExtension
*/ */
public function extractFileAsText($disableCache = false) public function extractFileAsText($disableCache = false)
{ {
/** @var File $file */
$file = $this->owner;
if (!$disableCache) { if (!$disableCache) {
$text = $this->getTextCache()->load($this->owner); $text = $this->getTextCache()->load($file);
if ($text) { if ($text) {
return $text; return $text;
} }
} }
// Determine which extractor can process this file. // Determine which extractor can process this file.
$path = Director::baseFolder() . '/' . $this->owner->getFilename(); $extractor = FileTextExtractor::for_file($file);
$extractor = FileTextExtractor::for_file($path);
if (!$extractor) { if (!$extractor) {
return null; return null;
} }
$text = $extractor->getContent($path); $text = $extractor->getContent($file);
if (!$text) { if (!$text) {
return null; return null;
} }
if (!$disableCache) { if (!$disableCache) {
$this->getTextCache()->save($this->owner, $text); $this->getTextCache()->save($file, $text);
} }
return $text; return $text;

View File

@ -2,10 +2,12 @@
namespace SilverStripe\TextExtraction\Extractor; namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Assets\File;
use SilverStripe\Core\ClassInfo; use SilverStripe\Core\ClassInfo;
use SilverStripe\Core\Config\Config; use SilverStripe\Core\Config\Config;
use SilverStripe\Core\Config\Configurable; use SilverStripe\Core\Config\Configurable;
use SilverStripe\Core\Injector\Injector; use SilverStripe\Core\Injector\Injector;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
/** /**
* A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents. * A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents.
@ -83,17 +85,19 @@ abstract class FileTextExtractor
} }
/** /**
* @param string $path * Given a File object, decide which extractor instance to use to handle it
*
* @param File $file
* @return FileTextExtractor|null * @return FileTextExtractor|null
*/ */
public static function for_file($path) public static function for_file(File $file)
{ {
if (!file_exists($path) || is_dir($path)) { if (!$file) {
return null; return null;
} }
$extension = pathinfo($path, PATHINFO_EXTENSION); $extension = $file->getExtension();
$mime = self::get_mime($path); $mime = $file->getMimeType();
foreach (self::get_extractor_classes() as $className) { foreach (self::get_extractor_classes() as $className) {
$extractor = self::get_extractor($className); $extractor = self::get_extractor($className);
@ -115,6 +119,37 @@ abstract class FileTextExtractor
} }
} }
/**
 * Some text extractors (like pdftotext) may require a physical file to read from, so write the current
 * file contents to a temp file and return its path.
 *
 * The temp file is created under TEMP_PATH and carries the source file's extension when it has one.
 * This method does not delete the file it creates, so the caller should remove it once it has been read.
 *
 * @param File $file
 * @return string Path of the temp file holding a copy of the file contents
 * @throws Exception If the temp file cannot be allocated, renamed or written
 */
protected function getPathFromFile(File $file)
{
    // tempnam() both picks a unique name and creates an empty placeholder file at that path
    $path = tempnam(TEMP_PATH, 'pdftextextractor_');
    if (false === $path) {
        throw new Exception(static::class . '->getPathFromFile() could not allocate temporary file name');
    }

    // Append extension to temp file if one is set
    $extension = $file->getExtension();
    if ($extension) {
        // Move the placeholder to the suffixed name rather than abandoning it: this keeps the
        // unique reservation and avoids leaving an orphaned, extension-less temp file behind
        $pathWithExtension = $path . '.' . $extension;
        if (!rename($path, $pathWithExtension)) {
            unlink($path);
            throw new Exception(static::class . '->getPathFromFile() could not allocate temporary file name');
        }
        $path = $pathWithExtension;
    }

    // file_put_contents() truncates the target by default, so the placeholder does not need to be
    // removed before writing
    $bytesWritten = file_put_contents($path, $file->getStream());
    if (false === $bytesWritten) {
        // Do not leave a partial or empty temp file behind on failure
        if (file_exists($path)) {
            unlink($path);
        }
        throw new Exception(static::class . '->getPathFromFile() failed to write temporary file');
    }

    return $path;
}
/** /**
* Checks if the extractor is supported on the current environment, * Checks if the extractor is supported on the current environment,
* for example if the correct binaries or libraries are available. * for example if the correct binaries or libraries are available.
@ -142,10 +177,10 @@ abstract class FileTextExtractor
abstract public function supportsMime($mime); abstract public function supportsMime($mime);
/** /**
* Given a file path, extract the contents as text. * Given a File instance, extract the contents as text.
* *
* @param string $path * @param File $file
* @return string * @return string
*/ */
abstract public function getContent($path); abstract public function getContent(File $file);
} }

View File

@ -2,6 +2,8 @@
namespace SilverStripe\TextExtraction\Extractor; namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Assets\File;
/** /**
* Text extractor that uses php function strip_tags to get just the text. OK for indexing, not * Text extractor that uses php function strip_tags to get just the text. OK for indexing, not
* the best for readable text. * the best for readable text.
@ -49,12 +51,13 @@ class HTMLTextExtractor extends FileTextExtractor
* combined with regular expressions to remove non-content tags like <style> or <script>, * combined with regular expressions to remove non-content tags like <style> or <script>,
* as well as adding line breaks after block tags. * as well as adding line breaks after block tags.
* *
* @param string $path * @param File $file
* @return string * @return string
*/ */
public function getContent($path) public function getContent(File $file)
{ {
$content = file_get_contents($path); $content = $file->getString();
// Yes, yes, regex'ing HTML is evil. // Yes, yes, regex'ing HTML is evil.
// Since we don't care about well-formedness or markup here, it does the job. // Since we don't care about well-formedness or markup here, it does the job.
$content = preg_replace( $content = preg_replace(

View File

@ -2,7 +2,9 @@
namespace SilverStripe\TextExtraction\Extractor; namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Assets\File;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception; use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
use function tempnam;
/** /**
* Text extractor that calls pdftotext to do the conversion. * Text extractor that calls pdftotext to do the conversion.
@ -83,28 +85,30 @@ class PDFTextExtractor extends FileTextExtractor
return null; return null;
} }
public function getContent($path) public function getContent(File $file)
{ {
if (!$path) { if (!$file) {
// no file // no file
return ''; return '';
} }
$content = $this->getRawOutput($path); $content = $this->getRawOutput($file);
return $this->cleanupLigatures($content); return $this->cleanupLigatures($content);
} }
/** /**
* Invoke pdftotext with the given path * Invoke pdftotext with the given File object
* *
* @param string $path * @param File $file
* @return string Output * @return string Output
* @throws Exception * @throws Exception
*/ */
protected function getRawOutput($path) protected function getRawOutput(File $file)
{ {
if (!$this->isAvailable()) { if (!$this->isAvailable()) {
throw new Exception("getRawOutput called on unavailable extractor"); throw new Exception("getRawOutput called on unavailable extractor");
} }
$path = $this->getPathFromFile($file);
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err); exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
if ($err) { if ($err) {
if (!is_array($err) && $err == 1) { if (!is_array($err) && $err == 1) {

View File

@ -6,6 +6,7 @@ use Exception;
use GuzzleHttp\Client; use GuzzleHttp\Client;
use InvalidArgumentException; use InvalidArgumentException;
use Psr\Log\LoggerInterface; use Psr\Log\LoggerInterface;
use SilverStripe\Assets\File;
use SilverStripe\Core\Injector\Injector; use SilverStripe\Core\Injector\Injector;
/** /**
@ -98,18 +99,18 @@ class SolrCellTextExtractor extends FileTextExtractor
} }
/** /**
* @param string $path * @param File $file
* @return string * @return string
* @throws InvalidArgumentException * @throws InvalidArgumentException
*/ */
public function getContent($path) public function getContent(File $file)
{ {
if (!$path) { if (!$file) {
// no file // no file
return ''; return '';
} }
$fileName = basename($path); $fileName = $file->getFilename();
$client = $this->getHttpClient(); $client = $this->getHttpClient();
// Get and validate base URL // Get and validate base URL
@ -119,6 +120,7 @@ class SolrCellTextExtractor extends FileTextExtractor
} }
try { try {
$path = $this->getPathFromFile($file);
$request = $client $request = $client
->post($baseUrl) ->post($baseUrl)
->addPostFields(['extractOnly' => 'true', 'extractFormat' => 'text']) ->addPostFields(['extractOnly' => 'true', 'extractFormat' => 'text'])
@ -127,7 +129,7 @@ class SolrCellTextExtractor extends FileTextExtractor
} catch (InvalidArgumentException $e) { } catch (InvalidArgumentException $e) {
$msg = sprintf( $msg = sprintf(
'Error extracting text from "%s" (message: %s)', 'Error extracting text from "%s" (message: %s)',
$path, $fileName,
$e->getMessage() $e->getMessage()
); );
Injector::inst()->get(LoggerInterface::class)->notice($msg); Injector::inst()->get(LoggerInterface::class)->notice($msg);

View File

@ -2,6 +2,7 @@
namespace SilverStripe\TextExtraction\Extractor; namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Assets\File;
use SilverStripe\Core\Environment; use SilverStripe\Core\Environment;
use SilverStripe\Core\Injector\Injector; use SilverStripe\Core\Injector\Injector;
use SilverStripe\TextExtraction\Rest\TikaRestClient; use SilverStripe\TextExtraction\Rest\TikaRestClient;
@ -123,8 +124,9 @@ class TikaServerTextExtractor extends FileTextExtractor
return false; return false;
} }
public function getContent($path) public function getContent(File $file)
{ {
return $this->getClient()->tika($path); $tempFile = $this->getPathFromFile($file);
return $this->getClient()->tika($tempFile);
} }
} }

View File

@ -2,6 +2,8 @@
namespace SilverStripe\TextExtraction\Extractor; namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Assets\File;
/** /**
* Enables text extraction of file content via the Tika CLI * Enables text extraction of file content via the Tika CLI
* *
@ -72,13 +74,10 @@ class TikaTextExtractor extends FileTextExtractor
return proc_close($proc); return proc_close($proc);
} }
/** public function getContent(File $file)
* @param string $path
* @return string
*/
public function getContent($path)
{ {
$mode = $this->config()->output_mode; $mode = $this->config()->get('output_mode');
$path = $this->getPathFromFile($file);
$command = sprintf('tika %s %s', $mode, escapeshellarg($path)); $command = sprintf('tika %s %s', $mode, escapeshellarg($path));
$code = $this->runShell($command, $output); $code = $this->runShell($command, $output);

View File

@ -23,31 +23,36 @@ class FileTextExtractableTest extends SapphireTest
// Ensure that html is a valid extension // Ensure that html is a valid extension
Config::modify()->merge(File::class, 'allowed_extensions', ['html']); Config::modify()->merge(File::class, 'allowed_extensions', ['html']);
}
public function testExtractFileAsText()
{
// Create a copy of the file, as it may be clobbered by the test // Create a copy of the file, as it may be clobbered by the test
// ($file->extractFileAsText() calls $file->write) // ($file->extractFileAsText() calls $file->write)
copy( copy(
dirname(__FILE__) . '/fixtures/test1.html', dirname(__FILE__) . '/fixtures/test1.html',
dirname(__FILE__) . '/fixtures/test1-copy.html' dirname(__FILE__) . '/fixtures/test1-copy.html'
); );
}
// Use HTML, since the extractor is always available protected function tearDown()
$file = new File([ {
'Name' => 'test1-copy.html',
'Filename' => dirname(__FILE__) . '/fixtures/test1-copy.html'
]);
$file->write();
$content = $file->extractFileAsText();
$this->assertContains('Test Headline', $content);
$this->assertContains('Test Text', $content);
$this->assertEquals($content, $file->FileContentCache);
if (file_exists(dirname(__FILE__) . '/fixtures/test1-copy.html')) { if (file_exists(dirname(__FILE__) . '/fixtures/test1-copy.html')) {
unlink(dirname(__FILE__) . '/fixtures/test1-copy.html'); unlink(dirname(__FILE__) . '/fixtures/test1-copy.html');
} }
parent::tearDown();
}
public function testExtractFileAsText()
{
// Use HTML, since the extractor is always available
/** @var File|FileTextExtractable $file */
$file = new File(['Name' => 'test1-copy.html']);
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1-copy.html');
$file->write();
$content = $file->extractFileAsText();
$this->assertNotNull($content);
$this->assertContains('Test Headline', $content);
$this->assertContains('Test Text', $content);
$this->assertEquals($content, $file->FileContentCache);
} }
} }

View File

@ -2,16 +2,31 @@
namespace SilverStripe\TextExtraction\Tests; namespace SilverStripe\TextExtraction\Tests;
use SilverStripe\Assets\File;
use SilverStripe\Core\Config\Config;
use SilverStripe\Dev\SapphireTest; use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Extractor\HTMLTextExtractor; use SilverStripe\TextExtraction\Extractor\HTMLTextExtractor;
class HTMLTextExtractorTest extends SapphireTest class HTMLTextExtractorTest extends SapphireTest
{ {
protected $usesDatabase = true;
/**
* Prepare configuration before each test in this class runs.
*/
protected function setUp()
{
parent::setUp();
// Ensure that html is a valid File extension; testExtraction() stores an .html fixture as a File
Config::modify()->merge(File::class, 'allowed_extensions', ['html']);
}
public function testExtraction() public function testExtraction()
{ {
$extractor = new HTMLTextExtractor(); $extractor = new HTMLTextExtractor();
$content = $extractor->getContent(dirname(__FILE__) . '/fixtures/test1.html'); $file = new File();
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.html');
$file->write();
$content = $extractor->getContent($file);
$this->assertContains('Test Headline', $content); $this->assertContains('Test Headline', $content);
$this->assertNotContains('Test Comment', $content, 'Strips HTML comments'); $this->assertNotContains('Test Comment', $content, 'Strips HTML comments');

View File

@ -2,12 +2,15 @@
namespace SilverStripe\TextExtraction\Tests; namespace SilverStripe\TextExtraction\Tests;
use SilverStripe\Assets\File;
use SilverStripe\Dev\SapphireTest; use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception; use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
use SilverStripe\TextExtraction\Extractor\PDFTextExtractor; use SilverStripe\TextExtraction\Extractor\PDFTextExtractor;
class PDFTextExtractorTest extends SapphireTest class PDFTextExtractorTest extends SapphireTest
{ {
protected $usesDatabase = true;
public function testExtraction() public function testExtraction()
{ {
$extractor = new PDFTextExtractor(); $extractor = new PDFTextExtractor();
@ -16,7 +19,11 @@ class PDFTextExtractorTest extends SapphireTest
$this->expectExceptionMessage('getRawOutput called on unavailable extractor'); $this->expectExceptionMessage('getRawOutput called on unavailable extractor');
} }
$content = $extractor->getContent(dirname(__FILE__) . '/fixtures/test1.pdf'); $file = new File();
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.pdf');
$file->write();
$content = $extractor->getContent($file);
$this->assertContains('This is a test file with a link', $content); $this->assertContains('This is a test file with a link', $content);
} }
} }