API FileTextExtractable::getContent now takes a File instance instead of a path

2024-10-22 09:06:00 +00:00 · 2018-07-03 15:55:02 +12:00 · 2018-07-03 15:55:02 +12:00 · edb02e9189
commit edb02e9189
parent 8bd019b2aa
12 changed files with 138 additions and 61 deletions
--- a/_config/cache.yml
+++ b/_config/cache.yml
@ -3,9 +3,8 @@ Name: textextractioncache
 After:
  - '#corecache'
 ---
-  
 SilverStripe\Core\Injector\Injector:
  Psr\SimpleCache\CacheInterface.FileTextCache_Cache:
    factory: SilverStripe\Core\Cache\CacheFactory
    constructor:
-      namespace: 'FileTextCache_Cache'
+      namespace: 'FileTextCache_Cache'
--- a/_config/config.yml
+++ b/_config/config.yml
@ -0,0 +1,10 @@
+---
+Name: textextractionconfig
+---
+SilverStripe\Core\Injector\Injector:
+  # Define default FileTextCache implementation
+  SilverStripe\TextExtraction\Cache\FileTextCache:
+    class: SilverStripe\TextExtraction\Cache\FileTextCache\Database
+
+SilverStripe\TextExtraction\Cache\FileTextCache\Database:
+  max_content_length: 500000
--- a/src/Extension/FileTextExtractable.php
+++ b/src/Extension/FileTextExtractable.php
@ -2,7 +2,7 @@

 namespace SilverStripe\TextExtraction\Extension;

-use SilverStripe\Control\Director;
+use SilverStripe\Assets\File;
 use SilverStripe\ORM\DataExtension;
 use SilverStripe\TextExtraction\Cache\FileTextCache;
 use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
@ -14,12 +14,10 @@ use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
 * Adds an additional property which is the cached contents, which is populated on demand.
 *
 * @author mstephens
- *
 */
 class FileTextExtractable extends DataExtension
 {
    /**
-     *
     * @var array
     * @config
     */
@ -28,7 +26,6 @@ class FileTextExtractable extends DataExtension
    ];

    /**
-     *
     * @var array
     * @config
     */
@ -37,12 +34,11 @@ class FileTextExtractable extends DataExtension
    ];

    /**
-     *
     * @var array
     * @config
     */
    private static $dependencies = [
-        'TextCache' => FileTextCache\Cache::class,
+        'TextCache' => '%$' . FileTextCache::class,
    ];

    /**
@ -51,7 +47,6 @@ class FileTextExtractable extends DataExtension
    protected $fileTextCache = null;

    /**
-     *
     * @param  FileTextCache $cache
     * @return $this
     */
@ -90,27 +85,28 @@ class FileTextExtractable extends DataExtension
     */
    public function extractFileAsText($disableCache = false)
    {
+        /** @var File $file */
+        $file = $this->owner;
        if (!$disableCache) {
-            $text = $this->getTextCache()->load($this->owner);
+            $text = $this->getTextCache()->load($file);
            if ($text) {
                return $text;
            }
        }

        // Determine which extractor can process this file.
-        $path = Director::baseFolder() . '/' . $this->owner->getFilename();
-        $extractor = FileTextExtractor::for_file($path);
+        $extractor = FileTextExtractor::for_file($file);
        if (!$extractor) {
            return null;
        }

-        $text = $extractor->getContent($path);
+        $text = $extractor->getContent($file);
        if (!$text) {
            return null;
        }

        if (!$disableCache) {
-            $this->getTextCache()->save($this->owner, $text);
+            $this->getTextCache()->save($file, $text);
        }

        return $text;
--- a/src/Extractor/FileTextExtractor.php
+++ b/src/Extractor/FileTextExtractor.php
@ -2,10 +2,12 @@

 namespace SilverStripe\TextExtraction\Extractor;

+use SilverStripe\Assets\File;
 use SilverStripe\Core\ClassInfo;
 use SilverStripe\Core\Config\Config;
 use SilverStripe\Core\Config\Configurable;
 use SilverStripe\Core\Injector\Injector;
+use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;

 /**
 * A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents.
@ -83,17 +85,19 @@ abstract class FileTextExtractor
    }

    /**
-     * @param  string $path
+     * Given a File object, decide which extractor instance to use to handle it
+     *
+     * @param File $file
     * @return FileTextExtractor|null
     */
-    public static function for_file($path)
+    public static function for_file(File $file)
    {
-        if (!file_exists($path) || is_dir($path)) {
+        if (!$file) {
            return null;
        }

-        $extension = pathinfo($path, PATHINFO_EXTENSION);
-        $mime = self::get_mime($path);
+        $extension = $file->getExtension();
+        $mime = $file->getMimeType();

        foreach (self::get_extractor_classes() as $className) {
            $extractor = self::get_extractor($className);
@ -115,6 +119,37 @@ abstract class FileTextExtractor
        }
    }

+    /**
+     * Some text extractors (like pdftotext) may require a physical file to read from, so write the current
+     * file contents to a temp file and return its path.
+     *
+     * The temp file is created under TEMP_PATH and, when the File has an extension, carries that extension.
+     * This method does not delete the file it returns; the caller is expected to remove it when finished.
+     *
+     * @param File $file
+     * @return string Path of the temp file holding a copy of the file's contents
+     * @throws Exception If no temp file name could be allocated, or the contents could not be written
+     */
+    protected function getPathFromFile(File $file)
+    {
+        $path = tempnam(TEMP_PATH, 'pdftextextractor_');
+        if (false === $path) {
+            throw new Exception(static::class . '->getPathFromFile() could not allocate temporary file name');
+        }
+
+        // Append extension to temp file if one is set
+        $extension = $file->getExtension();
+        if ($extension) {
+            // tempnam() has already created an empty file at $path. Remove it first, otherwise it is
+            // left behind once the extension is appended and $path no longer refers to it
+            unlink($path);
+            $path .= '.' . $extension;
+        }
+
+        // Remove any existing temp file with this name. The existence check avoids the warning that
+        // unlink() raises for a missing file, which is the normal case after appending an extension
+        if (file_exists($path)) {
+            unlink($path);
+        }
+
+        $bytesWritten = file_put_contents($path, $file->getStream());
+        if (false === $bytesWritten) {
+            // Do not leave a partially written file behind
+            if (file_exists($path)) {
+                unlink($path);
+            }
+            throw new Exception(static::class . '->getPathFromFile() failed to write temporary file');
+        }
+
+        return $path;
+    }
+
    /**
     * Checks if the extractor is supported on the current environment,
     * for example if the correct binaries or libraries are available.
@ -142,10 +177,10 @@ abstract class FileTextExtractor
    abstract public function supportsMime($mime);

    /**
-     * Given a file path, extract the contents as text.
+     * Given a File instance, extract the contents as text.
     *
-     * @param string $path
+     * @param File $file
     * @return string
     */
-    abstract public function getContent($path);
+    abstract public function getContent(File $file);
 }
--- a/src/Extractor/HTMLTextExtractor.php
+++ b/src/Extractor/HTMLTextExtractor.php
@ -2,6 +2,8 @@

 namespace SilverStripe\TextExtraction\Extractor;

+use SilverStripe\Assets\File;
+
 /**
 * Text extractor that uses php function strip_tags to get just the text. OK for indexing, not
 * the best for readable text.
@ -49,12 +51,13 @@ class HTMLTextExtractor extends FileTextExtractor
     * combined with regular expressions to remove non-content tags like <style> or <script>,
     * as well as adding line breaks after block tags.
     *
-     * @param string $path
+     * @param File $file
     * @return string
     */
-    public function getContent($path)
+    public function getContent(File $file)
    {
-        $content = file_get_contents($path);
+        $content = $file->getString();
+
        // Yes, yes, regex'ing HTML is evil.
        // Since we don't care about well-formedness or markup here, it does the job.
        $content = preg_replace(
--- a/src/Extractor/PDFTextExtractor.php
+++ b/src/Extractor/PDFTextExtractor.php
@ -2,7 +2,9 @@

 namespace SilverStripe\TextExtraction\Extractor;

+use SilverStripe\Assets\File;
 use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
+use function tempnam;

 /**
 * Text extractor that calls pdftotext to do the conversion.
@ -83,28 +85,30 @@ class PDFTextExtractor extends FileTextExtractor
        return null;
    }

+    /**
+     * Extract the text of the given File: the raw output of getRawOutput() is passed through
+     * cleanupLigatures() and returned.
+     *
+     * @param File $file
+     * @return string Cleaned-up text, or an empty string when $file is falsy
+     * @throws Exception Raised by getRawOutput() when the extractor is unavailable
+     */
-    public function getContent($path)
+    public function getContent(File $file)
    {
-        if (!$path) {
+        // NOTE(review): with the non-nullable File type declaration this guard looks unreachable
+        // (null would be rejected before the body runs) - confirm before relying on it
+        if (!$file) {
            // no file
            return '';
        }
-        $content = $this->getRawOutput($path);
+        $content = $this->getRawOutput($file);
        return $this->cleanupLigatures($content);
    }

    /**
-     * Invoke pdftotext with the given path
+     * Invoke pdftotext with the given File object
     *
-     * @param  string $path
+     * @param  File $file
     * @return string Output
     * @throws Exception
     */
-    protected function getRawOutput($path)
+    protected function getRawOutput(File $file)
    {
        if (!$this->isAvailable()) {
            throw new Exception("getRawOutput called on unavailable extractor");
        }
+
+        $path = $this->getPathFromFile($file);
        exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
        if ($err) {
            if (!is_array($err) && $err == 1) {
--- a/src/Extractor/SolrCellTextExtractor.php
+++ b/src/Extractor/SolrCellTextExtractor.php
@ -6,6 +6,7 @@ use Exception;
 use GuzzleHttp\Client;
 use InvalidArgumentException;
 use Psr\Log\LoggerInterface;
+use SilverStripe\Assets\File;
 use SilverStripe\Core\Injector\Injector;

 /**
@ -98,18 +99,18 @@ class SolrCellTextExtractor extends FileTextExtractor
    }

    /**
-     * @param string $path
+     * @param File $file
     * @return string
     * @throws InvalidArgumentException
     */
-    public function getContent($path)
+    public function getContent(File $file)
    {
-        if (!$path) {
+        if (!$file) {
            // no file
            return '';
        }

-        $fileName = basename($path);
+        $fileName = $file->getFilename();
        $client = $this->getHttpClient();

        // Get and validate base URL
@ -119,6 +120,7 @@ class SolrCellTextExtractor extends FileTextExtractor
        }

        try {
+            $path = $this->getPathFromFile($file);
            $request = $client
                ->post($baseUrl)
                ->addPostFields(['extractOnly' => 'true', 'extractFormat' => 'text'])
@ -127,7 +129,7 @@ class SolrCellTextExtractor extends FileTextExtractor
        } catch (InvalidArgumentException $e) {
            $msg = sprintf(
                'Error extracting text from "%s" (message: %s)',
-                $path,
+                $fileName,
                $e->getMessage()
            );
            Injector::inst()->get(LoggerInterface::class)->notice($msg);
--- a/src/Extractor/TikaServerTextExtractor.php
+++ b/src/Extractor/TikaServerTextExtractor.php
@ -2,6 +2,7 @@

 namespace SilverStripe\TextExtraction\Extractor;

+use SilverStripe\Assets\File;
 use SilverStripe\Core\Environment;
 use SilverStripe\Core\Injector\Injector;
 use SilverStripe\TextExtraction\Rest\TikaRestClient;
@ -123,8 +124,9 @@ class TikaServerTextExtractor extends FileTextExtractor
        return false;
    }

+    /**
+     * Extract the text of the given File by handing a temporary on-disk copy of it to the Tika client.
+     *
+     * The temporary copy is removed again once the client call has finished, whether it returned
+     * or threw.
+     *
+     * @param File $file
+     * @return string Whatever the Tika client's tika() call returns for the file
+     */
-    public function getContent($path)
+    public function getContent(File $file)
    {
-        return $this->getClient()->tika($path);
+        $tempFile = $this->getPathFromFile($file);
+        try {
+            return $this->getClient()->tika($tempFile);
+        } finally {
+            // getPathFromFile() does not delete the copy it writes, so clean it up here to avoid
+            // accumulating one temp file per extraction
+            if (file_exists($tempFile)) {
+                unlink($tempFile);
+            }
+        }
    }
 }
--- a/src/Extractor/TikaTextExtractor.php
+++ b/src/Extractor/TikaTextExtractor.php
@ -2,6 +2,8 @@

 namespace SilverStripe\TextExtraction\Extractor;

+use SilverStripe\Assets\File;
+
 /**
 * Enables text extraction of file content via the Tika CLI
 *
@ -72,13 +74,10 @@ class TikaTextExtractor extends FileTextExtractor
        return proc_close($proc);
    }

-    /**
-     * @param  string $path
-     * @return string
-     */
-    public function getContent($path)
+    public function getContent(File $file)
    {
-        $mode = $this->config()->output_mode;
+        $mode = $this->config()->get('output_mode');
+        $path = $this->getPathFromFile($file);
        $command = sprintf('tika %s %s', $mode, escapeshellarg($path));
        $code = $this->runShell($command, $output);

--- a/tests/FileTextExtractableTest.php
+++ b/tests/FileTextExtractableTest.php
@ -23,31 +23,36 @@ class FileTextExtractableTest extends SapphireTest

        // Ensure that html is a valid extension
        Config::modify()->merge(File::class, 'allowed_extensions', ['html']);
-    }

-    public function testExtractFileAsText()
-    {
        // Create a copy of the file, as it may be clobbered by the test
        // ($file->extractFileAsText() calls $file->write)
        copy(
            dirname(__FILE__) . '/fixtures/test1.html',
            dirname(__FILE__) . '/fixtures/test1-copy.html'
        );
+    }

-        // Use HTML, since the extractor is always available
-        $file = new File([
-            'Name' => 'test1-copy.html',
-            'Filename' => dirname(__FILE__) . '/fixtures/test1-copy.html'
-        ]);
-        $file->write();
-
-        $content = $file->extractFileAsText();
-        $this->assertContains('Test Headline', $content);
-        $this->assertContains('Test Text', $content);
-        $this->assertEquals($content, $file->FileContentCache);
-
+    /**
+     * Remove the fixture copy (fixtures/test1-copy.html) if it is still present, then run the
+     * parent tear-down.
+     */
+    protected function tearDown()
+    {
+        // The copy may already be gone, so only unlink it when it exists
        if (file_exists(dirname(__FILE__) . '/fixtures/test1-copy.html')) {
            unlink(dirname(__FILE__) . '/fixtures/test1-copy.html');
        }
+
+        parent::tearDown();
+    }
+
+    /**
+     * Builds a File from the copied HTML fixture, extracts its text and checks that the expected
+     * strings are present and that the result equals the file's FileContentCache value.
+     */
+    public function testExtractFileAsText()
+    {
+        // Use HTML, since the extractor is always available
+        /** @var File|FileTextExtractable $file */
+        $file = new File(['Name' => 'test1-copy.html']);
+        // Load the fixture copy into the File record before writing it
+        $file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1-copy.html');
+        $file->write();
+
+        $content = $file->extractFileAsText();
+        // extractFileAsText() returns null when no extractor matches or no text is produced
+        $this->assertNotNull($content);
+        $this->assertContains('Test Headline', $content);
+        $this->assertContains('Test Text', $content);
+        $this->assertEquals($content, $file->FileContentCache);
    }
 }
--- a/tests/HTMLTextExtractorTest.php
+++ b/tests/HTMLTextExtractorTest.php
@ -2,16 +2,31 @@

 namespace SilverStripe\TextExtraction\Tests;

+use SilverStripe\Assets\File;
+use SilverStripe\Core\Config\Config;
 use SilverStripe\Dev\SapphireTest;
 use SilverStripe\TextExtraction\Extractor\HTMLTextExtractor;

 class HTMLTextExtractorTest extends SapphireTest
 {
+    protected $usesDatabase = true;
+
+    /**
+     * Runs the parent set-up, then merges 'html' into File's allowed_extensions config so the
+     * HTML fixture can be used as a File in this test.
+     */
+    protected function setUp()
+    {
+        parent::setUp();
+
+        // Ensure that html is a valid extension
+        Config::modify()->merge(File::class, 'allowed_extensions', ['html']);
+    }
+
    public function testExtraction()
    {
        $extractor = new HTMLTextExtractor();

-        $content = $extractor->getContent(dirname(__FILE__) . '/fixtures/test1.html');
+        $file = new File();
+        $file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.html');
+        $file->write();
+
+        $content = $extractor->getContent($file);

        $this->assertContains('Test Headline', $content);
        $this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
--- a/tests/PDFTextExtractorTest.php
+++ b/tests/PDFTextExtractorTest.php
@ -2,12 +2,15 @@

 namespace SilverStripe\TextExtraction\Tests;

+use SilverStripe\Assets\File;
 use SilverStripe\Dev\SapphireTest;
 use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
 use SilverStripe\TextExtraction\Extractor\PDFTextExtractor;

 class PDFTextExtractorTest extends SapphireTest
 {
+    protected $usesDatabase = true;
+
    public function testExtraction()
    {
        $extractor = new PDFTextExtractor();
@ -16,7 +19,11 @@ class PDFTextExtractorTest extends SapphireTest
            $this->expectExceptionMessage('getRawOutput called on unavailable extractor');
        }

-        $content = $extractor->getContent(dirname(__FILE__) . '/fixtures/test1.pdf');
+        $file = new File();
+        $file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.pdf');
+        $file->write();
+
+        $content = $extractor->getContent($file);
        $this->assertContains('This is a test file with a link', $content);
    }
 }