mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 11:06:00 +02:00
API FileTextExtractable::getContent now takes a File instance instead of a path
This commit is contained in:
parent
8bd019b2aa
commit
edb02e9189
@ -3,7 +3,6 @@ Name: textextractioncache
|
|||||||
After:
|
After:
|
||||||
- '#corecache'
|
- '#corecache'
|
||||||
---
|
---
|
||||||
|
|
||||||
SilverStripe\Core\Injector\Injector:
|
SilverStripe\Core\Injector\Injector:
|
||||||
Psr\SimpleCache\CacheInterface.FileTextCache_Cache:
|
Psr\SimpleCache\CacheInterface.FileTextCache_Cache:
|
||||||
factory: SilverStripe\Core\Cache\CacheFactory
|
factory: SilverStripe\Core\Cache\CacheFactory
|
||||||
|
10
_config/config.yml
Normal file
10
_config/config.yml
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
---
|
||||||
|
Name: textextractionconfig
|
||||||
|
---
|
||||||
|
SilverStripe\Core\Injector\Injector:
|
||||||
|
# Define default FileTextCache implementation
|
||||||
|
SilverStripe\TextExtraction\Cache\FileTextCache:
|
||||||
|
class: SilverStripe\TextExtraction\Cache\FileTextCache\Database
|
||||||
|
|
||||||
|
SilverStripe\TextExtraction\Cache\FileTextCache\Database:
|
||||||
|
max_content_length: 500000
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Extension;
|
namespace SilverStripe\TextExtraction\Extension;
|
||||||
|
|
||||||
use SilverStripe\Control\Director;
|
use SilverStripe\Assets\File;
|
||||||
use SilverStripe\ORM\DataExtension;
|
use SilverStripe\ORM\DataExtension;
|
||||||
use SilverStripe\TextExtraction\Cache\FileTextCache;
|
use SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
||||||
@ -14,12 +14,10 @@ use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
|||||||
* Adds an additional property which is the cached contents, which is populated on demand.
|
* Adds an additional property which is the cached contents, which is populated on demand.
|
||||||
*
|
*
|
||||||
* @author mstephens
|
* @author mstephens
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
class FileTextExtractable extends DataExtension
|
class FileTextExtractable extends DataExtension
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* @var array
|
* @var array
|
||||||
* @config
|
* @config
|
||||||
*/
|
*/
|
||||||
@ -28,7 +26,6 @@ class FileTextExtractable extends DataExtension
|
|||||||
];
|
];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* @var array
|
* @var array
|
||||||
* @config
|
* @config
|
||||||
*/
|
*/
|
||||||
@ -37,12 +34,11 @@ class FileTextExtractable extends DataExtension
|
|||||||
];
|
];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* @var array
|
* @var array
|
||||||
* @config
|
* @config
|
||||||
*/
|
*/
|
||||||
private static $dependencies = [
|
private static $dependencies = [
|
||||||
'TextCache' => FileTextCache\Cache::class,
|
'TextCache' => '%$' . FileTextCache::class,
|
||||||
];
|
];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -51,7 +47,6 @@ class FileTextExtractable extends DataExtension
|
|||||||
protected $fileTextCache = null;
|
protected $fileTextCache = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* @param FileTextCache $cache
|
* @param FileTextCache $cache
|
||||||
* @return $this
|
* @return $this
|
||||||
*/
|
*/
|
||||||
@ -90,27 +85,28 @@ class FileTextExtractable extends DataExtension
|
|||||||
*/
|
*/
|
||||||
public function extractFileAsText($disableCache = false)
|
public function extractFileAsText($disableCache = false)
|
||||||
{
|
{
|
||||||
|
/** @var File $file */
|
||||||
|
$file = $this->owner;
|
||||||
if (!$disableCache) {
|
if (!$disableCache) {
|
||||||
$text = $this->getTextCache()->load($this->owner);
|
$text = $this->getTextCache()->load($file);
|
||||||
if ($text) {
|
if ($text) {
|
||||||
return $text;
|
return $text;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Determine which extractor can process this file.
|
// Determine which extractor can process this file.
|
||||||
$path = Director::baseFolder() . '/' . $this->owner->getFilename();
|
$extractor = FileTextExtractor::for_file($file);
|
||||||
$extractor = FileTextExtractor::for_file($path);
|
|
||||||
if (!$extractor) {
|
if (!$extractor) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
$text = $extractor->getContent($path);
|
$text = $extractor->getContent($file);
|
||||||
if (!$text) {
|
if (!$text) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!$disableCache) {
|
if (!$disableCache) {
|
||||||
$this->getTextCache()->save($this->owner, $text);
|
$this->getTextCache()->save($file, $text);
|
||||||
}
|
}
|
||||||
|
|
||||||
return $text;
|
return $text;
|
||||||
|
@ -2,10 +2,12 @@
|
|||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Extractor;
|
namespace SilverStripe\TextExtraction\Extractor;
|
||||||
|
|
||||||
|
use SilverStripe\Assets\File;
|
||||||
use SilverStripe\Core\ClassInfo;
|
use SilverStripe\Core\ClassInfo;
|
||||||
use SilverStripe\Core\Config\Config;
|
use SilverStripe\Core\Config\Config;
|
||||||
use SilverStripe\Core\Config\Configurable;
|
use SilverStripe\Core\Config\Configurable;
|
||||||
use SilverStripe\Core\Injector\Injector;
|
use SilverStripe\Core\Injector\Injector;
|
||||||
|
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents.
|
* A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents.
|
||||||
@ -83,17 +85,19 @@ abstract class FileTextExtractor
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param string $path
|
* Given a File object, decide which extractor instance to use to handle it
|
||||||
|
*
|
||||||
|
* @param File $file
|
||||||
* @return FileTextExtractor|null
|
* @return FileTextExtractor|null
|
||||||
*/
|
*/
|
||||||
public static function for_file($path)
|
public static function for_file(File $file)
|
||||||
{
|
{
|
||||||
if (!file_exists($path) || is_dir($path)) {
|
if (!$file) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
$extension = pathinfo($path, PATHINFO_EXTENSION);
|
$extension = $file->getExtension();
|
||||||
$mime = self::get_mime($path);
|
$mime = $file->getMimeType();
|
||||||
|
|
||||||
foreach (self::get_extractor_classes() as $className) {
|
foreach (self::get_extractor_classes() as $className) {
|
||||||
$extractor = self::get_extractor($className);
|
$extractor = self::get_extractor($className);
|
||||||
@ -115,6 +119,37 @@ abstract class FileTextExtractor
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Some text extractors (like pdftotext) may require a physical file to read from, so write the current
|
||||||
|
* file contents to a temp file and return its path
|
||||||
|
*
|
||||||
|
* @param File $file
|
||||||
|
* @return string
|
||||||
|
* @throws Exception
|
||||||
|
*/
|
||||||
|
protected function getPathFromFile(File $file)
|
||||||
|
{
|
||||||
|
$path = tempnam(TEMP_PATH, 'pdftextextractor_');
|
||||||
|
if (false === $path) {
|
||||||
|
throw new Exception(static::class . '->getPathFromFile() could not allocate temporary file name');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Append extension to temp file if one is set
|
||||||
|
if ($file->getExtension()) {
|
||||||
|
$path .= '.' . $file->getExtension();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove any existing temp files with this name
|
||||||
|
unlink($path);
|
||||||
|
|
||||||
|
$bytesWritten = file_put_contents($path, $file->getStream());
|
||||||
|
if (false === $bytesWritten) {
|
||||||
|
throw new Exception(static::class . '->getPathFromFile() failed to write temporary file');
|
||||||
|
}
|
||||||
|
|
||||||
|
return $path;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks if the extractor is supported on the current environment,
|
* Checks if the extractor is supported on the current environment,
|
||||||
* for example if the correct binaries or libraries are available.
|
* for example if the correct binaries or libraries are available.
|
||||||
@ -142,10 +177,10 @@ abstract class FileTextExtractor
|
|||||||
abstract public function supportsMime($mime);
|
abstract public function supportsMime($mime);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Given a file path, extract the contents as text.
|
* Given a File instance, extract the contents as text.
|
||||||
*
|
*
|
||||||
* @param string $path
|
* @param File $file
|
||||||
* @return string
|
* @return string
|
||||||
*/
|
*/
|
||||||
abstract public function getContent($path);
|
abstract public function getContent(File $file);
|
||||||
}
|
}
|
||||||
|
@ -2,6 +2,8 @@
|
|||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Extractor;
|
namespace SilverStripe\TextExtraction\Extractor;
|
||||||
|
|
||||||
|
use SilverStripe\Assets\File;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Text extractor that uses php function strip_tags to get just the text. OK for indexing, not
|
* Text extractor that uses php function strip_tags to get just the text. OK for indexing, not
|
||||||
* the best for readable text.
|
* the best for readable text.
|
||||||
@ -49,12 +51,13 @@ class HTMLTextExtractor extends FileTextExtractor
|
|||||||
* combined with regular expressions to remove non-content tags like <style> or <script>,
|
* combined with regular expressions to remove non-content tags like <style> or <script>,
|
||||||
* as well as adding line breaks after block tags.
|
* as well as adding line breaks after block tags.
|
||||||
*
|
*
|
||||||
* @param string $path
|
* @param File $file
|
||||||
* @return string
|
* @return string
|
||||||
*/
|
*/
|
||||||
public function getContent($path)
|
public function getContent(File $file)
|
||||||
{
|
{
|
||||||
$content = file_get_contents($path);
|
$content = $file->getString();
|
||||||
|
|
||||||
// Yes, yes, regex'ing HTML is evil.
|
// Yes, yes, regex'ing HTML is evil.
|
||||||
// Since we don't care about well-formedness or markup here, it does the job.
|
// Since we don't care about well-formedness or markup here, it does the job.
|
||||||
$content = preg_replace(
|
$content = preg_replace(
|
||||||
|
@ -2,7 +2,9 @@
|
|||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Extractor;
|
namespace SilverStripe\TextExtraction\Extractor;
|
||||||
|
|
||||||
|
use SilverStripe\Assets\File;
|
||||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
|
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
|
||||||
|
use function tempnam;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Text extractor that calls pdftotext to do the conversion.
|
* Text extractor that calls pdftotext to do the conversion.
|
||||||
@ -83,28 +85,30 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getContent($path)
|
public function getContent(File $file)
|
||||||
{
|
{
|
||||||
if (!$path) {
|
if (!$file) {
|
||||||
// no file
|
// no file
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
$content = $this->getRawOutput($path);
|
$content = $this->getRawOutput($file);
|
||||||
return $this->cleanupLigatures($content);
|
return $this->cleanupLigatures($content);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Invoke pdftotext with the given path
|
* Invoke pdftotext with the given File object
|
||||||
*
|
*
|
||||||
* @param string $path
|
* @param File $file
|
||||||
* @return string Output
|
* @return string Output
|
||||||
* @throws Exception
|
* @throws Exception
|
||||||
*/
|
*/
|
||||||
protected function getRawOutput($path)
|
protected function getRawOutput(File $file)
|
||||||
{
|
{
|
||||||
if (!$this->isAvailable()) {
|
if (!$this->isAvailable()) {
|
||||||
throw new Exception("getRawOutput called on unavailable extractor");
|
throw new Exception("getRawOutput called on unavailable extractor");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$path = $this->getPathFromFile($file);
|
||||||
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
|
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
|
||||||
if ($err) {
|
if ($err) {
|
||||||
if (!is_array($err) && $err == 1) {
|
if (!is_array($err) && $err == 1) {
|
||||||
|
@ -6,6 +6,7 @@ use Exception;
|
|||||||
use GuzzleHttp\Client;
|
use GuzzleHttp\Client;
|
||||||
use InvalidArgumentException;
|
use InvalidArgumentException;
|
||||||
use Psr\Log\LoggerInterface;
|
use Psr\Log\LoggerInterface;
|
||||||
|
use SilverStripe\Assets\File;
|
||||||
use SilverStripe\Core\Injector\Injector;
|
use SilverStripe\Core\Injector\Injector;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -98,18 +99,18 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param string $path
|
* @param File $file
|
||||||
* @return string
|
* @return string
|
||||||
* @throws InvalidArgumentException
|
* @throws InvalidArgumentException
|
||||||
*/
|
*/
|
||||||
public function getContent($path)
|
public function getContent(File $file)
|
||||||
{
|
{
|
||||||
if (!$path) {
|
if (!$file) {
|
||||||
// no file
|
// no file
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
|
|
||||||
$fileName = basename($path);
|
$fileName = $file->getFilename();
|
||||||
$client = $this->getHttpClient();
|
$client = $this->getHttpClient();
|
||||||
|
|
||||||
// Get and validate base URL
|
// Get and validate base URL
|
||||||
@ -119,6 +120,7 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
$path = $this->getPathFromFile($file);
|
||||||
$request = $client
|
$request = $client
|
||||||
->post($baseUrl)
|
->post($baseUrl)
|
||||||
->addPostFields(['extractOnly' => 'true', 'extractFormat' => 'text'])
|
->addPostFields(['extractOnly' => 'true', 'extractFormat' => 'text'])
|
||||||
@ -127,7 +129,7 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
} catch (InvalidArgumentException $e) {
|
} catch (InvalidArgumentException $e) {
|
||||||
$msg = sprintf(
|
$msg = sprintf(
|
||||||
'Error extracting text from "%s" (message: %s)',
|
'Error extracting text from "%s" (message: %s)',
|
||||||
$path,
|
$fileName,
|
||||||
$e->getMessage()
|
$e->getMessage()
|
||||||
);
|
);
|
||||||
Injector::inst()->get(LoggerInterface::class)->notice($msg);
|
Injector::inst()->get(LoggerInterface::class)->notice($msg);
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Extractor;
|
namespace SilverStripe\TextExtraction\Extractor;
|
||||||
|
|
||||||
|
use SilverStripe\Assets\File;
|
||||||
use SilverStripe\Core\Environment;
|
use SilverStripe\Core\Environment;
|
||||||
use SilverStripe\Core\Injector\Injector;
|
use SilverStripe\Core\Injector\Injector;
|
||||||
use SilverStripe\TextExtraction\Rest\TikaRestClient;
|
use SilverStripe\TextExtraction\Rest\TikaRestClient;
|
||||||
@ -123,8 +124,9 @@ class TikaServerTextExtractor extends FileTextExtractor
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getContent($path)
|
public function getContent(File $file)
|
||||||
{
|
{
|
||||||
return $this->getClient()->tika($path);
|
$tempFile = $this->getPathFromFile($file);
|
||||||
|
return $this->getClient()->tika($tempFile);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,6 +2,8 @@
|
|||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Extractor;
|
namespace SilverStripe\TextExtraction\Extractor;
|
||||||
|
|
||||||
|
use SilverStripe\Assets\File;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Enables text extraction of file content via the Tika CLI
|
* Enables text extraction of file content via the Tika CLI
|
||||||
*
|
*
|
||||||
@ -72,13 +74,10 @@ class TikaTextExtractor extends FileTextExtractor
|
|||||||
return proc_close($proc);
|
return proc_close($proc);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
public function getContent(File $file)
|
||||||
* @param string $path
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
public function getContent($path)
|
|
||||||
{
|
{
|
||||||
$mode = $this->config()->output_mode;
|
$mode = $this->config()->get('output_mode');
|
||||||
|
$path = $this->getPathFromFile($file);
|
||||||
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
|
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
|
||||||
$code = $this->runShell($command, $output);
|
$code = $this->runShell($command, $output);
|
||||||
|
|
||||||
|
@ -23,31 +23,36 @@ class FileTextExtractableTest extends SapphireTest
|
|||||||
|
|
||||||
// Ensure that html is a valid extension
|
// Ensure that html is a valid extension
|
||||||
Config::modify()->merge(File::class, 'allowed_extensions', ['html']);
|
Config::modify()->merge(File::class, 'allowed_extensions', ['html']);
|
||||||
}
|
|
||||||
|
|
||||||
public function testExtractFileAsText()
|
|
||||||
{
|
|
||||||
// Create a copy of the file, as it may be clobbered by the test
|
// Create a copy of the file, as it may be clobbered by the test
|
||||||
// ($file->extractFileAsText() calls $file->write)
|
// ($file->extractFileAsText() calls $file->write)
|
||||||
copy(
|
copy(
|
||||||
dirname(__FILE__) . '/fixtures/test1.html',
|
dirname(__FILE__) . '/fixtures/test1.html',
|
||||||
dirname(__FILE__) . '/fixtures/test1-copy.html'
|
dirname(__FILE__) . '/fixtures/test1-copy.html'
|
||||||
);
|
);
|
||||||
|
}
|
||||||
|
|
||||||
// Use HTML, since the extractor is always available
|
protected function tearDown()
|
||||||
$file = new File([
|
{
|
||||||
'Name' => 'test1-copy.html',
|
|
||||||
'Filename' => dirname(__FILE__) . '/fixtures/test1-copy.html'
|
|
||||||
]);
|
|
||||||
$file->write();
|
|
||||||
|
|
||||||
$content = $file->extractFileAsText();
|
|
||||||
$this->assertContains('Test Headline', $content);
|
|
||||||
$this->assertContains('Test Text', $content);
|
|
||||||
$this->assertEquals($content, $file->FileContentCache);
|
|
||||||
|
|
||||||
if (file_exists(dirname(__FILE__) . '/fixtures/test1-copy.html')) {
|
if (file_exists(dirname(__FILE__) . '/fixtures/test1-copy.html')) {
|
||||||
unlink(dirname(__FILE__) . '/fixtures/test1-copy.html');
|
unlink(dirname(__FILE__) . '/fixtures/test1-copy.html');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
parent::tearDown();
|
||||||
|
}
|
||||||
|
|
||||||
|
public function testExtractFileAsText()
|
||||||
|
{
|
||||||
|
// Use HTML, since the extractor is always available
|
||||||
|
/** @var File|FileTextExtractable $file */
|
||||||
|
$file = new File(['Name' => 'test1-copy.html']);
|
||||||
|
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1-copy.html');
|
||||||
|
$file->write();
|
||||||
|
|
||||||
|
$content = $file->extractFileAsText();
|
||||||
|
$this->assertNotNull($content);
|
||||||
|
$this->assertContains('Test Headline', $content);
|
||||||
|
$this->assertContains('Test Text', $content);
|
||||||
|
$this->assertEquals($content, $file->FileContentCache);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,16 +2,31 @@
|
|||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Tests;
|
namespace SilverStripe\TextExtraction\Tests;
|
||||||
|
|
||||||
|
use SilverStripe\Assets\File;
|
||||||
|
use SilverStripe\Core\Config\Config;
|
||||||
use SilverStripe\Dev\SapphireTest;
|
use SilverStripe\Dev\SapphireTest;
|
||||||
use SilverStripe\TextExtraction\Extractor\HTMLTextExtractor;
|
use SilverStripe\TextExtraction\Extractor\HTMLTextExtractor;
|
||||||
|
|
||||||
class HTMLTextExtractorTest extends SapphireTest
|
class HTMLTextExtractorTest extends SapphireTest
|
||||||
{
|
{
|
||||||
|
protected $usesDatabase = true;
|
||||||
|
|
||||||
|
protected function setUp()
|
||||||
|
{
|
||||||
|
parent::setUp();
|
||||||
|
|
||||||
|
Config::modify()->merge(File::class, 'allowed_extensions', ['html']);
|
||||||
|
}
|
||||||
|
|
||||||
public function testExtraction()
|
public function testExtraction()
|
||||||
{
|
{
|
||||||
$extractor = new HTMLTextExtractor();
|
$extractor = new HTMLTextExtractor();
|
||||||
|
|
||||||
$content = $extractor->getContent(dirname(__FILE__) . '/fixtures/test1.html');
|
$file = new File();
|
||||||
|
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.html');
|
||||||
|
$file->write();
|
||||||
|
|
||||||
|
$content = $extractor->getContent($file);
|
||||||
|
|
||||||
$this->assertContains('Test Headline', $content);
|
$this->assertContains('Test Headline', $content);
|
||||||
$this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
|
$this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
|
||||||
|
@ -2,12 +2,15 @@
|
|||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Tests;
|
namespace SilverStripe\TextExtraction\Tests;
|
||||||
|
|
||||||
|
use SilverStripe\Assets\File;
|
||||||
use SilverStripe\Dev\SapphireTest;
|
use SilverStripe\Dev\SapphireTest;
|
||||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
|
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
|
||||||
use SilverStripe\TextExtraction\Extractor\PDFTextExtractor;
|
use SilverStripe\TextExtraction\Extractor\PDFTextExtractor;
|
||||||
|
|
||||||
class PDFTextExtractorTest extends SapphireTest
|
class PDFTextExtractorTest extends SapphireTest
|
||||||
{
|
{
|
||||||
|
protected $usesDatabase = true;
|
||||||
|
|
||||||
public function testExtraction()
|
public function testExtraction()
|
||||||
{
|
{
|
||||||
$extractor = new PDFTextExtractor();
|
$extractor = new PDFTextExtractor();
|
||||||
@ -16,7 +19,11 @@ class PDFTextExtractorTest extends SapphireTest
|
|||||||
$this->expectExceptionMessage('getRawOutput called on unavailable extractor');
|
$this->expectExceptionMessage('getRawOutput called on unavailable extractor');
|
||||||
}
|
}
|
||||||
|
|
||||||
$content = $extractor->getContent(dirname(__FILE__) . '/fixtures/test1.pdf');
|
$file = new File();
|
||||||
|
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.pdf');
|
||||||
|
$file->write();
|
||||||
|
|
||||||
|
$content = $extractor->getContent($file);
|
||||||
$this->assertContains('This is a test file with a link', $content);
|
$this->assertContains('This is a test file with a link', $content);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user