mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 09:06:00 +00:00
API Implement Tika support
API Implement support for detection via mime-type as well as file extension. API Implement FileContent property for safe usage in templates. API Instead of returning the list of extensions / mime types supported, support is determined on a per-file basis. Marking dev-master as version 2.0 as this contains breaking changes.
This commit is contained in:
parent
526de4586c
commit
2977f85cb5
23
.travis.yml
23
.travis.yml
@ -1,18 +1,21 @@
|
||||
# See https://github.com/silverstripe-labs/silverstripe-travis-support for setup details
|
||||
|
||||
language: php
|
||||
php:
|
||||
- 5.3
|
||||
php:
|
||||
- 5.4
|
||||
|
||||
env:
|
||||
- DB=MYSQL CORE_RELEASE=3.0
|
||||
- DB=MYSQL CORE_RELEASE=3.1
|
||||
- DB=PGSQL CORE_RELEASE=master
|
||||
- DB=MYSQL CORE_RELEASE=3.1
|
||||
- DB=MYSQL CORE_RELEASE=3
|
||||
|
||||
before_script:
|
||||
- git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support
|
||||
- php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss
|
||||
- cd ~/builds/ss
|
||||
- mkdir $HOME/bin
|
||||
- export PATH=$PATH:$HOME/bin
|
||||
- ./.travis/install_tika.sh
|
||||
- sudo ./.travis/install_pdftotext.sh
|
||||
- git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support
|
||||
- php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss
|
||||
- cd ~/builds/ss
|
||||
|
||||
script:
|
||||
- phpunit textextraction/tests/
|
||||
script:
|
||||
- vendor/bin/phpunit --verbose textextraction/tests/
|
||||
|
3
.travis/install_pdftotext.sh
Executable file
3
.travis/install_pdftotext.sh
Executable file
@ -0,0 +1,3 @@
|
||||
#!/usr/bin/env bash
|
||||
apt-get update
|
||||
apt-get install -y xpdf
|
6
.travis/install_tika.sh
Executable file
6
.travis/install_tika.sh
Executable file
@ -0,0 +1,6 @@
|
||||
#!/usr/bin/env bash
|
||||
mkdir $HOME/bin
|
||||
wget 'http://www.us.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar"
|
||||
echo -e '#!/usr/bin/env bash\nexec java -jar $HOME/bin/tika.jar "$@"' >> $HOME/bin/tika
|
||||
chmod ug+x $HOME/bin/tika
|
||||
$HOME/bin/tika --version
|
24
README.md
24
README.md
@ -18,6 +18,7 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
|
||||
* SilverStripe 3.1
|
||||
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
|
||||
* (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
|
||||
* (optional) [Apache Tika](http://tika.apache.org/)
|
||||
|
||||
### Supported Formats
|
||||
|
||||
@ -28,6 +29,7 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
|
||||
* CSV (Solr)
|
||||
* RTF (Solr)
|
||||
* EPub (Solr)
|
||||
* Many others (Tika)
|
||||
|
||||
## Installation
|
||||
|
||||
@ -37,7 +39,7 @@ Add the following to your `composer.json`:
|
||||
```js
|
||||
{
|
||||
"require": {
|
||||
"silverstripe/textextraction": "*"
|
||||
"silverstripe/textextraction": "2.0.x-dev"
|
||||
}
|
||||
}
|
||||
```
|
||||
@ -53,9 +55,13 @@ through PEAR and ensure its in your `include_path`.
|
||||
By default, only extraction from HTML documents is supported.
|
||||
No configuration is required for that, unless you want to make
|
||||
the content available through your `DataObject` subclass.
|
||||
In this case, add the following to `mysite/_config.php`:
|
||||
In this case, add the following to `mysite/_config/config.yml`:
|
||||
|
||||
DataObject::add_extension('File', 'FileTextExtractable');
|
||||
```yaml
|
||||
File:
|
||||
extensions:
|
||||
- FileTextExtractable
|
||||
```
|
||||
|
||||
### XPDF
|
||||
|
||||
@ -108,7 +114,7 @@ The property should be listed in your `SolrIndex` subclass, e.g. as follows:
|
||||
class MySolrIndex extends SolrIndex {
|
||||
function init() {
|
||||
$this->addClass('MyDocument');
|
||||
$this->addFulltextField('Content', 'HTMLText');
|
||||
$this->addStoredField('Content', 'HTMLText');
|
||||
}
|
||||
}
|
||||
```
|
||||
@ -116,6 +122,16 @@ The property should be listed in your `SolrIndex` subclass, e.g. as follows:
|
||||
Note: This isn't a terribly efficient way to process large amounts of files, since
|
||||
each HTTP request is run synchronously.
|
||||
|
||||
### Tika
|
||||
|
||||
Support for Apache Tika (1.7 and above) is included for the standalone command line utility.
|
||||
|
||||
See [the Apache Tika home page](http://tika.apache.org/1.7/index.html) for instructions on installing and
|
||||
configuring this.
|
||||
|
||||
This extension works best with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php)
|
||||
installed to perform mime detection. Tika validates support via mime type rather than file extensions.
|
||||
|
||||
## Usage
|
||||
|
||||
Manual extraction:
|
||||
|
@ -15,16 +15,29 @@ class FileTextExtractable extends DataExtension {
|
||||
'FileContentCache' => 'Text'
|
||||
);
|
||||
|
||||
private static $casting = array(
|
||||
'FileContent' => 'Text'
|
||||
);
|
||||
|
||||
/**
|
||||
* Helper function for template
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
public function getFileContent() {
|
||||
return $this->extractFileAsText();
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text.
|
||||
* The value is also cached into the File record itself.
|
||||
*
|
||||
* @param $forceParse If false, the file content is only parsed on demand. If true, the content parsing is forced, bypassing the
|
||||
* cached version
|
||||
* @return String
|
||||
* @param boolean $disableCache If false, the file content is only parsed on demand.
|
||||
* If true, the content parsing is forced, bypassing the cached version
|
||||
* @return string
|
||||
*/
|
||||
function extractFileAsText($forceParse = false) {
|
||||
if (!$forceParse && $this->owner->FileContentCache) return $this->owner->FileContentCache;
|
||||
public function extractFileAsText($disableCache = false) {
|
||||
if (!$disableCache && $this->owner->FileContentCache) return $this->owner->FileContentCache;
|
||||
|
||||
// Determine which extractor can process this file.
|
||||
$extractor = FileTextExtractor::for_file($this->owner->FullPath);
|
||||
@ -39,5 +52,3 @@ class FileTextExtractable extends DataExtension {
|
||||
return $text;
|
||||
}
|
||||
}
|
||||
|
||||
?>
|
@ -6,40 +6,89 @@
|
||||
*
|
||||
*/
|
||||
abstract class FileTextExtractor extends Object {
|
||||
|
||||
/**
|
||||
* Set priority from 0-100.
|
||||
* The highest priority extractor for a given content type will be selected.
|
||||
*
|
||||
* @config
|
||||
* @var int
|
||||
* @var integer
|
||||
*/
|
||||
private static $priority = 50;
|
||||
|
||||
/**
|
||||
* Cache of extractor class names, sorted by priority
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
protected static $sorted_extractor_classes = null;
|
||||
|
||||
/**
|
||||
* @param String $path
|
||||
* Gets the list of prioritised extractor classes
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
protected static function get_extractor_classes() {
|
||||
// Check cache
|
||||
if (self::$sorted_extractor_classes) return self::$sorted_extractor_classes;
|
||||
|
||||
// Generate the sorted list of extractors on demand.
|
||||
$classes = ClassInfo::subclassesFor("FileTextExtractor");
|
||||
array_shift($classes);
|
||||
foreach($classes as $class) $classes[$class] = Config::inst()->get($class, 'priority');
|
||||
arsort($classes);
|
||||
|
||||
// Save classes
|
||||
$sortedClasses = array_keys($classes);
|
||||
return self::$sorted_extractor_classes = $sortedClasses;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the text file extractor for the given class
|
||||
*
|
||||
* @param string $class
|
||||
* @return FileTextExtractor
|
||||
*/
|
||||
protected static function get_extractor($class) {
|
||||
return Injector::inst()->get($class);
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempt to detect mime type for given file
|
||||
*
|
||||
* @param string $path
|
||||
* @return string Mime type if found
|
||||
*/
|
||||
protected static function get_mime($path) {
|
||||
if(!class_exists('finfo')) return null;
|
||||
|
||||
// Check mime of file
|
||||
$finfo = new finfo(FILEINFO_MIME_TYPE);
|
||||
return $finfo->file($path);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string $path
|
||||
* @return FileTextExtractor
|
||||
*/
|
||||
static function for_file($path) {
|
||||
$extension = pathinfo($path, PATHINFO_EXTENSION);
|
||||
$mime = self::get_mime($path);
|
||||
foreach(self::get_extractor_classes() as $className) {
|
||||
$extractor = self::get_extractor($className);
|
||||
|
||||
if (!self::$sorted_extractor_classes) {
|
||||
// Generate the sorted list of extractors on demand.
|
||||
$classes = ClassInfo::subclassesFor("FileTextExtractor");
|
||||
array_shift($classes);
|
||||
$sortedClasses = array();
|
||||
foreach($classes as $class) $sortedClasses[$class] = Config::inst()->get($class, 'priority');
|
||||
arsort($sortedClasses);
|
||||
// Skip unavailable extractors
|
||||
if(!$extractor->isAvailable()) continue;
|
||||
|
||||
self::$sorted_extractor_classes = $sortedClasses;
|
||||
}
|
||||
foreach(self::$sorted_extractor_classes as $className => $priority) {
|
||||
$formatter = new $className();
|
||||
$matched = array_filter($formatter->supportedExtensions(), function($compare) use($extension) {
|
||||
return (strtolower($compare) == strtolower($extension));
|
||||
});
|
||||
if($matched) return $formatter;
|
||||
// Check extension
|
||||
if($extension && $extractor->supportsExtension($extension)) {
|
||||
return $extractor;
|
||||
}
|
||||
|
||||
// Check mime
|
||||
if($mime && $extractor->supportsMime($mime)) {
|
||||
return $extractor;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -49,21 +98,33 @@ abstract class FileTextExtractor extends Object {
|
||||
*
|
||||
* @return boolean
|
||||
*/
|
||||
abstract function isAvailable();
|
||||
abstract public function isAvailable();
|
||||
|
||||
/**
|
||||
* Return an array of content types that the extractor can handle.
|
||||
* @return unknown_type
|
||||
* Determine if this extractor supports the given extension.
|
||||
* If support is determined by mime type only, then this should return false.
|
||||
*
|
||||
* @param string $extension
|
||||
* @return boolean
|
||||
*/
|
||||
abstract function supportedExtensions();
|
||||
abstract public function supportsExtension($extension);
|
||||
|
||||
/**
|
||||
* Determine if this extractor supports the given mime type.
|
||||
* Will only be called if supportsExtension returns false.
|
||||
*
|
||||
* @param string $mime
|
||||
* @return boolean
|
||||
*/
|
||||
abstract public function supportsMime($mime);
|
||||
|
||||
/**
|
||||
* Given a file path, extract the contents as text.
|
||||
*
|
||||
* @param $path
|
||||
* @return unknown_type
|
||||
* @param string $path
|
||||
* @return string
|
||||
*/
|
||||
abstract function getContent($path);
|
||||
abstract public function getContent($path);
|
||||
}
|
||||
|
||||
class FileTextExtractor_Exception extends Exception {}
|
@ -7,16 +7,26 @@
|
||||
*/
|
||||
class HTMLTextExtractor extends FileTextExtractor {
|
||||
|
||||
function isAvailable() {
|
||||
return true;
|
||||
public function isAvailable() {
|
||||
return true;
|
||||
}
|
||||
|
||||
function supportedExtensions() {
|
||||
return array("html", "htm", "xhtml");
|
||||
public function supportsExtension($extension) {
|
||||
return in_array(
|
||||
strtolower($extension),
|
||||
array("html", "htm", "xhtml")
|
||||
);
|
||||
}
|
||||
|
||||
public function supportsMime($mime) {
|
||||
return strtolower($mime) === 'text/html';
|
||||
}
|
||||
|
||||
/**
|
||||
* Lower priority because it's not the most clever HTML extraction. If there is something better, use it.
|
||||
*
|
||||
* @config
|
||||
* @var integer
|
||||
*/
|
||||
private static $priority = 10;
|
||||
|
||||
@ -25,10 +35,10 @@ class HTMLTextExtractor extends FileTextExtractor {
|
||||
* combined with regular expressions to remove non-content tags like <style> or <script>,
|
||||
* as well as adding line breaks after block tags.
|
||||
*
|
||||
* @param [type] $path [description]
|
||||
* @return [type] [description]
|
||||
* @param string $path
|
||||
* @return string
|
||||
*/
|
||||
function getContent($path) {
|
||||
public function getContent($path) {
|
||||
$content = file_get_contents($path);
|
||||
// Yes, yes, regex'ing HTML is evil.
|
||||
// Since we don't care about well-formedness or markup here, it does the job.
|
||||
@ -61,5 +71,3 @@ class HTMLTextExtractor extends FileTextExtractor {
|
||||
return strip_tags($content);
|
||||
}
|
||||
}
|
||||
|
||||
?>
|
@ -7,32 +7,64 @@
|
||||
*/
|
||||
class PDFTextExtractor extends FileTextExtractor {
|
||||
|
||||
function isAvailable() {
|
||||
public function isAvailable() {
|
||||
$bin = $this->bin('pdftotext');
|
||||
return (file_exists($bin) && is_executable($bin));
|
||||
}
|
||||
|
||||
function supportedExtensions() {
|
||||
return array("pdf");
|
||||
public function supportsExtension($extension) {
|
||||
return strtolower($extension) === 'pdf';
|
||||
}
|
||||
|
||||
public function supportsMime($mime) {
|
||||
return in_array(
|
||||
strtolower($mime),
|
||||
array(
|
||||
'application/pdf',
|
||||
'application/x-pdf',
|
||||
'application/x-bzpdf',
|
||||
'application/x-gzpdf'
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Accessor to get the location of the binary
|
||||
* @param $prog
|
||||
* @return unknown_type
|
||||
*
|
||||
* @param string $prog Name of binary
|
||||
* @return string
|
||||
*/
|
||||
function bin($prog='') {
|
||||
if ($this->stat('binary_location')) $path = $this->stat('binary_location'); // By static from _config.php
|
||||
elseif (file_exists('/usr/bin/pdftotext')) $path = '/usr/bin'; // By searching common directories
|
||||
elseif (file_exists('/usr/local/bin/pdftotext')) $path = '/usr/local/bin';
|
||||
else $path = '.'; // Hope it's in path
|
||||
protected function bin($prog = '') {
|
||||
if ($this->config()->binary_location) {
|
||||
// By config
|
||||
$path = $this->config()->binary_location;
|
||||
} elseif (file_exists('/usr/bin/pdftotext')) {
|
||||
// By searching common directories
|
||||
$path = '/usr/bin';
|
||||
} elseif (file_exists('/usr/local/bin/pdftotext')) {
|
||||
$path = '/usr/local/bin';
|
||||
} else {
|
||||
$path = '.'; // Hope it's in path
|
||||
}
|
||||
|
||||
return ( $path ? $path . '/' : '' ) . $prog;
|
||||
}
|
||||
|
||||
function getContent($path) {
|
||||
if (!$path) return ""; // no file
|
||||
exec(sprintf('%s "%s" - 2>&1', $this->bin('pdftotext'), $path), $content, $err);
|
||||
public function getContent($path) {
|
||||
if(!$path) return ""; // no file
|
||||
$content = $this->getRawOutput($path);
|
||||
return $this->cleanupLigatures($content);
|
||||
}
|
||||
|
||||
/**
|
||||
* Invoke pdftotext with the given path
|
||||
*
|
||||
* @param string $path
|
||||
* @return string Output
|
||||
* @throws FileTextExtractor_Exception
|
||||
*/
|
||||
protected function getRawOutput($path) {
|
||||
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
|
||||
if($err) {
|
||||
throw new FileTextExtractor_Exception(sprintf(
|
||||
'PDFTextExtractor->getContent() failed for %s: %s',
|
||||
@ -42,6 +74,25 @@ class PDFTextExtractor extends FileTextExtractor {
|
||||
}
|
||||
return implode('', $content);
|
||||
}
|
||||
}
|
||||
|
||||
?>
|
||||
/**
|
||||
* Removes utf-8 ligatures.
|
||||
*
|
||||
* @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
|
||||
*
|
||||
* @param string $input
|
||||
* @return string
|
||||
*/
|
||||
protected function cleanupLigatures($input) {
|
||||
$mapping = array(
|
||||
'ff' => 'ff',
|
||||
'fi' => 'fi',
|
||||
'fl' => 'fl',
|
||||
'ffi' => 'ffi',
|
||||
'ffl' => 'ffl',
|
||||
'ſt' => 'ft',
|
||||
'st' => 'st'
|
||||
);
|
||||
return str_replace(array_keys($mapping), array_values($mapping), $input);
|
||||
}
|
||||
}
|
||||
|
@ -13,8 +13,11 @@ use Guzzle\Http\Client;
|
||||
class SolrCellTextExtractor extends FileTextExtractor {
|
||||
|
||||
/**
|
||||
* Base URL to use for solr text extraction.
|
||||
* E.g. http://localhost:8983/solr/update/extract
|
||||
*
|
||||
* @config
|
||||
* @var [type]
|
||||
* @var string
|
||||
*/
|
||||
private static $base_url;
|
||||
|
||||
@ -38,18 +41,22 @@ class SolrCellTextExtractor extends FileTextExtractor {
|
||||
$url = $this->config()->get('base_url');
|
||||
if(!$url) return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see http://tika.apache.org/1.3/formats.html
|
||||
* @return Array
|
||||
*/
|
||||
public function supportedExtensions() {
|
||||
return array(
|
||||
'pdf', 'doc', 'docx', 'xls', 'xlsx',
|
||||
'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
|
||||
'ppt', 'pptx', 'odp', 'fodp', 'csv'
|
||||
|
||||
public function supportsExtension($extension) {
|
||||
return in_array(
|
||||
strtolower($extension),
|
||||
array(
|
||||
'pdf', 'doc', 'docx', 'xls', 'xlsx',
|
||||
'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
|
||||
'ppt', 'pptx', 'odp', 'fodp', 'csv'
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
public function supportsMime($mime) {
|
||||
// Rely on supportsExtension
|
||||
return false;
|
||||
}
|
||||
|
||||
public function getContent($path) {
|
||||
if (!$path) return ""; // no file
|
||||
@ -82,4 +89,4 @@ class SolrCellTextExtractor extends FileTextExtractor {
|
||||
|
||||
return $matches ? $matches[1] : null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
103
code/extractors/TikaTextExtractor.php
Normal file
103
code/extractors/TikaTextExtractor.php
Normal file
@ -0,0 +1,103 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Enables text extraction of file content via the Tika CLI
|
||||
*
|
||||
* {@link http://tika.apache.org/1.7/gettingstarted.html}
|
||||
*/
|
||||
class TikaTextExtractor extends FileTextExtractor {
|
||||
|
||||
/**
|
||||
* Text extraction locale. Use {locale} as a placeholder for the current locale, {default}
|
||||
* as the placeholder for the default locale
|
||||
*
|
||||
* @var string
|
||||
* @config
|
||||
*/
|
||||
private static $locale = '{default}.utf-8';
|
||||
|
||||
/**
|
||||
* Text extraction mode. Defaults to -t (plain text)
|
||||
*
|
||||
* @var string
|
||||
* @config
|
||||
*/
|
||||
private static $output_mode = '-t';
|
||||
|
||||
/**
|
||||
* Get the version of tika installed, or 0 if not installed
|
||||
*
|
||||
* @return string|int Version string reported by tika (e.g. "1.7"), or 0 if not installed
|
||||
*/
|
||||
public function getVersion() {
|
||||
$code = $this->runShell('tika --version', $stdout);
|
||||
|
||||
// Parse output
|
||||
if(!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches)) {
|
||||
return $matches['version'];
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Runs an arbitrary shell command. No escaping is applied here; the caller must escape any arguments.
|
||||
*
|
||||
* @param string $command Full command including arguments
|
||||
* @param string &$stdout Standard output
|
||||
* @param string &$stderr Standard error
|
||||
* @param string $input Content to pass via standard input
|
||||
* @return int Exit code. 0 is success
|
||||
*/
|
||||
protected function runShell($command, &$stdout = '', &$stderr = '', $input = '') {
|
||||
$descriptorSpecs = array(
|
||||
0 => array("pipe", "r"),
|
||||
1 => array("pipe", "w"),
|
||||
2 => array("pipe", "w")
|
||||
);
|
||||
// Invoke command
|
||||
$pipes = array();
|
||||
$proc = proc_open($command, $descriptorSpecs, $pipes);
|
||||
if (!is_resource($proc)) return 255;
|
||||
|
||||
// Send content as input
|
||||
fwrite($pipes[0], $input);
|
||||
fclose($pipes[0]);
|
||||
|
||||
// Get output
|
||||
$stdout = stream_get_contents($pipes[1]);
|
||||
fclose($pipes[1]);
|
||||
$stderr = stream_get_contents($pipes[2]);
|
||||
fclose($pipes[2]);
|
||||
|
||||
// Get result
|
||||
return proc_close($proc);
|
||||
}
|
||||
|
||||
public function getContent($path) {
|
||||
$mode = $this->config()->output_mode;
|
||||
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
|
||||
$code = $this->runShell($command, $output);
|
||||
if($code == 0) return $output;
|
||||
}
|
||||
|
||||
public function isAvailable() {
|
||||
return $this->getVersion() > 0;
|
||||
}
|
||||
|
||||
public function supportsExtension($extension) {
|
||||
// Determine support via mime type only
|
||||
return false;
|
||||
}
|
||||
|
||||
public function supportsMime($mime) {
|
||||
// Get list of supported mime types
|
||||
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
|
||||
if($code) return false; // Error case
|
||||
|
||||
// Check if the mime type is inside the result
|
||||
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
|
||||
return (bool)preg_match($pattern, $supportedTypes);
|
||||
}
|
||||
|
||||
}
|
@ -18,7 +18,18 @@
|
||||
"require": {
|
||||
"php": ">=5.3.2",
|
||||
"composer/installers": "*",
|
||||
"silverstripe/framework": "~3.0",
|
||||
"silverstripe/framework": "~3.1",
|
||||
"guzzle/http": "*"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpunit/PHPUnit": "~3.7@stable"
|
||||
},
|
||||
"suggest": {
|
||||
"ext-fileinfo": "Improved support for file mime detection"
|
||||
},
|
||||
"extra": {
|
||||
"branch-alias": {
|
||||
"dev-master": "2.0.x-dev"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -5,6 +5,20 @@ class FileTextExtractableTest extends SapphireTest {
|
||||
'File' => array('FileTextExtractable')
|
||||
);
|
||||
|
||||
public function setUp() {
|
||||
parent::setUp();
|
||||
|
||||
// Ensure that html is a valid extension
|
||||
Config::inst()
|
||||
->nest()
|
||||
->update('File', 'allowed_extensions', array('html'));
|
||||
}
|
||||
|
||||
public function tearDown() {
|
||||
Config::unnest();
|
||||
parent::tearDown();
|
||||
}
|
||||
|
||||
function testExtractFileAsText() {
|
||||
// Create a copy of the file, as it may be clobbered by the test
|
||||
// ($file->extractFileAsText() calls $file->write)
|
||||
|
@ -9,4 +9,4 @@ class PDFTextExtractorTest extends SapphireTest {
|
||||
$this->assertContains('This is a test file with a link', $content);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
23
tests/TikaTextExtractorTest.php
Normal file
23
tests/TikaTextExtractorTest.php
Normal file
@ -0,0 +1,23 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Tests the {@see TikaTextExtractor} class
|
||||
*/
|
||||
class TikaTextExtractorTest extends SapphireTest {
|
||||
|
||||
function testExtraction() {
|
||||
$extractor = new TikaTextExtractor();
|
||||
if(!$extractor->isAvailable()) $this->markTestSkipped('tika not available');
|
||||
|
||||
// Check file
|
||||
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
|
||||
$content = $extractor->getContent($file);
|
||||
$this->assertContains('This is a test file with a link', $content);
|
||||
|
||||
// Check mime validation
|
||||
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
||||
$this->assertTrue($extractor->supportsMime('text/html'));
|
||||
$this->assertFalse($extractor->supportsMime('application/not-supported'));
|
||||
}
|
||||
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user