mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 09:06:00 +00:00
Merge pull request #4 from tractorcow/pulls/tika-support
API Implement Tika support
This commit is contained in:
commit
23d83b7d01
11
.travis.yml
11
.travis.yml
@ -2,17 +2,20 @@
|
|||||||
|
|
||||||
language: php
|
language: php
|
||||||
php:
|
php:
|
||||||
- 5.3
|
- 5.4
|
||||||
|
|
||||||
env:
|
env:
|
||||||
- DB=MYSQL CORE_RELEASE=3.0
|
|
||||||
- DB=MYSQL CORE_RELEASE=3.1
|
- DB=MYSQL CORE_RELEASE=3.1
|
||||||
- DB=PGSQL CORE_RELEASE=master
|
- DB=MYSQL CORE_RELEASE=3
|
||||||
|
|
||||||
before_script:
|
before_script:
|
||||||
|
- mkdir $HOME/bin
|
||||||
|
- export PATH=$PATH:$HOME/bin
|
||||||
|
- ./.travis/install_tika.sh
|
||||||
|
- sudo ./.travis/install_pdftotext.sh
|
||||||
- git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support
|
- git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support
|
||||||
- php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss
|
- php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss
|
||||||
- cd ~/builds/ss
|
- cd ~/builds/ss
|
||||||
|
|
||||||
script:
|
script:
|
||||||
- phpunit textextraction/tests/
|
- vendor/bin/phpunit --verbose textextraction/tests/
|
||||||
|
3
.travis/install_pdftotext.sh
Executable file
3
.travis/install_pdftotext.sh
Executable file
@ -0,0 +1,3 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
apt-get update
|
||||||
|
apt-get install -y xpdf
|
6
.travis/install_tika.sh
Executable file
6
.travis/install_tika.sh
Executable file
@ -0,0 +1,6 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
mkdir $HOME/bin
|
||||||
|
wget 'http://www.us.apache.org/dist/tika/tika-app-1.7.jar' -O "$HOME/bin/tika.jar"
|
||||||
|
echo -e '#!/usr/bin/env bash\nexec java -jar $HOME/bin/tika.jar "$@"' >> $HOME/bin/tika
|
||||||
|
chmod ug+x $HOME/bin/tika
|
||||||
|
$HOME/bin/tika --version
|
24
README.md
24
README.md
@ -18,6 +18,7 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
|
|||||||
* SilverStripe 3.1
|
* SilverStripe 3.1
|
||||||
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
|
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
|
||||||
* (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
|
* (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
|
||||||
|
* (optional) [Apache Tika](http://tika.apache.org/)
|
||||||
|
|
||||||
### Supported Formats
|
### Supported Formats
|
||||||
|
|
||||||
@ -28,6 +29,7 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
|
|||||||
* CSV (Solr)
|
* CSV (Solr)
|
||||||
* RTF (Solr)
|
* RTF (Solr)
|
||||||
* EPub (Solr)
|
* EPub (Solr)
|
||||||
|
* Many others (Tika)
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
@ -37,7 +39,7 @@ Add the following to your `composer.json`:
|
|||||||
```js
|
```js
|
||||||
{
|
{
|
||||||
"require": {
|
"require": {
|
||||||
"silverstripe/textextraction": "*"
|
"silverstripe/textextraction": "2.0.x-dev"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
@ -53,9 +55,13 @@ through PEAR and ensure its in your `include_path`.
|
|||||||
By default, only extraction from HTML documents is supported.
|
By default, only extraction from HTML documents is supported.
|
||||||
No configuration is required for that, unless you want to make
|
No configuration is required for that, unless you want to make
|
||||||
the content available through your `DataObject` subclass.
|
the content available through your `DataObject` subclass.
|
||||||
In this case, add the following to `mysite/_config.php`:
|
In this case, add the following to `mysite/_config/config.yml`:
|
||||||
|
|
||||||
DataObject::add_extension('File', 'FileTextExtractable');
|
```yaml
|
||||||
|
File:
|
||||||
|
extensions:
|
||||||
|
- FileTextExtractable
|
||||||
|
```
|
||||||
|
|
||||||
### XPDF
|
### XPDF
|
||||||
|
|
||||||
@ -108,7 +114,7 @@ The property should be listed in your `SolrIndex` subclass, e.g. as follows:
|
|||||||
class MySolrIndex extends SolrIndex {
|
class MySolrIndex extends SolrIndex {
|
||||||
function init() {
|
function init() {
|
||||||
$this->addClass('MyDocument');
|
$this->addClass('MyDocument');
|
||||||
$this->addFulltextField('Content', 'HTMLText');
|
$this->addStoredField('Content', 'HTMLText');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
@ -116,6 +122,16 @@ The property should be listed in your `SolrIndex` subclass, e.g. as follows:
|
|||||||
Note: This isn't a terribly efficient way to process large amounts of files, since
|
Note: This isn't a terribly efficient way to process large amounts of files, since
|
||||||
each HTTP request is run synchronously.
|
each HTTP request is run synchronously.
|
||||||
|
|
||||||
|
### Tika
|
||||||
|
|
||||||
|
Support for Apache Tika (1.7 and above) is included for the standalone command line utility.
|
||||||
|
|
||||||
|
See [the Apache Tika home page](http://tika.apache.org/1.7/index.html) for instructions on installing and
|
||||||
|
configuring this.
|
||||||
|
|
||||||
|
This extension works best with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php)
|
||||||
|
installed to perform mime detection. Tika validates support via mime type rather than file extensions.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
Manual extraction:
|
Manual extraction:
|
||||||
|
@ -15,16 +15,29 @@ class FileTextExtractable extends DataExtension {
|
|||||||
'FileContentCache' => 'Text'
|
'FileContentCache' => 'Text'
|
||||||
);
|
);
|
||||||
|
|
||||||
|
private static $casting = array(
|
||||||
|
'FileContent' => 'Text'
|
||||||
|
);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper function for template
|
||||||
|
*
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
public function getFileContent() {
|
||||||
|
return $this->extractFileAsText();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text.
|
* Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text.
|
||||||
* The value is also cached into the File record itself.
|
* The value is also cached into the File record itself.
|
||||||
*
|
*
|
||||||
* @param $forceParse If false, the file content is only parsed on demand. If true, the content parsing is forced, bypassing the
|
* @param boolean $disableCache If false, the file content is only parsed on demand.
|
||||||
* cached version
|
* If true, the content parsing is forced, bypassing the cached version
|
||||||
* @return String
|
* @return string
|
||||||
*/
|
*/
|
||||||
function extractFileAsText($forceParse = false) {
|
public function extractFileAsText($disableCache = false) {
|
||||||
if (!$forceParse && $this->owner->FileContentCache) return $this->owner->FileContentCache;
|
if (!$disableCache && $this->owner->FileContentCache) return $this->owner->FileContentCache;
|
||||||
|
|
||||||
// Determine which extractor can process this file.
|
// Determine which extractor can process this file.
|
||||||
$extractor = FileTextExtractor::for_file($this->owner->FullPath);
|
$extractor = FileTextExtractor::for_file($this->owner->FullPath);
|
||||||
@ -39,5 +52,3 @@ class FileTextExtractable extends DataExtension {
|
|||||||
return $text;
|
return $text;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@ -6,40 +6,89 @@
|
|||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
abstract class FileTextExtractor extends Object {
|
abstract class FileTextExtractor extends Object {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set priority from 0-100.
|
* Set priority from 0-100.
|
||||||
* The highest priority extractor for a given content type will be selected.
|
* The highest priority extractor for a given content type will be selected.
|
||||||
*
|
*
|
||||||
* @config
|
* @config
|
||||||
* @var int
|
* @var integer
|
||||||
*/
|
*/
|
||||||
private static $priority = 50;
|
private static $priority = 50;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cache of extractor class names, sorted by priority
|
||||||
|
*
|
||||||
|
* @var array
|
||||||
|
*/
|
||||||
protected static $sorted_extractor_classes = null;
|
protected static $sorted_extractor_classes = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param String $path
|
* Gets the list of prioritised extractor classes
|
||||||
|
*
|
||||||
|
* @return array
|
||||||
|
*/
|
||||||
|
protected static function get_extractor_classes() {
|
||||||
|
// Check cache
|
||||||
|
if (self::$sorted_extractor_classes) return self::$sorted_extractor_classes;
|
||||||
|
|
||||||
|
// Generate the sorted list of extractors on demand.
|
||||||
|
$classes = ClassInfo::subclassesFor("FileTextExtractor");
|
||||||
|
array_shift($classes);
|
||||||
|
foreach($classes as $class) $classes[$class] = Config::inst()->get($class, 'priority');
|
||||||
|
arsort($classes);
|
||||||
|
|
||||||
|
// Save classes
|
||||||
|
$sortedClasses = array_keys($classes);
|
||||||
|
return self::$sorted_extractor_classes = $sortedClasses;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the text file extractor for the given class
|
||||||
|
*
|
||||||
|
* @param string $class
|
||||||
|
* @return FileTextExtractor
|
||||||
|
*/
|
||||||
|
protected static function get_extractor($class) {
|
||||||
|
return Injector::inst()->get($class);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Attempt to detect mime type for given file
|
||||||
|
*
|
||||||
|
* @param string $path
|
||||||
|
* @return string Mime type if found
|
||||||
|
*/
|
||||||
|
protected static function get_mime($path) {
|
||||||
|
if(!class_exists('finfo')) return null;
|
||||||
|
|
||||||
|
// Check mime of file
|
||||||
|
$finfo = new finfo(FILEINFO_MIME_TYPE);
|
||||||
|
return $finfo->file($path);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param string $path
|
||||||
* @return FileTextExtractor
|
* @return FileTextExtractor
|
||||||
*/
|
*/
|
||||||
static function for_file($path) {
|
static function for_file($path) {
|
||||||
$extension = pathinfo($path, PATHINFO_EXTENSION);
|
$extension = pathinfo($path, PATHINFO_EXTENSION);
|
||||||
|
$mime = self::get_mime($path);
|
||||||
|
foreach(self::get_extractor_classes() as $className) {
|
||||||
|
$extractor = self::get_extractor($className);
|
||||||
|
|
||||||
if (!self::$sorted_extractor_classes) {
|
// Skip unavailable extractors
|
||||||
// Generate the sorted list of extractors on demand.
|
if(!$extractor->isAvailable()) continue;
|
||||||
$classes = ClassInfo::subclassesFor("FileTextExtractor");
|
|
||||||
array_shift($classes);
|
|
||||||
$sortedClasses = array();
|
|
||||||
foreach($classes as $class) $sortedClasses[$class] = Config::inst()->get($class, 'priority');
|
|
||||||
arsort($sortedClasses);
|
|
||||||
|
|
||||||
self::$sorted_extractor_classes = $sortedClasses;
|
// Check extension
|
||||||
|
if($extension && $extractor->supportsExtension($extension)) {
|
||||||
|
return $extractor;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check mime
|
||||||
|
if($mime && $extractor->supportsMime($mime)) {
|
||||||
|
return $extractor;
|
||||||
}
|
}
|
||||||
foreach(self::$sorted_extractor_classes as $className => $priority) {
|
|
||||||
$formatter = new $className();
|
|
||||||
$matched = array_filter($formatter->supportedExtensions(), function($compare) use($extension) {
|
|
||||||
return (strtolower($compare) == strtolower($extension));
|
|
||||||
});
|
|
||||||
if($matched) return $formatter;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -49,21 +98,33 @@ abstract class FileTextExtractor extends Object {
|
|||||||
*
|
*
|
||||||
* @return boolean
|
* @return boolean
|
||||||
*/
|
*/
|
||||||
abstract function isAvailable();
|
abstract public function isAvailable();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return an array of content types that the extractor can handle.
|
* Determine if this extractor supports the given extension.
|
||||||
* @return unknown_type
|
* If support is determined by mime/type only, then this should return false.
|
||||||
|
*
|
||||||
|
* @param string $extension
|
||||||
|
* @return boolean
|
||||||
*/
|
*/
|
||||||
abstract function supportedExtensions();
|
abstract public function supportsExtension($extension);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determine if this extractor supports the given mime type.
|
||||||
|
* Will only be called if supportsExtension returns false.
|
||||||
|
*
|
||||||
|
* @param string $mime
|
||||||
|
* @return boolean
|
||||||
|
*/
|
||||||
|
abstract public function supportsMime($mime);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Given a file path, extract the contents as text.
|
* Given a file path, extract the contents as text.
|
||||||
*
|
*
|
||||||
* @param $path
|
* @param string $path
|
||||||
* @return unknown_type
|
* @return string
|
||||||
*/
|
*/
|
||||||
abstract function getContent($path);
|
abstract public function getContent($path);
|
||||||
}
|
}
|
||||||
|
|
||||||
class FileTextExtractor_Exception extends Exception {}
|
class FileTextExtractor_Exception extends Exception {}
|
@ -7,16 +7,26 @@
|
|||||||
*/
|
*/
|
||||||
class HTMLTextExtractor extends FileTextExtractor {
|
class HTMLTextExtractor extends FileTextExtractor {
|
||||||
|
|
||||||
function isAvailable() {
|
public function isAvailable() {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
function supportedExtensions() {
|
public function supportsExtension($extension) {
|
||||||
return array("html", "htm", "xhtml");
|
return in_array(
|
||||||
|
strtolower($extension),
|
||||||
|
array("html", "htm", "xhtml")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public function supportsMime($mime) {
|
||||||
|
return strtolower($mime) === 'text/html';
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Lower priority because it's not the most clever HTML extraction. If there is something better, use it
|
* Lower priority because it's not the most clever HTML extraction. If there is something better, use it
|
||||||
|
*
|
||||||
|
* @config
|
||||||
|
* @var integer
|
||||||
*/
|
*/
|
||||||
private static $priority = 10;
|
private static $priority = 10;
|
||||||
|
|
||||||
@ -25,10 +35,10 @@ class HTMLTextExtractor extends FileTextExtractor {
|
|||||||
* combined with regular expressions to remove non-content tags like <style> or <script>,
|
* combined with regular expressions to remove non-content tags like <style> or <script>,
|
||||||
* as well as adding line breaks after block tags.
|
* as well as adding line breaks after block tags.
|
||||||
*
|
*
|
||||||
* @param [type] $path [description]
|
* @param string $path
|
||||||
* @return [type] [description]
|
* @return string
|
||||||
*/
|
*/
|
||||||
function getContent($path) {
|
public function getContent($path) {
|
||||||
$content = file_get_contents($path);
|
$content = file_get_contents($path);
|
||||||
// Yes, yes, regex'ing HTML is evil.
|
// Yes, yes, regex'ing HTML is evil.
|
||||||
// Since we don't care about well-formedness or markup here, it does the job.
|
// Since we don't care about well-formedness or markup here, it does the job.
|
||||||
@ -61,5 +71,3 @@ class HTMLTextExtractor extends FileTextExtractor {
|
|||||||
return strip_tags($content);
|
return strip_tags($content);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@ -7,32 +7,64 @@
|
|||||||
*/
|
*/
|
||||||
class PDFTextExtractor extends FileTextExtractor {
|
class PDFTextExtractor extends FileTextExtractor {
|
||||||
|
|
||||||
function isAvailable() {
|
public function isAvailable() {
|
||||||
$bin = $this->bin('pdftotext');
|
$bin = $this->bin('pdftotext');
|
||||||
return (file_exists($bin) && is_executable($bin));
|
return (file_exists($bin) && is_executable($bin));
|
||||||
}
|
}
|
||||||
|
|
||||||
function supportedExtensions() {
|
public function supportsExtension($extension) {
|
||||||
return array("pdf");
|
return strtolower($extension) === 'pdf';
|
||||||
|
}
|
||||||
|
|
||||||
|
public function supportsMime($mime) {
|
||||||
|
return in_array(
|
||||||
|
strtolower($mime),
|
||||||
|
array(
|
||||||
|
'application/pdf',
|
||||||
|
'application/x-pdf',
|
||||||
|
'application/x-bzpdf',
|
||||||
|
'application/x-gzpdf'
|
||||||
|
)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Accessor to get the location of the binary
|
* Accessor to get the location of the binary
|
||||||
* @param $prog
|
*
|
||||||
* @return unknown_type
|
* @param string $prog Name of binary
|
||||||
|
* @return string
|
||||||
*/
|
*/
|
||||||
function bin($prog='') {
|
protected function bin($prog = '') {
|
||||||
if ($this->stat('binary_location')) $path = $this->stat('binary_location'); // By static from _config.php
|
if ($this->config()->binary_location) {
|
||||||
elseif (file_exists('/usr/bin/pdftotext')) $path = '/usr/bin'; // By searching common directories
|
// By config
|
||||||
elseif (file_exists('/usr/local/bin/pdftotext')) $path = '/usr/local/bin';
|
$path = $this->config()->binary_location;
|
||||||
else $path = '.'; // Hope it's in path
|
} elseif (file_exists('/usr/bin/pdftotext')) {
|
||||||
|
// By searching common directories
|
||||||
|
$path = '/usr/bin';
|
||||||
|
} elseif (file_exists('/usr/local/bin/pdftotext')) {
|
||||||
|
$path = '/usr/local/bin';
|
||||||
|
} else {
|
||||||
|
$path = '.'; // Hope it's in path
|
||||||
|
}
|
||||||
|
|
||||||
return ( $path ? $path . '/' : '' ) . $prog;
|
return ( $path ? $path . '/' : '' ) . $prog;
|
||||||
}
|
}
|
||||||
|
|
||||||
function getContent($path) {
|
public function getContent($path) {
|
||||||
if(!$path) return ""; // no file
|
if(!$path) return ""; // no file
|
||||||
exec(sprintf('%s "%s" - 2>&1', $this->bin('pdftotext'), $path), $content, $err);
|
$content = $this->getRawOutput($path);
|
||||||
|
return $this->cleanupLigatures($content);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Invoke pdftotext with the given path
|
||||||
|
*
|
||||||
|
* @param string $path
|
||||||
|
* @return string Output
|
||||||
|
* @throws FileTextExtractor_Exception
|
||||||
|
*/
|
||||||
|
protected function getRawOutput($path) {
|
||||||
|
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
|
||||||
if($err) {
|
if($err) {
|
||||||
throw new FileTextExtractor_Exception(sprintf(
|
throw new FileTextExtractor_Exception(sprintf(
|
||||||
'PDFTextExtractor->getContent() failed for %s: %s',
|
'PDFTextExtractor->getContent() failed for %s: %s',
|
||||||
@ -42,6 +74,25 @@ class PDFTextExtractor extends FileTextExtractor {
|
|||||||
}
|
}
|
||||||
return implode('', $content);
|
return implode('', $content);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
?>
|
/**
|
||||||
|
* Removes utf-8 ligatures.
|
||||||
|
*
|
||||||
|
* @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
|
||||||
|
*
|
||||||
|
* @param string $input
|
||||||
|
* @return string
|
||||||
|
*/
|
||||||
|
protected function cleanupLigatures($input) {
|
||||||
|
$mapping = array(
|
||||||
|
'ff' => 'ff',
|
||||||
|
'fi' => 'fi',
|
||||||
|
'fl' => 'fl',
|
||||||
|
'ffi' => 'ffi',
|
||||||
|
'ffl' => 'ffl',
|
||||||
|
'ſt' => 'ft',
|
||||||
|
'st' => 'st'
|
||||||
|
);
|
||||||
|
return str_replace(array_keys($mapping), array_values($mapping), $input);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -13,8 +13,11 @@ use Guzzle\Http\Client;
|
|||||||
class SolrCellTextExtractor extends FileTextExtractor {
|
class SolrCellTextExtractor extends FileTextExtractor {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Base URL to use for solr text extraction.
|
||||||
|
* E.g. http://localhost:8983/solr/update/extract
|
||||||
|
*
|
||||||
* @config
|
* @config
|
||||||
* @var [type]
|
* @var string
|
||||||
*/
|
*/
|
||||||
private static $base_url;
|
private static $base_url;
|
||||||
|
|
||||||
@ -39,18 +42,22 @@ class SolrCellTextExtractor extends FileTextExtractor {
|
|||||||
if(!$url) return false;
|
if(!$url) return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
public function supportsExtension($extension) {
|
||||||
* @see http://tika.apache.org/1.3/formats.html
|
return in_array(
|
||||||
* @return Array
|
strtolower($extension),
|
||||||
*/
|
array(
|
||||||
public function supportedExtensions() {
|
|
||||||
return array(
|
|
||||||
'pdf', 'doc', 'docx', 'xls', 'xlsx',
|
'pdf', 'doc', 'docx', 'xls', 'xlsx',
|
||||||
'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
|
'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
|
||||||
'ppt', 'pptx', 'odp', 'fodp', 'csv'
|
'ppt', 'pptx', 'odp', 'fodp', 'csv'
|
||||||
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function supportsMime($mime) {
|
||||||
|
// Rely on supportsExtension
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
public function getContent($path) {
|
public function getContent($path) {
|
||||||
if (!$path) return ""; // no file
|
if (!$path) return ""; // no file
|
||||||
|
|
||||||
|
103
code/extractors/TikaTextExtractor.php
Normal file
103
code/extractors/TikaTextExtractor.php
Normal file
@ -0,0 +1,103 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enables text extraction of file content via the Tika CLI
|
||||||
|
*
|
||||||
|
* {@link http://tika.apache.org/1.7/gettingstarted.html}
|
||||||
|
*/
|
||||||
|
class TikaTextExtractor extends FileTextExtractor {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Text extraction locale. Use {locale} as a placeholder for the current locale, {default}
|
||||||
|
* as the placeholder for the default locale
|
||||||
|
*
|
||||||
|
* @var string
|
||||||
|
* @config
|
||||||
|
*/
|
||||||
|
private static $locale = '{default}.utf-8';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Text extraction mode. Defaults to -t (plain text)
|
||||||
|
*
|
||||||
|
* @var string
|
||||||
|
* @config
|
||||||
|
*/
|
||||||
|
private static $output_mode = '-t';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the version of tika installed, or 0 if not installed
|
||||||
|
*
|
||||||
|
* @return string|int Version string reported by tika (e.g. "1.7"), or 0 if tika could not be run
|
||||||
|
*/
|
||||||
|
public function getVersion() {
|
||||||
|
$code = $this->runShell('tika --version', $stdout);
|
||||||
|
|
||||||
|
// Parse output
|
||||||
|
if(!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches)) {
|
||||||
|
return $matches['version'];
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Runs an arbitrary shell command. The command is passed to the shell as-is, so callers must escape any arguments (e.g. with escapeshellarg) beforehand
|
||||||
|
*
|
||||||
|
* @param string $command Full command including arguments
|
||||||
|
* @param string &$stdout Standard output
|
||||||
|
* @param string &$stderr Standard error
|
||||||
|
* @param string $input Content to pass via standard input
|
||||||
|
* @return int Exit code. 0 is success
|
||||||
|
*/
|
||||||
|
protected function runShell($command, &$stdout = '', &$stderr = '', $input = '') {
|
||||||
|
$descriptorSpecs = array(
|
||||||
|
0 => array("pipe", "r"),
|
||||||
|
1 => array("pipe", "w"),
|
||||||
|
2 => array("pipe", "w")
|
||||||
|
);
|
||||||
|
// Invoke command
|
||||||
|
$pipes = array();
|
||||||
|
$proc = proc_open($command, $descriptorSpecs, $pipes);
|
||||||
|
if (!is_resource($proc)) return 255;
|
||||||
|
|
||||||
|
// Send content as input
|
||||||
|
fwrite($pipes[0], $input);
|
||||||
|
fclose($pipes[0]);
|
||||||
|
|
||||||
|
// Get output
|
||||||
|
$stdout = stream_get_contents($pipes[1]);
|
||||||
|
fclose($pipes[1]);
|
||||||
|
$stderr = stream_get_contents($pipes[2]);
|
||||||
|
fclose($pipes[2]);
|
||||||
|
|
||||||
|
// Get result
|
||||||
|
return proc_close($proc);
|
||||||
|
}
|
||||||
|
|
||||||
|
public function getContent($path) {
|
||||||
|
$mode = $this->config()->output_mode;
|
||||||
|
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
|
||||||
|
$code = $this->runShell($command, $output);
|
||||||
|
if($code == 0) return $output;
|
||||||
|
}
|
||||||
|
|
||||||
|
public function isAvailable() {
|
||||||
|
return $this->getVersion() > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public function supportsExtension($extension) {
|
||||||
|
// Determine support via mime type only
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public function supportsMime($mime) {
|
||||||
|
// Get list of supported mime types
|
||||||
|
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
|
||||||
|
if($code) return false; // Error case
|
||||||
|
|
||||||
|
// Check if the mime type is inside the result
|
||||||
|
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
|
||||||
|
return (bool)preg_match($pattern, $supportedTypes);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -18,7 +18,18 @@
|
|||||||
"require": {
|
"require": {
|
||||||
"php": ">=5.3.2",
|
"php": ">=5.3.2",
|
||||||
"composer/installers": "*",
|
"composer/installers": "*",
|
||||||
"silverstripe/framework": "~3.0",
|
"silverstripe/framework": "~3.1",
|
||||||
"guzzle/http": "*"
|
"guzzle/http": "*"
|
||||||
|
},
|
||||||
|
"require-dev": {
|
||||||
|
"phpunit/PHPUnit": "~3.7@stable"
|
||||||
|
},
|
||||||
|
"suggest": {
|
||||||
|
"ext-fileinfo": "Improved support for file mime detection"
|
||||||
|
},
|
||||||
|
"extra": {
|
||||||
|
"branch-alias": {
|
||||||
|
"dev-master": "2.0.x-dev"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -5,6 +5,20 @@ class FileTextExtractableTest extends SapphireTest {
|
|||||||
'File' => array('FileTextExtractable')
|
'File' => array('FileTextExtractable')
|
||||||
);
|
);
|
||||||
|
|
||||||
|
public function setUp() {
|
||||||
|
parent::setUp();
|
||||||
|
|
||||||
|
// Ensure that html is a valid extension
|
||||||
|
Config::inst()
|
||||||
|
->nest()
|
||||||
|
->update('File', 'allowed_extensions', array('html'));
|
||||||
|
}
|
||||||
|
|
||||||
|
public function tearDown() {
|
||||||
|
Config::unnest();
|
||||||
|
parent::tearDown();
|
||||||
|
}
|
||||||
|
|
||||||
function testExtractFileAsText() {
|
function testExtractFileAsText() {
|
||||||
// Create a copy of the file, as it may be clobbered by the test
|
// Create a copy of the file, as it may be clobbered by the test
|
||||||
// ($file->extractFileAsText() calls $file->write)
|
// ($file->extractFileAsText() calls $file->write)
|
||||||
|
23
tests/TikaTextExtractorTest.php
Normal file
23
tests/TikaTextExtractorTest.php
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests the {@see TikaTextExtractor} class
|
||||||
|
*/
|
||||||
|
class TikaTextExtractorTest extends SapphireTest {
|
||||||
|
|
||||||
|
function testExtraction() {
|
||||||
|
$extractor = new TikaTextExtractor();
|
||||||
|
if(!$extractor->isAvailable()) $this->markTestSkipped('tika not available');
|
||||||
|
|
||||||
|
// Check file
|
||||||
|
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
|
||||||
|
$content = $extractor->getContent($file);
|
||||||
|
$this->assertContains('This is a test file with a link', $content);
|
||||||
|
|
||||||
|
// Check mime validation
|
||||||
|
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
||||||
|
$this->assertTrue($extractor->supportsMime('text/html'));
|
||||||
|
$this->assertFalse($extractor->supportsMime('application/not-supported'));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user