Merge pull request #45 from creative-commoners/pulls/3.0/ss4-updates

API Update namespaces and SilverStripe API implementations for SilverStripe 4 compat
This commit is contained in:
Dylan Wagstaff 2018-07-04 11:34:17 +12:00 committed by GitHub
commit 9795866abe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
35 changed files with 636 additions and 444 deletions

1
.gitattributes vendored
View File

@ -4,3 +4,4 @@
/.gitignore export-ignore /.gitignore export-ignore
/.travis.yml export-ignore /.travis.yml export-ignore
/.scrutinizer.yml export-ignore /.scrutinizer.yml export-ignore
/codecov.yml export-ignore

View File

@ -1,9 +1,13 @@
inherit: true inherit: true
checks: checks:
php: php: true
code_rating: true
duplication: true build:
nodes:
analysis:
tests:
override: [php-scrutinizer-run]
filter: filter:
paths: [code/*, tests/*] paths: [src/*, tests/*]

View File

@ -1,39 +1,47 @@
# See https://github.com/silverstripe/silverstripe-travis-support for setup details
language: php language: php
sudo: false
addons: addons:
apt: apt:
packages: packages:
- poppler-utils - poppler-utils
env:
global:
- COMPOSER_ROOT_VERSION=3.x-dev
- SS_TIKA_ENDPOINT="http://localhost:9998/"
matrix: matrix:
include: include:
- php: 5.4
env: DB=PGSQL CORE_RELEASE=3.2
- php: 5.5
env: DB=PGSQL CORE_RELEASE=3.3
- php: 5.6 - php: 5.6
env: DB=PGSQL CORE_RELEASE=3.4 env: DB=MYSQL RECIPE_VERSION=1.0.x-dev PHPCS_TEST=1 PHPUNIT_TEST=1
- php: 5.6
env: DB=MYSQL CORE_RELEASE=3.5
- php: 7.0 - php: 7.0
env: DB=MYSQL CORE_RELEASE=3.6 env: DB=MYSQL RECIPE_VERSION=1.1.x-dev PHPUNIT_TEST=1
- php: 7.1 - php: 7.1
env: DB=MYSQL CORE_RELEASE=3 env: DB=PGSQL RECIPE_VERSION=4.2.x-dev PHPUNIT_COVERAGE_TEST=1
- php: 7.2
env: DB=MYSQL RECIPE_VERSION=4.x-dev PHPUNIT_TEST=1
before_script: before_script:
- composer self-update || true # Init PHP
- phpenv rehash
- phpenv config-rm xdebug.ini
# Configure Tika bin
- mkdir -p $HOME/bin - mkdir -p $HOME/bin
- export PATH=$PATH:$HOME/bin - export PATH=$PATH:$HOME/bin
- export SS_TIKA_ENDPOINT="http://localhost:9998/"
- ./.travis/install_tika.sh - ./.travis/install_tika.sh
- git clone git://github.com/silverstripe/silverstripe-travis-support.git ~/travis-support - ($HOME/bin/tika-rest-server &) &> /dev/null
- php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss
- cd ~/builds/ss # Install composer dependencies
- composer install - composer validate
- composer require --no-update silverstripe/recipe-core "$RECIPE_VERSION"
- if [[ $DB == PGSQL ]]; then composer require --no-update silverstripe/postgresql 2.1.x-dev; fi
- composer install --prefer-dist --no-interaction --no-progress --no-suggest --optimize-autoloader --verbose --profile
script: script:
- ($HOME/bin/tika-rest-server &) &> /dev/null - if [[ $PHPUNIT_TEST ]]; then vendor/bin/phpunit; fi
- vendor/bin/phpunit --verbose textextraction/tests/ - if [[ $PHPUNIT_COVERAGE_TEST ]]; then phpdbg -qrr vendor/bin/phpunit --coverage-clover=coverage.xml; fi
- if [[ $PHPCS_TEST ]]; then vendor/bin/phpcs src/ tests/; fi
after_success:
- if [[ $PHPUNIT_COVERAGE_TEST ]]; then bash <(curl -s https://codecov.io/bash) -f coverage.xml; fi

14
.upgrade.yml Normal file
View File

@ -0,0 +1,14 @@
mappings:
FileTextExtractable: SilverStripe\TextExtraction\Extension\FileTextExtractable
FileTextCache: SilverStripe\TextExtraction\Cache\FileTextCache
FileTextCache_Cache: SilverStripe\TextExtraction\Cache\FileTextCache\Cache
FileTextCache_Database: SilverStripe\TextExtraction\Cache\FileTextCache\Database
FileTextExtractor: SilverStripe\TextExtraction\Extractor\FileTextExtractor
FileTextExtractor_Exception: SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception
HTMLTextExtractor: SilverStripe\TextExtraction\Extractor\HTMLTextExtractor
PDFTextExtractor: SilverStripe\TextExtraction\Extractor\PDFTextExtractor
SolrCellTextExtractor: SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor
TikaServerTextExtractor: SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor
TikaTextExtractor: SilverStripe\TextExtraction\Extractor\TikaTextExtractor
TikaRestClient: SilverStripe\TextExtraction\Rest\TikaRestClient

View File

@ -1,12 +0,0 @@
# Changelog
All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).
## [2.0.1]
Using Symfony mime type detection
## [2.0.0]
Clarified Tika docs

View File

@ -1,11 +1,9 @@
# Text extraction module # Text extraction module
[![Build Status](https://secure.travis-ci.org/silverstripe/silverstripe-textextraction.png)](http://travis-ci.org/silverstripe/silverstripe-textextraction) [![Build Status](https://travis-ci.org/silverstripe/silverstripe-textextraction.svg?branch=master)](https://travis-ci.org/silverstripe/silverstripe-textextraction)
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction/?branch=master)
[![codecov](https://codecov.io/gh/silverstripe/silverstripe-textextraction/branch/master/graph/badge.svg)](https://codecov.io/gh/silverstripe/silverstripe-textextraction)
[![SilverStripe supported module](https://img.shields.io/badge/silverstripe-supported-0071C4.svg)](https://www.silverstripe.org/software/addons/silverstripe-commercially-supported-module-list/) [![SilverStripe supported module](https://img.shields.io/badge/silverstripe-supported-0071C4.svg)](https://www.silverstripe.org/software/addons/silverstripe-commercially-supported-module-list/)
[![Code Quality](http://img.shields.io/scrutinizer/g/silverstripe/silverstripe-textextraction.svg?style=flat)](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction)
[![Version](http://img.shields.io/packagist/v/silverstripe/textextraction.svg?style=flat)](https://packagist.org/packages/silverstripe/silverstripe-textextraction)
[![License](http://img.shields.io/packagist/l/silverstripe/textextraction.svg?style=flat)](license.md)
Provides a text extraction API for file content, that can hook into different extractor Provides a text extraction API for file content, that can hook into different extractor
engines based on availability and the parsed file format. The output returned is always a string of the file content. engines based on availability and the parsed file format. The output returned is always a string of the file content.
@ -26,14 +24,14 @@ The module supports text extraction on the following file formats:
## Requirements ## Requirements
* SilverStripe ^3.1 * SilverStripe ^4.0
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility) * (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
* (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) * (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
* (optional) [Apache Tika](http://tika.apache.org/) * (optional) [Apache Tika](http://tika.apache.org/)
## Installation ## Installation
```js ```
composer require silverstripe/textextraction composer require silverstripe/textextraction
``` ```

View File

View File

@ -3,7 +3,6 @@ Name: textextractioncache
After: After:
- '#corecache' - '#corecache'
--- ---
SilverStripe\Core\Injector\Injector: SilverStripe\Core\Injector\Injector:
Psr\SimpleCache\CacheInterface.FileTextCache_Cache: Psr\SimpleCache\CacheInterface.FileTextCache_Cache:
factory: SilverStripe\Core\Cache\CacheFactory factory: SilverStripe\Core\Cache\CacheFactory

10
_config/config.yml Normal file
View File

@ -0,0 +1,10 @@
---
Name: textextractionconfig
---
SilverStripe\Core\Injector\Injector:
# Define default FileTextCache implementation
SilverStripe\TextExtraction\Cache\FileTextCache:
class: SilverStripe\TextExtraction\Cache\FileTextCache\Database
SilverStripe\TextExtraction\Cache\FileTextCache\Database:
max_content_length: 500000

3
codecov.yml Normal file
View File

@ -0,0 +1,3 @@
comment: false
codecov:
branch: master

View File

@ -1,37 +1,44 @@
{ {
"name": "silverstripe/textextraction", "name": "silverstripe/textextraction",
"type": "silverstripe-vendormodule", "type": "silverstripe-vendormodule",
"description": "Text Extraction API for SilverStripe CMS (mostly used with 'fulltextsearch' module)", "description": "Text Extraction API for SilverStripe CMS (mostly used with 'fulltextsearch' module)",
"homepage": "http://silverstripe.org", "homepage": "http://silverstripe.org",
"license": "BSD-3-Clause", "license": "BSD-3-Clause",
"keywords": ["silverstripe", "fulltext", "pdf"], "keywords": [
"authors": [ "silverstripe",
{ "fulltext",
"name": "SilverStripe", "pdf"
"homepage": "http://silverstripe.com" ],
}, "authors": [
{ {
"name": "The SilverStripe Community", "name": "SilverStripe",
"homepage": "http://silverstripe.org" "homepage": "http://silverstripe.com"
} },
], {
"require": { "name": "The SilverStripe Community",
"php": ">=5.6", "homepage": "http://silverstripe.org"
"silverstripe/framework": "^4", }
"guzzlehttp/guzzle": "~6.3.0", ],
"symfony/event-dispatcher": "^2.6.0@stable", "require": {
"symfony/http-foundation": "^2.6.0", "silverstripe/framework": "^4",
"silverstripe/assets": "^1" "silverstripe/assets": "^1",
}, "silverstripe/versioned": "^1",
"require-dev": { "guzzlehttp/guzzle": "~6.3.0",
"phpunit/phpunit": "^5.7" "symfony/event-dispatcher": "^2.6.0@stable",
}, "symfony/http-foundation": "^2.6.0"
"suggest": { },
"ext-fileinfo": "Improved support for file mime detection" "require-dev": {
}, "squizlabs/php_codesniffer": "^3",
"extra": { "phpunit/phpunit": "^5.7"
"branch-alias": { },
"dev-master": "3.x-dev" "suggest": {
} "ext-fileinfo": "Improved support for file mime detection"
} },
"extra": {
"branch-alias": {
"dev-master": "3.x-dev"
}
},
"minimum-stability": "dev",
"prefer-stable": true
} }

View File

@ -8,31 +8,30 @@ the content available through your `DataObject` subclass.
In this case, add the following to `mysite/_config/config.yml`: In this case, add the following to `mysite/_config/config.yml`:
```yaml ```yaml
File: SilverStripe\Assets\File:
extensions: extensions:
- FileTextExtractable - SilverStripe\TextExtraction\Extension\FileTextExtractable
``` ```
By default any extracted content will be cached against the database row. By default any extracted content will be cached against the database row. In order to stay within common size
In order to stay within common size constraints for SQL queries required in this operation, constraints for SQL queries required in this operation, the cache sets a maximum character length after which
the cache sets a maximum character length after which content gets truncated (default: 500000). content gets truncated (default: 500000). You can configure this value through
You can configure this value through `FileTextCache_Database.max_content_length` in your yaml configuration. `SilverStripe\TextExtraction\Cache\FileTextCache\Database.max_content_length` in your YAML configuration.
Alternatively, extracted content can be cached using SS_Cache to prevent excessive database growth. Alternatively, extracted content can be cached using SS_Cache to prevent excessive database growth.
In order to swap out the cache backend you can use the following yaml configuration. In order to swap out the cache backend you can use the following yaml configuration.
```yaml ```yaml
--- ---
Name: mytextextraction Name: mytextextraction
After: '#textextraction' After: '#textextraction'
--- ---
Injector: SilverStripe\Core\Injector\Injector:
FileTextCache: FileTextCache_SSCache SilverStripe\TextExtraction\Cache\FileTextCache:
FileTextCache_SSCache: class: SilverStripe\TextExtraction\Cache\FileTextCache\Cache
lifetime: 3600 # Number of seconds to cache content for
SilverStripe\TextExtraction\Cache\FileTextCache\Cache:
lifetime: 3600 # Number of seconds to cache content for
``` ```
## XPDF ## XPDF
@ -42,7 +41,7 @@ commandline utility. Follow their installation instructions, its presence will b
detected for \*nix operating systems. You can optionally set the binary path (required for Windows) in `mysite/_config/config.yml`: detected for \*nix operating systems. You can optionally set the binary path (required for Windows) in `mysite/_config/config.yml`:
```yml ```yml
PDFTextExtractor: SilverStripe\TextExtraction\Extractor\PDFTextExtractor:
binary_location: /my/path/pdftotext binary_location: /my/path/pdftotext
``` ```
@ -59,7 +58,7 @@ in your database driver, or even pass it back to Solr as part of a full index up
In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`): In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`):
```yml ```yml
SolrCellTextExtractor: SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor:
base_url: 'http://localhost:8983/solr/update/extract' base_url: 'http://localhost:8983/solr/update/extract'
``` ```
@ -76,16 +75,27 @@ or by writing your own method around `FileTextExtractor->getContent()` (see "Usa
The property should be listed in your `SolrIndex` subclass, e.g. as follows: The property should be listed in your `SolrIndex` subclass, e.g. as follows:
```php ```php
class MyDocument extends DataObject { use SilverStripe\ORM\DataObject;
static $db = array('Path' => 'Text'); use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
function getContent() {
class MyDocument extends DataObject
{
private static $db = ['Path' => 'Text'];
public function getContent()
{
$extractor = FileTextExtractor::for_file($this->Path); $extractor = FileTextExtractor::for_file($this->Path);
return $extractor ? $extractor->getContent($this->Path) : null; return $extractor ? $extractor->getContent($this->Path) : null;
} }
} }
class MySolrIndex extends SolrIndex {
function init() { use SilverStripe\FullTextSearch\Solr;
$this->addClass('MyDocument');
class MySolrIndex extends SolrIndex
{
public function init()
{
$this->addClass(MyDocument::class);
$this->addStoredField('Content', 'HTMLText'); $this->addStoredField('Content', 'HTMLText');
} }
} }
@ -120,14 +130,15 @@ exec java -jar tika-app-1.8.jar "$@"
Tika can also be run as a server. You can configure your server endpoint by setting the url via config. Tika can also be run as a server. You can configure your server endpoint by setting the url via config.
```yaml ```yaml
TikaServerTextExtractor: SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor:
server_endpoint: 'http://localhost:9998' server_endpoint: 'http://localhost:9998'
``` ```
Alternatively this may be specified via the `SS_TIKA_ENDPOINT` directive in your `_ss_environment.php` file, or an environment variable of the same name. Alternatively this may be specified via the `SS_TIKA_ENDPOINT` environment variable in your `.env` file, or an
environment variable of the same name.
Then startup your server as below Then startup your server as below:
```bash ```bash
java -jar tika-server-1.8.jar --host=localhost --port=9998 java -jar tika-server-1.8.jar --host=localhost --port=9998
@ -136,7 +147,7 @@ java -jar tika-server-1.8.jar --host=localhost --port=9998
While you can run `tika-app-1.8.jar` in server mode as well (with the `--server` flag), While you can run `tika-app-1.8.jar` in server mode as well (with the `--server` flag),
it behaves differently and is not recommended. it behaves differently and is not recommended.
The module will log extraction errors with `SS_Log::NOTICE` priority by default, The module will log extraction errors with PSR-3 "notice" priority by default,
for example a "422 Unprocessable Entity" HTTP response for an encrypted PDF. for example a "422 Unprocessable Entity" HTTP response for an encrypted PDF.
In case you want more information on why processing failed, you can increase In case you want more information on why processing failed, you can increase
the logging verbosity in the tika server instance by passing through the logging verbosity in the tika server instance by passing through

View File

@ -1,7 +1,8 @@
# Developer documentation # Developer documentation
## Usage ## Usage
Manual extraction: Manual extraction via string file path:
```php ```php
$myFile = '/my/path/myfile.pdf'; $myFile = '/my/path/myfile.pdf';
@ -9,6 +10,14 @@ $extractor = FileTextExtractor::for_file($myFile);
$content = $extractor->getContent($myFile); $content = $extractor->getContent($myFile);
``` ```
Manual extraction via File object:
```php
$myFile = File::get()->filter(['Name' => 'My file'])->first();
$extractor = FileTextExtractor::for_file($myFile);
$content = $extractor->getContent($myFile);
```
Extraction with `FileTextExtractable` extension applied: Extraction with `FileTextExtractable` extension applied:
```php ```php

View File

@ -1,4 +1,4 @@
Copyright (c) 2017, SilverStripe Limited Copyright (c) 2018, SilverStripe Limited
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

10
phpcs.xml.dist Normal file
View File

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<ruleset name="SilverStripe">
<description>CodeSniffer ruleset for SilverStripe coding conventions.</description>
<!-- base rules are PSR-2 -->
<rule ref="PSR2" >
<!-- Current exclusions -->
<exclude name="PSR1.Methods.CamelCapsMethodName.NotCamelCaps" />
</rule>
</ruleset>

14
phpunit.xml.dist Normal file
View File

@ -0,0 +1,14 @@
<phpunit bootstrap="vendor/silverstripe/framework/tests/bootstrap.php" colors="true">
<testsuite name="Default">
<directory>tests/</directory>
</testsuite>
<filter>
<whitelist addUncoveredFilesFromWhitelist="true">
<directory suffix=".php">src/</directory>
<exclude>
<directory suffix=".php">tests/</directory>
</exclude>
</whitelist>
</filter>
</phpunit>

View File

@ -1,6 +1,6 @@
<?php <?php
namespace SilverStripe\TextExtraction\Extension; namespace SilverStripe\TextExtraction\Cache;
use SilverStripe\Assets\File; use SilverStripe\Assets\File;

View File

@ -1,19 +1,21 @@
<?php <?php
namespace SilverStripe\TextExtraction\Extension; namespace SilverStripe\TextExtraction\Cache\FileTextCache;
use SilverStripe\Assets\File, use Psr\SimpleCache\CacheInterface;
SilverStripe\Core\Config\Config, use SilverStripe\Assets\File;
SilverStripe\TextExtraction\Extension\FileTextCache, use SilverStripe\Core\Config\Configurable;
SilverStripe\Core\Flushable, use SilverStripe\Core\Flushable;
Psr\SimpleCache\CacheInterface, use SilverStripe\Core\Injector\Injector;
SilverStripe\Core\Injector\Injector; use SilverStripe\TextExtraction\Cache\FileTextCache;
/** /**
* Uses SS_Cache with a lifetime to cache extracted content * Uses SS_Cache with a lifetime to cache extracted content
*/ */
class FileTextCache_Cache implements FileTextCache, Flushable class Cache implements FileTextCache, Flushable
{ {
use Configurable;
/** /**
* Lifetime of cache in seconds * Lifetime of cache in seconds
* Null is indefinite * Null is indefinite
@ -46,7 +48,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable
/** /**
* *
* @param File $file * @param File $file
* @return type * @return mixed
*/ */
public function load(File $file) public function load(File $file)
{ {
@ -63,8 +65,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable
*/ */
public function save(File $file, $content) public function save(File $file, $content)
{ {
$lifetime = Config::inst()->get(__CLASS__, 'lifetime'); $lifetime = $this->config()->get('lifetime') ?: 3600;
$lifetime = $lifetime ?: 3600;
$key = $this->getKey($file); $key = $this->getKey($file);
$cache = self::get_cache(); $cache = self::get_cache();
@ -94,7 +95,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable
/** /**
* *
* @param File $file * @param File $file
* @return type * @return bool
*/ */
public function invalidate(File $file) public function invalidate(File $file)
{ {

View File

@ -1,17 +1,25 @@
<?php <?php
namespace SilverStripe\TextExtraction\Extension; namespace SilverStripe\TextExtraction\Cache\FileTextCache;
use SilverStripe\Assets\File, use SilverStripe\Assets\File;
SilverStripe\Core\Config\Config, use SilverStripe\Core\Config\Configurable;
SilverStripe\TextExtraction\Extension\FileTextCache; use SilverStripe\TextExtraction\Cache\FileTextCache;
/** /**
* Caches the extracted content on the record for the file. * Caches the extracted content on the record for the file.
* Limits the stored file content by default to avoid hitting query size limits. * Limits the stored file content by default to avoid hitting query size limits.
*/ */
class FileTextCache_Database implements FileTextCache class Database implements FileTextCache
{ {
use Configurable;
/**
* @config
* @var int
*/
private static $max_content_length = null;
/** /**
* *
* @param File $file * @param File $file
@ -28,7 +36,7 @@ class FileTextCache_Database implements FileTextCache
*/ */
public function save(File $file, $content) public function save(File $file, $content)
{ {
$maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length'); $maxLength = $this->config()->get('max_content_length');
$file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content; $file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content;
$file->write(); $file->write();
} }

View File

@ -1,9 +0,0 @@
<?php
namespace SilverStripe\TextExtraction\Exception;
use \Exception;
class FileTextExtractor_Exception extends Exception
{
}

View File

@ -2,9 +2,10 @@
namespace SilverStripe\TextExtraction\Extension; namespace SilverStripe\TextExtraction\Extension;
use SilverStripe\ORM\DataExtension, use SilverStripe\Assets\File;
SilverStripe\TextExtraction\Extension\FileTextCache, use SilverStripe\ORM\DataExtension;
SilverStripe\Control\Director; use SilverStripe\TextExtraction\Cache\FileTextCache;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/** /**
* Decorate File or a File derivative to enable text extraction from the file content. Uses a set of subclasses of * Decorate File or a File derivative to enable text extraction from the file content. Uses a set of subclasses of
@ -13,36 +14,32 @@ use SilverStripe\ORM\DataExtension,
* Adds an additional property which is the cached contents, which is populated on demand. * Adds an additional property which is the cached contents, which is populated on demand.
* *
* @author mstephens * @author mstephens
*
*/ */
class FileTextExtractable extends DataExtension class FileTextExtractable extends DataExtension
{ {
/** /**
*
* @var array * @var array
* @config * @config
*/ */
private static $db = array( private static $db = [
'FileContentCache' => 'Text' 'FileContentCache' => 'Text'
); ];
/** /**
*
* @var array * @var array
* @config * @config
*/ */
private static $casting = array( private static $casting = [
'FileContent' => 'Text' 'FileContent' => 'Text'
); ];
/** /**
*
* @var array * @var array
* @config * @config
*/ */
private static $dependencies = array( private static $dependencies = [
'TextCache' => '%$SilverStripe\TextExtraction\Extension\FileTextCache_Cache' 'TextCache' => '%$' . FileTextCache::class,
); ];
/** /**
* @var FileTextCache * @var FileTextCache
@ -50,13 +47,13 @@ class FileTextExtractable extends DataExtension
protected $fileTextCache = null; protected $fileTextCache = null;
/** /**
*
* @param FileTextCache $cache * @param FileTextCache $cache
* @return void * @return $this
*/ */
public function setTextCache(FileTextCache $cache) public function setTextCache(FileTextCache $cache)
{ {
$this->fileTextCache = $cache; $this->fileTextCache = $cache;
return $this;
} }
/** /**
@ -78,37 +75,38 @@ class FileTextExtractable extends DataExtension
} }
/** /**
* Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text. * Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and
* The value is also cached into the File record itself. * returns the text. The value is also cached into the File record itself.
* *
* @param boolean $disableCache If false, the file content is only parsed on demand. * @param boolean $disableCache If false, the file content is only parsed on demand.
* If true, the content parsing is forced, bypassing * If true, the content parsing is forced, bypassing
* the cached version * the cached version
* @return mixed string | null * @return string|null
*/ */
public function extractFileAsText($disableCache = false) public function extractFileAsText($disableCache = false)
{ {
/** @var File $file */
$file = $this->owner;
if (!$disableCache) { if (!$disableCache) {
$text = $this->getTextCache()->load($this->owner); $text = $this->getTextCache()->load($file);
if ($text) { if ($text) {
return $text; return $text;
} }
} }
// Determine which extractor can process this file. // Determine which extractor can process this file.
$path = Director::baseFolder() . '/' . $this->owner->getFilename(); $extractor = FileTextExtractor::for_file($file);
$extractor = FileTextExtractor::for_file($path);
if (!$extractor) { if (!$extractor) {
return null; return null;
} }
$text = $extractor->getContent($path); $text = $extractor->getContent($file);
if (!$text) { if (!$text) {
return null; return null;
} }
if (!$disableCache) { if (!$disableCache) {
$this->getTextCache()->save($this->owner, $text); $this->getTextCache()->save($file, $text);
} }
return $text; return $text;

View File

@ -2,17 +2,22 @@
namespace SilverStripe\TextExtraction\Extractor; namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Core\Config\Config, use SilverStripe\Assets\File;
SilverStripe\Core\Injector\Injector, use SilverStripe\Core\ClassInfo;
SilverStripe\Core\ClassInfo; use SilverStripe\Core\Config\Config;
use SilverStripe\Core\Config\Configurable;
use SilverStripe\Core\Injector\Injectable;
use SilverStripe\Core\Injector\Injector;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
/** /**
* A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents. * A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents.
* @author mstephens * @author mstephens
*
*/ */
abstract class FileTextExtractor abstract class FileTextExtractor
{ {
use Configurable;
use Injectable;
/** /**
* Set priority from 0-100. * Set priority from 0-100.
@ -45,7 +50,7 @@ abstract class FileTextExtractor
// Generate the sorted list of extractors on demand. // Generate the sorted list of extractors on demand.
$classes = ClassInfo::subclassesFor(__CLASS__); $classes = ClassInfo::subclassesFor(__CLASS__);
array_shift($classes); array_shift($classes);
$classPriorities = array(); $classPriorities = [];
foreach ($classes as $class) { foreach ($classes as $class) {
$classPriorities[$class] = Config::inst()->get($class, 'priority'); $classPriorities[$class] = Config::inst()->get($class, 'priority');
@ -76,23 +81,25 @@ abstract class FileTextExtractor
*/ */
protected static function get_mime($path) protected static function get_mime($path)
{ {
$file = new Symfony\Component\HttpFoundation\File\File($path); $file = new \Symfony\Component\HttpFoundation\File\File($path);
return $file->getMimeType(); return $file->getMimeType();
} }
/** /**
* @param string $path * Given a File object, decide which extractor instance to use to handle it
* @return mixed FileTextExtractor | null *
* @param File $file
* @return FileTextExtractor|null
*/ */
public static function for_file($path) public static function for_file(File $file)
{ {
if (!file_exists($path) || is_dir($path)) { if (!$file) {
return; return null;
} }
$extension = pathinfo($path, PATHINFO_EXTENSION); $extension = $file->getExtension();
$mime = self::get_mime($path); $mime = $file->getMimeType();
foreach (self::get_extractor_classes() as $className) { foreach (self::get_extractor_classes() as $className) {
$extractor = self::get_extractor($className); $extractor = self::get_extractor($className);
@ -114,6 +121,39 @@ abstract class FileTextExtractor
} }
} }
/**
 * Some text extractors (like pdftotext) may require a physical file to read from, so write the current
 * file contents to a temp file and return its path.
 *
 * The returned path keeps the original file's extension (when it has one). The caller is responsible
 * for removing the returned file once it is no longer needed.
 *
 * @param File $file
 * @return string Path of the temporary copy of the file contents
 * @throws Exception If the temporary file name cannot be allocated or the contents cannot be written
 */
protected function getPathFromFile(File $file)
{
    // tempnam() does not just reserve a name: it creates an empty file at the returned path
    $placeholder = tempnam(TEMP_PATH, 'pdftextextractor_');
    if (false === $placeholder) {
        throw new Exception(static::class . '->getPathFromFile() could not allocate temporary file name');
    }

    $path = $placeholder;

    // Append extension to temp file if one is set
    $extension = $file->getExtension();
    if ($extension) {
        $path .= '.' . $extension;

        // The contents are written to the suffixed path, so the extension-less placeholder created
        // by tempnam() would otherwise be left behind in the temp directory on every call
        if (file_exists($placeholder)) {
            unlink($placeholder);
        }
    }

    // Remove any existing temp files with this name
    if (file_exists($path)) {
        unlink($path);
    }

    $bytesWritten = file_put_contents($path, $file->getStream());
    if (false === $bytesWritten) {
        throw new Exception(static::class . '->getPathFromFile() failed to write temporary file');
    }

    return $path;
}
/** /**
* Checks if the extractor is supported on the current environment, * Checks if the extractor is supported on the current environment,
* for example if the correct binaries or libraries are available. * for example if the correct binaries or libraries are available.
@ -132,7 +172,7 @@ abstract class FileTextExtractor
abstract public function supportsExtension($extension); abstract public function supportsExtension($extension);
/** /**
* Determine if this extractor suports the given mime type. * Determine if this extractor supports the given mime type.
* Will only be called if supportsExtension returns false. * Will only be called if supportsExtension returns false.
* *
* @param string $mime * @param string $mime
@ -141,10 +181,10 @@ abstract class FileTextExtractor
abstract public function supportsMime($mime); abstract public function supportsMime($mime);
/** /**
* Given a file path, extract the contents as text. * Given a File instance, extract the contents as text.
* *
* @param string $path * @param File|string $file Either the File instance, or a file path for a file to load
* @return string * @return string
*/ */
abstract public function getContent($path); abstract public function getContent($file);
} }

View File

@ -0,0 +1,7 @@
<?php
namespace SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/**
* Exception type used by the text extractors in this module to signal extraction failures.
* It adds no behaviour of its own beyond the built-in \Exception it extends.
*/
class Exception extends \Exception
{
}

View File

@ -2,47 +2,16 @@
namespace SilverStripe\TextExtraction\Extractor; namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor; use SilverStripe\Assets\File;
/** /**
* Text extractor that uses php function strip_tags to get just the text. OK for indexing, not the best for readable text. * Text extractor that uses php function strip_tags to get just the text. OK for indexing, not
* @author mstephens * the best for readable text.
* *
* @author mstephens
*/ */
class HTMLTextExtractor extends FileTextExtractor class HTMLTextExtractor extends FileTextExtractor
{ {
/**
*
* @return boolean
*/
public function isAvailable()
{
return true;
}
/**
*
* @param string $extension
* @return array
*/
public function supportsExtension($extension)
{
return in_array(
strtolower($extension), array("html", "htm", "xhtml")
);
}
/**
*
* @param string $mime
* @return string
*/
public function supportsMime($mime)
{
return strtolower($mime) === 'text/html';
}
/** /**
* Lower priority because it's not the most clever HTML extraction. If there is something better, use it * Lower priority because it's not the most clever HTML extraction. If there is something better, use it
* *
@ -51,44 +20,71 @@ class HTMLTextExtractor extends FileTextExtractor
*/ */
private static $priority = 10; private static $priority = 10;
/**
* Report whether this extractor can be used in the current environment.
* Always returns true: this extractor unconditionally reports itself as available.
*
* @return boolean
*/
public function isAvailable()
{
return true;
}
/**
* Determine whether this extractor handles the given file extension.
* The check is case-insensitive and accepts "html", "htm" and "xhtml".
*
* @param string $extension
* @return boolean
*/
public function supportsExtension($extension)
{
return in_array(strtolower($extension), ["html", "htm", "xhtml"]);
}
/**
* @param string $mime
* @return string
*/
public function supportsMime($mime)
{
return strtolower($mime) === 'text/html';
}
/** /**
* Extracts content from regex, by using strip_tags() * Extracts content from regex, by using strip_tags()
* combined with regular expressions to remove non-content tags like <style> or <script>, * combined with regular expressions to remove non-content tags like <style> or <script>,
* as well as adding line breaks after block tags. * as well as adding line breaks after block tags.
* *
* @param string $path * @param File $file
* @return string * @return string
*/ */
public function getContent($path) public function getContent($file)
{ {
$content = file_get_contents($path); $content = $file instanceof File ? $file->getString() : file_get_contents($file);
// Yes, yes, regex'ing HTML is evil. // Yes, yes, regex'ing HTML is evil.
// Since we don't care about well-formedness or markup here, it does the job. // Since we don't care about well-formedness or markup here, it does the job.
$content = preg_replace( $content = preg_replace(
array( [
// Remove invisible content // Remove invisible content
'@<head[^>]*?>.*?</head>@siu', '@<head[^>]*?>.*?</head>@siu',
'@<style[^>]*?>.*?</style>@siu', '@<style[^>]*?>.*?</style>@siu',
'@<script[^>]*?.*?</script>@siu', '@<script[^>]*?.*?</script>@siu',
'@<object[^>]*?.*?</object>@siu', '@<object[^>]*?.*?</object>@siu',
'@<embed[^>]*?.*?</embed>@siu', '@<embed[^>]*?.*?</embed>@siu',
'@<applet[^>]*?.*?</applet>@siu', '@<applet[^>]*?.*?</applet>@siu',
'@<noframes[^>]*?.*?</noframes>@siu', '@<noframes[^>]*?.*?</noframes>@siu',
'@<noscript[^>]*?.*?</noscript>@siu', '@<noscript[^>]*?.*?</noscript>@siu',
'@<noembed[^>]*?.*?</noembed>@siu', '@<noembed[^>]*?.*?</noembed>@siu',
// Add line breaks before and after blocks // Add line breaks before and after blocks
'@</?((address)|(blockquote)|(center)|(del))@iu', '@</?((address)|(blockquote)|(center)|(del))@iu',
'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu', '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu', '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
'@</?((table)|(th)|(td)|(caption))@iu', '@</?((table)|(th)|(td)|(caption))@iu',
'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu', '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu', '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
'@</?((frameset)|(frame)|(iframe))@iu', '@</?((frameset)|(frame)|(iframe))@iu',
), array( ],
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0", [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0"],
), $content $content
); );
return strip_tags($content); return strip_tags($content);
} }
} }

View File

@ -2,17 +2,15 @@
namespace SilverStripe\TextExtraction\Extractor; namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor, use SilverStripe\Assets\File;
SilverStripe\TextExtraction\Exception\FileTextExtractor_Exception; use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
/** /**
* Text extractor that calls pdftotext to do the conversion. * Text extractor that calls pdftotext to do the conversion.
* @author mstephens * @author mstephens
*
*/ */
class PDFTextExtractor extends FileTextExtractor class PDFTextExtractor extends FileTextExtractor
{ {
/** /**
* Set to bin path this extractor can execute * Set to bin path this extractor can execute
* *
@ -27,10 +25,10 @@ class PDFTextExtractor extends FileTextExtractor
* @config * @config
* @var array * @var array
*/ */
private static $search_binary_locations = array( private static $search_binary_locations = [
'/usr/bin', '/usr/bin',
'/usr/local/bin', '/usr/local/bin',
); ];
public function isAvailable() public function isAvailable()
{ {
@ -46,12 +44,13 @@ class PDFTextExtractor extends FileTextExtractor
public function supportsMime($mime)
{
    // PDF mime types this extractor accepts
    $pdfMimes = [
        'application/pdf',
        'application/x-pdf',
        'application/x-bzpdf',
        'application/x-gzpdf'
    ];

    // The lookup is done on the lower-cased mime type
    $candidate = strtolower($mime);

    return in_array($candidate, $pdfMimes);
}
@ -64,10 +63,10 @@ class PDFTextExtractor extends FileTextExtractor
protected function bin($program = '') protected function bin($program = '')
{ {
// Get list of allowed search paths // Get list of allowed search paths
if ($location = $this->config()->binary_location) { if ($location = $this->config()->get('binary_location')) {
$locations = array($location); $locations = [$location];
} else { } else {
$locations = $this->config()->search_binary_locations; $locations = $this->config()->get('search_binary_locations');
} }
// Find program in each path // Find program in each path
@ -85,35 +84,41 @@ class PDFTextExtractor extends FileTextExtractor
return null; return null;
} }
public function getContent($file)
{
    // Nothing was passed in at all
    if (!$file) {
        return '';
    }

    // A path was passed in, but no file exists there
    if (is_string($file) && !file_exists($file)) {
        return '';
    }

    // Run pdftotext, then replace ligature characters in its output
    return $this->cleanupLigatures($this->getRawOutput($file));
}
/** /**
* Invoke pdftotext with the given path * Invoke pdftotext with the given File object
* *
* @param string $path * @param File|string $file
* @return string Output * @return string Output
* @throws FileTextExtractor_Exception * @throws Exception
*/ */
protected function getRawOutput($path) protected function getRawOutput($file)
{ {
if (!$this->isAvailable()) { if (!$this->isAvailable()) {
throw new FileTextExtractor_Exception("getRawOutput called on unavailable extractor"); throw new Exception("getRawOutput called on unavailable extractor");
} }
$path = $file instanceof File ? $this->getPathFromFile($file) : $file;
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err); exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
if ($err) { if ($err) {
if (!is_array($err) && $err == 1) { if (!is_array($err) && $err == 1) {
// For Windows compatibility // For Windows compatibility
$err = $content; $err = $content;
} }
throw new FileTextExtractor_Exception(sprintf(
'PDFTextExtractor->getContent() failed for %s: %s', $path, implode(PHP_EOL, $err) throw new Exception(sprintf(
'PDFTextExtractor->getContent() failed for %s: %s',
$path,
implode(PHP_EOL, $err)
)); ));
} }
@ -130,7 +135,7 @@ class PDFTextExtractor extends FileTextExtractor
*/ */
protected function cleanupLigatures($input) protected function cleanupLigatures($input)
{ {
$mapping = array( $mapping = [
'ff' => 'ff', 'ff' => 'ff',
'fi' => 'fi', 'fi' => 'fi',
'fl' => 'fl', 'fl' => 'fl',
@ -138,9 +143,8 @@ class PDFTextExtractor extends FileTextExtractor
'ffl' => 'ffl', 'ffl' => 'ffl',
'ſt' => 'ft', 'ſt' => 'ft',
'st' => 'st' 'st' => 'st'
); ];
return str_replace(array_keys($mapping), array_values($mapping), $input); return str_replace(array_keys($mapping), array_values($mapping), $input);
} }
} }

View File

@ -2,9 +2,12 @@
namespace SilverStripe\TextExtraction\Extractor; namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor, use Exception;
GuzzleHttp\Client, use GuzzleHttp\Client;
Psr\Log\LoggerInterface; use InvalidArgumentException;
use Psr\Log\LoggerInterface;
use SilverStripe\Assets\File;
use SilverStripe\Core\Injector\Injector;
/** /**
* Text extractor that calls an Apache Solr instance * Text extractor that calls an Apache Solr instance
@ -18,7 +21,7 @@ use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
class SolrCellTextExtractor extends FileTextExtractor class SolrCellTextExtractor extends FileTextExtractor
{ {
/** /**
* Base URL to use for solr text extraction. * Base URL to use for Solr text extraction.
* E.g. http://localhost:8983/solr/update/extract * E.g. http://localhost:8983/solr/update/extract
* *
* @config * @config
@ -27,43 +30,36 @@ class SolrCellTextExtractor extends FileTextExtractor
private static $base_url; private static $base_url;
/** /**
*
* @var int * @var int
* @config * @config
*/ */
private static $priority = 75; private static $priority = 75;
/** /**
* * @var Client
* @var GuzzleHttp\Client
*/ */
protected $httpClient; protected $httpClient;
/**
 * Returns the HTTP client, creating a default one the first time none has been set.
 *
 * @return Client
 */
public function getHttpClient()
{
    if ($this->httpClient) {
        return $this->httpClient;
    }

    $this->httpClient = new Client();

    return $this->httpClient;
}
/**
 * Replaces the HTTP client that getHttpClient() hands out.
 *
 * @param Client $client
 * @return $this
 */
public function setHttpClient(Client $client)
{
    $this->httpClient = $client;

    return $this;
}
/** /**
@ -73,30 +69,28 @@ class SolrCellTextExtractor extends FileTextExtractor
{ {
$url = $this->config()->get('base_url'); $url = $this->config()->get('base_url');
return (boolean) $url; return (bool) $url;
} }
/**
 * Whether the given extension is in this extractor's list of handled document types.
 * The extension is lower-cased before the lookup.
 *
 * @param string $extension
 * @return bool
 */
public function supportsExtension($extension)
{
    $handled = [
        'pdf', 'doc', 'docx', 'xls', 'xlsx',
        'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
        'ppt', 'pptx', 'odp', 'fodp', 'csv'
    ];

    return in_array(strtolower($extension), $handled);
}
/** /**
*
* @param string $mime * @param string $mime
* @return boolean * @return bool
*/ */
public function supportsMime($mime) public function supportsMime($mime)
{ {
@ -105,48 +99,57 @@ class SolrCellTextExtractor extends FileTextExtractor
} }
/** /**
* * @param File|string $file
* @param string $path
* @return string * @return string
* @throws InvalidArgumentException
*/ */
public function getContent($path) public function getContent($file)
{ {
if (!$path) { if (!$file || (is_string($file) && !file_exists($file))) {
return ""; // no file
} // no file return '';
}
$fileName = basename($path); $fileName = $file instanceof File ? $file->getFilename() : basename($file);
$client = $this->getHttpClient(); $client = $this->getHttpClient();
// Get and validate base URL
$baseUrl = $this->config()->get('base_url');
if (!$this->config()->get('base_url')) {
throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
}
try { try {
$path = $this->getPathFromFile($file);
$request = $client $request = $client
->post() ->post($baseUrl)
->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text')) ->addPostFields(['extractOnly' => 'true', 'extractFormat' => 'text'])
->addPostFiles(array('myfile' => $path)); ->addPostFiles(['myfile' => $path]);
$response = $request->send(); $response = $request->send();
} catch (\InvalidArgumentException $e) { } catch (InvalidArgumentException $e) {
$msg = sprintf( $msg = sprintf(
'Error extracting text from "%s" (message: %s)', 'Error extracting text from "%s" (message: %s)',
$path, $fileName,
$e->getMessage() $e->getMessage()
); );
Injector::inst()->get(LoggerInterface::class)->notice($msg); Injector::inst()->get(LoggerInterface::class)->notice($msg);
return null; return null;
} catch (\Exception $e) { } catch (Exception $e) {
// Catch other errors that Tika can throw vai Guzzle but are not caught and break Solr search query in some cases. // Catch other errors that Tika can throw vai Guzzle but are not caught and break Solr search
// query in some cases.
$msg = sprintf( $msg = sprintf(
'Tika server error attempting to extract from "%s" (message: %s)', 'Tika server error attempting to extract from "%s" (message: %s)',
$path, $path,
$e->getMessage() $e->getMessage()
); );
Injector::inst()->get(LoggerInterface::class)->notice($msg); Injector::inst()->get(LoggerInterface::class)->notice($msg);
return null; return null;
} }
// Just initialise it, it doesn't take miuch. // Just initialise it, it doesn't take much.
$matches = []; $matches = [];
// Use preg match to avoid SimpleXML running out of memory on large text nodes // Use preg match to avoid SimpleXML running out of memory on large text nodes

View File

@ -2,10 +2,10 @@
namespace SilverStripe\TextExtraction\Extractor; namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor, use SilverStripe\Assets\File;
SilverStripe\Core\Injector\Injector, use SilverStripe\Core\Environment;
SilverStripe\Core\Environment, use SilverStripe\Core\Injector\Injector;
SilverStripe\TextExtraction\Rest\TikaRestClient; use SilverStripe\TextExtraction\Rest\TikaRestClient;
/** /**
* Enables text extraction of file content via the Tika Rest Server * Enables text extraction of file content via the Tika Rest Server
@ -35,18 +35,25 @@ class TikaServerTextExtractor extends FileTextExtractor
*/ */
protected $client = null; protected $client = null;
/**
* Cache of supported mime types
*
* @var array
*/
protected $supportedMimes = [];
/**
 * Returns the Tika REST client, building it through the Injector with the server
 * endpoint on first use and reusing that instance afterwards.
 *
 * @return TikaRestClient
 */
public function getClient()
{
    if ($this->client) {
        return $this->client;
    }

    $constructorArgs = [$this->getServerEndpoint()];
    $this->client = Injector::inst()->createWithArgs(TikaRestClient::class, $constructorArgs);

    return $this->client;
}
/** /**
@ -59,19 +66,17 @@ class TikaServerTextExtractor extends FileTextExtractor
} }
// Default to configured endpoint // Default to configured endpoint
return $this->config()->server_endpoint; return $this->config()->get('server_endpoint');
} }
/**
 * Get the version of Tika installed, or 0 if not installed.
 * The value comes straight from the REST client.
 *
 * @return float version of Tika
 */
public function getVersion()
{
    $client = $this->getClient();

    return $client->getVersion();
}
/** /**
@ -79,13 +84,12 @@ class TikaServerTextExtractor extends FileTextExtractor
*/ */
public function isAvailable()
{
    // Without an endpoint the client is never asked anything
    if (!$this->getServerEndpoint()) {
        return false;
    }

    // The client itself must report the server as reachable
    if (!$this->getClient()->isAvailable()) {
        return false;
    }

    // Only versions from 1.7.0 upwards are accepted
    return version_compare($this->getVersion(), '1.7.0') >= 0;
}
/** /**
*
* @param string $extension * @param string $extension
* @return boolean * @return boolean
*/ */
@ -95,31 +99,23 @@ class TikaServerTextExtractor extends FileTextExtractor
return false; return false;
} }
/** /**
* Cache of supported mime types
*
* @var array
*/
protected $supportedMimes = array();
/**
*
* @param string $mime * @param string $mime
* @return boolean * @return boolean
*/ */
public function supportsMime($mime) public function supportsMime($mime)
{ {
$supported = $this->supportedMimes ?: if (!$this->supportedMimes) {
($this->supportedMimes = $this->getClient()->getSupportedMimes()); $this->supportedMimes = $this->getClient()->getSupportedMimes();
}
// Check if supported (most common / quickest lookup) // Check if supported (most common / quickest lookup)
if (isset($supported[$mime])) { if (isset($this->supportedMimes[$mime])) {
return true; return true;
} }
// Check aliases // Check aliases
foreach ($supported as $info) { foreach ($this->supportedMimes as $info) {
if (isset($info['alias']) && in_array($mime, $info['alias'])) { if (isset($info['alias']) && in_array($mime, $info['alias'])) {
return true; return true;
} }
@ -128,8 +124,9 @@ class TikaServerTextExtractor extends FileTextExtractor
return false; return false;
} }
public function getContent($file)
{
    // A File instance is resolved to a path first; any other value is used as the path as-is
    if ($file instanceof File) {
        $path = $this->getPathFromFile($file);
    } else {
        $path = $file;
    }

    return $this->getClient()->tika($path);
}
} }

View File

@ -2,7 +2,7 @@
namespace SilverStripe\TextExtraction\Extractor; namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor; use SilverStripe\Assets\File;
/** /**
* Enables text extraction of file content via the Tika CLI * Enables text extraction of file content via the Tika CLI
@ -47,13 +47,13 @@ class TikaTextExtractor extends FileTextExtractor
*/ */
protected function runShell($command, &$stdout = '', &$stderr = '', $input = '') protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
{ {
$descriptorSpecs = array( $descriptorSpecs = [
0 => array("pipe", "r"), 0 => ["pipe", "r"],
1 => array("pipe", "w"), 1 => ["pipe", "w"],
2 => array("pipe", "w") 2 => ["pipe", "w"]
); ];
// Invoke command // Invoke command
$pipes = array(); $pipes = [];
$proc = proc_open($command, $descriptorSpecs, $pipes); $proc = proc_open($command, $descriptorSpecs, $pipes);
if (!is_resource($proc)) { if (!is_resource($proc)) {
@ -74,14 +74,10 @@ class TikaTextExtractor extends FileTextExtractor
return proc_close($proc); return proc_close($proc);
} }
/** public function getContent($file)
*
* @param string $path
* @return string
*/
public function getContent($path)
{ {
$mode = $this->config()->output_mode; $mode = $this->config()->get('output_mode');
$path = $file instanceof File ? $this->getPathFromFile($file) : $file;
$command = sprintf('tika %s %s', $mode, escapeshellarg($path)); $command = sprintf('tika %s %s', $mode, escapeshellarg($path));
$code = $this->runShell($command, $output); $code = $this->runShell($command, $output);
@ -91,8 +87,7 @@ class TikaTextExtractor extends FileTextExtractor
} }
/** /**
* * @return bool
* @return boolean
*/ */
public function isAvailable() public function isAvailable()
{ {
@ -100,8 +95,7 @@ class TikaTextExtractor extends FileTextExtractor
} }
/** /**
* * @return bool
* @return boolean
*/ */
public function supportsExtension($extension) public function supportsExtension($extension)
{ {
@ -111,9 +105,8 @@ class TikaTextExtractor extends FileTextExtractor
/** /**
*
* @param string $mime * @param string $mime
* @return boolean * @return bool
*/ */
public function supportsMime($mime) public function supportsMime($mime)
{ {
@ -121,8 +114,9 @@ class TikaTextExtractor extends FileTextExtractor
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error); $code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
if ($code) { if ($code) {
// Error case
return false; return false;
} // Error case }
// Check if the mime type is inside the result // Check if the mime type is inside the result
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/')); $pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));

View File

@ -2,11 +2,11 @@
namespace SilverStripe\TextExtraction\Rest; namespace SilverStripe\TextExtraction\Rest;
use GuzzleHttp\Client, use GuzzleHttp\Client;
GuzzleHttp\Exception\RequestException, use GuzzleHttp\Exception\RequestException;
SilverStripe\Core\Environment, use Psr\Log\LoggerInterface;
Psr\Log\LoggerInterface, use SilverStripe\Core\Environment;
SilverStripe\Core\Injector\Injector; use SilverStripe\Core\Injector\Injector;
class TikaRestClient extends Client class TikaRestClient extends Client
{ {
@ -15,30 +15,30 @@ class TikaRestClient extends Client
* *
* @var array * @var array
*/ */
protected $options = array('username' => null, 'password' => null); protected $options = ['username' => null, 'password' => null];
/** /**
* @var array * @var array
*/ */
protected $mimes = array(); protected $mimes = [];
/**
 * Reads optional credentials from the environment and configures the underlying Guzzle client.
 *
 * @param string $baseUrl Base URL of the Tika server; requests in this class use relative
 *                        paths (e.g. 'tika') that need it to resolve
 * @param array|null $config Additional Guzzle client options. null is accepted because
 *                           that was the previous default
 */
public function __construct($baseUrl = '', $config = [])
{
    // Credentials are only stored when a password is present in the environment
    $password = Environment::getEnv('SS_TIKA_PASSWORD');
    if (!empty($password)) {
        $this->options = [
            'username' => Environment::getEnv('SS_TIKA_USERNAME'),
            'password' => $password,
        ];
    }

    // Normalise a null config (the old default) to an empty array
    $config = (array) $config;

    // Forward the base URL to the parent client instead of silently discarding it.
    // NOTE(review): assumes Guzzle 6, where the option is named "base_uri" (Guzzle 4/5 used
    // "base_url") - confirm against the version pinned in composer.json.
    if ($baseUrl && !isset($config['base_uri'])) {
        $config['base_uri'] = $baseUrl;
    }

    parent::__construct($config);
}
/** /**
@ -58,7 +58,7 @@ class TikaRestClient extends Client
} }
} catch (RequestException $ex) { } catch (RequestException $ex) {
$msg = sprintf("Tika unavailable - %s", $ex->getMessage()); $msg = sprintf("Tika unavailable - %s", $ex->getMessage());
Injector::inst()->get(LoggerInterface::class)->error($msg); Injector::inst()->get(LoggerInterface::class)->info($msg);
return false; return false;
} }
@ -120,7 +120,7 @@ class TikaRestClient extends Client
try { try {
$response = $this->put( $response = $this->put(
'tika', 'tika',
array('Accept' => 'text/plain'), ['Accept' => 'text/plain'],
file_get_contents($file) file_get_contents($file)
); );
$response->setAuth($this->options['username'], $this->options['password']); $response->setAuth($this->options['username'], $this->options['password']);
@ -139,7 +139,7 @@ class TikaRestClient extends Client
$msg .= ' Body: ' . $body; $msg .= ' Body: ' . $body;
} }
Injector::inst()->get(LoggerInterface::class)->notice($msg); Injector::inst()->get(LoggerInterface::class)->info($msg);
} }
return $text; return $text;

View File

@ -1,23 +1,23 @@
<?php <?php
use SilverStripe\TextExtraction\Extension\FileTextCache, namespace SilverStripe\TextExtraction\Tests;
SilverStripe\TextExtraction\Extension\FileTextCache_Database,
SilverStripe\Dev\SapphireTest, use SilverStripe\Assets\File;
SilverStripe\Core\Config\Config; use SilverStripe\Core\Config\Config;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Cache\FileTextCache\Database;
class FileTextCacheDatabaseTest extends SapphireTest
{
    /**
     * Content saved through the database cache must come back cut down to the
     * configured max_content_length.
     */
    public function testTruncatesByMaxLength()
    {
        Config::modify()->set(Database::class, 'max_content_length', 5);

        $cache = new Database();
        // write() is mocked out so that saving does not try to persist the File
        $file = $this->getMockBuilder(File::class)->setMethods(['write'])->getMock();
        $content = '0123456789';
        $cache->save($file, $content);

        // Expected value goes first so a failure reports expected/actual the right way round
        $this->assertEquals('01234', $cache->load($file));
    }
}

View File

@ -1,46 +1,58 @@
<?php <?php
namespace SilverStripe\TextExtraction\Tests;
use SilverStripe\Assets\File;
use SilverStripe\Core\Config\Config;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Extension\FileTextExtractable;
class FileTextExtractableTest extends SapphireTest
{
    protected $usesDatabase = true;

    protected static $required_extensions = [
        File::class => [
            FileTextExtractable::class,
        ],
    ];

    /**
     * Allows html uploads and prepares a throwaway copy of the HTML fixture.
     */
    protected function setUp()
    {
        parent::setUp();

        // Ensure that html is a valid extension
        Config::modify()->merge(File::class, 'allowed_extensions', ['html']);

        // Work on a copy of the fixture, as the original may be clobbered by the test
        // ($file->extractFileAsText() calls $file->write)
        copy(__DIR__ . '/fixtures/test1.html', $this->fixtureCopyPath());
    }

    /**
     * Removes the fixture copy again if it is still there.
     */
    protected function tearDown()
    {
        $copy = $this->fixtureCopyPath();
        if (file_exists($copy)) {
            unlink($copy);
        }

        parent::tearDown();
    }

    /**
     * Extracting text from an HTML file returns its visible text and stores it in the cache field.
     */
    public function testExtractFileAsText()
    {
        // Use HTML, since the extractor is always available
        /** @var File|FileTextExtractable $file */
        $file = new File(['Name' => 'test1-copy.html']);
        $file->setFromLocalFile($this->fixtureCopyPath());
        $file->write();

        $extracted = $file->extractFileAsText();

        $this->assertNotNull($extracted);
        $this->assertContains('Test Headline', $extracted);
        $this->assertContains('Test Text', $extracted);
        $this->assertEquals($extracted, $file->FileContentCache);
    }

    /**
     * Absolute path of the temporary fixture copy used by this test.
     *
     * @return string
     */
    private function fixtureCopyPath()
    {
        return __DIR__ . '/fixtures/test1-copy.html';
    }
}

View File

@ -1,11 +1,33 @@
<?php <?php
namespace SilverStripe\TextExtraction\Tests;
use SilverStripe\Assets\File;
use SilverStripe\Core\Config\Config;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Extractor\HTMLTextExtractor;
class HTMLTextExtractorTest extends SapphireTest class HTMLTextExtractorTest extends SapphireTest
{ {
protected $usesDatabase = true;
protected function setUp()
{
parent::setUp();
Config::modify()->merge(File::class, 'allowed_extensions', ['html']);
}
public function testExtraction() public function testExtraction()
{ {
$extractor = new HTMLTextExtractor(); $extractor = new HTMLTextExtractor();
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html'); $file = new File();
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.html');
$file->write();
$content = $extractor->getContent($file);
$this->assertContains('Test Headline', $content); $this->assertContains('Test Headline', $content);
$this->assertNotContains('Test Comment', $content, 'Strips HTML comments'); $this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
$this->assertNotContains('Test Style', $content, 'Strips non-content style tags'); $this->assertNotContains('Test Style', $content, 'Strips non-content style tags');

View File

@ -1,17 +1,29 @@
<?php <?php
namespace SilverStripe\TextExtraction\Tests;
use SilverStripe\Assets\File;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
use SilverStripe\TextExtraction\Extractor\PDFTextExtractor;
class PDFTextExtractorTest extends SapphireTest
{
    protected $usesDatabase = true;

    /**
     * Extracts the PDF fixture. When the extractor reports itself unavailable,
     * the "unavailable extractor" exception is expected instead.
     */
    public function testExtraction()
    {
        $pdfExtractor = new PDFTextExtractor();

        $available = $pdfExtractor->isAvailable();
        if (!$available) {
            $this->expectException(Exception::class);
            $this->expectExceptionMessage('getRawOutput called on unavailable extractor');
        }

        // Store the fixture as a File record so the extractor is fed a File instance
        $pdf = new File();
        $pdf->setFromLocalFile(__DIR__ . '/fixtures/test1.pdf');
        $pdf->write();

        $text = $pdfExtractor->getContent($pdf);

        $this->assertContains('This is a test file with a link', $text);
    }
}

View File

@ -0,0 +1,36 @@
<?php
namespace SilverStripe\TextExtraction\Tests;
use SilverStripe\Assets\File;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor;
/**
* @group tika-tests
*/
class TikaServerTextExtractorTest extends SapphireTest
{
    protected $usesDatabase = true;

    /**
     * Extracts the PDF fixture through the Tika server and checks mime type support.
     * Skipped when the extractor reports the server as unavailable.
     */
    public function testServerExtraction()
    {
        $tika = TikaServerTextExtractor::create();
        if (!$tika->isAvailable()) {
            $this->markTestSkipped('tika server not available');
        }

        // Check file
        $pdf = new File();
        $pdf->setFromLocalFile(__DIR__ . '/fixtures/test1.pdf');
        $pdf->write();

        $text = $tika->getContent($pdf);
        $this->assertContains('This is a test file with a link', $text);

        // Check mime validation
        $expectations = [
            'application/pdf' => true,
            'text/html' => true,
            'application/not-supported' => false,
        ];
        foreach ($expectations as $mime => $shouldBeSupported) {
            if ($shouldBeSupported) {
                $this->assertTrue($tika->supportsMime($mime));
            } else {
                $this->assertFalse($tika->supportsMime($mime));
            }
        }
    }
}

View File

@ -1,37 +1,32 @@
<?php <?php
namespace SilverStripe\TextExtraction\Tests;
use SilverStripe\Assets\File;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Extractor\TikaTextExtractor;
/** /**
* Tests the {@see TikaTextExtractor} class * Tests the {@see TikaTextExtractor} class
*
* @group tika-tests
*/ */
class TikaTextExtractorTest extends SapphireTest class TikaTextExtractorTest extends SapphireTest
{ {
protected $usesDatabase = true;
public function testExtraction() public function testExtraction()
{ {
$extractor = new TikaTextExtractor(); $extractor = TikaTextExtractor::create();
if (!$extractor->isAvailable()) { if (!$extractor->isAvailable()) {
$this->markTestSkipped('tika cli not available'); $this->markTestSkipped('tika cli not available');
} }
// Check file // Check file
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'; $file = new File();
$content = $extractor->getContent($file); $file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.pdf');
$this->assertContains('This is a test file with a link', $content); $file->write();
// Check mime validation
$this->assertTrue($extractor->supportsMime('application/pdf'));
$this->assertTrue($extractor->supportsMime('text/html'));
$this->assertFalse($extractor->supportsMime('application/not-supported'));
}
public function testServerExtraction()
{
$extractor = new TikaServerTextExtractor();
if (!$extractor->isAvailable()) {
$this->markTestSkipped('tika server not available');
}
// Check file
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
$content = $extractor->getContent($file); $content = $extractor->getContent($file);
$this->assertContains('This is a test file with a link', $content); $this->assertContains('This is a test file with a link', $content);