Merge pull request #45 from creative-commoners/pulls/3.0/ss4-updates

API Update namespaces and SilverStripe API implementations for SilverStripe 4 compat
This commit is contained in:
Dylan Wagstaff 2018-07-04 11:34:17 +12:00 committed by GitHub
commit 9795866abe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
35 changed files with 636 additions and 444 deletions

1
.gitattributes vendored
View File

@ -4,3 +4,4 @@
/.gitignore export-ignore
/.travis.yml export-ignore
/.scrutinizer.yml export-ignore
/codecov.yml export-ignore

View File

@ -1,9 +1,13 @@
inherit: true
checks:
php:
code_rating: true
duplication: true
php: true
build:
nodes:
analysis:
tests:
override: [php-scrutinizer-run]
filter:
paths: [code/*, tests/*]
paths: [src/*, tests/*]

View File

@ -1,39 +1,47 @@
# See https://github.com/silverstripe/silverstripe-travis-support for setup details
language: php
sudo: false
addons:
apt:
packages:
- poppler-utils
env:
global:
- COMPOSER_ROOT_VERSION=3.x-dev
- SS_TIKA_ENDPOINT="http://localhost:9998/"
matrix:
include:
- php: 5.4
env: DB=PGSQL CORE_RELEASE=3.2
- php: 5.5
env: DB=PGSQL CORE_RELEASE=3.3
- php: 5.6
env: DB=PGSQL CORE_RELEASE=3.4
- php: 5.6
env: DB=MYSQL CORE_RELEASE=3.5
env: DB=MYSQL RECIPE_VERSION=1.0.x-dev PHPCS_TEST=1 PHPUNIT_TEST=1
- php: 7.0
env: DB=MYSQL CORE_RELEASE=3.6
env: DB=MYSQL RECIPE_VERSION=1.1.x-dev PHPUNIT_TEST=1
- php: 7.1
env: DB=MYSQL CORE_RELEASE=3
env: DB=PGSQL RECIPE_VERSION=4.2.x-dev PHPUNIT_COVERAGE_TEST=1
- php: 7.2
env: DB=MYSQL RECIPE_VERSION=4.x-dev PHPUNIT_TEST=1
before_script:
- composer self-update || true
# Init PHP
- phpenv rehash
- phpenv config-rm xdebug.ini
# Configure Tika bin
- mkdir -p $HOME/bin
- export PATH=$PATH:$HOME/bin
- export SS_TIKA_ENDPOINT="http://localhost:9998/"
- ./.travis/install_tika.sh
- git clone git://github.com/silverstripe/silverstripe-travis-support.git ~/travis-support
- php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss
- cd ~/builds/ss
- composer install
- ($HOME/bin/tika-rest-server &) &> /dev/null
# Install composer dependencies
- composer validate
- composer require --no-update silverstripe/recipe-core "$RECIPE_VERSION"
- if [[ $DB == PGSQL ]]; then composer require --no-update silverstripe/postgresql 2.1.x-dev; fi
- composer install --prefer-dist --no-interaction --no-progress --no-suggest --optimize-autoloader --verbose --profile
script:
- ($HOME/bin/tika-rest-server &) &> /dev/null
- vendor/bin/phpunit --verbose textextraction/tests/
- if [[ $PHPUNIT_TEST ]]; then vendor/bin/phpunit; fi
- if [[ $PHPUNIT_COVERAGE_TEST ]]; then phpdbg -qrr vendor/bin/phpunit --coverage-clover=coverage.xml; fi
- if [[ $PHPCS_TEST ]]; then vendor/bin/phpcs src/ tests/; fi
after_success:
- if [[ $PHPUNIT_COVERAGE_TEST ]]; then bash <(curl -s https://codecov.io/bash) -f coverage.xml; fi

14
.upgrade.yml Normal file
View File

@ -0,0 +1,14 @@
mappings:
FileTextExtractable: SilverStripe\TextExtraction\Extension\FileTextExtractable
FileTextCache: SilverStripe\TextExtraction\Cache\FileTextCache
FileTextCache_Cache: SilverStripe\TextExtraction\Cache\FileTextCache\Cache
FileTextCache_Database: SilverStripe\TextExtraction\Cache\FileTextCache\Database
FileTextExtractor: SilverStripe\TextExtraction\Extractor\FileTextExtractor
FileTextExtractor_Exception: SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception
HTMLTextExtractor: SilverStripe\TextExtraction\Extractor\HTMLTextExtractor
PDFTextExtractor: SilverStripe\TextExtraction\Extractor\PDFTextExtractor
SolrCellTextExtractor: SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor
TikaServerTextExtractor: SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor
TikaTextExtractor: SilverStripe\TextExtraction\Extractor\TikaTextExtractor
TikaRestClient: SilverStripe\TextExtraction\Rest\TikaRestClient

View File

@ -1,12 +0,0 @@
# Changelog
All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).
## [2.0.1]
Using Symfony mime type detection
## [2.0.0]
Clarified Tika docs

View File

@ -1,11 +1,9 @@
# Text extraction module
[![Build Status](https://secure.travis-ci.org/silverstripe/silverstripe-textextraction.png)](http://travis-ci.org/silverstripe/silverstripe-textextraction)
[![Build Status](https://travis-ci.org/silverstripe/silverstripe-textextraction.svg?branch=master)](https://travis-ci.org/silverstripe/silverstripe-textextraction)
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction/?branch=master)
[![codecov](https://codecov.io/gh/silverstripe/silverstripe-textextraction/branch/master/graph/badge.svg)](https://codecov.io/gh/silverstripe/silverstripe-textextraction)
[![SilverStripe supported module](https://img.shields.io/badge/silverstripe-supported-0071C4.svg)](https://www.silverstripe.org/software/addons/silverstripe-commercially-supported-module-list/)
[![Code Quality](http://img.shields.io/scrutinizer/g/silverstripe/silverstripe-textextraction.svg?style=flat)](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction)
[![Version](http://img.shields.io/packagist/v/silverstripe/textextraction.svg?style=flat)](https://packagist.org/packages/silverstripe/silverstripe-textextraction)
[![License](http://img.shields.io/packagist/l/silverstripe/textextraction.svg?style=flat)](license.md)
Provides a text extraction API for file content, that can hook into different extractor
engines based on availability and the parsed file format. The output returned is always a string of the file content.
@ -26,14 +24,14 @@ The module supports text extraction on the following file formats:
## Requirements
* SilverStripe ^3.1
* SilverStripe ^4.0
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
* (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
* (optional) [Apache Tika](http://tika.apache.org/)
## Installation
```js
```
composer require silverstripe/textextraction
```

View File

View File

@ -3,9 +3,8 @@ Name: textextractioncache
After:
- '#corecache'
---
SilverStripe\Core\Injector\Injector:
Psr\SimpleCache\CacheInterface.FileTextCache_Cache:
factory: SilverStripe\Core\Cache\CacheFactory
constructor:
namespace: 'FileTextCache_Cache'
namespace: 'FileTextCache_Cache'

10
_config/config.yml Normal file
View File

@ -0,0 +1,10 @@
---
Name: textextractionconfig
---
SilverStripe\Core\Injector\Injector:
# Define default FileTextCache implementation
SilverStripe\TextExtraction\Cache\FileTextCache:
class: SilverStripe\TextExtraction\Cache\FileTextCache\Database
SilverStripe\TextExtraction\Cache\FileTextCache\Database:
max_content_length: 500000

3
codecov.yml Normal file
View File

@ -0,0 +1,3 @@
comment: false
codecov:
branch: master

View File

@ -1,37 +1,44 @@
{
"name": "silverstripe/textextraction",
"type": "silverstripe-vendormodule",
"description": "Text Extraction API for SilverStripe CMS (mostly used with 'fulltextsearch' module)",
"homepage": "http://silverstripe.org",
"license": "BSD-3-Clause",
"keywords": ["silverstripe", "fulltext", "pdf"],
"authors": [
{
"name": "SilverStripe",
"homepage": "http://silverstripe.com"
},
{
"name": "The SilverStripe Community",
"homepage": "http://silverstripe.org"
}
],
"require": {
"php": ">=5.6",
"silverstripe/framework": "^4",
"guzzlehttp/guzzle": "~6.3.0",
"symfony/event-dispatcher": "^2.6.0@stable",
"symfony/http-foundation": "^2.6.0",
"silverstripe/assets": "^1"
},
"require-dev": {
"phpunit/phpunit": "^5.7"
},
"suggest": {
"ext-fileinfo": "Improved support for file mime detection"
},
"extra": {
"branch-alias": {
"dev-master": "3.x-dev"
}
}
"name": "silverstripe/textextraction",
"type": "silverstripe-vendormodule",
"description": "Text Extraction API for SilverStripe CMS (mostly used with 'fulltextsearch' module)",
"homepage": "http://silverstripe.org",
"license": "BSD-3-Clause",
"keywords": [
"silverstripe",
"fulltext",
"pdf"
],
"authors": [
{
"name": "SilverStripe",
"homepage": "http://silverstripe.com"
},
{
"name": "The SilverStripe Community",
"homepage": "http://silverstripe.org"
}
],
"require": {
"silverstripe/framework": "^4",
"silverstripe/assets": "^1",
"silverstripe/versioned": "^1",
"guzzlehttp/guzzle": "~6.3.0",
"symfony/event-dispatcher": "^2.6.0@stable",
"symfony/http-foundation": "^2.6.0"
},
"require-dev": {
"squizlabs/php_codesniffer": "^3",
"phpunit/phpunit": "^5.7"
},
"suggest": {
"ext-fileinfo": "Improved support for file mime detection"
},
"extra": {
"branch-alias": {
"dev-master": "3.x-dev"
}
},
"minimum-stability": "dev",
"prefer-stable": true
}

View File

@ -8,31 +8,30 @@ the content available through your `DataObject` subclass.
In this case, add the following to `mysite/_config/config.yml`:
```yaml
File:
SilverStripe\Assets\File:
extensions:
- FileTextExtractable
- SilverStripe\TextExtraction\Extension\FileTextExtractable
```
By default any extracted content will be cached against the database row.
In order to stay within common size constraints for SQL queries required in this operation,
the cache sets a maximum character length after which content gets truncated (default: 500000).
You can configure this value through `FileTextCache_Database.max_content_length` in your yaml configuration.
By default any extracted content will be cached against the database row. In order to stay within common size
constraints for SQL queries required in this operation, the cache sets a maximum character length after which
content gets truncated (default: 500000). You can configure this value through
`SilverStripe\TextExtraction\Cache\FileTextCache\Database.max_content_length` in your YAML configuration.
Alternatively, extracted content can be cached using SilverStripe's PSR-16 cache (`Psr\SimpleCache\CacheInterface`) to prevent excessive database growth.
In order to swap out the cache backend you can use the following YAML configuration.
```yaml
---
Name: mytextextraction
After: '#textextractionconfig'
---
Injector:
FileTextCache: FileTextCache_SSCache
FileTextCache_SSCache:
lifetime: 3600 # Number of seconds to cache content for
SilverStripe\Core\Injector\Injector:
SilverStripe\TextExtraction\Cache\FileTextCache:
class: SilverStripe\TextExtraction\Cache\FileTextCache\Cache
SilverStripe\TextExtraction\Cache\FileTextCache\Cache:
lifetime: 3600 # Number of seconds to cache content for
```
## XPDF
@ -42,7 +41,7 @@ commandline utility. Follow their installation instructions, its presence will b
detected for \*nix operating systems. You can optionally set the binary path (required for Windows) in `mysite/_config/config.yml`:
```yml
PDFTextExtractor:
SilverStripe\TextExtraction\Extractor\PDFTextExtractor:
binary_location: /my/path/pdftotext
```
@ -59,7 +58,7 @@ in your database driver, or even pass it back to Solr as part of a full index up
In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`):
```yml
SolrCellTextExtractor:
SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor:
base_url: 'http://localhost:8983/solr/update/extract'
```
@ -76,16 +75,27 @@ or by writing your own method around `FileTextExtractor->getContent()` (see "Usa
The property should be listed in your `SolrIndex` subclass, e.g. as follows:
```php
class MyDocument extends DataObject {
static $db = array('Path' => 'Text');
function getContent() {
use SilverStripe\ORM\DataObject;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
class MyDocument extends DataObject
{
private static $db = ['Path' => 'Text'];
public function getContent()
{
$extractor = FileTextExtractor::for_file($this->Path);
return $extractor ? $extractor->getContent($this->Path) : null;
}
}
class MySolrIndex extends SolrIndex {
function init() {
$this->addClass('MyDocument');
use SilverStripe\FullTextSearch\Solr\SolrIndex;
class MySolrIndex extends SolrIndex
{
public function init()
{
$this->addClass(MyDocument::class);
$this->addStoredField('Content', 'HTMLText');
}
}
@ -120,14 +130,15 @@ exec java -jar tika-app-1.8.jar "$@"
Tika can also be run as a server. You can configure your server endpoint by setting the url via config.
```yaml
TikaServerTextExtractor:
SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor:
server_endpoint: 'http://localhost:9998'
```
Alternatively this may be specified via the `SS_TIKA_ENDPOINT` directive in your `_ss_environment.php` file, or an environment variable of the same name.
Alternatively this may be specified by setting `SS_TIKA_ENDPOINT` in your `.env` file, or as a
system environment variable of the same name.
Then startup your server as below
Then start up your server as below:
```bash
java -jar tika-server-1.8.jar --host=localhost --port=9998
@ -136,7 +147,7 @@ java -jar tika-server-1.8.jar --host=localhost --port=9998
While you can run `tika-app-1.8.jar` in server mode as well (with the `--server` flag),
it behaves differently and is not recommended.
The module will log extraction errors with `SS_Log::NOTICE` priority by default,
The module will log extraction errors with PSR-3 "notice" priority by default,
for example a "422 Unprocessable Entity" HTTP response for an encrypted PDF.
In case you want more information on why processing failed, you can increase
the logging verbosity in the tika server instance by passing through

View File

@ -1,7 +1,8 @@
# Developer documentation
## Usage
Manual extraction:
Manual extraction via string file path:
```php
$myFile = '/my/path/myfile.pdf';
@ -9,6 +10,14 @@ $extractor = FileTextExtractor::for_file($myFile);
$content = $extractor->getContent($myFile);
```
Manual extraction via File object:
```php
$myFile = File::get()->filter(['Name' => 'My file'])->first();
$extractor = FileTextExtractor::for_file($myFile);
$content = $extractor->getContent($myFile);
```
Extraction with `FileTextExtractable` extension applied:
```php

View File

@ -1,4 +1,4 @@
Copyright (c) 2017, SilverStripe Limited
Copyright (c) 2018, SilverStripe Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

10
phpcs.xml.dist Normal file
View File

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<ruleset name="SilverStripe">
<description>CodeSniffer ruleset for SilverStripe coding conventions.</description>
<!-- base rules are PSR-2 -->
<rule ref="PSR2" >
<!-- Current exclusions -->
<exclude name="PSR1.Methods.CamelCapsMethodName.NotCamelCaps" />
</rule>
</ruleset>

14
phpunit.xml.dist Normal file
View File

@ -0,0 +1,14 @@
<phpunit bootstrap="vendor/silverstripe/framework/tests/bootstrap.php" colors="true">
<testsuite name="Default">
<directory>tests/</directory>
</testsuite>
<filter>
<whitelist addUncoveredFilesFromWhitelist="true">
<directory suffix=".php">src/</directory>
<exclude>
<directory suffix=".php">tests/</directory>
</exclude>
</whitelist>
</filter>
</phpunit>

View File

@ -1,6 +1,6 @@
<?php
namespace SilverStripe\TextExtraction\Extension;
namespace SilverStripe\TextExtraction\Cache;
use SilverStripe\Assets\File;

View File

@ -1,19 +1,21 @@
<?php
namespace SilverStripe\TextExtraction\Extension;
namespace SilverStripe\TextExtraction\Cache\FileTextCache;
use SilverStripe\Assets\File,
SilverStripe\Core\Config\Config,
SilverStripe\TextExtraction\Extension\FileTextCache,
SilverStripe\Core\Flushable,
Psr\SimpleCache\CacheInterface,
SilverStripe\Core\Injector\Injector;
use Psr\SimpleCache\CacheInterface;
use SilverStripe\Assets\File;
use SilverStripe\Core\Config\Configurable;
use SilverStripe\Core\Flushable;
use SilverStripe\Core\Injector\Injector;
use SilverStripe\TextExtraction\Cache\FileTextCache;
/**
* Uses SS_Cache with a lifetime to cache extracted content
*/
class FileTextCache_Cache implements FileTextCache, Flushable
class Cache implements FileTextCache, Flushable
{
use Configurable;
/**
* Lifetime of cache in seconds
* Null is indefinite
@ -46,7 +48,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable
/**
*
* @param File $file
* @return type
* @return mixed
*/
public function load(File $file)
{
@ -63,8 +65,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable
*/
public function save(File $file, $content)
{
$lifetime = Config::inst()->get(__CLASS__, 'lifetime');
$lifetime = $lifetime ?: 3600;
$lifetime = $this->config()->get('lifetime') ?: 3600;
$key = $this->getKey($file);
$cache = self::get_cache();
@ -94,7 +95,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable
/**
*
* @param File $file
* @return type
* @return bool
*/
public function invalidate(File $file)
{

View File

@ -1,17 +1,25 @@
<?php
namespace SilverStripe\TextExtraction\Extension;
namespace SilverStripe\TextExtraction\Cache\FileTextCache;
use SilverStripe\Assets\File,
SilverStripe\Core\Config\Config,
SilverStripe\TextExtraction\Extension\FileTextCache;
use SilverStripe\Assets\File;
use SilverStripe\Core\Config\Configurable;
use SilverStripe\TextExtraction\Cache\FileTextCache;
/**
* Caches the extracted content on the record for the file.
* Limits the stored file content by default to avoid hitting query size limits.
*/
class FileTextCache_Database implements FileTextCache
class Database implements FileTextCache
{
use Configurable;
/**
* @config
* @var int
*/
private static $max_content_length = null;
/**
*
* @param File $file
@ -28,7 +36,7 @@ class FileTextCache_Database implements FileTextCache
*/
public function save(File $file, $content)
{
$maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length');
$maxLength = $this->config()->get('max_content_length');
$file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content;
$file->write();
}

View File

@ -1,9 +0,0 @@
<?php
namespace SilverStripe\TextExtraction\Exception;
use \Exception;
class FileTextExtractor_Exception extends Exception
{
}

View File

@ -2,9 +2,10 @@
namespace SilverStripe\TextExtraction\Extension;
use SilverStripe\ORM\DataExtension,
SilverStripe\TextExtraction\Extension\FileTextCache,
SilverStripe\Control\Director;
use SilverStripe\Assets\File;
use SilverStripe\ORM\DataExtension;
use SilverStripe\TextExtraction\Cache\FileTextCache;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/**
* Decorate File or a File derivative to enable text extraction from the file content. Uses a set of subclasses of
@ -13,36 +14,32 @@ use SilverStripe\ORM\DataExtension,
* Adds an additional property which is the cached contents, which is populated on demand.
*
* @author mstephens
*
*/
class FileTextExtractable extends DataExtension
{
/**
*
* @var array
* @config
*/
private static $db = array(
private static $db = [
'FileContentCache' => 'Text'
);
];
/**
*
* @var array
* @config
*/
private static $casting = array(
private static $casting = [
'FileContent' => 'Text'
);
];
/**
*
* @var array
* @config
*/
private static $dependencies = array(
'TextCache' => '%$SilverStripe\TextExtraction\Extension\FileTextCache_Cache'
);
private static $dependencies = [
'TextCache' => '%$' . FileTextCache::class,
];
/**
* @var FileTextCache
@ -50,13 +47,13 @@ class FileTextExtractable extends DataExtension
protected $fileTextCache = null;
/**
*
* @param FileTextCache $cache
* @return void
* @return $this
*/
public function setTextCache(FileTextCache $cache)
{
$this->fileTextCache = $cache;
return $this;
}
/**
@ -78,37 +75,38 @@ class FileTextExtractable extends DataExtension
}
/**
* Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text.
* The value is also cached into the File record itself.
* Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and
* returns the text. The value is also cached into the File record itself.
*
* @param boolean $disableCache If false, the file content is only parsed on demand.
* If true, the content parsing is forced, bypassing
* the cached version
* @return mixed string | null
* @return string|null
*/
public function extractFileAsText($disableCache = false)
{
/** @var File $file */
$file = $this->owner;
if (!$disableCache) {
$text = $this->getTextCache()->load($this->owner);
$text = $this->getTextCache()->load($file);
if ($text) {
return $text;
}
}
// Determine which extractor can process this file.
$path = Director::baseFolder() . '/' . $this->owner->getFilename();
$extractor = FileTextExtractor::for_file($path);
$extractor = FileTextExtractor::for_file($file);
if (!$extractor) {
return null;
}
$text = $extractor->getContent($path);
$text = $extractor->getContent($file);
if (!$text) {
return null;
}
if (!$disableCache) {
$this->getTextCache()->save($this->owner, $text);
$this->getTextCache()->save($file, $text);
}
return $text;

View File

@ -2,17 +2,22 @@
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Core\Config\Config,
SilverStripe\Core\Injector\Injector,
SilverStripe\Core\ClassInfo;
use SilverStripe\Assets\File;
use SilverStripe\Core\ClassInfo;
use SilverStripe\Core\Config\Config;
use SilverStripe\Core\Config\Configurable;
use SilverStripe\Core\Injector\Injectable;
use SilverStripe\Core\Injector\Injector;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
/**
* A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents.
* @author mstephens
*
*/
abstract class FileTextExtractor
{
use Configurable;
use Injectable;
/**
* Set priority from 0-100.
@ -45,7 +50,7 @@ abstract class FileTextExtractor
// Generate the sorted list of extractors on demand.
$classes = ClassInfo::subclassesFor(__CLASS__);
array_shift($classes);
$classPriorities = array();
$classPriorities = [];
foreach ($classes as $class) {
$classPriorities[$class] = Config::inst()->get($class, 'priority');
@ -76,23 +81,25 @@ abstract class FileTextExtractor
*/
protected static function get_mime($path)
{
$file = new Symfony\Component\HttpFoundation\File\File($path);
$file = new \Symfony\Component\HttpFoundation\File\File($path);
return $file->getMimeType();
}
/**
* @param string $path
* @return mixed FileTextExtractor | null
* Given a File object, decide which extractor instance to use to handle it
*
* @param File $file
* @return FileTextExtractor|null
*/
public static function for_file($path)
public static function for_file(File $file)
{
if (!file_exists($path) || is_dir($path)) {
return;
if (!$file) {
return null;
}
$extension = pathinfo($path, PATHINFO_EXTENSION);
$mime = self::get_mime($path);
$extension = $file->getExtension();
$mime = $file->getMimeType();
foreach (self::get_extractor_classes() as $className) {
$extractor = self::get_extractor($className);
@ -114,6 +121,39 @@ abstract class FileTextExtractor
}
}
/**
 * Some text extractors (like pdftotext) may require a physical file to read from, so write the current
 * file contents to a temp file and return its path.
 *
 * This method does not delete the file it returns; cleaning it up is left to the caller.
 *
 * @param File $file
 * @return string Path of the temporary file that now holds a copy of the file's contents
 * @throws Exception If a temporary file name cannot be allocated, or the contents cannot be written
 */
protected function getPathFromFile(File $file)
{
    // tempnam() reserves a unique name AND creates an empty placeholder file at that path
    $placeholder = tempnam(TEMP_PATH, 'pdftextextractor_');
    if (false === $placeholder) {
        throw new Exception(static::class . '->getPathFromFile() could not allocate temporary file name');
    }

    // Append extension to temp file if one is set
    $path = $placeholder;
    $extension = $file->getExtension();
    if ($extension) {
        $path .= '.' . $extension;
    }

    // Remove any existing temp files with this name
    if (file_exists($path)) {
        unlink($path);
    }

    $bytesWritten = file_put_contents($path, $file->getStream());

    // When an extension was appended the content lives in a different file than the placeholder
    // created by tempnam(); remove the placeholder so it is not left orphaned in the temp folder.
    // It is kept until after the write so the unique name stays reserved in the meantime.
    if ($path !== $placeholder && file_exists($placeholder)) {
        unlink($placeholder);
    }

    if (false === $bytesWritten) {
        throw new Exception(static::class . '->getPathFromFile() failed to write temporary file');
    }

    return $path;
}
/**
* Checks if the extractor is supported on the current environment,
* for example if the correct binaries or libraries are available.
@ -132,7 +172,7 @@ abstract class FileTextExtractor
abstract public function supportsExtension($extension);
/**
* Determine if this extractor suports the given mime type.
* Determine if this extractor supports the given mime type.
* Will only be called if supportsExtension returns false.
*
* @param string $mime
@ -141,10 +181,10 @@ abstract class FileTextExtractor
abstract public function supportsMime($mime);
/**
* Given a file path, extract the contents as text.
* Given a File instance, extract the contents as text.
*
* @param string $path
* @param File|string $file Either the File instance, or a file path for a file to load
* @return string
*/
abstract public function getContent($path);
abstract public function getContent($file);
}

View File

@ -0,0 +1,7 @@
<?php
namespace SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/**
* Module-specific exception type for text extraction failures.
*
* Kept as a distinct class so callers can catch extraction errors separately from
* other exceptions; it adds no behaviour of its own on top of the base \Exception.
*/
class Exception extends \Exception
{
}

View File

@ -2,47 +2,16 @@
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
use SilverStripe\Assets\File;
/**
* Text extractor that uses php function strip_tags to get just the text. OK for indexing, not the best for readable text.
* @author mstephens
* Text extractor that uses php function strip_tags to get just the text. OK for indexing, not
* the best for readable text.
*
* @author mstephens
*/
class HTMLTextExtractor extends FileTextExtractor
{
/**
*
* @return boolean
*/
public function isAvailable()
{
return true;
}
/**
*
* @param string $extension
* @return array
*/
public function supportsExtension($extension)
{
return in_array(
strtolower($extension), array("html", "htm", "xhtml")
);
}
/**
*
* @param string $mime
* @return string
*/
public function supportsMime($mime)
{
return strtolower($mime) === 'text/html';
}
/**
* Lower priority because its not the most clever HTML extraction. If there is something better, use it
*
@ -51,44 +20,71 @@ class HTMLTextExtractor extends FileTextExtractor
*/
private static $priority = 10;
/**
* Report whether this extractor can be used in the current environment.
*
* @return boolean Always true: this extractor reports itself as available unconditionally
*/
public function isAvailable()
{
return true;
}
/**
 * Determine whether this extractor handles the given file extension.
 *
 * The check is case-insensitive and expects the bare extension (no leading dot).
 *
 * @param string $extension
 * @return bool True for "html", "htm" and "xhtml", false otherwise
 */
public function supportsExtension($extension)
{
    // Both sides are always strings, so compare strictly and avoid any type juggling
    return in_array(strtolower($extension), ['html', 'htm', 'xhtml'], true);
}
/**
* Determine whether this extractor handles the given mime type.
*
* The comparison is case-insensitive but otherwise exact, so only the bare
* "text/html" value matches.
*
* @param string $mime
* @return bool True only when $mime is "text/html"
*/
public function supportsMime($mime)
{
return strtolower($mime) === 'text/html';
}
/**
* Extracts content from regex, by using strip_tags()
* combined with regular expressions to remove non-content tags like <style> or <script>,
* as well as adding line breaks after block tags.
*
* @param string $path
* @param File $file
* @return string
*/
public function getContent($path)
public function getContent($file)
{
$content = file_get_contents($path);
$content = $file instanceof File ? $file->getString() : file_get_contents($file);
// Yes, yes, regex'ing HTML is evil.
// Since we don't care about well-formedness or markup here, it does the job.
$content = preg_replace(
array(
// Remove invisible content
'@<head[^>]*?>.*?</head>@siu',
'@<style[^>]*?>.*?</style>@siu',
'@<script[^>]*?.*?</script>@siu',
'@<object[^>]*?.*?</object>@siu',
'@<embed[^>]*?.*?</embed>@siu',
'@<applet[^>]*?.*?</applet>@siu',
'@<noframes[^>]*?.*?</noframes>@siu',
'@<noscript[^>]*?.*?</noscript>@siu',
'@<noembed[^>]*?.*?</noembed>@siu',
// Add line breaks before and after blocks
'@</?((address)|(blockquote)|(center)|(del))@iu',
'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
'@</?((table)|(th)|(td)|(caption))@iu',
'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
'@</?((frameset)|(frame)|(iframe))@iu',
), array(
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0",
), $content
[
// Remove invisible content
'@<head[^>]*?>.*?</head>@siu',
'@<style[^>]*?>.*?</style>@siu',
'@<script[^>]*?.*?</script>@siu',
'@<object[^>]*?.*?</object>@siu',
'@<embed[^>]*?.*?</embed>@siu',
'@<applet[^>]*?.*?</applet>@siu',
'@<noframes[^>]*?.*?</noframes>@siu',
'@<noscript[^>]*?.*?</noscript>@siu',
'@<noembed[^>]*?.*?</noembed>@siu',
// Add line breaks before and after blocks
'@</?((address)|(blockquote)|(center)|(del))@iu',
'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
'@</?((table)|(th)|(td)|(caption))@iu',
'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
'@</?((frameset)|(frame)|(iframe))@iu',
],
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0"],
$content
);
return strip_tags($content);
}
}

View File

@ -2,17 +2,15 @@
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
SilverStripe\TextExtraction\Exception\FileTextExtractor_Exception;
use SilverStripe\Assets\File;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
/**
* Text extractor that calls pdftotext to do the conversion.
* @author mstephens
*
*/
class PDFTextExtractor extends FileTextExtractor
{
/**
* Set to bin path this extractor can execute
*
@ -27,10 +25,10 @@ class PDFTextExtractor extends FileTextExtractor
* @config
* @var array
*/
private static $search_binary_locations = array(
private static $search_binary_locations = [
'/usr/bin',
'/usr/local/bin',
);
];
public function isAvailable()
{
@ -46,12 +44,13 @@ class PDFTextExtractor extends FileTextExtractor
public function supportsMime($mime)
{
return in_array(
strtolower($mime), array(
'application/pdf',
'application/x-pdf',
'application/x-bzpdf',
'application/x-gzpdf'
)
strtolower($mime),
[
'application/pdf',
'application/x-pdf',
'application/x-bzpdf',
'application/x-gzpdf'
]
);
}
@ -64,10 +63,10 @@ class PDFTextExtractor extends FileTextExtractor
protected function bin($program = '')
{
// Get list of allowed search paths
if ($location = $this->config()->binary_location) {
$locations = array($location);
if ($location = $this->config()->get('binary_location')) {
$locations = [$location];
} else {
$locations = $this->config()->search_binary_locations;
$locations = $this->config()->get('search_binary_locations');
}
// Find program in each path
@ -85,35 +84,41 @@ class PDFTextExtractor extends FileTextExtractor
return null;
}
public function getContent($path)
public function getContent($file)
{
if (!$path) {
return "";
} // no file
$content = $this->getRawOutput($path);
if (!$file || (is_string($file) && !file_exists($file))) {
// no file
return '';
}
$content = $this->getRawOutput($file);
return $this->cleanupLigatures($content);
}
/**
* Invoke pdftotext with the given path
* Invoke pdftotext with the given File object
*
* @param string $path
* @param File|string $file
* @return string Output
* @throws FileTextExtractor_Exception
* @throws Exception
*/
protected function getRawOutput($path)
protected function getRawOutput($file)
{
if (!$this->isAvailable()) {
throw new FileTextExtractor_Exception("getRawOutput called on unavailable extractor");
throw new Exception("getRawOutput called on unavailable extractor");
}
$path = $file instanceof File ? $this->getPathFromFile($file) : $file;
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
if ($err) {
if (!is_array($err) && $err == 1) {
// For Windows compatibility
$err = $content;
}
throw new FileTextExtractor_Exception(sprintf(
'PDFTextExtractor->getContent() failed for %s: %s', $path, implode(PHP_EOL, $err)
throw new Exception(sprintf(
'PDFTextExtractor->getContent() failed for %s: %s',
$path,
implode(PHP_EOL, $err)
));
}
@ -130,7 +135,7 @@ class PDFTextExtractor extends FileTextExtractor
*/
protected function cleanupLigatures($input)
{
$mapping = array(
$mapping = [
'ff' => 'ff',
'fi' => 'fi',
'fl' => 'fl',
@ -138,9 +143,8 @@ class PDFTextExtractor extends FileTextExtractor
'ffl' => 'ffl',
'ſt' => 'ft',
'st' => 'st'
);
];
return str_replace(array_keys($mapping), array_values($mapping), $input);
}
}

View File

@ -2,9 +2,12 @@
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
GuzzleHttp\Client,
Psr\Log\LoggerInterface;
use Exception;
use GuzzleHttp\Client;
use InvalidArgumentException;
use Psr\Log\LoggerInterface;
use SilverStripe\Assets\File;
use SilverStripe\Core\Injector\Injector;
/**
* Text extractor that calls an Apache Solr instance
@ -18,7 +21,7 @@ use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
class SolrCellTextExtractor extends FileTextExtractor
{
/**
* Base URL to use for solr text extraction.
* Base URL to use for Solr text extraction.
* E.g. http://localhost:8983/solr/update/extract
*
* @config
@ -27,43 +30,36 @@ class SolrCellTextExtractor extends FileTextExtractor
private static $base_url;
/**
*
* @var int
* @config
*/
private static $priority = 75;
/**
*
* @var GuzzleHttp\Client
* @var Client
*/
protected $httpClient;
/**
*
* @return GuzzleHttp\Client
* @throws InvalidArgumentException
* @return Client
*/
public function getHttpClient()
{
if (!$this->config()->get('base_url')) {
throw new \InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
}
if (!$this->httpClient) {
$this->httpClient = new Client($this->config()->get('base_url'));
$this->httpClient = new Client();
}
return $this->httpClient;
}
/**
*
* @param GuzzleHttp\Client $client
* @return void
* @param Client $client
* @return $this
*/
public function setHttpClient($client)
public function setHttpClient(Client $client)
{
$this->httpClient = $client;
return $this;
}
/**
@ -73,30 +69,28 @@ class SolrCellTextExtractor extends FileTextExtractor
{
$url = $this->config()->get('base_url');
return (boolean) $url;
return (bool) $url;
}
/**
*
* @param string $extension
* @return boolean
* @return bool
*/
public function supportsExtension($extension)
{
return in_array(
strtolower($extension),
array(
[
'pdf', 'doc', 'docx', 'xls', 'xlsx',
'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
'ppt', 'pptx', 'odp', 'fodp', 'csv'
)
]
);
}
/**
*
* @param string $mime
* @return boolean
* @return bool
*/
public function supportsMime($mime)
{
@ -105,48 +99,57 @@ class SolrCellTextExtractor extends FileTextExtractor
}
/**
*
* @param string $path
* @param File|string $file
* @return string
* @throws InvalidArgumentException
*/
public function getContent($path)
public function getContent($file)
{
if (!$path) {
return "";
} // no file
if (!$file || (is_string($file) && !file_exists($file))) {
// no file
return '';
}
$fileName = basename($path);
$fileName = $file instanceof File ? $file->getFilename() : basename($file);
$client = $this->getHttpClient();
// Get and validate base URL
$baseUrl = $this->config()->get('base_url');
if (!$this->config()->get('base_url')) {
throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
}
try {
$path = $this->getPathFromFile($file);
$request = $client
->post()
->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text'))
->addPostFiles(array('myfile' => $path));
->post($baseUrl)
->addPostFields(['extractOnly' => 'true', 'extractFormat' => 'text'])
->addPostFiles(['myfile' => $path]);
$response = $request->send();
} catch (\InvalidArgumentException $e) {
} catch (InvalidArgumentException $e) {
$msg = sprintf(
'Error extracting text from "%s" (message: %s)',
$path,
$e->getMessage()
);
'Error extracting text from "%s" (message: %s)',
$fileName,
$e->getMessage()
);
Injector::inst()->get(LoggerInterface::class)->notice($msg);
return null;
} catch (\Exception $e) {
// Catch other errors that Tika can throw vai Guzzle but are not caught and break Solr search query in some cases.
} catch (Exception $e) {
// Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search
// query in some cases.
$msg = sprintf(
'Tika server error attempting to extract from "%s" (message: %s)',
$path,
$e->getMessage()
);
'Tika server error attempting to extract from "%s" (message: %s)',
$path,
$e->getMessage()
);
Injector::inst()->get(LoggerInterface::class)->notice($msg);
return null;
}
// Just initialise it, it doesn't take miuch.
// Just initialise it, it doesn't take much.
$matches = [];
// Use preg match to avoid SimpleXML running out of memory on large text nodes

View File

@ -2,10 +2,10 @@
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
SilverStripe\Core\Injector\Injector,
SilverStripe\Core\Environment,
SilverStripe\TextExtraction\Rest\TikaRestClient;
use SilverStripe\Assets\File;
use SilverStripe\Core\Environment;
use SilverStripe\Core\Injector\Injector;
use SilverStripe\TextExtraction\Rest\TikaRestClient;
/**
* Enables text extraction of file content via the Tika Rest Server
@ -35,18 +35,25 @@ class TikaServerTextExtractor extends FileTextExtractor
*/
protected $client = null;
/**
* Cache of supported mime types
*
* @var array
*/
protected $supportedMimes = [];
/**
* @return TikaRestClient
*/
public function getClient()
{
return $this->client ?:
($this->client =
Injector::inst()->createWithArgs(
TikaRestClient::class,
array($this->getServerEndpoint())
)
if (!$this->client) {
$this->client = Injector::inst()->createWithArgs(
TikaRestClient::class,
[$this->getServerEndpoint()]
);
}
return $this->client;
}
/**
@ -59,19 +66,17 @@ class TikaServerTextExtractor extends FileTextExtractor
}
// Default to configured endpoint
return $this->config()->server_endpoint;
return $this->config()->get('server_endpoint');
}
/**
* Get the version of tika installed, or 0 if not installed
* Get the version of Tika installed, or 0 if not installed
*
* @return float version of tika
* @return float version of Tika
*/
public function getVersion()
{
return $this
->getClient()
->getVersion();
return $this->getClient()->getVersion();
}
/**
@ -79,13 +84,12 @@ class TikaServerTextExtractor extends FileTextExtractor
*/
public function isAvailable()
{
return $this->getServerEndpoint() &&
$this->getClient()->isAvailable() &&
version_compare($this->getVersion(), '1.7.0') >= 0;
return $this->getServerEndpoint()
&& $this->getClient()->isAvailable()
&& version_compare($this->getVersion(), '1.7.0') >= 0;
}
/**
*
* @param string $extension
* @return boolean
*/
@ -95,31 +99,23 @@ class TikaServerTextExtractor extends FileTextExtractor
return false;
}
/**
* Cache of supported mime types
*
* @var array
*/
protected $supportedMimes = array();
/**
*
* @param string $mime
* @return boolean
*/
public function supportsMime($mime)
{
$supported = $this->supportedMimes ?:
($this->supportedMimes = $this->getClient()->getSupportedMimes());
if (!$this->supportedMimes) {
$this->supportedMimes = $this->getClient()->getSupportedMimes();
}
// Check if supported (most common / quickest lookup)
if (isset($supported[$mime])) {
if (isset($this->supportedMimes[$mime])) {
return true;
}
// Check aliases
foreach ($supported as $info) {
foreach ($this->supportedMimes as $info) {
if (isset($info['alias']) && in_array($mime, $info['alias'])) {
return true;
}
@ -128,8 +124,9 @@ class TikaServerTextExtractor extends FileTextExtractor
return false;
}
public function getContent($path)
public function getContent($file)
{
return $this->getClient()->tika($path);
$tempFile = $file instanceof File ? $this->getPathFromFile($file) : $file;
return $this->getClient()->tika($tempFile);
}
}

View File

@ -2,7 +2,7 @@
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
use SilverStripe\Assets\File;
/**
* Enables text extraction of file content via the Tika CLI
@ -47,13 +47,13 @@ class TikaTextExtractor extends FileTextExtractor
*/
protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
{
$descriptorSpecs = array(
0 => array("pipe", "r"),
1 => array("pipe", "w"),
2 => array("pipe", "w")
);
$descriptorSpecs = [
0 => ["pipe", "r"],
1 => ["pipe", "w"],
2 => ["pipe", "w"]
];
// Invoke command
$pipes = array();
$pipes = [];
$proc = proc_open($command, $descriptorSpecs, $pipes);
if (!is_resource($proc)) {
@ -74,14 +74,10 @@ class TikaTextExtractor extends FileTextExtractor
return proc_close($proc);
}
/**
*
* @param string $path
* @return string
*/
public function getContent($path)
public function getContent($file)
{
$mode = $this->config()->output_mode;
$mode = $this->config()->get('output_mode');
$path = $file instanceof File ? $this->getPathFromFile($file) : $file;
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
$code = $this->runShell($command, $output);
@ -91,8 +87,7 @@ class TikaTextExtractor extends FileTextExtractor
}
/**
*
* @return boolean
* @return bool
*/
public function isAvailable()
{
@ -100,8 +95,7 @@ class TikaTextExtractor extends FileTextExtractor
}
/**
*
* @return boolean
* @return bool
*/
public function supportsExtension($extension)
{
@ -111,9 +105,8 @@ class TikaTextExtractor extends FileTextExtractor
/**
*
* @param string $mime
* @return boolean
* @return bool
*/
public function supportsMime($mime)
{
@ -121,8 +114,9 @@ class TikaTextExtractor extends FileTextExtractor
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
if ($code) {
// Error case
return false;
} // Error case
}
// Check if the mime type is inside the result
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));

View File

@ -2,11 +2,11 @@
namespace SilverStripe\TextExtraction\Rest;
use GuzzleHttp\Client,
GuzzleHttp\Exception\RequestException,
SilverStripe\Core\Environment,
Psr\Log\LoggerInterface,
SilverStripe\Core\Injector\Injector;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\RequestException;
use Psr\Log\LoggerInterface;
use SilverStripe\Core\Environment;
use SilverStripe\Core\Injector\Injector;
class TikaRestClient extends Client
{
@ -15,30 +15,30 @@ class TikaRestClient extends Client
*
* @var array
*/
protected $options = array('username' => null, 'password' => null);
protected $options = ['username' => null, 'password' => null];
/**
* @var array
*/
protected $mimes = array();
protected $mimes = [];
/**
*
* @param string $baseUrl
* @param array $config
* @param array $config
*/
public function __construct($baseUrl = '', $config = null)
public function __construct($baseUrl = '', $config = [])
{
$psswd = Environment::getEnv('SS_TIKA_PASSWORD');
$password = Environment::getEnv('SS_TIKA_PASSWORD');
if (!empty($psswd)) {
$this->options = array(
if (!empty($password)) {
$this->options = [
'username' => Environment::getEnv('SS_TIKA_USERNAME'),
'password' => $psswd,
);
'password' => $password,
];
}
parent::__construct($baseUrl, $config);
parent::__construct($config);
}
/**
@ -58,7 +58,7 @@ class TikaRestClient extends Client
}
} catch (RequestException $ex) {
$msg = sprintf("Tika unavailable - %s", $ex->getMessage());
Injector::inst()->get(LoggerInterface::class)->error($msg);
Injector::inst()->get(LoggerInterface::class)->info($msg);
return false;
}
@ -120,7 +120,7 @@ class TikaRestClient extends Client
try {
$response = $this->put(
'tika',
array('Accept' => 'text/plain'),
['Accept' => 'text/plain'],
file_get_contents($file)
);
$response->setAuth($this->options['username'], $this->options['password']);
@ -139,7 +139,7 @@ class TikaRestClient extends Client
$msg .= ' Body: ' . $body;
}
Injector::inst()->get(LoggerInterface::class)->notice($msg);
Injector::inst()->get(LoggerInterface::class)->info($msg);
}
return $text;

View File

@ -1,23 +1,23 @@
<?php
use SilverStripe\TextExtraction\Extension\FileTextCache,
SilverStripe\TextExtraction\Extension\FileTextCache_Database,
SilverStripe\Dev\SapphireTest,
SilverStripe\Core\Config\Config;
namespace SilverStripe\TextExtraction\Tests;
use SilverStripe\Assets\File;
use SilverStripe\Core\Config\Config;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Cache\FileTextCache\Database;
class FileTextCacheDatabaseTest extends SapphireTest
{
public function testTruncatesByMaxLength()
{
Config::nest();
Config::modify()->set(Database::class, 'max_content_length', 5);
Config::inst()->update('FileTextCache_Database', 'max_content_length', 5);
$cache = new FileTextCache_Database();
$file = $this->getMock('File', array('write'));
$cache = new Database();
$file = $this->getMockBuilder(File::class)->setMethods(['write'])->getMock();
$content = '0123456789';
$cache->save($file, $content);
$this->assertEquals($cache->load($file), '01234');
Config::unnest();
$this->assertEquals($cache->load($file), '01234');
}
}

View File

@ -1,46 +1,58 @@
<?php
namespace SilverStripe\TextExtraction\Tests;
use SilverStripe\Assets\File;
use SilverStripe\Core\Config\Config;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Extension\FileTextExtractable;
class FileTextExtractableTest extends SapphireTest
{
protected $requiredExtensions = array(
'File' => array('FileTextExtractable')
);
protected $usesDatabase = true;
public function setUp()
protected static $required_extensions = [
File::class => [
FileTextExtractable::class,
],
];
protected function setUp()
{
parent::setUp();
// Ensure that html is a valid extension
Config::inst()
->nest()
->update('File', 'allowed_extensions', array('html'));
Config::modify()->merge(File::class, 'allowed_extensions', ['html']);
// Create a copy of the file, as it may be clobbered by the test
// ($file->extractFileAsText() calls $file->write)
copy(
dirname(__FILE__) . '/fixtures/test1.html',
dirname(__FILE__) . '/fixtures/test1-copy.html'
);
}
public function tearDown()
protected function tearDown()
{
Config::unnest();
if (file_exists(dirname(__FILE__) . '/fixtures/test1-copy.html')) {
unlink(dirname(__FILE__) . '/fixtures/test1-copy.html');
}
parent::tearDown();
}
public function testExtractFileAsText()
{
// Create a copy of the file, as it may be clobbered by the test
// ($file->extractFileAsText() calls $file->write)
copy(BASE_PATH.'/textextraction/tests/fixtures/test1.html', BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
// Use HTML, since the extractor is always available
$file = new File(array(
'Name' => 'test1-copy.html',
'Filename' => 'textextraction/tests/fixtures/test1-copy.html'
));
/** @var File|FileTextExtractable $file */
$file = new File(['Name' => 'test1-copy.html']);
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1-copy.html');
$file->write();
$content = $file->extractFileAsText();
$this->assertNotNull($content);
$this->assertContains('Test Headline', $content);
$this->assertContains('Test Text', $content);
$this->assertEquals($content, $file->FileContentCache);
if (file_exists(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html')) {
unlink(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
}
}
}

View File

@ -1,11 +1,33 @@
<?php
namespace SilverStripe\TextExtraction\Tests;
use SilverStripe\Assets\File;
use SilverStripe\Core\Config\Config;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Extractor\HTMLTextExtractor;
class HTMLTextExtractorTest extends SapphireTest
{
protected $usesDatabase = true;
protected function setUp()
{
parent::setUp();
Config::modify()->merge(File::class, 'allowed_extensions', ['html']);
}
public function testExtraction()
{
$extractor = new HTMLTextExtractor();
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html');
$file = new File();
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.html');
$file->write();
$content = $extractor->getContent($file);
$this->assertContains('Test Headline', $content);
$this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
$this->assertNotContains('Test Style', $content, 'Strips non-content style tags');

View File

@ -1,17 +1,29 @@
<?php
namespace SilverStripe\TextExtraction\Tests;
use SilverStripe\Assets\File;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
use SilverStripe\TextExtraction\Extractor\PDFTextExtractor;
class PDFTextExtractorTest extends SapphireTest
{
protected $usesDatabase = true;
public function testExtraction()
{
$extractor = new PDFTextExtractor();
if (!$extractor->isAvailable()) {
$this->setExpectedException(
'FileTextExtractor_Exception',
'getRawOutput called on unavailable extractor'
);
$this->expectException(Exception::class);
$this->expectExceptionMessage('getRawOutput called on unavailable extractor');
}
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf');
$file = new File();
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.pdf');
$file->write();
$content = $extractor->getContent($file);
$this->assertContains('This is a test file with a link', $content);
}
}

View File

@ -0,0 +1,36 @@
<?php
namespace SilverStripe\TextExtraction\Tests;
use SilverStripe\Assets\File;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor;
/**
 * Exercises {@see TikaServerTextExtractor} end to end: text extraction from a
 * PDF fixture followed by mime type support checks. The test is skipped when
 * the extractor reports that it is not available.
 *
 * @group tika-tests
 */
class TikaServerTextExtractorTest extends SapphireTest
{
    /**
     * The test writes a File record, so a database is requested.
     *
     * @var bool
     */
    protected $usesDatabase = true;

    /**
     * Extracts the text of the PDF fixture through the extractor and then
     * verifies which mime types it claims to support.
     */
    public function testServerExtraction()
    {
        $tika = TikaServerTextExtractor::create();

        // Nothing to verify unless the extractor says it can be used
        if (!$tika->isAvailable()) {
            $this->markTestSkipped('tika server not available');
        }

        // Build a File record backed by the PDF fixture next to this test
        $fixturePath = __DIR__ . '/fixtures/test1.pdf';
        $pdf = new File();
        $pdf->setFromLocalFile($fixturePath);
        $pdf->write();

        // The extracted text must include the known sentence from the fixture
        $extractedText = $tika->getContent($pdf);
        $this->assertContains('This is a test file with a link', $extractedText);

        // Supported mime types are accepted ...
        $this->assertTrue($tika->supportsMime('application/pdf'));
        $this->assertTrue($tika->supportsMime('text/html'));

        // ... and an unknown one is rejected
        $this->assertFalse($tika->supportsMime('application/not-supported'));
    }
}

View File

@ -1,37 +1,32 @@
<?php
namespace SilverStripe\TextExtraction\Tests;
use SilverStripe\Assets\File;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Extractor\TikaTextExtractor;
/**
* Tests the {@see TikaTextExtractor} class
*
* @group tika-tests
*/
class TikaTextExtractorTest extends SapphireTest
{
protected $usesDatabase = true;
public function testExtraction()
{
$extractor = new TikaTextExtractor();
$extractor = TikaTextExtractor::create();
if (!$extractor->isAvailable()) {
$this->markTestSkipped('tika cli not available');
}
// Check file
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
$content = $extractor->getContent($file);
$this->assertContains('This is a test file with a link', $content);
$file = new File();
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.pdf');
$file->write();
// Check mime validation
$this->assertTrue($extractor->supportsMime('application/pdf'));
$this->assertTrue($extractor->supportsMime('text/html'));
$this->assertFalse($extractor->supportsMime('application/not-supported'));
}
public function testServerExtraction()
{
$extractor = new TikaServerTextExtractor();
if (!$extractor->isAvailable()) {
$this->markTestSkipped('tika server not available');
}
// Check file
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
$content = $extractor->getContent($file);
$this->assertContains('This is a test file with a link', $content);