diff --git a/.gitattributes b/.gitattributes index 475f5f2..89eb187 100644 --- a/.gitattributes +++ b/.gitattributes @@ -4,3 +4,4 @@ /.gitignore export-ignore /.travis.yml export-ignore /.scrutinizer.yml export-ignore +/codecov.yml export-ignore diff --git a/.scrutinizer.yml b/.scrutinizer.yml index 61b0c9f..a22afca 100644 --- a/.scrutinizer.yml +++ b/.scrutinizer.yml @@ -1,9 +1,13 @@ inherit: true checks: - php: - code_rating: true - duplication: true + php: true + +build: + nodes: + analysis: + tests: + override: [php-scrutinizer-run] filter: - paths: [code/*, tests/*] + paths: [src/*, tests/*] diff --git a/.travis.yml b/.travis.yml index 0b96f17..986569e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,39 +1,47 @@ -# See https://github.com/silverstripe/silverstripe-travis-support for setup details language: php -sudo: false - addons: apt: packages: - poppler-utils +env: + global: + - COMPOSER_ROOT_VERSION=3.x-dev + - SS_TIKA_ENDPOINT="http://localhost:9998/" + matrix: include: - - php: 5.4 - env: DB=PGSQL CORE_RELEASE=3.2 - - php: 5.5 - env: DB=PGSQL CORE_RELEASE=3.3 - php: 5.6 - env: DB=PGSQL CORE_RELEASE=3.4 - - php: 5.6 - env: DB=MYSQL CORE_RELEASE=3.5 + env: DB=MYSQL RECIPE_VERSION=1.0.x-dev PHPCS_TEST=1 PHPUNIT_TEST=1 - php: 7.0 - env: DB=MYSQL CORE_RELEASE=3.6 + env: DB=MYSQL RECIPE_VERSION=1.1.x-dev PHPUNIT_TEST=1 - php: 7.1 - env: DB=MYSQL CORE_RELEASE=3 + env: DB=PGSQL RECIPE_VERSION=4.2.x-dev PHPUNIT_COVERAGE_TEST=1 + - php: 7.2 + env: DB=MYSQL RECIPE_VERSION=4.x-dev PHPUNIT_TEST=1 before_script: - - composer self-update || true + # Init PHP + - phpenv rehash + - phpenv config-rm xdebug.ini + + # Configure Tika bin - mkdir -p $HOME/bin - export PATH=$PATH:$HOME/bin - - export SS_TIKA_ENDPOINT="http://localhost:9998/" - ./.travis/install_tika.sh - - git clone git://github.com/silverstripe/silverstripe-travis-support.git ~/travis-support - - php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss - - cd ~/builds/ss - - composer install 
+ - ($HOME/bin/tika-rest-server &) &> /dev/null + + # Install composer dependencies + - composer validate + - composer require --no-update silverstripe/recipe-core "$RECIPE_VERSION" + - if [[ $DB == PGSQL ]]; then composer require --no-update silverstripe/postgresql 2.1.x-dev; fi + - composer install --prefer-dist --no-interaction --no-progress --no-suggest --optimize-autoloader --verbose --profile script: - - ($HOME/bin/tika-rest-server &) &> /dev/null - - vendor/bin/phpunit --verbose textextraction/tests/ + - if [[ $PHPUNIT_TEST ]]; then vendor/bin/phpunit; fi + - if [[ $PHPUNIT_COVERAGE_TEST ]]; then phpdbg -qrr vendor/bin/phpunit --coverage-clover=coverage.xml; fi + - if [[ $PHPCS_TEST ]]; then vendor/bin/phpcs src/ tests/; fi + +after_success: + - if [[ $PHPUNIT_COVERAGE_TEST ]]; then bash <(curl -s https://codecov.io/bash) -f coverage.xml; fi diff --git a/.upgrade.yml b/.upgrade.yml new file mode 100644 index 0000000..5d5dd4f --- /dev/null +++ b/.upgrade.yml @@ -0,0 +1,14 @@ +mappings: + FileTextExtractable: SilverStripe\TextExtraction\Extension\FileTextExtractable + FileTextCache: SilverStripe\TextExtraction\Cache\FileTextCache + FileTextCache_Cache: SilverStripe\TextExtraction\Cache\FileTextCache\Cache + FileTextCache_Database: SilverStripe\TextExtraction\Cache\FileTextCache\Database + FileTextExtractor: SilverStripe\TextExtraction\Extractor\FileTextExtractor + FileTextExtractor_Exception: SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception + HTMLTextExtractor: SilverStripe\TextExtraction\Extractor\HTMLTextExtractor + PDFTextExtractor: SilverStripe\TextExtraction\Extractor\PDFTextExtractor + SolrCellTextExtractor: SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor + TikaServerTextExtractor: SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor + TikaTextExtractor: SilverStripe\TextExtraction\Extractor\TikaTextExtractor + TikaRestClient: SilverStripe\TextExtraction\Rest\TikaRestClient + diff --git a/CHANGELOG.md b/CHANGELOG.md 
deleted file mode 100644 index 5db972b..0000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,12 +0,0 @@ -# Changelog - -All notable changes to this project will be documented in this file. - -This project adheres to [Semantic Versioning](http://semver.org/). - - -## [2.0.1] -Using Symfony mime type detection - -## [2.0.0] -Clarified Tika docs diff --git a/README.md b/README.md index ad98b1f..dbf66a4 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,9 @@ # Text extraction module -[![Build Status](https://secure.travis-ci.org/silverstripe/silverstripe-textextraction.png)](http://travis-ci.org/silverstripe/silverstripe-textextraction) +[![Build Status](https://travis-ci.org/silverstripe/silverstripe-textextraction.svg?branch=master)](https://travis-ci.org/silverstripe/silverstripe-textextraction) +[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction/?branch=master) +[![codecov](https://codecov.io/gh/silverstripe/silverstripe-textextraction/branch/master/graph/badge.svg)](https://codecov.io/gh/silverstripe/silverstripe-textextraction) [![SilverStripe supported module](https://img.shields.io/badge/silverstripe-supported-0071C4.svg)](https://www.silverstripe.org/software/addons/silverstripe-commercially-supported-module-list/) -[![Code Quality](http://img.shields.io/scrutinizer/g/silverstripe/silverstripe-textextraction.svg?style=flat)](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction) -[![Version](http://img.shields.io/packagist/v/silverstripe/textextraction.svg?style=flat)](https://packagist.org/packages/silverstripe/silverstripe-textextraction) -[![License](http://img.shields.io/packagist/l/silverstripe/textextraction.svg?style=flat)](license.md) - Provides a text extraction API for file content, that can hook into different extractor engines based on availability and the parsed file format. 
The output returned is always a string of the file content. @@ -26,14 +24,14 @@ The module supports text extraction on the following file formats: ## Requirements - * SilverStripe ^3.1 + * SilverStripe ^4.0 * (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility) * (optional) [Apache Solr with ExtracingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) * (optional) [Apache Tika](http://tika.apache.org/) ## Installation -```js +``` composer require silverstripe/textextraction ``` diff --git a/_config.php b/_config.php deleted file mode 100644 index e69de29..0000000 diff --git a/_config/cache.yml b/_config/cache.yml index ff793b2..2f82c29 100644 --- a/_config/cache.yml +++ b/_config/cache.yml @@ -3,9 +3,8 @@ Name: textextractioncache After: - '#corecache' --- - SilverStripe\Core\Injector\Injector: Psr\SimpleCache\CacheInterface.FileTextCache_Cache: factory: SilverStripe\Core\Cache\CacheFactory constructor: - namespace: 'FileTextCache_Cache' \ No newline at end of file + namespace: 'FileTextCache_Cache' diff --git a/_config/config.yml b/_config/config.yml new file mode 100644 index 0000000..0a0982d --- /dev/null +++ b/_config/config.yml @@ -0,0 +1,10 @@ +--- +Name: textextractionconfig +--- +SilverStripe\Core\Injector\Injector: + # Define default FileTextCache implementation + SilverStripe\TextExtraction\Cache\FileTextCache: + class: SilverStripe\TextExtraction\Cache\FileTextCache\Database + +SilverStripe\TextExtraction\Cache\FileTextCache\Database: + max_content_length: 500000 diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000..9e7c838 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,3 @@ +comment: false +codecov: + branch: master diff --git a/composer.json b/composer.json index f901f30..fa6c5d8 100644 --- a/composer.json +++ b/composer.json @@ -1,37 +1,44 @@ { - "name": "silverstripe/textextraction", - "type": "silverstripe-vendormodule", - "description": "Text Extraction API for SilverStripe CMS (mostly used with 
'fulltextsearch' module)", - "homepage": "http://silverstripe.org", - "license": "BSD-3-Clause", - "keywords": ["silverstripe", "fulltext", "pdf"], - "authors": [ - { - "name": "SilverStripe", - "homepage": "http://silverstripe.com" - }, - { - "name": "The SilverStripe Community", - "homepage": "http://silverstripe.org" - } - ], - "require": { - "php": ">=5.6", - "silverstripe/framework": "^4", - "guzzlehttp/guzzle": "~6.3.0", - "symfony/event-dispatcher": "^2.6.0@stable", - "symfony/http-foundation": "^2.6.0", - "silverstripe/assets": "^1" - }, - "require-dev": { - "phpunit/phpunit": "^5.7" - }, - "suggest": { - "ext-fileinfo": "Improved support for file mime detection" - }, - "extra": { - "branch-alias": { - "dev-master": "3.x-dev" - } - } + "name": "silverstripe/textextraction", + "type": "silverstripe-vendormodule", + "description": "Text Extraction API for SilverStripe CMS (mostly used with 'fulltextsearch' module)", + "homepage": "http://silverstripe.org", + "license": "BSD-3-Clause", + "keywords": [ + "silverstripe", + "fulltext", + "pdf" + ], + "authors": [ + { + "name": "SilverStripe", + "homepage": "http://silverstripe.com" + }, + { + "name": "The SilverStripe Community", + "homepage": "http://silverstripe.org" + } + ], + "require": { + "silverstripe/framework": "^4", + "silverstripe/assets": "^1", + "silverstripe/versioned": "^1", + "guzzlehttp/guzzle": "~6.3.0", + "symfony/event-dispatcher": "^2.6.0@stable", + "symfony/http-foundation": "^2.6.0" + }, + "require-dev": { + "squizlabs/php_codesniffer": "^3", + "phpunit/phpunit": "^5.7" + }, + "suggest": { + "ext-fileinfo": "Improved support for file mime detection" + }, + "extra": { + "branch-alias": { + "dev-master": "3.x-dev" + } + }, + "minimum-stability": "dev", + "prefer-stable": true } diff --git a/docs/en/configuration.md b/docs/en/configuration.md index 052448a..97d9d04 100644 --- a/docs/en/configuration.md +++ b/docs/en/configuration.md @@ -8,31 +8,30 @@ the content available through your 
`DataObject` subclass. In this case, add the following to `mysite/_config/config.yml`: ```yaml -File: +SilverStripe\Assets\File: extensions: - - FileTextExtractable + - SilverStripe\TextExtraction\Extension\FileTextExtractable ``` -By default any extracted content will be cached against the database row. -In order to stay within common size constraints for SQL queries required in this operation, -the cache sets a maximum character length after which content gets truncated (default: 500000). -You can configure this value through `FileTextCache_Database.max_content_length` in your yaml configuration. - +By default any extracted content will be cached against the database row. In order to stay within common size +constraints for SQL queries required in this operation, the cache sets a maximum character length after which +content gets truncated (default: 500000). You can configure this value through +`SilverStripe\TextExtraction\Cache\FileTextCache\Database.max_content_length` in your YAML configuration. Alternatively, extracted content can be cached using SS_Cache to prevent excessive database growth. In order to swap out the cache backend you can use the following yaml configuration. - ```yaml --- Name: mytextextraction After: '#textextraction' --- -Injector: - FileTextCache: FileTextCache_SSCache -FileTextCache_SSCache: - lifetime: 3600 # Number of seconds to cache content for +SilverStripe\Core\Injector\Injector: + SilverStripe\TextExtraction\Cache\FileTextCache: + class: SilverStripe\TextExtraction\Cache\FileTextCache\Cache +SilverStripe\TextExtraction\Cache\FileTextCache\Cache: + lifetime: 3600 # Number of seconds to cache content for ``` ## XPDF @@ -42,7 +41,7 @@ commandline utility. Follow their installation instructions, its presence will b detected for \*nix operating systems. 
You can optionally set the binary path (required for Windows) in `mysite/_config/config.yml`: ```yml -PDFTextExtractor: +SilverStripe\TextExtraction\Extractor\PDFTextExtractor: binary_location: /my/path/pdftotext ``` @@ -59,7 +58,7 @@ in your database driver, or even pass it back to Solr as part of a full index up In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`): ```yml -SolrCellTextExtractor: +SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor: base_url: 'http://localhost:8983/solr/update/extract' ``` @@ -76,16 +75,27 @@ or by writing your own method around `FileTextExtractor->getContent()` (see "Usa The property should be listed in your `SolrIndex` subclass, e.g. as follows: ```php -class MyDocument extends DataObject { - static $db = array('Path' => 'Text'); - function getContent() { +use SilverStripe\ORM\DataObject; +use SilverStripe\TextExtraction\Extractor\FileTextExtractor; + +class MyDocument extends DataObject +{ + private static $db = ['Path' => 'Text']; + + public function getContent() + { $extractor = FileTextExtractor::for_file($this->Path); return $extractor ? $extractor->getContent($this->Path) : null; } } -class MySolrIndex extends SolrIndex { - function init() { - $this->addClass('MyDocument'); + +use SilverStripe\FullTextSearch\Solr\SolrIndex; + +class MySolrIndex extends SolrIndex +{ + public function init() + { + $this->addClass(MyDocument::class); $this->addStoredField('Content', 'HTMLText'); } } @@ -120,14 +130,15 @@ exec java -jar tika-app-1.8.jar "$@" Tika can also be run as a server. You can configure your server endpoint by setting the url via config. ```yaml -TikaServerTextExtractor: +SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor: server_endpoint: 'http://localhost:9998' ``` -Alternatively this may be specified via the `SS_TIKA_ENDPOINT` directive in your `_ss_environment.php` file, or an environment variable of the same name. 
+Alternatively this may be specified via the `SS_TIKA_ENDPOINT` environment variable in your `.env` file, or an +environment variable of the same name. -Then startup your server as below +Then startup your server as below: ```bash java -jar tika-server-1.8.jar --host=localhost --port=9998 @@ -136,7 +147,7 @@ java -jar tika-server-1.8.jar --host=localhost --port=9998 While you can run `tika-app-1.8.jar` in server mode as well (with the `--server` flag), it behaves differently and is not recommended. -The module will log extraction errors with `SS_Log::NOTICE` priority by default, +The module will log extraction errors with PSR-3 "notice" priority by default, for example a "422 Unprocessable Entity" HTTP response for an encrypted PDF. In case you want more information on why processing failed, you can increase the logging verbosity in the tika server instance by passing through diff --git a/docs/en/developer-docs.md b/docs/en/developer-docs.md index a009dde..60565cf 100644 --- a/docs/en/developer-docs.md +++ b/docs/en/developer-docs.md @@ -1,7 +1,8 @@ # Developer documentation + ## Usage -Manual extraction: +Manual extraction via string file path: ```php $myFile = '/my/path/myfile.pdf'; @@ -9,6 +10,14 @@ $extractor = FileTextExtractor::for_file($myFile); $content = $extractor->getContent($myFile); ``` +Manual extraction via File object: + +```php +$myFile = File::get()->filter(['Name' => 'My file'])->first(); +$extractor = FileTextExtractor::for_file($myFile); +$content = $extractor->getContent($myFile); +``` + Extraction with `FileTextExtractable` extension applied: ```php diff --git a/license.md b/license.md index 8794670..30758eb 100644 --- a/license.md +++ b/license.md @@ -1,4 +1,4 @@ -Copyright (c) 2017, SilverStripe Limited +Copyright (c) 2018, SilverStripe Limited All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/phpcs.xml.dist b/phpcs.xml.dist new file mode 100644 index 0000000..a504558 --- /dev/null +++ b/phpcs.xml.dist @@ -0,0 +1,10 @@ + + + CodeSniffer ruleset for SilverStripe coding conventions. + + + + + + + diff --git a/phpunit.xml.dist b/phpunit.xml.dist new file mode 100644 index 0000000..8700f65 --- /dev/null +++ b/phpunit.xml.dist @@ -0,0 +1,14 @@ + + + tests/ + + + + + src/ + + tests/ + + + + diff --git a/src/Extension/FileTextCache.php b/src/Cache/FileTextCache.php similarity index 92% rename from src/Extension/FileTextCache.php rename to src/Cache/FileTextCache.php index d0ccd70..3586d78 100644 --- a/src/Extension/FileTextCache.php +++ b/src/Cache/FileTextCache.php @@ -1,6 +1,6 @@ get(__CLASS__, 'lifetime'); - $lifetime = $lifetime ?: 3600; + $lifetime = $this->config()->get('lifetime') ?: 3600; $key = $this->getKey($file); $cache = self::get_cache(); @@ -94,7 +95,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable /** * * @param File $file - * @return type + * @return bool */ public function invalidate(File $file) { diff --git a/src/Extension/FieldTextCache_Database.php b/src/Cache/FileTextCache/Database.php similarity index 67% rename from src/Extension/FieldTextCache_Database.php rename to src/Cache/FileTextCache/Database.php index a96ff60..1379ee0 100644 --- a/src/Extension/FieldTextCache_Database.php +++ b/src/Cache/FileTextCache/Database.php @@ -1,17 +1,25 @@ get('FileTextCache_Database', 'max_content_length'); + $maxLength = $this->config()->get('max_content_length'); $file->FileContentCache = ($maxLength) ? 
substr($content, 0, $maxLength) : $content; $file->write(); } diff --git a/src/Exception/FileTextExtractor_Exception.php b/src/Exception/FileTextExtractor_Exception.php deleted file mode 100644 index 4fa1038..0000000 --- a/src/Exception/FileTextExtractor_Exception.php +++ /dev/null @@ -1,9 +0,0 @@ - 'Text' - ); + ]; /** - * * @var array * @config */ - private static $casting = array( + private static $casting = [ 'FileContent' => 'Text' - ); + ]; /** - * * @var array * @config */ - private static $dependencies = array( - 'TextCache' => '%$SilverStripe\TextExtraction\Extension\FileTextCache_Cache' - ); + private static $dependencies = [ + 'TextCache' => '%$' . FileTextCache::class, + ]; /** * @var FileTextCache @@ -50,13 +47,13 @@ class FileTextExtractable extends DataExtension protected $fileTextCache = null; /** - * * @param FileTextCache $cache - * @return void + * @return $this */ public function setTextCache(FileTextCache $cache) { $this->fileTextCache = $cache; + return $this; } /** @@ -78,37 +75,38 @@ class FileTextExtractable extends DataExtension } /** - * Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text. - * The value is also cached into the File record itself. + * Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and + * returns the text. The value is also cached into the File record itself. * * @param boolean $disableCache If false, the file content is only parsed on demand. * If true, the content parsing is forced, bypassing * the cached version - * @return mixed string | null + * @return string|null */ public function extractFileAsText($disableCache = false) { + /** @var File $file */ + $file = $this->owner; if (!$disableCache) { - $text = $this->getTextCache()->load($this->owner); + $text = $this->getTextCache()->load($file); if ($text) { return $text; } } // Determine which extractor can process this file. - $path = Director::baseFolder() . 
'/' . $this->owner->getFilename(); - $extractor = FileTextExtractor::for_file($path); + $extractor = FileTextExtractor::for_file($file); if (!$extractor) { return null; } - $text = $extractor->getContent($path); + $text = $extractor->getContent($file); if (!$text) { return null; } if (!$disableCache) { - $this->getTextCache()->save($this->owner, $text); + $this->getTextCache()->save($file, $text); } return $text; diff --git a/src/Extractor/FileTextExtractor.php b/src/Extractor/FileTextExtractor.php index 115d679..d098aad 100644 --- a/src/Extractor/FileTextExtractor.php +++ b/src/Extractor/FileTextExtractor.php @@ -2,17 +2,22 @@ namespace SilverStripe\TextExtraction\Extractor; -use SilverStripe\Core\Config\Config, - SilverStripe\Core\Injector\Injector, - SilverStripe\Core\ClassInfo; +use SilverStripe\Assets\File; +use SilverStripe\Core\ClassInfo; +use SilverStripe\Core\Config\Config; +use SilverStripe\Core\Config\Configurable; +use SilverStripe\Core\Injector\Injectable; +use SilverStripe\Core\Injector\Injector; +use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception; /** * A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents. * @author mstephens - * */ abstract class FileTextExtractor { + use Configurable; + use Injectable; /** * Set priority from 0-100. @@ -45,7 +50,7 @@ abstract class FileTextExtractor // Generate the sorted list of extractors on demand. 
$classes = ClassInfo::subclassesFor(__CLASS__); array_shift($classes); - $classPriorities = array(); + $classPriorities = []; foreach ($classes as $class) { $classPriorities[$class] = Config::inst()->get($class, 'priority'); @@ -76,23 +81,25 @@ abstract class FileTextExtractor */ protected static function get_mime($path) { - $file = new Symfony\Component\HttpFoundation\File\File($path); + $file = new \Symfony\Component\HttpFoundation\File\File($path); return $file->getMimeType(); } /** - * @param string $path - * @return mixed FileTextExtractor | null + * Given a File object, decide which extractor instance to use to handle it + * + * @param File $file + * @return FileTextExtractor|null */ - public static function for_file($path) + public static function for_file(File $file) { - if (!file_exists($path) || is_dir($path)) { - return; + if (!$file) { + return null; } - $extension = pathinfo($path, PATHINFO_EXTENSION); - $mime = self::get_mime($path); + $extension = $file->getExtension(); + $mime = $file->getMimeType(); foreach (self::get_extractor_classes() as $className) { $extractor = self::get_extractor($className); @@ -114,6 +121,39 @@ abstract class FileTextExtractor } } + /** + * Some text extractors (like pdftotext) may require a physical file to read from, so write the current + * file contents to a temp file and return its path + * + * @param File $file + * @return string + * @throws Exception + */ + protected function getPathFromFile(File $file) + { + $path = tempnam(TEMP_PATH, 'pdftextextractor_'); + if (false === $path) { + throw new Exception(static::class . '->getPathFromFile() could not allocate temporary file name'); + } + + // Append extension to temp file if one is set + if ($file->getExtension()) { + $path .= '.' . 
$file->getExtension(); + } + + // Remove any existing temp files with this name + if (file_exists($path)) { + unlink($path); + } + + $bytesWritten = file_put_contents($path, $file->getStream()); + if (false === $bytesWritten) { + throw new Exception(static::class . '->getPathFromFile() failed to write temporary file'); + } + + return $path; + } + /** * Checks if the extractor is supported on the current environment, * for example if the correct binaries or libraries are available. @@ -132,7 +172,7 @@ abstract class FileTextExtractor abstract public function supportsExtension($extension); /** - * Determine if this extractor suports the given mime type. + * Determine if this extractor supports the given mime type. * Will only be called if supportsExtension returns false. * * @param string $mime @@ -141,10 +181,10 @@ abstract class FileTextExtractor abstract public function supportsMime($mime); /** - * Given a file path, extract the contents as text. + * Given a File instance, extract the contents as text. 
* - * @param string $path + * @param File|string $file Either the File instance, or a file path for a file to load * @return string */ - abstract public function getContent($path); + abstract public function getContent($file); } diff --git a/src/Extractor/FileTextExtractor/Exception.php b/src/Extractor/FileTextExtractor/Exception.php new file mode 100644 index 0000000..1a54a80 --- /dev/null +++ b/src/Extractor/FileTextExtractor/Exception.php @@ -0,0 +1,7 @@ + or @siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - '@]*?.*?@siu', - // Add line breaks before and after blocks - '@]*?>.*?@siu', + '@]*?>.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + // Add line breaks before and after blocks + '@config()->binary_location) { - $locations = array($location); + if ($location = $this->config()->get('binary_location')) { + $locations = [$location]; } else { - $locations = $this->config()->search_binary_locations; + $locations = $this->config()->get('search_binary_locations'); } // Find program in each path @@ -85,35 +84,41 @@ class PDFTextExtractor extends FileTextExtractor return null; } - public function getContent($path) + public function getContent($file) { - if (!$path) { - return ""; - } // no file - $content = $this->getRawOutput($path); + if (!$file || (is_string($file) && !file_exists($file))) { + // no file + return ''; + } + $content = $this->getRawOutput($file); return $this->cleanupLigatures($content); } /** - * Invoke pdftotext with the given path + * Invoke pdftotext with the given File object * - * @param string $path + * @param File|string $file * @return string Output - * @throws FileTextExtractor_Exception + * @throws Exception */ - protected function getRawOutput($path) + protected function getRawOutput($file) { if (!$this->isAvailable()) { - throw new FileTextExtractor_Exception("getRawOutput called on unavailable extractor"); + 
throw new Exception("getRawOutput called on unavailable extractor"); } + + $path = $file instanceof File ? $this->getPathFromFile($file) : $file; exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err); if ($err) { if (!is_array($err) && $err == 1) { // For Windows compatibility $err = $content; } - throw new FileTextExtractor_Exception(sprintf( - 'PDFTextExtractor->getContent() failed for %s: %s', $path, implode(PHP_EOL, $err) + + throw new Exception(sprintf( + 'PDFTextExtractor->getContent() failed for %s: %s', + $path, + implode(PHP_EOL, $err) )); } @@ -130,7 +135,7 @@ class PDFTextExtractor extends FileTextExtractor */ protected function cleanupLigatures($input) { - $mapping = array( + $mapping = [ 'ff' => 'ff', 'fi' => 'fi', 'fl' => 'fl', @@ -138,9 +143,8 @@ class PDFTextExtractor extends FileTextExtractor 'ffl' => 'ffl', 'ſt' => 'ft', 'st' => 'st' - ); + ]; return str_replace(array_keys($mapping), array_values($mapping), $input); } - } diff --git a/src/Extractor/SolrCellTextExtractor.php b/src/Extractor/SolrCellTextExtractor.php index b2f149e..130966e 100644 --- a/src/Extractor/SolrCellTextExtractor.php +++ b/src/Extractor/SolrCellTextExtractor.php @@ -2,9 +2,12 @@ namespace SilverStripe\TextExtraction\Extractor; -use SilverStripe\TextExtraction\Extractor\FileTextExtractor, - GuzzleHttp\Client, - Psr\Log\LoggerInterface; +use Exception; +use GuzzleHttp\Client; +use InvalidArgumentException; +use Psr\Log\LoggerInterface; +use SilverStripe\Assets\File; +use SilverStripe\Core\Injector\Injector; /** * Text extractor that calls an Apache Solr instance @@ -18,7 +21,7 @@ use SilverStripe\TextExtraction\Extractor\FileTextExtractor, class SolrCellTextExtractor extends FileTextExtractor { /** - * Base URL to use for solr text extraction. + * Base URL to use for Solr text extraction. * E.g. 
http://localhost:8983/solr/update/extract * * @config @@ -27,43 +30,36 @@ class SolrCellTextExtractor extends FileTextExtractor private static $base_url; /** - * * @var int * @config */ private static $priority = 75; /** - * - * @var GuzzleHttp\Client + * @var Client */ protected $httpClient; /** - * - * @return GuzzleHttp\Client - * @throws InvalidArgumentException + * @return Client */ public function getHttpClient() { - if (!$this->config()->get('base_url')) { - throw new \InvalidArgumentException('SolrCellTextExtractor.base_url not specified'); - } if (!$this->httpClient) { - $this->httpClient = new Client($this->config()->get('base_url')); + $this->httpClient = new Client(); } return $this->httpClient; } /** - * - * @param GuzzleHttp\Client $client - * @return void + * @param Client $client + * @return $this */ - public function setHttpClient($client) + public function setHttpClient(Client $client) { $this->httpClient = $client; + return $this; } /** @@ -73,30 +69,28 @@ class SolrCellTextExtractor extends FileTextExtractor { $url = $this->config()->get('base_url'); - return (boolean) $url; + return (bool) $url; } /** - * * @param string $extension - * @return boolean + * @return bool */ public function supportsExtension($extension) { return in_array( strtolower($extension), - array( + [ 'pdf', 'doc', 'docx', 'xls', 'xlsx', 'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods', 'ppt', 'pptx', 'odp', 'fodp', 'csv' - ) + ] ); } /** - * * @param string $mime - * @return boolean + * @return bool */ public function supportsMime($mime) { @@ -105,48 +99,57 @@ class SolrCellTextExtractor extends FileTextExtractor } /** - * - * @param string $path + * @param File|string $file * @return string + * @throws InvalidArgumentException */ - public function getContent($path) + public function getContent($file) { - if (!$path) { - return ""; - } // no file + if (!$file || (is_string($file) && !file_exists($file))) { + // no file + return ''; + } - $fileName = basename($path); + $fileName 
= $file instanceof File ? $file->getFilename() : basename($file); $client = $this->getHttpClient(); + // Get and validate base URL + $baseUrl = $this->config()->get('base_url'); + if (!$this->config()->get('base_url')) { + throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified'); + } + try { + $path = $this->getPathFromFile($file); $request = $client - ->post() - ->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text')) - ->addPostFiles(array('myfile' => $path)); + ->post($baseUrl) + ->addPostFields(['extractOnly' => 'true', 'extractFormat' => 'text']) + ->addPostFiles(['myfile' => $path]); $response = $request->send(); - } catch (\InvalidArgumentException $e) { + } catch (InvalidArgumentException $e) { $msg = sprintf( - 'Error extracting text from "%s" (message: %s)', - $path, - $e->getMessage() - ); + 'Error extracting text from "%s" (message: %s)', + $fileName, + $e->getMessage() + ); Injector::inst()->get(LoggerInterface::class)->notice($msg); return null; - } catch (\Exception $e) { - // Catch other errors that Tika can throw vai Guzzle but are not caught and break Solr search query in some cases. + } catch (Exception $e) { + // Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search + // query in some cases. $msg = sprintf( - 'Tika server error attempting to extract from "%s" (message: %s)', - $path, - $e->getMessage() - ); + 'Tika server error attempting to extract from "%s" (message: %s)', + $path, + $e->getMessage() + ); Injector::inst()->get(LoggerInterface::class)->notice($msg); return null; } - // Just initialise it, it doesn't take miuch. + // Just initialise it, it doesn't take much. 
$matches = []; // Use preg match to avoid SimpleXML running out of memory on large text nodes diff --git a/src/Extractor/TikaServerTextExtractor.php b/src/Extractor/TikaServerTextExtractor.php index 2ae38e8..e94eeb9 100644 --- a/src/Extractor/TikaServerTextExtractor.php +++ b/src/Extractor/TikaServerTextExtractor.php @@ -2,10 +2,10 @@ namespace SilverStripe\TextExtraction\Extractor; -use SilverStripe\TextExtraction\Extractor\FileTextExtractor, - SilverStripe\Core\Injector\Injector, - SilverStripe\Core\Environment, - SilverStripe\TextExtraction\Rest\TikaRestClient; +use SilverStripe\Assets\File; +use SilverStripe\Core\Environment; +use SilverStripe\Core\Injector\Injector; +use SilverStripe\TextExtraction\Rest\TikaRestClient; /** * Enables text extraction of file content via the Tika Rest Server @@ -35,18 +35,25 @@ class TikaServerTextExtractor extends FileTextExtractor */ protected $client = null; + /** + * Cache of supported mime types + * + * @var array + */ + protected $supportedMimes = []; + /** * @return TikaRestClient */ public function getClient() { - return $this->client ?: - ($this->client = - Injector::inst()->createWithArgs( - TikaRestClient::class, - array($this->getServerEndpoint()) - ) + if (!$this->client) { + $this->client = Injector::inst()->createWithArgs( + TikaRestClient::class, + [$this->getServerEndpoint()] ); + } + return $this->client; } /** @@ -59,19 +66,17 @@ class TikaServerTextExtractor extends FileTextExtractor } // Default to configured endpoint - return $this->config()->server_endpoint; + return $this->config()->get('server_endpoint'); } /** - * Get the version of tika installed, or 0 if not installed + * Get the version of Tika installed, or 0 if not installed * - * @return float version of tika + * @return float version of Tika */ public function getVersion() { - return $this - ->getClient() - ->getVersion(); + return $this->getClient()->getVersion(); } /** @@ -79,13 +84,12 @@ class TikaServerTextExtractor extends FileTextExtractor 
*/ public function isAvailable() { - return $this->getServerEndpoint() && - $this->getClient()->isAvailable() && - version_compare($this->getVersion(), '1.7.0') >= 0; + return $this->getServerEndpoint() + && $this->getClient()->isAvailable() + && version_compare($this->getVersion(), '1.7.0') >= 0; } /** - * * @param string $extension * @return boolean */ @@ -95,31 +99,23 @@ class TikaServerTextExtractor extends FileTextExtractor return false; } - /** - * Cache of supported mime types - * - * @var array - */ - protected $supportedMimes = array(); - - /** - * * @param string $mime * @return boolean */ public function supportsMime($mime) { - $supported = $this->supportedMimes ?: - ($this->supportedMimes = $this->getClient()->getSupportedMimes()); + if (!$this->supportedMimes) { + $this->supportedMimes = $this->getClient()->getSupportedMimes(); + } // Check if supported (most common / quickest lookup) - if (isset($supported[$mime])) { + if (isset($this->supportedMimes[$mime])) { return true; } // Check aliases - foreach ($supported as $info) { + foreach ($this->supportedMimes as $info) { if (isset($info['alias']) && in_array($mime, $info['alias'])) { return true; } @@ -128,8 +124,9 @@ class TikaServerTextExtractor extends FileTextExtractor return false; } - public function getContent($path) + public function getContent($file) { - return $this->getClient()->tika($path); + $tempFile = $file instanceof File ? 
$this->getPathFromFile($file) : $file; + return $this->getClient()->tika($tempFile); } } diff --git a/src/Extractor/TikaTextExtractor.php b/src/Extractor/TikaTextExtractor.php index 0d4b18f..bda599b 100644 --- a/src/Extractor/TikaTextExtractor.php +++ b/src/Extractor/TikaTextExtractor.php @@ -2,7 +2,7 @@ namespace SilverStripe\TextExtraction\Extractor; -use SilverStripe\TextExtraction\Extractor\FileTextExtractor; +use SilverStripe\Assets\File; /** * Enables text extraction of file content via the Tika CLI @@ -47,13 +47,13 @@ class TikaTextExtractor extends FileTextExtractor */ protected function runShell($command, &$stdout = '', &$stderr = '', $input = '') { - $descriptorSpecs = array( - 0 => array("pipe", "r"), - 1 => array("pipe", "w"), - 2 => array("pipe", "w") - ); + $descriptorSpecs = [ + 0 => ["pipe", "r"], + 1 => ["pipe", "w"], + 2 => ["pipe", "w"] + ]; // Invoke command - $pipes = array(); + $pipes = []; $proc = proc_open($command, $descriptorSpecs, $pipes); if (!is_resource($proc)) { @@ -74,14 +74,10 @@ class TikaTextExtractor extends FileTextExtractor return proc_close($proc); } - /** - * - * @param string $path - * @return string - */ - public function getContent($path) + public function getContent($file) { - $mode = $this->config()->output_mode; + $mode = $this->config()->get('output_mode'); + $path = $file instanceof File ? 
$this->getPathFromFile($file) : $file; $command = sprintf('tika %s %s', $mode, escapeshellarg($path)); $code = $this->runShell($command, $output); @@ -91,8 +87,7 @@ class TikaTextExtractor extends FileTextExtractor } /** - * - * @return boolean + * @return bool */ public function isAvailable() { @@ -100,8 +95,7 @@ class TikaTextExtractor extends FileTextExtractor } /** - * - * @return boolean + * @return bool */ public function supportsExtension($extension) { @@ -111,9 +105,8 @@ class TikaTextExtractor extends FileTextExtractor /** - * * @param string $mime - * @return boolean + * @return bool */ public function supportsMime($mime) { @@ -121,8 +114,9 @@ class TikaTextExtractor extends FileTextExtractor $code = $this->runShell('tika --list-supported-types', $supportedTypes, $error); if ($code) { + // Error case return false; - } // Error case + } // Check if the mime type is inside the result $pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/')); diff --git a/src/Rest/TikaRestClient.php b/src/Rest/TikaRestClient.php index 8473f67..34ffbde 100644 --- a/src/Rest/TikaRestClient.php +++ b/src/Rest/TikaRestClient.php @@ -2,11 +2,11 @@ namespace SilverStripe\TextExtraction\Rest; -use GuzzleHttp\Client, - GuzzleHttp\Exception\RequestException, - SilverStripe\Core\Environment, - Psr\Log\LoggerInterface, - SilverStripe\Core\Injector\Injector; +use GuzzleHttp\Client; +use GuzzleHttp\Exception\RequestException; +use Psr\Log\LoggerInterface; +use SilverStripe\Core\Environment; +use SilverStripe\Core\Injector\Injector; class TikaRestClient extends Client { @@ -15,30 +15,30 @@ class TikaRestClient extends Client * * @var array */ - protected $options = array('username' => null, 'password' => null); + protected $options = ['username' => null, 'password' => null]; /** * @var array */ - protected $mimes = array(); + protected $mimes = []; /** * * @param string $baseUrl - * @param array $config + * @param array $config */ - public function __construct($baseUrl = '', $config = 
null) + public function __construct($baseUrl = '', $config = []) { - $psswd = Environment::getEnv('SS_TIKA_PASSWORD'); + $password = Environment::getEnv('SS_TIKA_PASSWORD'); - if (!empty($psswd)) { - $this->options = array( + if (!empty($password)) { + $this->options = [ 'username' => Environment::getEnv('SS_TIKA_USERNAME'), - 'password' => $psswd, - ); + 'password' => $password, + ]; } - parent::__construct($baseUrl, $config); + parent::__construct($config); } /** @@ -58,7 +58,7 @@ class TikaRestClient extends Client } } catch (RequestException $ex) { $msg = sprintf("Tika unavailable - %s", $ex->getMessage()); - Injector::inst()->get(LoggerInterface::class)->error($msg); + Injector::inst()->get(LoggerInterface::class)->info($msg); return false; } @@ -120,7 +120,7 @@ class TikaRestClient extends Client try { $response = $this->put( 'tika', - array('Accept' => 'text/plain'), + ['Accept' => 'text/plain'], file_get_contents($file) ); $response->setAuth($this->options['username'], $this->options['password']); @@ -139,7 +139,7 @@ class TikaRestClient extends Client $msg .= ' Body: ' . 
$body; } - Injector::inst()->get(LoggerInterface::class)->notice($msg); + Injector::inst()->get(LoggerInterface::class)->info($msg); } return $text; diff --git a/tests/FileTextCacheDatabaseTest.php b/tests/FileTextCacheDatabaseTest.php index e300c19..e7fb242 100644 --- a/tests/FileTextCacheDatabaseTest.php +++ b/tests/FileTextCacheDatabaseTest.php @@ -1,23 +1,23 @@ set(Database::class, 'max_content_length', 5); - Config::inst()->update('FileTextCache_Database', 'max_content_length', 5); - $cache = new FileTextCache_Database(); - $file = $this->getMock('File', array('write')); + $cache = new Database(); + $file = $this->getMockBuilder(File::class)->setMethods(['write'])->getMock(); $content = '0123456789'; $cache->save($file, $content); - $this->assertEquals($cache->load($file), '01234'); - Config::unnest(); + $this->assertEquals($cache->load($file), '01234'); } } diff --git a/tests/FileTextExtractableTest.php b/tests/FileTextExtractableTest.php index 166b1ee..b1c3c89 100644 --- a/tests/FileTextExtractableTest.php +++ b/tests/FileTextExtractableTest.php @@ -1,46 +1,58 @@ array('FileTextExtractable') - ); + protected $usesDatabase = true; - public function setUp() + protected static $required_extensions = [ + File::class => [ + FileTextExtractable::class, + ], + ]; + + protected function setUp() { parent::setUp(); // Ensure that html is a valid extension - Config::inst() - ->nest() - ->update('File', 'allowed_extensions', array('html')); + Config::modify()->merge(File::class, 'allowed_extensions', ['html']); + + // Create a copy of the file, as it may be clobbered by the test + // ($file->extractFileAsText() calls $file->write) + copy( + dirname(__FILE__) . '/fixtures/test1.html', + dirname(__FILE__) . '/fixtures/test1-copy.html' + ); } - public function tearDown() + protected function tearDown() { - Config::unnest(); + if (file_exists(dirname(__FILE__) . '/fixtures/test1-copy.html')) { + unlink(dirname(__FILE__) . 
'/fixtures/test1-copy.html'); + } + parent::tearDown(); } public function testExtractFileAsText() { - // Create a copy of the file, as it may be clobbered by the test - // ($file->extractFileAsText() calls $file->write) - copy(BASE_PATH.'/textextraction/tests/fixtures/test1.html', BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html'); - // Use HTML, since the extractor is always available - $file = new File(array( - 'Name' => 'test1-copy.html', - 'Filename' => 'textextraction/tests/fixtures/test1-copy.html' - )); + /** @var File|FileTextExtractable $file */ + $file = new File(['Name' => 'test1-copy.html']); + $file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1-copy.html'); $file->write(); - + $content = $file->extractFileAsText(); + $this->assertNotNull($content); $this->assertContains('Test Headline', $content); $this->assertContains('Test Text', $content); $this->assertEquals($content, $file->FileContentCache); - - if (file_exists(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html')) { - unlink(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html'); - } } } diff --git a/tests/HTMLTextExtractorTest.php b/tests/HTMLTextExtractorTest.php index 8ff8b0b..98c37bf 100644 --- a/tests/HTMLTextExtractorTest.php +++ b/tests/HTMLTextExtractorTest.php @@ -1,11 +1,33 @@ merge(File::class, 'allowed_extensions', ['html']); + } + public function testExtraction() { $extractor = new HTMLTextExtractor(); - $content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html'); + $file = new File(); + $file->setFromLocalFile(dirname(__FILE__) . 
'/fixtures/test1.html'); + $file->write(); + + $content = $extractor->getContent($file); + $this->assertContains('Test Headline', $content); $this->assertNotContains('Test Comment', $content, 'Strips HTML comments'); $this->assertNotContains('Test Style', $content, 'Strips non-content style tags'); diff --git a/tests/PDFTextExtractorTest.php b/tests/PDFTextExtractorTest.php index 96ad1ec..9e3fc02 100644 --- a/tests/PDFTextExtractorTest.php +++ b/tests/PDFTextExtractorTest.php @@ -1,17 +1,29 @@ isAvailable()) { - $this->setExpectedException( - 'FileTextExtractor_Exception', - 'getRawOutput called on unavailable extractor' - ); + $this->expectException(Exception::class); + $this->expectExceptionMessage('getRawOutput called on unavailable extractor'); } - $content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'); + $file = new File(); + $file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.pdf'); + $file->write(); + + $content = $extractor->getContent($file); $this->assertContains('This is a test file with a link', $content); } } diff --git a/tests/TikaServerTextExtractor.php b/tests/TikaServerTextExtractor.php new file mode 100644 index 0000000..0d9a7ed --- /dev/null +++ b/tests/TikaServerTextExtractor.php @@ -0,0 +1,36 @@ +isAvailable()) { + $this->markTestSkipped('tika server not available'); + } + + // Check file + $file = new File(); + $file->setFromLocalFile(dirname(__FILE__) . 
'/fixtures/test1.pdf'); + $file->write(); + + $content = $extractor->getContent($file); + $this->assertContains('This is a test file with a link', $content); + + // Check mime validation + $this->assertTrue($extractor->supportsMime('application/pdf')); + $this->assertTrue($extractor->supportsMime('text/html')); + $this->assertFalse($extractor->supportsMime('application/not-supported')); + } +} diff --git a/tests/TikaTextExtractorTest.php b/tests/TikaTextExtractorTest.php index 0342dcf..e2674d2 100644 --- a/tests/TikaTextExtractorTest.php +++ b/tests/TikaTextExtractorTest.php @@ -1,37 +1,32 @@ isAvailable()) { $this->markTestSkipped('tika cli not available'); } // Check file - $file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'; - $content = $extractor->getContent($file); - $this->assertContains('This is a test file with a link', $content); + $file = new File(); + $file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.pdf'); + $file->write(); - // Check mime validation - $this->assertTrue($extractor->supportsMime('application/pdf')); - $this->assertTrue($extractor->supportsMime('text/html')); - $this->assertFalse($extractor->supportsMime('application/not-supported')); - } - - public function testServerExtraction() - { - $extractor = new TikaServerTextExtractor(); - if (!$extractor->isAvailable()) { - $this->markTestSkipped('tika server not available'); - } - - // Check file - $file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'; $content = $extractor->getContent($file); $this->assertContains('This is a test file with a link', $content);