diff --git a/.travis.yml b/.travis.yml index 1605b03..0b96f17 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,28 +1,38 @@ -# See https://github.com/silverstripe-labs/silverstripe-travis-support for setup details - -language: php -php: - - 5.4 +# See https://github.com/silverstripe/silverstripe-travis-support for setup details +language: php sudo: false addons: apt: packages: - - poppler-utils + - poppler-utils -env: - - DB=MYSQL CORE_RELEASE=3.1 - - DB=MYSQL CORE_RELEASE=3 +matrix: + include: + - php: 5.4 + env: DB=PGSQL CORE_RELEASE=3.2 + - php: 5.5 + env: DB=PGSQL CORE_RELEASE=3.3 + - php: 5.6 + env: DB=PGSQL CORE_RELEASE=3.4 + - php: 5.6 + env: DB=MYSQL CORE_RELEASE=3.5 + - php: 7.0 + env: DB=MYSQL CORE_RELEASE=3.6 + - php: 7.1 + env: DB=MYSQL CORE_RELEASE=3 before_script: + - composer self-update || true - mkdir -p $HOME/bin - export PATH=$PATH:$HOME/bin - export SS_TIKA_ENDPOINT="http://localhost:9998/" - ./.travis/install_tika.sh - - git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support + - git clone git://github.com/silverstripe/silverstripe-travis-support.git ~/travis-support - php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss - cd ~/builds/ss + - composer install script: - ($HOME/bin/tika-rest-server &) &> /dev/null diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..5db972b --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,12 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +This project adheres to [Semantic Versioning](http://semver.org/). 
+ + +## [2.0.1] +Using Symfony mime type detection + +## [2.0.0] +Clarified Tika docs diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..d4006ed --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,15 @@ +# Contributing + +- Maintenance on this module is a shared effort of those who use it +- To contribute improvements to the code, ensure you raise a pull request and discuss with the module maintainers +- Please follow the SilverStripe [code contribution guidelines](https://docs.silverstripe.org/en/contributing/code/) and [Module Standard](https://docs.silverstripe.org/en/developer_guides/extending/modules/#module-standard) +- Supply documentation that follows the [GitHub Flavored Markdown](https://help.github.com/articles/markdown-basics/) conventions +- When having discussions about this module in issues or pull requests, please adhere to the [SilverStripe Community Code of Conduct](https://docs.silverstripe.org/en/contributing/code_of_conduct/) + + +## Contributor license agreement +By supplying code to this module in patches, tickets and pull requests, you agree to assign copyright +of that code to SilverStripe Ltd., on the condition that these code changes are released under the +same BSD license as the original module. We ask for this so that the ownership in the license is clear +and unambiguous. By releasing this code under a permissive license such as BSD, this copyright assignment +won't prevent you from using the code in any way you see fit. 
\ No newline at end of file diff --git a/README.md b/README.md index 80e51af..0c6ef46 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,18 @@ -# Text Extraction Module +# Text extraction module [![Build Status](https://secure.travis-ci.org/silverstripe-labs/silverstripe-textextraction.png)](http://travis-ci.org/silverstripe-labs/silverstripe-textextraction) +[![Code Quality](http://img.shields.io/scrutinizer/g/silverstripe-labs/silverstripe-textextraction.svg?style=flat-square)](https://scrutinizer-ci.com/g/silverstripe-labs/silverstripe-textextraction) +[![Version](http://img.shields.io/packagist/v/silverstripe/textextraction.svg?style=flat-square)](https://packagist.org/packages/silverstripe/textextraction) +[![License](http://img.shields.io/packagist/l/silverstripe/textextraction.svg?style=flat-square)](license.md) -## Overview -Provides an extraction API for file content, which can hook into different extractor -engines based on availability and the parsed file format. -The output is always a string: the file content. +Provides a text extraction API for file content, which can hook into different extractor +engines based on availability and the parsed file format. The output returned is always a string of the file content. -Via the `FileTextExtractable` extension, this logic can be used to +Via the `FileTextExtractable` extension, this logic can be used to cache the extracted content on a `DataObject` subclass (usually `File`). -Note: Previously part of the [sphinx module](https://github.com/silverstripe/silverstripe-sphinx). 
- -## Requirements - - * SilverStripe 3.1 - * (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility) - * (optional) [Apache Solr with ExtracingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) - * (optional) [Apache Tika](http://tika.apache.org/) - -### Supported Formats +The module supports text extraction on the following file formats: * HTML (built-in) * PDF (with XPDF or Solr) @@ -31,17 +23,17 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil * EPub (Solr) * Many others (Tika) +## Requirements + + * SilverStripe ^3.1 + * (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility) + * (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) + * (optional) [Apache Tika](http://tika.apache.org/) + ## Installation -The recommended installation is through [composer](http://getcomposer.org). -Add the following to your `composer.json`: - ```js -{ - "require": { - "silverstripe/textextraction": "2.0.x-dev" - } -} +composer require silverstripe/textextraction ``` The module depends on the [Guzzle HTTP Library](http://guzzlephp.org), @@ -60,7 +52,7 @@ In this case, add the following to `mysite/_config/config.yml`: ```yaml File: extensions: - - FileTextExtractable + - FileTextExtractable ``` By default any extracted content will be cached against the database row. @@ -89,11 +81,11 @@ FileTextCache_SSCache: PDFs require special handling, for example through the [XPDF](http://www.foolabs.com/xpdf/) commandline utility. Follow their installation instructions, its presence will be automatically -detected for *nix operating systems. You can optionally set the binary path (required for Windows) in `mysite/_config/config.yml` +detected for \*nix operating systems. 
You can optionally set the binary path (required for Windows) in `mysite/_config/config.yml` ```yml PDFTextExtractor: - binary_location: /my/path/pdftotext + binary_location: /my/path/pdftotext ``` ### Apache Solr @@ -110,10 +102,10 @@ In order to use Solr, you need to configure a URL for it (in `mysite/_config/con ```yml SolrCellTextExtractor: - base_url: 'http://localhost:8983/solr/update/extract' + base_url: 'http://localhost:8983/solr/update/extract' ``` -Note that in case you're using multiple cores, you'll need to add the core name to the URL +Note that in case you're using multiple cores, you'll need to add the core name to the URL (e.g. 'http://localhost:8983/solr/PageSolrIndex/update/extract'). The ["fulltext" module](https://github.com/silverstripe-labs/silverstripe-fulltextsearch) uses multiple cores by default, and comes prepackaged with a Solr server. @@ -130,7 +122,7 @@ class MyDocument extends DataObject { static $db = array('Path' => 'Text'); function getContent() { $extractor = FileTextExtractor::for_file($this->Path); - return $extractor ? $extractor->getContent($this->Path) : null; + return $extractor ? $extractor->getContent($this->Path) : null; } } class MySolrIndex extends SolrIndex { @@ -141,7 +133,7 @@ class MySolrIndex extends SolrIndex { } ``` -Note: This isn't a terribly efficient way to process large amounts of files, since +Note: This isn't a terribly efficient way to process large amounts of files, since each HTTP request is run synchronously. ### Tika @@ -153,66 +145,21 @@ configuring this. Download the latest `tika-app` for running as a CLI script, or to have it running constantly in the background. Starting tika as a CLI script for every extraction request is fairly slow, so we recommend running it as a server. -This extension will best work with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php) -installed to perform mime detection. Tika validates support via mime type rather than file extensions. 
+## Bugtracker -### Tika - CLI +Bugs are tracked in the issues section of this repository. Before submitting an issue please read over +existing issues to ensure yours is unique. -Ensure that your machine has a 'tika' command available which will run the CLI script. +If the issue does look like a new bug: -```bash -#!/bin/bash -exec java -jar tika-app-1.8.jar "$@" -``` + - Create a new issue + - Describe the steps required to reproduce your issue, and the expected outcome. Unit tests, screenshots + and screencasts can help here. + - Describe your environment as detailed as possible: SilverStripe version, Browser, PHP version, + Operating System, any installed SilverStripe modules. -### Tika Rest Server +Please report security issues to security@silverstripe.org directly. Please don't file security issues in the bugtracker. -Tika can also be run as a server. You can configure your server endpoint by setting the url via config. - -```yaml -TikaServerTextExtractor: - server_endpoint: 'http://localhost:9998' -``` - -Alternatively this may be specified via the `SS_TIKA_ENDPOINT` directive in your `_ss_environment.php` file, or an environment variable of the same name. - - -Then startup your server as below - -```bash -java -jar tika-server-1.8.jar --host=localhost --port=9998 -``` - -While you can run `tika-app-1.8.jar` in server mode as well (with the `--server` flag), -it behaves differently and is not recommended. - -The module will log extraction errors with `SS_Log::NOTICE` priority by default, -for example a "422 Unprocessable Entity" HTTP response for an encrypted PDF. -In case you want more information on why processing failed, you can increase -the logging verbosity in the tika server instance by passing through -a `--includeStack` flag. Logs can passed on to files or external logging services, -see [error handling](http://doc.silverstripe.org/en/developer_guides/debugging/error_handling) -documentation for SilverStripe core. 
- -## Usage - -Manual extraction: - -```php -$myFile = '/my/path/myfile.pdf'; -$extractor = FileTextExtractor::for_file($myFile); -$content = $extractor->getContent($myFile); -``` - -Extraction with `FileTextExtractable` extension applied: - -```php -$myFileObj = File::get()->First(); -$content = $myFileObj->getFileContent(); -``` - -This content can also be embedded directly within a template. - -``` -$MyFile.FileContent -``` +## Development and contribution +If you would like to make contributions to the module please ensure you raise a pull request and discuss + with the module maintainers. diff --git a/composer.json b/composer.json index b57bb2e..1846bbd 100644 --- a/composer.json +++ b/composer.json @@ -18,13 +18,13 @@ "require": { "php": ">=5.3.2", "composer/installers": "*", - "silverstripe/framework": "~3.1", - "guzzle/guzzle": "~3.9", - "symfony/event-dispatcher": "~2.6.0@stable", - "symfony/http-foundation": "~2.6.0" + "silverstripe/framework": "^3.1", + "guzzle/guzzle": "^3.9", + "symfony/event-dispatcher": "^2.6.0@stable", + "symfony/http-foundation": "^2.6.0" }, "require-dev": { - "phpunit/phpunit": "~3.7" + "phpunit/phpunit": "^3.7" }, "suggest": { "ext-fileinfo": "Improved support for file mime detection" diff --git a/docs/en/configuration.md b/docs/en/configuration.md new file mode 100644 index 0000000..caa61a1 --- /dev/null +++ b/docs/en/configuration.md @@ -0,0 +1,145 @@ +# Configuration + +## Basic + +By default, only extraction from HTML documents is supported. +No configuration is required for that, unless you want to make +the content available through your `DataObject` subclass. +In this case, add the following to `mysite/_config/config.yml`: + +```yaml +File: + extensions: + - FileTextExtractable +``` + +By default any extracted content will be cached against the database row. 
+In order to stay within common size constraints for SQL queries required in this operation, +the cache sets a maximum character length after which content gets truncated (default: 500000). +You can configure this value through `FileTextCache_Database.max_content_length` in your yaml configuration. + + +Alternatively, extracted content can be cached using SS_Cache to prevent excessive database growth. +In order to swap out the cache backend you can use the following yaml configuration. + + +```yaml +--- +Name: mytextextraction +After: '#textextraction' +--- +Injector: + FileTextCache: FileTextCache_SSCache +FileTextCache_SSCache: + lifetime: 3600 # Number of seconds to cache content for + +``` + +## XPDF + +PDFs require special handling, for example through the [XPDF](http://www.foolabs.com/xpdf/) +commandline utility. Follow their installation instructions, its presence will be automatically +detected. You can optionally set the binary path in `mysite/_config/config.yml`: + +```yml +PDFTextExtractor: + binary_location: /my/path/pdftotext +``` + +## Apache Solr + +Apache Solr is a fulltext search engine, an aspect which is often used +alongside this module. But more importantly for us, it has bindings to [Apache Tika](http://tika.apache.org/) +through the [ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) interface. +This allows Solr to inspect the contents of various file formats, such as Office documents and PDF files. +The textextraction module retrieves the output of this service, rather than altering the index. +With the raw text output, you can decide to store it in a database column for fulltext search +in your database driver, or even pass it back to Solr as part of a full index update. 
+ +In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`): + +```yml +SolrCellTextExtractor: + base_url: 'http://localhost:8983/solr/update/extract' +``` + +Note that in case you're using multiple cores, you'll need to add the core name to the URL +(e.g. 'http://localhost:8983/solr/PageSolrIndex/update/extract'). +The ["fulltext" module](https://github.com/silverstripe-labs/silverstripe-fulltextsearch) +uses multiple cores by default, and comes prepackaged with a Solr server. +It's a stripped-down version of Solr; follow the module README on how to add +Apache Tika text extraction capabilities. + +You need to ensure that some indexable property on your object +returns the contents, either by directly accessing `FileTextExtractable->extractFileAsText()`, +or by writing your own method around `FileTextExtractor->getContent()` (see "Usage" in `docs/en/developer-docs.md`). +The property should be listed in your `SolrIndex` subclass, e.g. as follows: + +```php +class MyDocument extends DataObject { + static $db = array('Path' => 'Text'); + function getContent() { + $extractor = FileTextExtractor::for_file($this->Path); + return $extractor ? $extractor->getContent($this->Path) : null; + } +} +class MySolrIndex extends SolrIndex { + function init() { + $this->addClass('MyDocument'); + $this->addStoredField('Content', 'HTMLText'); + } +} +``` + +Note: This isn't a terribly efficient way to process large amounts of files, since +each HTTP request is run synchronously. + +## Tika + +Support for Apache Tika (1.8 and above) is included. This can be run in one of two ways: Server or CLI. + +See [the Apache Tika home page](http://tika.apache.org/1.8/index.html) for instructions on installing and +configuring this. Download the latest `tika-app` for running as a CLI script, or `tika-server` if you're planning +to have it running constantly in the background. 
Starting tika as a CLI script for every extraction request +is fairly slow, so we recommend running it as a server. + +This extension will work best with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php) +installed to perform mime detection. Tika validates support via mime type rather than file extensions. + +## Tika - CLI + +Ensure that your machine has a 'tika' command available which will run the CLI script. + +```bash +#!/bin/bash +exec java -jar tika-app-1.8.jar "$@" +``` + +## Tika Rest Server + +Tika can also be run as a server. You can configure your server endpoint by setting the url via config. + +```yaml +TikaServerTextExtractor: + server_endpoint: 'http://localhost:9998' +``` + +Alternatively this may be specified via the `SS_TIKA_ENDPOINT` directive in your `_ss_environment.php` file, or an environment variable of the same name. + + +Then start up your server as below + +```bash +java -jar tika-server-1.8.jar --host=localhost --port=9998 +``` + +While you can run `tika-app-1.8.jar` in server mode as well (with the `--server` flag), +it behaves differently and is not recommended. + +The module will log extraction errors with `SS_Log::NOTICE` priority by default, +for example a "422 Unprocessable Entity" HTTP response for an encrypted PDF. +In case you want more information on why processing failed, you can increase +the logging verbosity in the tika server instance by passing through +a `--includeStack` flag. Logs can be passed on to files or external logging services, +see [error handling](http://doc.silverstripe.org/en/developer_guides/debugging/error_handling) +documentation for SilverStripe core. 
\ No newline at end of file diff --git a/docs/en/developer-docs.md b/docs/en/developer-docs.md new file mode 100644 index 0000000..a009dde --- /dev/null +++ b/docs/en/developer-docs.md @@ -0,0 +1,23 @@ +# Developer documentation +## Usage + +Manual extraction: + +```php +$myFile = '/my/path/myfile.pdf'; +$extractor = FileTextExtractor::for_file($myFile); +$content = $extractor->getContent($myFile); +``` + +Extraction with `FileTextExtractable` extension applied: + +```php +$myFileObj = File::get()->First(); +$content = $myFileObj->getFileContent(); +``` + +This content can also be embedded directly within a template. + +``` +$MyFile.FileContent +```