diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..f0f1632 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,6 @@ +/tests export-ignore +/docs export-ignore +/.gitattributes export-ignore +/.travis.yml export-ignore +/.travis export-ignore +/.scrutinizer.yml export-ignore diff --git a/.travis.yml b/.travis.yml index 1605b03..a58d567 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,21 +1,35 @@ # See https://github.com/silverstripe-labs/silverstripe-travis-support for setup details +language: php -language: php php: - - 5.4 + - 5.3 + - 5.4 + - 5.5 + - 5.6 + - 7.0 sudo: false -addons: - apt: - packages: - - poppler-utils - env: - - DB=MYSQL CORE_RELEASE=3.1 - - DB=MYSQL CORE_RELEASE=3 + - DB=MYSQL CORE_RELEASE=3.2 + +matrix: + include: + - php: 5.6 + env: DB=PGSQL CORE_RELEASE=3 + - php: 5.6 + env: DB=PGSQL CORE_RELEASE=3.1 + - php: 5.6 + env: DB=PGSQL CORE_RELEASE=3.2 + - php: 5.6 + env: DB=MYSQL CORE_RELEASE=3.3 + - php: 5.6 + env: DB=MYSQL CORE_RELEASE=3.2 + - php: 5.6 + env: DB=MYSQL CORE_RELEASE=3.1 before_script: + - composer self-update || true - mkdir -p $HOME/bin - export PATH=$PATH:$HOME/bin - export SS_TIKA_ENDPOINT="http://localhost:9998/" @@ -23,7 +37,16 @@ before_script: - git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support - php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss - cd ~/builds/ss + - composer install script: - ($HOME/bin/tika-rest-server &) &> /dev/null - vendor/bin/phpunit --verbose textextraction/tests/ + +branches: + only: + - master + +matrix: + allow_failures: + - php: 7.0 diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..5db972b --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,12 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +This project adheres to [Semantic Versioning](http://semver.org/). 
+ + +## [2.0.1] +Using Symfony mime type detection + +## [2.0.0] +Clarified Tika docs diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..d4006ed --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,15 @@ +# Contributing + +- Maintenance on this module is a shared effort of those who use it +- To contribute improvements to the code, ensure you raise a pull request and discuss with the module maintainers +- Please follow the SilverStripe [code contribution guidelines](https://docs.silverstripe.org/en/contributing/code/) and [Module Standard](https://docs.silverstripe.org/en/developer_guides/extending/modules/#module-standard) +- Supply documentation that follows the [GitHub Flavored Markdown](https://help.github.com/articles/markdown-basics/) conventions +- When having discussions about this module in issues or pull requests, please adhere to the [SilverStripe Community Code of Conduct](https://docs.silverstripe.org/en/contributing/code_of_conduct/) + + +## Contributor license agreement +By supplying code to this module in patches, tickets and pull requests, you agree to assign copyright +of that code to SilverStripe Ltd., on the condition that these code changes are released under the +same BSD license as the original module. We ask for this so that the ownership in the license is clear +and unambiguous. By releasing this code under a permissive license such as BSD, this copyright assignment +won't prevent you from using the code in any way you see fit. \ No newline at end of file diff --git a/LICENSE b/LICENSE.md similarity index 93% rename from LICENSE rename to LICENSE.md index 00345e7..655f120 100644 --- a/LICENSE +++ b/LICENSE.md @@ -1,4 +1,4 @@ -* Copyright (c) 2010-2012, SilverStripe Ltd. +* Copyright (c) 2015, SilverStripe Ltd. * All rights reserved.
* * Redistribution and use in source and binary forms, with or without diff --git a/README.md b/README.md index 8514412..4ceb538 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,18 @@ -# Text Extraction Module +# Text extraction module [![Build Status](https://secure.travis-ci.org/silverstripe-labs/silverstripe-textextraction.png)](http://travis-ci.org/silverstripe-labs/silverstripe-textextraction) +[![Code Quality](http://img.shields.io/scrutinizer/g/silverstripe-labs/silverstripe-textextraction.svg?style=flat-square)](https://scrutinizer-ci.com/g/silverstripe-labs/silverstripe-textextraction) +[![Version](http://img.shields.io/packagist/v/silverstripe/textextraction.svg?style=flat-square)](https://packagist.org/packages/silverstripe/textextraction) +[![License](http://img.shields.io/packagist/l/silverstripe/textextraction.svg?style=flat-square)](LICENSE.md) -## Overview -Provides an extraction API for file content, which can hook into different extractor -engines based on availability and the parsed file format. -The output is always a string: the file content. +Provides a text extraction API for file content, which can hook into different extractor +engines based on availability and the parsed file format. The output returned is always a string of the file content. Via the `FileTextExtractable` extension, this logic can be used to cache the extracted content on a `DataObject` subclass (usually `File`). -Note: Previously part of the [sphinx module](https://github.com/silverstripe/silverstripe-sphinx).
- -## Requirements - - * SilverStripe 3.1 - * (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility) - * (optional) [Apache Solr with ExtracingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) - * (optional) [Apache Tika](http://tika.apache.org/) - -### Supported Formats +The module supports text extraction on the following file formats: * HTML (built-in) * PDF (with XPDF or Solr) @@ -31,188 +23,44 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil * EPub (Solr) * Many others (Tika) +## Requirements + + * SilverStripe ^3.1 + * (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility) + * (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) + * (optional) [Apache Tika](http://tika.apache.org/) + ## Installation -The recommended installation is through [composer](http://getcomposer.org). -Add the following to your `composer.json`: - ```js -{ - "require": { - "silverstripe/textextraction": "2.0.x-dev" - } -} +composer require silverstripe/textextraction ``` The module depends on the [Guzzle HTTP Library](http://guzzlephp.org), which is automatically checked out by composer. Alternatively, install Guzzle through PEAR and ensure its in your `include_path`. -## Configuration - -### Basic - -By default, only extraction from HTML documents is supported. -No configuration is required for that, unless you want to make -the content available through your `DataObject` subclass. -In this case, add the following to `mysite/_config/config.yml`: - -```yaml -File: - extensions: - - FileTextExtractable -``` - -By default any extracted content will be cached against the database row. -In order to stay within common size constraints for SQL queries required in this operation, -the cache sets a maximum character length after which content gets truncated (default: 500000).
-You can configure this value through `FileTextCache_Database.max_content_length` in your yaml configuration. +## Documentation + * [Configuration](docs/en/configuration.md) + * [Developer documentation](docs/en/developer-docs.md) + +## Bugtracker +Bugs are tracked in the issues section of this repository. Before submitting an issue please read over +existing issues to ensure yours is unique. + +If the issue does look like a new bug: + + - Create a new issue + - Describe the steps required to reproduce your issue, and the expected outcome. Unit tests, screenshots + and screencasts can help here. + - Describe your environment in as much detail as possible: SilverStripe version, Browser, PHP version, + Operating System, any installed SilverStripe modules. + +Please report security issues to security@silverstripe.org directly. Please don't file security issues in the bugtracker. + +## Development and contribution +If you would like to make contributions to the module please ensure you raise a pull request and discuss + with the module maintainers. -Alternatively, extracted content can be cached using SS_Cache to prevent excessive database growth. -In order to swap out the cache backend you can use the following yaml configuration. - -```yaml ---- -Name: mytextextraction -After: '#textextraction' ---- -Injector: - FileTextCache: FileTextCache_SSCache -FileTextCache_SSCache: - lifetime: 3600 # Number of seconds to cache content for - -``` - -### XPDF - -PDFs require special handling, for example through the [XPDF](http://www.foolabs.com/xpdf/) -commandline utility. Follow their installation instructions, its presence will be automatically -detected. You can optionally set the binary path in `mysite/_config/config.yml`: - -```yml -PDFTextExtractor: - binary_location: /my/path/pdftotext -``` - -### Apache Solr - -Apache Solr is a fulltext search engine, an aspect which is often used -alongside this module.
But more importantly for us, it has bindings to [Apache Tika](http://tika.apache.org/) -through the [ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) interface. -This allows Solr to inspect the contents of various file formats, such as Office documents and PDF files. -The textextraction module retrieves the output of this service, rather than altering the index. -With the raw text output, you can decide to store it in a database column for fulltext search -in your database driver, or even pass it back to Solr as part of a full index update. - -In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`): - -```yml -SolrCellTextExtractor: - base_url: 'http://localhost:8983/solr/update/extract' -``` - -Note that in case you're using multiple cores, you'll need to add the core name to the URL -(e.g. 'http://localhost:8983/solr/PageSolrIndex/update/extract'). -The ["fulltext" module](https://github.com/silverstripe-labs/silverstripe-fulltextsearch) -uses multiple cores by default, and comes prepackaged with a Solr server. -Its a stripped-down version of Solr, follow the module README on how to add -Apache Tika text extraction capabilities. - -You need to ensure that some indexable property on your object -returns the contents, either by directly accessing `FileTextExtractable->extractFileAsText()`, -or by writing your own method around `FileTextExtractor->getContent()` (see "Usage" below). -The property should be listed in your `SolrIndex` subclass, e.g. as follows: - -```php -class MyDocument extends DataObject { - static $db = array('Path' => 'Text'); - function getContent() { - $extractor = FileTextExtractor::for_file($this->Path); - return $extractor ? 
$extractor->getContent($this->Path) : null; - } -} -class MySolrIndex extends SolrIndex { - function init() { - $this->addClass('MyDocument'); - $this->addStoredField('Content', 'HTMLText'); - } -} -``` - -Note: This isn't a terribly efficient way to process large amounts of files, since -each HTTP request is run synchronously. - -### Tika - -Support for Apache Tika (1.8 and above) is included. This can be run in one of two ways: Server or CLI. - -See [the Apache Tika home page](http://tika.apache.org/1.8/index.html) for instructions on installing and -configuring this. Download the latest `tika-app` for running as a CLI script, or `tika-server` if you're planning -to have it running constantly in the background. Starting tika as a CLI script for every extraction request -is fairly slow, so we recommend running it as a server. - -This extension will best work with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php) -installed to perform mime detection. Tika validates support via mime type rather than file extensions. - -### Tika - CLI - -Ensure that your machine has a 'tika' command available which will run the CLI script. - -```bash -#!/bin/bash -exec java -jar tika-app-1.8.jar "$@" -``` - -### Tika Rest Server - -Tika can also be run as a server. You can configure your server endpoint by setting the url via config. - -```yaml -TikaServerTextExtractor: - server_endpoint: 'http://localhost:9998' -``` - -Alternatively this may be specified via the `SS_TIKA_ENDPOINT` directive in your `_ss_environment.php` file, or an environment variable of the same name. - - -Then startup your server as below - -```bash -java -jar tika-server-1.8.jar --host=localhost --port=9998 -``` - -While you can run `tika-app-1.8.jar` in server mode as well (with the `--server` flag), -it behaves differently and is not recommended. 
- -The module will log extraction errors with `SS_Log::NOTICE` priority by default, -for example a "422 Unprocessable Entity" HTTP response for an encrypted PDF. -In case you want more information on why processing failed, you can increase -the logging verbosity in the tika server instance by passing through -a `--includeStack` flag. Logs can passed on to files or external logging services, -see [error handling](http://doc.silverstripe.org/en/developer_guides/debugging/error_handling) -documentation for SilverStripe core. - -## Usage - -Manual extraction: - -```php -$myFile = '/my/path/myfile.pdf'; -$extractor = FileTextExtractor::for_file($myFile); -$content = $extractor->getContent($myFile); -``` - -Extraction with `FileTextExtractable` extension applied: - -```php -$myFileObj = File::get()->First(); -$content = $myFileObj->getFileContent(); -``` - -This content can also be embedded directly within a template. - -``` -$MyFile.FileContent -``` diff --git a/composer.json b/composer.json index 59101e5..69314c9 100644 --- a/composer.json +++ b/composer.json @@ -18,13 +18,13 @@ "require": { "php": ">=5.3.2", "composer/installers": "*", - "silverstripe/framework": "~3.1", - "guzzle/guzzle": "~3.9", - "symfony/event-dispatcher": "~2.6.0@stable", - "symfony/http-foundation": "~2.6.0" + "silverstripe/framework": "^3.1", + "guzzle/guzzle": "^3.9", + "symfony/event-dispatcher": "^2.6.0@stable", + "symfony/http-foundation": "^2.6.0" }, "require-dev": { - "phpunit/phpunit": "~3.7" + "phpunit/phpunit": "^3.7" }, "suggest": { "ext-fileinfo": "Improved support for file mime detection" diff --git a/docs/en/configuration.md b/docs/en/configuration.md new file mode 100644 index 0000000..caa61a1 --- /dev/null +++ b/docs/en/configuration.md @@ -0,0 +1,145 @@ +# Configuration + +## Basic + +By default, only extraction from HTML documents is supported. +No configuration is required for that, unless you want to make +the content available through your `DataObject` subclass. 
+In this case, add the following to `mysite/_config/config.yml`: + +```yaml +File: + extensions: + - FileTextExtractable +``` + +By default any extracted content will be cached against the database row. +In order to stay within common size constraints for SQL queries required in this operation, +the cache sets a maximum character length after which content gets truncated (default: 500000). +You can configure this value through `FileTextCache_Database.max_content_length` in your yaml configuration. + + +Alternatively, extracted content can be cached using SS_Cache to prevent excessive database growth. +In order to swap out the cache backend you can use the following yaml configuration. + + +```yaml +--- +Name: mytextextraction +After: '#textextraction' +--- +Injector: + FileTextCache: FileTextCache_SSCache +FileTextCache_SSCache: + lifetime: 3600 # Number of seconds to cache content for + +``` + +## XPDF + +PDFs require special handling, for example through the [XPDF](http://www.foolabs.com/xpdf/) +commandline utility. Follow their installation instructions, its presence will be automatically +detected. You can optionally set the binary path in `mysite/_config/config.yml`: + +```yml +PDFTextExtractor: + binary_location: /my/path/pdftotext +``` + +## Apache Solr + +Apache Solr is a fulltext search engine, an aspect which is often used +alongside this module. But more importantly for us, it has bindings to [Apache Tika](http://tika.apache.org/) +through the [ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) interface. +This allows Solr to inspect the contents of various file formats, such as Office documents and PDF files. +The textextraction module retrieves the output of this service, rather than altering the index. +With the raw text output, you can decide to store it in a database column for fulltext search +in your database driver, or even pass it back to Solr as part of a full index update. 
+ +In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`): + +```yml +SolrCellTextExtractor: + base_url: 'http://localhost:8983/solr/update/extract' +``` + +Note that in case you're using multiple cores, you'll need to add the core name to the URL +(e.g. 'http://localhost:8983/solr/PageSolrIndex/update/extract'). +The ["fulltext" module](https://github.com/silverstripe-labs/silverstripe-fulltextsearch) +uses multiple cores by default, and comes prepackaged with a Solr server. +It's a stripped-down version of Solr; follow the module README on how to add +Apache Tika text extraction capabilities. + +You need to ensure that some indexable property on your object +returns the contents, either by directly accessing `FileTextExtractable->extractFileAsText()`, +or by writing your own method around `FileTextExtractor->getContent()` (see "Usage" in the developer documentation). +The property should be listed in your `SolrIndex` subclass, e.g. as follows: + +```php +class MyDocument extends DataObject { + static $db = array('Path' => 'Text'); + function getContent() { + $extractor = FileTextExtractor::for_file($this->Path); + return $extractor ? $extractor->getContent($this->Path) : null; + } +} +class MySolrIndex extends SolrIndex { + function init() { + $this->addClass('MyDocument'); + $this->addStoredField('Content', 'HTMLText'); + } +} +``` + +Note: This isn't a terribly efficient way to process large amounts of files, since +each HTTP request is run synchronously. + +## Tika + +Support for Apache Tika (1.8 and above) is included. This can be run in one of two ways: Server or CLI. + +See [the Apache Tika home page](http://tika.apache.org/1.8/index.html) for instructions on installing and +configuring this. Download the latest `tika-app` for running as a CLI script, or `tika-server` if you're planning +to have it running constantly in the background.
Starting tika as a CLI script for every extraction request +is fairly slow, so we recommend running it as a server. + +This extension will best work with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php) +installed to perform mime detection. Tika validates support via mime type rather than file extensions. + +## Tika - CLI + +Ensure that your machine has a 'tika' command available which will run the CLI script. + +```bash +#!/bin/bash +exec java -jar tika-app-1.8.jar "$@" +``` + +## Tika Rest Server + +Tika can also be run as a server. You can configure your server endpoint by setting the url via config. + +```yaml +TikaServerTextExtractor: + server_endpoint: 'http://localhost:9998' +``` + +Alternatively this may be specified via the `SS_TIKA_ENDPOINT` directive in your `_ss_environment.php` file, or an environment variable of the same name. + + +Then startup your server as below + +```bash +java -jar tika-server-1.8.jar --host=localhost --port=9998 +``` + +While you can run `tika-app-1.8.jar` in server mode as well (with the `--server` flag), +it behaves differently and is not recommended. + +The module will log extraction errors with `SS_Log::NOTICE` priority by default, +for example a "422 Unprocessable Entity" HTTP response for an encrypted PDF. +In case you want more information on why processing failed, you can increase +the logging verbosity in the tika server instance by passing through +a `--includeStack` flag. Logs can passed on to files or external logging services, +see [error handling](http://doc.silverstripe.org/en/developer_guides/debugging/error_handling) +documentation for SilverStripe core. 
\ No newline at end of file diff --git a/docs/en/developer-docs.md b/docs/en/developer-docs.md new file mode 100644 index 0000000..a009dde --- /dev/null +++ b/docs/en/developer-docs.md @@ -0,0 +1,23 @@ +# Developer documentation +## Usage + +Manual extraction: + +```php +$myFile = '/my/path/myfile.pdf'; +$extractor = FileTextExtractor::for_file($myFile); +$content = $extractor->getContent($myFile); +``` + +Extraction with `FileTextExtractable` extension applied: + +```php +$myFileObj = File::get()->First(); +$content = $myFileObj->getFileContent(); +``` + +This content can also be embedded directly within a template. + +``` +$MyFile.FileContent +```