Compare commits
125 Commits
Author | SHA1 | Date |
---|---|---|
Guy Sartorelli | e04501cb52 | |
Maxime Rainville | 821d2858f3 | |
Guy Sartorelli | de215d63f6 | |
Maxime Rainville | 2a260607ec | |
Steve Boyd | 6a92eb58e2 | |
Guy Sartorelli | 87869e94a6 | |
Guy Sartorelli | 61f443d49c | |
Guy Sartorelli | 8f2e1d9b75 | |
Sabina Talipova | a281114ed2 | |
Steve Boyd | 1a0cd6d6a6 | |
Steve Boyd | 3bfa989a7e | |
Steve Boyd | 46b7f51040 | |
Guy Sartorelli | 041296bda2 | |
Steve Boyd | e8997870c5 | |
Steve Boyd | e8061724c5 | |
Steve Boyd | db8a36fa3e | |
Guy Sartorelli | 04ff0c6084 | |
Steve Boyd | e5bf4f1322 | |
Guy Sartorelli | 4674084d0d | |
Steve Boyd | df8b17ab85 | |
Michal Kleiner | 77fecc4c53 | |
Guy Sartorelli | d03a9f06e2 | |
Michal Kleiner | 88e7f27c5c | |
Guy Sartorelli | 04e4b60435 | |
Maxime Rainville | 25d8a55058 | |
Steve Boyd | e8f015ddd2 | |
Michal Kleiner | 254c4e31f8 | |
GuySartorelli | 7ad3fc9f13 | |
Maxime Rainville | eb36dcf5fb | |
Steve Boyd | b92616eb4e | |
Steve Boyd | 90d4812aa8 | |
Maxime Rainville | d1bdc003ad | |
Steve Boyd | 6af13768d3 | |
Garion Herman | 4250acb50e | |
Steve Boyd | 795abde8f1 | |
Steve Boyd | d1e241ed56 | |
Steve Boyd | 8e9a0243bb | |
Robbie Averill | cb15845a95 | |
Steve Boyd | 3564066245 | |
Steve Boyd | 06995b2ec7 | |
Maxime Rainville | 01848af86d | |
Steve Boyd | e451f96b0b | |
Robbie Averill | d0a7db0b68 | |
Russell Michell | 42cc545414 | |
Robbie Averill | 6234a971d1 | |
Robbie Averill | 0d7c507b53 | |
Robbie Averill | d5313674c3 | |
Robbie Averill | 32e2f9f84f | |
Robbie Averill | 5b967fd5d3 | |
Robbie Averill | 943f393ee8 | |
Charlie Bergthaler | 242e5a307d | |
Charlie Bergthaler | a9270d73ad | |
Robbie Averill | b4c634bb1f | |
Robbie Averill | 20079bd33f | |
Guy Marriott | c5cfe4ea1e | |
Martin Hipp | bff5eb2b79 | |
Robbie Averill | 801cd9cacb | |
Dylan Wagstaff | 9c2da06178 | |
Robbie Averill | 276fd9c856 | |
Robbie Averill | 759d92ccb4 | |
Robbie Averill | b9502653c2 | |
Robbie Averill | 86eba78064 | |
Ishan Jayamanne | 21ed6e0f86 | |
Robbie Averill | 75a8c66eee | |
Robbie Averill | 07c000dc0d | |
Dylan Wagstaff | 03d1fef4ae | |
Robbie Averill | e1e7cdbfa4 | |
Robbie Averill | 231a2091af | |
Daniel Hensby | b20738573f | |
Robbie Averill | 1b8ea2e451 | |
Dylan Wagstaff | 9795866abe | |
Robbie Averill | 9e8ed243d0 | |
Robbie Averill | 397e7a5d40 | |
Robbie Averill | 40e4b05f5d | |
Robbie Averill | 5e5a1f05da | |
Robbie Averill | 6bf932e5f0 | |
Robbie Averill | 770af5cfc9 | |
Robbie Averill | 3c1457c0ee | |
Robbie Averill | 5d53be9df6 | |
Robbie Averill | edb02e9189 | |
Robbie Averill | 8bd019b2aa | |
Robbie Averill | e2404fc904 | |
Robbie Averill | 8d295ada9c | |
Robbie Averill | fe5148e678 | |
Robbie Averill | 66c9db8c0d | |
Robbie Averill | f1bacd2aa9 | |
Robbie Averill | 300941c9e8 | |
Robbie Averill | dd292bd554 | |
Robbie Averill | 45cd9ae4ed | |
Dylan Wagstaff | d06569c8fd | |
Dylan Wagstaff | 31925d654e | |
Robbie Averill | e491042d3b | |
Robbie Averill | 33746e0cd7 | |
Russell Michell | 912c457c7d | |
Russell Michell | d09a5aa97c | |
Russell Michell | f341010d7a | |
Robbie Averill | 875e608d0f | |
Robbie Averill | c83a7c3403 | |
Robbie Averill | 95d96efe40 | |
Robbie Averill | 9f04583ed5 | |
Robbie Averill | a8a4e0c02f | |
Robbie Averill | 9f3819408c | |
Daniel Hensby | eb25505a8e | |
Jake Dale Ovenden | eb7a45865b | |
Robbie Averill | 40ba6a245d | |
Robbie Averill | 3d289b4e05 | |
Robbie Averill | f8c3015161 | |
Damian Mooyman | 23e255b5c6 | |
Juan van den Anker | 0761311170 | |
Damian Mooyman | 1b89000fcd | |
Alexandre Guidet | 196007314a | |
Damian Mooyman | 545e711f16 | |
Daniel Hensby | 5d24770d79 | |
Damian Mooyman | 5a5c648c1e | |
Daniel Hensby | e9e33605b4 | |
Damian Mooyman | e0125ba745 | |
Daniel Hensby | aaf9238384 | |
Daniel Hensby | 61750e33fc | |
Jake Bentvelzen | 75ffe7b56a | |
Hamish Friedlander | bde4cf4536 | |
Damian Mooyman | f72ba3a978 | |
Damian Mooyman | 9e44e834cf | |
helpfulrobot | 0420d56e4d | |
Cam Findlay | 7b3fb280c6 | |
cam-findlay | a34c443be5 |
|
@ -4,3 +4,4 @@
|
|||
/.gitignore export-ignore
|
||||
/.travis.yml export-ignore
|
||||
/.scrutinizer.yml export-ignore
|
||||
/codecov.yml export-ignore
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
pull_request:
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
ci:
|
||||
name: CI
|
||||
uses: silverstripe/gha-ci/.github/workflows/ci.yml@v1
|
|
@ -0,0 +1,16 @@
|
|||
name: Dispatch CI
|
||||
|
||||
on:
|
||||
# At 12:20 PM UTC, only on Saturday and Sunday
|
||||
schedule:
|
||||
- cron: '20 12 * * 6,0'
|
||||
|
||||
jobs:
|
||||
dispatch-ci:
|
||||
name: Dispatch CI
|
||||
# Only run cron on the silverstripe account
|
||||
if: (github.event_name == 'schedule' && github.repository_owner == 'silverstripe') || (github.event_name != 'schedule')
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Dispatch CI
|
||||
uses: silverstripe/gha-dispatch-ci@v1
|
|
@ -0,0 +1,17 @@
|
|||
name: Keepalive
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
# The 4th of every month at 10:50am UTC
|
||||
schedule:
|
||||
- cron: '50 10 4 * *'
|
||||
|
||||
jobs:
|
||||
keepalive:
|
||||
name: Keepalive
|
||||
# Only run cron on the silverstripe account
|
||||
if: (github.event_name == 'schedule' && github.repository_owner == 'silverstripe') || (github.event_name != 'schedule')
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Keepalive
|
||||
uses: silverstripe/gha-keepalive@v1
|
|
@ -1,9 +0,0 @@
|
|||
inherit: true
|
||||
|
||||
checks:
|
||||
php:
|
||||
code_rating: true
|
||||
duplication: true
|
||||
|
||||
filter:
|
||||
paths: [code/*, tests/*]
|
29
.travis.yml
29
.travis.yml
|
@ -1,29 +0,0 @@
|
|||
# See https://github.com/silverstripe-labs/silverstripe-travis-support for setup details
|
||||
|
||||
language: php
|
||||
php:
|
||||
- 5.4
|
||||
|
||||
sudo: false
|
||||
|
||||
addons:
|
||||
apt:
|
||||
packages:
|
||||
- poppler-utils
|
||||
|
||||
env:
|
||||
- DB=MYSQL CORE_RELEASE=3.1
|
||||
- DB=MYSQL CORE_RELEASE=3
|
||||
|
||||
before_script:
|
||||
- mkdir -p $HOME/bin
|
||||
- export PATH=$PATH:$HOME/bin
|
||||
- export SS_TIKA_ENDPOINT="http://localhost:9998/"
|
||||
- ./.travis/install_tika.sh
|
||||
- git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support
|
||||
- php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss
|
||||
- cd ~/builds/ss
|
||||
|
||||
script:
|
||||
- ($HOME/bin/tika-rest-server &) &> /dev/null
|
||||
- vendor/bin/phpunit --verbose textextraction/tests/
|
|
@ -0,0 +1,14 @@
|
|||
mappings:
|
||||
FileTextExtractable: SilverStripe\TextExtraction\Extension\FileTextExtractable
|
||||
FileTextCache: SilverStripe\TextExtraction\Cache\FileTextCache
|
||||
FileTextCache_Cache: SilverStripe\TextExtraction\Cache\FileTextCache\Cache
|
||||
FileTextCache_Database: SilverStripe\TextExtraction\Cache\FileTextCache\Database
|
||||
FileTextExtractor: SilverStripe\TextExtraction\Extractor\FileTextExtractor
|
||||
FileTextExtractor_Exception: SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception
|
||||
HTMLTextExtractor: SilverStripe\TextExtraction\Extractor\HTMLTextExtractor
|
||||
PDFTextExtractor: SilverStripe\TextExtraction\Extractor\PDFTextExtractor
|
||||
SolrCellTextExtractor: SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor
|
||||
TikaServerTextExtractor: SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor
|
||||
TikaTextExtractor: SilverStripe\TextExtraction\Extractor\TikaTextExtractor
|
||||
TikaRestClient: SilverStripe\TextExtraction\Rest\TikaRestClient
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
# Contributing
|
||||
|
||||
- Maintenance on this module is a shared effort of those who use it
|
||||
- To contribute improvements to the code, ensure you raise a pull request and discuss with the module maintainers
|
||||
- Please follow the SilverStripe [code contribution guidelines](https://docs.silverstripe.org/en/contributing/code/) and [Module Standard](https://docs.silverstripe.org/en/developer_guides/extending/modules/#module-standard)
|
||||
- Supply documentation that follows the [GitHub Flavored Markdown](https://help.github.com/articles/markdown-basics/) conventions
|
||||
- When having discussions about this module in issues or pull requests, please adhere to the [SilverStripe Community Code of Conduct](https://docs.silverstripe.org/en/contributing/code_of_conduct/)
|
||||
|
||||
|
||||
## Contributor license agreement
|
||||
By supplying code to this module in patches, tickets and pull requests, you agree to assign copyright
|
||||
of that code to SilverStripe Ltd., on the condition that these code changes are released under the
|
||||
same BSD license as the original module. We ask for this so that the ownership in the license is clear
|
||||
and unambiguous. By releasing this code under a permissive license such as BSD, this copyright assignment
|
||||
won't prevent you from using the code in any way you see fit.
|
220
README.md
220
README.md
|
@ -1,26 +1,15 @@
|
|||
# Text Extraction Module
|
||||
# Text extraction module
|
||||
|
||||
[![Build Status](https://secure.travis-ci.org/silverstripe-labs/silverstripe-textextraction.png)](http://travis-ci.org/silverstripe-labs/silverstripe-textextraction)
|
||||
[![CI](https://github.com/silverstripe/silverstripe-textextraction/actions/workflows/ci.yml/badge.svg)](https://github.com/silverstripe/silverstripe-textextraction/actions/workflows/ci.yml)
|
||||
[![Silverstripe supported module](https://img.shields.io/badge/silverstripe-supported-0071C4.svg)](https://www.silverstripe.org/software/addons/silverstripe-commercially-supported-module-list/)
|
||||
|
||||
## Overview
|
||||
Provides a text extraction API for file content, that can hook into different extractor
|
||||
engines based on availability and the parsed file format. The output returned is always a string of the file content.
|
||||
|
||||
Provides an extraction API for file content, which can hook into different extractor
|
||||
engines based on availability and the parsed file format.
|
||||
The output is always a string: the file content.
|
||||
|
||||
Via the `FileTextExtractable` extension, this logic can be used to
|
||||
Via the `FileTextExtractable` extension, this logic can be used to
|
||||
cache the extracted content on a `DataObject` subclass (usually `File`).
|
||||
|
||||
Note: Previously part of the [sphinx module](https://github.com/silverstripe/silverstripe-sphinx).
|
||||
|
||||
## Requirements
|
||||
|
||||
* SilverStripe 3.1
|
||||
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
|
||||
* (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
|
||||
* (optional) [Apache Tika](http://tika.apache.org/)
|
||||
|
||||
### Supported Formats
|
||||
The module supports text extraction on the following file formats:
|
||||
|
||||
* HTML (built-in)
|
||||
* PDF (with XPDF or Solr)
|
||||
|
@ -31,188 +20,43 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
|
|||
* EPub (Solr)
|
||||
* Many others (Tika)
|
||||
|
||||
## Requirements
|
||||
|
||||
* Silverstripe ^4.0
|
||||
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
|
||||
* (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
|
||||
* (optional) [Apache Tika](http://tika.apache.org/)
|
||||
|
||||
## Installation
|
||||
|
||||
The recommended installation is through [composer](http://getcomposer.org).
|
||||
Add the following to your `composer.json`:
|
||||
|
||||
```js
|
||||
{
|
||||
"require": {
|
||||
"silverstripe/textextraction": "2.0.x-dev"
|
||||
}
|
||||
}
|
||||
```
|
||||
composer require silverstripe/textextraction
|
||||
```
|
||||
|
||||
The module depends on the [Guzzle HTTP Library](http://guzzlephp.org),
|
||||
which is automatically checked out by composer. Alternatively, install Guzzle
|
||||
through PEAR and ensure it's in your `include_path`.
|
||||
|
||||
## Configuration
|
||||
## Documentation
|
||||
|
||||
### Basic
|
||||
* [Configuration](docs/en/configuration.md)
|
||||
* [Developer documentation](/docs/en/developer-docs.md)
|
||||
|
||||
By default, only extraction from HTML documents is supported.
|
||||
No configuration is required for that, unless you want to make
|
||||
the content available through your `DataObject` subclass.
|
||||
In this case, add the following to `mysite/_config/config.yml`:
|
||||
## Bugtracker
|
||||
|
||||
```yaml
|
||||
File:
|
||||
extensions:
|
||||
- FileTextExtractable
|
||||
```
|
||||
Bugs are tracked in the issues section of this repository. Before submitting an issue please read over
|
||||
existing issues to ensure yours is unique.
|
||||
|
||||
By default any extracted content will be cached against the database row.
|
||||
In order to stay within common size constraints for SQL queries required in this operation,
|
||||
the cache sets a maximum character length after which content gets truncated (default: 500000).
|
||||
You can configure this value through `FileTextCache_Database.max_content_length` in your yaml configuration.
|
||||
If the issue does look like a new bug:
|
||||
|
||||
- Create a new issue
|
||||
- Describe the steps required to reproduce your issue, and the expected outcome. Unit tests, screenshots
|
||||
and screencasts can help here.
|
||||
- Describe your environment in as much detail as possible: Silverstripe version, browser, PHP version,
|
||||
Operating System, any installed Silverstripe modules.
|
||||
|
||||
Alternatively, extracted content can be cached using SS_Cache to prevent excessive database growth.
|
||||
In order to swap out the cache backend you can use the following yaml configuration.
|
||||
Please report security issues to security@silverstripe.org directly. Please don't file security issues in the bugtracker.
|
||||
|
||||
|
||||
```yaml
|
||||
---
|
||||
Name: mytextextraction
|
||||
After: '#textextraction'
|
||||
---
|
||||
Injector:
|
||||
FileTextCache: FileTextCache_SSCache
|
||||
FileTextCache_SSCache:
|
||||
lifetime: 3600 # Number of seconds to cache content for
|
||||
|
||||
```
|
||||
|
||||
### XPDF
|
||||
|
||||
PDFs require special handling, for example through the [XPDF](http://www.foolabs.com/xpdf/)
|
||||
commandline utility. Follow their installation instructions; its presence will be automatically
|
||||
detected. You can optionally set the binary path in `mysite/_config/config.yml`:
|
||||
|
||||
```yml
|
||||
PDFTextExtractor:
|
||||
binary_location: /my/path/pdftotext
|
||||
```
|
||||
|
||||
### Apache Solr
|
||||
|
||||
Apache Solr is a fulltext search engine, an aspect which is often used
|
||||
alongside this module. But more importantly for us, it has bindings to [Apache Tika](http://tika.apache.org/)
|
||||
through the [ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) interface.
|
||||
This allows Solr to inspect the contents of various file formats, such as Office documents and PDF files.
|
||||
The textextraction module retrieves the output of this service, rather than altering the index.
|
||||
With the raw text output, you can decide to store it in a database column for fulltext search
|
||||
in your database driver, or even pass it back to Solr as part of a full index update.
|
||||
|
||||
In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`):
|
||||
|
||||
```yml
|
||||
SolrCellTextExtractor:
|
||||
base_url: 'http://localhost:8983/solr/update/extract'
|
||||
```
|
||||
|
||||
Note that in case you're using multiple cores, you'll need to add the core name to the URL
|
||||
(e.g. 'http://localhost:8983/solr/PageSolrIndex/update/extract').
|
||||
The ["fulltext" module](https://github.com/silverstripe-labs/silverstripe-fulltextsearch)
|
||||
uses multiple cores by default, and comes prepackaged with a Solr server.
|
||||
It's a stripped-down version of Solr; follow the module README on how to add
|
||||
Apache Tika text extraction capabilities.
|
||||
|
||||
You need to ensure that some indexable property on your object
|
||||
returns the contents, either by directly accessing `FileTextExtractable->extractFileAsText()`,
|
||||
or by writing your own method around `FileTextExtractor->getContent()` (see "Usage" below).
|
||||
The property should be listed in your `SolrIndex` subclass, e.g. as follows:
|
||||
|
||||
```php
|
||||
class MyDocument extends DataObject {
|
||||
static $db = array('Path' => 'Text');
|
||||
function getContent() {
|
||||
$extractor = FileTextExtractor::for_file($this->Path);
|
||||
return $extractor ? $extractor->getContent($this->Path) : null;
|
||||
}
|
||||
}
|
||||
class MySolrIndex extends SolrIndex {
|
||||
function init() {
|
||||
$this->addClass('MyDocument');
|
||||
$this->addStoredField('Content', 'HTMLText');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Note: This isn't a terribly efficient way to process large amounts of files, since
|
||||
each HTTP request is run synchronously.
|
||||
|
||||
### Tika
|
||||
|
||||
Support for Apache Tika (1.8 and above) is included. This can be run in one of two ways: Server or CLI.
|
||||
|
||||
See [the Apache Tika home page](http://tika.apache.org/1.8/index.html) for instructions on installing and
|
||||
configuring this. Download the latest `tika-app` for running as a CLI script, or `tika-server` if you're planning
|
||||
to have it running constantly in the background. Starting tika as a CLI script for every extraction request
|
||||
is fairly slow, so we recommend running it as a server.
|
||||
|
||||
This extension will best work with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php)
|
||||
installed to perform mime detection. Tika validates support via mime type rather than file extensions.
|
||||
|
||||
### Tika - CLI
|
||||
|
||||
Ensure that your machine has a 'tika' command available which will run the CLI script.
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
exec java -jar tika-app-1.8.jar "$@"
|
||||
```
|
||||
|
||||
### Tika Rest Server
|
||||
|
||||
Tika can also be run as a server. You can configure your server endpoint by setting the url via config.
|
||||
|
||||
```yaml
|
||||
TikaServerTextExtractor:
|
||||
server_endpoint: 'http://localhost:9998'
|
||||
```
|
||||
|
||||
Alternatively this may be specified via the `SS_TIKA_ENDPOINT` directive in your `_ss_environment.php` file, or an environment variable of the same name.
|
||||
|
||||
|
||||
Then startup your server as below
|
||||
|
||||
```bash
|
||||
java -jar tika-server-1.8.jar --host=localhost --port=9998
|
||||
```
|
||||
|
||||
While you can run `tika-app-1.8.jar` in server mode as well (with the `--server` flag),
|
||||
it behaves differently and is not recommended.
|
||||
|
||||
The module will log extraction errors with `SS_Log::NOTICE` priority by default,
|
||||
for example a "422 Unprocessable Entity" HTTP response for an encrypted PDF.
|
||||
In case you want more information on why processing failed, you can increase
|
||||
the logging verbosity in the tika server instance by passing through
|
||||
a `--includeStack` flag. Logs can be passed on to files or external logging services,
|
||||
see [error handling](http://doc.silverstripe.org/en/developer_guides/debugging/error_handling)
|
||||
documentation for SilverStripe core.
|
||||
|
||||
## Usage
|
||||
|
||||
Manual extraction:
|
||||
|
||||
```php
|
||||
$myFile = '/my/path/myfile.pdf';
|
||||
$extractor = FileTextExtractor::for_file($myFile);
|
||||
$content = $extractor->getContent($myFile);
|
||||
```
|
||||
|
||||
Extraction with `FileTextExtractable` extension applied:
|
||||
|
||||
```php
|
||||
$myFileObj = File::get()->First();
|
||||
$content = $myFileObj->getFileContent();
|
||||
```
|
||||
|
||||
This content can also be embedded directly within a template.
|
||||
|
||||
```
|
||||
$MyFile.FileContent
|
||||
```
|
||||
## Development and contribution
|
||||
If you would like to make contributions to the module please ensure you raise a pull request and discuss
|
||||
with the module maintainers.
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
---
|
||||
Name: textextractioncache
|
||||
After:
|
||||
- '#corecache'
|
||||
---
|
||||
SilverStripe\Core\Injector\Injector:
|
||||
Psr\SimpleCache\CacheInterface.FileTextCache_Cache:
|
||||
factory: SilverStripe\Core\Cache\CacheFactory
|
||||
constructor:
|
||||
namespace: 'FileTextCache_Cache'
|
|
@ -1,11 +1,10 @@
|
|||
---
|
||||
Name: textextraction
|
||||
Name: textextractionconfig
|
||||
---
|
||||
Injector:
|
||||
FileTextCache: FileTextCache_Database
|
||||
SilverStripe\Core\Injector\Injector:
|
||||
# Define default FileTextCache implementation
|
||||
SilverStripe\TextExtraction\Cache\FileTextCache:
|
||||
class: SilverStripe\TextExtraction\Cache\FileTextCache\Database
|
||||
|
||||
SolrCellTextExtractor:
|
||||
# base_url: 'http://localhost:8983/solr/update/extract'
|
||||
|
||||
FileTextCache_Database:
|
||||
max_content_length: 500000
|
||||
SilverStripe\TextExtraction\Cache\FileTextCache\Database:
|
||||
max_content_length: 500000
|
||||
|
|
|
@ -1,112 +0,0 @@
|
|||
<?php
|
||||
|
||||
interface FileTextCache
{
    /**
     * Fetch whatever extracted content has been cached for the given file entity.
     *
     * @param File $file File entity whose cached content is requested
     */
    public function load(File $file);

    /**
     * Store extracted content against the given file entity.
     *
     * @param File $file File entity the content was extracted from
     * @param string $content Extracted text to cache
     */
    public function save(File $file, $content);

    /**
     * Discard the cached content of the given file.
     * Invoked in onBeforeWrite on the file.
     *
     * @param File $file File entity whose cache entry should be dropped
     */
    public function invalidate(File $file);
}
|
||||
|
||||
/**
 * Caches the extracted content on the record for the file.
 * Limits the stored file content by default to avoid hitting query size limits
 * (see the `max_content_length` config value).
 */
class FileTextCache_Database implements FileTextCache
{
    /**
     * Return the content cached on the file record.
     *
     * @param File $file
     * @return string|null Value of the record's FileContentCache field
     */
    public function load(File $file)
    {
        return $file->FileContentCache;
    }

    /**
     * Store the extracted content on the file record and write the record.
     *
     * When `max_content_length` is configured, the content is truncated to at
     * most that many bytes before being stored.
     *
     * @param File $file
     * @param string $content
     */
    public function save(File $file, $content)
    {
        $maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length');

        if ($maxLength) {
            if (function_exists('mb_strcut')) {
                // substr() counts bytes and may cut a multi-byte character in half,
                // leaving an invalid string for the database. mb_strcut() keeps the
                // same byte budget but backs off to the previous character boundary.
                // NOTE(review): assumes extracted content is UTF-8 - confirm.
                $content = mb_strcut($content, 0, $maxLength, 'UTF-8');
            } else {
                // mbstring not installed: fall back to plain byte-wise truncation
                $content = substr($content, 0, $maxLength);
            }
        }

        $file->FileContentCache = $content;
        $file->write();
    }

    /**
     * Clear the cached content on the record, unless the cache field itself is
     * what is currently being changed.
     *
     * @param File $file
     */
    public function invalidate(File $file)
    {
        // To prevent writing to the cache from invalidating it
        if ($file->isChanged('FileContentCache')) {
            return;
        }

        $file->FileContentCache = '';
    }
}
|
||||
|
||||
/**
 * FileTextCache implementation backed by SS_Cache, with a configurable lifetime
 * for the cached extracted content.
 */
class FileTextCache_SSCache implements FileTextCache, Flushable
{
    /**
     * How long cached content lives, in seconds.
     * Null means it is kept indefinitely.
     *
     * @var int|null
     * @config
     */
    private static $lifetime = null;

    /**
     * Build the cache backend for this class and apply the configured lifetime.
     *
     * @return SS_Cache
     */
    protected static function get_cache()
    {
        $ttl = Config::inst()->get(__CLASS__, 'lifetime');
        $backend = SS_Cache::factory(__CLASS__);
        $backend->setLifetime($ttl);

        return $backend;
    }

    /**
     * Derive the cache key for a file: the md5 hash of its full path.
     *
     * @param File $file
     * @return string
     */
    protected function getKey(File $file)
    {
        $fullPath = $file->getFullPath();

        return md5($fullPath);
    }

    /**
     * Read the cached content for a file from the cache backend.
     *
     * @param File $file
     * @return mixed Whatever the backend's load() returns for the file's key
     */
    public function load(File $file)
    {
        $cacheKey = $this->getKey($file);

        return self::get_cache()->load($cacheKey);
    }

    /**
     * Write extracted content for a file to the cache backend.
     *
     * @param File $file
     * @param string $content
     * @return mixed Result of the backend's save() call
     */
    public function save(File $file, $content)
    {
        $cacheKey = $this->getKey($file);

        return self::get_cache()->save($content, $cacheKey);
    }

    /**
     * Flushable hook: cleans the cache backend.
     */
    public static function flush()
    {
        self::get_cache()->clean();
    }

    /**
     * Remove the cache entry belonging to a file.
     *
     * @param File $file
     * @return mixed Result of the backend's remove() call
     */
    public function invalidate(File $file)
    {
        $cacheKey = $this->getKey($file);

        return self::get_cache()->remove($cacheKey);
    }
}
|
|
@ -1,77 +0,0 @@
|
|||
<?php
|
||||
|
||||
/**
 * Text extractor that uses php function strip_tags to get just the text.
 * OK for indexing, not the best for readable text.
 *
 * @author mstephens
 */
class HTMLTextExtractor extends FileTextExtractor
{
    /**
     * Lower priority because its not the most clever HTML extraction. If there is something better, use it
     *
     * @config
     * @var integer
     */
    private static $priority = 10;

    /**
     * This extractor relies only on built-in PHP functions, so it is always usable.
     *
     * @return bool
     */
    public function isAvailable()
    {
        return true;
    }

    /**
     * @param string $extension File extension without the leading dot
     * @return bool True for html, htm and xhtml (case-insensitive)
     */
    public function supportsExtension($extension)
    {
        return in_array(
            strtolower($extension),
            array('html', 'htm', 'xhtml'),
            true
        );
    }

    /**
     * @param string $mime
     * @return bool True only for text/html (case-insensitive)
     */
    public function supportsMime($mime)
    {
        return strtolower($mime) === 'text/html';
    }

    /**
     * Extracts content by using strip_tags() combined with regular expressions
     * to remove non-content tags like <style> or <script>, as well as adding
     * line breaks around block tags so adjacent blocks do not run together.
     *
     * @param string $path Path of the HTML file to read
     * @return string Text content; empty string if the file cannot be read
     */
    public function getContent($path)
    {
        $content = file_get_contents($path);
        if ($content === false || $content === '') {
            return '';
        }

        // Yes, yes, regex'ing HTML is evil.
        // Since we don't care about well-formedness or markup here, it does the job.
        $cleaned = $this->replaceMarkup($content, 'u');
        if ($cleaned === null) {
            // With the "u" modifier PCRE rejects the whole subject when it is not
            // valid UTF-8 and preg_replace() returns null. The patterns are plain
            // ASCII, so retry byte-wise instead of silently returning nothing.
            $cleaned = $this->replaceMarkup($content, '');
        }
        if ($cleaned === null) {
            // Still failing (e.g. backtrack limit): fall back to the raw markup
            $cleaned = $content;
        }

        return strip_tags($cleaned);
    }

    /**
     * Blank out invisible elements and put a line break in front of block tags.
     *
     * @param string $content Raw HTML
     * @param string $extraModifiers Additional PCRE modifiers appended to every pattern ("u" or "")
     * @return string|null Processed markup, or null if preg_replace() reported an error
     */
    protected function replaceMarkup($content, $extraModifiers)
    {
        // Elements whose content is never visible text: replaced by a single space
        $invisible = array(
            '@<head[^>]*?>.*?</head>@si',
            '@<style[^>]*?>.*?</style>@si',
            '@<script[^>]*?.*?</script>@si',
            '@<object[^>]*?.*?</object>@si',
            '@<embed[^>]*?.*?</embed>@si',
            '@<applet[^>]*?.*?</applet>@si',
            '@<noframes[^>]*?.*?</noframes>@si',
            '@<noscript[^>]*?.*?</noscript>@si',
            '@<noembed[^>]*?.*?</noembed>@si',
        );

        // Opening/closing block tags: kept, but prefixed with a line break so that
        // text from neighbouring blocks stays separated once the tags are stripped
        $blocks = array(
            '@</?((address)|(blockquote)|(center)|(del))@i',
            '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@i',
            '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@i',
            '@</?((table)|(th)|(td)|(caption))@i',
            '@</?((form)|(button)|(fieldset)|(legend)|(input))@i',
            '@</?((label)|(select)|(optgroup)|(option)|(textarea))@i',
            '@</?((frameset)|(frame)|(iframe))@i',
        );

        $patterns = array();
        $replacements = array();
        foreach ($invisible as $pattern) {
            $patterns[] = $pattern . $extraModifiers;
            $replacements[] = ' ';
        }
        foreach ($blocks as $pattern) {
            $patterns[] = $pattern . $extraModifiers;
            $replacements[] = "\n\$0";
        }

        return preg_replace($patterns, $replacements, $content);
    }
}
|
|
@ -1,107 +0,0 @@
|
|||
<?php
|
||||
|
||||
/**
 * Text extractor that calls pdftotext to do the conversion.
 *
 * @author mstephens
 */
class PDFTextExtractor extends FileTextExtractor
{
    /**
     * Available when the resolved pdftotext binary exists and is executable.
     *
     * @return bool
     */
    public function isAvailable()
    {
        $bin = $this->bin('pdftotext');
        return (file_exists($bin) && is_executable($bin));
    }

    /**
     * @param string $extension File extension without the leading dot
     * @return bool True only for "pdf" (case-insensitive)
     */
    public function supportsExtension($extension)
    {
        return strtolower($extension) === 'pdf';
    }

    /**
     * @param string $mime
     * @return bool True for the known PDF mime types (case-insensitive)
     */
    public function supportsMime($mime)
    {
        return in_array(
            strtolower($mime),
            array(
                'application/pdf',
                'application/x-pdf',
                'application/x-bzpdf',
                'application/x-gzpdf'
            ),
            true
        );
    }

    /**
     * Accessor to get the location of the binary
     *
     * The directory is taken from the `binary_location` config if set, otherwise
     * from the first of /usr/bin and /usr/local/bin that contains pdftotext,
     * otherwise the current directory is used.
     *
     * @param string $prog Name of binary
     * @return string Directory joined with $prog
     */
    protected function bin($prog = '')
    {
        $configured = $this->config()->binary_location;

        if ($configured) {
            // By config
            $path = $configured;
        } elseif (file_exists('/usr/bin/pdftotext')) {
            // By searching common directories
            $path = '/usr/bin';
        } elseif (file_exists('/usr/local/bin/pdftotext')) {
            $path = '/usr/local/bin';
        } else {
            $path = '.'; // Hope it's in path
        }

        return ($path ? $path . '/' : '') . $prog;
    }

    /**
     * Extract the text of a PDF file, with typographic ligatures expanded.
     *
     * @param string $path Path of the PDF file
     * @return string Extracted text; empty string when no path is given
     * @throws FileTextExtractor_Exception If pdftotext exits with an error
     */
    public function getContent($path)
    {
        if (!$path) {
            return ""; // no file
        }

        $content = $this->getRawOutput($path);
        return $this->cleanupLigatures($content);
    }

    /**
     * Invoke pdftotext with the given path
     *
     * @param string $path
     * @return string Output
     * @throws FileTextExtractor_Exception
     */
    protected function getRawOutput($path)
    {
        $command = sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path));
        exec($command, $output, $exitCode);

        if ($exitCode) {
            // exec() hands back an integer exit status, not a list of messages.
            // stderr is redirected into stdout above, so the captured output lines
            // hold whatever pdftotext reported.
            throw new FileTextExtractor_Exception(sprintf(
                'PDFTextExtractor->getContent() failed for %s: %s',
                $path,
                implode(PHP_EOL, $output)
            ));
        }

        // exec() strips the trailing whitespace of every line; joining with a line
        // break keeps the last word of one line apart from the first word of the next
        return implode(PHP_EOL, $output);
    }

    /**
     * Removes utf-8 ligatures.
     *
     * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
     *
     * @param string $input
     * @return string Input with ligature characters expanded to plain letters
     */
    protected function cleanupLigatures($input)
    {
        // Keys are spelled as UTF-8 byte sequences so the ligature code points
        // survive editors and tools that normalise them to plain letters
        $mapping = array(
            "\xEF\xAC\x80" => 'ff',  // U+FB00
            "\xEF\xAC\x81" => 'fi',  // U+FB01
            "\xEF\xAC\x82" => 'fl',  // U+FB02
            "\xEF\xAC\x83" => 'ffi', // U+FB03
            "\xEF\xAC\x84" => 'ffl', // U+FB04
            // NOTE(review): U+FB05 is the "long s + t" ligature; the existing
            // replacement 'ft' is kept as-is - confirm whether 'st' was intended
            "\xEF\xAC\x85" => 'ft',  // U+FB05
            "\xEF\xAC\x86" => 'st'   // U+FB06
        );

        return str_replace(array_keys($mapping), array_values($mapping), $input);
    }
}
|
|
@ -1,102 +0,0 @@
|
|||
<?php
|
||||
use Guzzle\Http\Client;
|
||||
|
||||
/**
 * Extracts file content by posting the file to an Apache Solr instance's
 * "ExtractingRequestHandler" endpoint.
 * The Solr index itself is not altered; Solr is used purely
 * for its file parsing abilities.
 *
 * @author ischommer
 * @see http://wiki.apache.org/solr/ExtractingRequestHandler
 */
class SolrCellTextExtractor extends FileTextExtractor
{
    /**
     * Base URL to use for solr text extraction.
     * E.g. http://localhost:8983/solr/update/extract
     *
     * @config
     * @var string
     */
    private static $base_url;

    /**
     * @config
     * @var integer
     */
    private static $priority = 75;

    /**
     * Lazily created HTTP client (see getHttpClient()), or one injected through setHttpClient()
     */
    protected $httpClient;

    /**
     * Return the HTTP client, creating one for the configured base URL on first use.
     *
     * @return Client
     * @throws InvalidArgumentException If no base_url is configured
     */
    public function getHttpClient()
    {
        $baseUrl = $this->config()->get('base_url');
        if (!$baseUrl) {
            throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
        }

        if ($this->httpClient) {
            return $this->httpClient;
        }

        $this->httpClient = new Client($this->config()->get('base_url'));

        return $this->httpClient;
    }

    /**
     * Replace the HTTP client used for extraction requests.
     *
     * @param mixed $client
     */
    public function setHttpClient($client)
    {
        $this->httpClient = $client;
    }

    /**
     * The extractor is usable as soon as a base URL is configured.
     *
     * @return bool
     */
    public function isAvailable()
    {
        return (bool) $this->config()->get('base_url');
    }

    /**
     * @param string $extension File extension without the leading dot
     * @return bool Whether the extension is in the supported list (case-insensitive)
     */
    public function supportsExtension($extension)
    {
        $supported = array(
            'pdf', 'doc', 'docx', 'xls', 'xlsx',
            'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
            'ppt', 'pptx', 'odp', 'fodp', 'csv'
        );

        return in_array(strtolower($extension), $supported);
    }

    /**
     * Mime types are never matched; support is decided by supportsExtension().
     *
     * @param string $mime
     * @return bool Always false
     */
    public function supportsMime($mime)
    {
        return false;
    }

    /**
     * Post the file to Solr and pull the extracted text out of the response body.
     *
     * @param string $path Path of the file to extract
     * @return string|null Extracted text; empty string when no path is given; null when
     *                     the request raised an InvalidArgumentException or the response
     *                     holds no entry for the file name
     */
    public function getContent($path)
    {
        // no file
        if (!$path) {
            return "";
        }

        $fileName = basename($path);
        $client = $this->getHttpClient();

        try {
            $request = $client->post();
            $request = $request->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text'));
            $request = $request->addPostFiles(array('myfile' => $path));
            $response = $request->send();
        } catch (InvalidArgumentException $e) {
            $message = sprintf(
                'Error extracting text from "%s" (message: %s)',
                $path,
                $e->getMessage()
            );
            SS_Log::log($message, SS_Log::NOTICE);

            return null;
        }

        // Use preg match to avoid SimpleXML running out of memory on large text nodes
        $pattern = sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName));
        $body = (string)$response->getBody();
        preg_match($pattern, $body, $matches);

        if (!$matches) {
            return null;
        }

        return $matches[1];
    }
}
|
|
@ -1,116 +0,0 @@
|
|||
<?php
|
||||
|
||||
/**
 * Text extractor that hands files to a running Tika Rest Server
 *
 * {@link http://tika.apache.org/1.7/gettingstarted.html}
 */
class TikaServerTextExtractor extends FileTextExtractor
{
    /**
     * High priority: the Tika server is efficient, so prefer it whenever it is available
     *
     * @var integer
     * @config
     */
    private static $priority = 80;

    /**
     * Server endpoint
     *
     * @var string
     * @config
     */
    private static $server_endpoint;

    /**
     * Lazily created REST client, see getClient()
     *
     * @var TikaRestClient
     */
    protected $client = null;

    /**
     * Cache of supported mime types
     *
     * @var array
     */
    protected $supportedMimes = array();

    /**
     * Returns the REST client, creating it through the Injector on first use.
     *
     * @return TikaRestClient
     */
    public function getClient()
    {
        if (!$this->client) {
            $this->client = Injector::inst()->createWithArgs(
                'TikaRestClient',
                array($this->getServerEndpoint())
            );
        }

        return $this->client;
    }

    /**
     * Resolves the server endpoint. The SS_TIKA_ENDPOINT constant wins, then the
     * SS_TIKA_ENDPOINT environment variable, then the `server_endpoint` config value.
     *
     * @return string|null
     */
    public function getServerEndpoint()
    {
        if (defined('SS_TIKA_ENDPOINT')) {
            return SS_TIKA_ENDPOINT;
        }

        $fromEnvironment = getenv('SS_TIKA_ENDPOINT');
        if ($fromEnvironment) {
            return $fromEnvironment;
        }

        // Default to configured endpoint
        return $this->config()->server_endpoint;
    }

    /**
     * Get the version of tika installed, or 0 if not installed
     *
     * @return float version of tika
     */
    public function getVersion()
    {
        $client = $this->getClient();

        return $client->getVersion();
    }

    /**
     * Available when an endpoint is set, the client reports the service as reachable,
     * and the reported version is at least 1.7.
     *
     * @return boolean
     */
    public function isAvailable()
    {
        if (!$this->getServerEndpoint()) {
            return false;
        }

        if (!$this->getClient()->isAvailable()) {
            return false;
        }

        return $this->getVersion() >= 1.7;
    }

    /**
     * Extensions are never matched; support is determined via mime type only.
     *
     * @param string $extension Ignored
     * @return boolean Always false
     */
    public function supportsExtension($extension)
    {
        return false;
    }

    /**
     * Checks the given mime type against the types (and their aliases) reported by the client.
     * The reported list is kept on this instance once it is non-empty.
     *
     * @param string $mime
     * @return boolean
     */
    public function supportsMime($mime)
    {
        if (!$this->supportedMimes) {
            $this->supportedMimes = $this->getClient()->getSupportedMimes();
        }
        $supported = $this->supportedMimes;

        // Check if supported (most common / quickest lookup)
        if (isset($supported[$mime])) {
            return true;
        }

        // Check aliases
        foreach ($supported as $info) {
            if (!isset($info['alias'])) {
                continue;
            }
            if (in_array($mime, $info['alias'])) {
                return true;
            }
        }

        return false;
    }

    /**
     * Extracts the text of the file at the given path through the client's tika() call.
     *
     * @param string $path
     * @return string|null Whatever the client's tika() call returns
     */
    public function getContent($path)
    {
        return $this->getClient()->tika($path);
    }
}
|
|
@ -1,99 +0,0 @@
|
|||
<?php
|
||||
|
||||
use Guzzle\Http\Client;
|
||||
use Guzzle\Http\Exception\RequestException;
|
||||
|
||||
/**
 * HTTP client for the Tika REST server: availability check, version lookup,
 * supported mime types and text extraction.
 */
class TikaRestClient extends Client
{
    /**
     * Supported mime data as returned by the server; empty until getSupportedMimes() has fetched it
     *
     * @var array
     */
    protected $mimes = array();

    /**
     * Detect if the service is available
     *
     * @return bool True when a GET on the client's base URL answers with status 200;
     *              false when the request raises a RequestException
     */
    public function isAvailable()
    {
        try {
            return $this
                ->get()->send()
                ->getStatusCode() == 200;
        } catch (RequestException $ex) {
            return false;
        }
    }

    /**
     * Get version code
     *
     * NOTE(review): the version string is cast to float, so a two-digit minor version such as
     * "1.10" becomes 1.1 and compares lower than 1.7. Confirm whether callers need a
     * version_compare()-style check instead.
     *
     * @return float Parsed version, or 0.0 when the response is not a 200 or has no version string
     */
    public function getVersion()
    {
        $response = $this->get('version')->send();
        // Parse output
        if ($response->getStatusCode() == 200 &&
            preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)
        ) {
            return (float)$matches['version'];
        }

        return 0.0;
    }

    /**
     * Gets supported mime data. May include aliased mime types.
     * The decoded response is kept on the instance and reused while it is non-empty.
     *
     * @return array
     */
    public function getSupportedMimes()
    {
        if ($this->mimes) {
            return $this->mimes;
        }

        $response = $this->get(
            'mime-types',
            array('Accept' => 'application/json')
        )->send();

        return $this->mimes = $response->json();
    }

    /**
     * Extract text content from a given file.
     * Logs a notice-level error if the document can't be parsed.
     *
     * @param string $file Full filesystem path to a file to post
     * @return string|null Content of the file extracted as plain text, or null when the request failed
     */
    public function tika($file)
    {
        $text = null;
        try {
            $response = $this->put(
                'tika',
                array('Accept' => 'text/plain'),
                file_get_contents($file)
            )->send();
            $text = $response->getBody(true);
        } catch (RequestException $e) {
            // Not every RequestException carries a response: transport-level failures
            // (no HTTP answer at all) do not. Calling getResponse() blindly would turn
            // a loggable error into a fatal one.
            $failedResponse = method_exists($e, 'getResponse') ? $e->getResponse() : null;

            if ($failedResponse) {
                $msg = sprintf(
                    'TikaRestClient was not able to process %s. Response: %s %s.',
                    $file,
                    $failedResponse->getStatusCode(),
                    $failedResponse->getReasonPhrase()
                );

                // Only available if tika-server was started with --includeStack
                $body = $failedResponse->getBody(true);
                if ($body) {
                    $msg .= ' Body: ' . $body;
                }
            } else {
                $msg = sprintf(
                    'TikaRestClient was not able to process %s. Error: %s',
                    $file,
                    $e->getMessage()
                );
            }

            SS_Log::log($msg, SS_Log::NOTICE);
        }

        return $text;
    }
}
|
|
@ -0,0 +1,3 @@
|
|||
comment: false
|
||||
codecov:
|
||||
branch: master
|
|
@ -1,37 +1,45 @@
|
|||
{
|
||||
"name": "silverstripe/textextraction",
|
||||
"type": "silverstripe-module",
|
||||
"description": "Text Extraction API for SilverStripe CMS (mostly used with 'fulltextsearch' module)",
|
||||
"homepage": "http://silverstripe.org",
|
||||
"license": "BSD-3-Clause",
|
||||
"keywords": ["silverstripe", "fulltext", "pdf"],
|
||||
"authors": [
|
||||
{
|
||||
"name": "SilverStripe",
|
||||
"homepage": "http://silverstripe.com"
|
||||
},
|
||||
{
|
||||
"name": "The SilverStripe Community",
|
||||
"homepage": "http://silverstripe.org"
|
||||
}
|
||||
],
|
||||
"require": {
|
||||
"php": ">=5.3.2",
|
||||
"composer/installers": "*",
|
||||
"silverstripe/framework": "~3.1",
|
||||
"guzzle/guzzle": "~3.9",
|
||||
"symfony/event-dispatcher": "~2.6.0@stable",
|
||||
"symfony/http-foundation": "~2.6.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpunit/phpunit": "~3.7"
|
||||
},
|
||||
"suggest": {
|
||||
"ext-fileinfo": "Improved support for file mime detection"
|
||||
},
|
||||
"extra": {
|
||||
"branch-alias": {
|
||||
"dev-master": "2.0.x-dev"
|
||||
}
|
||||
}
|
||||
}
|
||||
"name": "silverstripe/textextraction",
|
||||
"type": "silverstripe-vendormodule",
|
||||
"description": "Text Extraction API for SilverStripe CMS (mostly used with 'fulltextsearch' module)",
|
||||
"homepage": "http://silverstripe.org",
|
||||
"license": "BSD-3-Clause",
|
||||
"keywords": [
|
||||
"silverstripe",
|
||||
"fulltext",
|
||||
"pdf"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "SilverStripe",
|
||||
"homepage": "http://silverstripe.com"
|
||||
},
|
||||
{
|
||||
"name": "The SilverStripe Community",
|
||||
"homepage": "http://silverstripe.org"
|
||||
}
|
||||
],
|
||||
"require": {
|
||||
"php": "^7.4 || ^8.0",
|
||||
"silverstripe/framework": "^4.10",
|
||||
"silverstripe/assets": "^1",
|
||||
"silverstripe/versioned": "^1",
|
||||
"guzzlehttp/guzzle": "^6.3 || ^7.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"squizlabs/php_codesniffer": "^3",
|
||||
"phpunit/phpunit": "^9.5"
|
||||
},
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"SilverStripe\\TextExtraction\\": "src/",
|
||||
"SilverStripe\\TextExtraction\\Tests\\": "tests/"
|
||||
}
|
||||
},
|
||||
"suggest": {
|
||||
"ext-fileinfo": "Improved support for file mime detection"
|
||||
},
|
||||
"extra": [],
|
||||
"minimum-stability": "dev",
|
||||
"prefer-stable": true
|
||||
}
|
|
@ -0,0 +1,160 @@
|
|||
# Configuration
|
||||
|
||||
## Basic
|
||||
|
||||
By default, only extraction from HTML documents is supported.
|
||||
No configuration is required for that, unless you want to make
|
||||
the content available through your `DataObject` subclass.
|
||||
In this case, add the following to `mysite/_config/config.yml`:
|
||||
|
||||
```yaml
|
||||
SilverStripe\Assets\File:
|
||||
extensions:
|
||||
- SilverStripe\TextExtraction\Extension\FileTextExtractable
|
||||
```
|
||||
|
||||
By default any extracted content will be cached against the database row. In order to stay within common size
|
||||
constraints for SQL queries required in this operation, the cache sets a maximum character length after which
|
||||
content gets truncated (default: 500000). You can configure this value through
|
||||
`SilverStripe\TextExtraction\Cache\FileTextCache\Database.max_content_length` in your YAML configuration.
|
||||
|
||||
Alternatively, extracted content can be cached using the framework's PSR-16 cache to prevent excessive database growth.
|
||||
In order to swap out the cache backend you can use the following yaml configuration.
|
||||
|
||||
```yaml
|
||||
---
|
||||
Name: mytextextraction
|
||||
After: '#textextraction'
|
||||
---
|
||||
SilverStripe\Core\Injector\Injector:
|
||||
SilverStripe\TextExtraction\Cache\FileTextCache:
|
||||
class: SilverStripe\TextExtraction\Cache\FileTextCache\Cache
|
||||
|
||||
SilverStripe\TextExtraction\Cache\FileTextCache\Cache:
|
||||
lifetime: 3600 # Number of seconds to cache content for
|
||||
```
|
||||
|
||||
## XPDF
|
||||
|
||||
PDFs require special handling, for example through the [XPDF](http://www.xpdfreader.com/)
|
||||
commandline utility. Follow their installation instructions; its presence will be automatically
|
||||
detected for \*nix operating systems. You can optionally set the binary path (required for Windows) in `mysite/_config/config.yml`:
|
||||
|
||||
```yml
|
||||
SilverStripe\TextExtraction\Extractor\PDFTextExtractor:
|
||||
binary_location: /my/path/pdftotext
|
||||
```
|
||||
|
||||
## Apache Solr
|
||||
|
||||
Apache Solr is a fulltext search engine, an aspect which is often used
|
||||
alongside this module. But more importantly for us, it has bindings to [Apache Tika](http://tika.apache.org/)
|
||||
through the [ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) interface.
|
||||
This allows Solr to inspect the contents of various file formats, such as Office documents and PDF files.
|
||||
The textextraction module retrieves the output of this service, rather than altering the index.
|
||||
With the raw text output, you can decide to store it in a database column for fulltext search
|
||||
in your database driver, or even pass it back to Solr as part of a full index update.
|
||||
|
||||
In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`):
|
||||
|
||||
```yml
|
||||
SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor:
|
||||
base_url: 'http://localhost:8983/solr/update/extract'
|
||||
```
|
||||
|
||||
Note that in case you're using multiple cores, you'll need to add the core name to the URL
|
||||
(e.g. 'http://localhost:8983/solr/PageSolrIndex/update/extract').
|
||||
The ["fulltext" module](https://github.com/silverstripe-labs/silverstripe-fulltextsearch)
|
||||
uses multiple cores by default, and comes prepackaged with a Solr server.
|
||||
It's a stripped-down version of Solr; follow the module README on how to add
|
||||
Apache Tika text extraction capabilities.
|
||||
|
||||
You need to ensure that some indexable property on your object
|
||||
returns the contents, either by directly accessing `FileTextExtractable->extractFileAsText()`,
|
||||
or by writing your own method around `FileTextExtractor->getContent()` (see "Usage" below).
|
||||
The property should be listed in your `SolrIndex` subclass, e.g. as follows:
|
||||
|
||||
```php
|
||||
use SilverStripe\ORM\DataObject;
|
||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
||||
|
||||
class MyDocument extends DataObject
|
||||
{
|
||||
private static $db = ['Path' => 'Text'];
|
||||
|
||||
public function getContent()
|
||||
{
|
||||
$extractor = FileTextExtractor::for_file($this->Path);
|
||||
return $extractor ? $extractor->getContent($this->Path) : null;
|
||||
}
|
||||
}
|
||||
|
||||
use SilverStripe\FullTextSearch\Solr\SolrIndex;
|
||||
|
||||
class MySolrIndex extends SolrIndex
|
||||
{
|
||||
public function init()
|
||||
{
|
||||
$this->addClass(MyDocument::class);
|
||||
$this->addStoredField('Content', 'HTMLText');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Extractors will return content formatted with new line characters at the end of each extracted line. If you want
|
||||
this to be used in HTML content it may be worth wrapping the result in a `nl2br()` call before using it in your
|
||||
code.
|
||||
|
||||
Note: This isn't a terribly efficient way to process large amounts of files, since
|
||||
each HTTP request is run synchronously.
|
||||
|
||||
## Tika
|
||||
|
||||
Support for Apache Tika (1.8 and above) is included. This can be run in one of two ways: Server or CLI.
|
||||
|
||||
See [the Apache Tika home page](http://tika.apache.org/1.8/index.html) for instructions on installing and
|
||||
configuring this. Download the latest `tika-app` for running as a CLI script, or `tika-server` if you're planning
|
||||
to have it running constantly in the background. Starting tika as a CLI script for every extraction request
|
||||
is fairly slow, so we recommend running it as a server.
|
||||
|
||||
This extension works best with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php)
|
||||
installed to perform mime detection. Tika validates support via mime type rather than file extensions.
|
||||
|
||||
## Tika - CLI
|
||||
|
||||
Ensure that your machine has a 'tika' command available which will run the CLI script.
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
exec java -jar tika-app-1.8.jar "$@"
|
||||
```
|
||||
|
||||
## Tika Rest Server
|
||||
|
||||
Tika can also be run as a server. You can configure your server endpoint by setting the url via config.
|
||||
|
||||
```yaml
|
||||
SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor:
|
||||
server_endpoint: 'http://localhost:9998'
|
||||
```
|
||||
|
||||
Alternatively this may be specified via the `SS_TIKA_ENDPOINT` environment variable in your `.env` file, or an
|
||||
environment variable of the same name.
|
||||
|
||||
|
||||
Then startup your server as below:
|
||||
|
||||
```bash
|
||||
java -jar tika-server-1.8.jar --host=localhost --port=9998
|
||||
```
|
||||
|
||||
While you can run `tika-app-1.8.jar` in server mode as well (with the `--server` flag),
|
||||
it behaves differently and is not recommended.
|
||||
|
||||
The module will log extraction errors with PSR-3 "notice" priority by default,
|
||||
for example a "422 Unprocessable Entity" HTTP response for an encrypted PDF.
|
||||
In case you want more information on why processing failed, you can increase
|
||||
the logging verbosity in the tika server instance by passing through
|
||||
a `--includeStack` flag. Logs can be passed on to files or external logging services,
|
||||
see [error handling](http://doc.silverstripe.org/en/developer_guides/debugging/error_handling)
|
||||
documentation for SilverStripe core.
|
|
@ -0,0 +1,32 @@
|
|||
# Developer documentation
|
||||
|
||||
## Usage
|
||||
|
||||
Manual extraction via string file path:
|
||||
|
||||
```php
|
||||
$myFile = '/my/path/myfile.pdf';
|
||||
$extractor = FileTextExtractor::for_file($myFile);
|
||||
$content = $extractor->getContent($myFile);
|
||||
```
|
||||
|
||||
Manual extraction via File object:
|
||||
|
||||
```php
|
||||
$myFile = File::get()->filter(['Name' => 'My file'])->first();
|
||||
$extractor = FileTextExtractor::for_file($myFile);
|
||||
$content = $extractor->getContent($myFile);
|
||||
```
|
||||
|
||||
Extraction with `FileTextExtractable` extension applied:
|
||||
|
||||
```php
|
||||
$myFileObj = File::get()->First();
|
||||
$content = $myFileObj->getFileContent();
|
||||
```
|
||||
|
||||
This content can also be embedded directly within a template.
|
||||
|
||||
```
|
||||
$MyFile.FileContent
|
||||
```
|
|
@ -1,4 +1,4 @@
|
|||
Copyright (c) 2015, SilverStripe Limited
|
||||
Copyright (c) 2018, SilverStripe Limited
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<ruleset name="SilverStripe">
|
||||
<description>CodeSniffer ruleset for SilverStripe coding conventions.</description>
|
||||
|
||||
<file>src</file>
|
||||
<file>tests</file>
|
||||
|
||||
<!-- base rules are PSR-2 -->
|
||||
<rule ref="PSR2" >
|
||||
<!-- Current exclusions -->
|
||||
<exclude name="PSR1.Methods.CamelCapsMethodName.NotCamelCaps" />
|
||||
</rule>
|
||||
</ruleset>
|
|
@ -0,0 +1,17 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<phpunit bootstrap="vendor/silverstripe/framework/tests/bootstrap.php" colors="true">
|
||||
<testsuites>
|
||||
<testsuite name="Default">
|
||||
<directory>tests/</directory>
|
||||
</testsuite>
|
||||
</testsuites>
|
||||
|
||||
<filter>
|
||||
<whitelist addUncoveredFilesFromWhitelist="true">
|
||||
<directory suffix=".php">src/</directory>
|
||||
<exclude>
|
||||
<directory suffix=".php">tests/</directory>
|
||||
</exclude>
|
||||
</whitelist>
|
||||
</filter>
|
||||
</phpunit>
|
|
@ -0,0 +1,31 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Cache;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
|
||||
/**
 * Contract for backends that cache the text extracted from a File.
 */
interface FileTextCache
|
||||
{
|
||||
/**
|
||||
* Save extracted content for a given File entity
|
||||
*
|
||||
* @param File $file
|
||||
* @param string $content
|
||||
* @return mixed Not fixed by this interface - implementations may return a status or nothing
*/
|
||||
public function save(File $file, $content);
|
||||
|
||||
/**
|
||||
* Return any cached extracted content for a given file entity
|
||||
*
|
||||
* @param File $file
|
||||
* @return mixed The cached content; NOTE(review): the value used for "nothing cached" is implementation-specific
*/
|
||||
public function load(File $file);
|
||||
|
||||
/**
|
||||
* Invalidate the cache for a given file.
|
||||
* Invoked in onBeforeWrite on the file
|
||||
*
|
||||
* @param File $file
|
||||
* @return mixed Not fixed by this interface
*/
|
||||
public function invalidate(File $file);
|
||||
}
|
|
@ -0,0 +1,107 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||
|
||||
use Psr\SimpleCache\CacheInterface;
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Core\Config\Configurable;
|
||||
use SilverStripe\Core\Flushable;
|
||||
use SilverStripe\Core\Injector\Injector;
|
||||
use SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||
|
||||
/**
 * Caches extracted content in a PSR-16 cache, with a configurable lifetime per entry
 */
class Cache implements FileTextCache, Flushable
{
    use Configurable;

    /**
     * How long, in seconds, saved content stays cached.
     * When left empty, save() falls back to 3600 (1 hour).
     *
     * @var int|null
     * @config
     */
    private static $lifetime = null;

    /**
     * Fetches the cache backend registered with the Injector for this class.
     *
     * @return CacheInterface
     */
    protected static function get_cache()
    {
        return Injector::inst()->get(
            sprintf('%s.%s', CacheInterface::class, 'FileTextCache_Cache')
        );
    }

    /**
     * Builds the cache key for a file: the md5 hash of its filename.
     *
     * @param File $file
     * @return string
     */
    protected function getKey(File $file)
    {
        $filename = $file->getFilename() ?? '';

        return md5($filename);
    }

    /**
     * Looks up the content cached for the given file.
     *
     * @param File $file
     * @return mixed Whatever the cache backend returns for the file's key
     */
    public function load(File $file)
    {
        $cacheKey = $this->getKey($file);

        return self::get_cache()->get($cacheKey);
    }

    /**
     * Stores content for the given file, using the configured lifetime (or 3600 seconds).
     *
     * @param File $file
     * @param string $content
     * @return mixed Result of the cache backend's set() call (a bool under PSR-16)
     */
    public function save(File $file, $content)
    {
        $ttl = $this->config()->get('lifetime');
        if (!$ttl) {
            $ttl = 3600;
        }
        $cacheKey = $this->getKey($file);

        return self::get_cache()->set($cacheKey, $content, $ttl);
    }

    /**
     * Empties the whole cache backend.
     *
     * @return void
     */
    public static function flush()
    {
        self::get_cache()->clear();
    }

    /**
     * Alias for self::flush()
     *
     * @return void
     */
    public static function clear()
    {
        self::flush();
    }

    /**
     * Removes the entry cached for the given file.
     *
     * @param File $file
     * @return bool
     */
    public function invalidate(File $file)
    {
        $cacheKey = $this->getKey($file);

        return self::get_cache()->delete($cacheKey);
    }
}
|
|
@ -0,0 +1,55 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Core\Config\Configurable;
|
||||
use SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||
|
||||
/**
 * Caches the extracted content on the record for the file.
 * Limits the stored file content by default to avoid hitting query size limits.
 */
class Database implements FileTextCache
{
    use Configurable;

    /**
     * Upper bound, in bytes, for the content stored by save(); longer content is truncated.
     * A falsy value (such as the in-code default of null) disables truncation.
     *
     * @config
     * @var int|null
     */
    private static $max_content_length = null;

    /**
     * Reads the cached content straight from the file record.
     *
     * @param File $file
     * @return string|null Value of the record's FileContentCache field
     */
    public function load(File $file)
    {
        return $file->FileContentCache;
    }

    /**
     * Stores the content on the file record, truncated to `max_content_length` when that is set,
     * and writes the record.
     *
     * @param File $file
     * @param mixed $content
     */
    public function save(File $file, $content)
    {
        $maxLength = $this->config()->get('max_content_length');
        if ($maxLength) {
            $content = $this->truncate((string) $content, (int) $maxLength);
        }

        $file->FileContentCache = $content;
        $file->write();
    }

    /**
     * @param File $file
     * @return void
     */
    public function invalidate(File $file)
    {
        // To prevent writing to the cache from invalidating it
        if (!$file->isChanged('FileContentCache')) {
            $file->FileContentCache = '';
        }
    }

    /**
     * Cuts content down to at most $maxLength bytes without splitting a multibyte character.
     *
     * A plain substr() can end in the middle of a character and leave an invalid byte sequence
     * at the end of the stored text.
     *
     * @param string $content
     * @param int $maxLength
     * @return string
     */
    private function truncate($content, $maxLength)
    {
        // Assumes the extracted text is UTF-8 - TODO confirm for all extractors
        if ($maxLength > 0 && function_exists('mb_strcut')) {
            return mb_strcut($content, 0, $maxLength, 'UTF-8');
        }

        // Without mbstring (or for a non-positive limit) keep the byte-wise behaviour
        return substr($content, 0, $maxLength);
    }
}
|
|
@ -1,27 +1,45 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extension;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\ORM\DataExtension;
|
||||
use SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
||||
|
||||
/**
|
||||
* Decorate File or a File derivative to enable text extraction from the file content. Uses a set of subclasses of
|
||||
* FileTextExtractor to do the extraction based on the content type of the file.
|
||||
*
|
||||
*
|
||||
* Adds an additional property which is the cached contents, which is populated on demand.
|
||||
*
|
||||
* @author mstephens
|
||||
*
|
||||
*/
|
||||
class FileTextExtractable extends DataExtension
|
||||
{
|
||||
private static $db = array(
|
||||
/**
|
||||
* @var array
|
||||
* @config
|
||||
*/
|
||||
private static $db = [
|
||||
'FileContentCache' => 'Text'
|
||||
);
|
||||
];
|
||||
|
||||
private static $casting = array(
|
||||
/**
|
||||
* @var array
|
||||
* @config
|
||||
*/
|
||||
private static $casting = [
|
||||
'FileContent' => 'Text'
|
||||
);
|
||||
];
|
||||
|
||||
private static $dependencies = array(
|
||||
'TextCache' => '%$FileTextCache'
|
||||
);
|
||||
/**
|
||||
* @var array
|
||||
* @config
|
||||
*/
|
||||
private static $dependencies = [
|
||||
'TextCache' => '%$' . FileTextCache::class,
|
||||
];
|
||||
|
||||
/**
|
||||
* @var FileTextCache
|
||||
|
@ -29,12 +47,13 @@ class FileTextExtractable extends DataExtension
|
|||
protected $fileTextCache = null;
|
||||
|
||||
/**
|
||||
*
|
||||
* @param FileTextCache $cache
|
||||
* @param FileTextCache $cache
|
||||
* @return $this
|
||||
*/
|
||||
public function setTextCache(FileTextCache $cache)
|
||||
{
|
||||
$this->fileTextCache = $cache;
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -56,38 +75,46 @@ class FileTextExtractable extends DataExtension
|
|||
}
|
||||
|
||||
/**
|
||||
* Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text.
|
||||
* The value is also cached into the File record itself.
|
||||
*
|
||||
* Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and
|
||||
* returns the text. The value is also cached into the File record itself.
|
||||
*
|
||||
* @param boolean $disableCache If false, the file content is only parsed on demand.
|
||||
* If true, the content parsing is forced, bypassing the cached version
|
||||
* @return string
|
||||
* If true, the content parsing is forced, bypassing
|
||||
* the cached version
|
||||
* @return string|null
|
||||
*/
|
||||
public function extractFileAsText($disableCache = false)
|
||||
{
|
||||
/** @var File $file */
|
||||
$file = $this->owner;
|
||||
if (!$disableCache) {
|
||||
$text = $this->getTextCache()->load($this->owner);
|
||||
$text = $this->getTextCache()->load($file);
|
||||
if ($text) {
|
||||
return $text;
|
||||
}
|
||||
}
|
||||
|
||||
// Determine which extractor can process this file.
|
||||
$extractor = FileTextExtractor::for_file($this->owner->FullPath);
|
||||
$extractor = FileTextExtractor::for_file($file);
|
||||
if (!$extractor) {
|
||||
return null;
|
||||
}
|
||||
|
||||
$text = $extractor->getContent($this->owner->FullPath);
|
||||
$text = $extractor->getContent($file);
|
||||
if (!$text) {
|
||||
return null;
|
||||
}
|
||||
|
||||
$this->getTextCache()->save($this->owner, $text);
|
||||
if (!$disableCache) {
|
||||
$this->getTextCache()->save($file, $text);
|
||||
}
|
||||
|
||||
return $text;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return void
|
||||
*/
|
||||
public function onBeforeWrite()
|
||||
{
|
||||
// Clear cache before changing file
|
|
@ -1,12 +1,24 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Core\ClassInfo;
|
||||
use SilverStripe\Core\Config\Config;
|
||||
use SilverStripe\Core\Config\Configurable;
|
||||
use SilverStripe\Core\Injector\Injectable;
|
||||
use SilverStripe\Core\Injector\Injector;
|
||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
|
||||
|
||||
/**
|
||||
* A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents.
|
||||
* @author mstephens
|
||||
*
|
||||
*/
|
||||
abstract class FileTextExtractor extends Object
|
||||
abstract class FileTextExtractor
|
||||
{
|
||||
use Configurable;
|
||||
use Injectable;
|
||||
|
||||
/**
|
||||
* Set priority from 0-100.
|
||||
* The highest priority extractor for a given content type will be selected.
|
||||
|
@ -34,18 +46,19 @@ abstract class FileTextExtractor extends Object
|
|||
if (self::$sorted_extractor_classes) {
|
||||
return self::$sorted_extractor_classes;
|
||||
}
|
||||
|
||||
|
||||
// Generate the sorted list of extractors on demand.
|
||||
$classes = ClassInfo::subclassesFor("FileTextExtractor");
|
||||
$classes = ClassInfo::subclassesFor(__CLASS__);
|
||||
array_shift($classes);
|
||||
$classPriorities = array();
|
||||
$classPriorities = [];
|
||||
|
||||
foreach ($classes as $class) {
|
||||
$classPriorities[$class] = Config::inst()->get($class, 'priority');
|
||||
}
|
||||
arsort($classPriorities);
|
||||
|
||||
// Save classes
|
||||
$sortedClasses = array_keys($classPriorities);
|
||||
$sortedClasses = array_keys($classPriorities ?? []);
|
||||
return self::$sorted_extractor_classes = $sortedClasses;
|
||||
}
|
||||
|
||||
|
@ -61,30 +74,28 @@ abstract class FileTextExtractor extends Object
|
|||
}
|
||||
|
||||
/**
|
||||
* Attempt to detect mime type for given file
|
||||
* Given a File object, decide which extractor instance to use to handle it
|
||||
*
|
||||
* @param string $path
|
||||
* @return string Mime type if found
|
||||
*/
|
||||
protected static function get_mime($path)
|
||||
{
|
||||
$file = new Symfony\Component\HttpFoundation\File\File($path);
|
||||
|
||||
return $file->getMimeType();
|
||||
}
|
||||
|
||||
/**
|
||||
* @param string $path
|
||||
* @param File|string $file
|
||||
* @return FileTextExtractor|null
|
||||
*/
|
||||
public static function for_file($path)
|
||||
public static function for_file($file)
|
||||
{
|
||||
if (!file_exists($path) || is_dir($path)) {
|
||||
return;
|
||||
if (!$file || (is_string($file) && !file_exists($file ?? ''))) {
|
||||
return null;
|
||||
}
|
||||
|
||||
$extension = pathinfo($path, PATHINFO_EXTENSION);
|
||||
$mime = self::get_mime($path);
|
||||
// Ensure we have a File instance to work with
|
||||
if (is_string($file)) {
|
||||
/** @var File $fileObject */
|
||||
$fileObject = File::create();
|
||||
$fileObject->setFromLocalFile($file);
|
||||
$file = $fileObject;
|
||||
}
|
||||
|
||||
$extension = $file->getExtension();
|
||||
$mime = $file->getMimeType();
|
||||
|
||||
foreach (self::get_extractor_classes() as $className) {
|
||||
$extractor = self::get_extractor($className);
|
||||
|
||||
|
@ -105,10 +116,43 @@ abstract class FileTextExtractor extends Object
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Some text extractors (like pdftotext) may require a physical file to read from, so write the current
 * file contents to a temp file and return its path
 *
 * The temp file carries the same extension as the given file, when it has one. Nothing in this
 * method deletes the returned file.
 *
 * @param File $file
 * @return string Path of the temp file holding the file's contents
 * @throws Exception When the temp file name cannot be allocated or the contents cannot be written
 */
protected static function getPathFromFile(File $file)
{
    $reserved = tempnam(TEMP_PATH, 'pdftextextractor_');
    if (false === $reserved) {
        throw new Exception(static::class . '->getPathFromFile() could not allocate temporary file name');
    }

    // Append extension to temp file if one is set
    $path = $reserved;
    $extension = $file->getExtension();
    if ($extension) {
        $path .= '.' . $extension;
    }

    // Remove any existing temp files with this name
    if (file_exists($path)) {
        unlink($path);
    }

    $bytesWritten = file_put_contents($path, $file->getStream());

    // tempnam() creates the file it names. Once an extension is appended that placeholder is no
    // longer the file we hand back, so remove it instead of leaving it behind on every call.
    if ($path !== $reserved && file_exists($reserved)) {
        unlink($reserved);
    }

    if (false === $bytesWritten) {
        throw new Exception(static::class . '->getPathFromFile() failed to write temporary file');
    }

    return $path;
}
|
||||
|
||||
/**
|
||||
* Checks if the extractor is supported on the current environment,
|
||||
* for example if the correct binaries or libraries are available.
|
||||
*
|
||||
*
|
||||
* @return boolean
|
||||
*/
|
||||
abstract public function isAvailable();
|
||||
|
@ -123,23 +167,19 @@ abstract class FileTextExtractor extends Object
|
|||
abstract public function supportsExtension($extension);
|
||||
|
||||
/**
|
||||
* Determine if this extractor suports the given mime type.
|
||||
* Determine if this extractor supports the given mime type.
|
||||
* Will only be called if supportsExtension returns false.
|
||||
*
|
||||
*
|
||||
* @param string $mime
|
||||
* @return boolean
|
||||
*/
|
||||
abstract public function supportsMime($mime);
|
||||
|
||||
/**
|
||||
* Given a file path, extract the contents as text.
|
||||
*
|
||||
* @param string $path
|
||||
* Given a File instance, extract the contents as text.
|
||||
*
|
||||
* @param File|string $file Either the File instance, or a file path for a file to load
|
||||
* @return string
|
||||
*/
|
||||
abstract public function getContent($path);
|
||||
}
|
||||
|
||||
class FileTextExtractor_Exception extends Exception
|
||||
{
|
||||
abstract public function getContent($file);
|
||||
}
|
|
@ -0,0 +1,7 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
||||
|
||||
/**
 * Exception type used by the text extractors, e.g. when a temporary file cannot be prepared.
 */
class Exception extends \Exception
|
||||
{
|
||||
}
|
|
@ -0,0 +1,90 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
|
||||
/**
|
||||
* Text extractor that uses php function strip_tags to get just the text. OK for indexing, not
|
||||
* the best for readable text.
|
||||
*
|
||||
* @author mstephens
|
||||
*/
|
||||
class HTMLTextExtractor extends FileTextExtractor
{
    /**
     * Lower priority because it's not the most clever HTML extraction. If there is something better, use it.
     *
     * @config
     * @var integer
     */
    private static $priority = 10;

    /**
     * Only built-in PHP functions are used, so this extractor can always run.
     *
     * @return boolean
     */
    public function isAvailable()
    {
        return true;
    }

    /**
     * @param string $extension File extension (compared case-insensitively)
     * @return boolean True for "html", "htm" and "xhtml"
     */
    public function supportsExtension($extension)
    {
        return in_array(strtolower($extension ?? ''), ['html', 'htm', 'xhtml'], true);
    }

    /**
     * @param string $mime Mime type (compared case-insensitively)
     * @return boolean True only for "text/html"
     */
    public function supportsMime($mime)
    {
        return strtolower($mime ?? '') === 'text/html';
    }

    /**
     * Extracts the text of an HTML document by using strip_tags()
     * combined with regular expressions to remove non-content tags like <style> or <script>,
     * as well as adding a line break before block tags so that the text of adjacent
     * blocks (e.g. "<p>a</p><p>b</p>") is not glued together once the tags are stripped.
     *
     * @param File|string $file Either the File instance, or a path to a file to load
     * @return string Plain text; an empty string when the content could not be read
     */
    public function getContent($file)
    {
        $raw = $file instanceof File ? $file->getString() : file_get_contents($file ?? '');
        // Neither source is guaranteed to hand back a string (file_get_contents() returns false on failure)
        $content = is_string($raw) ? $raw : '';

        // Yes, yes, regex'ing HTML is evil.
        // Since we don't care about well-formedness or markup here, it does the job.

        // Elements whose content is never visible to the reader: replaced by a single space
        $invisible = [
            '@<head[^>]*?>.*?</head>@siu',
            '@<style[^>]*?>.*?</style>@siu',
            '@<script[^>]*?>.*?</script>@siu',
            '@<object[^>]*?>.*?</object>@siu',
            '@<embed[^>]*?>.*?</embed>@siu',
            '@<applet[^>]*?>.*?</applet>@siu',
            '@<noframes[^>]*?>.*?</noframes>@siu',
            '@<noscript[^>]*?>.*?</noscript>@siu',
            '@<noembed[^>]*?>.*?</noembed>@siu',
        ];

        // Opening/closing block tags: kept (strip_tags() removes them later) but prefixed with a line break
        $blocks = [
            '@</?((address)|(blockquote)|(center)|(del))@iu',
            '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
            '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
            '@</?((table)|(th)|(td)|(caption))@iu',
            '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
            '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
            '@</?((frameset)|(frame)|(iframe))@iu',
        ];

        // Build the replacement list from the pattern lists so the two can never drift apart in length
        $replacements = array_merge(
            array_fill(0, count($invisible), ' '),
            array_fill(0, count($blocks), "\n\$0")
        );

        $cleaned = preg_replace(array_merge($invisible, $blocks), $replacements, $content);

        // preg_replace() returns null on a PCRE error (e.g. invalid UTF-8 with the "u" modifier);
        // fall back to the untouched markup rather than silently returning nothing.
        return strip_tags($cleaned ?? $content);
    }
}
|
|
@ -0,0 +1,146 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
|
||||
|
||||
/**
|
||||
* Text extractor that calls pdftotext to do the conversion.
|
||||
* @author mstephens
|
||||
*/
|
||||
class PDFTextExtractor extends FileTextExtractor
{
    /**
     * Set to bin path this extractor can execute
     *
     * @var string
     */
    private static $binary_location = null;

    /**
     * Used if binary_location isn't set.
     * List of locations to search for a given binary in
     *
     * @config
     * @var array
     */
    private static $search_binary_locations = [
        '/usr/bin',
        '/usr/local/bin',
    ];

    /**
     * The extractor is usable when a pdftotext binary can be located and executed.
     *
     * @return boolean
     */
    public function isAvailable()
    {
        $bin = $this->bin('pdftotext');

        return $bin && file_exists($bin) && is_executable($bin);
    }

    /**
     * @param string $extension File extension (compared case-insensitively)
     * @return boolean True only for "pdf"
     */
    public function supportsExtension($extension)
    {
        return strtolower($extension ?? '') === 'pdf';
    }

    /**
     * @param string $mime Mime type (compared case-insensitively)
     * @return boolean True for the known PDF mime types
     */
    public function supportsMime($mime)
    {
        return in_array(
            strtolower($mime ?? ''),
            [
                'application/pdf',
                'application/x-pdf',
                'application/x-bzpdf',
                'application/x-gzpdf',
            ],
            true
        );
    }

    /**
     * Accessor to get the location of the binary
     *
     * @param string $program Name of binary
     * @return string|null Full path of the first match (with or without ".exe"), or null if not found
     */
    protected function bin($program = '')
    {
        // An explicitly configured location wins over the search list
        $configured = $this->config()->get('binary_location');
        // Cast guards against the search list being unset (null) in config
        $locations = $configured ? [$configured] : (array) $this->config()->get('search_binary_locations');

        foreach ($locations as $location) {
            $path = "{$location}/{$program}";
            foreach ([$path, $path . '.exe'] as $candidate) {
                if (file_exists($candidate)) {
                    return $candidate;
                }
            }
        }

        // Not found
        return null;
    }

    /**
     * Given a File instance or a file path, extract the contents as text.
     *
     * @param File|string $file
     * @return string Extracted text, or an empty string when there is no file
     * @throws Exception When pdftotext is unavailable or exits with an error
     */
    public function getContent($file)
    {
        if (!$file || (is_string($file) && !file_exists($file))) {
            // no file
            return '';
        }

        return $this->cleanupLigatures($this->getRawOutput($file));
    }

    /**
     * Invoke pdftotext with the given File object or path
     *
     * @param File|string $file
     * @return string Output
     * @throws Exception
     */
    protected function getRawOutput($file)
    {
        if (!$this->isAvailable()) {
            throw new Exception("getRawOutput called on unavailable extractor");
        }

        // A File is first copied to a temporary path, which this method owns and must clean up
        $isTempFile = $file instanceof File;
        $path = $isTempFile ? $this->getPathFromFile($file) : $file;

        try {
            $command = sprintf(
                '%s %s - 2>&1',
                escapeshellarg($this->bin('pdftotext')),
                escapeshellarg($path ?? '')
            );
            exec($command, $content, $err);
        } finally {
            // Same cleanup the Tika extractors perform; never touches caller-supplied paths
            if ($isTempFile && $path && file_exists($path)) {
                unlink($path);
            }
        }

        $output = implode(PHP_EOL, $content);

        if ($err) {
            throw new Exception(sprintf(
                'PDFTextExtractor->getContent() failed for %s: %s',
                $path,
                $output
            ));
        }

        return $output;
    }

    /**
     * Replaces utf-8 typographic ligatures with their plain-letter equivalents.
     *
     * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
     *
     * @param string $input
     * @return string
     */
    protected function cleanupLigatures($input)
    {
        // Code points are spelled out so the keys cannot be normalised into their own replacements
        $mapping = [
            "\u{FB00}" => 'ff',
            "\u{FB01}" => 'fi',
            "\u{FB02}" => 'fl',
            "\u{FB03}" => 'ffi',
            "\u{FB04}" => 'ffl',
            // LONG S T ligature: the long s is an "s", so this reads "st"
            "\u{FB05}" => 'st',
            "\u{FB06}" => 'st',
        ];

        return strtr($input ?? '', $mapping);
    }
}
|
|
@ -0,0 +1,164 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use Exception;
|
||||
use GuzzleHttp\Client;
|
||||
use GuzzleHttp\Psr7\Response;
|
||||
use InvalidArgumentException;
|
||||
use Psr\Log\LoggerInterface;
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Core\Injector\Injector;
|
||||
|
||||
/**
|
||||
* Text extractor that calls an Apache Solr instance
|
||||
* and extracts content via the "ExtractingRequestHandler" endpoint.
|
||||
* Does not alter the Solr index itself, but uses it purely
|
||||
* for its file parsing abilities.
|
||||
*
|
||||
* @author ischommer
|
||||
* @see http://wiki.apache.org/solr/ExtractingRequestHandler
|
||||
*/
|
||||
class SolrCellTextExtractor extends FileTextExtractor
{
    /**
     * Base URL to use for Solr text extraction.
     * E.g. http://localhost:8983/solr/update/extract
     *
     * @config
     * @var string
     */
    private static $base_url;

    /**
     * @var int
     * @config
     */
    private static $priority = 75;

    /**
     * @var Client
     */
    protected $httpClient;

    /**
     * Returns the HTTP client, creating a default Guzzle client on first use.
     *
     * @return Client
     */
    public function getHttpClient()
    {
        if (!$this->httpClient) {
            $this->httpClient = new Client();
        }

        return $this->httpClient;
    }

    /**
     * @param Client $client
     * @return $this
     */
    public function setHttpClient(Client $client)
    {
        $this->httpClient = $client;
        return $this;
    }

    /**
     * The extractor is considered available as soon as a base URL is configured;
     * no request is made to check that the server actually responds.
     *
     * @return bool
     */
    public function isAvailable()
    {
        return (bool) $this->config()->get('base_url');
    }

    /**
     * @param string $extension File extension (compared case-insensitively)
     * @return bool
     */
    public function supportsExtension($extension)
    {
        return in_array(
            strtolower($extension ?? ''),
            [
                'pdf', 'doc', 'docx', 'xls', 'xlsx',
                'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
                'ppt', 'pptx', 'odp', 'fodp', 'csv',
            ],
            true
        );
    }

    /**
     * @param string $mime
     * @return bool Always false
     */
    public function supportsMime($mime)
    {
        // Rely on supportsExtension
        return false;
    }

    /**
     * Posts the file to the configured Solr endpoint and returns the extracted text.
     *
     * @param File|string $file Either the File instance, or a file path for a file to load
     * @return string|null Empty string when there is no file; null when the request failed
     *                     or the response held no entry for the file name
     * @throws InvalidArgumentException When no base_url is configured
     */
    public function getContent($file)
    {
        if (!$file || (is_string($file) && !file_exists($file))) {
            // no file
            return '';
        }

        $fileName = $file instanceof File ? $file->getFilename() : basename($file);

        // Get and validate base URL
        $baseUrl = $this->config()->get('base_url');
        if (!$baseUrl) {
            throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
        }

        $client = $this->getHttpClient();

        try {
            $stream = $file instanceof File ? $file->getStream() : fopen($file, 'r');
            /** @var Response $response */
            $response = $client->post($baseUrl, [
                'multipart' => [
                    ['name' => 'extractOnly', 'contents' => 'true'],
                    ['name' => 'extractFormat', 'contents' => 'text'],
                    ['name' => 'myfile', 'contents' => $stream],
                ],
            ]);
        } catch (InvalidArgumentException $e) {
            $this->logNotice('Error extracting text from "%s" (message: %s)', $fileName, $e);
            return null;
        } catch (Exception $e) {
            // Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search
            // query in some cases.
            $this->logNotice('Tika server error attempting to extract from "%s" (message: %s)', $fileName, $e);
            return null;
        }

        $matches = [];
        // Use preg match to avoid SimpleXML running out of memory on large text nodes.
        // The delimiter must be passed to preg_quote(): without it a "/" in the file name
        // (e.g. a folder path) is left unescaped and terminates the pattern early.
        preg_match(
            sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName ?? '', '/')),
            (string) $response->getBody(),
            $matches
        );

        return $matches ? $matches[1] : null;
    }

    /**
     * Writes a notice-level log entry describing a failed extraction.
     *
     * @param string $format sprintf() format taking the file name and the exception message
     * @param string $fileName
     * @param Exception $e
     * @return void
     */
    private function logNotice($format, $fileName, Exception $e)
    {
        $msg = sprintf($format, $fileName, $e->getMessage());
        Injector::inst()->get(LoggerInterface::class)->notice($msg);
    }
}
|
|
@ -0,0 +1,137 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Core\Environment;
|
||||
use SilverStripe\Core\Injector\Injector;
|
||||
use SilverStripe\TextExtraction\Rest\TikaRestClient;
|
||||
|
||||
/**
|
||||
* Enables text extraction of file content via the Tika Rest Server
|
||||
*
|
||||
* {@link http://tika.apache.org/1.7/gettingstarted.html}
|
||||
*/
|
||||
class TikaServerTextExtractor extends FileTextExtractor
{
    /**
     * Tika server is pretty efficient so use it immediately if available
     *
     * @var integer
     * @config
     */
    private static $priority = 80;

    /**
     * Server endpoint
     *
     * @var string
     * @config
     */
    private static $server_endpoint;

    /**
     * @var TikaRestClient
     */
    protected $client = null;

    /**
     * Cache of supported mime types
     *
     * @var array
     */
    protected $supportedMimes = [];

    /**
     * Returns the REST client, creating it for the current endpoint on first use.
     *
     * @return TikaRestClient
     */
    public function getClient()
    {
        if (!$this->client) {
            $this->client = Injector::inst()->createWithArgs(
                TikaRestClient::class,
                [$this->getServerEndpoint()]
            );
        }

        return $this->client;
    }

    /**
     * The SS_TIKA_ENDPOINT environment variable takes precedence over the configured endpoint.
     *
     * @return string
     */
    public function getServerEndpoint()
    {
        $endpoint = Environment::getEnv('SS_TIKA_ENDPOINT');
        if ($endpoint) {
            return $endpoint;
        }

        // Default to configured endpoint
        return $this->config()->get('server_endpoint');
    }

    /**
     * Get the version of Tika reported by the server, as returned by the REST client
     *
     * @return string version of Tika
     */
    public function getVersion()
    {
        return $this->getClient()->getVersion();
    }

    /**
     * Available when an endpoint is set, the server responds and it reports Tika 1.7 or newer.
     *
     * @return boolean
     */
    public function isAvailable()
    {
        return $this->getServerEndpoint()
            && $this->getClient()->isAvailable()
            && version_compare($this->getVersion() ?? '', '1.7') >= 0;
    }

    /**
     * @param string $extension
     * @return boolean Always false
     */
    public function supportsExtension($extension)
    {
        // Determine support via mime type only
        return false;
    }

    /**
     * @param string $mime
     * @return boolean
     */
    public function supportsMime($mime)
    {
        if (!$this->supportedMimes) {
            $this->supportedMimes = (array) $this->getClient()->getSupportedMimes();
        }

        // Check if supported (most common / quickest lookup)
        if (isset($this->supportedMimes[$mime])) {
            return true;
        }

        // Check aliases
        foreach ($this->supportedMimes as $info) {
            if (isset($info['alias']) && in_array($mime, $info['alias'] ?? [])) {
                return true;
            }
        }

        return false;
    }

    /**
     * Given a File instance or a file path, extract the contents as text.
     *
     * @param File|string $file Either the File instance, or a file path for a file to load
     * @return string
     */
    public function getContent($file)
    {
        // A File is first copied to a temporary path, which this method owns and must clean up
        $isTempFile = $file instanceof File;
        $path = $isTempFile ? $this->getPathFromFile($file) : $file;

        try {
            return $this->getClient()->tika($path);
        } finally {
            // Cleanup temp file, including when the client throws; never touches caller-supplied paths
            if ($isTempFile && $path && file_exists($path)) {
                unlink($path);
            }
        }
    }
}
|
|
@ -1,8 +1,12 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
|
||||
/**
|
||||
* Enables text extraction of file content via the Tika CLI
|
||||
*
|
||||
*
|
||||
* {@link http://tika.apache.org/1.7/gettingstarted.html}
|
||||
*/
|
||||
class TikaTextExtractor extends FileTextExtractor
|
||||
|
@ -18,14 +22,14 @@ class TikaTextExtractor extends FileTextExtractor
|
|||
/**
|
||||
* Get the version of tika installed, or 0 if not installed
|
||||
*
|
||||
* @return float version of tika
|
||||
* @return mixed float | int The version of tika
|
||||
*/
|
||||
public function getVersion()
|
||||
{
|
||||
$code = $this->runShell('tika --version', $stdout);
|
||||
|
||||
// Parse output
|
||||
if (!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches)) {
|
||||
if (!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout ?? '', $matches)) {
|
||||
return $matches['version'];
|
||||
}
|
||||
|
||||
|
@ -35,28 +39,29 @@ class TikaTextExtractor extends FileTextExtractor
|
|||
/**
|
||||
* Runs an arbitrary and safely escaped shell command
|
||||
*
|
||||
* @param string $command Full command including arguments
|
||||
* @param string &$stdout Standand output
|
||||
* @param string &$stderr Standard error
|
||||
* @param string $input Content to pass via standard input
|
||||
* @return int Exit code. 0 is success
|
||||
* @param string $command Full command including arguments
|
||||
* @param string &$stdout Standand output
|
||||
* @param string &$stderr Standard error
|
||||
* @param string $input Content to pass via standard input
|
||||
* @return int Exit code. 0 is success
|
||||
*/
|
||||
protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
|
||||
{
|
||||
$descriptorSpecs = array(
|
||||
0 => array("pipe", "r"),
|
||||
1 => array("pipe", "w"),
|
||||
2 => array("pipe", "w")
|
||||
);
|
||||
$descriptorSpecs = [
|
||||
0 => ["pipe", "r"],
|
||||
1 => ["pipe", "w"],
|
||||
2 => ["pipe", "w"]
|
||||
];
|
||||
// Invoke command
|
||||
$pipes = array();
|
||||
$proc = proc_open($command, $descriptorSpecs, $pipes);
|
||||
$pipes = [];
|
||||
$proc = proc_open($command ?? '', $descriptorSpecs ?? [], $pipes);
|
||||
|
||||
if (!is_resource($proc)) {
|
||||
return 255;
|
||||
}
|
||||
|
||||
// Send content as input
|
||||
fwrite($pipes[0], $input);
|
||||
fwrite($pipes[0], $input ?? '');
|
||||
fclose($pipes[0]);
|
||||
|
||||
// Get output
|
||||
|
@ -68,38 +73,58 @@ class TikaTextExtractor extends FileTextExtractor
|
|||
// Get result
|
||||
return proc_close($proc);
|
||||
}
|
||||
|
||||
public function getContent($path)
|
||||
|
||||
public function getContent($file)
|
||||
{
|
||||
$mode = $this->config()->output_mode;
|
||||
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
|
||||
$mode = $this->config()->get('output_mode');
|
||||
$path = $file instanceof File ? $this->getPathFromFile($file) : $file;
|
||||
$command = sprintf('tika %s %s', $mode, escapeshellarg($path ?? ''));
|
||||
$code = $this->runShell($command, $output);
|
||||
//Cleanup temp file
|
||||
if ($file instanceof File) {
|
||||
unlink($path ?? '');
|
||||
}
|
||||
|
||||
if ($code == 0) {
|
||||
return $output;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return bool
|
||||
*/
|
||||
public function isAvailable()
|
||||
{
|
||||
return $this->getVersion() > 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return bool
|
||||
*/
|
||||
public function supportsExtension($extension)
|
||||
{
|
||||
// Determine support via mime type only
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param string $mime
|
||||
* @return bool
|
||||
*/
|
||||
public function supportsMime($mime)
|
||||
{
|
||||
// Get list of supported mime types
|
||||
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
|
||||
|
||||
if ($code) {
|
||||
// Error case
|
||||
return false;
|
||||
} // Error case
|
||||
}
|
||||
|
||||
// Check if the mime type is inside the result
|
||||
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
|
||||
return (bool)preg_match($pattern, $supportedTypes);
|
||||
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime ?? '', '/'));
|
||||
|
||||
return (bool)preg_match($pattern ?? '', $supportedTypes ?? '');
|
||||
}
|
||||
}
|
|
@ -0,0 +1,171 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Rest;
|
||||
|
||||
use GuzzleHttp\Client;
|
||||
use GuzzleHttp\Exception\RequestException;
|
||||
use GuzzleHttp\Psr7\Response;
|
||||
use Psr\Log\LoggerInterface;
|
||||
use SilverStripe\Core\Convert;
|
||||
use SilverStripe\Core\Environment;
|
||||
use SilverStripe\Core\Injector\Injector;
|
||||
|
||||
class TikaRestClient extends Client
{
    /**
     * Authentication options to be sent to the Tika server
     *
     * @var array
     */
    protected $options = ['username' => null, 'password' => null];

    /**
     * Cache of the mime data returned by the server
     *
     * @var array
     */
    protected $mimes = [];

    /**
     * Credentials are read from SS_TIKA_USERNAME / SS_TIKA_PASSWORD, and only
     * when a password is set.
     *
     * @param string $baseUrl
     * @param array $config Guzzle client configuration; "base_uri" is overwritten with $baseUrl
     */
    public function __construct($baseUrl = '', $config = [])
    {
        $password = Environment::getEnv('SS_TIKA_PASSWORD');

        if (!empty($password)) {
            $this->options = [
                'username' => Environment::getEnv('SS_TIKA_USERNAME'),
                'password' => $password,
            ];
        }

        $config['base_uri'] = $baseUrl;

        parent::__construct($config);
    }

    /**
     * Detect if the service is available
     *
     * @return bool True only when the server answers the root URL with a 200
     */
    public function isAvailable()
    {
        try {
            /** @var Response $result */
            $result = $this->get('/', $this->getGuzzleOptions());

            // Any other status means "not available"; always hand back a real boolean
            return $result->getStatusCode() == 200;
        } catch (\GuzzleHttp\Exception\TransferException $ex) {
            // TransferException is the parent of RequestException and also covers connection
            // failures that are not RequestException instances in newer Guzzle versions.
            $msg = sprintf("Tika unavailable - %s", $ex->getMessage());
            Injector::inst()->get(LoggerInterface::class)->info($msg);

            return false;
        }
    }

    /**
     * Get version code
     *
     * @return string The version reported by the server, or "0" when it could not be parsed
     */
    public function getVersion()
    {
        /** @var Response $response */
        $response = $this->get('version', $this->getGuzzleOptions());

        // Parse output
        if ($response->getStatusCode() == 200
            && preg_match('/Apache Tika (?<version>[\.\d]+)/', (string) $response->getBody(), $matches)
        ) {
            return (string) $matches['version'];
        }

        return '0';
    }

    /**
     * Gets supported mime data. May include aliased mime types.
     *
     * @return array Empty when the response could not be decoded
     */
    public function getSupportedMimes()
    {
        if ($this->mimes) {
            return $this->mimes;
        }

        $response = $this->get(
            'mime-types',
            $this->getGuzzleOptions([
                'headers' => [
                    'Accept' => 'application/json',
                ],
            ])
        );

        // json_decode() returns null for an invalid body; keep the documented array type
        $decoded = json_decode((string) $response->getBody(), true);

        return $this->mimes = is_array($decoded) ? $decoded : [];
    }

    /**
     * Extract text content from a given file.
     * Logs an info-level message if the document can't be read or parsed.
     *
     * @param string $file Full filesystem path to a file to post
     * @return string Content of the file extracted as plain text; empty on failure
     */
    public function tika($file)
    {
        $contents = file_get_contents($file ?? '');
        if ($contents === false) {
            $msg = sprintf('TikaRestClient was not able to read %s.', $file);
            Injector::inst()->get(LoggerInterface::class)->info($msg);

            return '';
        }

        $text = null;
        try {
            /** @var Response $response */
            $response = $this->put(
                'tika',
                $this->getGuzzleOptions([
                    'headers' => [
                        'Accept' => 'text/plain',
                    ],
                    'body' => $contents,
                ])
            );
            $text = $response->getBody();
        } catch (RequestException $e) {
            // A RequestException does not necessarily carry a response (e.g. the request never
            // reached the server), so the response must be checked before it is dereferenced.
            $failed = $e->getResponse();
            if ($failed) {
                $msg = sprintf(
                    'TikaRestClient was not able to process %s. Response: %s %s.',
                    $file,
                    $failed->getStatusCode(),
                    $failed->getReasonPhrase()
                );
                // Only available if tika-server was started with --includeStack
                $body = (string) $failed->getBody();
                if ($body !== '') {
                    $msg .= ' Body: ' . $body;
                }
            } else {
                $msg = sprintf(
                    'TikaRestClient was not able to process %s. No response received: %s',
                    $file,
                    $e->getMessage()
                );
            }

            Injector::inst()->get(LoggerInterface::class)->info($msg);
        }

        return (string) $text;
    }

    /**
     * Assembles an array of request options to pass to Guzzle
     *
     * @param array $options Authentication (etc) will be merged into this array and returned
     * @return array
     */
    protected function getGuzzleOptions($options = [])
    {
        // Basic auth is only added when both a username and a password are known
        if (!empty($this->options['username']) && !empty($this->options['password'])) {
            $options['auth'] = [
                $this->options['username'],
                $this->options['password'],
            ];
        }

        return $options;
    }
}
|
|
@ -1,17 +1,23 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Tests;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Core\Config\Config;
|
||||
use SilverStripe\Dev\SapphireTest;
|
||||
use SilverStripe\TextExtraction\Cache\FileTextCache\Database;
|
||||
|
||||
class FileTextCacheDatabaseTest extends SapphireTest
|
||||
{
|
||||
public function testTruncatesByMaxLength()
|
||||
{
|
||||
Config::nest();
|
||||
|
||||
Config::inst()->update('FileTextCache_Database', 'max_content_length', 5);
|
||||
$cache = new FileTextCache_Database();
|
||||
$file = $this->getMock('File', array('write'));
|
||||
Config::modify()->set(Database::class, 'max_content_length', 5);
|
||||
|
||||
$cache = new Database();
|
||||
$file = $this->getMockBuilder(File::class)->setMethods(['write'])->getMock();
|
||||
$content = '0123456789';
|
||||
$cache->save($file, $content);
|
||||
$this->assertEquals($cache->load($file), '01234');
|
||||
|
||||
Config::unnest();
|
||||
$this->assertEquals($cache->load($file), '01234');
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,46 +1,60 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Tests;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Core\Config\Config;
|
||||
use SilverStripe\Dev\SapphireTest;
|
||||
use SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||
use SilverStripe\TextExtraction\Extension\FileTextExtractable;
|
||||
|
||||
class FileTextExtractableTest extends SapphireTest
|
||||
{
|
||||
protected $requiredExtensions = array(
|
||||
'File' => array('FileTextExtractable')
|
||||
);
|
||||
protected $usesDatabase = true;
|
||||
|
||||
public function setUp()
|
||||
protected static $required_extensions = [
|
||||
File::class => [
|
||||
FileTextExtractable::class,
|
||||
],
|
||||
];
|
||||
|
||||
protected function setUp(): void
|
||||
{
|
||||
parent::setUp();
|
||||
|
||||
// Ensure that html is a valid extension
|
||||
Config::inst()
|
||||
->nest()
|
||||
->update('File', 'allowed_extensions', array('html'));
|
||||
Config::modify()->merge(File::class, 'allowed_extensions', ['html']);
|
||||
|
||||
// Create a copy of the file, as it may be clobbered by the test
|
||||
// ($file->extractFileAsText() calls $file->write)
|
||||
copy(
|
||||
dirname(__FILE__) . '/fixtures/test1.html',
|
||||
dirname(__FILE__) . '/fixtures/test1-copy.html'
|
||||
);
|
||||
}
|
||||
|
||||
public function tearDown()
|
||||
protected function tearDown(): void
|
||||
{
|
||||
Config::unnest();
|
||||
if (file_exists(dirname(__FILE__) . '/fixtures/test1-copy.html')) {
|
||||
unlink(dirname(__FILE__) . '/fixtures/test1-copy.html');
|
||||
}
|
||||
|
||||
parent::tearDown();
|
||||
}
|
||||
|
||||
public function testExtractFileAsText()
|
||||
{
|
||||
// Create a copy of the file, as it may be clobbered by the test
|
||||
// ($file->extractFileAsText() calls $file->write)
|
||||
copy(BASE_PATH.'/textextraction/tests/fixtures/test1.html', BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
|
||||
|
||||
// Use HTML, since the extractor is always available
|
||||
$file = new File(array(
|
||||
'Name' => 'test1-copy.html',
|
||||
'Filename' => 'textextraction/tests/fixtures/test1-copy.html'
|
||||
));
|
||||
/** @var File&FileTextExtractable $file */
|
||||
$file = new File(['Name' => 'test1-copy.html']);
|
||||
$file->setTextCache(new FileTextCache\Database());
|
||||
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1-copy.html');
|
||||
$file->write();
|
||||
|
||||
$content = $file->extractFileAsText();
|
||||
$this->assertContains('Test Headline', $content);
|
||||
$this->assertContains('Test Text', $content);
|
||||
$this->assertEquals($content, $file->FileContentCache);
|
||||
|
||||
if (file_exists(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html')) {
|
||||
unlink(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
|
||||
}
|
||||
$content = $file->extractFileAsText();
|
||||
$this->assertNotNull($content);
|
||||
$this->assertStringContainsString('Test Headline', $content);
|
||||
$this->assertStringContainsString('Test Text', $content);
|
||||
$this->assertEquals($content, $file->FileContentCache);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,14 +1,36 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Tests;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Core\Config\Config;
|
||||
use SilverStripe\Dev\SapphireTest;
|
||||
use SilverStripe\TextExtraction\Extractor\HTMLTextExtractor;
|
||||
|
||||
class HTMLTextExtractorTest extends SapphireTest
|
||||
{
|
||||
protected $usesDatabase = true;
|
||||
|
||||
protected function setUp(): void
|
||||
{
|
||||
parent::setUp();
|
||||
|
||||
Config::modify()->merge(File::class, 'allowed_extensions', ['html']);
|
||||
}
|
||||
|
||||
public function testExtraction()
|
||||
{
|
||||
$extractor = new HTMLTextExtractor();
|
||||
|
||||
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html');
|
||||
$this->assertContains('Test Headline', $content);
|
||||
$this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
|
||||
$this->assertNotContains('Test Style', $content, 'Strips non-content style tags');
|
||||
$this->assertNotContains('Test Script', $content, 'Strips non-content script tags');
|
||||
$file = new File();
|
||||
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.html');
|
||||
$file->write();
|
||||
|
||||
$content = $extractor->getContent($file);
|
||||
|
||||
$this->assertStringContainsString('Test Headline', $content);
|
||||
$this->assertStringNotContainsString('Test Comment', $content, 'Strips HTML comments');
|
||||
$this->assertStringNotContainsString('Test Style', $content, 'Strips non-content style tags');
|
||||
$this->assertStringNotContainsString('Test Script', $content, 'Strips non-content script tags');
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,14 +1,29 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Tests;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Dev\SapphireTest;
|
||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
|
||||
use SilverStripe\TextExtraction\Extractor\PDFTextExtractor;
|
||||
|
||||
class PDFTextExtractorTest extends SapphireTest
|
||||
{
|
||||
protected $usesDatabase = true;
|
||||
|
||||
public function testExtraction()
|
||||
{
|
||||
$extractor = new PDFTextExtractor();
|
||||
if (!$extractor->isAvailable()) {
|
||||
$this->markTestSkipped('pdftotext not available');
|
||||
$this->expectException(Exception::class);
|
||||
$this->expectExceptionMessage('getRawOutput called on unavailable extractor');
|
||||
}
|
||||
|
||||
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf');
|
||||
$this->assertContains('This is a test file with a link', $content);
|
||||
$file = new File();
|
||||
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.pdf');
|
||||
$file->write();
|
||||
|
||||
$content = $extractor->getContent($file);
|
||||
$this->assertStringContainsString('This is a test file with a link', $content);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,78 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Tests;
|
||||
|
||||
use PHPUnit\Framework\MockObject\MockObject;
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Dev\SapphireTest;
|
||||
use SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor;
|
||||
use SilverStripe\TextExtraction\Rest\TikaRestClient;
|
||||
|
||||
/**
|
||||
* @group tika-tests
|
||||
*/
|
||||
class TikaServerTextExtractorTest extends SapphireTest
{
    protected $usesDatabase = true;

    /**
     * Runs a real extraction against a Tika server; skipped when none is reachable.
     */
    public function testServerExtraction()
    {
        $extractor = TikaServerTextExtractor::create();
        if (!$extractor->isAvailable()) {
            $this->markTestSkipped('tika server not available');
        }

        // Check file
        $file = new File();
        $file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.pdf');
        $file->write();

        $content = $extractor->getContent($file);
        $this->assertStringContainsString('This is a test file with a link', $content);

        // Check mime validation
        $this->assertTrue($extractor->supportsMime('application/pdf'));
        $this->assertTrue($extractor->supportsMime('text/html'));
        $this->assertFalse($extractor->supportsMime('application/not-supported'));
    }

    /**
     * Availability must depend on the reported Tika version being at least 1.7.
     *
     * @param string $version
     * @param bool $expected
     * @dataProvider isAvailableProvider
     */
    public function testIsAvailable($version, $expected)
    {
        // Stub the REST client so no server is needed: it is reachable and reports $version
        $client = $this->createMock(TikaRestClient::class);
        $client->method('isAvailable')->willReturn(true);
        $client->method('getVersion')->willReturn($version);

        // onlyMethods() replaces the deprecated setMethods() for methods that exist on the class
        /** @var MockObject|TikaServerTextExtractor $extractor */
        $extractor = $this->getMockBuilder(TikaServerTextExtractor::class)
            ->onlyMethods(['getClient', 'getServerEndpoint'])
            ->getMock();
        $extractor->method('getClient')->willReturn($client);
        $extractor->method('getServerEndpoint')->willReturn('tikaserver.example');

        $this->assertSame($expected, $extractor->isAvailable());
    }

    /**
     * Static so the provider also works on PHPUnit versions that require static providers.
     *
     * @return array[] Pairs of [reported version, expected availability]
     */
    public static function isAvailableProvider()
    {
        return [
            ['1.5.2', false],
            ['1.5', false],
            ['1.7.0', true],
            ['1.7.5', true],
            ['1.8.0', true],
            ['1.7', true],
            ['1.8', true],
            ['2.0.0', true],
        ];
    }
}
|
|
@ -1,39 +1,34 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Tests;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Dev\SapphireTest;
|
||||
use SilverStripe\TextExtraction\Extractor\TikaTextExtractor;
|
||||
|
||||
/**
|
||||
* Tests the {@see TikaTextExtractor} class
|
||||
*
|
||||
* @group tika-tests
|
||||
*/
|
||||
class TikaTextExtractorTest extends SapphireTest
|
||||
{
|
||||
protected $usesDatabase = true;
|
||||
|
||||
/**
 * Extracts text from the PDF fixture with TikaTextExtractor and checks the
 * mime types it claims to support. Skipped when the extractor is unavailable.
 *
 * NOTE(review): this listing appears to interleave removed and added diff
 * lines ($extractor and $file are each assigned twice in a row) - confirm the
 * real statement sequence against the actual file.
 */
public function testExtraction()
|
||||
{
|
||||
// NOTE(review): the next two assignments overwrite each other; presumably
// the `new` form is the removed line and `::create()` its replacement.
$extractor = new TikaTextExtractor();
|
||||
$extractor = TikaTextExtractor::create();
|
||||
if (!$extractor->isAvailable()) {
|
||||
$this->markTestSkipped('tika cli not available');
|
||||
}
|
||||
|
||||
// Check file
|
||||
// NOTE(review): the path string below is immediately replaced by a File
// record built from the fixture next to this test - presumably old vs. new.
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
|
||||
$file = new File();
|
||||
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.pdf');
|
||||
$file->write();
|
||||
|
||||
$content = $extractor->getContent($file);
|
||||
$this->assertContains('This is a test file with a link', $content);
|
||||
|
||||
// Check mime validation
|
||||
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
||||
$this->assertTrue($extractor->supportsMime('text/html'));
|
||||
$this->assertFalse($extractor->supportsMime('application/not-supported'));
|
||||
}
|
||||
|
||||
public function testServerExtraction()
|
||||
{
|
||||
$extractor = new TikaServerTextExtractor();
|
||||
if (!$extractor->isAvailable()) {
|
||||
$this->markTestSkipped('tika server not available');
|
||||
}
|
||||
|
||||
// Check file
|
||||
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
|
||||
$content = $extractor->getContent($file);
|
||||
$this->assertContains('This is a test file with a link', $content);
|
||||
$this->assertStringContainsString('This is a test file with a link', $content);
|
||||
|
||||
// Check mime validation
|
||||
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
||||
|
|
Loading…
Reference in New Issue