Compare commits
158 Commits
Author | SHA1 | Date |
---|---|---|
Guy Sartorelli | e04501cb52 | |
Maxime Rainville | 821d2858f3 | |
Guy Sartorelli | de215d63f6 | |
Maxime Rainville | 2a260607ec | |
Steve Boyd | 6a92eb58e2 | |
Guy Sartorelli | 87869e94a6 | |
Guy Sartorelli | 61f443d49c | |
Guy Sartorelli | 8f2e1d9b75 | |
Sabina Talipova | a281114ed2 | |
Steve Boyd | 1a0cd6d6a6 | |
Steve Boyd | 3bfa989a7e | |
Steve Boyd | 46b7f51040 | |
Guy Sartorelli | 041296bda2 | |
Steve Boyd | e8997870c5 | |
Steve Boyd | e8061724c5 | |
Steve Boyd | db8a36fa3e | |
Guy Sartorelli | 04ff0c6084 | |
Steve Boyd | e5bf4f1322 | |
Guy Sartorelli | 4674084d0d | |
Steve Boyd | df8b17ab85 | |
Michal Kleiner | 77fecc4c53 | |
Guy Sartorelli | d03a9f06e2 | |
Michal Kleiner | 88e7f27c5c | |
Guy Sartorelli | 04e4b60435 | |
Maxime Rainville | 25d8a55058 | |
Steve Boyd | e8f015ddd2 | |
Michal Kleiner | 254c4e31f8 | |
GuySartorelli | 7ad3fc9f13 | |
Maxime Rainville | eb36dcf5fb | |
Steve Boyd | b92616eb4e | |
Steve Boyd | 90d4812aa8 | |
Maxime Rainville | d1bdc003ad | |
Steve Boyd | 6af13768d3 | |
Garion Herman | 4250acb50e | |
Steve Boyd | 795abde8f1 | |
Steve Boyd | d1e241ed56 | |
Steve Boyd | 8e9a0243bb | |
Robbie Averill | cb15845a95 | |
Steve Boyd | 3564066245 | |
Steve Boyd | 06995b2ec7 | |
Maxime Rainville | 01848af86d | |
Steve Boyd | e451f96b0b | |
Robbie Averill | d0a7db0b68 | |
Russell Michell | 42cc545414 | |
Robbie Averill | 6234a971d1 | |
Robbie Averill | 0d7c507b53 | |
Robbie Averill | d5313674c3 | |
Robbie Averill | 32e2f9f84f | |
Robbie Averill | 5b967fd5d3 | |
Robbie Averill | 943f393ee8 | |
Charlie Bergthaler | 242e5a307d | |
Charlie Bergthaler | a9270d73ad | |
Robbie Averill | b4c634bb1f | |
Robbie Averill | 20079bd33f | |
Guy Marriott | c5cfe4ea1e | |
Martin Hipp | bff5eb2b79 | |
Robbie Averill | 801cd9cacb | |
Dylan Wagstaff | 9c2da06178 | |
Robbie Averill | 276fd9c856 | |
Robbie Averill | 759d92ccb4 | |
Robbie Averill | b9502653c2 | |
Robbie Averill | 86eba78064 | |
Ishan Jayamanne | 21ed6e0f86 | |
Robbie Averill | 75a8c66eee | |
Robbie Averill | 07c000dc0d | |
Dylan Wagstaff | 03d1fef4ae | |
Robbie Averill | e1e7cdbfa4 | |
Robbie Averill | 231a2091af | |
Daniel Hensby | b20738573f | |
Robbie Averill | 1b8ea2e451 | |
Dylan Wagstaff | 9795866abe | |
Robbie Averill | 9e8ed243d0 | |
Robbie Averill | 397e7a5d40 | |
Robbie Averill | 40e4b05f5d | |
Robbie Averill | 5e5a1f05da | |
Robbie Averill | 6bf932e5f0 | |
Robbie Averill | 770af5cfc9 | |
Robbie Averill | 3c1457c0ee | |
Robbie Averill | 5d53be9df6 | |
Robbie Averill | edb02e9189 | |
Robbie Averill | 8bd019b2aa | |
Robbie Averill | e2404fc904 | |
Robbie Averill | 8d295ada9c | |
Robbie Averill | fe5148e678 | |
Robbie Averill | 66c9db8c0d | |
Robbie Averill | f1bacd2aa9 | |
Robbie Averill | 300941c9e8 | |
Robbie Averill | dd292bd554 | |
Robbie Averill | 45cd9ae4ed | |
Dylan Wagstaff | d06569c8fd | |
Dylan Wagstaff | 31925d654e | |
Robbie Averill | e491042d3b | |
Robbie Averill | 33746e0cd7 | |
Russell Michell | 912c457c7d | |
Russell Michell | d09a5aa97c | |
Russell Michell | f341010d7a | |
Robbie Averill | 875e608d0f | |
Robbie Averill | c83a7c3403 | |
Robbie Averill | 95d96efe40 | |
Robbie Averill | 9f04583ed5 | |
Robbie Averill | a8a4e0c02f | |
Robbie Averill | 9f3819408c | |
Daniel Hensby | eb25505a8e | |
Jake Dale Ovenden | eb7a45865b | |
Robbie Averill | 40ba6a245d | |
Robbie Averill | 3d289b4e05 | |
Robbie Averill | f8c3015161 | |
Damian Mooyman | 23e255b5c6 | |
Juan van den Anker | 0761311170 | |
Damian Mooyman | 1b89000fcd | |
Alexandre Guidet | 196007314a | |
Damian Mooyman | 545e711f16 | |
Daniel Hensby | 5d24770d79 | |
Damian Mooyman | 5a5c648c1e | |
Daniel Hensby | e9e33605b4 | |
Damian Mooyman | e0125ba745 | |
Daniel Hensby | aaf9238384 | |
Daniel Hensby | 61750e33fc | |
Jake Bentvelzen | 75ffe7b56a | |
Hamish Friedlander | bde4cf4536 | |
Damian Mooyman | f72ba3a978 | |
Damian Mooyman | 9e44e834cf | |
helpfulrobot | 0420d56e4d | |
Daniel Hensby | 5a070eb47d | |
helpfulrobot | 7c45684dbb | |
Daniel Hensby | cdea0f0798 | |
Daniel Hensby | dcd527deb1 | |
helpfulrobot | 08cc7c37da | |
helpfulrobot | df3af6722b | |
Damian Mooyman | 1d2a9bc296 | |
helpfulrobot | 80a4773cce | |
Daniel Hensby | ebfa07dc5f | |
Daniel Hensby | 9cb2a79f8d | |
helpfulrobot | 8e14595f1a | |
helpfulrobot | 03de223162 | |
Daniel Hensby | 80f61a21be | |
Cam Findlay | 7b3fb280c6 | |
Christopher Pitt | 4c955bde13 | |
Damian Mooyman | 1e8581d7f8 | |
Daniel Hensby | e67fb97672 | |
Damian Mooyman | 832437e4bf | |
Loz Calver | 9ea4b79543 | |
Damian Mooyman | 9b36af2791 | |
Christopher Pitt | fbc31692e7 | |
Damian Mooyman | fd917f04a1 | |
Ingo Schommer | da6c554acb | |
Ingo Schommer | 15f9647bca | |
Damian Mooyman | c9d74f83db | |
Damian Mooyman | 6cf09f26c8 | |
Damian Mooyman | 6c7ffa2c6f | |
Damian Mooyman | 1f4083dda4 | |
Ingo Schommer | 8aca06aef2 | |
Ingo Schommer | 72ce8fc0bc | |
Christopher Pitt | adb71a7823 | |
Damian Mooyman | 3ffb303a0b | |
Ingo Schommer | 62637c6197 | |
Damian Mooyman | 98fd4228f9 | |
cam-findlay | a34c443be5 |
|
@ -0,0 +1,17 @@
|
|||
# For more information about the properties used in this file,
|
||||
# please see the EditorConfig documentation:
|
||||
# http://editorconfig.org
|
||||
|
||||
[*]
|
||||
charset = utf-8
|
||||
end_of_line = lf
|
||||
indent_size = 4
|
||||
indent_style = space
|
||||
insert_final_newline = true
|
||||
trim_trailing_whitespace = true
|
||||
|
||||
[{*.yml,package.json}]
|
||||
indent_size = 2
|
||||
|
||||
# The indent size used in the package.json file cannot be changed:
|
||||
# https://github.com/npm/npm/pull/3180#issuecomment-16336516
|
|
@ -0,0 +1,7 @@
|
|||
/tests export-ignore
|
||||
/docs export-ignore
|
||||
/.gitattributes export-ignore
|
||||
/.gitignore export-ignore
|
||||
/.travis.yml export-ignore
|
||||
/.scrutinizer.yml export-ignore
|
||||
/codecov.yml export-ignore
|
|
@ -0,0 +1,11 @@
|
|||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
pull_request:
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
ci:
|
||||
name: CI
|
||||
uses: silverstripe/gha-ci/.github/workflows/ci.yml@v1
|
|
@ -0,0 +1,16 @@
|
|||
name: Dispatch CI
|
||||
|
||||
on:
|
||||
# At 12:20 PM UTC, only on Saturday and Sunday
|
||||
schedule:
|
||||
- cron: '20 12 * * 6,0'
|
||||
|
||||
jobs:
|
||||
dispatch-ci:
|
||||
name: Dispatch CI
|
||||
# Only run cron on the silverstripe account
|
||||
if: (github.event_name == 'schedule' && github.repository_owner == 'silverstripe') || (github.event_name != 'schedule')
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Dispatch CI
|
||||
uses: silverstripe/gha-dispatch-ci@v1
|
|
@ -0,0 +1,17 @@
|
|||
name: Keepalive
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
# The 4th of every month at 10:50am UTC
|
||||
schedule:
|
||||
- cron: '50 10 4 * *'
|
||||
|
||||
jobs:
|
||||
keepalive:
|
||||
name: Keepalive
|
||||
# Only run cron on the silverstripe account
|
||||
if: (github.event_name == 'schedule' && github.repository_owner == 'silverstripe') || (github.event_name != 'schedule')
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Keepalive
|
||||
uses: silverstripe/gha-keepalive@v1
|
23
.travis.yml
23
.travis.yml
|
@ -1,23 +0,0 @@
|
|||
# See https://github.com/silverstripe-labs/silverstripe-travis-support for setup details
|
||||
|
||||
language: php
|
||||
php:
|
||||
- 5.4
|
||||
|
||||
env:
|
||||
- DB=MYSQL CORE_RELEASE=3.1
|
||||
- DB=MYSQL CORE_RELEASE=3
|
||||
|
||||
before_script:
|
||||
- mkdir -p $HOME/bin
|
||||
- export PATH=$PATH:$HOME/bin
|
||||
- export SS_TIKA_ENDPOINT="http://localhost:9998/"
|
||||
- ./.travis/install_tika.sh
|
||||
- sudo ./.travis/install_pdftotext.sh
|
||||
- git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support
|
||||
- php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss
|
||||
- cd ~/builds/ss
|
||||
|
||||
script:
|
||||
- ($HOME/bin/tika-rest-server &) &> /dev/null
|
||||
- vendor/bin/phpunit --verbose textextraction/tests/
|
|
@ -1,3 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
apt-get update
|
||||
apt-get install -y xpdf
|
|
@ -0,0 +1,14 @@
|
|||
mappings:
|
||||
FileTextExtractable: SilverStripe\TextExtraction\Extension\FileTextExtractable
|
||||
FileTextCache: SilverStripe\TextExtraction\Cache\FileTextCache
|
||||
FileTextCache_Cache: SilverStripe\TextExtraction\Cache\FileTextCache\Cache
|
||||
FileTextCache_Database: SilverStripe\TextExtraction\Cache\FileTextCache\Database
|
||||
FileTextExtractor: SilverStripe\TextExtraction\Extractor\FileTextExtractor
|
||||
FileTextExtractor_Exception: SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception
|
||||
HTMLTextExtractor: SilverStripe\TextExtraction\Extractor\HTMLTextExtractor
|
||||
PDFTextExtractor: SilverStripe\TextExtraction\Extractor\PDFTextExtractor
|
||||
SolrCellTextExtractor: SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor
|
||||
TikaServerTextExtractor: SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor
|
||||
TikaTextExtractor: SilverStripe\TextExtraction\Extractor\TikaTextExtractor
|
||||
TikaRestClient: SilverStripe\TextExtraction\Rest\TikaRestClient
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
# Contributing
|
||||
|
||||
- Maintenance on this module is a shared effort of those who use it
|
||||
- To contribute improvements to the code, ensure you raise a pull request and discuss with the module maintainers
|
||||
- Please follow the SilverStripe [code contribution guidelines](https://docs.silverstripe.org/en/contributing/code/) and [Module Standard](https://docs.silverstripe.org/en/developer_guides/extending/modules/#module-standard)
|
||||
- Supply documentation that follows the [GitHub Flavored Markdown](https://help.github.com/articles/markdown-basics/) conventions
|
||||
- When having discussions about this module in issues or pull requests, please adhere to the [SilverStripe Community Code of Conduct](https://docs.silverstripe.org/en/contributing/code_of_conduct/)
|
||||
|
||||
|
||||
## Contributor license agreement
|
||||
By supplying code to this module in patches, tickets and pull requests, you agree to assign copyright
|
||||
of that code to SilverStripe Ltd., on the condition that these code changes are released under the
|
||||
same BSD license as the original module. We ask for this so that the ownership in the license is clear
|
||||
and unambiguous. By releasing this code under a permissive license such as BSD, this copyright assignment
|
||||
won't prevent you from using the code in any way you see fit.
|
24
LICENSE
24
LICENSE
|
@ -1,24 +0,0 @@
|
|||
* Copyright (c) 2010-2012, SilverStripe Ltd.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
* * Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* * Neither the name of the <organization> nor the
|
||||
* names of its contributors may be used to endorse or promote products
|
||||
* derived from this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY SilverStripe Ltd. ``AS IS'' AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL SilverStripe Ltd. BE LIABLE FOR ANY
|
||||
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
190
README.md
190
README.md
|
@ -1,26 +1,15 @@
|
|||
# Text Extraction Module
|
||||
# Text extraction module
|
||||
|
||||
[![Build Status](https://secure.travis-ci.org/silverstripe-labs/silverstripe-textextraction.png)](http://travis-ci.org/silverstripe-labs/silverstripe-textextraction)
|
||||
[![CI](https://github.com/silverstripe/silverstripe-textextraction/actions/workflows/ci.yml/badge.svg)](https://github.com/silverstripe/silverstripe-textextraction/actions/workflows/ci.yml)
|
||||
[![Silverstripe supported module](https://img.shields.io/badge/silverstripe-supported-0071C4.svg)](https://www.silverstripe.org/software/addons/silverstripe-commercially-supported-module-list/)
|
||||
|
||||
## Overview
|
||||
Provides a text extraction API for file content, that can hook into different extractor
|
||||
engines based on availability and the parsed file format. The output returned is always a string of the file content.
|
||||
|
||||
Provides an extraction API for file content, which can hook into different extractor
|
||||
engines based on availability and the parsed file format.
|
||||
The output is always a string: the file content.
|
||||
|
||||
Via the `FileTextExtractable` extension, this logic can be used to
|
||||
Via the `FileTextExtractable` extension, this logic can be used to
|
||||
cache the extracted content on a `DataObject` subclass (usually `File`).
|
||||
|
||||
Note: Previously part of the [sphinx module](https://github.com/silverstripe/silverstripe-sphinx).
|
||||
|
||||
## Requirements
|
||||
|
||||
* SilverStripe 3.1
|
||||
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
|
||||
* (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
|
||||
* (optional) [Apache Tika](http://tika.apache.org/)
|
||||
|
||||
### Supported Formats
|
||||
The module supports text extraction on the following file formats:
|
||||
|
||||
* HTML (built-in)
|
||||
* PDF (with XPDF or Solr)
|
||||
|
@ -31,158 +20,43 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
|
|||
* EPub (Solr)
|
||||
* Many others (Tika)
|
||||
|
||||
## Requirements
|
||||
|
||||
* Silverstripe ^4.0
|
||||
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
|
||||
* (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
|
||||
* (optional) [Apache Tika](http://tika.apache.org/)
|
||||
|
||||
## Installation
|
||||
|
||||
The recommended installation is through [composer](http://getcomposer.org).
|
||||
Add the following to your `composer.json`:
|
||||
|
||||
```js
|
||||
{
|
||||
"require": {
|
||||
"silverstripe/textextraction": "2.0.x-dev"
|
||||
}
|
||||
}
|
||||
```
|
||||
composer require silverstripe/textextraction
|
||||
```
|
||||
|
||||
The module depends on the [Guzzle HTTP Library](http://guzzlephp.org),
|
||||
which is automatically checked out by composer. Alternatively, install Guzzle
|
||||
through PEAR and ensure it's in your `include_path`.
|
||||
|
||||
## Configuration
|
||||
## Documentation
|
||||
|
||||
### Basic
|
||||
* [Configuration](docs/en/configuration.md)
|
||||
* [Developer documentation](/docs/en/developer-docs.md)
|
||||
|
||||
By default, only extraction from HTML documents is supported.
|
||||
No configuration is required for that, unless you want to make
|
||||
the content available through your `DataObject` subclass.
|
||||
In this case, add the following to `mysite/_config/config.yml`:
|
||||
## Bugtracker
|
||||
|
||||
```yaml
|
||||
File:
|
||||
extensions:
|
||||
- FileTextExtractable
|
||||
```
|
||||
Bugs are tracked in the issues section of this repository. Before submitting an issue please read over
|
||||
existing issues to ensure yours is unique.
|
||||
|
||||
### XPDF
|
||||
If the issue does look like a new bug:
|
||||
|
||||
PDFs require special handling, for example through the [XPDF](http://www.foolabs.com/xpdf/)
|
||||
commandline utility. Follow their installation instructions, its presence will be automatically
|
||||
detected. You can optionally set the binary path in `mysite/_config/config.yml`:
|
||||
- Create a new issue
|
||||
- Describe the steps required to reproduce your issue, and the expected outcome. Unit tests, screenshots
|
||||
and screencasts can help here.
|
||||
- Describe your environment in as much detail as possible: Silverstripe version, Browser, PHP version,
|
||||
Operating System, any installed Silverstripe modules.
|
||||
|
||||
```yml
|
||||
PDFTextExtractor:
|
||||
binary_location: /my/path/pdftotext
|
||||
```
|
||||
Please report security issues to security@silverstripe.org directly. Please don't file security issues in the bugtracker.
|
||||
|
||||
### Apache Solr
|
||||
|
||||
Apache Solr is a fulltext search engine, an aspect which is often used
|
||||
alongside this module. But more importantly for us, it has bindings to [Apache Tika](http://tika.apache.org/)
|
||||
through the [ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) interface.
|
||||
This allows Solr to inspect the contents of various file formats, such as Office documents and PDF files.
|
||||
The textextraction module retrieves the output of this service, rather than altering the index.
|
||||
With the raw text output, you can decide to store it in a database column for fulltext search
|
||||
in your database driver, or even pass it back to Solr as part of a full index update.
|
||||
|
||||
In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`):
|
||||
|
||||
```yml
|
||||
SolrCellTextExtractor:
|
||||
base_url: 'http://localhost:8983/solr/update/extract'
|
||||
```
|
||||
|
||||
Note that in case you're using multiple cores, you'll need to add the core name to the URL
|
||||
(e.g. 'http://localhost:8983/solr/PageSolrIndex/update/extract').
|
||||
The ["fulltext" module](https://github.com/silverstripe-labs/silverstripe-fulltextsearch)
|
||||
uses multiple cores by default, and comes prepackaged with a Solr server.
|
||||
It's a stripped-down version of Solr; follow the module README on how to add
|
||||
Apache Tika text extraction capabilities.
|
||||
|
||||
You need to ensure that some indexable property on your object
|
||||
returns the contents, either by directly accessing `FileTextExtractable->extractFileAsText()`,
|
||||
or by writing your own method around `FileTextExtractor->getContent()` (see "Usage" below).
|
||||
The property should be listed in your `SolrIndex` subclass, e.g. as follows:
|
||||
|
||||
```php
|
||||
class MyDocument extends DataObject {
|
||||
static $db = array('Path' => 'Text');
|
||||
function getContent() {
|
||||
$extractor = FileTextExtractor::for_file($this->Path);
|
||||
return $extractor ? $extractor->getContent($this->Path) : null;
|
||||
}
|
||||
}
|
||||
class MySolrIndex extends SolrIndex {
|
||||
function init() {
|
||||
$this->addClass('MyDocument');
|
||||
$this->addStoredField('Content', 'HTMLText');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Note: This isn't a terribly efficient way to process large amounts of files, since
|
||||
each HTTP request is run synchronously.
|
||||
|
||||
### Tika
|
||||
|
||||
Support for Apache Tika (1.7 and above) is included. This can be run in one of two ways: Server or CLI.
|
||||
|
||||
See [the Apache Tika home page](http://tika.apache.org/1.7/index.html) for instructions on installing and
|
||||
configuring this. Download the latest `tika-app` for running as a CLI script, or `tika-server` if you're planning
|
||||
to have it running constantly in the background. Starting tika as a CLI script for every extraction request
|
||||
is fairly slow, so we recommend running it as a server.
|
||||
|
||||
This extension will best work with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php)
|
||||
installed to perform mime detection. Tika validates support via mime type rather than file extensions.
|
||||
|
||||
### Tika - CLI
|
||||
|
||||
Ensure that your machine has a 'tika' command available which will run the CLI script.
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
exec java -jar tika-app-1.8.jar "$@"
|
||||
```
|
||||
|
||||
### Tika Rest Server
|
||||
|
||||
Tika can also be run as a server. You can configure your server endpoint by setting the url via config.
|
||||
|
||||
```yaml
|
||||
TikaServerTextExtractor:
|
||||
server_endpoint: 'http://localhost:9998'
|
||||
```
|
||||
|
||||
Alternatively this may be specified via the `SS_TIKA_ENDPOINT` directive in your `_ss_environment.php` file, or an environment variable of the same name.
|
||||
|
||||
|
||||
Then startup your server as below
|
||||
|
||||
```bash
|
||||
java -jar tika-server-1.8.jar --host=localhost --port=9998
|
||||
```
|
||||
|
||||
While you can run `tika-app-1.8.jar` in server mode as well (with the `--server` flag),
|
||||
it behaves differently and is not recommended.
|
||||
|
||||
## Usage
|
||||
|
||||
Manual extraction:
|
||||
|
||||
```php
|
||||
$myFile = '/my/path/myfile.pdf';
|
||||
$extractor = FileTextExtractor::for_file($myFile);
|
||||
$content = $extractor->getContent($myFile);
|
||||
```
|
||||
|
||||
Extraction with `FileTextExtractable` extension applied:
|
||||
|
||||
```php
|
||||
$myFileObj = File::get()->First();
|
||||
$content = $myFileObj->getFileContent();
|
||||
```
|
||||
|
||||
This content can also be embedded directly within a template.
|
||||
|
||||
```
|
||||
$MyFile.FileContent
|
||||
```
|
||||
## Development and contribution
|
||||
If you would like to make contributions to the module please ensure you raise a pull request and discuss
|
||||
with the module maintainers.
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
---
|
||||
Name: textextractioncache
|
||||
After:
|
||||
- '#corecache'
|
||||
---
|
||||
SilverStripe\Core\Injector\Injector:
|
||||
Psr\SimpleCache\CacheInterface.FileTextCache_Cache:
|
||||
factory: SilverStripe\Core\Cache\CacheFactory
|
||||
constructor:
|
||||
namespace: 'FileTextCache_Cache'
|
|
@ -1,2 +1,10 @@
|
|||
SolrCellTextExtractor:
|
||||
# base_url: 'http://localhost:8983/solr/update/extract'
|
||||
---
|
||||
Name: textextractionconfig
|
||||
---
|
||||
SilverStripe\Core\Injector\Injector:
|
||||
# Define default FileTextCache implementation
|
||||
SilverStripe\TextExtraction\Cache\FileTextCache:
|
||||
class: SilverStripe\TextExtraction\Cache\FileTextCache\Database
|
||||
|
||||
SilverStripe\TextExtraction\Cache\FileTextCache\Database:
|
||||
max_content_length: 500000
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
When having discussions about this module in issues or pull requests, please adhere to the [SilverStripe Community Code of Conduct](https://docs.silverstripe.org/en/contributing/code_of_conduct).
|
|
@ -1,54 +0,0 @@
|
|||
<?php
|
||||
|
||||
/**
 * Extension for File (or a File derivative) that exposes the text content of the underlying file.
 * The extraction itself is delegated to whichever FileTextExtractor handles the file.
 *
 * Contributes a FileContentCache field which stores the extracted text and is filled lazily,
 * the first time the content is requested.
 *
 * @author mstephens
 */
class FileTextExtractable extends DataExtension {

    private static $db = array(
        'FileContentCache' => 'Text'
    );

    private static $casting = array(
        'FileContent' => 'Text'
    );

    /**
     * Template-friendly accessor for the extracted text.
     *
     * @return string
     */
    public function getFileContent() {
        return $this->extractFileAsText();
    }

    /**
     * Returns the text of the owner's file, provided an extractor exists for its type.
     * A freshly extracted value is stored on the owner record, which is then written.
     *
     * @param boolean $disableCache If false, a previously cached value is returned when present and
     *                              the file is only parsed when no cached value exists.
     *                              If true, the file is always parsed again, bypassing the cache.
     * @return string Extracted text, or null when no extractor matches or nothing was extracted
     */
    public function extractFileAsText($disableCache = false) {
        $record = $this->owner;

        // Serve the cached copy unless the caller explicitly asked for a fresh parse.
        if (!$disableCache && $record->FileContentCache) {
            return $record->FileContentCache;
        }

        // Find an extractor able to process this file.
        $handler = FileTextExtractor::for_file($record->FullPath);
        if (!$handler) {
            return null;
        }

        $extracted = $handler->getContent($record->FullPath);
        if (!$extracted) {
            // Nothing extracted: leave the cache untouched and do not write the record.
            return null;
        }

        // Remember the result on the record for subsequent calls.
        $record->FileContentCache = $extracted;
        $record->write();

        return $extracted;
    }
}
|
|
@ -1,133 +0,0 @@
|
|||
<?php
|
||||
|
||||
/**
 * Base class for text extractors. Each concrete subclass reports whether it is usable in the
 * current environment, which file extensions and mime types it handles, and how to turn a file
 * into plain text. Use {@link FileTextExtractor::for_file()} to pick an extractor for a path.
 *
 * @author mstephens
 */
abstract class FileTextExtractor extends Object {

    /**
     * Set priority from 0-100.
     * The highest priority extractor for a given content type will be selected.
     *
     * @config
     * @var integer
     */
    private static $priority = 50;

    /**
     * Cache of extractor class names, sorted by priority
     *
     * @var array
     */
    protected static $sorted_extractor_classes = null;

    /**
     * Gets the list of extractor classes, highest priority first.
     * The list is built once and then served from {@link $sorted_extractor_classes}.
     *
     * @return array
     */
    protected static function get_extractor_classes() {
        // Check cache
        if (self::$sorted_extractor_classes) {
            return self::$sorted_extractor_classes;
        }

        // Generate the sorted list of extractors on demand.
        $classes = ClassInfo::subclassesFor("FileTextExtractor");
        // Drop the first entry - presumably the abstract base class itself (TODO confirm
        // against ClassInfo::subclassesFor()).
        array_shift($classes);

        $classPriorities = array();
        foreach ($classes as $class) {
            $classPriorities[$class] = Config::inst()->get($class, 'priority');
        }
        // Sort by priority, descending, keeping the class names as keys.
        arsort($classPriorities);

        // Save classes
        self::$sorted_extractor_classes = array_keys($classPriorities);
        return self::$sorted_extractor_classes;
    }

    /**
     * Get the text file extractor for the given class
     *
     * @param string $class
     * @return FileTextExtractor
     */
    protected static function get_extractor($class) {
        return Injector::inst()->get($class);
    }

    /**
     * Attempt to detect mime type for given file
     *
     * @param string $path
     * @return string Mime type if found; null when the fileinfo extension (finfo) is not loaded
     */
    protected static function get_mime($path) {
        if (!class_exists('finfo')) {
            return null;
        }

        // Check mime of file
        $finfo = new finfo(FILEINFO_MIME_TYPE);
        return $finfo->file($path);
    }

    /**
     * Finds the highest priority available extractor that supports the given file, matching
     * on the file extension first and on the detected mime type second.
     *
     * @param string $path
     * @return FileTextExtractor|null Null when no available extractor supports the file
     */
    public static function for_file($path) {
        $extension = pathinfo($path, PATHINFO_EXTENSION);
        $mime = self::get_mime($path);

        foreach (self::get_extractor_classes() as $className) {
            $extractor = self::get_extractor($className);

            // Skip unavailable extractors
            if (!$extractor->isAvailable()) {
                continue;
            }

            // Check extension
            if ($extension && $extractor->supportsExtension($extension)) {
                return $extractor;
            }

            // Check mime
            if ($mime && $extractor->supportsMime($mime)) {
                return $extractor;
            }
        }

        // No extractor can handle this file; callers test the result for falsiness.
        return null;
    }

    /**
     * Checks if the extractor is supported on the current environment,
     * for example if the correct binaries or libraries are available.
     *
     * @return boolean
     */
    abstract public function isAvailable();

    /**
     * Determine if this extractor supports the given extension.
     * If support is determined by mime/type only, then this should return false.
     *
     * @param string $extension
     * @return boolean
     */
    abstract public function supportsExtension($extension);

    /**
     * Determine if this extractor supports the given mime type.
     * Will only be called if supportsExtension returns false.
     *
     * @param string $mime
     * @return boolean
     */
    abstract public function supportsMime($mime);

    /**
     * Given a file path, extract the contents as text.
     *
     * @param string $path
     * @return string
     */
    abstract public function getContent($path);
}
|
||||
|
||||
/**
 * Exception raised when a text extractor fails to process a file.
 */
class FileTextExtractor_Exception extends Exception
{
}
|
|
@ -1,73 +0,0 @@
|
|||
<?php
|
||||
|
||||
/**
 * Text extractor that uses php function strip_tags to get just the text. OK for indexing, not the best for readable text.
 *
 * @author mstephens
 */
class HTMLTextExtractor extends FileTextExtractor {

    /**
     * Lower priority because its not the most clever HTML extraction. If there is something better, use it
     *
     * @config
     * @var integer
     */
    private static $priority = 10;

    /**
     * Always available: only built-in PHP functions are used.
     *
     * @return boolean
     */
    public function isAvailable() {
        return true;
    }

    /**
     * @param string $extension
     * @return boolean True for html, htm and xhtml (case-insensitive)
     */
    public function supportsExtension($extension) {
        return in_array(
            strtolower($extension),
            array("html", "htm", "xhtml")
        );
    }

    /**
     * @param string $mime
     * @return boolean True for text/html (case-insensitive)
     */
    public function supportsMime($mime) {
        return strtolower($mime) === 'text/html';
    }

    /**
     * Extracts the text of an HTML file by using strip_tags()
     * combined with regular expressions to remove non-content elements like <style> or <script>,
     * as well as adding a line break in front of every opening and closing block tag, so that
     * text from adjacent blocks is not fused together once the tags are stripped.
     *
     * @param string $path
     * @return string
     */
    public function getContent($path) {
        $content = file_get_contents($path);

        // Yes, yes, regex'ing HTML is evil.
        // Since we don't care about well-formedness or markup here, it does the job.

        // Elements whose content is never visible: removed entirely (replaced by a space).
        $invisiblePatterns = array(
            '@<head[^>]*?>.*?</head>@siu',
            '@<style[^>]*?>.*?</style>@siu',
            '@<script[^>]*?.*?</script>@siu',
            '@<object[^>]*?.*?</object>@siu',
            '@<embed[^>]*?.*?</embed>@siu',
            '@<applet[^>]*?.*?</applet>@siu',
            '@<noframes[^>]*?.*?</noframes>@siu',
            '@<noscript[^>]*?.*?</noscript>@siu',
            '@<noembed[^>]*?.*?</noembed>@siu',
        );

        // Block-level tags: a line break is inserted before each opening and closing tag.
        $blockPatterns = array(
            '@</?((address)|(blockquote)|(center)|(del))@iu',
            '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
            '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
            '@</?((table)|(th)|(td)|(caption))@iu',
            '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
            '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
            '@</?((frameset)|(frame)|(iframe))@iu',
        );

        // Replacement lists are derived from the pattern lists so they cannot drift out of step.
        // "\n$0" re-emits the matched tag prefix preceded by a newline; a bare "$0" would be a
        // no-op and leave e.g. "<li>one</li><li>two</li>" as "onetwo" after strip_tags().
        $content = preg_replace(
            array_merge($invisiblePatterns, $blockPatterns),
            array_merge(
                array_fill(0, count($invisiblePatterns), ' '),
                array_fill(0, count($blockPatterns), "\n$0")
            ),
            $content
        );

        return strip_tags($content);
    }
}
|
|
@ -1,98 +0,0 @@
|
|||
<?php
|
||||
|
||||
/**
|
||||
* Text extractor that calls pdftotext to do the conversion.
|
||||
* @author mstephens
|
||||
*
|
||||
*/
|
||||
class PDFTextExtractor extends FileTextExtractor {
|
||||
|
||||
/**
 * Reports whether the pdftotext binary exists at the resolved location and can be executed.
 *
 * @return boolean
 */
public function isAvailable() {
    $binary = $this->bin('pdftotext');
    if (!file_exists($binary)) {
        return false;
    }
    return is_executable($binary);
}
|
||||
|
||||
/**
 * @param string $extension
 * @return boolean True only for the "pdf" extension (case-insensitive)
 */
public function supportsExtension($extension) {
    $normalised = strtolower($extension);
    return $normalised === 'pdf';
}
|
||||
|
||||
public function supportsMime($mime) {
|
||||
return in_array(
|
||||
strtolower($mime),
|
||||
array(
|
||||
'application/pdf',
|
||||
'application/x-pdf',
|
||||
'application/x-bzpdf',
|
||||
'application/x-gzpdf'
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Accessor to get the location of the binary
|
||||
*
|
||||
* @param string $prog Name of binary
|
||||
* @return string
|
||||
*/
|
||||
protected function bin($prog = '') {
|
||||
if ($this->config()->binary_location) {
|
||||
// By config
|
||||
$path = $this->config()->binary_location;
|
||||
} elseif (file_exists('/usr/bin/pdftotext')) {
|
||||
// By searching common directories
|
||||
$path = '/usr/bin';
|
||||
} elseif (file_exists('/usr/local/bin/pdftotext')) {
|
||||
$path = '/usr/local/bin';
|
||||
} else {
|
||||
$path = '.'; // Hope it's in path
|
||||
}
|
||||
|
||||
return ( $path ? $path . '/' : '' ) . $prog;
|
||||
}
|
||||
|
||||
public function getContent($path) {
|
||||
if(!$path) return ""; // no file
|
||||
$content = $this->getRawOutput($path);
|
||||
return $this->cleanupLigatures($content);
|
||||
}
|
||||
|
||||
/**
|
||||
* Invoke pdftotext with the given path
|
||||
*
|
||||
* @param string $path
|
||||
* @return string Output
|
||||
* @throws FileTextExtractor_Exception
|
||||
*/
|
||||
protected function getRawOutput($path) {
|
||||
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
|
||||
if($err) {
|
||||
throw new FileTextExtractor_Exception(sprintf(
|
||||
'PDFTextExtractor->getContent() failed for %s: %s',
|
||||
$path,
|
||||
implode('', $err)
|
||||
));
|
||||
}
|
||||
return implode('', $content);
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes utf-8 ligatures.
|
||||
*
|
||||
* @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
|
||||
*
|
||||
* @param string $input
|
||||
* @return string
|
||||
*/
|
||||
protected function cleanupLigatures($input) {
|
||||
$mapping = array(
|
||||
'ff' => 'ff',
|
||||
'fi' => 'fi',
|
||||
'fl' => 'fl',
|
||||
'ffi' => 'ffi',
|
||||
'ffl' => 'ffl',
|
||||
'ſt' => 'ft',
|
||||
'st' => 'st'
|
||||
);
|
||||
return str_replace(array_keys($mapping), array_values($mapping), $input);
|
||||
}
|
||||
}
|
|
@ -1,92 +0,0 @@
|
|||
<?php
|
||||
use Guzzle\Http\Client;
|
||||
|
||||
/**
|
||||
* Text extractor that calls an Apache Solr instance
|
||||
* and extracts content via the "ExtractingRequestHandler" endpoint.
|
||||
* Does not alter the Solr index itself, but uses it purely
|
||||
* for its file parsing abilities.
|
||||
*
|
||||
* @author ischommer
|
||||
* @see http://wiki.apache.org/solr/ExtractingRequestHandler
|
||||
*/
|
||||
class SolrCellTextExtractor extends FileTextExtractor {

	/**
	 * Base URL to use for solr text extraction.
	 * E.g. http://localhost:8983/solr/update/extract
	 *
	 * @config
	 * @var string
	 */
	private static $base_url;

	/**
	 * @config
	 * @var int
	 */
	private static $priority = 75;

	/**
	 * Lazily created HTTP client pointing at the configured base URL
	 */
	protected $httpClient;

	/**
	 * @return Client
	 * @throws InvalidArgumentException When no base_url is configured
	 */
	public function getHttpClient() {
		if(!$this->config()->get('base_url')) {
			throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
		}
		if(!$this->httpClient) $this->httpClient = new Client($this->config()->get('base_url'));
		return $this->httpClient;
	}

	/**
	 * Replace the HTTP client used for extraction requests.
	 *
	 * @param Client $client
	 */
	public function setHttpClient($client) {
		$this->httpClient = $client;
	}

	/**
	 * The extractor is considered available as soon as a base URL is configured.
	 *
	 * @return bool
	 */
	public function isAvailable() {
		// Always return a boolean; previously a configured URL fell through and returned null
		return (bool)$this->config()->get('base_url');
	}

	/**
	 * @param string $extension File extension without leading dot
	 * @return bool
	 */
	public function supportsExtension($extension) {
		return in_array(
			strtolower($extension),
			array(
				'pdf', 'doc', 'docx', 'xls', 'xlsx',
				'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
				'ppt', 'pptx', 'odp', 'fodp', 'csv'
			)
		);
	}

	/**
	 * @param string $mime
	 * @return bool Always false
	 */
	public function supportsMime($mime) {
		// Rely on supportsExtension
		return false;
	}

	/**
	 * Post the file to the configured endpoint and return the extracted plain text.
	 *
	 * @param string $path
	 * @return string|null Empty string when no path is given, null when the request
	 *                     raised an InvalidArgumentException or no text node was found
	 */
	public function getContent($path) {
		if (!$path) return ""; // no file

		$fileName = basename($path);
		$client = $this->getHttpClient();
		try {
			$request = $client
				->post()
				->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text'))
				->addPostFiles(array('myfile' => $path));
			$response = $request->send();
		} catch(InvalidArgumentException $e) {
			SS_Log::log(
				sprintf(
					'Error extracting text from "%s" (message: %s)',
					$path,
					$e->getMessage()
				),
				SS_Log::NOTICE
			);
			return null;
		}

		// Use preg match to avoid SimpleXML running out of memory on large text nodes.
		// The delimiter is passed to preg_quote() so the file name can never terminate the pattern.
		preg_match(
			sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName, '/')),
			(string)$response->getBody(),
			$matches
		);

		return $matches ? $matches[1] : null;
	}
}
|
|
@ -1,104 +0,0 @@
|
|||
<?php
|
||||
|
||||
/**
|
||||
* Enables text extraction of file content via the Tika Rest Server
|
||||
*
|
||||
* {@link http://tika.apache.org/1.7/gettingstarted.html}
|
||||
*/
|
||||
class TikaServerTextExtractor extends FileTextExtractor {

	/**
	 * Extractor priority; the comment in the original notes the Tika server is
	 * efficient and should be used straight away when it is available.
	 *
	 * @var integer
	 * @config
	 */
	private static $priority = 80;

	/**
	 * URL of the Tika server
	 *
	 * @var string
	 * @config
	 */
	private static $server_endpoint;

	/**
	 * Lazily created REST client
	 *
	 * @var TikaRestClient
	 */
	protected $client = null;

	/**
	 * Supported mime types, filled on the first supportsMime() call
	 *
	 * @var array
	 */
	protected $supportedMimes = array();

	/**
	 * Return the REST client, creating it through the Injector on first use.
	 *
	 * @return TikaRestClient
	 */
	public function getClient() {
		if (!$this->client) {
			$this->client = Injector::inst()->createWithArgs(
				'TikaRestClient',
				array($this->getServerEndpoint())
			);
		}
		return $this->client;
	}

	/**
	 * Resolve the endpoint: SS_TIKA_ENDPOINT constant first, then the environment
	 * variable of the same name, then the server_endpoint config value.
	 *
	 * @return string
	 */
	public function getServerEndpoint() {
		if (defined('SS_TIKA_ENDPOINT')) {
			return SS_TIKA_ENDPOINT;
		}

		if (getenv('SS_TIKA_ENDPOINT')) {
			return getenv('SS_TIKA_ENDPOINT');
		}

		// Nothing overrides it, so use the configured value
		return $this->config()->server_endpoint;
	}

	/**
	 * Get the version of tika installed, or 0 if not installed
	 *
	 * @return float version of tika
	 */
	public function getVersion() {
		$client = $this->getClient();
		return $client->getVersion();
	}

	/**
	 * Available when an endpoint is set, the client reports the service as
	 * reachable, and the reported version is at least 1.7.
	 *
	 * @return bool
	 */
	public function isAvailable() {
		return $this->getServerEndpoint()
			&& $this->getClient()->isAvailable()
			&& $this->getVersion() >= 1.7;
	}

	/**
	 * @param string $extension
	 * @return bool Always false
	 */
	public function supportsExtension($extension) {
		// Support is decided by mime type alone
		return false;
	}

	/**
	 * Check the given mime type against the server's supported types and their aliases.
	 *
	 * @param string $mime
	 * @return bool
	 */
	public function supportsMime($mime) {
		if (!$this->supportedMimes) {
			$this->supportedMimes = $this->getClient()->getSupportedMimes();
		}
		$known = $this->supportedMimes;

		// Direct key hit is the common and cheapest case
		if (isset($known[$mime])) {
			return true;
		}

		// Otherwise look through every type's alias list
		foreach ($known as $details) {
			if (isset($details['alias']) && in_array($mime, $details['alias'])) {
				return true;
			}
		}

		return false;
	}

	/**
	 * @param string $path
	 * @return string
	 */
	public function getContent($path) {
		$client = $this->getClient();
		return $client->tika($path);
	}

}
|
|
@ -1,94 +0,0 @@
|
|||
<?php
|
||||
|
||||
/**
|
||||
* Enables text extraction of file content via the Tika CLI
|
||||
*
|
||||
* {@link http://tika.apache.org/1.7/gettingstarted.html}
|
||||
*/
|
||||
class TikaTextExtractor extends FileTextExtractor {

	/**
	 * Command line switch selecting the output format; "-t" requests plain text.
	 *
	 * @var string
	 * @config
	 */
	private static $output_mode = '-t';

	/**
	 * Get the version of tika installed, or 0 if not installed
	 *
	 * @return string|int Version string parsed from "tika --version", or 0
	 */
	public function getVersion() {
		$exitCode = $this->runShell('tika --version', $stdout);

		// A zero exit code plus a recognisable banner yields the version
		$parsed = !$exitCode
			&& preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches);

		return $parsed ? $matches['version'] : 0;
	}

	/**
	 * Run a shell command, feeding it optional input and capturing both output streams.
	 * The command string is passed to proc_open() as given.
	 *
	 * @param string $command Full command including arguments
	 * @param string &$stdout Standard output
	 * @param string &$stderr Standard error
	 * @param string $input Content to pass via standard input
	 * @return int Exit code. 0 is success, 255 when the process could not be started
	 */
	protected function runShell($command, &$stdout = '', &$stderr = '', $input = '') {
		// stdin is read by the child; stdout and stderr are written by it
		$spec = array(
			0 => array("pipe", "r"),
			1 => array("pipe", "w"),
			2 => array("pipe", "w")
		);

		$pipes = array();
		$process = proc_open($command, $spec, $pipes);
		if (!is_resource($process)) {
			return 255;
		}

		// Feed stdin, then close it so the child sees end-of-input
		fwrite($pipes[0], $input);
		fclose($pipes[0]);

		// Drain stdout first, then stderr
		$stdout = stream_get_contents($pipes[1]);
		fclose($pipes[1]);
		$stderr = stream_get_contents($pipes[2]);
		fclose($pipes[2]);

		// proc_close() reports the exit code
		return proc_close($process);
	}

	/**
	 * Run tika on the file and return its output.
	 *
	 * @param string $path
	 * @return string|null Null when tika exits with a non-zero code
	 */
	public function getContent($path) {
		$command = sprintf(
			'tika %s %s',
			$this->config()->output_mode,
			escapeshellarg($path)
		);
		$exitCode = $this->runShell($command, $output);

		return $exitCode == 0 ? $output : null;
	}

	/**
	 * @return bool
	 */
	public function isAvailable() {
		$version = $this->getVersion();
		return $version > 0;
	}

	/**
	 * @param string $extension
	 * @return bool Always false
	 */
	public function supportsExtension($extension) {
		// Support is decided by mime type alone
		return false;
	}

	/**
	 * Check whether the mime type appears in the output of "tika --list-supported-types".
	 *
	 * @param string $mime
	 * @return bool
	 */
	public function supportsMime($mime) {
		$exitCode = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
		if ($exitCode) {
			// The listing could not be obtained
			return false;
		}

		// Whole-word match of the mime type within the listing
		$needle = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
		return (bool)preg_match($needle, $supportedTypes);
	}

}
|
|
@ -1,74 +0,0 @@
|
|||
<?php
|
||||
|
||||
use Guzzle\Http\Client;
|
||||
use Guzzle\Http\Exception\RequestException;
|
||||
|
||||
class TikaRestClient extends Client {

	/**
	 * Supported mime data, filled on the first getSupportedMimes() call
	 *
	 * @var array
	 */
	protected $mimes = array();

	/**
	 * Detect if the service is available
	 *
	 * @return bool True when a GET on the base URL answers with HTTP 200;
	 *              false when the request raises a RequestException
	 */
	public function isAvailable() {
		try {
			$response = $this->get()->send();
			return $response->getStatusCode() == 200;
		} catch (RequestException $ex) {
			return false;
		}
	}

	/**
	 * Get version code
	 *
	 * @return float Version parsed from the "version" resource, or 0.0
	 */
	public function getVersion() {
		$response = $this->get('version')->send();

		if ($response->getStatusCode() != 200) {
			return 0.0;
		}

		// Pick the version number out of the "Apache Tika x.y" banner
		if (!preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)) {
			return 0.0;
		}

		return (float)$matches['version'];
	}

	/**
	 * Gets supported mime data. May include aliased mime types.
	 *
	 * @return array
	 */
	public function getSupportedMimes() {
		if (!$this->mimes) {
			$request = $this->get(
				'mime-types',
				array('Accept' => 'application/json')
			);
			$this->mimes = $request->send()->json();
		}

		return $this->mimes;
	}

	/**
	 * Extract text content from a given file
	 *
	 * @param string $file Full filesystem path to a file to post
	 * @return string Content of the file extracted as plain text
	 */
	public function tika($file) {
		// The file's bytes are sent as the body of a PUT to the "tika" resource
		$request = $this->put(
			'tika',
			array('Accept' => 'text/plain'),
			file_get_contents($file)
		);
		$response = $request->send();

		return $response->getBody(true);
	}

}
|
|
@ -0,0 +1,3 @@
|
|||
comment: false
|
||||
codecov:
|
||||
branch: master
|
|
@ -1,35 +1,45 @@
|
|||
{
|
||||
"name": "silverstripe/textextraction",
|
||||
"type": "silverstripe-module",
|
||||
"description": "Text Extraction API for SilverStripe CMS (mostly used with 'fulltextsearch' module)",
|
||||
"homepage": "http://silverstripe.org",
|
||||
"license": "BSD-3-Clause",
|
||||
"keywords": ["silverstripe", "fulltext", "pdf"],
|
||||
"authors": [
|
||||
{
|
||||
"name": "SilverStripe",
|
||||
"homepage": "http://silverstripe.com"
|
||||
},
|
||||
{
|
||||
"name": "The SilverStripe Community",
|
||||
"homepage": "http://silverstripe.org"
|
||||
}
|
||||
],
|
||||
"require": {
|
||||
"php": ">=5.3.2",
|
||||
"composer/installers": "*",
|
||||
"silverstripe/framework": "~3.1",
|
||||
"guzzle/guzzle": "~3.9"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpunit/PHPUnit": "~3.7@stable"
|
||||
},
|
||||
"suggest": {
|
||||
"ext-fileinfo": "Improved support for file mime detection"
|
||||
},
|
||||
"extra": {
|
||||
"branch-alias": {
|
||||
"dev-master": "2.0.x-dev"
|
||||
}
|
||||
}
|
||||
}
|
||||
"name": "silverstripe/textextraction",
|
||||
"type": "silverstripe-vendormodule",
|
||||
"description": "Text Extraction API for SilverStripe CMS (mostly used with 'fulltextsearch' module)",
|
||||
"homepage": "http://silverstripe.org",
|
||||
"license": "BSD-3-Clause",
|
||||
"keywords": [
|
||||
"silverstripe",
|
||||
"fulltext",
|
||||
"pdf"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "SilverStripe",
|
||||
"homepage": "http://silverstripe.com"
|
||||
},
|
||||
{
|
||||
"name": "The SilverStripe Community",
|
||||
"homepage": "http://silverstripe.org"
|
||||
}
|
||||
],
|
||||
"require": {
|
||||
"php": "^7.4 || ^8.0",
|
||||
"silverstripe/framework": "^4.10",
|
||||
"silverstripe/assets": "^1",
|
||||
"silverstripe/versioned": "^1",
|
||||
"guzzlehttp/guzzle": "^6.3 || ^7.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"squizlabs/php_codesniffer": "^3",
|
||||
"phpunit/phpunit": "^9.5"
|
||||
},
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
"SilverStripe\\TextExtraction\\": "src/",
|
||||
"SilverStripe\\TextExtraction\\Tests\\": "tests/"
|
||||
}
|
||||
},
|
||||
"suggest": {
|
||||
"ext-fileinfo": "Improved support for file mime detection"
|
||||
},
|
||||
"extra": [],
|
||||
"minimum-stability": "dev",
|
||||
"prefer-stable": true
|
||||
}
|
|
@ -0,0 +1,160 @@
|
|||
# Configuration
|
||||
|
||||
## Basic
|
||||
|
||||
By default, only extraction from HTML documents is supported.
|
||||
No configuration is required for that, unless you want to make
|
||||
the content available through your `DataObject` subclass.
|
||||
In this case, add the following to `mysite/_config/config.yml`:
|
||||
|
||||
```yaml
|
||||
SilverStripe\Assets\File:
|
||||
extensions:
|
||||
- SilverStripe\TextExtraction\Extension\FileTextExtractable
|
||||
```
|
||||
|
||||
By default any extracted content will be cached against the database row. In order to stay within common size
|
||||
constraints for SQL queries required in this operation, the cache sets a maximum character length after which
|
||||
content gets truncated (default: 500000). You can configure this value through
|
||||
`SilverStripe\TextExtraction\Cache\FileTextCache\Database.max_content_length` in your YAML configuration.
|
||||
|
||||
Alternatively, extracted content can be cached using SS_Cache to prevent excessive database growth.
|
||||
In order to swap out the cache backend you can use the following yaml configuration.
|
||||
|
||||
```yaml
|
||||
---
|
||||
Name: mytextextraction
|
||||
After: '#textextraction'
|
||||
---
|
||||
SilverStripe\Core\Injector\Injector:
|
||||
SilverStripe\TextExtraction\Cache\FileTextCache:
|
||||
class: SilverStripe\TextExtraction\Cache\FileTextCache\Cache
|
||||
|
||||
SilverStripe\TextExtraction\Cache\FileTextCache\Cache:
|
||||
lifetime: 3600 # Number of seconds to cache content for
|
||||
```
|
||||
|
||||
## XPDF
|
||||
|
||||
PDFs require special handling, for example through the [XPDF](http://www.xpdfreader.com/)
|
||||
command-line utility. Follow its installation instructions; its presence will be automatically
|
||||
detected for \*nix operating systems. You can optionally set the binary path (required for Windows) in `mysite/_config/config.yml`:
|
||||
|
||||
```yml
|
||||
SilverStripe\TextExtraction\Extractor\PDFTextExtractor:
|
||||
binary_location: /my/path/pdftotext
|
||||
```
|
||||
|
||||
## Apache Solr
|
||||
|
||||
Apache Solr is a fulltext search engine, an aspect which is often used
|
||||
alongside this module. But more importantly for us, it has bindings to [Apache Tika](http://tika.apache.org/)
|
||||
through the [ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) interface.
|
||||
This allows Solr to inspect the contents of various file formats, such as Office documents and PDF files.
|
||||
The textextraction module retrieves the output of this service, rather than altering the index.
|
||||
With the raw text output, you can decide to store it in a database column for fulltext search
|
||||
in your database driver, or even pass it back to Solr as part of a full index update.
|
||||
|
||||
In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`):
|
||||
|
||||
```yml
|
||||
SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor:
|
||||
base_url: 'http://localhost:8983/solr/update/extract'
|
||||
```
|
||||
|
||||
Note that in case you're using multiple cores, you'll need to add the core name to the URL
|
||||
(e.g. 'http://localhost:8983/solr/PageSolrIndex/update/extract').
|
||||
The ["fulltext" module](https://github.com/silverstripe-labs/silverstripe-fulltextsearch)
|
||||
uses multiple cores by default, and comes prepackaged with a Solr server.
|
||||
It's a stripped-down version of Solr; follow the module README on how to add
|
||||
Apache Tika text extraction capabilities.
|
||||
|
||||
You need to ensure that some indexable property on your object
|
||||
returns the contents, either by directly accessing `FileTextExtractable->extractFileAsText()`,
|
||||
or by writing your own method around `FileTextExtractor->getContent()` (see "Usage" below).
|
||||
The property should be listed in your `SolrIndex` subclass, e.g. as follows:
|
||||
|
||||
```php
|
||||
use SilverStripe\ORM\DataObject;
|
||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
||||
|
||||
class MyDocument extends DataObject
|
||||
{
|
||||
private static $db = ['Path' => 'Text'];
|
||||
|
||||
public function getContent()
|
||||
{
|
||||
$extractor = FileTextExtractor::for_file($this->Path);
|
||||
return $extractor ? $extractor->getContent($this->Path) : null;
|
||||
}
|
||||
}
|
||||
|
||||
use SilverStripe\FullTextSearch\Solr\SolrIndex;
|
||||
|
||||
class MySolrIndex extends SolrIndex
|
||||
{
|
||||
public function init()
|
||||
{
|
||||
$this->addClass(MyDocument::class);
|
||||
$this->addStoredField('Content', 'HTMLText');
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Extractors will return content formatted with new line characters at the end of each extracted line. If you want
|
||||
this to be used in HTML content it may be worth wrapping the result in a `nl2br()` call before using it in your
|
||||
code.
|
||||
|
||||
Note: This isn't a terribly efficient way to process large amounts of files, since
|
||||
each HTTP request is run synchronously.
|
||||
|
||||
## Tika
|
||||
|
||||
Support for Apache Tika (1.8 and above) is included. This can be run in one of two ways: Server or CLI.
|
||||
|
||||
See [the Apache Tika home page](http://tika.apache.org/1.8/index.html) for instructions on installing and
|
||||
configuring this. Download the latest `tika-app` for running as a CLI script, or `tika-server` if you're planning
|
||||
to have it running constantly in the background. Starting tika as a CLI script for every extraction request
|
||||
is fairly slow, so we recommend running it as a server.
|
||||
|
||||
This extension will best work with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php)
|
||||
installed to perform mime detection. Tika validates support via mime type rather than file extensions.
|
||||
|
||||
## Tika - CLI
|
||||
|
||||
Ensure that your machine has a 'tika' command available which will run the CLI script.
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
exec java -jar tika-app-1.8.jar "$@"
|
||||
```
|
||||
|
||||
## Tika Rest Server
|
||||
|
||||
Tika can also be run as a server. You can configure your server endpoint by setting the url via config.
|
||||
|
||||
```yaml
|
||||
SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor:
|
||||
server_endpoint: 'http://localhost:9998'
|
||||
```
|
||||
|
||||
Alternatively this may be specified via the `SS_TIKA_ENDPOINT` environment variable in your `.env` file, or an
|
||||
environment variable of the same name.
|
||||
|
||||
|
||||
Then start up your server as below:
|
||||
|
||||
```bash
|
||||
java -jar tika-server-1.8.jar --host=localhost --port=9998
|
||||
```
|
||||
|
||||
While you can run `tika-app-1.8.jar` in server mode as well (with the `--server` flag),
|
||||
it behaves differently and is not recommended.
|
||||
|
||||
The module will log extraction errors with PSR-3 "notice" priority by default,
|
||||
for example a "422 Unprocessable Entity" HTTP response for an encrypted PDF.
|
||||
In case you want more information on why processing failed, you can increase
|
||||
the logging verbosity in the tika server instance by passing through
|
||||
a `--includeStack` flag. Logs can be passed on to files or external logging services,
|
||||
see [error handling](http://doc.silverstripe.org/en/developer_guides/debugging/error_handling)
|
||||
documentation for SilverStripe core.
|
|
@ -0,0 +1,32 @@
|
|||
# Developer documentation
|
||||
|
||||
## Usage
|
||||
|
||||
Manual extraction via string file path:
|
||||
|
||||
```php
|
||||
$myFile = '/my/path/myfile.pdf';
|
||||
$extractor = FileTextExtractor::for_file($myFile);
|
||||
$content = $extractor->getContent($myFile);
|
||||
```
|
||||
|
||||
Manual extraction via File object:
|
||||
|
||||
```php
|
||||
$myFile = File::get()->filter(['Name' => 'My file'])->first();
|
||||
$extractor = FileTextExtractor::for_file($myFile);
|
||||
$content = $extractor->getContent($myFile);
|
||||
```
|
||||
|
||||
Extraction with `FileTextExtractable` extension applied:
|
||||
|
||||
```php
|
||||
$myFileObj = File::get()->First();
|
||||
$content = $myFileObj->getFileContent();
|
||||
```
|
||||
|
||||
This content can also be embedded directly within a template.
|
||||
|
||||
```
|
||||
$MyFile.FileContent
|
||||
```
|
|
@ -0,0 +1,12 @@
|
|||
Copyright (c) 2018, SilverStripe Limited
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
|
||||
|
||||
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1,13 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<ruleset name="SilverStripe">
|
||||
<description>CodeSniffer ruleset for SilverStripe coding conventions.</description>
|
||||
|
||||
<file>src</file>
|
||||
<file>tests</file>
|
||||
|
||||
<!-- base rules are PSR-2 -->
|
||||
<rule ref="PSR2" >
|
||||
<!-- Current exclusions -->
|
||||
<exclude name="PSR1.Methods.CamelCapsMethodName.NotCamelCaps" />
|
||||
</rule>
|
||||
</ruleset>
|
|
@ -0,0 +1,17 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<phpunit bootstrap="vendor/silverstripe/framework/tests/bootstrap.php" colors="true">
|
||||
<testsuites>
|
||||
<testsuite name="Default">
|
||||
<directory>tests/</directory>
|
||||
</testsuite>
|
||||
</testsuites>
|
||||
|
||||
<filter>
|
||||
<whitelist addUncoveredFilesFromWhitelist="true">
|
||||
<directory suffix=".php">src/</directory>
|
||||
<exclude>
|
||||
<directory suffix=".php">tests/</directory>
|
||||
</exclude>
|
||||
</whitelist>
|
||||
</filter>
|
||||
</phpunit>
|
|
@ -0,0 +1,31 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Cache;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
|
||||
interface FileTextCache
{
    /**
     * Store the text extracted from the given file.
     *
     * @param File $file The file the content belongs to
     * @param string $content The extracted text
     */
    public function save(File $file, $content);

    /**
     * Fetch whatever extracted text is currently cached for the given file.
     *
     * @param File $file The file to look up
     */
    public function load(File $file);

    /**
     * Drop the cached text for the given file.
     * Called from the file's onBeforeWrite hook.
     *
     * @param File $file The file whose cached text should be discarded
     */
    public function invalidate(File $file);
}
|
|
@ -0,0 +1,107 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||
|
||||
use Psr\SimpleCache\CacheInterface;
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Core\Config\Configurable;
|
||||
use SilverStripe\Core\Flushable;
|
||||
use SilverStripe\Core\Injector\Injector;
|
||||
use SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||
|
||||
/**
|
||||
* Uses SS_Cache with a lifetime to cache extracted content
|
||||
*/
|
||||
class Cache implements FileTextCache, Flushable
{
    use Configurable;

    /**
     * How long an entry stays cached, in seconds.
     * A null (or otherwise empty) value falls back to 3600 (1 hour).
     *
     * @var int|null
     * @config
     */
    private static $lifetime = null;

    /**
     * Fetch the cache service registered as "<CacheInterface>.FileTextCache_Cache".
     *
     * @return CacheInterface
     */
    protected static function get_cache()
    {
        $serviceName = sprintf('%s.%s', CacheInterface::class, 'FileTextCache_Cache');
        $injector = Injector::inst();

        return $injector->get($serviceName);
    }

    /**
     * Build the cache key for a file: the md5 hash of its filename.
     *
     * @param File $file
     * @return string
     */
    protected function getKey(File $file)
    {
        $filename = $file->getFilename() ?? '';

        return md5($filename);
    }

    /**
     * Read the cached content for the given file.
     *
     * @param File $file
     * @return mixed
     */
    public function load(File $file)
    {
        return self::get_cache()->get($this->getKey($file));
    }

    /**
     * Write content to the cache under the file's key, using the configured lifetime.
     *
     * @param File $file
     * @param string $content
     * @return mixed Result of the underlying cache's set() call
     */
    public function save(File $file, $content)
    {
        $ttl = $this->config()->get('lifetime');
        if (!$ttl) {
            $ttl = 3600;
        }

        return self::get_cache()->set($this->getKey($file), $content, $ttl);
    }

    /**
     * Empty the whole cache.
     *
     * @return void
     */
    public static function flush()
    {
        self::get_cache()->clear();
    }

    /**
     * Empty the whole cache; does the same as flush().
     *
     * @return void
     */
    public static function clear()
    {
        self::get_cache()->clear();
    }

    /**
     * Remove the cached entry for the given file.
     *
     * @param File $file
     * @return bool
     */
    public function invalidate(File $file)
    {
        return self::get_cache()->delete($this->getKey($file));
    }
}
|
|
@ -0,0 +1,55 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Core\Config\Configurable;
|
||||
use SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||
|
||||
/**
|
||||
* Caches the extracted content on the record for the file.
|
||||
* Limits the stored file content by default to avoid hitting query size limits.
|
||||
*/
|
||||
class Database implements FileTextCache
{
    use Configurable;

    /**
     * Maximum size of the stored content; a falsy value disables truncation.
     *
     * @config
     * @var int
     */
    private static $max_content_length = null;

    /**
     * Return the content cached on the file record.
     *
     * @param File $file
     * @return string|null Value of the record's FileContentCache field
     */
    public function load(File $file)
    {
        return $file->FileContentCache;
    }

    /**
     * Store the content on the file record, truncated to max_content_length if configured,
     * and write the record.
     *
     * @param File $file
     * @param mixed $content
     */
    public function save(File $file, $content)
    {
        $maxLength = (int)$this->config()->get('max_content_length');

        if ($maxLength > 0) {
            $content = $this->truncate((string)$content, $maxLength);
        }

        $file->FileContentCache = $content;
        $file->write();
    }

    /**
     * Mark the cached content as stale by blanking the field on the record.
     *
     * @param File $file
     * @return void
     */
    public function invalidate(File $file)
    {
        // To prevent writing to the cache from invalidating it
        if (!$file->isChanged('FileContentCache')) {
            $file->FileContentCache = '';
        }
    }

    /**
     * Cut a string down to at most $maxLength bytes.
     *
     * When the mbstring extension is present, the cut is moved back to the nearest
     * character boundary so a multibyte UTF-8 character is never split in half
     * (a split character would leave an invalid byte sequence in the stored value).
     *
     * @param string $content
     * @param int $maxLength
     * @return string
     */
    private function truncate($content, $maxLength)
    {
        if (function_exists('mb_strcut')) {
            return mb_strcut($content, 0, $maxLength, 'UTF-8');
        }

        // Without mbstring, fall back to the plain byte-wise cut
        return substr($content, 0, $maxLength);
    }
}
|
|
@ -0,0 +1,123 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extension;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\ORM\DataExtension;
|
||||
use SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
||||
|
||||
/**
|
||||
* Decorate File or a File derivative to enable text extraction from the file content. Uses a set of subclasses of
|
||||
* FileTextExtractor to do the extraction based on the content type of the file.
|
||||
*
|
||||
* Adds an additional property which is the cached contents, which is populated on demand.
|
||||
*
|
||||
* @author mstephens
|
||||
*/
|
||||
class FileTextExtractable extends DataExtension
|
||||
{
|
||||
/**
|
||||
* @var array
|
||||
* @config
|
||||
*/
|
||||
private static $db = [
|
||||
'FileContentCache' => 'Text'
|
||||
];
|
||||
|
||||
/**
|
||||
* @var array
|
||||
* @config
|
||||
*/
|
||||
private static $casting = [
|
||||
'FileContent' => 'Text'
|
||||
];
|
||||
|
||||
/**
|
||||
* @var array
|
||||
* @config
|
||||
*/
|
||||
private static $dependencies = [
|
||||
'TextCache' => '%$' . FileTextCache::class,
|
||||
];
|
||||
|
||||
/**
|
||||
* @var FileTextCache
|
||||
*/
|
||||
protected $fileTextCache = null;
|
||||
|
||||
/**
|
||||
* @param FileTextCache $cache
|
||||
* @return $this
|
||||
*/
|
||||
public function setTextCache(FileTextCache $cache)
|
||||
{
|
||||
$this->fileTextCache = $cache;
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
 * The cache backend previously assigned through setTextCache() (null until one is assigned).
 *
 * @return FileTextCache
 */
public function getTextCache()
{
    return $this->fileTextCache;
}
|
||||
|
||||
/**
 * Template-friendly accessor: returns whatever extractFileAsText() yields with caching enabled.
 *
 * @return string
 */
public function getFileContent()
{
    $text = $this->extractFileAsText();

    return $text;
}
|
||||
|
||||
/**
 * Return the text content of the owning file, using the first extractor able to handle it.
 *
 * With caching enabled a non-empty cached value is returned straight away, and freshly extracted
 * text is written back to the cache. With caching disabled the cache is neither read nor written.
 *
 * @param boolean $disableCache If false, the file content is only parsed on demand.
 *                              If true, the content parsing is forced, bypassing
 *                              the cached version
 * @return string|null Null when no extractor matches or the extractor produced nothing
 */
public function extractFileAsText($disableCache = false)
{
    /** @var File $owner */
    $owner = $this->owner;
    $useCache = !$disableCache;

    if ($useCache) {
        $cached = $this->getTextCache()->load($owner);
        if ($cached) {
            return $cached;
        }
    }

    // Pick the extractor that can process this file, if there is one
    $extractor = FileTextExtractor::for_file($owner);
    if (!$extractor) {
        return null;
    }

    $extracted = $extractor->getContent($owner);
    if (!$extracted) {
        return null;
    }

    if ($useCache) {
        $this->getTextCache()->save($owner, $extracted);
    }

    return $extracted;
}
|
||||
|
||||
/**
 * Hand the owning record to the text cache for invalidation ahead of the write.
 *
 * @return void
 */
public function onBeforeWrite()
{
    $cache = $this->getTextCache();
    $cache->invalidate($this->owner);
}
|
||||
}
|
|
@ -0,0 +1,185 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Core\ClassInfo;
|
||||
use SilverStripe\Core\Config\Config;
|
||||
use SilverStripe\Core\Config\Configurable;
|
||||
use SilverStripe\Core\Injector\Injectable;
|
||||
use SilverStripe\Core\Injector\Injector;
|
||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
|
||||
|
||||
/**
|
||||
* A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents.
|
||||
* @author mstephens
|
||||
*/
|
||||
abstract class FileTextExtractor
|
||||
{
|
||||
use Configurable;
|
||||
use Injectable;
|
||||
|
||||
/**
|
||||
* Set priority from 0-100.
|
||||
* The highest priority extractor for a given content type will be selected.
|
||||
*
|
||||
* @config
|
||||
* @var integer
|
||||
*/
|
||||
private static $priority = 50;
|
||||
|
||||
/**
|
||||
* Cache of extractor class names, sorted by priority
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
protected static $sorted_extractor_classes = null;
|
||||
|
||||
/**
 * Build (once) and return the extractor class names ordered from highest to lowest priority.
 *
 * @return array
 */
protected static function get_extractor_classes()
{
    // Reuse the memoised list when a non-empty one has already been built
    if (self::$sorted_extractor_classes) {
        return self::$sorted_extractor_classes;
    }

    $candidates = ClassInfo::subclassesFor(__CLASS__);
    // Drop the first entry (presumably this base class itself; confirm against ClassInfo)
    array_shift($candidates);

    // Map each class name to its configured priority, then order by priority descending
    $priorityByClass = [];
    foreach ($candidates as $candidate) {
        $priorityByClass[$candidate] = Config::inst()->get($candidate, 'priority');
    }
    arsort($priorityByClass);

    self::$sorted_extractor_classes = array_keys($priorityByClass);

    return self::$sorted_extractor_classes;
}
|
||||
|
||||
/**
 * Resolve an extractor instance for a class name through the Injector.
 *
 * @param string $class
 * @return FileTextExtractor
 */
protected static function get_extractor($class)
{
    $injector = Injector::inst();

    return $injector->get($class);
}
|
||||
|
||||
/**
 * Choose the extractor that should handle a file.
 *
 * Extractors are tried in priority order. Unavailable ones are skipped; for the rest the file
 * extension is checked first and the mime type is only consulted when the extension did not match.
 *
 * @param File|string $file A File record, or a path to a file on disk
 * @return FileTextExtractor|null Null for an empty argument, a missing path, or when nothing matches
 */
public static function for_file($file)
{
    if (!$file) {
        return null;
    }

    // A path is wrapped in a File object so extension and mime type can be read the same way
    if (is_string($file)) {
        if (!file_exists($file)) {
            return null;
        }

        /** @var File $fileObject */
        $fileObject = File::create();
        $fileObject->setFromLocalFile($file);
        $file = $fileObject;
    }

    $extension = $file->getExtension();
    $mime = $file->getMimeType();

    foreach (self::get_extractor_classes() as $className) {
        $candidate = self::get_extractor($className);

        if (!$candidate->isAvailable()) {
            continue;
        }

        $matchesExtension = $extension && $candidate->supportsExtension($extension);
        if ($matchesExtension || ($mime && $candidate->supportsMime($mime))) {
            return $candidate;
        }
    }

    return null;
}
|
||||
|
||||
/**
 * Some text extractors (like pdftotext) may require a physical file to read from, so write the current
 * file contents to a temp file and return its path.
 *
 * The caller is responsible for deleting the returned file once it is done with it.
 *
 * @param File $file
 * @return string Path of the temporary copy (carries the file's extension when it has one)
 * @throws Exception When the temporary name cannot be allocated or the content cannot be written
 */
protected static function getPathFromFile(File $file)
{
    // tempnam() does not just pick a name, it also creates an empty file under that name
    $placeholder = tempnam(TEMP_PATH, 'pdftextextractor_');
    if (false === $placeholder) {
        throw new Exception(static::class . '->getPathFromFile() could not allocate temporary file name');
    }

    // Append extension to temp file if one is set
    $path = $placeholder;
    $extension = $file->getExtension();
    if ($extension) {
        $path .= '.' . $extension;
    }

    // Remove any existing temp files with this name
    if (file_exists($path)) {
        unlink($path);
    }

    $bytesWritten = file_put_contents($path, $file->getStream());

    // With an extension the content lives at a different path than the placeholder, which
    // nobody else knows about: remove it here so it does not pile up in the temp directory.
    if ($path !== $placeholder && file_exists($placeholder)) {
        unlink($placeholder);
    }

    if (false === $bytesWritten) {
        throw new Exception(static::class . '->getPathFromFile() failed to write temporary file');
    }

    return $path;
}
|
||||
|
||||
/**
|
||||
* Checks if the extractor is supported on the current environment,
|
||||
* for example if the correct binaries or libraries are available.
|
||||
*
|
||||
* @return boolean
|
||||
*/
|
||||
abstract public function isAvailable();
|
||||
|
||||
/**
|
||||
* Determine if this extractor supports the given extension.
|
||||
* If support is determined by mime/type only, then this should return false.
|
||||
*
|
||||
* @param string $extension
|
||||
* @return boolean
|
||||
*/
|
||||
abstract public function supportsExtension($extension);
|
||||
|
||||
/**
|
||||
* Determine if this extractor supports the given mime type.
|
||||
* Will only be called if supportsExtension returns false.
|
||||
*
|
||||
* @param string $mime
|
||||
* @return boolean
|
||||
*/
|
||||
abstract public function supportsMime($mime);
|
||||
|
||||
/**
 * Given a File instance, extract the contents as text.
 *
 * Implementations are not guaranteed to hand back a non-empty string: extractors in this
 * package return an empty string for a missing file and null when extraction fails, so callers
 * should treat any falsy result as "no text available".
 *
 * @param File|string $file Either the File instance, or a file path for a file to load
 * @return string|null
 */
|
||||
abstract public function getContent($file);
|
||||
}
|
|
@ -0,0 +1,7 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
||||
|
||||
/**
 * Exception type raised by text extractors, so callers can catch extraction failures
 * separately from other exceptions. Adds no behaviour of its own.
 */
class Exception extends \Exception
{
}
|
|
@ -0,0 +1,90 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
|
||||
/**
|
||||
* Text extractor that uses php function strip_tags to get just the text. OK for indexing, not
|
||||
* the best for readable text.
|
||||
*
|
||||
* @author mstephens
|
||||
*/
|
||||
class HTMLTextExtractor extends FileTextExtractor
|
||||
{
|
||||
/**
|
||||
* Lower priority because its not the most clever HTML extraction. If there is something better, use it
|
||||
*
|
||||
* @config
|
||||
* @var integer
|
||||
*/
|
||||
private static $priority = 10;
|
||||
|
||||
/**
 * Always usable: this extractor reports itself as available unconditionally.
 *
 * @return boolean
 */
public function isAvailable()
{
    return true;
}
|
||||
|
||||
/**
 * Whether the extension (compared case-insensitively) is one of html, htm or xhtml.
 *
 * @param string $extension
 * @return boolean
 */
public function supportsExtension($extension)
{
    $normalised = strtolower($extension ?? '');

    return in_array($normalised, ['html', 'htm', 'xhtml']);
}
|
||||
|
||||
/**
 * Whether the mime type (compared case-insensitively) is exactly text/html.
 *
 * @param string $mime
 * @return boolean
 */
public function supportsMime($mime)
{
    $normalised = strtolower($mime ?? '');

    return $normalised === 'text/html';
}
|
||||
|
||||
/**
 * Reduce an HTML document to its text.
 *
 * Non-content elements (<head>, <style>, <script>, ...) are replaced by a single space, a line
 * break is inserted in front of every opening or closing block-level tag so that neighbouring
 * blocks do not run together, and the remaining markup is removed with strip_tags().
 *
 * @param File|string $file File record, or a filesystem path to read from
 * @return string
 */
public function getContent($file)
{
    $content = $file instanceof File ? $file->getString() : file_get_contents($file ?? '');

    // Yes, yes, regex'ing HTML is evil.
    // Since we don't care about well-formedness or markup here, it does the job.

    // Remove invisible content
    $invisiblePatterns = [
        '@<head[^>]*?>.*?</head>@siu',
        '@<style[^>]*?>.*?</style>@siu',
        '@<script[^>]*?.*?</script>@siu',
        '@<object[^>]*?.*?</object>@siu',
        '@<embed[^>]*?.*?</embed>@siu',
        '@<applet[^>]*?.*?</applet>@siu',
        '@<noframes[^>]*?.*?</noframes>@siu',
        '@<noscript[^>]*?.*?</noscript>@siu',
        '@<noembed[^>]*?.*?</noembed>@siu',
    ];

    // Add line breaks before and after blocks
    $blockPatterns = [
        '@</?((address)|(blockquote)|(center)|(del))@iu',
        '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
        '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
        '@</?((table)|(th)|(td)|(caption))@iu',
        '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
        '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
        '@</?((frameset)|(frame)|(iframe))@iu',
    ];

    // One replacement per pattern. For block tags the matched text ($0) is kept and a newline is
    // put in front of it; a bare "$0" would leave the input unchanged and glue blocks together.
    $replacements = array_merge(
        array_fill(0, count($invisiblePatterns), ' '),
        array_fill(0, count($blockPatterns), "\n\$0")
    );

    $content = preg_replace(
        array_merge($invisiblePatterns, $blockPatterns),
        $replacements,
        $content ?? ''
    );

    return strip_tags($content ?? '');
}
|
||||
}
|
|
@ -0,0 +1,146 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
|
||||
|
||||
/**
|
||||
* Text extractor that calls pdftotext to do the conversion.
|
||||
* @author mstephens
|
||||
*/
|
||||
class PDFTextExtractor extends FileTextExtractor
|
||||
{
|
||||
/**
|
||||
* Set to bin path this extractor can execute
|
||||
*
|
||||
* @var string
|
||||
*/
|
||||
private static $binary_location = null;
|
||||
|
||||
/**
|
||||
* Used if binary_location isn't set.
|
||||
* List of locations to search for a given binary in
|
||||
*
|
||||
* @config
|
||||
* @var array
|
||||
*/
|
||||
private static $search_binary_locations = [
|
||||
'/usr/bin',
|
||||
'/usr/local/bin',
|
||||
];
|
||||
|
||||
/**
 * Whether a pdftotext binary was located and can be executed.
 *
 * @return boolean
 */
public function isAvailable()
{
    $binary = $this->bin('pdftotext');
    if (!$binary) {
        return false;
    }

    return file_exists($binary) && is_executable($binary);
}
|
||||
|
||||
/**
 * Whether the extension (compared case-insensitively) is "pdf".
 *
 * @param string $extension
 * @return boolean
 */
public function supportsExtension($extension)
{
    $normalised = strtolower($extension ?? '');

    return $normalised === 'pdf';
}
|
||||
|
||||
/**
 * Whether the mime type (compared case-insensitively) is one of the PDF mime types listed here.
 *
 * @param string $mime
 * @return boolean
 */
public function supportsMime($mime)
{
    $pdfMimes = [
        'application/pdf',
        'application/x-pdf',
        'application/x-bzpdf',
        'application/x-gzpdf',
    ];
    $normalised = strtolower($mime ?? '');

    return in_array($normalised, $pdfMimes);
}
|
||||
|
||||
/**
 * Locate a program on disk.
 *
 * When binary_location is configured it is the only directory searched; otherwise each entry of
 * search_binary_locations is tried in order. In every directory the plain program name is
 * checked first, then the same name with ".exe" appended.
 *
 * @param string $program Name of binary
 * @return string|null Path of the first match, or null when nothing was found
 */
protected function bin($program = '')
{
    // Get list of allowed search paths
    $configured = $this->config()->get('binary_location');
    $directories = $configured
        ? [$configured]
        : $this->config()->get('search_binary_locations');

    // Find program in each path
    foreach ($directories as $directory) {
        $candidate = "{$directory}/{$program}";
        foreach ([$candidate, $candidate . '.exe'] as $path) {
            if (file_exists($path)) {
                return $path;
            }
        }
    }

    // Not found
    return null;
}
|
||||
|
||||
/**
 * Extract the text of a PDF via pdftotext and replace typographic ligatures in the result.
 *
 * @param File|string $file File record, or a filesystem path
 * @return string Empty string when no file was given or the path does not exist
 * @throws Exception Propagated from getRawOutput()
 */
public function getContent($file)
{
    $missing = !$file || (is_string($file) && !file_exists($file ?? ''));
    if ($missing) {
        // no file
        return '';
    }

    return $this->cleanupLigatures($this->getRawOutput($file));
}
|
||||
|
||||
/**
 * Invoke pdftotext with the given File object or path and return what it printed.
 *
 * A File record is first copied to a temporary file; that copy is deleted again once pdftotext
 * has run. A path passed in by the caller is never deleted.
 *
 * @param File|string $file
 * @return string Output (stderr is redirected into it by the command line)
 * @throws Exception When the extractor is unavailable or pdftotext exits with a non-zero status
 */
protected function getRawOutput($file)
{
    if (!$this->isAvailable()) {
        throw new Exception("getRawOutput called on unavailable extractor");
    }

    $isTemporary = $file instanceof File;
    $path = $isTemporary ? $this->getPathFromFile($file) : $file;

    try {
        exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path ?? '')), $content, $err);
    } finally {
        // The temporary copy is only needed while pdftotext runs
        if ($isTemporary && file_exists($path)) {
            unlink($path);
        }
    }

    if ($err) {
        throw new Exception(sprintf(
            'PDFTextExtractor->getContent() failed for %s: %s',
            $path,
            implode(PHP_EOL, $content)
        ));
    }

    return implode(PHP_EOL, $content);
}
|
||||
|
||||
/**
 * Replaces UTF-8 typographic ligature characters with plain letter sequences.
 *
 * The ligatures are written as code point escapes so the mapping cannot be silently flattened
 * into identical ASCII keys and values by an editor or a text normalisation step.
 *
 * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
 *
 * @param string $input
 * @return string
 */
protected function cleanupLigatures($input)
{
    $mapping = [
        "\u{FB00}" => 'ff',  // LATIN SMALL LIGATURE FF
        "\u{FB01}" => 'fi',  // LATIN SMALL LIGATURE FI
        "\u{FB02}" => 'fl',  // LATIN SMALL LIGATURE FL
        "\u{FB03}" => 'ffi', // LATIN SMALL LIGATURE FFI
        "\u{FB04}" => 'ffl', // LATIN SMALL LIGATURE FFL
        "\u{FB05}" => 'ft',  // LATIN SMALL LIGATURE LONG S T (replacement text kept as before)
        "\u{FB06}" => 'st',  // LATIN SMALL LIGATURE ST
        // Two-character "long s + t" sequence, retained from the previous mapping
        "\u{017F}t" => 'ft',
    ];

    return str_replace(array_keys($mapping), array_values($mapping), $input ?? '');
}
|
||||
}
|
|
@ -0,0 +1,164 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use Exception;
|
||||
use GuzzleHttp\Client;
|
||||
use GuzzleHttp\Psr7\Response;
|
||||
use InvalidArgumentException;
|
||||
use Psr\Log\LoggerInterface;
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Core\Injector\Injector;
|
||||
|
||||
/**
|
||||
* Text extractor that calls an Apache Solr instance
|
||||
* and extracts content via the "ExtractingRequestHandler" endpoint.
|
||||
* Does not alter the Solr index itself, but uses it purely
|
||||
* for its file parsing abilities.
|
||||
*
|
||||
* @author ischommer
|
||||
* @see http://wiki.apache.org/solr/ExtractingRequestHandler
|
||||
*/
|
||||
class SolrCellTextExtractor extends FileTextExtractor
|
||||
{
|
||||
/**
|
||||
* Base URL to use for Solr text extraction.
|
||||
* E.g. http://localhost:8983/solr/update/extract
|
||||
*
|
||||
* @config
|
||||
* @var string
|
||||
*/
|
||||
private static $base_url;
|
||||
|
||||
/**
|
||||
* @var int
|
||||
* @config
|
||||
*/
|
||||
private static $priority = 75;
|
||||
|
||||
/**
|
||||
* @var Client
|
||||
*/
|
||||
protected $httpClient;
|
||||
|
||||
/**
 * The HTTP client used to talk to Solr; a default Guzzle client is created on first use.
 *
 * @return Client
 */
public function getHttpClient()
{
    if ($this->httpClient) {
        return $this->httpClient;
    }

    $this->httpClient = new Client();

    return $this->httpClient;
}
|
||||
|
||||
/**
 * Replace the HTTP client used for requests.
 *
 * @param Client $client
 * @return $this
 */
public function setHttpClient(Client $client)
{
    $this->httpClient = $client;

    return $this;
}
|
||||
|
||||
/**
 * Available whenever a base_url has been configured; the server itself is not contacted here.
 *
 * @return bool
 */
public function isAvailable()
{
    return (bool) $this->config()->get('base_url');
}
|
||||
|
||||
/**
 * Whether the extension (compared case-insensitively) is in the fixed list of document formats.
 *
 * @param string $extension
 * @return bool
 */
public function supportsExtension($extension)
{
    $supported = [
        'pdf', 'doc', 'docx', 'xls', 'xlsx',
        'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
        'ppt', 'pptx', 'odp', 'fodp', 'csv',
    ];
    $normalised = strtolower($extension ?? '');

    return in_array($normalised, $supported);
}
|
||||
|
||||
/**
 * Mime types are never matched by this extractor; support is decided by supportsExtension().
 *
 * @param string $mime Ignored
 * @return bool Always false
 */
public function supportsMime($mime)
{
    return false;
}
|
||||
|
||||
/**
 * Post the file to the configured Solr extract endpoint and pull the extracted text out of the reply.
 *
 * Request failures are logged at notice level and reported as null rather than thrown.
 *
 * @param File|string $file File record, or a filesystem path
 * @return string|null Empty string when no file was given or the path does not exist; null when
 *                     the request failed or the reply held no text element named after the file
 * @throws InvalidArgumentException When base_url is not configured
 */
public function getContent($file)
{
    if (!$file || (is_string($file) && !file_exists($file ?? ''))) {
        // no file
        return '';
    }

    $fileName = $file instanceof File ? $file->getFilename() : basename($file ?? '');
    $client = $this->getHttpClient();

    // Get and validate base URL
    $baseUrl = $this->config()->get('base_url');
    if (!$baseUrl) {
        throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
    }

    try {
        $stream = $file instanceof File ? $file->getStream() : fopen($file ?? '', 'r');
        /** @var Response $response */
        $response = $client
            ->post($baseUrl, [
                'multipart' => [
                    ['name' => 'extractOnly', 'contents' => 'true'],
                    ['name' => 'extractFormat', 'contents' => 'text'],
                    ['name' => 'myfile', 'contents' => $stream],
                ]
            ]);
    } catch (InvalidArgumentException $e) {
        $msg = sprintf(
            'Error extracting text from "%s" (message: %s)',
            $fileName,
            $e->getMessage()
        );
        Injector::inst()->get(LoggerInterface::class)->notice($msg);
        return null;
    } catch (Exception $e) {
        // Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search
        // query in some cases.
        $msg = sprintf(
            'Tika server error attempting to extract from "%s" (message: %s)',
            $fileName,
            $e->getMessage()
        );
        Injector::inst()->get(LoggerInterface::class)->notice($msg);
        return null;
    }

    $matches = [];
    // Use preg match to avoid SimpleXML running out of memory on large text nodes.
    // The delimiter is passed to preg_quote() so a "/" inside the file name (for example a
    // folder prefix) is escaped instead of terminating the pattern early.
    preg_match(
        sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName ?? '', '/')),
        (string)$response->getBody(),
        $matches
    );

    return $matches ? $matches[1] : null;
}
|
||||
}
|
|
@ -0,0 +1,137 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Core\Environment;
|
||||
use SilverStripe\Core\Injector\Injector;
|
||||
use SilverStripe\TextExtraction\Rest\TikaRestClient;
|
||||
|
||||
/**
|
||||
* Enables text extraction of file content via the Tika Rest Server
|
||||
*
|
||||
* {@link http://tika.apache.org/1.7/gettingstarted.html}
|
||||
*/
|
||||
class TikaServerTextExtractor extends FileTextExtractor
|
||||
{
|
||||
/**
|
||||
* Tika server is pretty efficient so use it immediately if available
|
||||
*
|
||||
* @var integer
|
||||
* @config
|
||||
*/
|
||||
private static $priority = 80;
|
||||
|
||||
/**
|
||||
* Server endpoint
|
||||
*
|
||||
* @var string
|
||||
* @config
|
||||
*/
|
||||
private static $server_endpoint;
|
||||
|
||||
/**
|
||||
* @var TikaRestClient
|
||||
*/
|
||||
protected $client = null;
|
||||
|
||||
/**
|
||||
* Cache of supported mime types
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
protected $supportedMimes = [];
|
||||
|
||||
/**
 * The REST client for the Tika server, created through the Injector on first use with the
 * resolved server endpoint as its constructor argument.
 *
 * @return TikaRestClient
 */
public function getClient()
{
    if ($this->client) {
        return $this->client;
    }

    $endpoint = $this->getServerEndpoint();
    $this->client = Injector::inst()->createWithArgs(TikaRestClient::class, [$endpoint]);

    return $this->client;
}
|
||||
|
||||
/**
 * Resolve the server endpoint: the SS_TIKA_ENDPOINT environment value wins when it is set and
 * non-empty, otherwise the server_endpoint config value is used.
 *
 * @return string
 */
public function getServerEndpoint()
{
    return Environment::getEnv('SS_TIKA_ENDPOINT') ?: $this->config()->get('server_endpoint');
}
|
||||
|
||||
/**
 * Version reported by the Tika server, as returned by the REST client.
 *
 * @return string Version string as produced by TikaRestClient::getVersion()
 */
public function getVersion()
{
    $client = $this->getClient();

    return $client->getVersion();
}
|
||||
|
||||
/**
 * Available when an endpoint is configured, the client reports the server as reachable, and the
 * reported version is at least 1.7. The checks run in that order and stop at the first failure.
 *
 * @return boolean
 */
public function isAvailable()
{
    if (!$this->getServerEndpoint()) {
        return false;
    }

    if (!$this->getClient()->isAvailable()) {
        return false;
    }

    return version_compare($this->getVersion() ?? '', '1.7') >= 0;
}
|
||||
|
||||
/**
 * Extensions are never matched by this extractor; support is decided by supportsMime().
 *
 * @param string $extension Ignored
 * @return boolean Always false
 */
public function supportsExtension($extension)
{
    return false;
}
|
||||
|
||||
/**
 * Whether the server lists the mime type, either as a key of its supported types or among the
 * "alias" entries of one of them. The list is fetched from the client while the local copy is empty.
 *
 * @param string $mime
 * @return boolean
 */
public function supportsMime($mime)
{
    if (!$this->supportedMimes) {
        $this->supportedMimes = (array) $this->getClient()->getSupportedMimes();
    }
    $known = $this->supportedMimes;

    // Direct key lookup first (most common / quickest)
    if (isset($known[$mime])) {
        return true;
    }

    // Otherwise scan the aliases of every known type
    foreach ($known as $info) {
        if (!isset($info['alias'])) {
            continue;
        }
        if (in_array($mime, $info['alias'] ?? [])) {
            return true;
        }
    }

    return false;
}
|
||||
|
||||
/**
 * Send the file to the Tika server and return the extracted text.
 *
 * A File record is first copied to a temporary file, which is removed again afterwards even
 * when the client call throws. A path passed in by the caller is never deleted.
 *
 * @param File|string $file File record, or a filesystem path
 * @return string
 */
public function getContent($file)
{
    $isTemporary = $file instanceof File;
    $path = $isTemporary ? $this->getPathFromFile($file) : $file;

    try {
        return $this->getClient()->tika($path);
    } finally {
        // Cleanup temp file
        if ($isTemporary) {
            unlink($path ?? '');
        }
    }
}
|
||||
}
|
|
@ -0,0 +1,130 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
|
||||
/**
|
||||
* Enables text extraction of file content via the Tika CLI
|
||||
*
|
||||
* {@link http://tika.apache.org/1.7/gettingstarted.html}
|
||||
*/
|
||||
class TikaTextExtractor extends FileTextExtractor
|
||||
{
|
||||
/**
|
||||
* Text extraction mode. Defaults to -t (plain text)
|
||||
*
|
||||
* @var string
|
||||
* @config
|
||||
*/
|
||||
private static $output_mode = '-t';
|
||||
|
||||
/**
 * Get the version of tika installed, or 0 if not installed.
 *
 * Runs "tika --version" and reads the number following "Apache Tika" from its standard output.
 *
 * @return string|int Matched version string, or 0 when the command failed or printed no version
 */
public function getVersion()
{
    $exitCode = $this->runShell('tika --version', $stdout);
    if ($exitCode) {
        return 0;
    }

    // Parse output
    if (preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout ?? '', $matches)) {
        return $matches['version'];
    }

    return 0;
}
|
||||
|
||||
/**
 * Run a shell command, feed it optional input, and capture what it prints.
 *
 * The command string is handed to proc_open() as given; no escaping is applied here, so callers
 * must escape any arguments themselves. Standard output is read to the end before standard
 * error is read.
 *
 * @param string $command Full command including arguments
 * @param string &$stdout Standard output
 * @param string &$stderr Standard error
 * @param string $input Content to pass via standard input
 * @return int Exit code. 0 is success; 255 when the process could not be started
 */
protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
{
    // 0 = child's stdin (we write), 1 = stdout and 2 = stderr (we read)
    $descriptors = [
        0 => ["pipe", "r"],
        1 => ["pipe", "w"],
        2 => ["pipe", "w"],
    ];

    // Invoke command
    $pipes = [];
    $process = proc_open($command ?? '', $descriptors, $pipes);
    if (!is_resource($process)) {
        return 255;
    }

    // Send content as input, then close stdin so the child sees end-of-input
    fwrite($pipes[0], $input ?? '');
    fclose($pipes[0]);

    // Collect output
    $stdout = stream_get_contents($pipes[1]);
    fclose($pipes[1]);

    $stderr = stream_get_contents($pipes[2]);
    fclose($pipes[2]);

    // Exit status of the child
    return proc_close($process);
}
|
||||
|
||||
/**
 * Run the tika command line tool in the configured output mode and return what it printed.
 *
 * A File record is first copied to a temporary file, which is removed after the command ran.
 *
 * @param File|string $file File record, or a filesystem path
 * @return string|null Command output, or null when tika exited with a non-zero status
 */
public function getContent($file)
{
    $isTemporary = $file instanceof File;
    $outputMode = $this->config()->get('output_mode');
    $path = $isTemporary ? $this->getPathFromFile($file) : $file;

    $exitCode = $this->runShell(
        sprintf('tika %s %s', $outputMode, escapeshellarg($path ?? '')),
        $output
    );

    // Cleanup temp file
    if ($isTemporary) {
        unlink($path ?? '');
    }

    if ($exitCode != 0) {
        return null;
    }

    return $output;
}
|
||||
|
||||
/**
 * Available when getVersion() reports a version greater than zero.
 *
 * @return bool
 */
public function isAvailable()
{
    $version = $this->getVersion();

    return $version > 0;
}
|
||||
|
||||
/**
 * Extensions are never matched by this extractor; support is decided by supportsMime().
 *
 * @param string $extension Ignored
 * @return bool Always false
 */
public function supportsExtension($extension)
{
    return false;
}
|
||||
|
||||
|
||||
/**
 * Whether the mime type appears, delimited by word boundaries, in the output of
 * "tika --list-supported-types". The command is run on every call.
 *
 * @param string $mime
 * @return bool False when the command fails or the type is not listed
 */
public function supportsMime($mime)
{
    // Get list of supported mime types
    $exitCode = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
    if ($exitCode) {
        // Error case
        return false;
    }

    // Check if the mime type is inside the result
    $pattern = sprintf('/\b(%s)\b/', preg_quote($mime ?? '', '/'));

    return preg_match($pattern, $supportedTypes ?? '') === 1;
}
|
||||
}
|
|
@ -0,0 +1,171 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Rest;
|
||||
|
||||
use GuzzleHttp\Client;
|
||||
use GuzzleHttp\Exception\RequestException;
|
||||
use GuzzleHttp\Psr7\Response;
|
||||
use Psr\Log\LoggerInterface;
|
||||
use SilverStripe\Core\Convert;
|
||||
use SilverStripe\Core\Environment;
|
||||
use SilverStripe\Core\Injector\Injector;
|
||||
|
||||
class TikaRestClient extends Client
|
||||
{
|
||||
/**
|
||||
* Authentication options to be sent to the Tika server
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
protected $options = ['username' => null, 'password' => null];
|
||||
|
||||
/**
|
||||
* @var array
|
||||
*/
|
||||
protected $mimes = [];
|
||||
|
||||
/**
 * Create a client bound to the given base URL.
 *
 * When SS_TIKA_PASSWORD is set in the environment, it is stored together with
 * SS_TIKA_USERNAME as the credentials to send with requests.
 *
 * @param string $baseUrl Stored as the client's base_uri, overriding any value in $config
 * @param array $config Further client configuration passed to the parent constructor
 */
public function __construct($baseUrl = '', $config = [])
{
    $password = Environment::getEnv('SS_TIKA_PASSWORD');
    if ($password) {
        $username = Environment::getEnv('SS_TIKA_USERNAME');
        $this->options = [
            'username' => $username,
            'password' => $password,
        ];
    }

    $config['base_uri'] = $baseUrl;

    parent::__construct($config);
}
|
||||
|
||||
/**
 * Detect if the service is available by requesting "/" and expecting a 200 response.
 *
 * Any Guzzle transfer failure (error status, connection problem, ...) is logged at info level
 * and reported as "not available" instead of being thrown.
 *
 * @return bool
 */
public function isAvailable()
{
    try {
        /** @var Response $result */
        $result = $this->get('/', $this->getGuzzleOptions());

        // Any other non-error status also counts as unavailable
        return $result->getStatusCode() == 200;
    } catch (\GuzzleHttp\Exception\TransferException $ex) {
        // TransferException is the parent of RequestException and also covers connection
        // failures that are not reported as a RequestException.
        $msg = sprintf("Tika unavailable - %s", $ex->getMessage());
        Injector::inst()->get(LoggerInterface::class)->info($msg);

        return false;
    }
}
|
||||
|
||||
/**
 * Get version code from the server's "version" resource.
 *
 * @return string The number following "Apache Tika" in a 200 response, otherwise "0"
 */
public function getVersion()
{
    /** @var Response $response */
    $response = $this->get('version', $this->getGuzzleOptions());

    // Parse output
    $isOk = $response->getStatusCode() == 200;
    if ($isOk && preg_match('/Apache Tika (?<version>[\.\d]+)/', (string) $response->getBody(), $matches)) {
        return (string) $matches['version'];
    }

    return (string) 0;
}
|
||||
|
||||
/**
 * Gets supported mime data. May include aliased mime types.
 *
 * The list is requested as JSON from the "mime-types" resource and kept on the instance; a
 * non-empty stored list is returned without contacting the server again.
 *
 * @return array Decoded list; an empty array when the response body is not a JSON array/object
 */
public function getSupportedMimes()
{
    if ($this->mimes) {
        return $this->mimes;
    }

    $response = $this->get(
        'mime-types',
        $this->getGuzzleOptions([
            'headers' => [
                'Accept' => 'application/json',
            ],
        ])
    );

    // json_decode() yields null for an undecodable body; keep the documented array type
    $decoded = json_decode((string) $response->getBody(), true);
    $this->mimes = is_array($decoded) ? $decoded : [];

    return $this->mimes;
}
|
||||
|
||||
/**
 * Extract text content from a given file.
 * Logs an info-level message if the request fails.
 *
 * @param string $file Full filesystem path to a file to post
 * @return string Content of the file extracted as plain text; empty string when the request failed
 */
public function tika($file)
{
    $text = null;
    try {
        /** @var Response $response */
        $response = $this->put(
            'tika',
            $this->getGuzzleOptions([
                'headers' => [
                    'Accept' => 'text/plain',
                ],
                'body' => file_get_contents($file ?? ''),
            ])
        );
        $text = $response->getBody();
    } catch (RequestException $e) {
        // A RequestException does not necessarily carry a response (getResponse() may be null),
        // so the status details can only be reported when one is present.
        $errorResponse = $e->getResponse();
        if ($errorResponse) {
            $msg = sprintf(
                'TikaRestClient was not able to process %s. Response: %s %s.',
                $file,
                $errorResponse->getStatusCode(),
                $errorResponse->getReasonPhrase()
            );
            // Only available if tika-server was started with --includeStack
            $body = (string) $errorResponse->getBody();
            if ($body !== '') {
                $msg .= ' Body: ' . $body;
            }
        } else {
            $msg = sprintf(
                'TikaRestClient was not able to process %s. No response received: %s',
                $file,
                $e->getMessage()
            );
        }

        Injector::inst()->get(LoggerInterface::class)->info($msg);
    }

    return (string) $text;
}
|
||||
|
||||
/**
 * Assembles an array of request options to pass to Guzzle.
 *
 * An "auth" entry is added only when both a username and a password are stored on the client.
 *
 * @param array $options Authentication (etc) will be merged into this array and returned
 * @return array
 */
protected function getGuzzleOptions($options = [])
{
    $username = $this->options['username'] ?? null;
    $password = $this->options['password'] ?? null;

    if ($username && $password) {
        $options['auth'] = [$username, $password];
    }

    return $options;
}
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Tests;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Core\Config\Config;
|
||||
use SilverStripe\Dev\SapphireTest;
|
||||
use SilverStripe\TextExtraction\Cache\FileTextCache\Database;
|
||||
|
||||
class FileTextCacheDatabaseTest extends SapphireTest
|
||||
{
|
||||
/**
 * Content longer than max_content_length must be cut down to that many leading characters
 * when it is saved to the cache.
 */
public function testTruncatesByMaxLength()
{
    Config::modify()->set(Database::class, 'max_content_length', 5);

    $cache = new Database();
    $file = $this->getMockBuilder(File::class)->setMethods(['write'])->getMock();
    $content = '0123456789';
    $cache->save($file, $content);

    // Expected value goes first so a failure message labels the two values correctly
    $this->assertEquals('01234', $cache->load($file));
}
|
||||
}
|
|
@ -1,43 +1,60 @@
|
|||
<?php
|
||||
class FileTextExtractableTest extends SapphireTest {
|
||||
|
||||
protected $requiredExtensions = array(
|
||||
'File' => array('FileTextExtractable')
|
||||
);
|
||||
namespace SilverStripe\TextExtraction\Tests;
|
||||
|
||||
public function setUp() {
|
||||
parent::setUp();
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Core\Config\Config;
|
||||
use SilverStripe\Dev\SapphireTest;
|
||||
use SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||
use SilverStripe\TextExtraction\Extension\FileTextExtractable;
|
||||
|
||||
// Ensure that html is a valid extension
|
||||
Config::inst()
|
||||
->nest()
|
||||
->update('File', 'allowed_extensions', array('html'));
|
||||
}
|
||||
class FileTextExtractableTest extends SapphireTest
|
||||
{
|
||||
protected $usesDatabase = true;
|
||||
|
||||
public function tearDown() {
|
||||
Config::unnest();
|
||||
parent::tearDown();
|
||||
}
|
||||
protected static $required_extensions = [
|
||||
File::class => [
|
||||
FileTextExtractable::class,
|
||||
],
|
||||
];
|
||||
|
||||
function testExtractFileAsText() {
|
||||
// Create a copy of the file, as it may be clobbered by the test
|
||||
// ($file->extractFileAsText() calls $file->write)
|
||||
copy(BASE_PATH.'/textextraction/tests/fixtures/test1.html',BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
|
||||
|
||||
// Use HTML, since the extractor is always available
|
||||
$file = new File(array(
|
||||
'Name' => 'test1-copy.html',
|
||||
'Filename' => 'textextraction/tests/fixtures/test1-copy.html'
|
||||
));
|
||||
$file->write();
|
||||
|
||||
$content = $file->extractFileAsText();
|
||||
$this->assertContains('Test Headline', $content);
|
||||
$this->assertContains('Test Text', $content);
|
||||
$this->assertEquals($content, $file->FileContentCache);
|
||||
protected function setUp(): void
|
||||
{
|
||||
parent::setUp();
|
||||
|
||||
if(file_exists(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html')) unlink(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
|
||||
}
|
||||
// Ensure that html is a valid extension
|
||||
Config::modify()->merge(File::class, 'allowed_extensions', ['html']);
|
||||
|
||||
// Create a copy of the file, as it may be clobbered by the test
|
||||
// ($file->extractFileAsText() calls $file->write)
|
||||
copy(
|
||||
dirname(__FILE__) . '/fixtures/test1.html',
|
||||
dirname(__FILE__) . '/fixtures/test1-copy.html'
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
protected function tearDown(): void
{
    // Remove the working copy of the HTML fixture if it is still on disk
    $copyPath = __DIR__ . '/fixtures/test1-copy.html';
    if (file_exists($copyPath)) {
        unlink($copyPath);
    }

    parent::tearDown();
}
|
||||
|
||||
public function testExtractFileAsText()
{
    $copyPath = __DIR__ . '/fixtures/test1-copy.html';

    // HTML is used because its extractor is always available
    /** @var File&FileTextExtractable $htmlFile */
    $htmlFile = new File(['Name' => 'test1-copy.html']);
    $htmlFile->setTextCache(new FileTextCache\Database());
    $htmlFile->setFromLocalFile($copyPath);
    $htmlFile->write();

    $extracted = $htmlFile->extractFileAsText();
    $this->assertNotNull($extracted);
    foreach (['Test Headline', 'Test Text'] as $needle) {
        $this->assertStringContainsString($needle, $extracted);
    }

    // The extracted text must also be what ends up in the file's content cache field
    $this->assertEquals($extracted, $htmlFile->FileContentCache);
}
|
||||
}
|
||||
|
|
|
@ -1,14 +1,36 @@
|
|||
<?php
|
||||
class HTMLTextExtractorTest extends SapphireTest {
|
||||
|
||||
function testExtraction() {
|
||||
$extractor = new HTMLTextExtractor();
|
||||
|
||||
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html');
|
||||
$this->assertContains('Test Headline', $content);
|
||||
$this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
|
||||
$this->assertNotContains('Test Style', $content, 'Strips non-content style tags');
|
||||
$this->assertNotContains('Test Script', $content, 'Strips non-content script tags');
|
||||
}
|
||||
namespace SilverStripe\TextExtraction\Tests;
|
||||
|
||||
}
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Core\Config\Config;
|
||||
use SilverStripe\Dev\SapphireTest;
|
||||
use SilverStripe\TextExtraction\Extractor\HTMLTextExtractor;
|
||||
|
||||
class HTMLTextExtractorTest extends SapphireTest
{
    protected $usesDatabase = true;

    /**
     * Adds "html" to the allowed File extensions after the parent set-up has run.
     */
    protected function setUp(): void
    {
        parent::setUp();

        Config::modify()->merge(File::class, 'allowed_extensions', ['html']);
    }

    /**
     * Extracts the HTML fixture and checks that the headline is kept while
     * comment, style and script text are stripped.
     */
    public function testExtraction()
    {
        $extractor = new HTMLTextExtractor();

        $htmlFile = new File();
        $htmlFile->setFromLocalFile(__DIR__ . '/fixtures/test1.html');
        $htmlFile->write();

        $text = $extractor->getContent($htmlFile);

        $this->assertStringContainsString('Test Headline', $text);

        // Needle => failure message for text that must not survive extraction
        $stripped = [
            'Test Comment' => 'Strips HTML comments',
            'Test Style' => 'Strips non-content style tags',
            'Test Script' => 'Strips non-content script tags',
        ];
        foreach ($stripped as $needle => $message) {
            $this->assertStringNotContainsString($needle, $text, $message);
        }
    }
}
|
||||
|
|
|
@ -1,12 +1,29 @@
|
|||
<?php
|
||||
class PDFTextExtractorTest extends SapphireTest {
|
||||
|
||||
function testExtraction() {
|
||||
$extractor = new PDFTextExtractor();
|
||||
if(!$extractor->isAvailable()) $this->markTestSkipped('pdftotext not available');
|
||||
|
||||
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf');
|
||||
$this->assertContains('This is a test file with a link', $content);
|
||||
}
|
||||
namespace SilverStripe\TextExtraction\Tests;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Dev\SapphireTest;
|
||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
|
||||
use SilverStripe\TextExtraction\Extractor\PDFTextExtractor;
|
||||
|
||||
class PDFTextExtractorTest extends SapphireTest
{
    protected $usesDatabase = true;

    /**
     * Extracts the PDF fixture and checks for its known sentence. When the
     * extractor reports itself unavailable, the extractor exception is
     * expected instead.
     */
    public function testExtraction()
    {
        $extractor = new PDFTextExtractor();
        $available = $extractor->isAvailable();
        if (!$available) {
            $this->expectException(Exception::class);
            $this->expectExceptionMessage('getRawOutput called on unavailable extractor');
        }

        $pdfFile = new File();
        $pdfFile->setFromLocalFile(__DIR__ . '/fixtures/test1.pdf');
        $pdfFile->write();

        $text = $extractor->getContent($pdfFile);
        $this->assertStringContainsString('This is a test file with a link', $text);
    }
}
|
||||
|
|
|
@ -0,0 +1,78 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Tests;
|
||||
|
||||
use PHPUnit\Framework\MockObject\MockObject;
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Dev\SapphireTest;
|
||||
use SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor;
|
||||
use SilverStripe\TextExtraction\Rest\TikaRestClient;
|
||||
|
||||
/**
 * Tests the {@see TikaServerTextExtractor} class
 *
 * @group tika-tests
 */
class TikaServerTextExtractorTest extends SapphireTest
{
    protected $usesDatabase = true;

    /**
     * Extracts the PDF fixture through the server extractor and checks its mime
     * support. Skipped when the extractor reports that it is not available.
     */
    public function testServerExtraction()
    {
        $extractor = TikaServerTextExtractor::create();
        if (!$extractor->isAvailable()) {
            $this->markTestSkipped('tika server not available');
        }

        // Check file
        $file = new File();
        $file->setFromLocalFile(__DIR__ . '/fixtures/test1.pdf');
        $file->write();

        $content = $extractor->getContent($file);
        $this->assertStringContainsString('This is a test file with a link', $content);

        // Check mime validation
        $this->assertTrue($extractor->supportsMime('application/pdf'));
        $this->assertTrue($extractor->supportsMime('text/html'));
        $this->assertFalse($extractor->supportsMime('application/not-supported'));
    }

    /**
     * Checks isAvailable() against a mocked REST client that is reachable and
     * reports the given version.
     *
     * @param string $version Version string returned by the mocked client
     * @param bool $expected Expected return value of isAvailable()
     * @dataProvider isAvailableProvider
     */
    public function testIsAvailable($version, $expected)
    {
        // onlyMethods() replaces the deprecated setMethods(); both stubbed
        // methods are expected to exist on the extractor class.
        /** @var MockObject|TikaServerTextExtractor $extractor */
        $extractor = $this->getMockBuilder(TikaServerTextExtractor::class)
            ->onlyMethods(['getClient', 'getServerEndpoint'])
            ->getMock();

        $client = $this->createMock(TikaRestClient::class);
        $client->method('isAvailable')->willReturn(true);
        $client->method('getVersion')->willReturn($version);

        $extractor->method('getClient')->willReturn($client);
        $extractor->method('getServerEndpoint')->willReturn('tikaserver.example');

        $result = $extractor->isAvailable();
        $this->assertSame($expected, $result);
    }

    /**
     * Version strings paired with the expected availability. The cases below
     * put the cut-off at 1.7: 1.5.x is rejected, 1.7 and later are accepted.
     *
     * Declared static so it also works with PHPUnit versions that require
     * static data providers.
     *
     * @return array[]
     */
    public static function isAvailableProvider()
    {
        return [
            ['1.5.2', false],
            ['1.5', false],
            ['1.7.0', true],
            ['1.7.5', true],
            ['1.8.0', true],
            ['1.7', true],
            ['1.8', true],
            ['2.0.0', true],
        ];
    }
}
|
|
@ -1,38 +1,38 @@
|
|||
<?php
|
||||
|
||||
namespace SilverStripe\TextExtraction\Tests;
|
||||
|
||||
use SilverStripe\Assets\File;
|
||||
use SilverStripe\Dev\SapphireTest;
|
||||
use SilverStripe\TextExtraction\Extractor\TikaTextExtractor;
|
||||
|
||||
/**
|
||||
* Tests the {@see TikaTextExtractor} class
|
||||
*
|
||||
* @group tika-tests
|
||||
*/
|
||||
class TikaTextExtractorTest extends SapphireTest {
|
||||
|
||||
function testExtraction() {
|
||||
$extractor = new TikaTextExtractor();
|
||||
if(!$extractor->isAvailable()) $this->markTestSkipped('tika cli not available');
|
||||
class TikaTextExtractorTest extends SapphireTest
|
||||
{
|
||||
protected $usesDatabase = true;
|
||||
|
||||
// Check file
|
||||
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
|
||||
$content = $extractor->getContent($file);
|
||||
$this->assertContains('This is a test file with a link', $content);
|
||||
public function testExtraction()
|
||||
{
|
||||
$extractor = TikaTextExtractor::create();
|
||||
if (!$extractor->isAvailable()) {
|
||||
$this->markTestSkipped('tika cli not available');
|
||||
}
|
||||
|
||||
// Check mime validation
|
||||
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
||||
$this->assertTrue($extractor->supportsMime('text/html'));
|
||||
$this->assertFalse($extractor->supportsMime('application/not-supported'));
|
||||
}
|
||||
// Check file
|
||||
$file = new File();
|
||||
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.pdf');
|
||||
$file->write();
|
||||
|
||||
function testServerExtraction() {
|
||||
$extractor = new TikaServerTextExtractor();
|
||||
if(!$extractor->isAvailable()) $this->markTestSkipped('tika server not available');
|
||||
|
||||
// Check file
|
||||
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
|
||||
$content = $extractor->getContent($file);
|
||||
$this->assertContains('This is a test file with a link', $content);
|
||||
|
||||
// Check mime validation
|
||||
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
||||
$this->assertTrue($extractor->supportsMime('text/html'));
|
||||
$this->assertFalse($extractor->supportsMime('application/not-supported'));
|
||||
}
|
||||
$content = $extractor->getContent($file);
|
||||
$this->assertStringContainsString('This is a test file with a link', $content);
|
||||
|
||||
// Check mime validation
|
||||
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
||||
$this->assertTrue($extractor->supportsMime('text/html'));
|
||||
$this->assertFalse($extractor->supportsMime('application/not-supported'));
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue