mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-01 05:39:41 +02:00
Merge pull request #45 from creative-commoners/pulls/3.0/ss4-updates
API Update namespaces and SilverStripe API implementations for SilverStripe 4 compat
This commit is contained in:
commit
9795866abe
1
.gitattributes
vendored
1
.gitattributes
vendored
@ -4,3 +4,4 @@
|
|||||||
/.gitignore export-ignore
|
/.gitignore export-ignore
|
||||||
/.travis.yml export-ignore
|
/.travis.yml export-ignore
|
||||||
/.scrutinizer.yml export-ignore
|
/.scrutinizer.yml export-ignore
|
||||||
|
/codecov.yml export-ignore
|
||||||
|
@ -1,9 +1,13 @@
|
|||||||
inherit: true
|
inherit: true
|
||||||
|
|
||||||
checks:
|
checks:
|
||||||
php:
|
php: true
|
||||||
code_rating: true
|
|
||||||
duplication: true
|
build:
|
||||||
|
nodes:
|
||||||
|
analysis:
|
||||||
|
tests:
|
||||||
|
override: [php-scrutinizer-run]
|
||||||
|
|
||||||
filter:
|
filter:
|
||||||
paths: [code/*, tests/*]
|
paths: [src/*, tests/*]
|
||||||
|
48
.travis.yml
48
.travis.yml
@ -1,39 +1,47 @@
|
|||||||
# See https://github.com/silverstripe/silverstripe-travis-support for setup details
|
|
||||||
language: php
|
language: php
|
||||||
|
|
||||||
sudo: false
|
|
||||||
|
|
||||||
addons:
|
addons:
|
||||||
apt:
|
apt:
|
||||||
packages:
|
packages:
|
||||||
- poppler-utils
|
- poppler-utils
|
||||||
|
|
||||||
|
env:
|
||||||
|
global:
|
||||||
|
- COMPOSER_ROOT_VERSION=3.x-dev
|
||||||
|
- SS_TIKA_ENDPOINT="http://localhost:9998/"
|
||||||
|
|
||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
- php: 5.4
|
|
||||||
env: DB=PGSQL CORE_RELEASE=3.2
|
|
||||||
- php: 5.5
|
|
||||||
env: DB=PGSQL CORE_RELEASE=3.3
|
|
||||||
- php: 5.6
|
- php: 5.6
|
||||||
env: DB=PGSQL CORE_RELEASE=3.4
|
env: DB=MYSQL RECIPE_VERSION=1.0.x-dev PHPCS_TEST=1 PHPUNIT_TEST=1
|
||||||
- php: 5.6
|
|
||||||
env: DB=MYSQL CORE_RELEASE=3.5
|
|
||||||
- php: 7.0
|
- php: 7.0
|
||||||
env: DB=MYSQL CORE_RELEASE=3.6
|
env: DB=MYSQL RECIPE_VERSION=1.1.x-dev PHPUNIT_TEST=1
|
||||||
- php: 7.1
|
- php: 7.1
|
||||||
env: DB=MYSQL CORE_RELEASE=3
|
env: DB=PGSQL RECIPE_VERSION=4.2.x-dev PHPUNIT_COVERAGE_TEST=1
|
||||||
|
- php: 7.2
|
||||||
|
env: DB=MYSQL RECIPE_VERSION=4.x-dev PHPUNIT_TEST=1
|
||||||
|
|
||||||
before_script:
|
before_script:
|
||||||
- composer self-update || true
|
# Init PHP
|
||||||
|
- phpenv rehash
|
||||||
|
- phpenv config-rm xdebug.ini
|
||||||
|
|
||||||
|
# Configure Tika bin
|
||||||
- mkdir -p $HOME/bin
|
- mkdir -p $HOME/bin
|
||||||
- export PATH=$PATH:$HOME/bin
|
- export PATH=$PATH:$HOME/bin
|
||||||
- export SS_TIKA_ENDPOINT="http://localhost:9998/"
|
|
||||||
- ./.travis/install_tika.sh
|
- ./.travis/install_tika.sh
|
||||||
- git clone git://github.com/silverstripe/silverstripe-travis-support.git ~/travis-support
|
- ($HOME/bin/tika-rest-server &) &> /dev/null
|
||||||
- php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss
|
|
||||||
- cd ~/builds/ss
|
# Install composer dependencies
|
||||||
- composer install
|
- composer validate
|
||||||
|
- composer require --no-update silverstripe/recipe-core "$RECIPE_VERSION"
|
||||||
|
- if [[ $DB == PGSQL ]]; then composer require --no-update silverstripe/postgresql 2.1.x-dev; fi
|
||||||
|
- composer install --prefer-dist --no-interaction --no-progress --no-suggest --optimize-autoloader --verbose --profile
|
||||||
|
|
||||||
script:
|
script:
|
||||||
- ($HOME/bin/tika-rest-server &) &> /dev/null
|
- if [[ $PHPUNIT_TEST ]]; then vendor/bin/phpunit; fi
|
||||||
- vendor/bin/phpunit --verbose textextraction/tests/
|
- if [[ $PHPUNIT_COVERAGE_TEST ]]; then phpdbg -qrr vendor/bin/phpunit --coverage-clover=coverage.xml; fi
|
||||||
|
- if [[ $PHPCS_TEST ]]; then vendor/bin/phpcs src/ tests/; fi
|
||||||
|
|
||||||
|
after_success:
|
||||||
|
- if [[ $PHPUNIT_COVERAGE_TEST ]]; then bash <(curl -s https://codecov.io/bash) -f coverage.xml; fi
|
||||||
|
14
.upgrade.yml
Normal file
14
.upgrade.yml
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
mappings:
|
||||||
|
FileTextExtractable: SilverStripe\TextExtraction\Extension\FileTextExtractable
|
||||||
|
FileTextCache: SilverStripe\TextExtraction\Cache\FileTextCache
|
||||||
|
FileTextCache_Cache: SilverStripe\TextExtraction\Cache\FileTextCache\Cache
|
||||||
|
FileTextCache_Database: SilverStripe\TextExtraction\Cache\FileTextCache\Database
|
||||||
|
FileTextExtractor: SilverStripe\TextExtraction\Extractor\FileTextExtractor
|
||||||
|
FileTextExtractor_Exception: SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception
|
||||||
|
HTMLTextExtractor: SilverStripe\TextExtraction\Extractor\HTMLTextExtractor
|
||||||
|
PDFTextExtractor: SilverStripe\TextExtraction\Extractor\PDFTextExtractor
|
||||||
|
SolrCellTextExtractor: SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor
|
||||||
|
TikaServerTextExtractor: SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor
|
||||||
|
TikaTextExtractor: SilverStripe\TextExtraction\Extractor\TikaTextExtractor
|
||||||
|
TikaRestClient: SilverStripe\TextExtraction\Rest\TikaRestClient
|
||||||
|
|
12
CHANGELOG.md
12
CHANGELOG.md
@ -1,12 +0,0 @@
|
|||||||
# Changelog
|
|
||||||
|
|
||||||
All notable changes to this project will be documented in this file.
|
|
||||||
|
|
||||||
This project adheres to [Semantic Versioning](http://semver.org/).
|
|
||||||
|
|
||||||
|
|
||||||
## [2.0.1]
|
|
||||||
Using Symfony mime type detection
|
|
||||||
|
|
||||||
## [2.0.0]
|
|
||||||
Clarified Tika docs
|
|
12
README.md
12
README.md
@ -1,11 +1,9 @@
|
|||||||
# Text extraction module
|
# Text extraction module
|
||||||
|
|
||||||
[![Build Status](https://secure.travis-ci.org/silverstripe/silverstripe-textextraction.png)](http://travis-ci.org/silverstripe/silverstripe-textextraction)
|
[![Build Status](https://travis-ci.org/silverstripe/silverstripe-textextraction.svg?branch=master)](https://travis-ci.org/silverstripe/silverstripe-textextraction)
|
||||||
|
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction/?branch=master)
|
||||||
|
[![codecov](https://codecov.io/gh/silverstripe/silverstripe-textextraction/branch/master/graph/badge.svg)](https://codecov.io/gh/silverstripe/silverstripe-textextraction)
|
||||||
[![SilverStripe supported module](https://img.shields.io/badge/silverstripe-supported-0071C4.svg)](https://www.silverstripe.org/software/addons/silverstripe-commercially-supported-module-list/)
|
[![SilverStripe supported module](https://img.shields.io/badge/silverstripe-supported-0071C4.svg)](https://www.silverstripe.org/software/addons/silverstripe-commercially-supported-module-list/)
|
||||||
[![Code Quality](http://img.shields.io/scrutinizer/g/silverstripe/silverstripe-textextraction.svg?style=flat)](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction)
|
|
||||||
[![Version](http://img.shields.io/packagist/v/silverstripe/textextraction.svg?style=flat)](https://packagist.org/packages/silverstripe/silverstripe-textextraction)
|
|
||||||
[![License](http://img.shields.io/packagist/l/silverstripe/textextraction.svg?style=flat)](license.md)
|
|
||||||
|
|
||||||
|
|
||||||
Provides a text extraction API for file content, that can hook into different extractor
|
Provides a text extraction API for file content, that can hook into different extractor
|
||||||
engines based on availability and the parsed file format. The output returned is always a string of the file content.
|
engines based on availability and the parsed file format. The output returned is always a string of the file content.
|
||||||
@ -26,14 +24,14 @@ The module supports text extraction on the following file formats:
|
|||||||
|
|
||||||
## Requirements
|
## Requirements
|
||||||
|
|
||||||
* SilverStripe ^3.1
|
* SilverStripe ^4.0
|
||||||
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
|
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
|
||||||
* (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
|
* (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
|
||||||
* (optional) [Apache Tika](http://tika.apache.org/)
|
* (optional) [Apache Tika](http://tika.apache.org/)
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
```js
|
```
|
||||||
composer require silverstripe/textextraction
|
composer require silverstripe/textextraction
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -3,7 +3,6 @@ Name: textextractioncache
|
|||||||
After:
|
After:
|
||||||
- '#corecache'
|
- '#corecache'
|
||||||
---
|
---
|
||||||
|
|
||||||
SilverStripe\Core\Injector\Injector:
|
SilverStripe\Core\Injector\Injector:
|
||||||
Psr\SimpleCache\CacheInterface.FileTextCache_Cache:
|
Psr\SimpleCache\CacheInterface.FileTextCache_Cache:
|
||||||
factory: SilverStripe\Core\Cache\CacheFactory
|
factory: SilverStripe\Core\Cache\CacheFactory
|
||||||
|
10
_config/config.yml
Normal file
10
_config/config.yml
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
---
|
||||||
|
Name: textextractionconfig
|
||||||
|
---
|
||||||
|
SilverStripe\Core\Injector\Injector:
|
||||||
|
# Define default FileTextCache implementation
|
||||||
|
SilverStripe\TextExtraction\Cache\FileTextCache:
|
||||||
|
class: SilverStripe\TextExtraction\Cache\FileTextCache\Database
|
||||||
|
|
||||||
|
SilverStripe\TextExtraction\Cache\FileTextCache\Database:
|
||||||
|
max_content_length: 500000
|
3
codecov.yml
Normal file
3
codecov.yml
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
comment: false
|
||||||
|
codecov:
|
||||||
|
branch: master
|
@ -4,7 +4,11 @@
|
|||||||
"description": "Text Extraction API for SilverStripe CMS (mostly used with 'fulltextsearch' module)",
|
"description": "Text Extraction API for SilverStripe CMS (mostly used with 'fulltextsearch' module)",
|
||||||
"homepage": "http://silverstripe.org",
|
"homepage": "http://silverstripe.org",
|
||||||
"license": "BSD-3-Clause",
|
"license": "BSD-3-Clause",
|
||||||
"keywords": ["silverstripe", "fulltext", "pdf"],
|
"keywords": [
|
||||||
|
"silverstripe",
|
||||||
|
"fulltext",
|
||||||
|
"pdf"
|
||||||
|
],
|
||||||
"authors": [
|
"authors": [
|
||||||
{
|
{
|
||||||
"name": "SilverStripe",
|
"name": "SilverStripe",
|
||||||
@ -16,14 +20,15 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"require": {
|
"require": {
|
||||||
"php": ">=5.6",
|
|
||||||
"silverstripe/framework": "^4",
|
"silverstripe/framework": "^4",
|
||||||
|
"silverstripe/assets": "^1",
|
||||||
|
"silverstripe/versioned": "^1",
|
||||||
"guzzlehttp/guzzle": "~6.3.0",
|
"guzzlehttp/guzzle": "~6.3.0",
|
||||||
"symfony/event-dispatcher": "^2.6.0@stable",
|
"symfony/event-dispatcher": "^2.6.0@stable",
|
||||||
"symfony/http-foundation": "^2.6.0",
|
"symfony/http-foundation": "^2.6.0"
|
||||||
"silverstripe/assets": "^1"
|
|
||||||
},
|
},
|
||||||
"require-dev": {
|
"require-dev": {
|
||||||
|
"squizlabs/php_codesniffer": "^3",
|
||||||
"phpunit/phpunit": "^5.7"
|
"phpunit/phpunit": "^5.7"
|
||||||
},
|
},
|
||||||
"suggest": {
|
"suggest": {
|
||||||
@ -33,5 +38,7 @@
|
|||||||
"branch-alias": {
|
"branch-alias": {
|
||||||
"dev-master": "3.x-dev"
|
"dev-master": "3.x-dev"
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
|
"minimum-stability": "dev",
|
||||||
|
"prefer-stable": true
|
||||||
}
|
}
|
||||||
|
@ -8,31 +8,30 @@ the content available through your `DataObject` subclass.
|
|||||||
In this case, add the following to `mysite/_config/config.yml`:
|
In this case, add the following to `mysite/_config/config.yml`:
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
File:
|
SilverStripe\Assets\File:
|
||||||
extensions:
|
extensions:
|
||||||
- FileTextExtractable
|
- SilverStripe\TextExtraction\Extension\FileTextExtractable
|
||||||
```
|
```
|
||||||
|
|
||||||
By default any extracted content will be cached against the database row.
|
By default any extracted content will be cached against the database row. In order to stay within common size
|
||||||
In order to stay within common size constraints for SQL queries required in this operation,
|
constraints for SQL queries required in this operation, the cache sets a maximum character length after which
|
||||||
the cache sets a maximum character length after which content gets truncated (default: 500000).
|
content gets truncated (default: 500000). You can configure this value through
|
||||||
You can configure this value through `FileTextCache_Database.max_content_length` in your yaml configuration.
|
`SilverStripe\TextExtraction\Cache\FileTextCache\Database.max_content_length` in your YAML configuration.
|
||||||
|
|
||||||
|
|
||||||
Alternatively, extracted content can be cached using SS_Cache to prevent excessive database growth.
|
Alternatively, extracted content can be cached using a PSR-16 cache (`Psr\SimpleCache\CacheInterface`) to prevent excessive database growth.
|
||||||
In order to swap out the cache backend you can use the following yaml configuration.
|
In order to swap out the cache backend you can use the following yaml configuration.
|
||||||
|
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
---
|
---
|
||||||
Name: mytextextraction
|
Name: mytextextraction
|
||||||
After: '#textextraction'
|
After: '#textextraction'
|
||||||
---
|
---
|
||||||
Injector:
|
SilverStripe\Core\Injector\Injector:
|
||||||
FileTextCache: FileTextCache_SSCache
|
SilverStripe\TextExtraction\Cache\FileTextCache:
|
||||||
FileTextCache_SSCache:
|
class: SilverStripe\TextExtraction\Cache\FileTextCache\Cache
|
||||||
lifetime: 3600 # Number of seconds to cache content for
|
|
||||||
|
|
||||||
|
SilverStripe\TextExtraction\Cache\FileTextCache\Cache:
|
||||||
|
lifetime: 3600 # Number of seconds to cache content for
|
||||||
```
|
```
|
||||||
|
|
||||||
## XPDF
|
## XPDF
|
||||||
@ -42,7 +41,7 @@ commandline utility. Follow their installation instructions, its presence will b
|
|||||||
detected for \*nix operating systems. You can optionally set the binary path (required for Windows) in `mysite/_config/config.yml`:
|
detected for \*nix operating systems. You can optionally set the binary path (required for Windows) in `mysite/_config/config.yml`:
|
||||||
|
|
||||||
```yml
|
```yml
|
||||||
PDFTextExtractor:
|
SilverStripe\TextExtraction\Extractor\PDFTextExtractor:
|
||||||
binary_location: /my/path/pdftotext
|
binary_location: /my/path/pdftotext
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -59,7 +58,7 @@ in your database driver, or even pass it back to Solr as part of a full index up
|
|||||||
In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`):
|
In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`):
|
||||||
|
|
||||||
```yml
|
```yml
|
||||||
SolrCellTextExtractor:
|
SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor:
|
||||||
base_url: 'http://localhost:8983/solr/update/extract'
|
base_url: 'http://localhost:8983/solr/update/extract'
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -76,16 +75,27 @@ or by writing your own method around `FileTextExtractor->getContent()` (see "Usa
|
|||||||
The property should be listed in your `SolrIndex` subclass, e.g. as follows:
|
The property should be listed in your `SolrIndex` subclass, e.g. as follows:
|
||||||
|
|
||||||
```php
|
```php
|
||||||
class MyDocument extends DataObject {
|
use SilverStripe\ORM\DataObject;
|
||||||
static $db = array('Path' => 'Text');
|
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
||||||
function getContent() {
|
|
||||||
|
class MyDocument extends DataObject
|
||||||
|
{
|
||||||
|
private static $db = ['Path' => 'Text'];
|
||||||
|
|
||||||
|
public function getContent()
|
||||||
|
{
|
||||||
$extractor = FileTextExtractor::for_file($this->Path);
|
$extractor = FileTextExtractor::for_file($this->Path);
|
||||||
return $extractor ? $extractor->getContent($this->Path) : null;
|
return $extractor ? $extractor->getContent($this->Path) : null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
class MySolrIndex extends SolrIndex {
|
|
||||||
function init() {
|
use SilverStripe\FullTextSearch\Solr\SolrIndex;
|
||||||
$this->addClass('MyDocument');
|
|
||||||
|
class MySolrIndex extends SolrIndex
|
||||||
|
{
|
||||||
|
public function init()
|
||||||
|
{
|
||||||
|
$this->addClass(MyDocument::class);
|
||||||
$this->addStoredField('Content', 'HTMLText');
|
$this->addStoredField('Content', 'HTMLText');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -120,14 +130,15 @@ exec java -jar tika-app-1.8.jar "$@"
|
|||||||
Tika can also be run as a server. You can configure your server endpoint by setting the url via config.
|
Tika can also be run as a server. You can configure your server endpoint by setting the url via config.
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
TikaServerTextExtractor:
|
SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor:
|
||||||
server_endpoint: 'http://localhost:9998'
|
server_endpoint: 'http://localhost:9998'
|
||||||
```
|
```
|
||||||
|
|
||||||
Alternatively this may be specified via the `SS_TIKA_ENDPOINT` directive in your `_ss_environment.php` file, or an environment variable of the same name.
|
Alternatively, this may be specified via the `SS_TIKA_ENDPOINT` setting in your `.env` file, or via a
|
||||||
|
system environment variable of the same name.
|
||||||
|
|
||||||
|
|
||||||
Then startup your server as below
|
Then start up your server as below:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
java -jar tika-server-1.8.jar --host=localhost --port=9998
|
java -jar tika-server-1.8.jar --host=localhost --port=9998
|
||||||
@ -136,7 +147,7 @@ java -jar tika-server-1.8.jar --host=localhost --port=9998
|
|||||||
While you can run `tika-app-1.8.jar` in server mode as well (with the `--server` flag),
|
While you can run `tika-app-1.8.jar` in server mode as well (with the `--server` flag),
|
||||||
it behaves differently and is not recommended.
|
it behaves differently and is not recommended.
|
||||||
|
|
||||||
The module will log extraction errors with `SS_Log::NOTICE` priority by default,
|
The module will log extraction errors with PSR-3 "notice" priority by default,
|
||||||
for example a "422 Unprocessable Entity" HTTP response for an encrypted PDF.
|
for example a "422 Unprocessable Entity" HTTP response for an encrypted PDF.
|
||||||
In case you want more information on why processing failed, you can increase
|
In case you want more information on why processing failed, you can increase
|
||||||
the logging verbosity in the tika server instance by passing through
|
the logging verbosity in the tika server instance by passing through
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
# Developer documentation
|
# Developer documentation
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
Manual extraction:
|
Manual extraction via string file path:
|
||||||
|
|
||||||
```php
|
```php
|
||||||
$myFile = '/my/path/myfile.pdf';
|
$myFile = '/my/path/myfile.pdf';
|
||||||
@ -9,6 +10,14 @@ $extractor = FileTextExtractor::for_file($myFile);
|
|||||||
$content = $extractor->getContent($myFile);
|
$content = $extractor->getContent($myFile);
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Manual extraction via File object:
|
||||||
|
|
||||||
|
```php
|
||||||
|
$myFile = File::get()->filter(['Name' => 'My file'])->first();
|
||||||
|
$extractor = FileTextExtractor::for_file($myFile);
|
||||||
|
$content = $extractor->getContent($myFile);
|
||||||
|
```
|
||||||
|
|
||||||
Extraction with `FileTextExtractable` extension applied:
|
Extraction with `FileTextExtractable` extension applied:
|
||||||
|
|
||||||
```php
|
```php
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
Copyright (c) 2017, SilverStripe Limited
|
Copyright (c) 2018, SilverStripe Limited
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
||||||
|
10
phpcs.xml.dist
Normal file
10
phpcs.xml.dist
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<ruleset name="SilverStripe">
|
||||||
|
<description>CodeSniffer ruleset for SilverStripe coding conventions.</description>
|
||||||
|
|
||||||
|
<!-- base rules are PSR-2 -->
|
||||||
|
<rule ref="PSR2" >
|
||||||
|
<!-- Current exclusions -->
|
||||||
|
<exclude name="PSR1.Methods.CamelCapsMethodName.NotCamelCaps" />
|
||||||
|
</rule>
|
||||||
|
</ruleset>
|
14
phpunit.xml.dist
Normal file
14
phpunit.xml.dist
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
<phpunit bootstrap="vendor/silverstripe/framework/tests/bootstrap.php" colors="true">
|
||||||
|
<testsuite name="Default">
|
||||||
|
<directory>tests/</directory>
|
||||||
|
</testsuite>
|
||||||
|
|
||||||
|
<filter>
|
||||||
|
<whitelist addUncoveredFilesFromWhitelist="true">
|
||||||
|
<directory suffix=".php">src/</directory>
|
||||||
|
<exclude>
|
||||||
|
<directory suffix=".php">tests/</directory>
|
||||||
|
</exclude>
|
||||||
|
</whitelist>
|
||||||
|
</filter>
|
||||||
|
</phpunit>
|
@ -1,6 +1,6 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Extension;
|
namespace SilverStripe\TextExtraction\Cache;
|
||||||
|
|
||||||
use SilverStripe\Assets\File;
|
use SilverStripe\Assets\File;
|
||||||
|
|
@ -1,19 +1,21 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Extension;
|
namespace SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||||
|
|
||||||
use SilverStripe\Assets\File,
|
use Psr\SimpleCache\CacheInterface;
|
||||||
SilverStripe\Core\Config\Config,
|
use SilverStripe\Assets\File;
|
||||||
SilverStripe\TextExtraction\Extension\FileTextCache,
|
use SilverStripe\Core\Config\Configurable;
|
||||||
SilverStripe\Core\Flushable,
|
use SilverStripe\Core\Flushable;
|
||||||
Psr\SimpleCache\CacheInterface,
|
use SilverStripe\Core\Injector\Injector;
|
||||||
SilverStripe\Core\Injector\Injector;
|
use SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Uses SS_Cache with a lifetime to cache extracted content
|
* Uses a PSR-16 cache (Psr\SimpleCache\CacheInterface) with a lifetime to cache extracted content
|
||||||
*/
|
*/
|
||||||
class FileTextCache_Cache implements FileTextCache, Flushable
|
class Cache implements FileTextCache, Flushable
|
||||||
{
|
{
|
||||||
|
use Configurable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Lifetime of cache in seconds
|
* Lifetime of cache in seconds
|
||||||
* Null is indefinite
|
* Null is indefinite
|
||||||
@ -46,7 +48,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable
|
|||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @param File $file
|
* @param File $file
|
||||||
* @return type
|
* @return mixed
|
||||||
*/
|
*/
|
||||||
public function load(File $file)
|
public function load(File $file)
|
||||||
{
|
{
|
||||||
@ -63,8 +65,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable
|
|||||||
*/
|
*/
|
||||||
public function save(File $file, $content)
|
public function save(File $file, $content)
|
||||||
{
|
{
|
||||||
$lifetime = Config::inst()->get(__CLASS__, 'lifetime');
|
$lifetime = $this->config()->get('lifetime') ?: 3600;
|
||||||
$lifetime = $lifetime ?: 3600;
|
|
||||||
$key = $this->getKey($file);
|
$key = $this->getKey($file);
|
||||||
$cache = self::get_cache();
|
$cache = self::get_cache();
|
||||||
|
|
||||||
@ -94,7 +95,7 @@ class FileTextCache_Cache implements FileTextCache, Flushable
|
|||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @param File $file
|
* @param File $file
|
||||||
* @return type
|
* @return bool
|
||||||
*/
|
*/
|
||||||
public function invalidate(File $file)
|
public function invalidate(File $file)
|
||||||
{
|
{
|
@ -1,17 +1,25 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Extension;
|
namespace SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||||
|
|
||||||
use SilverStripe\Assets\File,
|
use SilverStripe\Assets\File;
|
||||||
SilverStripe\Core\Config\Config,
|
use SilverStripe\Core\Config\Configurable;
|
||||||
SilverStripe\TextExtraction\Extension\FileTextCache;
|
use SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Caches the extracted content on the record for the file.
|
* Caches the extracted content on the record for the file.
|
||||||
* Limits the stored file content by default to avoid hitting query size limits.
|
* Limits the stored file content by default to avoid hitting query size limits.
|
||||||
*/
|
*/
|
||||||
class FileTextCache_Database implements FileTextCache
|
class Database implements FileTextCache
|
||||||
{
|
{
|
||||||
|
use Configurable;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @config
|
||||||
|
* @var int
|
||||||
|
*/
|
||||||
|
private static $max_content_length = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @param File $file
|
* @param File $file
|
||||||
@ -28,7 +36,7 @@ class FileTextCache_Database implements FileTextCache
|
|||||||
*/
|
*/
|
||||||
public function save(File $file, $content)
|
public function save(File $file, $content)
|
||||||
{
|
{
|
||||||
$maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length');
|
$maxLength = $this->config()->get('max_content_length');
|
||||||
$file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content;
|
$file->FileContentCache = ($maxLength) ? substr($content, 0, $maxLength) : $content;
|
||||||
$file->write();
|
$file->write();
|
||||||
}
|
}
|
@ -1,9 +0,0 @@
|
|||||||
<?php
|
|
||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Exception;
|
|
||||||
|
|
||||||
use \Exception;
|
|
||||||
|
|
||||||
class FileTextExtractor_Exception extends Exception
|
|
||||||
{
|
|
||||||
}
|
|
@ -2,9 +2,10 @@
|
|||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Extension;
|
namespace SilverStripe\TextExtraction\Extension;
|
||||||
|
|
||||||
use SilverStripe\ORM\DataExtension,
|
use SilverStripe\Assets\File;
|
||||||
SilverStripe\TextExtraction\Extension\FileTextCache,
|
use SilverStripe\ORM\DataExtension;
|
||||||
SilverStripe\Control\Director;
|
use SilverStripe\TextExtraction\Cache\FileTextCache;
|
||||||
|
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Decorate File or a File derivative to enable text extraction from the file content. Uses a set of subclasses of
|
* Decorate File or a File derivative to enable text extraction from the file content. Uses a set of subclasses of
|
||||||
@ -13,36 +14,32 @@ use SilverStripe\ORM\DataExtension,
|
|||||||
* Adds an additional property which is the cached contents, which is populated on demand.
|
* Adds an additional property which is the cached contents, which is populated on demand.
|
||||||
*
|
*
|
||||||
* @author mstephens
|
* @author mstephens
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
class FileTextExtractable extends DataExtension
|
class FileTextExtractable extends DataExtension
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* @var array
|
* @var array
|
||||||
* @config
|
* @config
|
||||||
*/
|
*/
|
||||||
private static $db = array(
|
private static $db = [
|
||||||
'FileContentCache' => 'Text'
|
'FileContentCache' => 'Text'
|
||||||
);
|
];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* @var array
|
* @var array
|
||||||
* @config
|
* @config
|
||||||
*/
|
*/
|
||||||
private static $casting = array(
|
private static $casting = [
|
||||||
'FileContent' => 'Text'
|
'FileContent' => 'Text'
|
||||||
);
|
];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* @var array
|
* @var array
|
||||||
* @config
|
* @config
|
||||||
*/
|
*/
|
||||||
private static $dependencies = array(
|
private static $dependencies = [
|
||||||
'TextCache' => '%$SilverStripe\TextExtraction\Extension\FileTextCache_Cache'
|
'TextCache' => '%$' . FileTextCache::class,
|
||||||
);
|
];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @var FileTextCache
|
* @var FileTextCache
|
||||||
@ -50,13 +47,13 @@ class FileTextExtractable extends DataExtension
|
|||||||
protected $fileTextCache = null;
|
protected $fileTextCache = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* @param FileTextCache $cache
|
* @param FileTextCache $cache
|
||||||
* @return void
|
* @return $this
|
||||||
*/
|
*/
|
||||||
public function setTextCache(FileTextCache $cache)
|
public function setTextCache(FileTextCache $cache)
|
||||||
{
|
{
|
||||||
$this->fileTextCache = $cache;
|
$this->fileTextCache = $cache;
|
||||||
|
return $this;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -78,37 +75,38 @@ class FileTextExtractable extends DataExtension
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and returns the text.
|
* Tries to parse the file contents if a FileTextExtractor class exists to handle the file type, and
|
||||||
* The value is also cached into the File record itself.
|
* returns the text. The value is also cached into the File record itself.
|
||||||
*
|
*
|
||||||
* @param boolean $disableCache If false, the file content is only parsed on demand.
|
* @param boolean $disableCache If false, the file content is only parsed on demand.
|
||||||
* If true, the content parsing is forced, bypassing
|
* If true, the content parsing is forced, bypassing
|
||||||
* the cached version
|
* the cached version
|
||||||
* @return mixed string | null
|
* @return string|null
|
||||||
*/
|
*/
|
||||||
public function extractFileAsText($disableCache = false)
|
public function extractFileAsText($disableCache = false)
|
||||||
{
|
{
|
||||||
|
/** @var File $file */
|
||||||
|
$file = $this->owner;
|
||||||
if (!$disableCache) {
|
if (!$disableCache) {
|
||||||
$text = $this->getTextCache()->load($this->owner);
|
$text = $this->getTextCache()->load($file);
|
||||||
if ($text) {
|
if ($text) {
|
||||||
return $text;
|
return $text;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Determine which extractor can process this file.
|
// Determine which extractor can process this file.
|
||||||
$path = Director::baseFolder() . '/' . $this->owner->getFilename();
|
$extractor = FileTextExtractor::for_file($file);
|
||||||
$extractor = FileTextExtractor::for_file($path);
|
|
||||||
if (!$extractor) {
|
if (!$extractor) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
$text = $extractor->getContent($path);
|
$text = $extractor->getContent($file);
|
||||||
if (!$text) {
|
if (!$text) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!$disableCache) {
|
if (!$disableCache) {
|
||||||
$this->getTextCache()->save($this->owner, $text);
|
$this->getTextCache()->save($file, $text);
|
||||||
}
|
}
|
||||||
|
|
||||||
return $text;
|
return $text;
|
||||||
|
@ -2,17 +2,22 @@
|
|||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Extractor;
|
namespace SilverStripe\TextExtraction\Extractor;
|
||||||
|
|
||||||
use SilverStripe\Core\Config\Config,
|
use SilverStripe\Assets\File;
|
||||||
SilverStripe\Core\Injector\Injector,
|
use SilverStripe\Core\ClassInfo;
|
||||||
SilverStripe\Core\ClassInfo;
|
use SilverStripe\Core\Config\Config;
|
||||||
|
use SilverStripe\Core\Config\Configurable;
|
||||||
|
use SilverStripe\Core\Injector\Injectable;
|
||||||
|
use SilverStripe\Core\Injector\Injector;
|
||||||
|
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents.
|
* A decorator for File or a subclass that provides a method for extracting full-text from the file's external contents.
|
||||||
* @author mstephens
|
* @author mstephens
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
abstract class FileTextExtractor
|
abstract class FileTextExtractor
|
||||||
{
|
{
|
||||||
|
use Configurable;
|
||||||
|
use Injectable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set priority from 0-100.
|
* Set priority from 0-100.
|
||||||
@ -45,7 +50,7 @@ abstract class FileTextExtractor
|
|||||||
// Generate the sorted list of extractors on demand.
|
// Generate the sorted list of extractors on demand.
|
||||||
$classes = ClassInfo::subclassesFor(__CLASS__);
|
$classes = ClassInfo::subclassesFor(__CLASS__);
|
||||||
array_shift($classes);
|
array_shift($classes);
|
||||||
$classPriorities = array();
|
$classPriorities = [];
|
||||||
|
|
||||||
foreach ($classes as $class) {
|
foreach ($classes as $class) {
|
||||||
$classPriorities[$class] = Config::inst()->get($class, 'priority');
|
$classPriorities[$class] = Config::inst()->get($class, 'priority');
|
||||||
@ -76,23 +81,25 @@ abstract class FileTextExtractor
|
|||||||
*/
|
*/
|
||||||
protected static function get_mime($path)
|
protected static function get_mime($path)
|
||||||
{
|
{
|
||||||
$file = new Symfony\Component\HttpFoundation\File\File($path);
|
$file = new \Symfony\Component\HttpFoundation\File\File($path);
|
||||||
|
|
||||||
return $file->getMimeType();
|
return $file->getMimeType();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param string $path
|
* Given a File object, decide which extractor instance to use to handle it
|
||||||
* @return mixed FileTextExtractor | null
|
*
|
||||||
|
* @param File $file
|
||||||
|
* @return FileTextExtractor|null
|
||||||
*/
|
*/
|
||||||
public static function for_file($path)
|
public static function for_file(File $file)
|
||||||
{
|
{
|
||||||
if (!file_exists($path) || is_dir($path)) {
|
if (!$file) {
|
||||||
return;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
$extension = pathinfo($path, PATHINFO_EXTENSION);
|
$extension = $file->getExtension();
|
||||||
$mime = self::get_mime($path);
|
$mime = $file->getMimeType();
|
||||||
|
|
||||||
foreach (self::get_extractor_classes() as $className) {
|
foreach (self::get_extractor_classes() as $className) {
|
||||||
$extractor = self::get_extractor($className);
|
$extractor = self::get_extractor($className);
|
||||||
@ -114,6 +121,39 @@ abstract class FileTextExtractor
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Some text extractors (like pdftotext) may require a physical file to read from, so write the current
|
||||||
|
* file contents to a temp file and return its path
|
||||||
|
*
|
||||||
|
* @param File $file
|
||||||
|
* @return string
|
||||||
|
* @throws Exception
|
||||||
|
*/
|
||||||
|
protected function getPathFromFile(File $file)
|
||||||
|
{
|
||||||
|
$path = tempnam(TEMP_PATH, 'pdftextextractor_');
|
||||||
|
if (false === $path) {
|
||||||
|
throw new Exception(static::class . '->getPathFromFile() could not allocate temporary file name');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Append extension to temp file if one is set
|
||||||
|
if ($file->getExtension()) {
|
||||||
|
$path .= '.' . $file->getExtension();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove any existing temp files with this name
|
||||||
|
if (file_exists($path)) {
|
||||||
|
unlink($path);
|
||||||
|
}
|
||||||
|
|
||||||
|
$bytesWritten = file_put_contents($path, $file->getStream());
|
||||||
|
if (false === $bytesWritten) {
|
||||||
|
throw new Exception(static::class . '->getPathFromFile() failed to write temporary file');
|
||||||
|
}
|
||||||
|
|
||||||
|
return $path;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks if the extractor is supported on the current environment,
|
* Checks if the extractor is supported on the current environment,
|
||||||
* for example if the correct binaries or libraries are available.
|
* for example if the correct binaries or libraries are available.
|
||||||
@ -132,7 +172,7 @@ abstract class FileTextExtractor
|
|||||||
abstract public function supportsExtension($extension);
|
abstract public function supportsExtension($extension);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Determine if this extractor suports the given mime type.
|
* Determine if this extractor supports the given mime type.
|
||||||
* Will only be called if supportsExtension returns false.
|
* Will only be called if supportsExtension returns false.
|
||||||
*
|
*
|
||||||
* @param string $mime
|
* @param string $mime
|
||||||
@ -141,10 +181,10 @@ abstract class FileTextExtractor
|
|||||||
abstract public function supportsMime($mime);
|
abstract public function supportsMime($mime);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Given a file path, extract the contents as text.
|
* Given a File instance, extract the contents as text.
|
||||||
*
|
*
|
||||||
* @param string $path
|
* @param File|string $file Either the File instance, or a file path for a file to load
|
||||||
* @return string
|
* @return string
|
||||||
*/
|
*/
|
||||||
abstract public function getContent($path);
|
abstract public function getContent($file);
|
||||||
}
|
}
|
||||||
|
7
src/Extractor/FileTextExtractor/Exception.php
Normal file
7
src/Extractor/FileTextExtractor/Exception.php
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
||||||
|
|
||||||
|
class Exception extends \Exception
|
||||||
|
{
|
||||||
|
}
|
@ -2,47 +2,16 @@
|
|||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Extractor;
|
namespace SilverStripe\TextExtraction\Extractor;
|
||||||
|
|
||||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
use SilverStripe\Assets\File;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Text extractor that uses php function strip_tags to get just the text. OK for indexing, not the best for readable text.
|
* Text extractor that uses php function strip_tags to get just the text. OK for indexing, not
|
||||||
* @author mstephens
|
* the best for readable text.
|
||||||
*
|
*
|
||||||
|
* @author mstephens
|
||||||
*/
|
*/
|
||||||
class HTMLTextExtractor extends FileTextExtractor
|
class HTMLTextExtractor extends FileTextExtractor
|
||||||
{
|
{
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @return boolean
|
|
||||||
*/
|
|
||||||
public function isAvailable()
|
|
||||||
{
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @param string $extension
|
|
||||||
* @return array
|
|
||||||
*/
|
|
||||||
public function supportsExtension($extension)
|
|
||||||
{
|
|
||||||
return in_array(
|
|
||||||
strtolower($extension), array("html", "htm", "xhtml")
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @param string $mime
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
public function supportsMime($mime)
|
|
||||||
{
|
|
||||||
return strtolower($mime) === 'text/html';
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Lower priority because it's not the most clever HTML extraction. If there is something better, use it
|
* Lower priority because it's not the most clever HTML extraction. If there is something better, use it
|
||||||
*
|
*
|
||||||
@ -51,21 +20,48 @@ class HTMLTextExtractor extends FileTextExtractor
|
|||||||
*/
|
*/
|
||||||
private static $priority = 10;
|
private static $priority = 10;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return boolean
|
||||||
|
*/
|
||||||
|
public function isAvailable()
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param string $extension
|
||||||
|
* @return boolean
|
||||||
|
*/
|
||||||
|
public function supportsExtension($extension)
|
||||||
|
{
|
||||||
|
return in_array(strtolower($extension), ["html", "htm", "xhtml"]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param string $mime
|
||||||
|
* @return boolean
|
||||||
|
*/
|
||||||
|
public function supportsMime($mime)
|
||||||
|
{
|
||||||
|
return strtolower($mime) === 'text/html';
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extracts text content from HTML, by using strip_tags()
|
* Extracts text content from HTML, by using strip_tags()
|
||||||
* combined with regular expressions to remove non-content tags like <style> or <script>,
|
* combined with regular expressions to remove non-content tags like <style> or <script>,
|
||||||
* as well as adding line breaks after block tags.
|
* as well as adding line breaks after block tags.
|
||||||
*
|
*
|
||||||
* @param string $path
|
* @param File $file
|
||||||
* @return string
|
* @return string
|
||||||
*/
|
*/
|
||||||
public function getContent($path)
|
public function getContent($file)
|
||||||
{
|
{
|
||||||
$content = file_get_contents($path);
|
$content = $file instanceof File ? $file->getString() : file_get_contents($file);
|
||||||
|
|
||||||
// Yes, yes, regex'ing HTML is evil.
|
// Yes, yes, regex'ing HTML is evil.
|
||||||
// Since we don't care about well-formedness or markup here, it does the job.
|
// Since we don't care about well-formedness or markup here, it does the job.
|
||||||
$content = preg_replace(
|
$content = preg_replace(
|
||||||
array(
|
[
|
||||||
// Remove invisible content
|
// Remove invisible content
|
||||||
'@<head[^>]*?>.*?</head>@siu',
|
'@<head[^>]*?>.*?</head>@siu',
|
||||||
'@<style[^>]*?>.*?</style>@siu',
|
'@<style[^>]*?>.*?</style>@siu',
|
||||||
@ -84,11 +80,11 @@ class HTMLTextExtractor extends FileTextExtractor
|
|||||||
'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
|
'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
|
||||||
'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
|
'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
|
||||||
'@</?((frameset)|(frame)|(iframe))@iu',
|
'@</?((frameset)|(frame)|(iframe))@iu',
|
||||||
), array(
|
],
|
||||||
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0",
|
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0"],
|
||||||
), $content
|
$content
|
||||||
);
|
);
|
||||||
|
|
||||||
return strip_tags($content);
|
return strip_tags($content);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -2,17 +2,15 @@
|
|||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Extractor;
|
namespace SilverStripe\TextExtraction\Extractor;
|
||||||
|
|
||||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
|
use SilverStripe\Assets\File;
|
||||||
SilverStripe\TextExtraction\Exception\FileTextExtractor_Exception;
|
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Text extractor that calls pdftotext to do the conversion.
|
* Text extractor that calls pdftotext to do the conversion.
|
||||||
* @author mstephens
|
* @author mstephens
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
class PDFTextExtractor extends FileTextExtractor
|
class PDFTextExtractor extends FileTextExtractor
|
||||||
{
|
{
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set to bin path this extractor can execute
|
* Set to bin path this extractor can execute
|
||||||
*
|
*
|
||||||
@ -27,10 +25,10 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
* @config
|
* @config
|
||||||
* @var array
|
* @var array
|
||||||
*/
|
*/
|
||||||
private static $search_binary_locations = array(
|
private static $search_binary_locations = [
|
||||||
'/usr/bin',
|
'/usr/bin',
|
||||||
'/usr/local/bin',
|
'/usr/local/bin',
|
||||||
);
|
];
|
||||||
|
|
||||||
public function isAvailable()
|
public function isAvailable()
|
||||||
{
|
{
|
||||||
@ -46,12 +44,13 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
public function supportsMime($mime)
|
public function supportsMime($mime)
|
||||||
{
|
{
|
||||||
return in_array(
|
return in_array(
|
||||||
strtolower($mime), array(
|
strtolower($mime),
|
||||||
|
[
|
||||||
'application/pdf',
|
'application/pdf',
|
||||||
'application/x-pdf',
|
'application/x-pdf',
|
||||||
'application/x-bzpdf',
|
'application/x-bzpdf',
|
||||||
'application/x-gzpdf'
|
'application/x-gzpdf'
|
||||||
)
|
]
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -64,10 +63,10 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
protected function bin($program = '')
|
protected function bin($program = '')
|
||||||
{
|
{
|
||||||
// Get list of allowed search paths
|
// Get list of allowed search paths
|
||||||
if ($location = $this->config()->binary_location) {
|
if ($location = $this->config()->get('binary_location')) {
|
||||||
$locations = array($location);
|
$locations = [$location];
|
||||||
} else {
|
} else {
|
||||||
$locations = $this->config()->search_binary_locations;
|
$locations = $this->config()->get('search_binary_locations');
|
||||||
}
|
}
|
||||||
|
|
||||||
// Find program in each path
|
// Find program in each path
|
||||||
@ -85,35 +84,41 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getContent($path)
|
public function getContent($file)
|
||||||
{
|
{
|
||||||
if (!$path) {
|
if (!$file || (is_string($file) && !file_exists($file))) {
|
||||||
return "";
|
// no file
|
||||||
} // no file
|
return '';
|
||||||
$content = $this->getRawOutput($path);
|
}
|
||||||
|
$content = $this->getRawOutput($file);
|
||||||
return $this->cleanupLigatures($content);
|
return $this->cleanupLigatures($content);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Invoke pdftotext with the given path
|
* Invoke pdftotext with the given File object
|
||||||
*
|
*
|
||||||
* @param string $path
|
* @param File|string $file
|
||||||
* @return string Output
|
* @return string Output
|
||||||
* @throws FileTextExtractor_Exception
|
* @throws Exception
|
||||||
*/
|
*/
|
||||||
protected function getRawOutput($path)
|
protected function getRawOutput($file)
|
||||||
{
|
{
|
||||||
if (!$this->isAvailable()) {
|
if (!$this->isAvailable()) {
|
||||||
throw new FileTextExtractor_Exception("getRawOutput called on unavailable extractor");
|
throw new Exception("getRawOutput called on unavailable extractor");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$path = $file instanceof File ? $this->getPathFromFile($file) : $file;
|
||||||
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
|
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
|
||||||
if ($err) {
|
if ($err) {
|
||||||
if (!is_array($err) && $err == 1) {
|
if (!is_array($err) && $err == 1) {
|
||||||
// For Windows compatibility
|
// For Windows compatibility
|
||||||
$err = $content;
|
$err = $content;
|
||||||
}
|
}
|
||||||
throw new FileTextExtractor_Exception(sprintf(
|
|
||||||
'PDFTextExtractor->getContent() failed for %s: %s', $path, implode(PHP_EOL, $err)
|
throw new Exception(sprintf(
|
||||||
|
'PDFTextExtractor->getContent() failed for %s: %s',
|
||||||
|
$path,
|
||||||
|
implode(PHP_EOL, $err)
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -130,7 +135,7 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
*/
|
*/
|
||||||
protected function cleanupLigatures($input)
|
protected function cleanupLigatures($input)
|
||||||
{
|
{
|
||||||
$mapping = array(
|
$mapping = [
|
||||||
'ff' => 'ff',
|
'ff' => 'ff',
|
||||||
'fi' => 'fi',
|
'fi' => 'fi',
|
||||||
'fl' => 'fl',
|
'fl' => 'fl',
|
||||||
@ -138,9 +143,8 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
'ffl' => 'ffl',
|
'ffl' => 'ffl',
|
||||||
'ſt' => 'ft',
|
'ſt' => 'ft',
|
||||||
'st' => 'st'
|
'st' => 'st'
|
||||||
);
|
];
|
||||||
|
|
||||||
return str_replace(array_keys($mapping), array_values($mapping), $input);
|
return str_replace(array_keys($mapping), array_values($mapping), $input);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -2,9 +2,12 @@
|
|||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Extractor;
|
namespace SilverStripe\TextExtraction\Extractor;
|
||||||
|
|
||||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
|
use Exception;
|
||||||
GuzzleHttp\Client,
|
use GuzzleHttp\Client;
|
||||||
Psr\Log\LoggerInterface;
|
use InvalidArgumentException;
|
||||||
|
use Psr\Log\LoggerInterface;
|
||||||
|
use SilverStripe\Assets\File;
|
||||||
|
use SilverStripe\Core\Injector\Injector;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Text extractor that calls an Apache Solr instance
|
* Text extractor that calls an Apache Solr instance
|
||||||
@ -18,7 +21,7 @@ use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
|
|||||||
class SolrCellTextExtractor extends FileTextExtractor
|
class SolrCellTextExtractor extends FileTextExtractor
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* Base URL to use for solr text extraction.
|
* Base URL to use for Solr text extraction.
|
||||||
* E.g. http://localhost:8983/solr/update/extract
|
* E.g. http://localhost:8983/solr/update/extract
|
||||||
*
|
*
|
||||||
* @config
|
* @config
|
||||||
@ -27,43 +30,36 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
private static $base_url;
|
private static $base_url;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* @var int
|
* @var int
|
||||||
* @config
|
* @config
|
||||||
*/
|
*/
|
||||||
private static $priority = 75;
|
private static $priority = 75;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
* @var Client
|
||||||
* @var GuzzleHttp\Client
|
|
||||||
*/
|
*/
|
||||||
protected $httpClient;
|
protected $httpClient;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
* @return Client
|
||||||
* @return GuzzleHttp\Client
|
|
||||||
* @throws InvalidArgumentException
|
|
||||||
*/
|
*/
|
||||||
public function getHttpClient()
|
public function getHttpClient()
|
||||||
{
|
{
|
||||||
if (!$this->config()->get('base_url')) {
|
|
||||||
throw new \InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
|
|
||||||
}
|
|
||||||
if (!$this->httpClient) {
|
if (!$this->httpClient) {
|
||||||
$this->httpClient = new Client($this->config()->get('base_url'));
|
$this->httpClient = new Client();
|
||||||
}
|
}
|
||||||
|
|
||||||
return $this->httpClient;
|
return $this->httpClient;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
* @param Client $client
|
||||||
* @param GuzzleHttp\Client $client
|
* @return $this
|
||||||
* @return void
|
|
||||||
*/
|
*/
|
||||||
public function setHttpClient($client)
|
public function setHttpClient(Client $client)
|
||||||
{
|
{
|
||||||
$this->httpClient = $client;
|
$this->httpClient = $client;
|
||||||
|
return $this;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -73,30 +69,28 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
{
|
{
|
||||||
$url = $this->config()->get('base_url');
|
$url = $this->config()->get('base_url');
|
||||||
|
|
||||||
return (boolean) $url;
|
return (bool) $url;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* @param string $extension
|
* @param string $extension
|
||||||
* @return boolean
|
* @return bool
|
||||||
*/
|
*/
|
||||||
public function supportsExtension($extension)
|
public function supportsExtension($extension)
|
||||||
{
|
{
|
||||||
return in_array(
|
return in_array(
|
||||||
strtolower($extension),
|
strtolower($extension),
|
||||||
array(
|
[
|
||||||
'pdf', 'doc', 'docx', 'xls', 'xlsx',
|
'pdf', 'doc', 'docx', 'xls', 'xlsx',
|
||||||
'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
|
'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
|
||||||
'ppt', 'pptx', 'odp', 'fodp', 'csv'
|
'ppt', 'pptx', 'odp', 'fodp', 'csv'
|
||||||
)
|
]
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* @param string $mime
|
* @param string $mime
|
||||||
* @return boolean
|
* @return bool
|
||||||
*/
|
*/
|
||||||
public function supportsMime($mime)
|
public function supportsMime($mime)
|
||||||
{
|
{
|
||||||
@ -105,36 +99,45 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
* @param File|string $file
|
||||||
* @param string $path
|
|
||||||
* @return string
|
* @return string
|
||||||
|
* @throws InvalidArgumentException
|
||||||
*/
|
*/
|
||||||
public function getContent($path)
|
public function getContent($file)
|
||||||
{
|
{
|
||||||
if (!$path) {
|
if (!$file || (is_string($file) && !file_exists($file))) {
|
||||||
return "";
|
// no file
|
||||||
} // no file
|
return '';
|
||||||
|
}
|
||||||
|
|
||||||
$fileName = basename($path);
|
$fileName = $file instanceof File ? $file->getFilename() : basename($file);
|
||||||
$client = $this->getHttpClient();
|
$client = $this->getHttpClient();
|
||||||
|
|
||||||
|
// Get and validate base URL
|
||||||
|
$baseUrl = $this->config()->get('base_url');
|
||||||
|
if (!$this->config()->get('base_url')) {
|
||||||
|
throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
$path = $this->getPathFromFile($file);
|
||||||
$request = $client
|
$request = $client
|
||||||
->post()
|
->post($baseUrl)
|
||||||
->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text'))
|
->addPostFields(['extractOnly' => 'true', 'extractFormat' => 'text'])
|
||||||
->addPostFiles(array('myfile' => $path));
|
->addPostFiles(['myfile' => $path]);
|
||||||
$response = $request->send();
|
$response = $request->send();
|
||||||
} catch (\InvalidArgumentException $e) {
|
} catch (InvalidArgumentException $e) {
|
||||||
$msg = sprintf(
|
$msg = sprintf(
|
||||||
'Error extracting text from "%s" (message: %s)',
|
'Error extracting text from "%s" (message: %s)',
|
||||||
$path,
|
$fileName,
|
||||||
$e->getMessage()
|
$e->getMessage()
|
||||||
);
|
);
|
||||||
Injector::inst()->get(LoggerInterface::class)->notice($msg);
|
Injector::inst()->get(LoggerInterface::class)->notice($msg);
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
} catch (\Exception $e) {
|
} catch (Exception $e) {
|
||||||
// Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search query in some cases.
|
// Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search
|
||||||
|
// query in some cases.
|
||||||
$msg = sprintf(
|
$msg = sprintf(
|
||||||
'Tika server error attempting to extract from "%s" (message: %s)',
|
'Tika server error attempting to extract from "%s" (message: %s)',
|
||||||
$path,
|
$path,
|
||||||
@ -146,7 +149,7 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Just initialise it, it doesn't take miuch.
|
// Just initialise it, it doesn't take much.
|
||||||
$matches = [];
|
$matches = [];
|
||||||
|
|
||||||
// Use preg match to avoid SimpleXML running out of memory on large text nodes
|
// Use preg match to avoid SimpleXML running out of memory on large text nodes
|
||||||
|
@ -2,10 +2,10 @@
|
|||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Extractor;
|
namespace SilverStripe\TextExtraction\Extractor;
|
||||||
|
|
||||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor,
|
use SilverStripe\Assets\File;
|
||||||
SilverStripe\Core\Injector\Injector,
|
use SilverStripe\Core\Environment;
|
||||||
SilverStripe\Core\Environment,
|
use SilverStripe\Core\Injector\Injector;
|
||||||
SilverStripe\TextExtraction\Rest\TikaRestClient;
|
use SilverStripe\TextExtraction\Rest\TikaRestClient;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Enables text extraction of file content via the Tika Rest Server
|
* Enables text extraction of file content via the Tika Rest Server
|
||||||
@ -35,19 +35,26 @@ class TikaServerTextExtractor extends FileTextExtractor
|
|||||||
*/
|
*/
|
||||||
protected $client = null;
|
protected $client = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cache of supported mime types
|
||||||
|
*
|
||||||
|
* @var array
|
||||||
|
*/
|
||||||
|
protected $supportedMimes = [];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return TikaRestClient
|
* @return TikaRestClient
|
||||||
*/
|
*/
|
||||||
public function getClient()
|
public function getClient()
|
||||||
{
|
{
|
||||||
return $this->client ?:
|
if (!$this->client) {
|
||||||
($this->client =
|
$this->client = Injector::inst()->createWithArgs(
|
||||||
Injector::inst()->createWithArgs(
|
|
||||||
TikaRestClient::class,
|
TikaRestClient::class,
|
||||||
array($this->getServerEndpoint())
|
[$this->getServerEndpoint()]
|
||||||
)
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
return $this->client;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return string
|
* @return string
|
||||||
@ -59,19 +66,17 @@ class TikaServerTextExtractor extends FileTextExtractor
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Default to configured endpoint
|
// Default to configured endpoint
|
||||||
return $this->config()->server_endpoint;
|
return $this->config()->get('server_endpoint');
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the version of tika installed, or 0 if not installed
|
* Get the version of Tika installed, or 0 if not installed
|
||||||
*
|
*
|
||||||
* @return float version of tika
|
* @return float version of Tika
|
||||||
*/
|
*/
|
||||||
public function getVersion()
|
public function getVersion()
|
||||||
{
|
{
|
||||||
return $this
|
return $this->getClient()->getVersion();
|
||||||
->getClient()
|
|
||||||
->getVersion();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -79,13 +84,12 @@ class TikaServerTextExtractor extends FileTextExtractor
|
|||||||
*/
|
*/
|
||||||
public function isAvailable()
|
public function isAvailable()
|
||||||
{
|
{
|
||||||
return $this->getServerEndpoint() &&
|
return $this->getServerEndpoint()
|
||||||
$this->getClient()->isAvailable() &&
|
&& $this->getClient()->isAvailable()
|
||||||
version_compare($this->getVersion(), '1.7.0') >= 0;
|
&& version_compare($this->getVersion(), '1.7.0') >= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* @param string $extension
|
* @param string $extension
|
||||||
* @return boolean
|
* @return boolean
|
||||||
*/
|
*/
|
||||||
@ -95,31 +99,23 @@ class TikaServerTextExtractor extends FileTextExtractor
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Cache of supported mime types
|
|
||||||
*
|
|
||||||
* @var array
|
|
||||||
*/
|
|
||||||
protected $supportedMimes = array();
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @param string $mime
|
* @param string $mime
|
||||||
* @return boolean
|
* @return boolean
|
||||||
*/
|
*/
|
||||||
public function supportsMime($mime)
|
public function supportsMime($mime)
|
||||||
{
|
{
|
||||||
$supported = $this->supportedMimes ?:
|
if (!$this->supportedMimes) {
|
||||||
($this->supportedMimes = $this->getClient()->getSupportedMimes());
|
$this->supportedMimes = $this->getClient()->getSupportedMimes();
|
||||||
|
}
|
||||||
|
|
||||||
// Check if supported (most common / quickest lookup)
|
// Check if supported (most common / quickest lookup)
|
||||||
if (isset($supported[$mime])) {
|
if (isset($this->supportedMimes[$mime])) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check aliases
|
// Check aliases
|
||||||
foreach ($supported as $info) {
|
foreach ($this->supportedMimes as $info) {
|
||||||
if (isset($info['alias']) && in_array($mime, $info['alias'])) {
|
if (isset($info['alias']) && in_array($mime, $info['alias'])) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -128,8 +124,9 @@ class TikaServerTextExtractor extends FileTextExtractor
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getContent($path)
|
public function getContent($file)
|
||||||
{
|
{
|
||||||
return $this->getClient()->tika($path);
|
$tempFile = $file instanceof File ? $this->getPathFromFile($file) : $file;
|
||||||
|
return $this->getClient()->tika($tempFile);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Extractor;
|
namespace SilverStripe\TextExtraction\Extractor;
|
||||||
|
|
||||||
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
|
use SilverStripe\Assets\File;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Enables text extraction of file content via the Tika CLI
|
* Enables text extraction of file content via the Tika CLI
|
||||||
@ -47,13 +47,13 @@ class TikaTextExtractor extends FileTextExtractor
|
|||||||
*/
|
*/
|
||||||
protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
|
protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
|
||||||
{
|
{
|
||||||
$descriptorSpecs = array(
|
$descriptorSpecs = [
|
||||||
0 => array("pipe", "r"),
|
0 => ["pipe", "r"],
|
||||||
1 => array("pipe", "w"),
|
1 => ["pipe", "w"],
|
||||||
2 => array("pipe", "w")
|
2 => ["pipe", "w"]
|
||||||
);
|
];
|
||||||
// Invoke command
|
// Invoke command
|
||||||
$pipes = array();
|
$pipes = [];
|
||||||
$proc = proc_open($command, $descriptorSpecs, $pipes);
|
$proc = proc_open($command, $descriptorSpecs, $pipes);
|
||||||
|
|
||||||
if (!is_resource($proc)) {
|
if (!is_resource($proc)) {
|
||||||
@ -74,14 +74,10 @@ class TikaTextExtractor extends FileTextExtractor
|
|||||||
return proc_close($proc);
|
return proc_close($proc);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
public function getContent($file)
|
||||||
*
|
|
||||||
* @param string $path
|
|
||||||
* @return string
|
|
||||||
*/
|
|
||||||
public function getContent($path)
|
|
||||||
{
|
{
|
||||||
$mode = $this->config()->output_mode;
|
$mode = $this->config()->get('output_mode');
|
||||||
|
$path = $file instanceof File ? $this->getPathFromFile($file) : $file;
|
||||||
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
|
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
|
||||||
$code = $this->runShell($command, $output);
|
$code = $this->runShell($command, $output);
|
||||||
|
|
||||||
@ -91,8 +87,7 @@ class TikaTextExtractor extends FileTextExtractor
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
* @return bool
|
||||||
* @return boolean
|
|
||||||
*/
|
*/
|
||||||
public function isAvailable()
|
public function isAvailable()
|
||||||
{
|
{
|
||||||
@ -100,8 +95,7 @@ class TikaTextExtractor extends FileTextExtractor
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
* @return bool
|
||||||
* @return boolean
|
|
||||||
*/
|
*/
|
||||||
public function supportsExtension($extension)
|
public function supportsExtension($extension)
|
||||||
{
|
{
|
||||||
@ -111,9 +105,8 @@ class TikaTextExtractor extends FileTextExtractor
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
|
||||||
* @param string $mime
|
* @param string $mime
|
||||||
* @return boolean
|
* @return bool
|
||||||
*/
|
*/
|
||||||
public function supportsMime($mime)
|
public function supportsMime($mime)
|
||||||
{
|
{
|
||||||
@ -121,8 +114,9 @@ class TikaTextExtractor extends FileTextExtractor
|
|||||||
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
|
$code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
|
||||||
|
|
||||||
if ($code) {
|
if ($code) {
|
||||||
|
// Error case
|
||||||
return false;
|
return false;
|
||||||
} // Error case
|
}
|
||||||
|
|
||||||
// Check if the mime type is inside the result
|
// Check if the mime type is inside the result
|
||||||
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
|
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
|
||||||
|
@ -2,11 +2,11 @@
|
|||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Rest;
|
namespace SilverStripe\TextExtraction\Rest;
|
||||||
|
|
||||||
use GuzzleHttp\Client,
|
use GuzzleHttp\Client;
|
||||||
GuzzleHttp\Exception\RequestException,
|
use GuzzleHttp\Exception\RequestException;
|
||||||
SilverStripe\Core\Environment,
|
use Psr\Log\LoggerInterface;
|
||||||
Psr\Log\LoggerInterface,
|
use SilverStripe\Core\Environment;
|
||||||
SilverStripe\Core\Injector\Injector;
|
use SilverStripe\Core\Injector\Injector;
|
||||||
|
|
||||||
class TikaRestClient extends Client
|
class TikaRestClient extends Client
|
||||||
{
|
{
|
||||||
@ -15,30 +15,30 @@ class TikaRestClient extends Client
|
|||||||
*
|
*
|
||||||
* @var array
|
* @var array
|
||||||
*/
|
*/
|
||||||
protected $options = array('username' => null, 'password' => null);
|
protected $options = ['username' => null, 'password' => null];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @var array
|
* @var array
|
||||||
*/
|
*/
|
||||||
protected $mimes = array();
|
protected $mimes = [];
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @param string $baseUrl
|
* @param string $baseUrl
|
||||||
* @param array $config
|
* @param array $config
|
||||||
*/
|
*/
|
||||||
public function __construct($baseUrl = '', $config = null)
|
public function __construct($baseUrl = '', $config = [])
|
||||||
{
|
{
|
||||||
$psswd = Environment::getEnv('SS_TIKA_PASSWORD');
|
$password = Environment::getEnv('SS_TIKA_PASSWORD');
|
||||||
|
|
||||||
if (!empty($psswd)) {
|
if (!empty($password)) {
|
||||||
$this->options = array(
|
$this->options = [
|
||||||
'username' => Environment::getEnv('SS_TIKA_USERNAME'),
|
'username' => Environment::getEnv('SS_TIKA_USERNAME'),
|
||||||
'password' => $psswd,
|
'password' => $password,
|
||||||
);
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
parent::__construct($baseUrl, $config);
|
parent::__construct($config);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -58,7 +58,7 @@ class TikaRestClient extends Client
|
|||||||
}
|
}
|
||||||
} catch (RequestException $ex) {
|
} catch (RequestException $ex) {
|
||||||
$msg = sprintf("Tika unavailable - %s", $ex->getMessage());
|
$msg = sprintf("Tika unavailable - %s", $ex->getMessage());
|
||||||
Injector::inst()->get(LoggerInterface::class)->error($msg);
|
Injector::inst()->get(LoggerInterface::class)->info($msg);
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -120,7 +120,7 @@ class TikaRestClient extends Client
|
|||||||
try {
|
try {
|
||||||
$response = $this->put(
|
$response = $this->put(
|
||||||
'tika',
|
'tika',
|
||||||
array('Accept' => 'text/plain'),
|
['Accept' => 'text/plain'],
|
||||||
file_get_contents($file)
|
file_get_contents($file)
|
||||||
);
|
);
|
||||||
$response->setAuth($this->options['username'], $this->options['password']);
|
$response->setAuth($this->options['username'], $this->options['password']);
|
||||||
@ -139,7 +139,7 @@ class TikaRestClient extends Client
|
|||||||
$msg .= ' Body: ' . $body;
|
$msg .= ' Body: ' . $body;
|
||||||
}
|
}
|
||||||
|
|
||||||
Injector::inst()->get(LoggerInterface::class)->notice($msg);
|
Injector::inst()->get(LoggerInterface::class)->info($msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
return $text;
|
return $text;
|
||||||
|
@ -1,23 +1,23 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
use SilverStripe\TextExtraction\Extension\FileTextCache,
|
namespace SilverStripe\TextExtraction\Tests;
|
||||||
SilverStripe\TextExtraction\Extension\FileTextCache_Database,
|
|
||||||
SilverStripe\Dev\SapphireTest,
|
use SilverStripe\Assets\File;
|
||||||
SilverStripe\Core\Config\Config;
|
use SilverStripe\Core\Config\Config;
|
||||||
|
use SilverStripe\Dev\SapphireTest;
|
||||||
|
use SilverStripe\TextExtraction\Cache\FileTextCache\Database;
|
||||||
|
|
||||||
class FileTextCacheDatabaseTest extends SapphireTest
|
class FileTextCacheDatabaseTest extends SapphireTest
|
||||||
{
|
{
|
||||||
public function testTruncatesByMaxLength()
|
public function testTruncatesByMaxLength()
|
||||||
{
|
{
|
||||||
Config::nest();
|
Config::modify()->set(Database::class, 'max_content_length', 5);
|
||||||
|
|
||||||
Config::inst()->update('FileTextCache_Database', 'max_content_length', 5);
|
$cache = new Database();
|
||||||
$cache = new FileTextCache_Database();
|
$file = $this->getMockBuilder(File::class)->setMethods(['write'])->getMock();
|
||||||
$file = $this->getMock('File', array('write'));
|
|
||||||
$content = '0123456789';
|
$content = '0123456789';
|
||||||
$cache->save($file, $content);
|
$cache->save($file, $content);
|
||||||
$this->assertEquals($cache->load($file), '01234');
|
|
||||||
|
|
||||||
Config::unnest();
|
$this->assertEquals($cache->load($file), '01234');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,46 +1,58 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
|
namespace SilverStripe\TextExtraction\Tests;
|
||||||
|
|
||||||
|
use SilverStripe\Assets\File;
|
||||||
|
use SilverStripe\Core\Config\Config;
|
||||||
|
use SilverStripe\Dev\SapphireTest;
|
||||||
|
use SilverStripe\TextExtraction\Extension\FileTextExtractable;
|
||||||
|
|
||||||
class FileTextExtractableTest extends SapphireTest
|
class FileTextExtractableTest extends SapphireTest
|
||||||
{
|
{
|
||||||
protected $requiredExtensions = array(
|
protected $usesDatabase = true;
|
||||||
'File' => array('FileTextExtractable')
|
|
||||||
);
|
|
||||||
|
|
||||||
public function setUp()
|
protected static $required_extensions = [
|
||||||
|
File::class => [
|
||||||
|
FileTextExtractable::class,
|
||||||
|
],
|
||||||
|
];
|
||||||
|
|
||||||
|
protected function setUp()
|
||||||
{
|
{
|
||||||
parent::setUp();
|
parent::setUp();
|
||||||
|
|
||||||
// Ensure that html is a valid extension
|
// Ensure that html is a valid extension
|
||||||
Config::inst()
|
Config::modify()->merge(File::class, 'allowed_extensions', ['html']);
|
||||||
->nest()
|
|
||||||
->update('File', 'allowed_extensions', array('html'));
|
// Create a copy of the file, as it may be clobbered by the test
|
||||||
|
// ($file->extractFileAsText() calls $file->write)
|
||||||
|
copy(
|
||||||
|
dirname(__FILE__) . '/fixtures/test1.html',
|
||||||
|
dirname(__FILE__) . '/fixtures/test1-copy.html'
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
public function tearDown()
|
protected function tearDown()
|
||||||
{
|
{
|
||||||
Config::unnest();
|
if (file_exists(dirname(__FILE__) . '/fixtures/test1-copy.html')) {
|
||||||
|
unlink(dirname(__FILE__) . '/fixtures/test1-copy.html');
|
||||||
|
}
|
||||||
|
|
||||||
parent::tearDown();
|
parent::tearDown();
|
||||||
}
|
}
|
||||||
|
|
||||||
public function testExtractFileAsText()
|
public function testExtractFileAsText()
|
||||||
{
|
{
|
||||||
// Create a copy of the file, as it may be clobbered by the test
|
|
||||||
// ($file->extractFileAsText() calls $file->write)
|
|
||||||
copy(BASE_PATH.'/textextraction/tests/fixtures/test1.html', BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
|
|
||||||
|
|
||||||
// Use HTML, since the extractor is always available
|
// Use HTML, since the extractor is always available
|
||||||
$file = new File(array(
|
/** @var File|FileTextExtractable $file */
|
||||||
'Name' => 'test1-copy.html',
|
$file = new File(['Name' => 'test1-copy.html']);
|
||||||
'Filename' => 'textextraction/tests/fixtures/test1-copy.html'
|
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1-copy.html');
|
||||||
));
|
|
||||||
$file->write();
|
$file->write();
|
||||||
|
|
||||||
$content = $file->extractFileAsText();
|
$content = $file->extractFileAsText();
|
||||||
|
$this->assertNotNull($content);
|
||||||
$this->assertContains('Test Headline', $content);
|
$this->assertContains('Test Headline', $content);
|
||||||
$this->assertContains('Test Text', $content);
|
$this->assertContains('Test Text', $content);
|
||||||
$this->assertEquals($content, $file->FileContentCache);
|
$this->assertEquals($content, $file->FileContentCache);
|
||||||
|
|
||||||
if (file_exists(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html')) {
|
|
||||||
unlink(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,11 +1,33 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
|
namespace SilverStripe\TextExtraction\Tests;
|
||||||
|
|
||||||
|
use SilverStripe\Assets\File;
|
||||||
|
use SilverStripe\Core\Config\Config;
|
||||||
|
use SilverStripe\Dev\SapphireTest;
|
||||||
|
use SilverStripe\TextExtraction\Extractor\HTMLTextExtractor;
|
||||||
|
|
||||||
class HTMLTextExtractorTest extends SapphireTest
|
class HTMLTextExtractorTest extends SapphireTest
|
||||||
{
|
{
|
||||||
|
protected $usesDatabase = true;
|
||||||
|
|
||||||
|
protected function setUp()
|
||||||
|
{
|
||||||
|
parent::setUp();
|
||||||
|
|
||||||
|
Config::modify()->merge(File::class, 'allowed_extensions', ['html']);
|
||||||
|
}
|
||||||
|
|
||||||
public function testExtraction()
|
public function testExtraction()
|
||||||
{
|
{
|
||||||
$extractor = new HTMLTextExtractor();
|
$extractor = new HTMLTextExtractor();
|
||||||
|
|
||||||
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html');
|
$file = new File();
|
||||||
|
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.html');
|
||||||
|
$file->write();
|
||||||
|
|
||||||
|
$content = $extractor->getContent($file);
|
||||||
|
|
||||||
$this->assertContains('Test Headline', $content);
|
$this->assertContains('Test Headline', $content);
|
||||||
$this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
|
$this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
|
||||||
$this->assertNotContains('Test Style', $content, 'Strips non-content style tags');
|
$this->assertNotContains('Test Style', $content, 'Strips non-content style tags');
|
||||||
|
@ -1,17 +1,29 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
|
namespace SilverStripe\TextExtraction\Tests;
|
||||||
|
|
||||||
|
use SilverStripe\Assets\File;
|
||||||
|
use SilverStripe\Dev\SapphireTest;
|
||||||
|
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
|
||||||
|
use SilverStripe\TextExtraction\Extractor\PDFTextExtractor;
|
||||||
|
|
||||||
class PDFTextExtractorTest extends SapphireTest
|
class PDFTextExtractorTest extends SapphireTest
|
||||||
{
|
{
|
||||||
|
protected $usesDatabase = true;
|
||||||
|
|
||||||
public function testExtraction()
|
public function testExtraction()
|
||||||
{
|
{
|
||||||
$extractor = new PDFTextExtractor();
|
$extractor = new PDFTextExtractor();
|
||||||
if (!$extractor->isAvailable()) {
|
if (!$extractor->isAvailable()) {
|
||||||
$this->setExpectedException(
|
$this->expectException(Exception::class);
|
||||||
'FileTextExtractor_Exception',
|
$this->expectExceptionMessage('getRawOutput called on unavailable extractor');
|
||||||
'getRawOutput called on unavailable extractor'
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf');
|
$file = new File();
|
||||||
|
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.pdf');
|
||||||
|
$file->write();
|
||||||
|
|
||||||
|
$content = $extractor->getContent($file);
|
||||||
$this->assertContains('This is a test file with a link', $content);
|
$this->assertContains('This is a test file with a link', $content);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
36
tests/TikaServerTextExtractor.php
Normal file
36
tests/TikaServerTextExtractor.php
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
namespace SilverStripe\TextExtraction\Tests;
|
||||||
|
|
||||||
|
use SilverStripe\Assets\File;
|
||||||
|
use SilverStripe\Dev\SapphireTest;
|
||||||
|
use SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @group tika-tests
|
||||||
|
*/
|
||||||
|
class TikaServerTextExtractorTest extends SapphireTest
|
||||||
|
{
|
||||||
|
protected $usesDatabase = true;
|
||||||
|
|
||||||
|
public function testServerExtraction()
|
||||||
|
{
|
||||||
|
$extractor = TikaServerTextExtractor::create();
|
||||||
|
if (!$extractor->isAvailable()) {
|
||||||
|
$this->markTestSkipped('tika server not available');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check file
|
||||||
|
$file = new File();
|
||||||
|
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.pdf');
|
||||||
|
$file->write();
|
||||||
|
|
||||||
|
$content = $extractor->getContent($file);
|
||||||
|
$this->assertContains('This is a test file with a link', $content);
|
||||||
|
|
||||||
|
// Check mime validation
|
||||||
|
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
||||||
|
$this->assertTrue($extractor->supportsMime('text/html'));
|
||||||
|
$this->assertFalse($extractor->supportsMime('application/not-supported'));
|
||||||
|
}
|
||||||
|
}
|
@ -1,37 +1,32 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
|
namespace SilverStripe\TextExtraction\Tests;
|
||||||
|
|
||||||
|
use SilverStripe\Assets\File;
|
||||||
|
use SilverStripe\Dev\SapphireTest;
|
||||||
|
use SilverStripe\TextExtraction\Extractor\TikaTextExtractor;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tests the {@see TikaTextExtractor} class
|
* Tests the {@see TikaTextExtractor} class
|
||||||
|
*
|
||||||
|
* @group tika-tests
|
||||||
*/
|
*/
|
||||||
class TikaTextExtractorTest extends SapphireTest
|
class TikaTextExtractorTest extends SapphireTest
|
||||||
{
|
{
|
||||||
|
protected $usesDatabase = true;
|
||||||
|
|
||||||
public function testExtraction()
|
public function testExtraction()
|
||||||
{
|
{
|
||||||
$extractor = new TikaTextExtractor();
|
$extractor = TikaTextExtractor::create();
|
||||||
if (!$extractor->isAvailable()) {
|
if (!$extractor->isAvailable()) {
|
||||||
$this->markTestSkipped('tika cli not available');
|
$this->markTestSkipped('tika cli not available');
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check file
|
// Check file
|
||||||
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
|
$file = new File();
|
||||||
$content = $extractor->getContent($file);
|
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.pdf');
|
||||||
$this->assertContains('This is a test file with a link', $content);
|
$file->write();
|
||||||
|
|
||||||
// Check mime validation
|
|
||||||
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
|
||||||
$this->assertTrue($extractor->supportsMime('text/html'));
|
|
||||||
$this->assertFalse($extractor->supportsMime('application/not-supported'));
|
|
||||||
}
|
|
||||||
|
|
||||||
public function testServerExtraction()
|
|
||||||
{
|
|
||||||
$extractor = new TikaServerTextExtractor();
|
|
||||||
if (!$extractor->isAvailable()) {
|
|
||||||
$this->markTestSkipped('tika server not available');
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check file
|
|
||||||
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
|
|
||||||
$content = $extractor->getContent($file);
|
$content = $extractor->getContent($file);
|
||||||
$this->assertContains('This is a test file with a link', $content);
|
$this->assertContains('This is a test file with a link', $content);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user