Compare commits

...

143 Commits
2.0.1 ... 3

Author SHA1 Message Date
Guy Sartorelli e04501cb52
Merge branch '3.5' into 3 2023-04-26 12:49:10 +12:00
Maxime Rainville 821d2858f3
Merge pull request #81 from creative-commoners/pulls/3.5/fix-constraints
MNT Revert erroneous dependency changes
2023-03-28 17:06:30 +13:00
Guy Sartorelli de215d63f6
MNT Revert erroneous dependency changes 2023-03-28 14:55:48 +13:00
Maxime Rainville 2a260607ec
Merge pull request #80 from creative-commoners/pulls/3/dispatch-ci
MNT Use gha-dispatch-ci
2023-03-23 14:19:29 +13:00
Steve Boyd 6a92eb58e2 MNT Use gha-dispatch-ci 2023-03-21 13:42:30 +13:00
Guy Sartorelli 87869e94a6
MNT Update development dependencies 2023-03-10 16:38:07 +13:00
Guy Sartorelli 61f443d49c
MNT Update release dependencies 2023-03-10 16:38:04 +13:00
Guy Sartorelli 8f2e1d9b75
MNT Update development dependencies 2023-03-10 12:21:32 +13:00
Sabina Talipova a281114ed2
Merge pull request #77 from creative-commoners/pulls/3/stop-using-depr
API Stop using deprecated API
2022-12-05 16:44:21 +13:00
Steve Boyd 1a0cd6d6a6 API Stop using deprecated API 2022-11-28 19:20:01 +13:00
Steve Boyd 3bfa989a7e Merge branch '3.4' into 3 2022-08-02 19:01:21 +12:00
Steve Boyd 46b7f51040 Merge branch '3.3' into 3.4 2022-08-02 19:01:18 +12:00
Guy Sartorelli 041296bda2
Merge pull request #76 from creative-commoners/pulls/3.3/standardise-modules
MNT Standardise modules
2022-08-02 15:49:18 +12:00
Steve Boyd e8997870c5 MNT Standardise modules 2022-08-01 16:23:36 +12:00
Steve Boyd e8061724c5 Merge branch '3.4' into 3 2022-07-25 09:43:40 +12:00
Steve Boyd db8a36fa3e Merge branch '3.3' into 3.4 2022-07-25 09:43:36 +12:00
Guy Sartorelli 04ff0c6084
Merge pull request #74 from creative-commoners/pulls/3.3/module-standards
MNT Use GitHub Actions CI
2022-07-15 17:10:18 +12:00
Steve Boyd e5bf4f1322 MNT Use GitHub Actions CI 2022-07-05 19:08:18 +12:00
Guy Sartorelli 4674084d0d
Merge pull request #72 from creative-commoners/pulls/3/php81
ENH PHP 8.1 compatibility
2022-04-26 17:58:43 +12:00
Steve Boyd df8b17ab85 ENH PHP 8.1 compatibility 2022-04-13 13:51:04 +12:00
Michal Kleiner 77fecc4c53
Merge pull request #71 from GuySartorelli/patch-1
DOCS Fix incorrect PHPDoc about what null lifetime means.
2022-03-10 07:38:08 +13:00
Guy Sartorelli d03a9f06e2
DOCS Fix incorrect PHPDoc about what null lifetime means. 2022-03-09 16:03:03 +13:00
Michal Kleiner 88e7f27c5c
Merge pull request #69 from GuySartorelli/patch-1
DOCS Fix class reference for cache class
2022-03-07 11:31:11 +13:00
Guy Sartorelli 04e4b60435
DOCS Fix class reference for cache class
The `lifetime` config variable is on the `Cache` class, not the `Database` class.
2022-03-07 11:20:01 +13:00
Maxime Rainville 25d8a55058
Merge pull request #68 from creative-commoners/pulls/3/php74
DEP Set PHP 7.4 as the minimum version
2022-02-18 22:10:02 +13:00
Steve Boyd e8f015ddd2 DEP Set PHP 7.4 as the minimum version 2022-02-10 17:41:01 +13:00
Michal Kleiner 254c4e31f8
Merge pull request #67 from GuySartorelli/patch-1
DEP Loosen constraints for guzzlehttp/guzzle
2022-01-26 22:08:47 +13:00
GuySartorelli 7ad3fc9f13
DEP Loosen constraints for guzzlehttp/guzzle 2022-01-26 12:58:30 +13:00
Maxime Rainville eb36dcf5fb
Merge pull request #65 from creative-commoners/pulls/3/sapphire-test-nine
API phpunit 9 support
2021-11-01 22:37:19 +13:00
Steve Boyd b92616eb4e API phpunit 9 support 2021-10-27 18:16:05 +13:00
Steve Boyd 90d4812aa8 Merge branch '3.2' into 3 2021-05-21 14:30:19 +12:00
Maxime Rainville d1bdc003ad MNT Remove obsolete branch-alias 2021-05-05 11:17:57 +12:00
Steve Boyd 6af13768d3 Merge branch '3.1' into 3 2021-01-27 12:26:07 +13:00
Garion Herman 4250acb50e
Merge pull request #64 from creative-commoners/pulls/3.1/travis-shared
MNT Travis shared config
2021-01-27 11:59:07 +13:00
Steve Boyd 795abde8f1
Update build status badge 2021-01-21 16:43:07 +13:00
Steve Boyd d1e241ed56 MNT Travis shared config 2021-01-20 14:49:58 +13:00
Steve Boyd 8e9a0243bb Merge branch '3.1' into 3 2020-11-12 14:46:41 +13:00
Robbie Averill cb15845a95
Merge pull request #62 from creative-commoners/pulls/3.1/travis
Update travis 3.1
2020-06-23 09:34:53 -07:00
Steve Boyd 3564066245 Update travis 2020-06-23 16:26:32 +12:00
Steve Boyd 06995b2ec7 Merge branch '3.1' into 3 2020-06-18 13:30:39 +12:00
Maxime Rainville 01848af86d
Merge pull request #61 from creative-commoners/3.1
Update for 3.1
2020-06-15 18:35:27 +12:00
Steve Boyd e451f96b0b Update for 3.1 2020-06-15 16:17:53 +12:00
Robbie Averill d0a7db0b68
Merge pull request #60 from phptek/issue/58
FIX: Fixes #58 We always want $content (an array) passed to implode()
2019-12-16 12:04:39 -08:00
Russell Michell 42cc545414 FIX: Fixes #58 We always want $content (an array) passed to implode() 2019-12-16 10:06:55 +13:00
Robbie Averill 6234a971d1 Merge branch '3.0' 2019-08-28 10:56:39 +12:00
Robbie Averill 0d7c507b53 Use trusty in Travis builds 2019-08-28 10:56:33 +12:00
Robbie Averill d5313674c3 Merge branch '3.0' 2019-08-28 10:07:30 +12:00
Robbie Averill 32e2f9f84f FIX Ensure test uses database cache, it asserts assuming it is configured 2019-08-28 10:07:21 +12:00
Robbie Averill 5b967fd5d3 Merge branch '3.0' 2019-06-26 15:26:08 +12:00
Robbie Averill 943f393ee8
Merge pull request #55 from ichaber/fix/54-clean-temp-file
#54 Cleanup temporary file
2019-06-26 15:25:20 +12:00
Charlie Bergthaler 242e5a307d FIX Change check for cleanup of temp files only if file is instance of File. 2019-06-26 15:18:31 +12:00
Charlie Bergthaler a9270d73ad FIX Cleanup temporary file after extracting content in TikaServerTextExtractor and TikaTextExtractor 2019-06-26 15:18:31 +12:00
Robbie Averill b4c634bb1f Merge branch '3.0' 2019-06-26 15:17:42 +12:00
Robbie Averill 20079bd33f Remove SilverStripe 4.0-4.2 from Travis builds 2019-06-26 15:17:34 +12:00
Guy Marriott c5cfe4ea1e
Merge pull request #53 from martinhipp/bugfix/tika-version-number-checking
Return version number as string instead of float
2019-04-05 10:07:00 +13:00
Martin Hipp bff5eb2b79
Return version number as string instead of floats so '1.20' does not become 1.2 2019-04-05 09:56:45 +13:00
Robbie Averill 801cd9cacb Merge branch '3.0' 2019-02-22 09:34:11 +07:00
Dylan Wagstaff 9c2da06178
Merge pull request #52 from creative-commoners/pulls/3.0/fix-tests
FIX Ensure Tika responses are casted as strings, fixes broken unit tests
2019-02-14 10:15:55 +13:00
Robbie Averill 276fd9c856 Add PSR-4 autoloader and update Travis to include PHP 7.3 and SS 4.3 2019-02-13 11:42:51 +07:00
Robbie Averill 759d92ccb4 FIX Ensure Tika responses are casted as strings, fixes broken unit tests
They can be returned as a stream, but the TikaRestClient response is documented as a string
2019-02-13 11:42:51 +07:00
Robbie Averill b9502653c2
Merge pull request #51 from ishannz/patch-1
Update isAvailable check to work for identical versions
2019-02-13 11:28:04 +07:00
Robbie Averill 86eba78064 Add tests for isAvailable() 2019-02-13 11:23:28 +07:00
Ishan Jayamanne 21ed6e0f86 Update isAvailable check to work for identical versions
Tika server reports its version as "Apache Tika 1.7". Unfortunately, `version_compare` in PHP says that version "1.7" is less than version "1.7.0", meaning that Tika server was incorrectly being ruled out unless you used Tika server version 1.8 (where "1.8" > "1.7.0").

Changing the comparison string to just "1.7" means they match exactly, and therefore `version_compare` will return `0` rather than `-1`.
2019-02-13 11:15:54 +07:00
Robbie Averill 75a8c66eee Merge branch '3.0' 2018-07-09 10:04:00 +12:00
Robbie Averill 07c000dc0d Remove obsolete branch alias 2018-07-09 10:03:13 +12:00
Dylan Wagstaff 03d1fef4ae
Merge pull request #47 from creative-commoners/pulls/3.0/fix-extractors
FIX Update Guzzle implementations in extractors to ensure they're working
2018-07-09 09:57:17 +12:00
Robbie Averill e1e7cdbfa4 FIX Update SolrCellTextExtractor to use a Guzzle 6 API implementation 2018-07-06 16:11:59 +12:00
Robbie Averill 231a2091af FIX Update Guzzle implementations in Tika extractors 2018-07-06 16:11:59 +12:00
Daniel Hensby b20738573f
Merge pull request #46 from creative-commoners/pulls/3.0/remove-deps
Remove unused symfony dependencies and FileTextExtractor::get_mime
2018-07-04 10:28:06 +01:00
Robbie Averill 1b8ea2e451 Remove unused symfony dependencies and FileTextExtractor::get_mime 2018-07-04 16:23:22 +12:00
Dylan Wagstaff 9795866abe
Merge pull request #45 from creative-commoners/pulls/3.0/ss4-updates
API Update namespaces and SilverStripe API implementations for SilverStripe 4 compat
2018-07-04 11:34:17 +12:00
Robbie Averill 9e8ed243d0 Separate Tika tests, group them for phpunit, further reduce log level, make Extractors injectable 2018-07-03 17:15:18 +12:00
Robbie Averill 397e7a5d40 API FileTextExtractor::getContent now supports a File and a filename path string 2018-07-03 17:03:47 +12:00
Robbie Averill 40e4b05f5d DOCS Update documentation for SilverStripe 4 2018-07-03 17:03:19 +12:00
Robbie Averill 5e5a1f05da FIX Reduce log level to prevent it being caught in SilverStripe error handler
See https://github.com/silverstripe/silverstripe-framework/issues/8044 for context
2018-07-03 16:40:40 +12:00
Robbie Averill 6bf932e5f0 FIX unlink call checks that a file exists first, and tests pass a File object 2018-07-03 16:30:05 +12:00
Robbie Averill 770af5cfc9 Add versioned as a requirement, and php codesniffer 2018-07-03 16:22:43 +12:00
Robbie Averill 3c1457c0ee Update broken path in phpunit configuration 2018-07-03 16:00:31 +12:00
Robbie Averill 5d53be9df6 Set minimum stability and reformat composer.json 2018-07-03 15:58:12 +12:00
Robbie Averill edb02e9189 API FileTextExtractable::getContent now takes a File instance instead of a path 2018-07-03 15:55:02 +12:00
Robbie Averill 8bd019b2aa Update codebase to ensure relative PSR-2 compliance 2018-07-03 11:37:38 +12:00
Robbie Averill e2404fc904 Update gitattributes and Scrutinizer configuration 2018-07-03 11:36:04 +12:00
Robbie Averill 8d295ada9c Add phpunit/phpcs configuration and update Travis configuration 2018-07-03 11:35:52 +12:00
Robbie Averill fe5148e678 API Add namespaces to tests and update SapphireTest implementation 2018-07-03 11:35:24 +12:00
Robbie Averill 66c9db8c0d API Update namespaces for FileTextCache and add upgrader mapping 2018-07-03 11:23:27 +12:00
Robbie Averill f1bacd2aa9 Bump license year 2018-07-03 10:48:02 +12:00
Robbie Averill 300941c9e8 Update readme badges and requirements for SilverStripe 4 2018-07-03 10:47:56 +12:00
Robbie Averill dd292bd554 Switch to vendor module 2018-07-03 10:41:41 +12:00
Robbie Averill 45cd9ae4ed
Merge pull request #44 from creative-commoners/pulls/master/add-supported-module-badge
Add supported module badge to readme
2018-06-18 10:43:02 +12:00
Dylan Wagstaff d06569c8fd Add supported module badge to readme 2018-06-15 17:50:30 +12:00
Dylan Wagstaff 31925d654e
Merge pull request #41 from silverstripe/pulls/3.0/constraints
Update dependency constraints in composer.json
2018-02-02 14:44:19 +13:00
Robbie Averill e491042d3b
Update dependency constraints in composer.json 2018-02-02 12:52:19 +13:00
Robbie Averill 33746e0cd7
Merge pull request #38 from phptek/issue/37
FIX: Fixes #37 First-pass SS4 compatibility.
2018-01-17 14:07:59 +13:00
Russell Michell 912c457c7d FIX: Updated namespace refs for GuzzleHttp (from Guzzle\Http) 2017-12-22 14:34:40 +13:00
Russell Michell d09a5aa97c FIX: Upgraded Guzzle to 6.latest 2017-12-22 14:00:41 +13:00
Russell Michell f341010d7a FIX: First-pass SS4 compatibility.
- Added namespaces, use statements
- Added missing docblocks etc
- Uses SS4's new Cache system
- Uses proper environment vars
- Cannot instantiate 'FileTextCache' (interface) as a service. This can be configured through YML, so default to FileTextCache_Cache
- Modded YML config to make it run.
- Fixes to allow TIKA to actually get file contents.
- Addresses issues raised by @robbieaverill
- Rebased against github.com/silverstripe/silverstripe-textextraction:master
- Replaced `SS_Log` with Monolog.
2017-12-21 10:41:06 +13:00
Robbie Averill 875e608d0f Update branch alias for 3.x-dev 2017-12-20 16:43:15 +13:00
Robbie Averill c83a7c3403 Merge branch '2' 2017-12-20 16:42:44 +13:00
Robbie Averill 95d96efe40 Merge branch '2.1' into 2 2017-12-20 16:42:35 +13:00
Robbie Averill 9f04583ed5 Merge branch '2.0' into 2.1 2017-12-20 16:41:27 +13:00
Robbie Averill a8a4e0c02f Remove obsolete branch alias 2017-12-20 16:40:57 +13:00
Robbie Averill 9f3819408c Update branch alias for 2.x-dev 2017-12-20 16:39:28 +13:00
Daniel Hensby eb25505a8e
Merge pull request #2 from cam-findlay/patch-1 2017-11-23 13:18:44 +00:00
Jake Dale Ovenden eb7a45865b Allow username and password in requests to Tika server (#35) 2017-11-23 10:24:32 +13:00
Robbie Averill 40ba6a245d
DOCS Fix build badges in readme 2017-11-23 09:52:40 +13:00
Robbie Averill 3d289b4e05 DOCS Add Windows note back into Configuration guide, bump license year 2017-11-23 09:49:05 +13:00
Robbie Averill f8c3015161 Merge pull request #19 from camfindlay/feature/make-supported 2017-11-23 09:39:34 +13:00
Damian Mooyman 23e255b5c6 Merge pull request #34 from jvdanker/disable-cache-fix
Don't try to save the object to the cache if it has been disabled
2017-02-22 15:30:59 +13:00
Juan van den Anker 0761311170 Don't try to save the object to the cache if it has been disabled 2017-02-22 15:17:32 +13:00
Damian Mooyman 1b89000fcd Merge pull request #33 from alwex/master
fixed the version comparison for tika server text extractor
2016-10-19 16:10:05 +13:00
Alexandre Guidet 196007314a fixed the version comparison using version_compare() instead of plain float 2016-10-19 15:46:30 +13:00
Damian Mooyman 545e711f16 Merge pull request #31 from dhensby/pulls/composer-alias
Bumping composer alias
2016-10-04 12:35:58 +13:00
Daniel Hensby 5d24770d79
Bumping composer alias 2016-10-04 00:17:39 +01:00
Damian Mooyman 5a5c648c1e Merge pull request #30 from dhensby/pulls/pdf-extraction
FIX PDFTextExtractor no longer smushes words together than break acro…
2016-10-04 12:13:09 +13:00
Daniel Hensby e9e33605b4
FIX PDFTextExtractor no longer smushes words together than break across lines 2016-10-03 23:59:18 +01:00
Damian Mooyman e0125ba745 Merge pull request #29 from dhensby/pulls/fix-text-extraction-config
FIX UnexpectedValueException thrown when trying to set SolrCellTextEx…
2016-10-04 10:02:58 +13:00
Daniel Hensby aaf9238384
FIX UnexpectedValueException thrown when trying to set SolrCellTextExtraction.base_url in config 2016-10-03 20:19:30 +01:00
Daniel Hensby 61750e33fc Merge pull request #28 from SilbinaryWolf/fix-windowscompat
fix(PDFTextExtractor): Added support for Windows, but only if 'binary_location' config is defined
2016-05-14 12:14:31 +01:00
Jake Bentvelzen 75ffe7b56a fix(PDFTextExtractor): Added support for Windows, but only if 'binary_location' is defined. Updated documentation to inform the user of this. 2016-05-13 15:07:33 +10:00
Hamish Friedlander bde4cf4536 Merge pull request #27 from tractorcow/pulls/pdfpaths
API Whitelist bin paths for pdftotext
2016-02-25 16:45:56 +13:00
Damian Mooyman f72ba3a978 API Whitelist bin paths for pdftotext 2016-02-25 16:40:25 +13:00
Damian Mooyman 9e44e834cf Merge pull request #26 from helpfulrobot/update-license-year
Updated license year
2016-01-05 11:14:29 +13:00
helpfulrobot 0420d56e4d Updated license year 2016-01-01 06:50:40 +13:00
Daniel Hensby 5a070eb47d Merge pull request #25 from helpfulrobot/add-standard-code-of-conduct
Added standard code of conduct
2015-11-21 12:29:45 +00:00
helpfulrobot 7c45684dbb Added standard code of conduct 2015-11-21 20:17:44 +13:00
Daniel Hensby cdea0f0798 Merge pull request #23 from helpfulrobot/add-standard-license
Added standard license
2015-11-19 12:55:38 +00:00
Daniel Hensby dcd527deb1 Merge pull request #24 from helpfulrobot/add-standard-git-attributes
Added standard git attributes
2015-11-19 10:39:37 +00:00
helpfulrobot 08cc7c37da Added standard git attributes 2015-11-19 19:14:04 +13:00
helpfulrobot df3af6722b Added standard license 2015-11-19 18:32:42 +13:00
Damian Mooyman 1d2a9bc296 Merge pull request #22 from helpfulrobot/add-standard-editor-config
Added standard editor config
2015-11-19 14:05:19 +13:00
helpfulrobot 80a4773cce Added standard editor config 2015-11-19 13:27:10 +13:00
Daniel Hensby ebfa07dc5f Merge pull request #21 from helpfulrobot/convert-to-psr-2
Converted to PSR-2
2015-11-18 23:30:07 +00:00
Daniel Hensby 9cb2a79f8d Merge pull request #20 from helpfulrobot/add-standard-scrutinizer-config
Added standard Scrutinizer config
2015-11-18 12:42:33 +00:00
helpfulrobot 8e14595f1a Converted to PSR-2 2015-11-18 17:07:31 +13:00
helpfulrobot 03de223162 Added standard Scrutinizer config 2015-11-18 15:38:01 +13:00
Daniel Hensby 80f61a21be Merge pull request #18 from assertchris/add-scrutinizer-support
Added Scrutinizer support
2015-11-08 23:57:42 +00:00
Cam Findlay 7b3fb280c6 Add supported module standard docs 2015-11-07 14:06:23 +13:00
Christopher Pitt 4c955bde13 Added Scrutinizer support 2015-11-07 11:22:33 +13:00
Damian Mooyman 1e8581d7f8 Merge pull request #17 from dhensby/patch-1
Move to new travis containerised infrastructure
2015-08-26 12:32:30 +12:00
Daniel Hensby e67fb97672 Move to new travis containerised infrastructure 2015-08-25 15:28:20 +01:00
Damian Mooyman 832437e4bf Merge pull request #15 from kinglozzer/patch-2
FIX: SolrCellTextExtractor always reporting itself as unavailable (fixes #14)
2015-07-02 12:00:48 +12:00
Loz Calver 9ea4b79543 FIX: SolrCellTextExtractor always reporting itself as unavailable (fixes #14) 2015-06-08 12:42:31 +01:00
cam-findlay a34c443be5 FIX additional exception handling for Tika errors return via Guzzle.
Tika server errors via Guzzle can cause the Solr search query to return a 500 error and break search results pages for users. The issue was related to uncaught exceptions from Guzzle causing a silent failure if a text file is unreadable or missing (the `return null` is never reached, which breaks the search).
2013-06-07 10:42:38 +12:00
49 changed files with 2009 additions and 1263 deletions

17
.editorconfig Normal file
View File

@ -0,0 +1,17 @@
# For more information about the properties used in this file,
# please see the EditorConfig documentation:
# http://editorconfig.org
[*]
charset = utf-8
end_of_line = lf
indent_size = 4
indent_style = space
insert_final_newline = true
trim_trailing_whitespace = true
[{*.yml,package.json}]
indent_size = 2
# The indent size used in the package.json file cannot be changed:
# https://github.com/npm/npm/pull/3180#issuecomment-16336516

7
.gitattributes vendored Normal file
View File

@ -0,0 +1,7 @@
/tests export-ignore
/docs export-ignore
/.gitattributes export-ignore
/.gitignore export-ignore
/.travis.yml export-ignore
/.scrutinizer.yml export-ignore
/codecov.yml export-ignore

11
.github/workflows/ci.yml vendored Normal file
View File

@ -0,0 +1,11 @@
name: CI
on:
push:
pull_request:
workflow_dispatch:
jobs:
ci:
name: CI
uses: silverstripe/gha-ci/.github/workflows/ci.yml@v1

16
.github/workflows/dispatch-ci.yml vendored Normal file
View File

@ -0,0 +1,16 @@
name: Dispatch CI
on:
# At 12:20 PM UTC, only on Saturday and Sunday
schedule:
- cron: '20 12 * * 6,0'
jobs:
dispatch-ci:
name: Dispatch CI
# Only run cron on the silverstripe account
if: (github.event_name == 'schedule' && github.repository_owner == 'silverstripe') || (github.event_name != 'schedule')
runs-on: ubuntu-latest
steps:
- name: Dispatch CI
uses: silverstripe/gha-dispatch-ci@v1

17
.github/workflows/keepalive.yml vendored Normal file
View File

@ -0,0 +1,17 @@
name: Keepalive
on:
workflow_dispatch:
# The 4th of every month at 10:50am UTC
schedule:
- cron: '50 10 4 * *'
jobs:
keepalive:
name: Keepalive
# Only run cron on the silverstripe account
if: (github.event_name == 'schedule' && github.repository_owner == 'silverstripe') || (github.event_name != 'schedule')
runs-on: ubuntu-latest
steps:
- name: Keepalive
uses: silverstripe/gha-keepalive@v1

View File

@ -1,23 +0,0 @@
# See https://github.com/silverstripe-labs/silverstripe-travis-support for setup details
language: php
php:
- 5.4
env:
- DB=MYSQL CORE_RELEASE=3.1
- DB=MYSQL CORE_RELEASE=3
before_script:
- mkdir -p $HOME/bin
- export PATH=$PATH:$HOME/bin
- export SS_TIKA_ENDPOINT="http://localhost:9998/"
- ./.travis/install_tika.sh
- sudo ./.travis/install_pdftotext.sh
- git clone git://github.com/silverstripe-labs/silverstripe-travis-support.git ~/travis-support
- php ~/travis-support/travis_setup.php --source `pwd` --target ~/builds/ss
- cd ~/builds/ss
script:
- ($HOME/bin/tika-rest-server &) &> /dev/null
- vendor/bin/phpunit --verbose textextraction/tests/

View File

@ -1,3 +0,0 @@
#!/usr/bin/env bash
apt-get update
apt-get install -y xpdf

14
.upgrade.yml Normal file
View File

@ -0,0 +1,14 @@
mappings:
FileTextExtractable: SilverStripe\TextExtraction\Extension\FileTextExtractable
FileTextCache: SilverStripe\TextExtraction\Cache\FileTextCache
FileTextCache_Cache: SilverStripe\TextExtraction\Cache\FileTextCache\Cache
FileTextCache_Database: SilverStripe\TextExtraction\Cache\FileTextCache\Database
FileTextExtractor: SilverStripe\TextExtraction\Extractor\FileTextExtractor
FileTextExtractor_Exception: SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception
HTMLTextExtractor: SilverStripe\TextExtraction\Extractor\HTMLTextExtractor
PDFTextExtractor: SilverStripe\TextExtraction\Extractor\PDFTextExtractor
SolrCellTextExtractor: SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor
TikaServerTextExtractor: SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor
TikaTextExtractor: SilverStripe\TextExtraction\Extractor\TikaTextExtractor
TikaRestClient: SilverStripe\TextExtraction\Rest\TikaRestClient

15
CONTRIBUTING.md Normal file
View File

@ -0,0 +1,15 @@
# Contributing
- Maintenance on this module is a shared effort of those who use it
- To contribute improvements to the code, ensure you raise a pull request and discuss with the module maintainers
- Please follow the SilverStripe [code contribution guidelines](https://docs.silverstripe.org/en/contributing/code/) and [Module Standard](https://docs.silverstripe.org/en/developer_guides/extending/modules/#module-standard)
- Supply documentation that follows the [GitHub Flavored Markdown](https://help.github.com/articles/markdown-basics/) conventions
- When having discussions about this module in issues or pull request please adhere to the [SilverStripe Community Code of Conduct](https://docs.silverstripe.org/en/contributing/code_of_conduct/)
## Contributor license agreement
By supplying code to this module in patches, tickets and pull requests, you agree to assign copyright
of that code to SilverStripe Ltd., on the condition that these code changes are released under the
same BSD license as the original module. We ask for this so that the ownership in the license is clear
and unambiguous. By releasing this code under a permissive license such as BSD, this copyright assignment
won't prevent you from using the code in any way you see fit.

24
LICENSE
View File

@ -1,24 +0,0 @@
* Copyright (c) 2010-2012, SilverStripe Ltd.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the <organization> nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY SilverStripe Ltd. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL SilverStripe Ltd. BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

220
README.md
View File

@ -1,26 +1,15 @@
# Text Extraction Module
# Text extraction module
[![Build Status](https://secure.travis-ci.org/silverstripe-labs/silverstripe-textextraction.png)](http://travis-ci.org/silverstripe-labs/silverstripe-textextraction)
[![CI](https://github.com/silverstripe/silverstripe-textextraction/actions/workflows/ci.yml/badge.svg)](https://github.com/silverstripe/silverstripe-textextraction/actions/workflows/ci.yml)
[![Silverstripe supported module](https://img.shields.io/badge/silverstripe-supported-0071C4.svg)](https://www.silverstripe.org/software/addons/silverstripe-commercially-supported-module-list/)
## Overview
Provides a text extraction API for file content, that can hook into different extractor
engines based on availability and the parsed file format. The output returned is always a string of the file content.
Provides an extraction API for file content, which can hook into different extractor
engines based on availability and the parsed file format.
The output is always a string: the file content.
Via the `FileTextExtractable` extension, this logic can be used to
Via the `FileTextExtractable` extension, this logic can be used to
cache the extracted content on a `DataObject` subclass (usually `File`).
Note: Previously part of the [sphinx module](https://github.com/silverstripe/silverstripe-sphinx).
## Requirements
* SilverStripe 3.1
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
* (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
* (optional) [Apache Tika](http://tika.apache.org/)
### Supported Formats
The module supports text extraction on the following file formats:
* HTML (built-in)
* PDF (with XPDF or Solr)
@ -31,188 +20,43 @@ Note: Previously part of the [sphinx module](https://github.com/silverstripe/sil
* EPub (Solr)
* Many others (Tika)
## Requirements
* Silverstripe ^4.0
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
* (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
* (optional) [Apache Tika](http://tika.apache.org/)
## Installation
The recommended installation is through [composer](http://getcomposer.org).
Add the following to your `composer.json`:
```js
{
"require": {
"silverstripe/textextraction": "2.0.x-dev"
}
}
```
composer require silverstripe/textextraction
```
The module depends on the [Guzzle HTTP Library](http://guzzlephp.org),
which is automatically checked out by composer. Alternatively, install Guzzle
through PEAR and ensure it's in your `include_path`.
## Configuration
## Documentation
### Basic
* [Configuration](docs/en/configuration.md)
* [Developer documentation](/docs/en/developer-docs.md)
By default, only extraction from HTML documents is supported.
No configuration is required for that, unless you want to make
the content available through your `DataObject` subclass.
In this case, add the following to `mysite/_config/config.yml`:
## Bugtracker
```yaml
File:
extensions:
- FileTextExtractable
```
Bugs are tracked in the issues section of this repository. Before submitting an issue please read over
existing issues to ensure yours is unique.
By default any extracted content will be cached against the database row.
In order to stay within common size constraints for SQL queries required in this operation,
the cache sets a maximum character length after which content gets truncated (default: 500000).
You can configure this value through `FileTextCache_Database.max_content_length` in your yaml configuration.
If the issue does look like a new bug:
- Create a new issue
- Describe the steps required to reproduce your issue, and the expected outcome. Unit tests, screenshots
and screencasts can help here.
- Describe your environment in as much detail as possible: Silverstripe version, Browser, PHP version,
Operating System, any installed Silverstripe modules.
Alternatively, extracted content can be cached using SS_Cache to prevent excessive database growth.
In order to swap out the cache backend you can use the following yaml configuration.
Please report security issues to security@silverstripe.org directly. Please don't file security issues in the bugtracker.
```yaml
---
Name: mytextextraction
After: '#textextraction'
---
Injector:
FileTextCache: FileTextCache_SSCache
FileTextCache_SSCache:
lifetime: 3600 # Number of seconds to cache content for
```
### XPDF
PDFs require special handling, for example through the [XPDF](http://www.foolabs.com/xpdf/)
command-line utility. Follow its installation instructions; its presence will then be automatically
detected. You can optionally set the binary path in `mysite/_config/config.yml`:
```yml
PDFTextExtractor:
binary_location: /my/path/pdftotext
```
### Apache Solr
Apache Solr is a fulltext search engine, an aspect which is often used
alongside this module. But more importantly for us, it has bindings to [Apache Tika](http://tika.apache.org/)
through the [ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) interface.
This allows Solr to inspect the contents of various file formats, such as Office documents and PDF files.
The textextraction module retrieves the output of this service, rather than altering the index.
With the raw text output, you can decide to store it in a database column for fulltext search
in your database driver, or even pass it back to Solr as part of a full index update.
In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`):
```yml
SolrCellTextExtractor:
base_url: 'http://localhost:8983/solr/update/extract'
```
Note that in case you're using multiple cores, you'll need to add the core name to the URL
(e.g. 'http://localhost:8983/solr/PageSolrIndex/update/extract').
The ["fulltext" module](https://github.com/silverstripe-labs/silverstripe-fulltextsearch)
uses multiple cores by default, and comes prepackaged with a Solr server.
It's a stripped-down version of Solr; follow the module README on how to add
Apache Tika text extraction capabilities.
You need to ensure that some indexable property on your object
returns the contents, either by directly accessing `FileTextExtractable->extractFileAsText()`,
or by writing your own method around `FileTextExtractor->getContent()` (see "Usage" below).
The property should be listed in your `SolrIndex` subclass, e.g. as follows:
```php
class MyDocument extends DataObject {
static $db = array('Path' => 'Text');
function getContent() {
$extractor = FileTextExtractor::for_file($this->Path);
return $extractor ? $extractor->getContent($this->Path) : null;
}
}
class MySolrIndex extends SolrIndex {
function init() {
$this->addClass('MyDocument');
$this->addStoredField('Content', 'HTMLText');
}
}
```
Note: This isn't a terribly efficient way to process large amounts of files, since
each HTTP request is run synchronously.
### Tika
Support for Apache Tika (1.8 and above) is included. This can be run in one of two ways: Server or CLI.
See [the Apache Tika home page](http://tika.apache.org/1.8/index.html) for instructions on installing and
configuring this. Download the latest `tika-app` for running as a CLI script, or `tika-server` if you're planning
to have it running constantly in the background. Starting tika as a CLI script for every extraction request
is fairly slow, so we recommend running it as a server.
This extension will best work with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php)
installed to perform mime detection. Tika validates support via mime type rather than file extensions.
### Tika - CLI
Ensure that your machine has a 'tika' command available which will run the CLI script.
```bash
#!/bin/bash
exec java -jar tika-app-1.8.jar "$@"
```
### Tika Rest Server
Tika can also be run as a server. You can configure your server endpoint by setting the url via config.
```yaml
TikaServerTextExtractor:
server_endpoint: 'http://localhost:9998'
```
Alternatively this may be specified via the `SS_TIKA_ENDPOINT` directive in your `_ss_environment.php` file, or an environment variable of the same name.
Then start up your server as below:
```bash
java -jar tika-server-1.8.jar --host=localhost --port=9998
```
While you can run `tika-app-1.8.jar` in server mode as well (with the `--server` flag),
it behaves differently and is not recommended.
The module will log extraction errors with `SS_Log::NOTICE` priority by default,
for example a "422 Unprocessable Entity" HTTP response for an encrypted PDF.
In case you want more information on why processing failed, you can increase
the logging verbosity in the tika server instance by passing through
a `--includeStack` flag. Logs can be passed on to files or external logging services,
see [error handling](http://doc.silverstripe.org/en/developer_guides/debugging/error_handling)
documentation for SilverStripe core.
## Usage
Manual extraction:
```php
$myFile = '/my/path/myfile.pdf';
$extractor = FileTextExtractor::for_file($myFile);
$content = $extractor->getContent($myFile);
```
Extraction with `FileTextExtractable` extension applied:
```php
$myFileObj = File::get()->First();
$content = $myFileObj->getFileContent();
```
This content can also be embedded directly within a template.
```
$MyFile.FileContent
```
## Development and contribution
If you would like to make contributions to the module please ensure you raise a pull request and discuss
with the module maintainers.

View File

10
_config/cache.yml Normal file
View File

@ -0,0 +1,10 @@
---
Name: textextractioncache
After:
- '#corecache'
---
SilverStripe\Core\Injector\Injector:
Psr\SimpleCache\CacheInterface.FileTextCache_Cache:
factory: SilverStripe\Core\Cache\CacheFactory
constructor:
namespace: 'FileTextCache_Cache'

View File

@ -1,11 +1,10 @@
---
Name: textextraction
Name: textextractionconfig
---
Injector:
FileTextCache: FileTextCache_Database
SilverStripe\Core\Injector\Injector:
# Define default FileTextCache implementation
SilverStripe\TextExtraction\Cache\FileTextCache:
class: SilverStripe\TextExtraction\Cache\FileTextCache\Database
SolrCellTextExtractor:
# base_url: 'http://localhost:8983/solr/update/extract'
FileTextCache_Database:
max_content_length: 500000
SilverStripe\TextExtraction\Cache\FileTextCache\Database:
max_content_length: 500000

1
code-of-conduct.md Normal file
View File

@ -0,0 +1 @@
When having discussions about this module in issues or pull requests, please adhere to the [SilverStripe Community Code of Conduct](https://docs.silverstripe.org/en/contributing/code_of_conduct).

View File

@ -1,105 +0,0 @@
<?php
interface FileTextCache
{
    /**
     * Store the text that was extracted from a file so later lookups can reuse it.
     *
     * @param File $file The file entity the content belongs to
     * @param string $content The extracted text
     */
    public function save(File $file, $content);

    /**
     * Look up previously stored extracted text for a file entity.
     *
     * @param File $file The file entity to look up
     * @return mixed The stored content as returned by the backing store
     */
    public function load(File $file);

    /**
     * Discard whatever is stored for the given file.
     * Invoked in onBeforeWrite on the file.
     *
     * @param File $file The file entity whose cached content should be dropped
     */
    public function invalidate(File $file);
}
/**
 * Caches the extracted content on the record for the file.
 * Limits the stored file content by default to avoid hitting query size limits.
 */
class FileTextCache_Database implements FileTextCache
{
    /**
     * Return the content cached on the file record.
     *
     * @param File $file
     * @return string|null Value of the record's FileContentCache field
     */
    public function load(File $file)
    {
        return $file->FileContentCache;
    }

    /**
     * Store the extracted content on the file record and write the record.
     * When `max_content_length` is configured, the content is truncated to at
     * most that many bytes before it is stored.
     *
     * @param File $file
     * @param string $content
     */
    public function save(File $file, $content)
    {
        $maxLength = Config::inst()->get('FileTextCache_Database', 'max_content_length');
        $file->FileContentCache = $maxLength
            ? $this->truncate($content, (int) $maxLength)
            : $content;
        $file->write();
    }

    /**
     * Clear the cached content unless the cache field itself is what changed.
     *
     * @param File $file
     */
    public function invalidate(File $file)
    {
        // To prevent writing to the cache from invalidating it
        if (!$file->isChanged('FileContentCache')) {
            $file->FileContentCache = '';
        }
    }

    /**
     * Cut content down to at most $maxLength bytes.
     *
     * A plain substr() can slice a multi-byte UTF-8 character in half, which
     * leaves an invalid byte sequence at the end of the stored text.
     * mb_strcut() honours the same byte limit but backs off to the previous
     * character boundary. Falls back to substr() when mbstring is unavailable.
     *
     * @param string $content
     * @param int $maxLength Maximum number of bytes to keep
     * @return string
     */
    private function truncate($content, $maxLength)
    {
        if (function_exists('mb_strcut')) {
            return mb_strcut($content, 0, $maxLength, 'UTF-8');
        }

        return substr($content, 0, $maxLength);
    }
}
/**
 * FileTextCache implementation backed by SS_Cache, with a configurable lifetime.
 */
class FileTextCache_SSCache implements FileTextCache, Flushable
{
    /**
     * How long cached entries live, in seconds.
     * Null means indefinitely.
     *
     * @var int|null
     * @config
     */
    private static $lifetime = null;

    /**
     * Build the cache object for this class and apply the configured lifetime.
     *
     * @return SS_Cache
     */
    protected static function get_cache()
    {
        $configuredLifetime = Config::inst()->get(__CLASS__, 'lifetime');

        $backend = SS_Cache::factory(__CLASS__);
        $backend->setLifetime($configuredLifetime);

        return $backend;
    }

    /**
     * Cache key for a file: the md5 hash of its full path.
     *
     * @param File $file
     * @return string
     */
    protected function getKey(File $file)
    {
        $fullPath = $file->getFullPath();

        return md5($fullPath);
    }

    /**
     * @param File $file
     * @return mixed Whatever the cache backend returns for the file's key
     */
    public function load(File $file)
    {
        $cacheKey = $this->getKey($file);

        return self::get_cache()->load($cacheKey);
    }

    /**
     * @param File $file
     * @param string $content
     * @return mixed Result of the backend's save() call
     */
    public function save(File $file, $content)
    {
        $cacheKey = $this->getKey($file);

        return self::get_cache()->save($content, $cacheKey);
    }

    /**
     * Flushable hook: cleans the cache.
     */
    public static function flush()
    {
        self::get_cache()->clean();
    }

    /**
     * @param File $file
     * @return mixed Result of the backend's remove() call
     */
    public function invalidate(File $file)
    {
        $cacheKey = $this->getKey($file);

        return self::get_cache()->remove($cacheKey);
    }
}

View File

@ -1,91 +0,0 @@
<?php
/**
 * Extension for File (or a File derivative) that exposes the text content of the file.
 * The extraction itself is delegated to whichever FileTextExtractor subclass
 * FileTextExtractor::for_file() selects for the file's path.
 *
 * Adds a FileContentCache field, and reads/writes extracted text through the
 * injected FileTextCache so extraction only happens on demand.
 *
 * @author mstephens
 */
class FileTextExtractable extends DataExtension
{
    private static $db = array(
        'FileContentCache' => 'Text'
    );

    private static $casting = array(
        'FileContent' => 'Text'
    );

    private static $dependencies = array(
        'TextCache' => '%$FileTextCache'
    );

    /**
     * Cache used to store and look up extracted content
     *
     * @var FileTextCache
     */
    protected $fileTextCache = null;

    /**
     * Setter used for the `TextCache` dependency.
     *
     * @param FileTextCache $cache
     */
    public function setTextCache(FileTextCache $cache)
    {
        $this->fileTextCache = $cache;
    }

    /**
     * @return FileTextCache
     */
    public function getTextCache()
    {
        return $this->fileTextCache;
    }

    /**
     * Template-friendly accessor for the extracted text.
     *
     * @return string
     */
    public function getFileContent()
    {
        return $this->extractFileAsText();
    }

    /**
     * Returns the text content of the file, if a FileTextExtractor is available for it.
     * Freshly extracted text is saved to the text cache before it is returned.
     *
     * @param boolean $disableCache If false, a cached value is returned when one exists and the
     *                              file is only parsed otherwise. If true, the cached value is
     *                              ignored and the file is always parsed.
     * @return string|null Null when no extractor handles the file or nothing was extracted
     */
    public function extractFileAsText($disableCache = false)
    {
        if (!$disableCache) {
            $cached = $this->getTextCache()->load($this->owner);
            if ($cached) {
                return $cached;
            }
        }

        // Pick the extractor able to handle this file, then run it
        $path = $this->owner->FullPath;
        $extractor = FileTextExtractor::for_file($path);
        $extracted = $extractor ? $extractor->getContent($path) : null;
        if (!$extracted) {
            return null;
        }

        $this->getTextCache()->save($this->owner, $extracted);

        return $extracted;
    }

    /**
     * Invalidate the cached content whenever the file is about to be written.
     */
    public function onBeforeWrite()
    {
        $this->getTextCache()->invalidate($this->owner);
    }
}

View File

@ -1,135 +0,0 @@
<?php
/**
 * Base class for text extractors. Each subclass knows how to pull plain text
 * out of a particular set of file types; for_file() picks the highest-priority
 * available subclass that supports a given file.
 *
 * @author mstephens
 */
abstract class FileTextExtractor extends Object
{
    /**
     * Set priority from 0-100.
     * The highest priority extractor for a given content type will be selected.
     *
     * @config
     * @var integer
     */
    private static $priority = 50;

    /**
     * Cache of extractor class names, sorted by priority
     *
     * @var array
     */
    protected static $sorted_extractor_classes = null;

    /**
     * Lists the extractor subclasses, highest priority first.
     * The list is built on first use and then reused.
     *
     * @return array
     */
    protected static function get_extractor_classes()
    {
        if (self::$sorted_extractor_classes) {
            return self::$sorted_extractor_classes;
        }

        // First entry returned is dropped so only subclasses remain
        $subclasses = ClassInfo::subclassesFor("FileTextExtractor");
        array_shift($subclasses);

        // Map class name => configured priority, then order by priority descending
        $priorityByClass = array();
        foreach ($subclasses as $subclass) {
            $priorityByClass[$subclass] = Config::inst()->get($subclass, 'priority');
        }
        arsort($priorityByClass);

        self::$sorted_extractor_classes = array_keys($priorityByClass);

        return self::$sorted_extractor_classes;
    }

    /**
     * Fetch the extractor instance for a class name from the Injector
     *
     * @param string $class
     * @return FileTextExtractor
     */
    protected static function get_extractor($class)
    {
        return Injector::inst()->get($class);
    }

    /**
     * Attempt to detect mime type for given file
     *
     * @param string $path
     * @return string Mime type if found
     */
    protected static function get_mime($path)
    {
        $symfonyFile = new Symfony\Component\HttpFoundation\File\File($path);

        return $symfonyFile->getMimeType();
    }

    /**
     * Find the first available extractor, in priority order, that supports the
     * file either by extension or by mime type.
     *
     * @param string $path
     * @return FileTextExtractor|null Null if the path is not an existing file or nothing supports it
     */
    public static function for_file($path)
    {
        if (!file_exists($path) || is_dir($path)) {
            return null;
        }

        $extension = pathinfo($path, PATHINFO_EXTENSION);
        $mime = self::get_mime($path);

        foreach (self::get_extractor_classes() as $className) {
            $candidate = self::get_extractor($className);

            // Skip unavailable extractors
            if (!$candidate->isAvailable()) {
                continue;
            }

            // Extension is checked first; mime is only consulted when that does not match
            $matchesExtension = $extension && $candidate->supportsExtension($extension);
            if ($matchesExtension || ($mime && $candidate->supportsMime($mime))) {
                return $candidate;
            }
        }

        return null;
    }

    /**
     * Checks if the extractor is supported on the current environment,
     * for example if the correct binaries or libraries are available.
     *
     * @return boolean
     */
    abstract public function isAvailable();

    /**
     * Determine if this extractor supports the given extension.
     * If support is determined by mime/type only, then this should return false.
     *
     * @param string $extension
     * @return boolean
     */
    abstract public function supportsExtension($extension);

    /**
     * Determine if this extractor supports the given mime type.
     * Will only be called if supportsExtension returns false.
     *
     * @param string $mime
     * @return boolean
     */
    abstract public function supportsMime($mime);

    /**
     * Given a file path, extract the contents as text.
     *
     * @param string $path
     * @return string
     */
    abstract public function getContent($path);
}
/**
 * Exception type for text extraction failures; adds nothing to the base Exception.
 */
class FileTextExtractor_Exception extends Exception {}

View File

@ -1,73 +0,0 @@
<?php
/**
 * Text extractor that uses php function strip_tags to get just the text. OK for indexing, not the best for readable text.
 * @author mstephens
 *
 */
class HTMLTextExtractor extends FileTextExtractor
{
    /**
     * Lower priority because its not the most clever HTML extraction. If there is something better, use it
     *
     * @config
     * @var integer
     */
    private static $priority = 10;

    /**
     * Needs nothing beyond core PHP functions, so it is always available.
     *
     * @return boolean
     */
    public function isAvailable()
    {
        return true;
    }

    /**
     * @param string $extension
     * @return boolean True for html, htm and xhtml (case-insensitive)
     */
    public function supportsExtension($extension)
    {
        return in_array(
            strtolower($extension),
            array("html", "htm", "xhtml"),
            true
        );
    }

    /**
     * @param string $mime
     * @return boolean True for text/html (case-insensitive)
     */
    public function supportsMime($mime)
    {
        return strtolower($mime) === 'text/html';
    }

    /**
     * Extracts content by using strip_tags() combined with regular expressions
     * to remove non-content tags like <style> or <script>, as well as adding
     * line breaks before block tags so the text of adjacent blocks does not
     * run together once the tags are stripped.
     *
     * @param string $path
     * @return string Extracted text; empty string if the file could not be read
     */
    public function getContent($path)
    {
        $content = file_get_contents($path);
        if ($content === false) {
            // Unreadable file: nothing to extract
            return '';
        }

        // Yes, yes, regex'ing HTML is evil.
        // Since we don't care about well-formedness or markup here, it does the job.

        // Elements whose content is never visible text: replaced by a single space
        $invisiblePatterns = array(
            '@<head[^>]*?>.*?</head>@siu',
            '@<style[^>]*?>.*?</style>@siu',
            '@<script[^>]*?.*?</script>@siu',
            '@<object[^>]*?.*?</object>@siu',
            '@<embed[^>]*?.*?</embed>@siu',
            '@<applet[^>]*?.*?</applet>@siu',
            '@<noframes[^>]*?.*?</noframes>@siu',
            '@<noscript[^>]*?.*?</noscript>@siu',
            '@<noembed[^>]*?.*?</noembed>@siu',
        );

        // Opening/closing block-level tags: prefixed with a line break
        $blockPatterns = array(
            '@</?((address)|(blockquote)|(center)|(del))@iu',
            '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
            '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
            '@</?((table)|(th)|(td)|(caption))@iu',
            '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
            '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
            '@</?((frameset)|(frame)|(iframe))@iu',
        );

        // Replacements are generated from the pattern lists so the two can never
        // drift out of sync. "\n$0" re-emits the matched tag after a newline; the
        // previous bare "$0" was an identity replacement that added no line break,
        // so "<p>a</p><p>b</p>" was extracted as "ab".
        $content = preg_replace(
            array_merge($invisiblePatterns, $blockPatterns),
            array_merge(
                array_fill(0, count($invisiblePatterns), ' '),
                array_fill(0, count($blockPatterns), "\n\$0")
            ),
            $content
        );

        // preg_replace() yields null on error (e.g. invalid UTF-8 with the /u flag)
        return strip_tags((string) $content);
    }
}

View File

@ -1,98 +0,0 @@
<?php
/**
 * Text extractor that calls pdftotext to do the conversion.
 * @author mstephens
 *
 */
class PDFTextExtractor extends FileTextExtractor
{
    /**
     * Available when the resolved pdftotext binary exists and is executable.
     *
     * @return boolean
     */
    public function isAvailable()
    {
        $bin = $this->bin('pdftotext');
        return (file_exists($bin) && is_executable($bin));
    }

    /**
     * @param string $extension
     * @return boolean True for "pdf" (case-insensitive)
     */
    public function supportsExtension($extension)
    {
        return strtolower($extension) === 'pdf';
    }

    /**
     * @param string $mime
     * @return boolean True for the known PDF mime types (case-insensitive)
     */
    public function supportsMime($mime)
    {
        return in_array(
            strtolower($mime),
            array(
                'application/pdf',
                'application/x-pdf',
                'application/x-bzpdf',
                'application/x-gzpdf'
            ),
            true
        );
    }

    /**
     * Accessor to get the location of the binary.
     *
     * The `binary_location` config may name either the directory containing
     * the binary or the binary file itself; the latter is what the module
     * README shows (`binary_location: /my/path/pdftotext`).
     *
     * @param string $prog Name of binary
     * @return string
     */
    protected function bin($prog = '')
    {
        $configured = $this->config()->binary_location;
        if ($configured) {
            // Configured value is the binary itself (optionally with a file
            // extension such as .exe): use it directly instead of appending $prog.
            if ($prog && is_file($configured) && pathinfo($configured, PATHINFO_FILENAME) === $prog) {
                return $configured;
            }
            // By config (directory)
            $path = $configured;
        } elseif (file_exists('/usr/bin/pdftotext')) {
            // By searching common directories
            $path = '/usr/bin';
        } elseif (file_exists('/usr/local/bin/pdftotext')) {
            $path = '/usr/local/bin';
        } else {
            $path = '.'; // Hope it's in path
        }

        return ($path ? $path . '/' : '') . $prog;
    }

    /**
     * Extract the text of a PDF and normalise typographic ligatures.
     *
     * @param string $path
     * @return string Empty string when no path is given
     * @throws FileTextExtractor_Exception
     */
    public function getContent($path)
    {
        if (!$path) {
            return ""; // no file
        }

        $content = $this->getRawOutput($path);
        return $this->cleanupLigatures($content);
    }

    /**
     * Invoke pdftotext with the given path
     *
     * @param string $path
     * @return string Output
     * @throws FileTextExtractor_Exception
     */
    protected function getRawOutput($path)
    {
        // exec() appends to an existing array, so start from an empty one
        $output = array();
        $exitCode = 0;
        exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $output, $exitCode);

        if ($exitCode) {
            // The third exec() argument is the integer exit status, not an array
            // of messages. stderr is redirected into stdout by "2>&1", so the
            // captured output lines are the error detail.
            throw new FileTextExtractor_Exception(sprintf(
                'PDFTextExtractor->getContent() failed for %s: %s',
                $path,
                implode(PHP_EOL, $output)
            ));
        }

        // exec() returns one array entry per output line; rejoin with line
        // breaks so the last word of one line and the first of the next stay separate.
        return implode(PHP_EOL, $output);
    }

    /**
     * Removes utf-8 ligatures.
     *
     * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
     *
     * @param string $input
     * @return string
     */
    protected function cleanupLigatures($input)
    {
        // Keys are the UTF-8 byte sequences of the ligature code points, written
        // as escapes so they survive editors/tools that normalise Unicode.
        $mapping = array(
            "\xEF\xAC\x80" => 'ff',  // U+FB00
            "\xEF\xAC\x81" => 'fi',  // U+FB01
            "\xEF\xAC\x82" => 'fl',  // U+FB02
            "\xEF\xAC\x83" => 'ffi', // U+FB03
            "\xEF\xAC\x84" => 'ffl', // U+FB04
            "\xEF\xAC\x85" => 'ft',  // U+FB05 (long s + t; 'ft' kept from the existing mapping)
            "\xEF\xAC\x86" => 'st',  // U+FB06
        );

        return str_replace(array_keys($mapping), array_values($mapping), $input);
    }
}

View File

@ -1,92 +0,0 @@
<?php
use Guzzle\Http\Client;
/**
 * Text extractor that calls an Apache Solr instance
 * and extracts content via the "ExtractingRequestHandler" endpoint.
 * Does not alter the Solr index itself, but uses it purely
 * for its file parsing abilities.
 *
 * @author ischommer
 * @see http://wiki.apache.org/solr/ExtractingRequestHandler
 */
class SolrCellTextExtractor extends FileTextExtractor
{
    /**
     * Base URL to use for solr text extraction.
     * E.g. http://localhost:8983/solr/update/extract
     *
     * @config
     * @var string
     */
    private static $base_url;

    /**
     * @config
     * @var integer
     */
    private static $priority = 75;

    /**
     * HTTP client, created on first use by getHttpClient()
     *
     * @var Client
     */
    protected $httpClient;

    /**
     * Get the HTTP client, creating one for the configured base URL on first use.
     *
     * @return Client
     * @throws InvalidArgumentException If no base_url is configured
     */
    public function getHttpClient()
    {
        $baseUrl = $this->config()->get('base_url');
        if (!$baseUrl) {
            throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
        }

        if (!$this->httpClient) {
            $this->httpClient = new Client($baseUrl);
        }

        return $this->httpClient;
    }

    /**
     * Replace the HTTP client.
     *
     * @param Client $client
     */
    public function setHttpClient($client)
    {
        $this->httpClient = $client;
    }

    /**
     * Available whenever a base URL has been configured.
     *
     * Previously this only returned false for a missing URL and otherwise fell
     * off the end of the method (null), so the extractor was never reported
     * as available.
     *
     * @return boolean
     */
    public function isAvailable()
    {
        return (bool) $this->config()->get('base_url');
    }

    /**
     * @param string $extension
     * @return boolean
     */
    public function supportsExtension($extension)
    {
        return in_array(
            strtolower($extension),
            array(
                'pdf', 'doc', 'docx', 'xls', 'xlsx',
                'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
                'ppt', 'pptx', 'odp', 'fodp', 'csv'
            ),
            true
        );
    }

    /**
     * @param string $mime
     * @return boolean Always false
     */
    public function supportsMime($mime)
    {
        // Rely on supportsExtension
        return false;
    }

    /**
     * Post the file to Solr in extract-only mode and pull the text out of the response.
     *
     * @param string $path
     * @return string|null Empty string when no path is given; null when the request
     *                     failed with an InvalidArgumentException (logged as a notice)
     *                     or the response holds no entry for the file
     */
    public function getContent($path)
    {
        if (!$path) {
            return ""; // no file
        }

        $fileName = basename($path);
        $client = $this->getHttpClient();

        try {
            $request = $client
                ->post()
                ->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text'))
                ->addPostFiles(array('myfile' => $path));
            $response = $request->send();
        } catch (InvalidArgumentException $e) {
            SS_Log::log(
                sprintf(
                    'Error extracting text from "%s" (message: %s)',
                    $path,
                    $e->getMessage()
                ),
                SS_Log::NOTICE
            );
            return null;
        }

        // Use preg match to avoid SimpleXML running out of memory on large text nodes.
        // The delimiter is passed to preg_quote() so it gets escaped as well.
        preg_match(
            sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName, '/')),
            (string)$response->getBody(),
            $matches
        );

        return $matches ? $matches[1] : null;
    }
}

View File

@ -1,104 +0,0 @@
<?php
/**
 * Text extractor that delegates to a Tika Rest Server through TikaRestClient
 *
 * {@link http://tika.apache.org/1.7/gettingstarted.html}
 */
class TikaServerTextExtractor extends FileTextExtractor
{
    /**
     * Tika server is pretty efficient so use it immediately if available
     *
     * @var integer
     * @config
     */
    private static $priority = 80;

    /**
     * Server endpoint
     *
     * @var string
     * @config
     */
    private static $server_endpoint;

    /**
     * Client instance, created on first use by getClient()
     *
     * @var TikaRestClient
     */
    protected $client = null;

    /**
     * Cache of supported mime types
     *
     * @var array
     */
    protected $supportedMimes = array();

    /**
     * Get the rest client, creating it for the current endpoint on first use.
     *
     * @return TikaRestClient
     */
    public function getClient()
    {
        if (!$this->client) {
            $this->client = Injector::inst()->createWithArgs(
                'TikaRestClient',
                array($this->getServerEndpoint())
            );
        }

        return $this->client;
    }

    /**
     * Resolve the server endpoint. Order of precedence: the SS_TIKA_ENDPOINT
     * constant, the SS_TIKA_ENDPOINT environment variable, then the
     * `server_endpoint` config value.
     *
     * @return string
     */
    public function getServerEndpoint()
    {
        if (defined('SS_TIKA_ENDPOINT')) {
            return SS_TIKA_ENDPOINT;
        }

        $fromEnvironment = getenv('SS_TIKA_ENDPOINT');
        if ($fromEnvironment) {
            return $fromEnvironment;
        }

        // Default to configured endpoint
        return $this->config()->server_endpoint;
    }

    /**
     * Get the version of tika installed, or 0 if not installed
     *
     * @return float version of tika
     */
    public function getVersion()
    {
        $client = $this->getClient();

        return $client->getVersion();
    }

    /**
     * Available when an endpoint is set, the client reports the server as
     * available, and the reported version is at least 1.7.
     *
     * NOTE(review): TikaRestClient::getVersion() casts the version string to a
     * float, so e.g. "1.10" compares as 1.1 and fails this check - confirm
     * whether later 1.x servers need to be supported.
     *
     * @return boolean
     */
    public function isAvailable()
    {
        if (!$this->getServerEndpoint()) {
            return false;
        }
        if (!$this->getClient()->isAvailable()) {
            return false;
        }

        return $this->getVersion() >= 1.7;
    }

    /**
     * @param string $extension
     * @return boolean Always false
     */
    public function supportsExtension($extension)
    {
        // Determine support via mime type only
        return false;
    }

    /**
     * Check the mime type against the list reported by the server, including aliases.
     *
     * @param string $mime
     * @return boolean
     */
    public function supportsMime($mime)
    {
        // Fetch the list from the server until a non-empty one has been stored
        if (!$this->supportedMimes) {
            $this->supportedMimes = $this->getClient()->getSupportedMimes();
        }
        $known = $this->supportedMimes;

        // Check if supported (most common / quickest lookup)
        if (isset($known[$mime])) {
            return true;
        }

        // Check aliases
        foreach ($known as $details) {
            $hasAliases = isset($details['alias']);
            if ($hasAliases && in_array($mime, $details['alias'])) {
                return true;
            }
        }

        return false;
    }

    /**
     * @param string $path
     * @return string Text returned by the client for the file
     */
    public function getContent($path)
    {
        $client = $this->getClient();

        return $client->tika($path);
    }
}

View File

@ -1,94 +0,0 @@
<?php
/**
 * Enables text extraction of file content via the Tika CLI
 *
 * {@link http://tika.apache.org/1.7/gettingstarted.html}
 */
class TikaTextExtractor extends FileTextExtractor
{
    /**
     * Text extraction mode. Defaults to -t (plain text)
     *
     * @var string
     * @config
     */
    private static $output_mode = '-t';

    /**
     * Get the version of tika installed, or 0 if not installed
     *
     * @return float|string Version as printed by `tika --version`, or 0 if it could not be determined
     */
    public function getVersion()
    {
        $stdout = '';
        $code = $this->runShell('tika --version', $stdout);

        // Parse output
        if (!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches)) {
            return $matches['version'];
        }

        return 0;
    }

    /**
     * Runs a shell command and captures its output.
     * The command string is handed to proc_open() unchanged, so callers are
     * responsible for escaping any arguments they interpolate into it.
     *
     * @param string $command Full command including arguments
     * @param string &$stdout Standard output
     * @param string &$stderr Standard error
     * @param string $input Content to pass via standard input
     * @return int Exit code. 0 is success
     */
    protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
    {
        // stdout is drained to EOF before stderr is looked at. If stderr were a
        // pipe, a child writing more than the pipe buffer to it would block
        // forever while we wait on stdout. Sending stderr to a temporary file
        // removes that deadlock; a pipe is only used if no temp file is available.
        $stderrFile = tmpfile();

        $descriptorSpecs = array(
            0 => array("pipe", "r"),
            1 => array("pipe", "w"),
            2 => $stderrFile ? $stderrFile : array("pipe", "w")
        );

        // Invoke command
        $pipes = array();
        $proc = proc_open($command, $descriptorSpecs, $pipes);
        if (!is_resource($proc)) {
            if ($stderrFile) {
                fclose($stderrFile);
            }
            return 255;
        }

        // Send content as input
        fwrite($pipes[0], $input);
        fclose($pipes[0]);

        // Get output
        $stdout = stream_get_contents($pipes[1]);
        fclose($pipes[1]);

        if (!$stderrFile) {
            // Pipe fallback: must be read before the process handle is closed
            $stderr = stream_get_contents($pipes[2]);
            fclose($pipes[2]);

            return proc_close($proc);
        }

        // Wait for the process to finish, then read what it wrote to the temp file
        $exitCode = proc_close($proc);
        rewind($stderrFile);
        $stderr = stream_get_contents($stderrFile);
        fclose($stderrFile);

        return $exitCode;
    }

    /**
     * Run the tika CLI on the file using the configured output mode.
     *
     * @param string $path
     * @return string|null Extracted content, or null if tika exited with a non-zero code
     */
    public function getContent($path)
    {
        $mode = $this->config()->output_mode;
        $command = sprintf('tika %s %s', $mode, escapeshellarg($path));

        $output = '';
        $code = $this->runShell($command, $output);
        if ($code == 0) {
            return $output;
        }

        return null;
    }

    /**
     * Available when a tika version could be detected.
     *
     * @return boolean
     */
    public function isAvailable()
    {
        return $this->getVersion() > 0;
    }

    /**
     * @param string $extension
     * @return boolean Always false
     */
    public function supportsExtension($extension)
    {
        // Determine support via mime type only
        return false;
    }

    /**
     * Check whether the mime type appears in the output of `tika --list-supported-types`.
     *
     * @param string $mime
     * @return boolean
     */
    public function supportsMime($mime)
    {
        // Get list of supported mime types
        $supportedTypes = '';
        $error = '';
        $code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
        if ($code) {
            return false; // Error case
        }

        // Check if the mime type is inside the result
        $pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
        return (bool)preg_match($pattern, $supportedTypes);
    }
}

View File

@ -1,94 +0,0 @@
<?php
use Guzzle\Http\Client;
use Guzzle\Http\Exception\RequestException;
/**
 * HTTP client for a Tika server: availability check, version lookup,
 * supported mime types and text extraction.
 */
class TikaRestClient extends Client
{
    /**
     * Supported mime data, fetched on first use by getSupportedMimes()
     *
     * @var array
     */
    protected $mimes = array();

    /**
     * Detect if the service is available
     *
     * @return bool True when a GET on the base URL answers with status 200;
     *              false on any other status or a request exception
     */
    public function isAvailable()
    {
        try {
            return $this
                ->get()->send()
                ->getStatusCode() == 200;
        } catch (RequestException $ex) {
            return false;
        }
    }

    /**
     * Get version code
     *
     * NOTE(review): the version string is cast to float, so a version such as
     * "1.10" becomes 1.1 - confirm callers comparing versions can live with that.
     *
     * @return float Parsed version, or 0.0 if the response could not be parsed
     */
    public function getVersion()
    {
        $response = $this->get('version')->send();

        // Parse output
        if ($response->getStatusCode() == 200 &&
            preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)
        ) {
            return (float)$matches['version'];
        }

        return 0.0;
    }

    /**
     * Gets supported mime data. May include aliased mime types.
     *
     * @return array
     */
    public function getSupportedMimes()
    {
        if ($this->mimes) {
            return $this->mimes;
        }

        $response = $this->get(
            'mime-types',
            array('Accept' => 'application/json')
        )->send();

        return $this->mimes = $response->json();
    }

    /**
     * Extract text content from a given file.
     * Logs a notice-level error if the document can't be parsed.
     *
     * @param string $file Full filesystem path to a file to post
     * @return string|null Content of the file extracted as plain text, or null if the request failed
     */
    public function tika($file)
    {
        $text = null;
        try {
            $response = $this->put(
                'tika',
                array('Accept' => 'text/plain'),
                file_get_contents($file)
            )->send();
            $text = $response->getBody(true);
        } catch (RequestException $e) {
            // Not every request exception carries a response (e.g. a failed
            // connection), so only read status and body when one is present.
            // Calling getResponse() blindly would turn a logged notice into a fatal error.
            $errorResponse = method_exists($e, 'getResponse') ? $e->getResponse() : null;

            if ($errorResponse) {
                $msg = sprintf(
                    'TikaRestClient was not able to process %s. Response: %s %s.',
                    $file,
                    $errorResponse->getStatusCode(),
                    $errorResponse->getReasonPhrase()
                );

                // Only available if tika-server was started with --includeStack
                $body = $errorResponse->getBody(true);
                if ($body) {
                    $msg .= ' Body: ' . $body;
                }
            } else {
                $msg = sprintf(
                    'TikaRestClient was not able to process %s. Error: %s',
                    $file,
                    $e->getMessage()
                );
            }

            SS_Log::log($msg, SS_Log::NOTICE);
        }

        return $text;
    }
}

3
codecov.yml Normal file
View File

@ -0,0 +1,3 @@
comment: false
codecov:
branch: master

View File

@ -1,37 +1,45 @@
{
"name": "silverstripe/textextraction",
"type": "silverstripe-module",
"description": "Text Extraction API for SilverStripe CMS (mostly used with 'fulltextsearch' module)",
"homepage": "http://silverstripe.org",
"license": "BSD-3-Clause",
"keywords": ["silverstripe", "fulltext", "pdf"],
"authors": [
{
"name": "SilverStripe",
"homepage": "http://silverstripe.com"
},
{
"name": "The SilverStripe Community",
"homepage": "http://silverstripe.org"
}
],
"require": {
"php": ">=5.3.2",
"composer/installers": "*",
"silverstripe/framework": "~3.1",
"guzzle/guzzle": "~3.9",
"symfony/event-dispatcher": "~2.6.0@stable",
"symfony/http-foundation": "~2.6.0"
},
"require-dev": {
"phpunit/phpunit": "~3.7"
},
"suggest": {
"ext-fileinfo": "Improved support for file mime detection"
},
"extra": {
"branch-alias": {
"dev-master": "2.0.x-dev"
}
}
}
"name": "silverstripe/textextraction",
"type": "silverstripe-vendormodule",
"description": "Text Extraction API for SilverStripe CMS (mostly used with 'fulltextsearch' module)",
"homepage": "http://silverstripe.org",
"license": "BSD-3-Clause",
"keywords": [
"silverstripe",
"fulltext",
"pdf"
],
"authors": [
{
"name": "SilverStripe",
"homepage": "http://silverstripe.com"
},
{
"name": "The SilverStripe Community",
"homepage": "http://silverstripe.org"
}
],
"require": {
"php": "^7.4 || ^8.0",
"silverstripe/framework": "^4.10",
"silverstripe/assets": "^1",
"silverstripe/versioned": "^1",
"guzzlehttp/guzzle": "^6.3 || ^7.0"
},
"require-dev": {
"squizlabs/php_codesniffer": "^3",
"phpunit/phpunit": "^9.5"
},
"autoload": {
"psr-4": {
"SilverStripe\\TextExtraction\\": "src/",
"SilverStripe\\TextExtraction\\Tests\\": "tests/"
}
},
"suggest": {
"ext-fileinfo": "Improved support for file mime detection"
},
"extra": [],
"minimum-stability": "dev",
"prefer-stable": true
}

160
docs/en/configuration.md Normal file
View File

@ -0,0 +1,160 @@
# Configuration
## Basic
By default, only extraction from HTML documents is supported.
No configuration is required for that, unless you want to make
the content available through your `DataObject` subclass.
In this case, add the following to `mysite/_config/config.yml`:
```yaml
SilverStripe\Assets\File:
extensions:
- SilverStripe\TextExtraction\Extension\FileTextExtractable
```
By default any extracted content will be cached against the database row. In order to stay within common size
constraints for SQL queries required in this operation, the cache sets a maximum character length after which
content gets truncated (default: 500000). You can configure this value through
`SilverStripe\TextExtraction\Cache\FileTextCache\Database.max_content_length` in your YAML configuration.
Alternatively, extracted content can be cached using a PSR-16 cache (`Psr\SimpleCache\CacheInterface`) to prevent excessive database growth.
In order to swap out the cache backend you can use the following yaml configuration.
```yaml
---
Name: mytextextraction
After: '#textextractionconfig'
---
SilverStripe\Core\Injector\Injector:
SilverStripe\TextExtraction\Cache\FileTextCache:
class: SilverStripe\TextExtraction\Cache\FileTextCache\Cache
SilverStripe\TextExtraction\Cache\FileTextCache\Cache:
lifetime: 3600 # Number of seconds to cache content for
```
## XPDF
PDFs require special handling, for example through the [XPDF](http://www.xpdfreader.com/)
commandline utility. Follow their installation instructions, its presence will be automatically
detected for \*nix operating systems. You can optionally set the binary path (required for Windows) in `mysite/_config/config.yml`:
```yml
SilverStripe\TextExtraction\Extractor\PDFTextExtractor:
binary_location: /my/path/pdftotext
```
## Apache Solr
Apache Solr is a fulltext search engine, an aspect which is often used
alongside this module. But more importantly for us, it has bindings to [Apache Tika](http://tika.apache.org/)
through the [ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler) interface.
This allows Solr to inspect the contents of various file formats, such as Office documents and PDF files.
The textextraction module retrieves the output of this service, rather than altering the index.
With the raw text output, you can decide to store it in a database column for fulltext search
in your database driver, or even pass it back to Solr as part of a full index update.
In order to use Solr, you need to configure a URL for it (in `mysite/_config/config.yml`):
```yml
SilverStripe\TextExtraction\Extractor\SolrCellTextExtractor:
base_url: 'http://localhost:8983/solr/update/extract'
```
Note that in case you're using multiple cores, you'll need to add the core name to the URL
(e.g. 'http://localhost:8983/solr/PageSolrIndex/update/extract').
The ["fulltext" module](https://github.com/silverstripe-labs/silverstripe-fulltextsearch)
uses multiple cores by default, and comes prepackaged with a Solr server.
It's a stripped-down version of Solr; follow the module README on how to add
Apache Tika text extraction capabilities.
You need to ensure that some indexable property on your object
returns the contents, either by directly accessing `FileTextExtractable->extractFileAsText()`,
or by writing your own method around `FileTextExtractor->getContent()` (see "Usage" below).
The property should be listed in your `SolrIndex` subclass, e.g. as follows:
```php
use SilverStripe\ORM\DataObject;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
class MyDocument extends DataObject
{
private static $db = ['Path' => 'Text'];
public function getContent()
{
$extractor = FileTextExtractor::for_file($this->Path);
return $extractor ? $extractor->getContent($this->Path) : null;
}
}
use SilverStripe\FullTextSearch\Solr\SolrIndex;
class MySolrIndex extends SolrIndex
{
public function init()
{
$this->addClass(MyDocument::class);
$this->addStoredField('Content', 'HTMLText');
}
}
```
Extractors will return content formatted with new line characters at the end of each extracted line. If you want
this to be used in HTML content it may be worth wrapping the result in a `nl2br()` call before using it in your
code.
Note: This isn't a terribly efficient way to process large amounts of files, since
each HTTP request is run synchronously.
## Tika
Support for Apache Tika (1.8 and above) is included. This can be run in one of two ways: Server or CLI.
See [the Apache Tika home page](http://tika.apache.org/1.8/index.html) for instructions on installing and
configuring this. Download the latest `tika-app` for running as a CLI script, or `tika-server` if you're planning
to have it running constantly in the background. Starting tika as a CLI script for every extraction request
is fairly slow, so we recommend running it as a server.
This extension will best work with the [fileinfo PHP extension](http://php.net/manual/en/book.fileinfo.php)
installed to perform mime detection. Tika validates support via mime type rather than file extensions.
## Tika - CLI
Ensure that your machine has a 'tika' command available which will run the CLI script.
```bash
#!/bin/bash
exec java -jar tika-app-1.8.jar "$@"
```
## Tika Rest Server
Tika can also be run as a server. You can configure your server endpoint by setting the url via config.
```yaml
SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor:
server_endpoint: 'http://localhost:9998'
```
Alternatively this may be specified via the `SS_TIKA_ENDPOINT` environment variable in your `.env` file, or an
environment variable of the same name.
Then start up your server as below:
```bash
java -jar tika-server-1.8.jar --host=localhost --port=9998
```
While you can run `tika-app-1.8.jar` in server mode as well (with the `--server` flag),
it behaves differently and is not recommended.
The module will log extraction errors with PSR-3 "notice" priority by default,
for example a "422 Unprocessable Entity" HTTP response for an encrypted PDF.
In case you want more information on why processing failed, you can increase
the logging verbosity in the tika server instance by passing through
a `--includeStack` flag. Logs can be passed on to files or external logging services,
see [error handling](http://doc.silverstripe.org/en/developer_guides/debugging/error_handling)
documentation for SilverStripe core.

32
docs/en/developer-docs.md Normal file
View File

@ -0,0 +1,32 @@
# Developer documentation
## Usage
Manual extraction via string file path:
```php
$myFile = '/my/path/myfile.pdf';
$extractor = FileTextExtractor::for_file($myFile);
$content = $extractor->getContent($myFile);
```
Manual extraction via File object:
```php
$myFile = File::get()->filter(['Name' => 'My file'])->first();
$extractor = FileTextExtractor::for_file($myFile);
$content = $extractor->getContent($myFile);
```
Extraction with `FileTextExtractable` extension applied:
```php
$myFileObj = File::get()->First();
$content = $myFileObj->getFileContent();
```
This content can also be embedded directly within a template.
```
$MyFile.FileContent
```

12
license.md Normal file
View File

@ -0,0 +1,12 @@
Copyright (c) 2018, SilverStripe Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

13
phpcs.xml.dist Normal file
View File

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<ruleset name="SilverStripe">
<description>CodeSniffer ruleset for SilverStripe coding conventions.</description>
<file>src</file>
<file>tests</file>
<!-- base rules are PSR-2 -->
<rule ref="PSR2" >
<!-- Current exclusions -->
<exclude name="PSR1.Methods.CamelCapsMethodName.NotCamelCaps" />
</rule>
</ruleset>

17
phpunit.xml.dist Normal file
View File

@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<phpunit bootstrap="vendor/silverstripe/framework/tests/bootstrap.php" colors="true">
<testsuites>
<testsuite name="Default">
<directory>tests/</directory>
</testsuite>
</testsuites>
<filter>
<whitelist addUncoveredFilesFromWhitelist="true">
<directory suffix=".php">src/</directory>
<exclude>
<directory suffix=".php">tests/</directory>
</exclude>
</whitelist>
</filter>
</phpunit>

View File

@ -0,0 +1,31 @@
<?php
namespace SilverStripe\TextExtraction\Cache;
use SilverStripe\Assets\File;
/**
* Contract for stores that cache the text extracted from File records.
* Implementations decide where the content lives and how long it is kept.
*/
interface FileTextCache
{
/**
* Save extracted content for a given File entity
*
* @param File $file
* @param string $content
*/
public function save(File $file, $content);
/**
* Return any cached extracted content for a given file entity
*
* @param File $file
* @return mixed Cached content, if any.
* NOTE(review): the value returned when nothing is cached is implementation-defined - confirm per implementation.
*/
public function load(File $file);
/**
* Invalidate the cache for a given file.
* Invoked in onBeforeWrite on the file
*
* @param File $file
*/
public function invalidate(File $file);
}

View File

@ -0,0 +1,107 @@
<?php
namespace SilverStripe\TextExtraction\Cache\FileTextCache;
use Psr\SimpleCache\CacheInterface;
use SilverStripe\Assets\File;
use SilverStripe\Core\Config\Configurable;
use SilverStripe\Core\Flushable;
use SilverStripe\Core\Injector\Injector;
use SilverStripe\TextExtraction\Cache\FileTextCache;
/**
 * Caches extracted file content in a PSR-16 cache backend, with a configurable lifetime.
 */
class Cache implements FileTextCache, Flushable
{
    use Configurable;

    /**
     * Lifetime of cache in seconds
     * Null defaults to 3600 (1 hour)
     *
     * @var int|null
     * @config
     */
    private static $lifetime = null;

    /**
     * Resolve the cache backend registered with the injector under
     * "<CacheInterface class name>.FileTextCache_Cache".
     *
     * @return CacheInterface
     */
    protected static function get_cache()
    {
        $for = sprintf('%s.%s', CacheInterface::class, 'FileTextCache_Cache');

        return Injector::inst()->get($for);
    }

    /**
     * Build the cache key for a file: the md5 hash of its filename.
     *
     * @param File $file
     * @return string
     */
    protected function getKey(File $file)
    {
        return md5($file->getFilename() ?? '');
    }

    /**
     * Return whatever the backend holds for this file.
     *
     * @param File $file
     * @return mixed
     */
    public function load(File $file)
    {
        return self::get_cache()->get($this->getKey($file));
    }

    /**
     * Store the extracted content for the configured lifetime (3600 seconds when unset or falsy).
     *
     * @param File $file
     * @param string $content
     * @return bool Result of the PSR-16 set() call
     */
    public function save(File $file, $content)
    {
        $lifetime = $this->config()->get('lifetime') ?: 3600;

        return self::get_cache()->set($this->getKey($file), $content, $lifetime);
    }

    /**
     * Clear every entry held by the backend.
     *
     * @return void
     */
    public static function flush()
    {
        self::get_cache()->clear();
    }

    /**
     * Alias for flush()
     *
     * @return void
     */
    public static function clear()
    {
        self::flush();
    }

    /**
     * Remove the cached content for a single file.
     *
     * @param File $file
     * @return bool Result of the PSR-16 delete() call
     */
    public function invalidate(File $file)
    {
        return self::get_cache()->delete($this->getKey($file));
    }
}

View File

@ -0,0 +1,55 @@
<?php
namespace SilverStripe\TextExtraction\Cache\FileTextCache;
use SilverStripe\Assets\File;
use SilverStripe\Core\Config\Configurable;
use SilverStripe\TextExtraction\Cache\FileTextCache;
/**
 * Caches the extracted content on the record for the file.
 * The stored content can be limited via max_content_length to avoid hitting query size limits.
 */
class Database implements FileTextCache
{
    use Configurable;

    /**
     * Maximum number of bytes of extracted content to store.
     * A falsy value (the default here) disables truncation.
     *
     * @config
     * @var int|null
     */
    private static $max_content_length = null;

    /**
     * Read the cached content straight from the record.
     *
     * @param File $file
     * @return string|null Value of the FileContentCache field
     */
    public function load(File $file)
    {
        return $file->FileContentCache;
    }

    /**
     * Store the content on the record (truncated when max_content_length is set) and write the record.
     *
     * @param File $file
     * @param mixed $content
     */
    public function save(File $file, $content)
    {
        $maxLength = $this->config()->get('max_content_length');

        if ($maxLength) {
            $content = $content ?? '';
            // Limit by bytes, but never cut through the middle of a multi-byte character:
            // a dangling partial sequence is invalid UTF-8 and may be rejected by the database.
            // NOTE(review): assumes extracted text is UTF-8 - confirm for all extractors.
            $content = function_exists('mb_strcut')
                ? mb_strcut($content, 0, (int) $maxLength, 'UTF-8')
                : substr($content, 0, (int) $maxLength);
        }

        $file->FileContentCache = $content;
        $file->write();
    }

    /**
     * Blank the cached content, unless the cache field itself is what is being changed.
     *
     * @param File $file
     * @return void
     */
    public function invalidate(File $file)
    {
        // To prevent writing to the cache from invalidating it
        if (!$file->isChanged('FileContentCache')) {
            $file->FileContentCache = '';
        }
    }
}

View File

@ -0,0 +1,123 @@
<?php
namespace SilverStripe\TextExtraction\Extension;
use SilverStripe\Assets\File;
use SilverStripe\ORM\DataExtension;
use SilverStripe\TextExtraction\Cache\FileTextCache;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/**
 * Extension for File (or a File derivative) that exposes the file's content as plain text.
 * The actual extraction is delegated to whichever FileTextExtractor subclass handles the file;
 * results are kept in the injected FileTextCache and looked up there first.
 *
 * Adds a FileContentCache field which holds the cached contents, populated on demand.
 *
 * @author mstephens
 */
class FileTextExtractable extends DataExtension
{
    /**
     * @var array
     * @config
     */
    private static $db = ['FileContentCache' => 'Text'];

    /**
     * @var array
     * @config
     */
    private static $casting = ['FileContent' => 'Text'];

    /**
     * @var array
     * @config
     */
    private static $dependencies = ['TextCache' => '%$' . FileTextCache::class];

    /**
     * Cache backend used to store and retrieve extracted text
     *
     * @var FileTextCache
     */
    protected $fileTextCache = null;

    /**
     * @param FileTextCache $cache
     * @return $this
     */
    public function setTextCache(FileTextCache $cache)
    {
        $this->fileTextCache = $cache;

        return $this;
    }

    /**
     * @return FileTextCache
     */
    public function getTextCache()
    {
        return $this->fileTextCache;
    }

    /**
     * Template accessor for the extracted text
     *
     * @return string
     */
    public function getFileContent()
    {
        return $this->extractFileAsText();
    }

    /**
     * Returns the file's text, using the first available FileTextExtractor that supports the file.
     * Unless caching is disabled, a cached value is returned when present and a fresh result is
     * saved back to the cache.
     *
     * @param boolean $disableCache If false, the cached version is used when there is one.
     *                              If true, parsing is forced and the cache is neither read nor written.
     * @return string|null Null when no extractor applies or nothing could be extracted
     */
    public function extractFileAsText($disableCache = false)
    {
        /** @var File $file */
        $file = $this->owner;
        $useCache = !$disableCache;

        if ($useCache) {
            $cached = $this->getTextCache()->load($file);
            if ($cached) {
                return $cached;
            }
        }

        // Determine which extractor can process this file, then let it do the work
        $extractor = FileTextExtractor::for_file($file);
        $extracted = $extractor ? $extractor->getContent($file) : null;
        if (!$extracted) {
            return null;
        }

        if ($useCache) {
            $this->getTextCache()->save($file, $extracted);
        }

        return $extracted;
    }

    /**
     * @return void
     */
    public function onBeforeWrite()
    {
        // The file is about to change, so whatever was cached for it is invalidated first
        $cache = $this->getTextCache();
        $cache->invalidate($this->owner);
    }
}

View File

@ -0,0 +1,185 @@
<?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Assets\File;
use SilverStripe\Core\ClassInfo;
use SilverStripe\Core\Config\Config;
use SilverStripe\Core\Config\Configurable;
use SilverStripe\Core\Injector\Injectable;
use SilverStripe\Core\Injector\Injector;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
/**
 * Base class for text extractors, which pull full-text content out of a File or a file on disk.
 * Subclasses declare which extensions / mime types they support; {@see FileTextExtractor::for_file()}
 * returns the first available one in priority order.
 *
 * @author mstephens
 */
abstract class FileTextExtractor
{
    use Configurable;
    use Injectable;

    /**
     * Set priority from 0-100.
     * The highest priority extractor for a given content type will be selected.
     *
     * @config
     * @var integer
     */
    private static $priority = 50;

    /**
     * Cache of extractor class names, sorted by priority
     *
     * @var array
     */
    protected static $sorted_extractor_classes = null;

    /**
     * Gets the list of prioritised extractor classes
     *
     * @return array
     */
    protected static function get_extractor_classes()
    {
        // Check cache
        if (self::$sorted_extractor_classes) {
            return self::$sorted_extractor_classes;
        }

        // Generate the sorted list of extractors on demand.
        $classes = ClassInfo::subclassesFor(__CLASS__);
        // Drop the first entry - presumably this base class itself; confirm against ClassInfo::subclassesFor()
        array_shift($classes);

        $classPriorities = [];
        foreach ($classes as $class) {
            $classPriorities[$class] = Config::inst()->get($class, 'priority');
        }
        // Highest priority first, keeping the class names as keys
        arsort($classPriorities);

        // Save classes
        return self::$sorted_extractor_classes = array_keys($classPriorities);
    }

    /**
     * Get the text file extractor for the given class
     *
     * @param string $class
     * @return FileTextExtractor
     */
    protected static function get_extractor($class)
    {
        return Injector::inst()->get($class);
    }

    /**
     * Given a File object (or a path to a local file), decide which extractor instance to use to handle it
     *
     * @param File|string $file
     * @return FileTextExtractor|null Null when there is no file or no available extractor supports it
     */
    public static function for_file($file)
    {
        if (!$file || (is_string($file) && !file_exists($file ?? ''))) {
            return null;
        }

        // Ensure we have a File instance to work with
        if (is_string($file)) {
            /** @var File $fileObject */
            $fileObject = File::create();
            $fileObject->setFromLocalFile($file);
            $file = $fileObject;
        }

        $extension = $file->getExtension();
        $mime = $file->getMimeType();

        foreach (self::get_extractor_classes() as $className) {
            $extractor = self::get_extractor($className);

            // Skip unavailable extractors
            if (!$extractor->isAvailable()) {
                continue;
            }

            // Check extension
            if ($extension && $extractor->supportsExtension($extension)) {
                return $extractor;
            }

            // Check mime
            if ($mime && $extractor->supportsMime($mime)) {
                return $extractor;
            }
        }

        // Nothing can handle this file
        return null;
    }

    /**
     * Some text extractors (like pdftotext) may require a physical file to read from, so write the current
     * file contents to a temp file and return its path. The caller is responsible for deleting that file.
     *
     * @param File $file
     * @return string
     * @throws Exception
     */
    protected static function getPathFromFile(File $file)
    {
        // tempnam() creates an empty placeholder file to reserve the name
        $reserved = tempnam(TEMP_PATH, 'pdftextextractor_');
        if (false === $reserved) {
            throw new Exception(static::class . '->getPathFromFile() could not allocate temporary file name');
        }

        // Append extension to temp file if one is set
        $path = $reserved;
        $extension = $file->getExtension();
        if ($extension) {
            $path .= '.' . $extension;

            // Remove any existing temp files with this name
            if (file_exists($path)) {
                unlink($path);
            }
        }

        $bytesWritten = file_put_contents($path, $file->getStream());

        // When an extension was appended the content went to a different file,
        // so the placeholder would otherwise be left behind in the temp folder
        if ($path !== $reserved && file_exists($reserved)) {
            unlink($reserved);
        }

        if (false === $bytesWritten) {
            // Don't leave a partial or empty file behind on failure
            if (file_exists($path)) {
                unlink($path);
            }
            throw new Exception(static::class . '->getPathFromFile() failed to write temporary file');
        }

        return $path;
    }

    /**
     * Checks if the extractor is supported on the current environment,
     * for example if the correct binaries or libraries are available.
     *
     * @return boolean
     */
    abstract public function isAvailable();

    /**
     * Determine if this extractor supports the given extension.
     * If support is determined by mime/type only, then this should return false.
     *
     * @param string $extension
     * @return boolean
     */
    abstract public function supportsExtension($extension);

    /**
     * Determine if this extractor supports the given mime type.
     * Will only be called if supportsExtension returns false.
     *
     * @param string $mime
     * @return boolean
     */
    abstract public function supportsMime($mime);

    /**
     * Given a File instance, extract the contents as text.
     *
     * @param File|string $file Either the File instance, or a file path for a file to load
     * @return string
     */
    abstract public function getContent($file);
}

View File

@ -0,0 +1,7 @@
<?php
namespace SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/**
* Exception type thrown by file text extractors, e.g. when a temporary file
* cannot be allocated or written, or when an external extraction command fails.
*/
class Exception extends \Exception
{
}

View File

@ -0,0 +1,90 @@
<?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Assets\File;
/**
 * Text extractor that uses php function strip_tags to get just the text. OK for indexing, not
 * the best for readable text.
 *
 * @author mstephens
 */
class HTMLTextExtractor extends FileTextExtractor
{
    /**
     * Lower priority because its not the most clever HTML extraction. If there is something better, use it
     *
     * @config
     * @var integer
     */
    private static $priority = 10;

    /**
     * Needs nothing beyond PHP itself, so it is always available.
     *
     * @return boolean
     */
    public function isAvailable()
    {
        return true;
    }

    /**
     * @param string $extension
     * @return boolean
     */
    public function supportsExtension($extension)
    {
        return in_array(strtolower($extension ?? ''), ["html", "htm", "xhtml"], true);
    }

    /**
     * @param string $mime
     * @return boolean
     */
    public function supportsMime($mime)
    {
        return strtolower($mime ?? '') === 'text/html';
    }

    /**
     * Extracts content by using strip_tags()
     * combined with regular expressions to remove non-content tags like <style> or <script>,
     * as well as adding a line break in front of block-level tags so that the text of
     * adjacent blocks does not run together once the tags are stripped.
     *
     * @param File|string $file Either the File instance, or a path to a local file
     * @return string
     */
    public function getContent($file)
    {
        $content = $file instanceof File ? $file->getString() : file_get_contents($file ?? '');

        // Yes, yes, regex'ing HTML is evil.
        // Since we don't care about well-formedness or markup here, it does the job.

        // Invisible content: replaced by a single space
        $invisiblePatterns = [
            '@<head[^>]*?>.*?</head>@siu',
            '@<style[^>]*?>.*?</style>@siu',
            '@<script[^>]*?.*?</script>@siu',
            '@<object[^>]*?.*?</object>@siu',
            '@<embed[^>]*?.*?</embed>@siu',
            '@<applet[^>]*?.*?</applet>@siu',
            '@<noframes[^>]*?.*?</noframes>@siu',
            '@<noscript[^>]*?.*?</noscript>@siu',
            '@<noembed[^>]*?.*?</noembed>@siu',
        ];

        // Block-level tags (opening and closing): a line break is inserted before the match
        $blockPatterns = [
            '@</?((address)|(blockquote)|(center)|(del))@iu',
            '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
            '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
            '@</?((table)|(th)|(td)|(caption))@iu',
            '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
            '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
            '@</?((frameset)|(frame)|(iframe))@iu',
        ];

        // One replacement per pattern, in the same order as the patterns
        $replacements = array_merge(
            array_fill(0, count($invisiblePatterns), ' '),
            array_fill(0, count($blockPatterns), "\n\$0")
        );

        $content = preg_replace(
            array_merge($invisiblePatterns, $blockPatterns),
            $replacements,
            $content ?? ''
        );

        // preg_replace() yields null on error (e.g. invalid UTF-8 with the /u modifier)
        return strip_tags($content ?? '');
    }
}

View File

@ -0,0 +1,146 @@
<?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Assets\File;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
/**
 * Text extractor that calls pdftotext to do the conversion.
 * @author mstephens
 */
class PDFTextExtractor extends FileTextExtractor
{
    /**
     * Set to bin path this extractor can execute
     *
     * @config
     * @var string
     */
    private static $binary_location = null;

    /**
     * Used if binary_location isn't set.
     * List of locations to search for a given binary in
     *
     * @config
     * @var array
     */
    private static $search_binary_locations = [
        '/usr/bin',
        '/usr/local/bin',
    ];

    /**
     * Available when an executable pdftotext binary is found in the configured locations.
     *
     * @return boolean
     */
    public function isAvailable()
    {
        $bin = $this->bin('pdftotext');

        return $bin && file_exists($bin) && is_executable($bin);
    }

    /**
     * @param string $extension
     * @return boolean
     */
    public function supportsExtension($extension)
    {
        return strtolower($extension ?? '') === 'pdf';
    }

    /**
     * @param string $mime
     * @return boolean
     */
    public function supportsMime($mime)
    {
        return in_array(
            strtolower($mime ?? ''),
            [
                'application/pdf',
                'application/x-pdf',
                'application/x-bzpdf',
                'application/x-gzpdf'
            ],
            true
        );
    }

    /**
     * Accessor to get the location of the binary
     *
     * @param string $program Name of binary
     * @return string|null Full path, or null when the program is in none of the locations
     */
    protected function bin($program = '')
    {
        // Get list of allowed search paths
        if ($location = $this->config()->get('binary_location')) {
            $locations = [$location];
        } else {
            // Cast guards against the config value being unset
            $locations = (array) $this->config()->get('search_binary_locations');
        }

        // Find program in each path
        foreach ($locations as $location) {
            $path = "{$location}/{$program}";
            if (file_exists($path)) {
                return $path;
            }
            if (file_exists($path . '.exe')) {
                return $path . '.exe';
            }
        }

        // Not found
        return null;
    }

    /**
     * @param File|string $file Either the File instance, or a path to a local file
     * @return string Extracted text, or an empty string when there is no file
     * @throws Exception
     */
    public function getContent($file)
    {
        if (!$file || (is_string($file) && !file_exists($file ?? ''))) {
            // no file
            return '';
        }

        $content = $this->getRawOutput($file);

        return $this->cleanupLigatures($content);
    }

    /**
     * Invoke pdftotext with the given File object
     *
     * @param File|string $file
     * @return string Output
     * @throws Exception
     */
    protected function getRawOutput($file)
    {
        if (!$this->isAvailable()) {
            throw new Exception("getRawOutput called on unavailable extractor");
        }

        // File objects are first copied to a temp file which has to be removed again afterwards
        $isTemporary = $file instanceof File;
        $path = $isTemporary ? $this->getPathFromFile($file) : $file;

        $content = [];
        $err = 0;
        try {
            // "-" sends the text to stdout; stderr is merged in so failures can be reported
            exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path ?? '')), $content, $err);
        } finally {
            if ($isTemporary && file_exists($path)) {
                unlink($path);
            }
        }

        if ($err) {
            throw new Exception(sprintf(
                'PDFTextExtractor->getContent() failed for %s: %s',
                $path,
                implode(PHP_EOL, $content)
            ));
        }

        return implode(PHP_EOL, $content);
    }

    /**
     * Replaces the Unicode "Latin ligature" presentation forms (U+FB00 - U+FB06)
     * with their plain-letter equivalents.
     *
     * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
     *
     * @param string $input
     * @return string
     */
    protected function cleanupLigatures($input)
    {
        // Code point escapes are used so the keys cannot be silently normalised
        // to plain ASCII by editors or tooling, which would turn the map into a no-op
        $mapping = [
            "\u{FB00}" => 'ff',
            "\u{FB01}" => 'fi',
            "\u{FB02}" => 'fl',
            "\u{FB03}" => 'ffi',
            "\u{FB04}" => 'ffl',
            // LATIN SMALL LIGATURE LONG S T: a long s followed by t, i.e. "st" (not "ft")
            "\u{FB05}" => 'st',
            "\u{FB06}" => 'st',
        ];

        return str_replace(array_keys($mapping), array_values($mapping), $input ?? '');
    }
}

View File

@ -0,0 +1,164 @@
<?php
namespace SilverStripe\TextExtraction\Extractor;
use Exception;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Response;
use InvalidArgumentException;
use Psr\Log\LoggerInterface;
use SilverStripe\Assets\File;
use SilverStripe\Core\Injector\Injector;
/**
 * Text extractor that calls an Apache Solr instance
 * and extracts content via the "ExtractingRequestHandler" endpoint.
 * Does not alter the Solr index itself, but uses it purely
 * for its file parsing abilities.
 *
 * @author ischommer
 * @see http://wiki.apache.org/solr/ExtractingRequestHandler
 */
class SolrCellTextExtractor extends FileTextExtractor
{
    /**
     * Base URL to use for Solr text extraction.
     * E.g. http://localhost:8983/solr/update/extract
     *
     * @config
     * @var string
     */
    private static $base_url;

    /**
     * @var int
     * @config
     */
    private static $priority = 75;

    /**
     * @var Client
     */
    protected $httpClient;

    /**
     * Returns the HTTP client, creating a default one on first use.
     *
     * @return Client
     */
    public function getHttpClient()
    {
        if (!$this->httpClient) {
            $this->httpClient = new Client();
        }

        return $this->httpClient;
    }

    /**
     * @param Client $client
     * @return $this
     */
    public function setHttpClient(Client $client)
    {
        $this->httpClient = $client;

        return $this;
    }

    /**
     * Available as soon as a base URL is configured; the server itself is not contacted.
     *
     * @return bool
     */
    public function isAvailable()
    {
        $url = $this->config()->get('base_url');

        return (bool) $url;
    }

    /**
     * @param string $extension
     * @return bool
     */
    public function supportsExtension($extension)
    {
        return in_array(
            strtolower($extension ?? ''),
            [
                'pdf', 'doc', 'docx', 'xls', 'xlsx',
                'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
                'ppt', 'pptx', 'odp', 'fodp', 'csv'
            ],
            true
        );
    }

    /**
     * @param string $mime
     * @return bool
     */
    public function supportsMime($mime)
    {
        // Rely on supportsExtension
        return false;
    }

    /**
     * Posts the file to the configured endpoint and returns the extracted text from the response.
     * Request failures are logged at "notice" level and result in null.
     *
     * @param File|string $file
     * @return string|null Empty string when there is no file; null when the request fails
     *                     or the response holds no entry for this file
     * @throws InvalidArgumentException When no base URL is configured
     */
    public function getContent($file)
    {
        if (!$file || (is_string($file) && !file_exists($file ?? ''))) {
            // no file
            return '';
        }

        $fileName = $file instanceof File ? $file->getFilename() : basename($file ?? '');
        $client = $this->getHttpClient();

        // Get and validate base URL
        $baseUrl = $this->config()->get('base_url');
        if (!$baseUrl) {
            throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
        }

        try {
            $stream = $file instanceof File ? $file->getStream() : fopen($file ?? '', 'r');
            /** @var Response $response */
            $response = $client
                ->post($baseUrl, [
                    'multipart' => [
                        ['name' => 'extractOnly', 'contents' => 'true'],
                        ['name' => 'extractFormat', 'contents' => 'text'],
                        ['name' => 'myfile', 'contents' => $stream],
                    ]
                ]);
        } catch (InvalidArgumentException $e) {
            $msg = sprintf(
                'Error extracting text from "%s" (message: %s)',
                $fileName,
                $e->getMessage()
            );
            Injector::inst()->get(LoggerInterface::class)->notice($msg);
            return null;
        } catch (Exception $e) {
            // Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search
            // query in some cases.
            $msg = sprintf(
                'Tika server error attempting to extract from "%s" (message: %s)',
                $fileName,
                $e->getMessage()
            );
            Injector::inst()->get(LoggerInterface::class)->notice($msg);
            return null;
        }

        $matches = [];
        // Use preg match to avoid SimpleXML running out of memory on large text nodes.
        // The delimiter is passed to preg_quote() so that a "/" in the file name
        // (e.g. a folder separator) cannot terminate the pattern early.
        preg_match(
            sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName ?? '', '/')),
            (string) $response->getBody(),
            $matches
        );

        return $matches ? $matches[1] : null;
    }
}

View File

@ -0,0 +1,137 @@
<?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Assets\File;
use SilverStripe\Core\Environment;
use SilverStripe\Core\Injector\Injector;
use SilverStripe\TextExtraction\Rest\TikaRestClient;
/**
 * Enables text extraction of file content via the Tika Rest Server
 *
 * {@link http://tika.apache.org/1.7/gettingstarted.html}
 */
class TikaServerTextExtractor extends FileTextExtractor
{
    /**
     * Tika server is pretty efficient so use it immediately if available
     *
     * @var integer
     * @config
     */
    private static $priority = 80;

    /**
     * Server endpoint
     *
     * @var string
     * @config
     */
    private static $server_endpoint;

    /**
     * @var TikaRestClient
     */
    protected $client = null;

    /**
     * Cache of supported mime types
     *
     * @var array
     */
    protected $supportedMimes = [];

    /**
     * Returns the REST client, creating it for the current endpoint on first use.
     *
     * @return TikaRestClient
     */
    public function getClient()
    {
        if (!$this->client) {
            $this->client = Injector::inst()->createWithArgs(
                TikaRestClient::class,
                [$this->getServerEndpoint()]
            );
        }

        return $this->client;
    }

    /**
     * The SS_TIKA_ENDPOINT environment variable takes precedence over the configured endpoint.
     *
     * @return string
     */
    public function getServerEndpoint()
    {
        if ($endpoint = Environment::getEnv('SS_TIKA_ENDPOINT')) {
            return $endpoint;
        }

        // Default to configured endpoint
        return $this->config()->get('server_endpoint');
    }

    /**
     * Get the version of Tika as reported by the client
     *
     * @return string version of Tika
     */
    public function getVersion()
    {
        return $this->getClient()->getVersion();
    }

    /**
     * Requires an endpoint, a reachable server and a Tika version of at least 1.7.
     *
     * @return boolean
     */
    public function isAvailable()
    {
        return $this->getServerEndpoint()
            && $this->getClient()->isAvailable()
            && version_compare($this->getVersion() ?? '', '1.7') >= 0;
    }

    /**
     * @param string $extension
     * @return boolean
     */
    public function supportsExtension($extension)
    {
        // Determine support via mime type only
        return false;
    }

    /**
     * @param string $mime
     * @return boolean
     */
    public function supportsMime($mime)
    {
        if (!$this->supportedMimes) {
            $this->supportedMimes = (array) $this->getClient()->getSupportedMimes();
        }

        // Check if supported (most common / quickest lookup)
        if (isset($this->supportedMimes[$mime])) {
            return true;
        }

        // Check aliases
        foreach ($this->supportedMimes as $info) {
            if (isset($info['alias']) && in_array($mime, $info['alias'] ?? [], true)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Sends the file to the Tika server and returns the extracted text.
     * File objects are copied to a temporary file first, which is removed again afterwards.
     *
     * @param File|string $file Either the File instance, or a path to a local file
     * @return string
     */
    public function getContent($file)
    {
        $isTemporary = $file instanceof File;
        $tempFile = $isTemporary ? $this->getPathFromFile($file) : $file;

        try {
            return $this->getClient()->tika($tempFile);
        } finally {
            // Cleanup temp file, even when the client throws
            if ($isTemporary && file_exists($tempFile ?? '')) {
                unlink($tempFile);
            }
        }
    }
}

View File

@ -0,0 +1,130 @@
<?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Assets\File;
/**
 * Text extractor backed by the Tika command line tool (a "tika" command on the path)
 *
 * {@link http://tika.apache.org/1.7/gettingstarted.html}
 */
class TikaTextExtractor extends FileTextExtractor
{
    /**
     * Text extraction mode. Defaults to -t (plain text)
     *
     * @var string
     * @config
     */
    private static $output_mode = '-t';

    /**
     * Get the version of tika installed, or 0 if not installed
     *
     * @return mixed string | int The version reported by "tika --version", or 0
     */
    public function getVersion()
    {
        $exitCode = $this->runShell('tika --version', $stdout);
        if ($exitCode) {
            return 0;
        }

        // Parse output
        if (!preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout ?? '', $matches)) {
            return 0;
        }

        return $matches['version'];
    }

    /**
     * Runs a shell command and captures its output. The command is passed to the shell as given.
     *
     * NOTE(review): stdout is read to the end before stderr is touched - worth confirming this
     * cannot stall when the command writes a lot to stderr.
     *
     * @param string $command Full command including arguments
     * @param string &$stdout Standard output
     * @param string &$stderr Standard error
     * @param string $input Content to pass via standard input
     * @return int Exit code. 0 is success, 255 when the process could not be started
     */
    protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
    {
        // stdin is read by the process; stdout and stderr are written by it
        $descriptors = [
            0 => ["pipe", "r"],
            1 => ["pipe", "w"],
            2 => ["pipe", "w"]
        ];

        // Invoke command
        $pipes = [];
        $process = proc_open($command ?? '', $descriptors ?? [], $pipes);
        if (!is_resource($process)) {
            return 255;
        }

        list($stdinPipe, $stdoutPipe, $stderrPipe) = $pipes;

        // Send content as input
        fwrite($stdinPipe, $input ?? '');
        fclose($stdinPipe);

        // Get output
        $stdout = stream_get_contents($stdoutPipe);
        fclose($stdoutPipe);

        $stderr = stream_get_contents($stderrPipe);
        fclose($stderrPipe);

        // Get result
        return proc_close($process);
    }

    /**
     * Runs tika in the configured output mode against the file.
     * File objects are copied to a temporary file first, which is removed again afterwards.
     *
     * @param File|string $file Either the File instance, or a path to a local file
     * @return string|null Command output, or null when the command exits with a non-zero code
     */
    public function getContent($file)
    {
        $isTemporary = $file instanceof File;
        $mode = $this->config()->get('output_mode');
        $path = $isTemporary ? $this->getPathFromFile($file) : $file;

        $exitCode = $this->runShell(
            sprintf('tika %s %s', $mode, escapeshellarg($path ?? '')),
            $output
        );

        //Cleanup temp file
        if ($isTemporary) {
            unlink($path ?? '');
        }

        return $exitCode == 0 ? $output : null;
    }

    /**
     * @return bool
     */
    public function isAvailable()
    {
        $version = $this->getVersion();

        return $version > 0;
    }

    /**
     * @param string $extension
     * @return bool
     */
    public function supportsExtension($extension)
    {
        // Determine support via mime type only
        return false;
    }

    /**
     * Checks the mime type against the output of "tika --list-supported-types".
     *
     * @param string $mime
     * @return bool
     */
    public function supportsMime($mime)
    {
        // Get list of supported mime types
        $exitCode = $this->runShell('tika --list-supported-types', $supportedTypes, $error);
        if ($exitCode) {
            // Error case
            return false;
        }

        // Check if the mime type is inside the result
        $quotedMime = preg_quote($mime ?? '', '/');
        $pattern = sprintf('/\b(%s)\b/', $quotedMime);
        $found = preg_match($pattern ?? '', $supportedTypes ?? '');

        return (bool)$found;
    }
}

171
src/Rest/TikaRestClient.php Normal file
View File

@ -0,0 +1,171 @@
<?php
namespace SilverStripe\TextExtraction\Rest;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Psr7\Response;
use Psr\Log\LoggerInterface;
use SilverStripe\Core\Convert;
use SilverStripe\Core\Environment;
use SilverStripe\Core\Injector\Injector;
/**
 * HTTP client for a Tika REST server: availability and version checks,
 * the list of supported mime types, and text extraction.
 */
class TikaRestClient extends Client
{
    /**
     * Authentication options to be sent to the Tika server
     *
     * @var array
     */
    protected $options = ['username' => null, 'password' => null];

    /**
     * Cache of the supported mime data returned by the server
     *
     * @var array
     */
    protected $mimes = [];

    /**
     * Credentials are taken from the SS_TIKA_USERNAME / SS_TIKA_PASSWORD environment
     * variables, and only when a password is set.
     *
     * @param string $baseUrl
     * @param array $config
     */
    public function __construct($baseUrl = '', $config = [])
    {
        $password = Environment::getEnv('SS_TIKA_PASSWORD');

        if (!empty($password)) {
            $this->options = [
                'username' => Environment::getEnv('SS_TIKA_USERNAME'),
                'password' => $password,
            ];
        }

        $config['base_uri'] = $baseUrl;

        parent::__construct($config);
    }

    /**
     * Detect if the service is available
     *
     * @return bool True only when the server root answers with HTTP 200
     */
    public function isAvailable()
    {
        try {
            /** @var Response $result */
            $result = $this->get('/', $this->getGuzzleOptions());

            return $result->getStatusCode() == 200;
        } catch (RequestException | \GuzzleHttp\Exception\ConnectException $ex) {
            // ConnectException is listed explicitly because it is not a RequestException
            // in every Guzzle version, and an unreachable server must simply mean "unavailable"
            $msg = sprintf("Tika unavailable - %s", $ex->getMessage());
            Injector::inst()->get(LoggerInterface::class)->info($msg);

            return false;
        }
    }

    /**
     * Get version code
     *
     * @return string The version reported by the server, or "0" when it cannot be determined
     */
    public function getVersion()
    {
        /** @var Response $response */
        $response = $this->get('version', $this->getGuzzleOptions());
        $version = 0;

        // Parse output
        if ($response->getStatusCode() == 200
            && preg_match('/Apache Tika (?<version>[\.\d]+)/', (string) $response->getBody(), $matches)
        ) {
            $version = $matches['version'];
        }

        return (string) $version;
    }

    /**
     * Gets supported mime data. May include aliased mime types.
     *
     * @return array Empty when the response could not be decoded into an array
     */
    public function getSupportedMimes()
    {
        if ($this->mimes) {
            return $this->mimes;
        }

        $response = $this->get(
            'mime-types',
            $this->getGuzzleOptions([
                'headers' => [
                    'Accept' => 'application/json',
                ],
            ])
        );

        $decoded = json_decode((string) $response->getBody(), true);

        // json_decode() gives null for invalid JSON; keep the documented array type
        return $this->mimes = is_array($decoded) ? $decoded : [];
    }

    /**
     * Extract text content from a given file.
     * Logs an info-level message if the document can't be parsed.
     *
     * @param string $file Full filesystem path to a file to post
     * @return string Content of the file extracted as plain text, empty when the request failed
     */
    public function tika($file)
    {
        $text = null;
        try {
            /** @var Response $response */
            $response = $this->put(
                'tika',
                $this->getGuzzleOptions([
                    'headers' => [
                        'Accept' => 'text/plain',
                    ],
                    'body' => file_get_contents($file ?? ''),
                ])
            );
            $text = $response->getBody();
        } catch (RequestException $e) {
            // A request can fail without any response having been received
            $errorResponse = $e->getResponse();

            if ($errorResponse) {
                $msg = sprintf(
                    'TikaRestClient was not able to process %s. Response: %s %s.',
                    $file,
                    $errorResponse->getStatusCode(),
                    $errorResponse->getReasonPhrase()
                );
                // Only available if tika-server was started with --includeStack
                $body = (string) $errorResponse->getBody();
                if ($body !== '') {
                    $msg .= ' Body: ' . $body;
                }
            } else {
                $msg = sprintf(
                    'TikaRestClient was not able to process %s. No response received: %s',
                    $file,
                    $e->getMessage()
                );
            }

            Injector::inst()->get(LoggerInterface::class)->info($msg);
        }

        return (string) $text;
    }

    /**
     * Assembles an array of request options to pass to Guzzle
     *
     * @param array $options Authentication (etc) will be merged into this array and returned
     * @return array
     */
    protected function getGuzzleOptions($options = [])
    {
        // Credentials are only added when both a username and a password are known
        if (!empty($this->options['username']) && !empty($this->options['password'])) {
            $options['auth'] = [
                $this->options['username'],
                $this->options['password']
            ];
        }

        return $options;
    }
}

View File

@ -1,17 +1,23 @@
<?php
class FileTextCacheDatabaseTest extends SapphireTest {
public function testTruncatesByMaxLength() {
Config::nest();
Config::inst()->update('FileTextCache_Database', 'max_content_length', 5);
$cache = new FileTextCache_Database();
$file = $this->getMock('File', array('write'));
$content = '0123456789';
$cache->save($file, $content);
$this->assertEquals($cache->load($file), '01234');
Config::unnest();
}
namespace SilverStripe\TextExtraction\Tests;
}
use SilverStripe\Assets\File;
use SilverStripe\Core\Config\Config;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Cache\FileTextCache\Database;
class FileTextCacheDatabaseTest extends SapphireTest
{
public function testTruncatesByMaxLength()
{
Config::modify()->set(Database::class, 'max_content_length', 5);
$cache = new Database();
$file = $this->getMockBuilder(File::class)->setMethods(['write'])->getMock();
$content = '0123456789';
$cache->save($file, $content);
$this->assertEquals($cache->load($file), '01234');
}
}

View File

@ -1,43 +1,60 @@
<?php
class FileTextExtractableTest extends SapphireTest {
protected $requiredExtensions = array(
'File' => array('FileTextExtractable')
);
namespace SilverStripe\TextExtraction\Tests;
public function setUp() {
parent::setUp();
use SilverStripe\Assets\File;
use SilverStripe\Core\Config\Config;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Cache\FileTextCache;
use SilverStripe\TextExtraction\Extension\FileTextExtractable;
// Ensure that html is a valid extension
Config::inst()
->nest()
->update('File', 'allowed_extensions', array('html'));
}
class FileTextExtractableTest extends SapphireTest
{
protected $usesDatabase = true;
public function tearDown() {
Config::unnest();
parent::tearDown();
}
protected static $required_extensions = [
File::class => [
FileTextExtractable::class,
],
];
function testExtractFileAsText() {
// Create a copy of the file, as it may be clobbered by the test
// ($file->extractFileAsText() calls $file->write)
copy(BASE_PATH.'/textextraction/tests/fixtures/test1.html',BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
// Use HTML, since the extractor is always available
$file = new File(array(
'Name' => 'test1-copy.html',
'Filename' => 'textextraction/tests/fixtures/test1-copy.html'
));
$file->write();
$content = $file->extractFileAsText();
$this->assertContains('Test Headline', $content);
$this->assertContains('Test Text', $content);
$this->assertEquals($content, $file->FileContentCache);
protected function setUp(): void
{
parent::setUp();
if(file_exists(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html')) unlink(BASE_PATH.'/textextraction/tests/fixtures/test1-copy.html');
}
// Ensure that html is a valid extension
Config::modify()->merge(File::class, 'allowed_extensions', ['html']);
// Create a copy of the file, as it may be clobbered by the test
// ($file->extractFileAsText() calls $file->write)
copy(
dirname(__FILE__) . '/fixtures/test1.html',
dirname(__FILE__) . '/fixtures/test1-copy.html'
);
}
}
/**
 * Deletes the working copy of the HTML fixture when it exists, then hands
 * over to the parent teardown.
 */
protected function tearDown(): void
{
    $fixtureCopy = __DIR__ . '/fixtures/test1-copy.html';

    if (file_exists($fixtureCopy)) {
        unlink($fixtureCopy);
    }

    parent::tearDown();
}
/**
 * Builds a File from the copied HTML fixture, extracts its text and checks
 * that the extracted text matches the file's FileContentCache value.
 */
public function testExtractFileAsText()
{
    $fixtureCopy = __DIR__ . '/fixtures/test1-copy.html';

    // HTML is used because its extractor is always available
    /** @var File&FileTextExtractable $htmlFile */
    $htmlFile = new File(['Name' => 'test1-copy.html']);
    $htmlFile->setTextCache(new FileTextCache\Database());
    $htmlFile->setFromLocalFile($fixtureCopy);
    $htmlFile->write();

    $extracted = $htmlFile->extractFileAsText();

    $this->assertNotNull($extracted);
    $this->assertStringContainsString('Test Headline', $extracted);
    $this->assertStringContainsString('Test Text', $extracted);
    $this->assertEquals($extracted, $htmlFile->FileContentCache);
}
}

View File

@ -1,14 +1,36 @@
<?php
class HTMLTextExtractorTest extends SapphireTest {
function testExtraction() {
$extractor = new HTMLTextExtractor();
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html');
$this->assertContains('Test Headline', $content);
$this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
$this->assertNotContains('Test Style', $content, 'Strips non-content style tags');
$this->assertNotContains('Test Script', $content, 'Strips non-content script tags');
}
namespace SilverStripe\TextExtraction\Tests;
}
use SilverStripe\Assets\File;
use SilverStripe\Core\Config\Config;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Extractor\HTMLTextExtractor;
/**
 * Tests text extraction from an HTML fixture through HTMLTextExtractor.
 */
class HTMLTextExtractorTest extends SapphireTest
{
    protected $usesDatabase = true;

    /**
     * Merges "html" into the allowed File extensions before each test.
     */
    protected function setUp(): void
    {
        parent::setUp();

        Config::modify()->merge(File::class, 'allowed_extensions', ['html']);
    }

    /**
     * The headline text must be extracted, while text inside HTML comments,
     * style tags and script tags must not appear in the result.
     */
    public function testExtraction()
    {
        $htmlExtractor = new HTMLTextExtractor();

        $htmlFile = new File();
        $htmlFile->setFromLocalFile(__DIR__ . '/fixtures/test1.html');
        $htmlFile->write();

        $text = $htmlExtractor->getContent($htmlFile);

        $this->assertStringContainsString('Test Headline', $text);
        $this->assertStringNotContainsString('Test Comment', $text, 'Strips HTML comments');
        $this->assertStringNotContainsString('Test Style', $text, 'Strips non-content style tags');
        $this->assertStringNotContainsString('Test Script', $text, 'Strips non-content script tags');
    }
}

View File

@ -1,12 +1,29 @@
<?php
class PDFTextExtractorTest extends SapphireTest {
function testExtraction() {
$extractor = new PDFTextExtractor();
if(!$extractor->isAvailable()) $this->markTestSkipped('pdftotext not available');
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf');
$this->assertContains('This is a test file with a link', $content);
}
namespace SilverStripe\TextExtraction\Tests;
use SilverStripe\Assets\File;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
use SilverStripe\TextExtraction\Extractor\PDFTextExtractor;
/**
 * Tests text extraction from a PDF fixture through PDFTextExtractor.
 */
class PDFTextExtractorTest extends SapphireTest
{
    protected $usesDatabase = true;

    /**
     * Extracts the PDF fixture and checks for a known sentence. When the
     * extractor reports itself unavailable, the test instead expects the
     * remaining steps to throw the extractor Exception with the
     * "unavailable extractor" message.
     */
    public function testExtraction()
    {
        $pdfExtractor = new PDFTextExtractor();

        if (!$pdfExtractor->isAvailable()) {
            $this->expectException(Exception::class);
            $this->expectExceptionMessage('getRawOutput called on unavailable extractor');
        }

        $pdfFile = new File();
        $pdfFile->setFromLocalFile(__DIR__ . '/fixtures/test1.pdf');
        $pdfFile->write();

        $text = $pdfExtractor->getContent($pdfFile);

        $this->assertStringContainsString('This is a test file with a link', $text);
    }
}

View File

@ -0,0 +1,78 @@
<?php
namespace SilverStripe\TextExtraction\Tests;
use PHPUnit\Framework\MockObject\MockObject;
use SilverStripe\Assets\File;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor;
use SilverStripe\TextExtraction\Rest\TikaRestClient;
/**
 * Tests the {@see TikaServerTextExtractor} class
 *
 * @group tika-tests
 */
class TikaServerTextExtractorTest extends SapphireTest
{
    protected $usesDatabase = true;

    /**
     * Extracts the PDF fixture through the extractor and checks its mime
     * support. Skipped when the extractor reports itself unavailable.
     */
    public function testServerExtraction()
    {
        $extractor = TikaServerTextExtractor::create();
        if (!$extractor->isAvailable()) {
            $this->markTestSkipped('tika server not available');
        }

        // Check file
        $file = new File();
        $file->setFromLocalFile(__DIR__ . '/fixtures/test1.pdf');
        $file->write();
        $content = $extractor->getContent($file);
        $this->assertStringContainsString('This is a test file with a link', $content);

        // Check mime validation
        $this->assertTrue($extractor->supportsMime('application/pdf'));
        $this->assertTrue($extractor->supportsMime('text/html'));
        $this->assertFalse($extractor->supportsMime('application/not-supported'));
    }

    /**
     * Checks isAvailable() against a mocked client that is reachable and
     * reports the given version.
     *
     * @param string $version Version string returned by the mocked client
     * @param bool $expected Expected result of isAvailable()
     * @dataProvider isAvailableProvider
     */
    public function testIsAvailable($version, $expected)
    {
        // onlyMethods() supersedes the deprecated setMethods(); only the two
        // listed methods are replaced, the rest of the extractor stays real.
        /** @var MockObject|TikaServerTextExtractor $extractor */
        $extractor = $this->getMockBuilder(TikaServerTextExtractor::class)
            ->onlyMethods(['getClient', 'getServerEndpoint'])
            ->getMock();

        $client = $this->createMock(TikaRestClient::class);
        $client->method('isAvailable')->willReturn(true);
        $client->method('getVersion')->willReturn($version);

        $extractor->method('getClient')->willReturn($client);
        $extractor->method('getServerEndpoint')->willReturn('tikaserver.example');

        $result = $extractor->isAvailable();
        $this->assertSame($expected, $result);
    }

    /**
     * Version string / expected availability pairs. The cases here expect
     * false for the 1.5.x versions and true from 1.7 upward.
     *
     * @return array[]
     */
    public function isAvailableProvider()
    {
        return [
            ['1.5.2', false],
            ['1.5', false],
            ['1.7.0', true],
            ['1.7.5', true],
            ['1.8.0', true],
            ['1.7', true],
            ['1.8', true],
            ['2.0.0', true],
        ];
    }
}

View File

@ -1,38 +1,38 @@
<?php
namespace SilverStripe\TextExtraction\Tests;
use SilverStripe\Assets\File;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Extractor\TikaTextExtractor;
/**
* Tests the {@see TikaTextExtractor} class
*
* @group tika-tests
*/
class TikaTextExtractorTest extends SapphireTest {
function testExtraction() {
$extractor = new TikaTextExtractor();
if(!$extractor->isAvailable()) $this->markTestSkipped('tika cli not available');
class TikaTextExtractorTest extends SapphireTest
{
protected $usesDatabase = true;
// Check file
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
$content = $extractor->getContent($file);
$this->assertContains('This is a test file with a link', $content);
public function testExtraction()
{
$extractor = TikaTextExtractor::create();
if (!$extractor->isAvailable()) {
$this->markTestSkipped('tika cli not available');
}
// Check mime validation
$this->assertTrue($extractor->supportsMime('application/pdf'));
$this->assertTrue($extractor->supportsMime('text/html'));
$this->assertFalse($extractor->supportsMime('application/not-supported'));
}
// Check file
$file = new File();
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1.pdf');
$file->write();
function testServerExtraction() {
$extractor = new TikaServerTextExtractor();
if(!$extractor->isAvailable()) $this->markTestSkipped('tika server not available');
// Check file
$file = Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf';
$content = $extractor->getContent($file);
$this->assertContains('This is a test file with a link', $content);
// Check mime validation
$this->assertTrue($extractor->supportsMime('application/pdf'));
$this->assertTrue($extractor->supportsMime('text/html'));
$this->assertFalse($extractor->supportsMime('application/not-supported'));
}
$content = $extractor->getContent($file);
$this->assertStringContainsString('This is a test file with a link', $content);
// Check mime validation
$this->assertTrue($extractor->supportsMime('application/pdf'));
$this->assertTrue($extractor->supportsMime('text/html'));
$this->assertFalse($extractor->supportsMime('application/not-supported'));
}
}