Compare commits

...

64 Commits
3.0.0 ... 3

Author SHA1 Message Date
Guy Sartorelli e04501cb52
Merge branch '3.5' into 3 2023-04-26 12:49:10 +12:00
Maxime Rainville 821d2858f3
Merge pull request #81 from creative-commoners/pulls/3.5/fix-constraints
MNT Revert erroneous dependency changes
2023-03-28 17:06:30 +13:00
Guy Sartorelli de215d63f6
MNT Revert erroneous dependency changes 2023-03-28 14:55:48 +13:00
Maxime Rainville 2a260607ec
Merge pull request #80 from creative-commoners/pulls/3/dispatch-ci
MNT Use gha-dispatch-ci
2023-03-23 14:19:29 +13:00
Steve Boyd 6a92eb58e2 MNT Use gha-dispatch-ci 2023-03-21 13:42:30 +13:00
Guy Sartorelli 87869e94a6
MNT Update development dependencies 2023-03-10 16:38:07 +13:00
Guy Sartorelli 61f443d49c
MNT Update release dependencies 2023-03-10 16:38:04 +13:00
Guy Sartorelli 8f2e1d9b75
MNT Update development dependencies 2023-03-10 12:21:32 +13:00
Sabina Talipova a281114ed2
Merge pull request #77 from creative-commoners/pulls/3/stop-using-depr
API Stop using deprecated API
2022-12-05 16:44:21 +13:00
Steve Boyd 1a0cd6d6a6 API Stop using deprecated API 2022-11-28 19:20:01 +13:00
Steve Boyd 3bfa989a7e Merge branch '3.4' into 3 2022-08-02 19:01:21 +12:00
Steve Boyd 46b7f51040 Merge branch '3.3' into 3.4 2022-08-02 19:01:18 +12:00
Guy Sartorelli 041296bda2
Merge pull request #76 from creative-commoners/pulls/3.3/standardise-modules
MNT Standardise modules
2022-08-02 15:49:18 +12:00
Steve Boyd e8997870c5 MNT Standardise modules 2022-08-01 16:23:36 +12:00
Steve Boyd e8061724c5 Merge branch '3.4' into 3 2022-07-25 09:43:40 +12:00
Steve Boyd db8a36fa3e Merge branch '3.3' into 3.4 2022-07-25 09:43:36 +12:00
Guy Sartorelli 04ff0c6084
Merge pull request #74 from creative-commoners/pulls/3.3/module-standards
MNT Use GitHub Actions CI
2022-07-15 17:10:18 +12:00
Steve Boyd e5bf4f1322 MNT Use GitHub Actions CI 2022-07-05 19:08:18 +12:00
Guy Sartorelli 4674084d0d
Merge pull request #72 from creative-commoners/pulls/3/php81
ENH PHP 8.1 compatibility
2022-04-26 17:58:43 +12:00
Steve Boyd df8b17ab85 ENH PHP 8.1 compatibility 2022-04-13 13:51:04 +12:00
Michal Kleiner 77fecc4c53
Merge pull request #71 from GuySartorelli/patch-1
DOCS Fix incorrect PHPDoc about what null lifetime means.
2022-03-10 07:38:08 +13:00
Guy Sartorelli d03a9f06e2
DOCS Fix incorrect PHPDoc about what null lifetime means. 2022-03-09 16:03:03 +13:00
Michal Kleiner 88e7f27c5c
Merge pull request #69 from GuySartorelli/patch-1
DOCS Fix class reference for cache class
2022-03-07 11:31:11 +13:00
Guy Sartorelli 04e4b60435
DOCS Fix class reference for cache class
The `lifetime` config variable is on the `Cache` class, not the `Database` class.
2022-03-07 11:20:01 +13:00
Maxime Rainville 25d8a55058
Merge pull request #68 from creative-commoners/pulls/3/php74
DEP Set PHP 7.4 as the minimum version
2022-02-18 22:10:02 +13:00
Steve Boyd e8f015ddd2 DEP Set PHP 7.4 as the minimum version 2022-02-10 17:41:01 +13:00
Michal Kleiner 254c4e31f8
Merge pull request #67 from GuySartorelli/patch-1
DEP Loosen constraints for guzzlehttp/guzzle
2022-01-26 22:08:47 +13:00
GuySartorelli 7ad3fc9f13
DEP Loosen constraints for guzzlehttp/guzzle 2022-01-26 12:58:30 +13:00
Maxime Rainville eb36dcf5fb
Merge pull request #65 from creative-commoners/pulls/3/sapphire-test-nine
API phpunit 9 support
2021-11-01 22:37:19 +13:00
Steve Boyd b92616eb4e API phpunit 9 support 2021-10-27 18:16:05 +13:00
Steve Boyd 90d4812aa8 Merge branch '3.2' into 3 2021-05-21 14:30:19 +12:00
Maxime Rainville d1bdc003ad MNT Remove obsolete branch-alias 2021-05-05 11:17:57 +12:00
Steve Boyd 6af13768d3 Merge branch '3.1' into 3 2021-01-27 12:26:07 +13:00
Garion Herman 4250acb50e
Merge pull request #64 from creative-commoners/pulls/3.1/travis-shared
MNT Travis shared config
2021-01-27 11:59:07 +13:00
Steve Boyd 795abde8f1
Update build status badge 2021-01-21 16:43:07 +13:00
Steve Boyd d1e241ed56 MNT Travis shared config 2021-01-20 14:49:58 +13:00
Steve Boyd 8e9a0243bb Merge branch '3.1' into 3 2020-11-12 14:46:41 +13:00
Robbie Averill cb15845a95
Merge pull request #62 from creative-commoners/pulls/3.1/travis
Update travis 3.1
2020-06-23 09:34:53 -07:00
Steve Boyd 3564066245 Update travis 2020-06-23 16:26:32 +12:00
Steve Boyd 06995b2ec7 Merge branch '3.1' into 3 2020-06-18 13:30:39 +12:00
Maxime Rainville 01848af86d
Merge pull request #61 from creative-commoners/3.1
Update for 3.1
2020-06-15 18:35:27 +12:00
Steve Boyd e451f96b0b Update for 3.1 2020-06-15 16:17:53 +12:00
Robbie Averill d0a7db0b68
Merge pull request #60 from phptek/issue/58
FIX: Fixes #58 We always want $content (an array) passed to implode()
2019-12-16 12:04:39 -08:00
Russell Michell 42cc545414 FIX: Fixes #58 We always want $content (an array) passed to implode() 2019-12-16 10:06:55 +13:00
Robbie Averill 6234a971d1 Merge branch '3.0' 2019-08-28 10:56:39 +12:00
Robbie Averill 0d7c507b53 Use trusty in Travis builds 2019-08-28 10:56:33 +12:00
Robbie Averill d5313674c3 Merge branch '3.0' 2019-08-28 10:07:30 +12:00
Robbie Averill 32e2f9f84f FIX Ensure test uses database cache, it asserts assuming it is configured 2019-08-28 10:07:21 +12:00
Robbie Averill 5b967fd5d3 Merge branch '3.0' 2019-06-26 15:26:08 +12:00
Robbie Averill 943f393ee8
Merge pull request #55 from ichaber/fix/54-clean-temp-file
#54 Cleanup temporary file
2019-06-26 15:25:20 +12:00
Charlie Bergthaler 242e5a307d FIX Change check for cleanup of temp files only if file is instance of File. 2019-06-26 15:18:31 +12:00
Charlie Bergthaler a9270d73ad FIX Cleanup temporary file after extracting content in TikaServerTextExtractor and TikaTextExtractor 2019-06-26 15:18:31 +12:00
Robbie Averill b4c634bb1f Merge branch '3.0' 2019-06-26 15:17:42 +12:00
Robbie Averill 20079bd33f Remove SilverStripe 4.0-4.2 from Travis builds 2019-06-26 15:17:34 +12:00
Guy Marriott c5cfe4ea1e
Merge pull request #53 from martinhipp/bugfix/tika-version-number-checking
Return version number as string instead of float
2019-04-05 10:07:00 +13:00
Martin Hipp bff5eb2b79
Return version number as string instead of floats so '1.20' does not become 1.2 2019-04-05 09:56:45 +13:00
Robbie Averill 801cd9cacb Merge branch '3.0' 2019-02-22 09:34:11 +07:00
Dylan Wagstaff 9c2da06178
Merge pull request #52 from creative-commoners/pulls/3.0/fix-tests
FIX Ensure Tika responses are casted as strings, fixes broken unit tests
2019-02-14 10:15:55 +13:00
Robbie Averill 276fd9c856 Add PSR-4 autoloader and update Travis to include PHP 7.3 and SS 4.3 2019-02-13 11:42:51 +07:00
Robbie Averill 759d92ccb4 FIX Ensure Tika responses are casted as strings, fixes broken unit tests
They can be returned as a stream, but the TikaRestClient response is documented as a string
2019-02-13 11:42:51 +07:00
Robbie Averill b9502653c2
Merge pull request #51 from ishannz/patch-1
Update isAvailable check to work for identical versions
2019-02-13 11:28:04 +07:00
Robbie Averill 86eba78064 Add tests for isAvailable() 2019-02-13 11:23:28 +07:00
Ishan Jayamanne 21ed6e0f86 Update isAvailable check to work for identical versions
Tika server reports its version as "Apache Tika 1.7". Unfortunately, `version_compare` in PHP says that version "1.7" is less than version "1.7.0", meaning that Tika server was incorrectly being ruled out unless you used Tika server version 1.8 (where "1.8" > "1.7.0").

Changing the comparison string to just "1.7" means they match exactly, and therefore `version_compare` will return `0` rather than `-1`.
2019-02-13 11:15:54 +07:00
Robbie Averill 75a8c66eee Merge branch '3.0' 2018-07-09 10:04:00 +12:00
23 changed files with 179 additions and 135 deletions

11
.github/workflows/ci.yml vendored Normal file
View File

@ -0,0 +1,11 @@
name: CI
on:
push:
pull_request:
workflow_dispatch:
jobs:
ci:
name: CI
uses: silverstripe/gha-ci/.github/workflows/ci.yml@v1

16
.github/workflows/dispatch-ci.yml vendored Normal file
View File

@ -0,0 +1,16 @@
name: Dispatch CI
on:
# At 12:20 PM UTC, only on Saturday and Sunday
schedule:
- cron: '20 12 * * 6,0'
jobs:
dispatch-ci:
name: Dispatch CI
# Only run cron on the silverstripe account
if: (github.event_name == 'schedule' && github.repository_owner == 'silverstripe') || (github.event_name != 'schedule')
runs-on: ubuntu-latest
steps:
- name: Dispatch CI
uses: silverstripe/gha-dispatch-ci@v1

17
.github/workflows/keepalive.yml vendored Normal file
View File

@ -0,0 +1,17 @@
name: Keepalive
on:
workflow_dispatch:
# The 4th of every month at 10:50am UTC
schedule:
- cron: '50 10 4 * *'
jobs:
keepalive:
name: Keepalive
# Only run cron on the silverstripe account
if: (github.event_name == 'schedule' && github.repository_owner == 'silverstripe') || (github.event_name != 'schedule')
runs-on: ubuntu-latest
steps:
- name: Keepalive
uses: silverstripe/gha-keepalive@v1

View File

@ -1,13 +0,0 @@
inherit: true
checks:
php: true
build:
nodes:
analysis:
tests:
override: [php-scrutinizer-run]
filter:
paths: [src/*, tests/*]

View File

@ -1,47 +0,0 @@
language: php
addons:
apt:
packages:
- poppler-utils
env:
global:
- COMPOSER_ROOT_VERSION=3.x-dev
- SS_TIKA_ENDPOINT="http://localhost:9998/"
matrix:
include:
- php: 5.6
env: DB=MYSQL RECIPE_VERSION=1.0.x-dev PHPCS_TEST=1 PHPUNIT_TEST=1
- php: 7.0
env: DB=MYSQL RECIPE_VERSION=1.1.x-dev PHPUNIT_TEST=1
- php: 7.1
env: DB=PGSQL RECIPE_VERSION=4.2.x-dev PHPUNIT_COVERAGE_TEST=1
- php: 7.2
env: DB=MYSQL RECIPE_VERSION=4.x-dev PHPUNIT_TEST=1
before_script:
# Init PHP
- phpenv rehash
- phpenv config-rm xdebug.ini
# Configure Tika bin
- mkdir -p $HOME/bin
- export PATH=$PATH:$HOME/bin
- ./.travis/install_tika.sh
- ($HOME/bin/tika-rest-server &) &> /dev/null
# Install composer dependencies
- composer validate
- composer require --no-update silverstripe/recipe-core "$RECIPE_VERSION"
- if [[ $DB == PGSQL ]]; then composer require --no-update silverstripe/postgresql 2.1.x-dev; fi
- composer install --prefer-dist --no-interaction --no-progress --no-suggest --optimize-autoloader --verbose --profile
script:
- if [[ $PHPUNIT_TEST ]]; then vendor/bin/phpunit; fi
- if [[ $PHPUNIT_COVERAGE_TEST ]]; then phpdbg -qrr vendor/bin/phpunit --coverage-clover=coverage.xml; fi
- if [[ $PHPCS_TEST ]]; then vendor/bin/phpcs src/ tests/; fi
after_success:
- if [[ $PHPUNIT_COVERAGE_TEST ]]; then bash <(curl -s https://codecov.io/bash) -f coverage.xml; fi

View File

@ -1,9 +1,7 @@
# Text extraction module
[![Build Status](https://travis-ci.org/silverstripe/silverstripe-textextraction.svg?branch=master)](https://travis-ci.org/silverstripe/silverstripe-textextraction)
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction/?branch=master)
[![codecov](https://codecov.io/gh/silverstripe/silverstripe-textextraction/branch/master/graph/badge.svg)](https://codecov.io/gh/silverstripe/silverstripe-textextraction)
[![SilverStripe supported module](https://img.shields.io/badge/silverstripe-supported-0071C4.svg)](https://www.silverstripe.org/software/addons/silverstripe-commercially-supported-module-list/)
[![CI](https://github.com/silverstripe/silverstripe-textextraction/actions/workflows/ci.yml/badge.svg)](https://github.com/silverstripe/silverstripe-textextraction/actions/workflows/ci.yml)
[![Silverstripe supported module](https://img.shields.io/badge/silverstripe-supported-0071C4.svg)](https://www.silverstripe.org/software/addons/silverstripe-commercially-supported-module-list/)
Provides a text extraction API for file content, that can hook into different extractor
engines based on availability and the parsed file format. The output returned is always a string of the file content.
@ -24,7 +22,7 @@ The module supports text extraction on the following file formats:
## Requirements
* SilverStripe ^4.0
* Silverstripe ^4.0
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
* (optional) [Apache Solr with ExtractingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
* (optional) [Apache Tika](http://tika.apache.org/)
@ -54,8 +52,8 @@ If the issue does look like a new bug:
- Create a new issue
- Describe the steps required to reproduce your issue, and the expected outcome. Unit tests, screenshots
and screencasts can help here.
- Describe your environment as detailed as possible: SilverStripe version, Browser, PHP version,
Operating System, any installed SilverStripe modules.
- Describe your environment as detailed as possible: Silverstripe version, Browser, PHP version,
Operating System, any installed Silverstripe modules.
Please report security issues to security@silverstripe.org directly. Please don't file security issues in the bugtracker.

View File

@ -20,14 +20,21 @@
}
],
"require": {
"silverstripe/framework": "^4",
"php": "^7.4 || ^8.0",
"silverstripe/framework": "^4.10",
"silverstripe/assets": "^1",
"silverstripe/versioned": "^1",
"guzzlehttp/guzzle": "~6.3.0"
"guzzlehttp/guzzle": "^6.3 || ^7.0"
},
"require-dev": {
"squizlabs/php_codesniffer": "^3",
"phpunit/phpunit": "^5.7"
"phpunit/phpunit": "^9.5"
},
"autoload": {
"psr-4": {
"SilverStripe\\TextExtraction\\": "src/",
"SilverStripe\\TextExtraction\\Tests\\": "tests/"
}
},
"suggest": {
"ext-fileinfo": "Improved support for file mime detection"
@ -35,4 +42,4 @@
"extra": [],
"minimum-stability": "dev",
"prefer-stable": true
}
}

View File

@ -30,7 +30,7 @@ SilverStripe\Core\Injector\Injector:
SilverStripe\TextExtraction\Cache\FileTextCache:
class: SilverStripe\TextExtraction\Cache\FileTextCache\Cache
SilverStripe\TextExtraction\Cache\FileTextCache\Database:
SilverStripe\TextExtraction\Cache\FileTextCache\Cache:
lifetime: 3600 # Number of seconds to cache content for
```

View File

@ -2,6 +2,9 @@
<ruleset name="SilverStripe">
<description>CodeSniffer ruleset for SilverStripe coding conventions.</description>
<file>src</file>
<file>tests</file>
<!-- base rules are PSR-2 -->
<rule ref="PSR2" >
<!-- Current exclusions -->

View File

@ -1,7 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<phpunit bootstrap="vendor/silverstripe/framework/tests/bootstrap.php" colors="true">
<testsuite name="Default">
<directory>tests/</directory>
</testsuite>
<testsuites>
<testsuite name="Default">
<directory>tests/</directory>
</testsuite>
</testsuites>
<filter>
<whitelist addUncoveredFilesFromWhitelist="true">

View File

@ -18,7 +18,7 @@ class Cache implements FileTextCache, Flushable
/**
* Lifetime of cache in seconds
* Null is indefinite
* Null defaults to 3600 (1 hour)
*
* @var int|null
* @config
@ -42,7 +42,7 @@ class Cache implements FileTextCache, Flushable
*/
protected function getKey(File $file)
{
return md5($file->getFilename());
return md5($file->getFilename() ?? '');
}
/**

View File

@ -58,7 +58,7 @@ abstract class FileTextExtractor
arsort($classPriorities);
// Save classes
$sortedClasses = array_keys($classPriorities);
$sortedClasses = array_keys($classPriorities ?? []);
return self::$sorted_extractor_classes = $sortedClasses;
}
@ -81,7 +81,7 @@ abstract class FileTextExtractor
*/
public static function for_file($file)
{
if (!$file || (is_string($file) && !file_exists($file))) {
if (!$file || (is_string($file) && !file_exists($file ?? ''))) {
return null;
}
@ -137,11 +137,11 @@ abstract class FileTextExtractor
}
// Remove any existing temp files with this name
if (file_exists($path)) {
unlink($path);
if (file_exists($path ?? '')) {
unlink($path ?? '');
}
$bytesWritten = file_put_contents($path, $file->getStream());
$bytesWritten = file_put_contents($path ?? '', $file->getStream());
if (false === $bytesWritten) {
throw new Exception(static::class . '->getPathFromFile() failed to write temporary file');
}

View File

@ -34,7 +34,7 @@ class HTMLTextExtractor extends FileTextExtractor
*/
public function supportsExtension($extension)
{
return in_array(strtolower($extension), ["html", "htm", "xhtml"]);
return in_array(strtolower($extension ?? ''), ["html", "htm", "xhtml"]);
}
/**
@ -43,7 +43,7 @@ class HTMLTextExtractor extends FileTextExtractor
*/
public function supportsMime($mime)
{
return strtolower($mime) === 'text/html';
return strtolower($mime ?? '') === 'text/html';
}
/**
@ -56,7 +56,7 @@ class HTMLTextExtractor extends FileTextExtractor
*/
public function getContent($file)
{
$content = $file instanceof File ? $file->getString() : file_get_contents($file);
$content = $file instanceof File ? $file->getString() : file_get_contents($file ?? '');
// Yes, yes, regex'ing HTML is evil.
// Since we don't care about well-formedness or markup here, it does the job.
@ -82,9 +82,9 @@ class HTMLTextExtractor extends FileTextExtractor
'@</?((frameset)|(frame)|(iframe))@iu',
],
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0"],
$content
$content ?? ''
);
return strip_tags($content);
return strip_tags($content ?? '');
}
}

View File

@ -33,18 +33,18 @@ class PDFTextExtractor extends FileTextExtractor
public function isAvailable()
{
$bin = $this->bin('pdftotext');
return $bin && file_exists($bin) && is_executable($bin);
return $bin && file_exists($bin ?? '') && is_executable($bin ?? '');
}
public function supportsExtension($extension)
{
return strtolower($extension) === 'pdf';
return strtolower($extension ?? '') === 'pdf';
}
public function supportsMime($mime)
{
return in_array(
strtolower($mime),
strtolower($mime ?? ''),
[
'application/pdf',
'application/x-pdf',
@ -72,7 +72,7 @@ class PDFTextExtractor extends FileTextExtractor
// Find program in each path
foreach ($locations as $location) {
$path = "{$location}/{$program}";
if (file_exists($path)) {
if (file_exists($path ?? '')) {
return $path;
}
if (file_exists($path . '.exe')) {
@ -86,7 +86,7 @@ class PDFTextExtractor extends FileTextExtractor
public function getContent($file)
{
if (!$file || (is_string($file) && !file_exists($file))) {
if (!$file || (is_string($file) && !file_exists($file ?? ''))) {
// no file
return '';
}
@ -108,17 +108,13 @@ class PDFTextExtractor extends FileTextExtractor
}
$path = $file instanceof File ? $this->getPathFromFile($file) : $file;
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
if ($err) {
if (!is_array($err) && $err == 1) {
// For Windows compatibility
$err = $content;
}
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path ?? '')), $content, $err);
if ($err) {
throw new Exception(sprintf(
'PDFTextExtractor->getContent() failed for %s: %s',
$path,
implode(PHP_EOL, $err)
implode(PHP_EOL, $content)
));
}
@ -145,6 +141,6 @@ class PDFTextExtractor extends FileTextExtractor
'st' => 'st'
];
return str_replace(array_keys($mapping), array_values($mapping), $input);
return str_replace(array_keys($mapping ?? []), array_values($mapping ?? []), $input ?? '');
}
}

View File

@ -80,7 +80,7 @@ class SolrCellTextExtractor extends FileTextExtractor
public function supportsExtension($extension)
{
return in_array(
strtolower($extension),
strtolower($extension ?? ''),
[
'pdf', 'doc', 'docx', 'xls', 'xlsx',
'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
@ -106,12 +106,12 @@ class SolrCellTextExtractor extends FileTextExtractor
*/
public function getContent($file)
{
if (!$file || (is_string($file) && !file_exists($file))) {
if (!$file || (is_string($file) && !file_exists($file ?? ''))) {
// no file
return '';
}
$fileName = $file instanceof File ? $file->getFilename() : basename($file);
$fileName = $file instanceof File ? $file->getFilename() : basename($file ?? '');
$client = $this->getHttpClient();
// Get and validate base URL
@ -121,7 +121,7 @@ class SolrCellTextExtractor extends FileTextExtractor
}
try {
$stream = $file instanceof File ? $file->getStream() : fopen($file, 'r');
$stream = $file instanceof File ? $file->getStream() : fopen($file ?? '', 'r');
/** @var Response $response */
$response = $client
->post($baseUrl, [
@ -154,7 +154,7 @@ class SolrCellTextExtractor extends FileTextExtractor
$matches = [];
// Use preg match to avoid SimpleXML running out of memory on large text nodes
preg_match(
sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName)),
sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName ?? '')),
(string)$response->getBody(),
$matches
);

View File

@ -86,7 +86,7 @@ class TikaServerTextExtractor extends FileTextExtractor
{
return $this->getServerEndpoint()
&& $this->getClient()->isAvailable()
&& version_compare($this->getVersion(), '1.7.0') >= 0;
&& version_compare($this->getVersion() ?? '', '1.7') >= 0;
}
/**
@ -116,7 +116,7 @@ class TikaServerTextExtractor extends FileTextExtractor
// Check aliases
foreach ($this->supportedMimes as $info) {
if (isset($info['alias']) && in_array($mime, $info['alias'])) {
if (isset($info['alias']) && in_array($mime, $info['alias'] ?? [])) {
return true;
}
}
@ -127,6 +127,11 @@ class TikaServerTextExtractor extends FileTextExtractor
public function getContent($file)
{
$tempFile = $file instanceof File ? $this->getPathFromFile($file) : $file;
return $this->getClient()->tika($tempFile);
$content = $this->getClient()->tika($tempFile);
//Cleanup temp file
if ($file instanceof File) {
unlink($tempFile ?? '');
}
return $content;
}
}

View File

@ -29,7 +29,7 @@ class TikaTextExtractor extends FileTextExtractor
$code = $this->runShell('tika --version', $stdout);
// Parse output
if (!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches)) {
if (!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout ?? '', $matches)) {
return $matches['version'];
}
@ -54,14 +54,14 @@ class TikaTextExtractor extends FileTextExtractor
];
// Invoke command
$pipes = [];
$proc = proc_open($command, $descriptorSpecs, $pipes);
$proc = proc_open($command ?? '', $descriptorSpecs ?? [], $pipes);
if (!is_resource($proc)) {
return 255;
}
// Send content as input
fwrite($pipes[0], $input);
fwrite($pipes[0], $input ?? '');
fclose($pipes[0]);
// Get output
@ -78,8 +78,12 @@ class TikaTextExtractor extends FileTextExtractor
{
$mode = $this->config()->get('output_mode');
$path = $file instanceof File ? $this->getPathFromFile($file) : $file;
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
$command = sprintf('tika %s %s', $mode, escapeshellarg($path ?? ''));
$code = $this->runShell($command, $output);
//Cleanup temp file
if ($file instanceof File) {
unlink($path ?? '');
}
if ($code == 0) {
return $output;
@ -119,8 +123,8 @@ class TikaTextExtractor extends FileTextExtractor
}
// Check if the mime type is inside the result
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime ?? '', '/'));
return (bool)preg_match($pattern, $supportedTypes);
return (bool)preg_match($pattern ?? '', $supportedTypes ?? '');
}
}

View File

@ -70,22 +70,22 @@ class TikaRestClient extends Client
/**
* Get version code
*
* @return float
* @return string
*/
public function getVersion()
{
/** @var Response $response */
$response = $this->get('version', $this->getGuzzleOptions());
$version = 0.0;
$version = 0;
// Parse output
if ($response->getStatusCode() == 200
&& preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)
&& preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody() ?? '', $matches)
) {
$version = (float)$matches['version'];
$version = $matches['version'];
}
return $version;
return (string) $version;
}
/**
@ -108,7 +108,7 @@ class TikaRestClient extends Client
])
);
return $this->mimes = Convert::json2array($response->getBody());
return $this->mimes = json_decode($response->getBody(), true);
}
/**
@ -129,7 +129,7 @@ class TikaRestClient extends Client
'headers' => [
'Accept' => 'text/plain',
],
'body' => file_get_contents($file),
'body' => file_get_contents($file ?? ''),
])
);
$text = $response->getBody();
@ -149,7 +149,7 @@ class TikaRestClient extends Client
Injector::inst()->get(LoggerInterface::class)->info($msg);
}
return $text;
return (string) $text;
}
/**

View File

@ -5,6 +5,7 @@ namespace SilverStripe\TextExtraction\Tests;
use SilverStripe\Assets\File;
use SilverStripe\Core\Config\Config;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Cache\FileTextCache;
use SilverStripe\TextExtraction\Extension\FileTextExtractable;
class FileTextExtractableTest extends SapphireTest
@ -17,7 +18,7 @@ class FileTextExtractableTest extends SapphireTest
],
];
protected function setUp()
protected function setUp(): void
{
parent::setUp();
@ -32,7 +33,7 @@ class FileTextExtractableTest extends SapphireTest
);
}
protected function tearDown()
protected function tearDown(): void
{
if (file_exists(dirname(__FILE__) . '/fixtures/test1-copy.html')) {
unlink(dirname(__FILE__) . '/fixtures/test1-copy.html');
@ -44,15 +45,16 @@ class FileTextExtractableTest extends SapphireTest
public function testExtractFileAsText()
{
// Use HTML, since the extractor is always available
/** @var File|FileTextExtractable $file */
/** @var File&FileTextExtractable $file */
$file = new File(['Name' => 'test1-copy.html']);
$file->setTextCache(new FileTextCache\Database());
$file->setFromLocalFile(dirname(__FILE__) . '/fixtures/test1-copy.html');
$file->write();
$content = $file->extractFileAsText();
$this->assertNotNull($content);
$this->assertContains('Test Headline', $content);
$this->assertContains('Test Text', $content);
$this->assertStringContainsString('Test Headline', $content);
$this->assertStringContainsString('Test Text', $content);
$this->assertEquals($content, $file->FileContentCache);
}
}

View File

@ -11,7 +11,7 @@ class HTMLTextExtractorTest extends SapphireTest
{
protected $usesDatabase = true;
protected function setUp()
protected function setUp(): void
{
parent::setUp();
@ -28,9 +28,9 @@ class HTMLTextExtractorTest extends SapphireTest
$content = $extractor->getContent($file);
$this->assertContains('Test Headline', $content);
$this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
$this->assertNotContains('Test Style', $content, 'Strips non-content style tags');
$this->assertNotContains('Test Script', $content, 'Strips non-content script tags');
$this->assertStringContainsString('Test Headline', $content);
$this->assertStringNotContainsString('Test Comment', $content, 'Strips HTML comments');
$this->assertStringNotContainsString('Test Style', $content, 'Strips non-content style tags');
$this->assertStringNotContainsString('Test Script', $content, 'Strips non-content script tags');
}
}

View File

@ -24,6 +24,6 @@ class PDFTextExtractorTest extends SapphireTest
$file->write();
$content = $extractor->getContent($file);
$this->assertContains('This is a test file with a link', $content);
$this->assertStringContainsString('This is a test file with a link', $content);
}
}

View File

@ -2,9 +2,11 @@
namespace SilverStripe\TextExtraction\Tests;
use PHPUnit\Framework\MockObject\MockObject;
use SilverStripe\Assets\File;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor;
use SilverStripe\TextExtraction\Rest\TikaRestClient;
/**
* @group tika-tests
@ -26,11 +28,51 @@ class TikaServerTextExtractorTest extends SapphireTest
$file->write();
$content = $extractor->getContent($file);
$this->assertContains('This is a test file with a link', $content);
$this->assertStringContainsString('This is a test file with a link', $content);
// Check mime validation
$this->assertTrue($extractor->supportsMime('application/pdf'));
$this->assertTrue($extractor->supportsMime('text/html'));
$this->assertFalse($extractor->supportsMime('application/not-supported'));
}
/**
* @param string $version
* @param bool $expected
* @dataProvider isAvailableProvider
*/
public function testIsAvailable($version, $expected)
{
/** @var MockObject|TikaServerTextExtractor $extractor */
$extractor = $this->getMockBuilder(TikaServerTextExtractor::class)
->setMethods(['getClient', 'getServerEndpoint'])
->getMock();
$client = $this->createMock(TikaRestClient::class);
$client->method('isAvailable')->willReturn(true);
$client->method('getVersion')->willReturn($version);
$extractor->method('getClient')->willReturn($client);
$extractor->method('getServerEndpoint')->willReturn('tikaserver.example');
$result = $extractor->isAvailable();
$this->assertSame($expected, $result);
}
/**
* @return array[]
*/
public function isAvailableProvider()
{
return [
['1.5.2', false],
['1.5', false],
['1.7.0', true],
['1.7.5', true],
['1.8.0', true],
['1.7', true],
['1.8', true],
['2.0.0', true],
];
}
}

View File

@ -28,7 +28,7 @@ class TikaTextExtractorTest extends SapphireTest
$file->write();
$content = $extractor->getContent($file);
$this->assertContains('This is a test file with a link', $content);
$this->assertStringContainsString('This is a test file with a link', $content);
// Check mime validation
$this->assertTrue($extractor->supportsMime('application/pdf'));