mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-09-28 20:29:17 +02:00
Compare commits
40 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
e04501cb52 | ||
|
821d2858f3 | ||
|
de215d63f6 | ||
|
2a260607ec | ||
|
6a92eb58e2 | ||
|
87869e94a6 | ||
|
61f443d49c | ||
|
8f2e1d9b75 | ||
|
a281114ed2 | ||
|
1a0cd6d6a6 | ||
|
3bfa989a7e | ||
|
46b7f51040 | ||
|
041296bda2 | ||
|
e8997870c5 | ||
|
e8061724c5 | ||
|
db8a36fa3e | ||
|
04ff0c6084 | ||
|
e5bf4f1322 | ||
|
4674084d0d | ||
|
df8b17ab85 | ||
|
77fecc4c53 | ||
|
d03a9f06e2 | ||
|
88e7f27c5c | ||
|
04e4b60435 | ||
|
25d8a55058 | ||
|
e8f015ddd2 | ||
|
254c4e31f8 | ||
|
7ad3fc9f13 | ||
|
eb36dcf5fb | ||
|
b92616eb4e | ||
|
90d4812aa8 | ||
|
d1bdc003ad | ||
|
6af13768d3 | ||
|
4250acb50e | ||
|
795abde8f1 | ||
|
d1e241ed56 | ||
|
8e9a0243bb | ||
|
cb15845a95 | ||
|
3564066245 | ||
|
06995b2ec7 |
11
.github/workflows/ci.yml
vendored
Normal file
11
.github/workflows/ci.yml
vendored
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
name: CI
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
pull_request:
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
ci:
|
||||||
|
name: CI
|
||||||
|
uses: silverstripe/gha-ci/.github/workflows/ci.yml@v1
|
16
.github/workflows/dispatch-ci.yml
vendored
Normal file
16
.github/workflows/dispatch-ci.yml
vendored
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
name: Dispatch CI
|
||||||
|
|
||||||
|
on:
|
||||||
|
# At 12:20 PM UTC, only on Saturday and Sunday
|
||||||
|
schedule:
|
||||||
|
- cron: '20 12 * * 6,0'
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
dispatch-ci:
|
||||||
|
name: Dispatch CI
|
||||||
|
# Only run cron on the silverstripe account
|
||||||
|
if: (github.event_name == 'schedule' && github.repository_owner == 'silverstripe') || (github.event_name != 'schedule')
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Dispatch CI
|
||||||
|
uses: silverstripe/gha-dispatch-ci@v1
|
17
.github/workflows/keepalive.yml
vendored
Normal file
17
.github/workflows/keepalive.yml
vendored
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
name: Keepalive
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch:
|
||||||
|
# The 4th of every month at 10:50am UTC
|
||||||
|
schedule:
|
||||||
|
- cron: '50 10 4 * *'
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
keepalive:
|
||||||
|
name: Keepalive
|
||||||
|
# Only run cron on the silverstripe account
|
||||||
|
if: (github.event_name == 'schedule' && github.repository_owner == 'silverstripe') || (github.event_name != 'schedule')
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Keepalive
|
||||||
|
uses: silverstripe/gha-keepalive@v1
|
@ -1,13 +0,0 @@
|
|||||||
inherit: true
|
|
||||||
|
|
||||||
checks:
|
|
||||||
php: true
|
|
||||||
|
|
||||||
build:
|
|
||||||
nodes:
|
|
||||||
analysis:
|
|
||||||
tests:
|
|
||||||
override: [php-scrutinizer-run]
|
|
||||||
|
|
||||||
filter:
|
|
||||||
paths: [src/*, tests/*]
|
|
51
.travis.yml
51
.travis.yml
@ -1,51 +0,0 @@
|
|||||||
language: php
|
|
||||||
|
|
||||||
dist: trusty
|
|
||||||
|
|
||||||
addons:
|
|
||||||
apt:
|
|
||||||
packages:
|
|
||||||
- poppler-utils
|
|
||||||
|
|
||||||
env:
|
|
||||||
global:
|
|
||||||
- COMPOSER_ROOT_VERSION=3.1.x-dev
|
|
||||||
- SS_TIKA_ENDPOINT="http://localhost:9998/"
|
|
||||||
|
|
||||||
matrix:
|
|
||||||
include:
|
|
||||||
- php: 5.6
|
|
||||||
env: DB=MYSQL RECIPE_VERSION=4.3.x-dev PHPCS_TEST=1 PHPUNIT_TEST=1
|
|
||||||
- php: 7.0
|
|
||||||
env: DB=MYSQL RECIPE_VERSION=4.3.x-dev PHPUNIT_TEST=1
|
|
||||||
- php: 7.1
|
|
||||||
env: DB=PGSQL RECIPE_VERSION=4.4.x-dev PHPUNIT_COVERAGE_TEST=1
|
|
||||||
- php: 7.2
|
|
||||||
env: DB=MYSQL RECIPE_VERSION=4.4.x-dev PHPUNIT_TEST=1
|
|
||||||
- php: 7.3
|
|
||||||
env: DB=MYSQL RECIPE_VERSION=4.x-dev PHPUNIT_TEST=1
|
|
||||||
|
|
||||||
before_script:
|
|
||||||
# Init PHP
|
|
||||||
- phpenv rehash
|
|
||||||
- phpenv config-rm xdebug.ini
|
|
||||||
|
|
||||||
# Configure Tika bin
|
|
||||||
- mkdir -p $HOME/bin
|
|
||||||
- export PATH=$PATH:$HOME/bin
|
|
||||||
- ./.travis/install_tika.sh
|
|
||||||
- ($HOME/bin/tika-rest-server &) &> /dev/null
|
|
||||||
|
|
||||||
# Install composer dependencies
|
|
||||||
- composer validate
|
|
||||||
- composer require --no-update silverstripe/recipe-cms:"$RECIPE_VERSION"
|
|
||||||
- if [[ $DB == PGSQL ]]; then composer require --no-update silverstripe/postgresql 2.1.x-dev; fi
|
|
||||||
- composer install --prefer-dist --no-interaction --no-progress --no-suggest --optimize-autoloader --verbose --profile
|
|
||||||
|
|
||||||
script:
|
|
||||||
- if [[ $PHPUNIT_TEST ]]; then vendor/bin/phpunit; fi
|
|
||||||
- if [[ $PHPUNIT_COVERAGE_TEST ]]; then phpdbg -qrr vendor/bin/phpunit --coverage-clover=coverage.xml; fi
|
|
||||||
- if [[ $PHPCS_TEST ]]; then vendor/bin/phpcs src/ tests/; fi
|
|
||||||
|
|
||||||
after_success:
|
|
||||||
- if [[ $PHPUNIT_COVERAGE_TEST ]]; then bash <(curl -s https://codecov.io/bash) -f coverage.xml; fi
|
|
12
README.md
12
README.md
@ -1,9 +1,7 @@
|
|||||||
# Text extraction module
|
# Text extraction module
|
||||||
|
|
||||||
[![Build Status](https://travis-ci.org/silverstripe/silverstripe-textextraction.svg?branch=master)](https://travis-ci.org/silverstripe/silverstripe-textextraction)
|
[![CI](https://github.com/silverstripe/silverstripe-textextraction/actions/workflows/ci.yml/badge.svg)](https://github.com/silverstripe/silverstripe-textextraction/actions/workflows/ci.yml)
|
||||||
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/silverstripe/silverstripe-textextraction/?branch=master)
|
[![Silverstripe supported module](https://img.shields.io/badge/silverstripe-supported-0071C4.svg)](https://www.silverstripe.org/software/addons/silverstripe-commercially-supported-module-list/)
|
||||||
[![codecov](https://codecov.io/gh/silverstripe/silverstripe-textextraction/branch/master/graph/badge.svg)](https://codecov.io/gh/silverstripe/silverstripe-textextraction)
|
|
||||||
[![SilverStripe supported module](https://img.shields.io/badge/silverstripe-supported-0071C4.svg)](https://www.silverstripe.org/software/addons/silverstripe-commercially-supported-module-list/)
|
|
||||||
|
|
||||||
Provides a text extraction API for file content, that can hook into different extractor
|
Provides a text extraction API for file content, that can hook into different extractor
|
||||||
engines based on availability and the parsed file format. The output returned is always a string of the file content.
|
engines based on availability and the parsed file format. The output returned is always a string of the file content.
|
||||||
@ -24,7 +22,7 @@ The module supports text extraction on the following file formats:
|
|||||||
|
|
||||||
## Requirements
|
## Requirements
|
||||||
|
|
||||||
* SilverStripe ^4.0
|
* Silverstripe ^4.0
|
||||||
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
|
* (optional) [XPDF](http://www.foolabs.com/xpdf/) (`pdftotext` utility)
|
||||||
* (optional) [Apache Solr with ExtracingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
|
* (optional) [Apache Solr with ExtracingRequestHandler](http://wiki.apache.org/solr/ExtractingRequestHandler)
|
||||||
* (optional) [Apache Tika](http://tika.apache.org/)
|
* (optional) [Apache Tika](http://tika.apache.org/)
|
||||||
@ -54,8 +52,8 @@ If the issue does look like a new bug:
|
|||||||
- Create a new issue
|
- Create a new issue
|
||||||
- Describe the steps required to reproduce your issue, and the expected outcome. Unit tests, screenshots
|
- Describe the steps required to reproduce your issue, and the expected outcome. Unit tests, screenshots
|
||||||
and screencasts can help here.
|
and screencasts can help here.
|
||||||
- Describe your environment as detailed as possible: SilverStripe version, Browser, PHP version,
|
- Describe your environment as detailed as possible: Silverstripe version, Browser, PHP version,
|
||||||
Operating System, any installed SilverStripe modules.
|
Operating System, any installed Silverstripe modules.
|
||||||
|
|
||||||
Please report security issues to security@silverstripe.org directly. Please don't file security issues in the bugtracker.
|
Please report security issues to security@silverstripe.org directly. Please don't file security issues in the bugtracker.
|
||||||
|
|
||||||
|
@ -20,14 +20,15 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"require": {
|
"require": {
|
||||||
"silverstripe/framework": "^4",
|
"php": "^7.4 || ^8.0",
|
||||||
|
"silverstripe/framework": "^4.10",
|
||||||
"silverstripe/assets": "^1",
|
"silverstripe/assets": "^1",
|
||||||
"silverstripe/versioned": "^1",
|
"silverstripe/versioned": "^1",
|
||||||
"guzzlehttp/guzzle": "~6.3.0"
|
"guzzlehttp/guzzle": "^6.3 || ^7.0"
|
||||||
},
|
},
|
||||||
"require-dev": {
|
"require-dev": {
|
||||||
"squizlabs/php_codesniffer": "^3",
|
"squizlabs/php_codesniffer": "^3",
|
||||||
"phpunit/phpunit": "^5.7"
|
"phpunit/phpunit": "^9.5"
|
||||||
},
|
},
|
||||||
"autoload": {
|
"autoload": {
|
||||||
"psr-4": {
|
"psr-4": {
|
||||||
@ -38,11 +39,7 @@
|
|||||||
"suggest": {
|
"suggest": {
|
||||||
"ext-fileinfo": "Improved support for file mime detection"
|
"ext-fileinfo": "Improved support for file mime detection"
|
||||||
},
|
},
|
||||||
"extra": {
|
"extra": [],
|
||||||
"branch-alias": {
|
|
||||||
"dev-master": "3.x-dev"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"minimum-stability": "dev",
|
"minimum-stability": "dev",
|
||||||
"prefer-stable": true
|
"prefer-stable": true
|
||||||
}
|
}
|
@ -30,7 +30,7 @@ SilverStripe\Core\Injector\Injector:
|
|||||||
SilverStripe\TextExtraction\Cache\FileTextCache:
|
SilverStripe\TextExtraction\Cache\FileTextCache:
|
||||||
class: SilverStripe\TextExtraction\Cache\FileTextCache\Cache
|
class: SilverStripe\TextExtraction\Cache\FileTextCache\Cache
|
||||||
|
|
||||||
SilverStripe\TextExtraction\Cache\FileTextCache\Database:
|
SilverStripe\TextExtraction\Cache\FileTextCache\Cache:
|
||||||
lifetime: 3600 # Number of seconds to cache content for
|
lifetime: 3600 # Number of seconds to cache content for
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -2,6 +2,9 @@
|
|||||||
<ruleset name="SilverStripe">
|
<ruleset name="SilverStripe">
|
||||||
<description>CodeSniffer ruleset for SilverStripe coding conventions.</description>
|
<description>CodeSniffer ruleset for SilverStripe coding conventions.</description>
|
||||||
|
|
||||||
|
<file>src</file>
|
||||||
|
<file>tests</file>
|
||||||
|
|
||||||
<!-- base rules are PSR-2 -->
|
<!-- base rules are PSR-2 -->
|
||||||
<rule ref="PSR2" >
|
<rule ref="PSR2" >
|
||||||
<!-- Current exclusions -->
|
<!-- Current exclusions -->
|
||||||
|
@ -1,7 +1,10 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<phpunit bootstrap="vendor/silverstripe/framework/tests/bootstrap.php" colors="true">
|
<phpunit bootstrap="vendor/silverstripe/framework/tests/bootstrap.php" colors="true">
|
||||||
|
<testsuites>
|
||||||
<testsuite name="Default">
|
<testsuite name="Default">
|
||||||
<directory>tests/</directory>
|
<directory>tests/</directory>
|
||||||
</testsuite>
|
</testsuite>
|
||||||
|
</testsuites>
|
||||||
|
|
||||||
<filter>
|
<filter>
|
||||||
<whitelist addUncoveredFilesFromWhitelist="true">
|
<whitelist addUncoveredFilesFromWhitelist="true">
|
||||||
|
@ -18,7 +18,7 @@ class Cache implements FileTextCache, Flushable
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Lifetime of cache in seconds
|
* Lifetime of cache in seconds
|
||||||
* Null is indefinite
|
* Null defaults to 3600 (1 hour)
|
||||||
*
|
*
|
||||||
* @var int|null
|
* @var int|null
|
||||||
* @config
|
* @config
|
||||||
@ -42,7 +42,7 @@ class Cache implements FileTextCache, Flushable
|
|||||||
*/
|
*/
|
||||||
protected function getKey(File $file)
|
protected function getKey(File $file)
|
||||||
{
|
{
|
||||||
return md5($file->getFilename());
|
return md5($file->getFilename() ?? '');
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -58,7 +58,7 @@ abstract class FileTextExtractor
|
|||||||
arsort($classPriorities);
|
arsort($classPriorities);
|
||||||
|
|
||||||
// Save classes
|
// Save classes
|
||||||
$sortedClasses = array_keys($classPriorities);
|
$sortedClasses = array_keys($classPriorities ?? []);
|
||||||
return self::$sorted_extractor_classes = $sortedClasses;
|
return self::$sorted_extractor_classes = $sortedClasses;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -81,7 +81,7 @@ abstract class FileTextExtractor
|
|||||||
*/
|
*/
|
||||||
public static function for_file($file)
|
public static function for_file($file)
|
||||||
{
|
{
|
||||||
if (!$file || (is_string($file) && !file_exists($file))) {
|
if (!$file || (is_string($file) && !file_exists($file ?? ''))) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -137,11 +137,11 @@ abstract class FileTextExtractor
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Remove any existing temp files with this name
|
// Remove any existing temp files with this name
|
||||||
if (file_exists($path)) {
|
if (file_exists($path ?? '')) {
|
||||||
unlink($path);
|
unlink($path ?? '');
|
||||||
}
|
}
|
||||||
|
|
||||||
$bytesWritten = file_put_contents($path, $file->getStream());
|
$bytesWritten = file_put_contents($path ?? '', $file->getStream());
|
||||||
if (false === $bytesWritten) {
|
if (false === $bytesWritten) {
|
||||||
throw new Exception(static::class . '->getPathFromFile() failed to write temporary file');
|
throw new Exception(static::class . '->getPathFromFile() failed to write temporary file');
|
||||||
}
|
}
|
||||||
|
@ -34,7 +34,7 @@ class HTMLTextExtractor extends FileTextExtractor
|
|||||||
*/
|
*/
|
||||||
public function supportsExtension($extension)
|
public function supportsExtension($extension)
|
||||||
{
|
{
|
||||||
return in_array(strtolower($extension), ["html", "htm", "xhtml"]);
|
return in_array(strtolower($extension ?? ''), ["html", "htm", "xhtml"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -43,7 +43,7 @@ class HTMLTextExtractor extends FileTextExtractor
|
|||||||
*/
|
*/
|
||||||
public function supportsMime($mime)
|
public function supportsMime($mime)
|
||||||
{
|
{
|
||||||
return strtolower($mime) === 'text/html';
|
return strtolower($mime ?? '') === 'text/html';
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -56,7 +56,7 @@ class HTMLTextExtractor extends FileTextExtractor
|
|||||||
*/
|
*/
|
||||||
public function getContent($file)
|
public function getContent($file)
|
||||||
{
|
{
|
||||||
$content = $file instanceof File ? $file->getString() : file_get_contents($file);
|
$content = $file instanceof File ? $file->getString() : file_get_contents($file ?? '');
|
||||||
|
|
||||||
// Yes, yes, regex'ing HTML is evil.
|
// Yes, yes, regex'ing HTML is evil.
|
||||||
// Since we don't care about well-formedness or markup here, it does the job.
|
// Since we don't care about well-formedness or markup here, it does the job.
|
||||||
@ -82,9 +82,9 @@ class HTMLTextExtractor extends FileTextExtractor
|
|||||||
'@</?((frameset)|(frame)|(iframe))@iu',
|
'@</?((frameset)|(frame)|(iframe))@iu',
|
||||||
],
|
],
|
||||||
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0"],
|
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "$0", "$0", "$0", "$0", "$0", "$0", "$0", "$0"],
|
||||||
$content
|
$content ?? ''
|
||||||
);
|
);
|
||||||
|
|
||||||
return strip_tags($content);
|
return strip_tags($content ?? '');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -33,18 +33,18 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
public function isAvailable()
|
public function isAvailable()
|
||||||
{
|
{
|
||||||
$bin = $this->bin('pdftotext');
|
$bin = $this->bin('pdftotext');
|
||||||
return $bin && file_exists($bin) && is_executable($bin);
|
return $bin && file_exists($bin ?? '') && is_executable($bin ?? '');
|
||||||
}
|
}
|
||||||
|
|
||||||
public function supportsExtension($extension)
|
public function supportsExtension($extension)
|
||||||
{
|
{
|
||||||
return strtolower($extension) === 'pdf';
|
return strtolower($extension ?? '') === 'pdf';
|
||||||
}
|
}
|
||||||
|
|
||||||
public function supportsMime($mime)
|
public function supportsMime($mime)
|
||||||
{
|
{
|
||||||
return in_array(
|
return in_array(
|
||||||
strtolower($mime),
|
strtolower($mime ?? ''),
|
||||||
[
|
[
|
||||||
'application/pdf',
|
'application/pdf',
|
||||||
'application/x-pdf',
|
'application/x-pdf',
|
||||||
@ -72,7 +72,7 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
// Find program in each path
|
// Find program in each path
|
||||||
foreach ($locations as $location) {
|
foreach ($locations as $location) {
|
||||||
$path = "{$location}/{$program}";
|
$path = "{$location}/{$program}";
|
||||||
if (file_exists($path)) {
|
if (file_exists($path ?? '')) {
|
||||||
return $path;
|
return $path;
|
||||||
}
|
}
|
||||||
if (file_exists($path . '.exe')) {
|
if (file_exists($path . '.exe')) {
|
||||||
@ -86,7 +86,7 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
|
|
||||||
public function getContent($file)
|
public function getContent($file)
|
||||||
{
|
{
|
||||||
if (!$file || (is_string($file) && !file_exists($file))) {
|
if (!$file || (is_string($file) && !file_exists($file ?? ''))) {
|
||||||
// no file
|
// no file
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
@ -108,7 +108,7 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
}
|
}
|
||||||
|
|
||||||
$path = $file instanceof File ? $this->getPathFromFile($file) : $file;
|
$path = $file instanceof File ? $this->getPathFromFile($file) : $file;
|
||||||
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
|
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path ?? '')), $content, $err);
|
||||||
|
|
||||||
if ($err) {
|
if ($err) {
|
||||||
throw new Exception(sprintf(
|
throw new Exception(sprintf(
|
||||||
@ -141,6 +141,6 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
'st' => 'st'
|
'st' => 'st'
|
||||||
];
|
];
|
||||||
|
|
||||||
return str_replace(array_keys($mapping), array_values($mapping), $input);
|
return str_replace(array_keys($mapping ?? []), array_values($mapping ?? []), $input ?? '');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -80,7 +80,7 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
public function supportsExtension($extension)
|
public function supportsExtension($extension)
|
||||||
{
|
{
|
||||||
return in_array(
|
return in_array(
|
||||||
strtolower($extension),
|
strtolower($extension ?? ''),
|
||||||
[
|
[
|
||||||
'pdf', 'doc', 'docx', 'xls', 'xlsx',
|
'pdf', 'doc', 'docx', 'xls', 'xlsx',
|
||||||
'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
|
'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
|
||||||
@ -106,12 +106,12 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
*/
|
*/
|
||||||
public function getContent($file)
|
public function getContent($file)
|
||||||
{
|
{
|
||||||
if (!$file || (is_string($file) && !file_exists($file))) {
|
if (!$file || (is_string($file) && !file_exists($file ?? ''))) {
|
||||||
// no file
|
// no file
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
|
|
||||||
$fileName = $file instanceof File ? $file->getFilename() : basename($file);
|
$fileName = $file instanceof File ? $file->getFilename() : basename($file ?? '');
|
||||||
$client = $this->getHttpClient();
|
$client = $this->getHttpClient();
|
||||||
|
|
||||||
// Get and validate base URL
|
// Get and validate base URL
|
||||||
@ -121,7 +121,7 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
$stream = $file instanceof File ? $file->getStream() : fopen($file, 'r');
|
$stream = $file instanceof File ? $file->getStream() : fopen($file ?? '', 'r');
|
||||||
/** @var Response $response */
|
/** @var Response $response */
|
||||||
$response = $client
|
$response = $client
|
||||||
->post($baseUrl, [
|
->post($baseUrl, [
|
||||||
@ -154,7 +154,7 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
$matches = [];
|
$matches = [];
|
||||||
// Use preg match to avoid SimpleXML running out of memory on large text nodes
|
// Use preg match to avoid SimpleXML running out of memory on large text nodes
|
||||||
preg_match(
|
preg_match(
|
||||||
sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName)),
|
sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName ?? '')),
|
||||||
(string)$response->getBody(),
|
(string)$response->getBody(),
|
||||||
$matches
|
$matches
|
||||||
);
|
);
|
||||||
|
@ -86,7 +86,7 @@ class TikaServerTextExtractor extends FileTextExtractor
|
|||||||
{
|
{
|
||||||
return $this->getServerEndpoint()
|
return $this->getServerEndpoint()
|
||||||
&& $this->getClient()->isAvailable()
|
&& $this->getClient()->isAvailable()
|
||||||
&& version_compare($this->getVersion(), '1.7') >= 0;
|
&& version_compare($this->getVersion() ?? '', '1.7') >= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -116,7 +116,7 @@ class TikaServerTextExtractor extends FileTextExtractor
|
|||||||
|
|
||||||
// Check aliases
|
// Check aliases
|
||||||
foreach ($this->supportedMimes as $info) {
|
foreach ($this->supportedMimes as $info) {
|
||||||
if (isset($info['alias']) && in_array($mime, $info['alias'])) {
|
if (isset($info['alias']) && in_array($mime, $info['alias'] ?? [])) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -130,7 +130,7 @@ class TikaServerTextExtractor extends FileTextExtractor
|
|||||||
$content = $this->getClient()->tika($tempFile);
|
$content = $this->getClient()->tika($tempFile);
|
||||||
//Cleanup temp file
|
//Cleanup temp file
|
||||||
if ($file instanceof File) {
|
if ($file instanceof File) {
|
||||||
unlink($tempFile);
|
unlink($tempFile ?? '');
|
||||||
}
|
}
|
||||||
return $content;
|
return $content;
|
||||||
}
|
}
|
||||||
|
@ -29,7 +29,7 @@ class TikaTextExtractor extends FileTextExtractor
|
|||||||
$code = $this->runShell('tika --version', $stdout);
|
$code = $this->runShell('tika --version', $stdout);
|
||||||
|
|
||||||
// Parse output
|
// Parse output
|
||||||
if (!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout, $matches)) {
|
if (!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', $stdout ?? '', $matches)) {
|
||||||
return $matches['version'];
|
return $matches['version'];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -54,14 +54,14 @@ class TikaTextExtractor extends FileTextExtractor
|
|||||||
];
|
];
|
||||||
// Invoke command
|
// Invoke command
|
||||||
$pipes = [];
|
$pipes = [];
|
||||||
$proc = proc_open($command, $descriptorSpecs, $pipes);
|
$proc = proc_open($command ?? '', $descriptorSpecs ?? [], $pipes);
|
||||||
|
|
||||||
if (!is_resource($proc)) {
|
if (!is_resource($proc)) {
|
||||||
return 255;
|
return 255;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Send content as input
|
// Send content as input
|
||||||
fwrite($pipes[0], $input);
|
fwrite($pipes[0], $input ?? '');
|
||||||
fclose($pipes[0]);
|
fclose($pipes[0]);
|
||||||
|
|
||||||
// Get output
|
// Get output
|
||||||
@ -78,11 +78,11 @@ class TikaTextExtractor extends FileTextExtractor
|
|||||||
{
|
{
|
||||||
$mode = $this->config()->get('output_mode');
|
$mode = $this->config()->get('output_mode');
|
||||||
$path = $file instanceof File ? $this->getPathFromFile($file) : $file;
|
$path = $file instanceof File ? $this->getPathFromFile($file) : $file;
|
||||||
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
|
$command = sprintf('tika %s %s', $mode, escapeshellarg($path ?? ''));
|
||||||
$code = $this->runShell($command, $output);
|
$code = $this->runShell($command, $output);
|
||||||
//Cleanup temp file
|
//Cleanup temp file
|
||||||
if ($file instanceof File) {
|
if ($file instanceof File) {
|
||||||
unlink($path);
|
unlink($path ?? '');
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($code == 0) {
|
if ($code == 0) {
|
||||||
@ -123,8 +123,8 @@ class TikaTextExtractor extends FileTextExtractor
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Check if the mime type is inside the result
|
// Check if the mime type is inside the result
|
||||||
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime, '/'));
|
$pattern = sprintf('/\b(%s)\b/', preg_quote($mime ?? '', '/'));
|
||||||
|
|
||||||
return (bool)preg_match($pattern, $supportedTypes);
|
return (bool)preg_match($pattern ?? '', $supportedTypes ?? '');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -80,7 +80,7 @@ class TikaRestClient extends Client
|
|||||||
|
|
||||||
// Parse output
|
// Parse output
|
||||||
if ($response->getStatusCode() == 200
|
if ($response->getStatusCode() == 200
|
||||||
&& preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)
|
&& preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody() ?? '', $matches)
|
||||||
) {
|
) {
|
||||||
$version = $matches['version'];
|
$version = $matches['version'];
|
||||||
}
|
}
|
||||||
@ -108,7 +108,7 @@ class TikaRestClient extends Client
|
|||||||
])
|
])
|
||||||
);
|
);
|
||||||
|
|
||||||
return $this->mimes = Convert::json2array($response->getBody());
|
return $this->mimes = json_decode($response->getBody(), true);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -129,7 +129,7 @@ class TikaRestClient extends Client
|
|||||||
'headers' => [
|
'headers' => [
|
||||||
'Accept' => 'text/plain',
|
'Accept' => 'text/plain',
|
||||||
],
|
],
|
||||||
'body' => file_get_contents($file),
|
'body' => file_get_contents($file ?? ''),
|
||||||
])
|
])
|
||||||
);
|
);
|
||||||
$text = $response->getBody();
|
$text = $response->getBody();
|
||||||
|
@ -18,7 +18,7 @@ class FileTextExtractableTest extends SapphireTest
|
|||||||
],
|
],
|
||||||
];
|
];
|
||||||
|
|
||||||
protected function setUp()
|
protected function setUp(): void
|
||||||
{
|
{
|
||||||
parent::setUp();
|
parent::setUp();
|
||||||
|
|
||||||
@ -33,7 +33,7 @@ class FileTextExtractableTest extends SapphireTest
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected function tearDown()
|
protected function tearDown(): void
|
||||||
{
|
{
|
||||||
if (file_exists(dirname(__FILE__) . '/fixtures/test1-copy.html')) {
|
if (file_exists(dirname(__FILE__) . '/fixtures/test1-copy.html')) {
|
||||||
unlink(dirname(__FILE__) . '/fixtures/test1-copy.html');
|
unlink(dirname(__FILE__) . '/fixtures/test1-copy.html');
|
||||||
@ -53,8 +53,8 @@ class FileTextExtractableTest extends SapphireTest
|
|||||||
|
|
||||||
$content = $file->extractFileAsText();
|
$content = $file->extractFileAsText();
|
||||||
$this->assertNotNull($content);
|
$this->assertNotNull($content);
|
||||||
$this->assertContains('Test Headline', $content);
|
$this->assertStringContainsString('Test Headline', $content);
|
||||||
$this->assertContains('Test Text', $content);
|
$this->assertStringContainsString('Test Text', $content);
|
||||||
$this->assertEquals($content, $file->FileContentCache);
|
$this->assertEquals($content, $file->FileContentCache);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -11,7 +11,7 @@ class HTMLTextExtractorTest extends SapphireTest
|
|||||||
{
|
{
|
||||||
protected $usesDatabase = true;
|
protected $usesDatabase = true;
|
||||||
|
|
||||||
protected function setUp()
|
protected function setUp(): void
|
||||||
{
|
{
|
||||||
parent::setUp();
|
parent::setUp();
|
||||||
|
|
||||||
@ -28,9 +28,9 @@ class HTMLTextExtractorTest extends SapphireTest
|
|||||||
|
|
||||||
$content = $extractor->getContent($file);
|
$content = $extractor->getContent($file);
|
||||||
|
|
||||||
$this->assertContains('Test Headline', $content);
|
$this->assertStringContainsString('Test Headline', $content);
|
||||||
$this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
|
$this->assertStringNotContainsString('Test Comment', $content, 'Strips HTML comments');
|
||||||
$this->assertNotContains('Test Style', $content, 'Strips non-content style tags');
|
$this->assertStringNotContainsString('Test Style', $content, 'Strips non-content style tags');
|
||||||
$this->assertNotContains('Test Script', $content, 'Strips non-content script tags');
|
$this->assertStringNotContainsString('Test Script', $content, 'Strips non-content script tags');
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -24,6 +24,6 @@ class PDFTextExtractorTest extends SapphireTest
|
|||||||
$file->write();
|
$file->write();
|
||||||
|
|
||||||
$content = $extractor->getContent($file);
|
$content = $extractor->getContent($file);
|
||||||
$this->assertContains('This is a test file with a link', $content);
|
$this->assertStringContainsString('This is a test file with a link', $content);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
namespace SilverStripe\TextExtraction\Tests;
|
namespace SilverStripe\TextExtraction\Tests;
|
||||||
|
|
||||||
use PHPUnit_Framework_MockObject_MockObject;
|
use PHPUnit\Framework\MockObject\MockObject;
|
||||||
use SilverStripe\Assets\File;
|
use SilverStripe\Assets\File;
|
||||||
use SilverStripe\Dev\SapphireTest;
|
use SilverStripe\Dev\SapphireTest;
|
||||||
use SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor;
|
use SilverStripe\TextExtraction\Extractor\TikaServerTextExtractor;
|
||||||
@ -28,7 +28,7 @@ class TikaServerTextExtractorTest extends SapphireTest
|
|||||||
$file->write();
|
$file->write();
|
||||||
|
|
||||||
$content = $extractor->getContent($file);
|
$content = $extractor->getContent($file);
|
||||||
$this->assertContains('This is a test file with a link', $content);
|
$this->assertStringContainsString('This is a test file with a link', $content);
|
||||||
|
|
||||||
// Check mime validation
|
// Check mime validation
|
||||||
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
||||||
@ -43,7 +43,7 @@ class TikaServerTextExtractorTest extends SapphireTest
|
|||||||
*/
|
*/
|
||||||
public function testIsAvailable($version, $expected)
|
public function testIsAvailable($version, $expected)
|
||||||
{
|
{
|
||||||
/** @var PHPUnit_Framework_MockObject_MockObject|TikaServerTextExtractor $extractor */
|
/** @var MockObject|TikaServerTextExtractor $extractor */
|
||||||
$extractor = $this->getMockBuilder(TikaServerTextExtractor::class)
|
$extractor = $this->getMockBuilder(TikaServerTextExtractor::class)
|
||||||
->setMethods(['getClient', 'getServerEndpoint'])
|
->setMethods(['getClient', 'getServerEndpoint'])
|
||||||
->getMock();
|
->getMock();
|
||||||
|
@ -28,7 +28,7 @@ class TikaTextExtractorTest extends SapphireTest
|
|||||||
$file->write();
|
$file->write();
|
||||||
|
|
||||||
$content = $extractor->getContent($file);
|
$content = $extractor->getContent($file);
|
||||||
$this->assertContains('This is a test file with a link', $content);
|
$this->assertStringContainsString('This is a test file with a link', $content);
|
||||||
|
|
||||||
// Check mime validation
|
// Check mime validation
|
||||||
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
$this->assertTrue($extractor->supportsMime('application/pdf'));
|
||||||
|
Loading…
Reference in New Issue
Block a user