Merge pull request #47 from creative-commoners/pulls/3.0/fix-extractors

FIX Update Guzzle implementations in extractors to ensure they're working
This commit is contained in:
Dylan Wagstaff 2018-07-09 09:57:17 +12:00 committed by GitHub
commit 03d1fef4ae
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 74 additions and 38 deletions

View File

@ -36,7 +36,7 @@ SilverStripe\TextExtraction\Cache\FileTextCache\Database:
## XPDF
PDFs require special handling, for example through the [XPDF](http://www.foolabs.com/xpdf/)
PDFs require special handling, for example through the [XPDF](http://www.xpdfreader.com/)
command-line utility. Follow its installation instructions; its presence will be detected automatically
on \*nix operating systems. You can optionally set the binary path (required on Windows) in `mysite/_config/config.yml`:
@ -101,6 +101,10 @@ class MySolrIndex extends SolrIndex
}
```
Extractors return content with a newline character at the end of each extracted line. If you want
to use the result in HTML content, it may be worth wrapping it in a `nl2br()` call before using it in your
code.
Note: This isn't a terribly efficient way to process large numbers of files, since
each HTTP request is run synchronously.

View File

@ -76,15 +76,23 @@ abstract class FileTextExtractor
/**
* Given a File object, decide which extractor instance to use to handle it
*
* @param File $file
* @param File|string $file
* @return FileTextExtractor|null
*/
public static function for_file(File $file)
public static function for_file($file)
{
if (!$file) {
if (!$file || (is_string($file) && !file_exists($file))) {
return null;
}
// Ensure we have a File instance to work with
if (is_string($file)) {
/** @var File $fileObject */
$fileObject = File::create();
$fileObject->setFromLocalFile($file);
$file = $fileObject;
}
$extension = $file->getExtension();
$mime = $file->getMimeType();
@ -116,7 +124,7 @@ abstract class FileTextExtractor
* @return string
* @throws Exception
*/
protected function getPathFromFile(File $file)
protected static function getPathFromFile(File $file)
{
$path = tempnam(TEMP_PATH, 'pdftextextractor_');
if (false === $path) {

View File

@ -4,6 +4,7 @@ namespace SilverStripe\TextExtraction\Extractor;
use Exception;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Response;
use InvalidArgumentException;
use Psr\Log\LoggerInterface;
use SilverStripe\Assets\File;
@ -120,12 +121,16 @@ class SolrCellTextExtractor extends FileTextExtractor
}
try {
$path = $this->getPathFromFile($file);
$request = $client
->post($baseUrl)
->addPostFields(['extractOnly' => 'true', 'extractFormat' => 'text'])
->addPostFiles(['myfile' => $path]);
$response = $request->send();
$stream = $file instanceof File ? $file->getStream() : fopen($file, 'r');
/** @var Response $response */
$response = $client
->post($baseUrl, [
'multipart' => [
['name' => 'extractOnly', 'contents' => 'true'],
['name' => 'extractFormat', 'contents' => 'text'],
['name' => 'myfile', 'contents' => $stream],
]
]);
} catch (InvalidArgumentException $e) {
$msg = sprintf(
'Error extracting text from "%s" (message: %s)',
@ -133,25 +138,20 @@ class SolrCellTextExtractor extends FileTextExtractor
$e->getMessage()
);
Injector::inst()->get(LoggerInterface::class)->notice($msg);
return null;
} catch (Exception $e) {
// Catch other errors that Tika can throw vai Guzzle but are not caught and break Solr search
// Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search
// query in some cases.
$msg = sprintf(
'Tika server error attempting to extract from "%s" (message: %s)',
$path,
$fileName,
$e->getMessage()
);
Injector::inst()->get(LoggerInterface::class)->notice($msg);
return null;
}
// Just initialise it, it doesn't take much.
$matches = [];
// Use preg match to avoid SimpleXML running out of memory on large text nodes
preg_match(
sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName)),

View File

@ -106,7 +106,7 @@ class TikaServerTextExtractor extends FileTextExtractor
public function supportsMime($mime)
{
if (!$this->supportedMimes) {
$this->supportedMimes = $this->getClient()->getSupportedMimes();
$this->supportedMimes = (array) $this->getClient()->getSupportedMimes();
}
// Check if supported (most common / quickest lookup)

View File

@ -4,7 +4,9 @@ namespace SilverStripe\TextExtraction\Rest;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Psr7\Response;
use Psr\Log\LoggerInterface;
use SilverStripe\Core\Convert;
use SilverStripe\Core\Environment;
use SilverStripe\Core\Injector\Injector;
@ -38,6 +40,8 @@ class TikaRestClient extends Client
];
}
$config['base_uri'] = $baseUrl;
parent::__construct($config);
}
@ -49,11 +53,10 @@ class TikaRestClient extends Client
public function isAvailable()
{
try {
$result = $this->get(null);
$result->setAuth($this->options['username'], $this->options['password']);
$result->send();
/** @var Response $result */
$result = $this->get('/', $this->getGuzzleOptions());
if ($result->getResponse()->getStatusCode() == 200) {
if ($result->getStatusCode() == 200) {
return true;
}
} catch (RequestException $ex) {
@ -71,14 +74,13 @@ class TikaRestClient extends Client
*/
public function getVersion()
{
$response = $this->get('version');
$response->setAuth($this->options['username'], $this->options['password']);
$response->send();
/** @var Response $response */
$response = $this->get('version', $this->getGuzzleOptions());
$version = 0.0;
// Parse output
if ($response->getResponse()->getStatusCode() == 200 &&
preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getResponse()->getBody(), $matches)
if ($response->getStatusCode() == 200
&& preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)
) {
$version = (float)$matches['version'];
}
@ -99,12 +101,14 @@ class TikaRestClient extends Client
$response = $this->get(
'mime-types',
array('Accept' => 'application/json')
$this->getGuzzleOptions([
'headers' => [
'Accept' => 'application/json',
],
])
);
$response->setAuth($this->options['username'], $this->options['password']);
$response->send();
return $this->mimes = $response->getResponse()->json();
return $this->mimes = Convert::json2array($response->getBody());
}
/**
@ -118,14 +122,17 @@ class TikaRestClient extends Client
{
$text = null;
try {
/** @var Response $response */
$response = $this->put(
'tika',
['Accept' => 'text/plain'],
file_get_contents($file)
$this->getGuzzleOptions([
'headers' => [
'Accept' => 'text/plain',
],
'body' => file_get_contents($file),
])
);
$response->setAuth($this->options['username'], $this->options['password']);
$response->send();
$text = $response->getResponse()->getBody(true);
$text = $response->getBody();
} catch (RequestException $e) {
$msg = sprintf(
'TikaRestClient was not able to process %s. Response: %s %s.',
@ -134,7 +141,7 @@ class TikaRestClient extends Client
$e->getResponse()->getReasonPhrase()
);
// Only available if tika-server was started with --includeStack
$body = $e->getResponse()->getBody(true);
$body = $e->getResponse()->getBody();
if ($body) {
$msg .= ' Body: ' . $body;
}
@ -144,4 +151,21 @@ class TikaRestClient extends Client
return $text;
}
/**
 * Builds the request options array that is handed to Guzzle for a single request.
 *
 * When both a username and a password are present (and non-empty) in $this->options,
 * they are added to the result under the "auth" key as a [username, password] pair.
 * Otherwise the given options are returned untouched.
 *
 * @param array $options Caller-supplied request options (headers, body, ...) to extend
 * @return array The same options, with the "auth" entry added when credentials are configured
 */
protected function getGuzzleOptions($options = [])
{
    // Guard clause: without a complete set of credentials there is nothing to add
    if (empty($this->options['username']) || empty($this->options['password'])) {
        return $options;
    }

    $options['auth'] = [$this->options['username'], $this->options['password']];

    return $options;
}
}