FIX Update Guzzle implementations in Tika extractors

This commit is contained in:
Robbie Averill 2018-07-06 10:26:54 +12:00
parent b20738573f
commit 231a2091af
5 changed files with 61 additions and 25 deletions

View File

@ -36,7 +36,7 @@ SilverStripe\TextExtraction\Cache\FileTextCache\Database:
## XPDF
PDFs require special handling, for example through the [XPDF](http://www.foolabs.com/xpdf/)
PDFs require special handling, for example through the [XPDF](http://www.xpdfreader.com/)
command-line utility. Follow its installation instructions; its presence will be detected automatically
on \*nix operating systems. You can optionally set the binary path (required for Windows) in `mysite/_config/config.yml`:
@ -101,6 +101,10 @@ class MySolrIndex extends SolrIndex
}
```
Extractors return content with a newline character at the end of each extracted line. If you want
to use this content in HTML, consider wrapping the result in an `nl2br()` call before using it in your
code.
Note: This isn't a very efficient way to process a large number of files, since
each HTTP request is run synchronously.

View File

@ -76,15 +76,23 @@ abstract class FileTextExtractor
/**
* Given a File object, decide which extractor instance to use to handle it
*
* @param File $file
* @param File|string $file
* @return FileTextExtractor|null
*/
public static function for_file(File $file)
public static function for_file($file)
{
if (!$file) {
if (!$file || (is_string($file) && !file_exists($file))) {
return null;
}
// Ensure we have a File instance to work with
if (is_string($file)) {
/** @var File $fileObject */
$fileObject = File::create();
$fileObject->setFromLocalFile($file);
$file = $fileObject;
}
$extension = $file->getExtension();
$mime = $file->getMimeType();
@ -116,7 +124,7 @@ abstract class FileTextExtractor
* @return string
* @throws Exception
*/
protected function getPathFromFile(File $file)
protected static function getPathFromFile(File $file)
{
$path = tempnam(TEMP_PATH, 'pdftextextractor_');
if (false === $path) {

View File

@ -106,7 +106,7 @@ class TikaServerTextExtractor extends FileTextExtractor
public function supportsMime($mime)
{
if (!$this->supportedMimes) {
$this->supportedMimes = $this->getClient()->getSupportedMimes();
$this->supportedMimes = (array) $this->getClient()->getSupportedMimes();
}
// Check if supported (most common / quickest lookup)

View File

@ -4,7 +4,9 @@ namespace SilverStripe\TextExtraction\Rest;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Psr7\Response;
use Psr\Log\LoggerInterface;
use SilverStripe\Core\Convert;
use SilverStripe\Core\Environment;
use SilverStripe\Core\Injector\Injector;
@ -38,6 +40,8 @@ class TikaRestClient extends Client
];
}
$config['base_uri'] = $baseUrl;
parent::__construct($config);
}
@ -49,11 +53,10 @@ class TikaRestClient extends Client
public function isAvailable()
{
try {
$result = $this->get(null);
$result->setAuth($this->options['username'], $this->options['password']);
$result->send();
/** @var Response $result */
$result = $this->get('/', $this->getGuzzleOptions());
if ($result->getResponse()->getStatusCode() == 200) {
if ($result->getStatusCode() == 200) {
return true;
}
} catch (RequestException $ex) {
@ -71,14 +74,13 @@ class TikaRestClient extends Client
*/
public function getVersion()
{
$response = $this->get('version');
$response->setAuth($this->options['username'], $this->options['password']);
$response->send();
/** @var Response $response */
$response = $this->get('version', $this->getGuzzleOptions());
$version = 0.0;
// Parse output
if ($response->getResponse()->getStatusCode() == 200 &&
preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getResponse()->getBody(), $matches)
if ($response->getStatusCode() == 200
&& preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)
) {
$version = (float)$matches['version'];
}
@ -99,12 +101,14 @@ class TikaRestClient extends Client
$response = $this->get(
'mime-types',
array('Accept' => 'application/json')
$this->getGuzzleOptions([
'headers' => [
'Accept' => 'application/json',
],
])
);
$response->setAuth($this->options['username'], $this->options['password']);
$response->send();
return $this->mimes = $response->getResponse()->json();
return $this->mimes = Convert::json2array($response->getBody());
}
/**
@ -118,14 +122,17 @@ class TikaRestClient extends Client
{
$text = null;
try {
/** @var Response $response */
$response = $this->put(
'tika',
['Accept' => 'text/plain'],
file_get_contents($file)
$this->getGuzzleOptions([
'headers' => [
'Accept' => 'text/plain',
],
'body' => file_get_contents($file),
])
);
$response->setAuth($this->options['username'], $this->options['password']);
$response->send();
$text = $response->getResponse()->getBody(true);
$text = $response->getBody();
} catch (RequestException $e) {
$msg = sprintf(
'TikaRestClient was not able to process %s. Response: %s %s.',
@ -134,7 +141,7 @@ class TikaRestClient extends Client
$e->getResponse()->getReasonPhrase()
);
// Only available if tika-server was started with --includeStack
$body = $e->getResponse()->getBody(true);
$body = $e->getResponse()->getBody();
if ($body) {
$msg .= ' Body: ' . $body;
}
@ -144,4 +151,21 @@ class TikaRestClient extends Client
return $text;
}
/**
 * Assembles an array of request options to pass to Guzzle.
 *
 * When both a username and a password are present in $this->options, they are added to the
 * returned array under the "auth" key (replacing any "auth" entry supplied by the caller).
 * Otherwise the given options are returned unchanged.
 *
 * Credentials are compared against the empty string rather than tested with empty(), because
 * empty() also rejects the string "0", which is a valid username or password.
 *
 * @param array $options Request options to extend; authentication (etc) will be merged into this array
 * @return array The request options, including "auth" when credentials are configured
 */
protected function getGuzzleOptions($options = [])
{
    // A credential counts as configured when it is a scalar with a non-empty string form.
    // This excludes null, false and '' but keeps values such as '0'.
    $isConfigured = function ($value) {
        return is_scalar($value) && (string) $value !== '';
    };

    $username = isset($this->options['username']) ? $this->options['username'] : null;
    $password = isset($this->options['password']) ? $this->options['password'] : null;

    if ($isConfigured($username) && $isConfigured($password)) {
        $options['auth'] = [
            $username,
            $password,
        ];
    }

    return $options;
}
}