FIX Update Guzzle implementations in Tika extractors

This commit is contained in:
Robbie Averill 2018-07-06 10:26:54 +12:00
parent b20738573f
commit 231a2091af
5 changed files with 61 additions and 25 deletions

View File

@ -36,7 +36,7 @@ SilverStripe\TextExtraction\Cache\FileTextCache\Database:
## XPDF ## XPDF
PDFs require special handling, for example through the [XPDF](http://www.foolabs.com/xpdf/) PDFs require special handling, for example through the [XPDF](http://www.xpdfreader.com/)
commandline utility. Follow their installation instructions, its presence will be automatically commandline utility. Follow their installation instructions, its presence will be automatically
detected for \*nix operating systems. You can optionally set the binary path (required for Windows) in `mysite/_config/config.yml`: detected for \*nix operating systems. You can optionally set the binary path (required for Windows) in `mysite/_config/config.yml`:
@ -101,6 +101,10 @@ class MySolrIndex extends SolrIndex
} }
``` ```
Extractors return content with a newline character at the end of each extracted line. If you want
to use this in HTML content, consider wrapping the result in a `nl2br()` call before using it in your
code.
Note: This isn't a terribly efficient way to process large amounts of files, since Note: This isn't a terribly efficient way to process large amounts of files, since
each HTTP request is run synchronously. each HTTP request is run synchronously.

View File

@ -76,15 +76,23 @@ abstract class FileTextExtractor
/** /**
* Given a File object, decide which extractor instance to use to handle it * Given a File object, decide which extractor instance to use to handle it
* *
* @param File $file * @param File|string $file
* @return FileTextExtractor|null * @return FileTextExtractor|null
*/ */
public static function for_file(File $file) public static function for_file($file)
{ {
if (!$file) { if (!$file || (is_string($file) && !file_exists($file))) {
return null; return null;
} }
// Ensure we have a File instance to work with
if (is_string($file)) {
/** @var File $fileObject */
$fileObject = File::create();
$fileObject->setFromLocalFile($file);
$file = $fileObject;
}
$extension = $file->getExtension(); $extension = $file->getExtension();
$mime = $file->getMimeType(); $mime = $file->getMimeType();
@ -116,7 +124,7 @@ abstract class FileTextExtractor
* @return string * @return string
* @throws Exception * @throws Exception
*/ */
protected function getPathFromFile(File $file) protected static function getPathFromFile(File $file)
{ {
$path = tempnam(TEMP_PATH, 'pdftextextractor_'); $path = tempnam(TEMP_PATH, 'pdftextextractor_');
if (false === $path) { if (false === $path) {

View File

@ -106,7 +106,7 @@ class TikaServerTextExtractor extends FileTextExtractor
public function supportsMime($mime) public function supportsMime($mime)
{ {
if (!$this->supportedMimes) { if (!$this->supportedMimes) {
$this->supportedMimes = $this->getClient()->getSupportedMimes(); $this->supportedMimes = (array) $this->getClient()->getSupportedMimes();
} }
// Check if supported (most common / quickest lookup) // Check if supported (most common / quickest lookup)

View File

@ -4,7 +4,9 @@ namespace SilverStripe\TextExtraction\Rest;
use GuzzleHttp\Client; use GuzzleHttp\Client;
use GuzzleHttp\Exception\RequestException; use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Psr7\Response;
use Psr\Log\LoggerInterface; use Psr\Log\LoggerInterface;
use SilverStripe\Core\Convert;
use SilverStripe\Core\Environment; use SilverStripe\Core\Environment;
use SilverStripe\Core\Injector\Injector; use SilverStripe\Core\Injector\Injector;
@ -38,6 +40,8 @@ class TikaRestClient extends Client
]; ];
} }
$config['base_uri'] = $baseUrl;
parent::__construct($config); parent::__construct($config);
} }
@ -49,11 +53,10 @@ class TikaRestClient extends Client
public function isAvailable() public function isAvailable()
{ {
try { try {
$result = $this->get(null); /** @var Response $result */
$result->setAuth($this->options['username'], $this->options['password']); $result = $this->get('/', $this->getGuzzleOptions());
$result->send();
if ($result->getResponse()->getStatusCode() == 200) { if ($result->getStatusCode() == 200) {
return true; return true;
} }
} catch (RequestException $ex) { } catch (RequestException $ex) {
@ -71,14 +74,13 @@ class TikaRestClient extends Client
*/ */
public function getVersion() public function getVersion()
{ {
$response = $this->get('version'); /** @var Response $response */
$response->setAuth($this->options['username'], $this->options['password']); $response = $this->get('version', $this->getGuzzleOptions());
$response->send();
$version = 0.0; $version = 0.0;
// Parse output // Parse output
if ($response->getResponse()->getStatusCode() == 200 && if ($response->getStatusCode() == 200
preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getResponse()->getBody(), $matches) && preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)
) { ) {
$version = (float)$matches['version']; $version = (float)$matches['version'];
} }
@ -99,12 +101,14 @@ class TikaRestClient extends Client
$response = $this->get( $response = $this->get(
'mime-types', 'mime-types',
array('Accept' => 'application/json') $this->getGuzzleOptions([
'headers' => [
'Accept' => 'application/json',
],
])
); );
$response->setAuth($this->options['username'], $this->options['password']);
$response->send();
return $this->mimes = $response->getResponse()->json(); return $this->mimes = Convert::json2array($response->getBody());
} }
/** /**
@ -118,14 +122,17 @@ class TikaRestClient extends Client
{ {
$text = null; $text = null;
try { try {
/** @var Response $response */
$response = $this->put( $response = $this->put(
'tika', 'tika',
['Accept' => 'text/plain'], $this->getGuzzleOptions([
file_get_contents($file) 'headers' => [
'Accept' => 'text/plain',
],
'body' => file_get_contents($file),
])
); );
$response->setAuth($this->options['username'], $this->options['password']); $text = $response->getBody();
$response->send();
$text = $response->getResponse()->getBody(true);
} catch (RequestException $e) { } catch (RequestException $e) {
$msg = sprintf( $msg = sprintf(
'TikaRestClient was not able to process %s. Response: %s %s.', 'TikaRestClient was not able to process %s. Response: %s %s.',
@ -134,7 +141,7 @@ class TikaRestClient extends Client
$e->getResponse()->getReasonPhrase() $e->getResponse()->getReasonPhrase()
); );
// Only available if tika-server was started with --includeStack // Only available if tika-server was started with --includeStack
$body = $e->getResponse()->getBody(true); $body = $e->getResponse()->getBody();
if ($body) { if ($body) {
$msg .= ' Body: ' . $body; $msg .= ' Body: ' . $body;
} }
@ -144,4 +151,21 @@ class TikaRestClient extends Client
return $text; return $text;
} }
/**
 * Builds the request option array handed to Guzzle for a single call.
 *
 * Authentication credentials are added under the 'auth' key only when both
 * $this->options['username'] and $this->options['password'] are non-empty;
 * otherwise the supplied options are returned untouched.
 *
 * @param array $options Request options to extend (e.g. headers, body)
 * @return array The given options, plus 'auth' when credentials are configured
 */
protected function getGuzzleOptions($options = [])
{
    // Without a complete username/password pair there is nothing to add
    if (empty($this->options['username']) || empty($this->options['password'])) {
        return $options;
    }

    $options['auth'] = [$this->options['username'], $this->options['password']];

    return $options;
}
} }