mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 09:06:00 +00:00
FIX Update Guzzle implementations in Tika extractors
This commit is contained in:
parent
b20738573f
commit
231a2091af
@ -36,7 +36,7 @@ SilverStripe\TextExtraction\Cache\FileTextCache\Database:
|
|||||||
|
|
||||||
## XPDF
|
## XPDF
|
||||||
|
|
||||||
PDFs require special handling, for example through the [XPDF](http://www.foolabs.com/xpdf/)
|
PDFs require special handling, for example through the [XPDF](http://www.xpdfreader.com/)
|
||||||
command-line utility. Follow their installation instructions; its presence will be automatically
|
command-line utility. Follow their installation instructions; its presence will be automatically
|
||||||
detected for \*nix operating systems. You can optionally set the binary path (required for Windows) in `mysite/_config/config.yml`:
|
detected for \*nix operating systems. You can optionally set the binary path (required for Windows) in `mysite/_config/config.yml`:
|
||||||
|
|
||||||
@ -101,6 +101,10 @@ class MySolrIndex extends SolrIndex
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Extractors return content with a newline character at the end of each extracted line. If you want
|
||||||
|
to use this in HTML content, it may be worth wrapping the result in a `nl2br()` call before using it in your
|
||||||
|
code.
|
||||||
|
|
||||||
Note: This isn't a terribly efficient way to process large numbers of files, since
|
Note: This isn't a terribly efficient way to process large numbers of files, since
|
||||||
each HTTP request is run synchronously.
|
each HTTP request is run synchronously.
|
||||||
|
|
||||||
|
@ -76,15 +76,23 @@ abstract class FileTextExtractor
|
|||||||
/**
|
/**
|
||||||
* Given a File object, decide which extractor instance to use to handle it
|
* Given a File object, decide which extractor instance to use to handle it
|
||||||
*
|
*
|
||||||
* @param File $file
|
* @param File|string $file
|
||||||
* @return FileTextExtractor|null
|
* @return FileTextExtractor|null
|
||||||
*/
|
*/
|
||||||
public static function for_file(File $file)
|
public static function for_file($file)
|
||||||
{
|
{
|
||||||
if (!$file) {
|
if (!$file || (is_string($file) && !file_exists($file))) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Ensure we have a File instance to work with
|
||||||
|
if (is_string($file)) {
|
||||||
|
/** @var File $fileObject */
|
||||||
|
$fileObject = File::create();
|
||||||
|
$fileObject->setFromLocalFile($file);
|
||||||
|
$file = $fileObject;
|
||||||
|
}
|
||||||
|
|
||||||
$extension = $file->getExtension();
|
$extension = $file->getExtension();
|
||||||
$mime = $file->getMimeType();
|
$mime = $file->getMimeType();
|
||||||
|
|
||||||
@ -116,7 +124,7 @@ abstract class FileTextExtractor
|
|||||||
* @return string
|
* @return string
|
||||||
* @throws Exception
|
* @throws Exception
|
||||||
*/
|
*/
|
||||||
protected function getPathFromFile(File $file)
|
protected static function getPathFromFile(File $file)
|
||||||
{
|
{
|
||||||
$path = tempnam(TEMP_PATH, 'pdftextextractor_');
|
$path = tempnam(TEMP_PATH, 'pdftextextractor_');
|
||||||
if (false === $path) {
|
if (false === $path) {
|
||||||
|
@ -106,7 +106,7 @@ class TikaServerTextExtractor extends FileTextExtractor
|
|||||||
public function supportsMime($mime)
|
public function supportsMime($mime)
|
||||||
{
|
{
|
||||||
if (!$this->supportedMimes) {
|
if (!$this->supportedMimes) {
|
||||||
$this->supportedMimes = $this->getClient()->getSupportedMimes();
|
$this->supportedMimes = (array) $this->getClient()->getSupportedMimes();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if supported (most common / quickest lookup)
|
// Check if supported (most common / quickest lookup)
|
||||||
|
@ -4,7 +4,9 @@ namespace SilverStripe\TextExtraction\Rest;
|
|||||||
|
|
||||||
use GuzzleHttp\Client;
|
use GuzzleHttp\Client;
|
||||||
use GuzzleHttp\Exception\RequestException;
|
use GuzzleHttp\Exception\RequestException;
|
||||||
|
use GuzzleHttp\Psr7\Response;
|
||||||
use Psr\Log\LoggerInterface;
|
use Psr\Log\LoggerInterface;
|
||||||
|
use SilverStripe\Core\Convert;
|
||||||
use SilverStripe\Core\Environment;
|
use SilverStripe\Core\Environment;
|
||||||
use SilverStripe\Core\Injector\Injector;
|
use SilverStripe\Core\Injector\Injector;
|
||||||
|
|
||||||
@ -38,6 +40,8 @@ class TikaRestClient extends Client
|
|||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$config['base_uri'] = $baseUrl;
|
||||||
|
|
||||||
parent::__construct($config);
|
parent::__construct($config);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -49,11 +53,10 @@ class TikaRestClient extends Client
|
|||||||
public function isAvailable()
|
public function isAvailable()
|
||||||
{
|
{
|
||||||
try {
|
try {
|
||||||
$result = $this->get(null);
|
/** @var Response $result */
|
||||||
$result->setAuth($this->options['username'], $this->options['password']);
|
$result = $this->get('/', $this->getGuzzleOptions());
|
||||||
$result->send();
|
|
||||||
|
|
||||||
if ($result->getResponse()->getStatusCode() == 200) {
|
if ($result->getStatusCode() == 200) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} catch (RequestException $ex) {
|
} catch (RequestException $ex) {
|
||||||
@ -71,14 +74,13 @@ class TikaRestClient extends Client
|
|||||||
*/
|
*/
|
||||||
public function getVersion()
|
public function getVersion()
|
||||||
{
|
{
|
||||||
$response = $this->get('version');
|
/** @var Response $response */
|
||||||
$response->setAuth($this->options['username'], $this->options['password']);
|
$response = $this->get('version', $this->getGuzzleOptions());
|
||||||
$response->send();
|
|
||||||
$version = 0.0;
|
$version = 0.0;
|
||||||
|
|
||||||
// Parse output
|
// Parse output
|
||||||
if ($response->getResponse()->getStatusCode() == 200 &&
|
if ($response->getStatusCode() == 200
|
||||||
preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getResponse()->getBody(), $matches)
|
&& preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)
|
||||||
) {
|
) {
|
||||||
$version = (float)$matches['version'];
|
$version = (float)$matches['version'];
|
||||||
}
|
}
|
||||||
@ -99,12 +101,14 @@ class TikaRestClient extends Client
|
|||||||
|
|
||||||
$response = $this->get(
|
$response = $this->get(
|
||||||
'mime-types',
|
'mime-types',
|
||||||
array('Accept' => 'application/json')
|
$this->getGuzzleOptions([
|
||||||
|
'headers' => [
|
||||||
|
'Accept' => 'application/json',
|
||||||
|
],
|
||||||
|
])
|
||||||
);
|
);
|
||||||
$response->setAuth($this->options['username'], $this->options['password']);
|
|
||||||
$response->send();
|
|
||||||
|
|
||||||
return $this->mimes = $response->getResponse()->json();
|
return $this->mimes = Convert::json2array($response->getBody());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -118,14 +122,17 @@ class TikaRestClient extends Client
|
|||||||
{
|
{
|
||||||
$text = null;
|
$text = null;
|
||||||
try {
|
try {
|
||||||
|
/** @var Response $response */
|
||||||
$response = $this->put(
|
$response = $this->put(
|
||||||
'tika',
|
'tika',
|
||||||
['Accept' => 'text/plain'],
|
$this->getGuzzleOptions([
|
||||||
file_get_contents($file)
|
'headers' => [
|
||||||
|
'Accept' => 'text/plain',
|
||||||
|
],
|
||||||
|
'body' => file_get_contents($file),
|
||||||
|
])
|
||||||
);
|
);
|
||||||
$response->setAuth($this->options['username'], $this->options['password']);
|
$text = $response->getBody();
|
||||||
$response->send();
|
|
||||||
$text = $response->getResponse()->getBody(true);
|
|
||||||
} catch (RequestException $e) {
|
} catch (RequestException $e) {
|
||||||
$msg = sprintf(
|
$msg = sprintf(
|
||||||
'TikaRestClient was not able to process %s. Response: %s %s.',
|
'TikaRestClient was not able to process %s. Response: %s %s.',
|
||||||
@ -134,7 +141,7 @@ class TikaRestClient extends Client
|
|||||||
$e->getResponse()->getReasonPhrase()
|
$e->getResponse()->getReasonPhrase()
|
||||||
);
|
);
|
||||||
// Only available if tika-server was started with --includeStack
|
// Only available if tika-server was started with --includeStack
|
||||||
$body = $e->getResponse()->getBody(true);
|
$body = $e->getResponse()->getBody();
|
||||||
if ($body) {
|
if ($body) {
|
||||||
$msg .= ' Body: ' . $body;
|
$msg .= ' Body: ' . $body;
|
||||||
}
|
}
|
||||||
@ -144,4 +151,21 @@ class TikaRestClient extends Client
|
|||||||
|
|
||||||
return $text;
|
return $text;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Assembles an array of request options to pass to Guzzle
|
||||||
|
*
|
||||||
|
* @param array $options Authentication (etc) will be merged into this array and returned
|
||||||
|
* @return array
|
||||||
|
*/
|
||||||
|
protected function getGuzzleOptions($options = [])
|
||||||
|
{
|
||||||
|
if (!empty($this->options['username']) && !empty($this->options['password'])) {
|
||||||
|
$options['auth'] = [
|
||||||
|
$this->options['username'],
|
||||||
|
$this->options['password']
|
||||||
|
];
|
||||||
|
}
|
||||||
|
return $options;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user