mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 11:06:00 +02:00
Merge pull request #47 from creative-commoners/pulls/3.0/fix-extractors
FIX Update Guzzle implementations in extractors to ensure they're working
This commit is contained in:
commit
03d1fef4ae
@ -36,7 +36,7 @@ SilverStripe\TextExtraction\Cache\FileTextCache\Database:
|
||||
|
||||
## XPDF
|
||||
|
||||
PDFs require special handling, for example through the [XPDF](http://www.foolabs.com/xpdf/)
|
||||
PDFs require special handling, for example through the [XPDF](http://www.xpdfreader.com/)
|
||||
command-line utility. Follow its installation instructions; its presence will be automatically
|
||||
detected for \*nix operating systems. You can optionally set the binary path (required for Windows) in `mysite/_config/config.yml`:
|
||||
|
||||
@ -101,6 +101,10 @@ class MySolrIndex extends SolrIndex
|
||||
}
|
||||
```
|
||||
|
||||
Extractors return content with a newline character at the end of each extracted line. If you want
|
||||
this to be used in HTML content, it may be worth wrapping the result in a `nl2br()` call before using it in your
|
||||
code.
|
||||
|
||||
Note: This isn't a terribly efficient way to process large numbers of files, since
|
||||
each HTTP request is run synchronously.
|
||||
|
||||
|
@ -76,15 +76,23 @@ abstract class FileTextExtractor
|
||||
/**
|
||||
* Given a File object, decide which extractor instance to use to handle it
|
||||
*
|
||||
* @param File $file
|
||||
* @param File|string $file
|
||||
* @return FileTextExtractor|null
|
||||
*/
|
||||
public static function for_file(File $file)
|
||||
public static function for_file($file)
|
||||
{
|
||||
if (!$file) {
|
||||
if (!$file || (is_string($file) && !file_exists($file))) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Ensure we have a File instance to work with
|
||||
if (is_string($file)) {
|
||||
/** @var File $fileObject */
|
||||
$fileObject = File::create();
|
||||
$fileObject->setFromLocalFile($file);
|
||||
$file = $fileObject;
|
||||
}
|
||||
|
||||
$extension = $file->getExtension();
|
||||
$mime = $file->getMimeType();
|
||||
|
||||
@ -116,7 +124,7 @@ abstract class FileTextExtractor
|
||||
* @return string
|
||||
* @throws Exception
|
||||
*/
|
||||
protected function getPathFromFile(File $file)
|
||||
protected static function getPathFromFile(File $file)
|
||||
{
|
||||
$path = tempnam(TEMP_PATH, 'pdftextextractor_');
|
||||
if (false === $path) {
|
||||
|
@ -4,6 +4,7 @@ namespace SilverStripe\TextExtraction\Extractor;
|
||||
|
||||
use Exception;
|
||||
use GuzzleHttp\Client;
|
||||
use GuzzleHttp\Psr7\Response;
|
||||
use InvalidArgumentException;
|
||||
use Psr\Log\LoggerInterface;
|
||||
use SilverStripe\Assets\File;
|
||||
@ -120,12 +121,16 @@ class SolrCellTextExtractor extends FileTextExtractor
|
||||
}
|
||||
|
||||
try {
|
||||
$path = $this->getPathFromFile($file);
|
||||
$request = $client
|
||||
->post($baseUrl)
|
||||
->addPostFields(['extractOnly' => 'true', 'extractFormat' => 'text'])
|
||||
->addPostFiles(['myfile' => $path]);
|
||||
$response = $request->send();
|
||||
$stream = $file instanceof File ? $file->getStream() : fopen($file, 'r');
|
||||
/** @var Response $response */
|
||||
$response = $client
|
||||
->post($baseUrl, [
|
||||
'multipart' => [
|
||||
['name' => 'extractOnly', 'contents' => 'true'],
|
||||
['name' => 'extractFormat', 'contents' => 'text'],
|
||||
['name' => 'myfile', 'contents' => $stream],
|
||||
]
|
||||
]);
|
||||
} catch (InvalidArgumentException $e) {
|
||||
$msg = sprintf(
|
||||
'Error extracting text from "%s" (message: %s)',
|
||||
@ -133,25 +138,20 @@ class SolrCellTextExtractor extends FileTextExtractor
|
||||
$e->getMessage()
|
||||
);
|
||||
Injector::inst()->get(LoggerInterface::class)->notice($msg);
|
||||
|
||||
return null;
|
||||
} catch (Exception $e) {
|
||||
// Catch other errors that Tika can throw vai Guzzle but are not caught and break Solr search
|
||||
// Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search
|
||||
// query in some cases.
|
||||
$msg = sprintf(
|
||||
'Tika server error attempting to extract from "%s" (message: %s)',
|
||||
$path,
|
||||
$fileName,
|
||||
$e->getMessage()
|
||||
);
|
||||
|
||||
Injector::inst()->get(LoggerInterface::class)->notice($msg);
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
// Just initialise it, it doesn't take much.
|
||||
$matches = [];
|
||||
|
||||
// Use preg match to avoid SimpleXML running out of memory on large text nodes
|
||||
preg_match(
|
||||
sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName)),
|
||||
|
@ -106,7 +106,7 @@ class TikaServerTextExtractor extends FileTextExtractor
|
||||
public function supportsMime($mime)
|
||||
{
|
||||
if (!$this->supportedMimes) {
|
||||
$this->supportedMimes = $this->getClient()->getSupportedMimes();
|
||||
$this->supportedMimes = (array) $this->getClient()->getSupportedMimes();
|
||||
}
|
||||
|
||||
// Check if supported (most common / quickest lookup)
|
||||
|
@ -4,7 +4,9 @@ namespace SilverStripe\TextExtraction\Rest;
|
||||
|
||||
use GuzzleHttp\Client;
|
||||
use GuzzleHttp\Exception\RequestException;
|
||||
use GuzzleHttp\Psr7\Response;
|
||||
use Psr\Log\LoggerInterface;
|
||||
use SilverStripe\Core\Convert;
|
||||
use SilverStripe\Core\Environment;
|
||||
use SilverStripe\Core\Injector\Injector;
|
||||
|
||||
@ -38,6 +40,8 @@ class TikaRestClient extends Client
|
||||
];
|
||||
}
|
||||
|
||||
$config['base_uri'] = $baseUrl;
|
||||
|
||||
parent::__construct($config);
|
||||
}
|
||||
|
||||
@ -49,11 +53,10 @@ class TikaRestClient extends Client
|
||||
public function isAvailable()
|
||||
{
|
||||
try {
|
||||
$result = $this->get(null);
|
||||
$result->setAuth($this->options['username'], $this->options['password']);
|
||||
$result->send();
|
||||
/** @var Response $result */
|
||||
$result = $this->get('/', $this->getGuzzleOptions());
|
||||
|
||||
if ($result->getResponse()->getStatusCode() == 200) {
|
||||
if ($result->getStatusCode() == 200) {
|
||||
return true;
|
||||
}
|
||||
} catch (RequestException $ex) {
|
||||
@ -71,14 +74,13 @@ class TikaRestClient extends Client
|
||||
*/
|
||||
public function getVersion()
|
||||
{
|
||||
$response = $this->get('version');
|
||||
$response->setAuth($this->options['username'], $this->options['password']);
|
||||
$response->send();
|
||||
/** @var Response $response */
|
||||
$response = $this->get('version', $this->getGuzzleOptions());
|
||||
$version = 0.0;
|
||||
|
||||
// Parse output
|
||||
if ($response->getResponse()->getStatusCode() == 200 &&
|
||||
preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getResponse()->getBody(), $matches)
|
||||
if ($response->getStatusCode() == 200
|
||||
&& preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)
|
||||
) {
|
||||
$version = (float)$matches['version'];
|
||||
}
|
||||
@ -99,12 +101,14 @@ class TikaRestClient extends Client
|
||||
|
||||
$response = $this->get(
|
||||
'mime-types',
|
||||
array('Accept' => 'application/json')
|
||||
$this->getGuzzleOptions([
|
||||
'headers' => [
|
||||
'Accept' => 'application/json',
|
||||
],
|
||||
])
|
||||
);
|
||||
$response->setAuth($this->options['username'], $this->options['password']);
|
||||
$response->send();
|
||||
|
||||
return $this->mimes = $response->getResponse()->json();
|
||||
return $this->mimes = Convert::json2array($response->getBody());
|
||||
}
|
||||
|
||||
/**
|
||||
@ -118,14 +122,17 @@ class TikaRestClient extends Client
|
||||
{
|
||||
$text = null;
|
||||
try {
|
||||
/** @var Response $response */
|
||||
$response = $this->put(
|
||||
'tika',
|
||||
['Accept' => 'text/plain'],
|
||||
file_get_contents($file)
|
||||
$this->getGuzzleOptions([
|
||||
'headers' => [
|
||||
'Accept' => 'text/plain',
|
||||
],
|
||||
'body' => file_get_contents($file),
|
||||
])
|
||||
);
|
||||
$response->setAuth($this->options['username'], $this->options['password']);
|
||||
$response->send();
|
||||
$text = $response->getResponse()->getBody(true);
|
||||
$text = $response->getBody();
|
||||
} catch (RequestException $e) {
|
||||
$msg = sprintf(
|
||||
'TikaRestClient was not able to process %s. Response: %s %s.',
|
||||
@ -134,7 +141,7 @@ class TikaRestClient extends Client
|
||||
$e->getResponse()->getReasonPhrase()
|
||||
);
|
||||
// Only available if tika-server was started with --includeStack
|
||||
$body = $e->getResponse()->getBody(true);
|
||||
$body = $e->getResponse()->getBody();
|
||||
if ($body) {
|
||||
$msg .= ' Body: ' . $body;
|
||||
}
|
||||
@ -144,4 +151,21 @@ class TikaRestClient extends Client
|
||||
|
||||
return $text;
|
||||
}
|
||||
|
||||
/**
|
||||
* Assembles an array of request options to pass to Guzzle
|
||||
*
|
||||
* @param array $options Authentication (etc) will be merged into this array and returned
|
||||
* @return array
|
||||
*/
|
||||
protected function getGuzzleOptions($options = [])
{
    // Without both a username and a password there is nothing to add:
    // hand the caller's options back exactly as they were given.
    if (empty($this->options['username']) || empty($this->options['password'])) {
        return $options;
    }

    // Both credentials are configured, so attach them under Guzzle's
    // "auth" request option as a [username, password] pair.
    $options['auth'] = [$this->options['username'], $this->options['password']];

    return $options;
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user