mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 11:06:00 +02:00
FIX Update SolrCellTextExtractor to use a Guzzle 6 API implementation
This commit is contained in:
parent
231a2091af
commit
e1e7cdbfa4
@@ -4,6 +4,7 @@ namespace SilverStripe\TextExtraction\Extractor;
|
|||||||
|
|
||||||
use Exception;
|
use Exception;
|
||||||
use GuzzleHttp\Client;
|
use GuzzleHttp\Client;
|
||||||
|
use GuzzleHttp\Psr7\Response;
|
||||||
use InvalidArgumentException;
|
use InvalidArgumentException;
|
||||||
use Psr\Log\LoggerInterface;
|
use Psr\Log\LoggerInterface;
|
||||||
use SilverStripe\Assets\File;
|
use SilverStripe\Assets\File;
|
||||||
@@ -120,12 +121,16 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
$path = $this->getPathFromFile($file);
|
$stream = $file instanceof File ? $file->getStream() : fopen($file, 'r');
|
||||||
$request = $client
|
/** @var Response $response */
|
||||||
->post($baseUrl)
|
$response = $client
|
||||||
->addPostFields(['extractOnly' => 'true', 'extractFormat' => 'text'])
|
->post($baseUrl, [
|
||||||
->addPostFiles(['myfile' => $path]);
|
'multipart' => [
|
||||||
$response = $request->send();
|
['name' => 'extractOnly', 'contents' => 'true'],
|
||||||
|
['name' => 'extractFormat', 'contents' => 'text'],
|
||||||
|
['name' => 'myfile', 'contents' => $stream],
|
||||||
|
]
|
||||||
|
]);
|
||||||
} catch (InvalidArgumentException $e) {
|
} catch (InvalidArgumentException $e) {
|
||||||
$msg = sprintf(
|
$msg = sprintf(
|
||||||
'Error extracting text from "%s" (message: %s)',
|
'Error extracting text from "%s" (message: %s)',
|
||||||
@@ -133,25 +138,20 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
$e->getMessage()
|
$e->getMessage()
|
||||||
);
|
);
|
||||||
Injector::inst()->get(LoggerInterface::class)->notice($msg);
|
Injector::inst()->get(LoggerInterface::class)->notice($msg);
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
} catch (Exception $e) {
|
} catch (Exception $e) {
|
||||||
// Catch other errors that Tika can throw vai Guzzle but are not caught and break Solr search
|
// Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search
|
||||||
// query in some cases.
|
// query in some cases.
|
||||||
$msg = sprintf(
|
$msg = sprintf(
|
||||||
'Tika server error attempting to extract from "%s" (message: %s)',
|
'Tika server error attempting to extract from "%s" (message: %s)',
|
||||||
$path,
|
$fileName,
|
||||||
$e->getMessage()
|
$e->getMessage()
|
||||||
);
|
);
|
||||||
|
|
||||||
Injector::inst()->get(LoggerInterface::class)->notice($msg);
|
Injector::inst()->get(LoggerInterface::class)->notice($msg);
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Just initialise it, it doesn't take much.
|
|
||||||
$matches = [];
|
$matches = [];
|
||||||
|
|
||||||
// Use preg match to avoid SimpleXML running out of memory on large text nodes
|
// Use preg match to avoid SimpleXML running out of memory on large text nodes
|
||||||
preg_match(
|
preg_match(
|
||||||
sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName)),
|
sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName)),
|
||||||
|
Loading…
Reference in New Issue
Block a user