2013-02-01 15:35:16 +01:00
|
|
|
<?php
|
2017-12-20 22:24:39 +01:00
|
|
|
|
|
|
|
namespace SilverStripe\TextExtraction\Extractor;
|
|
|
|
|
2018-07-03 01:23:27 +02:00
|
|
|
use Exception;
|
|
|
|
use GuzzleHttp\Client;
|
2018-07-06 05:43:53 +02:00
|
|
|
use GuzzleHttp\Psr7\Response;
|
2018-07-03 01:23:27 +02:00
|
|
|
use InvalidArgumentException;
|
|
|
|
use Psr\Log\LoggerInterface;
|
2018-07-03 05:55:02 +02:00
|
|
|
use SilverStripe\Assets\File;
|
2018-07-03 01:23:27 +02:00
|
|
|
use SilverStripe\Core\Injector\Injector;
|
2013-02-01 15:35:16 +01:00
|
|
|
|
|
|
|
/**
 * Text extractor that calls an Apache Solr instance
 * and extracts content via the "ExtractingRequestHandler" endpoint.
 * Does not alter the Solr index itself, but uses it purely
 * for its file parsing abilities.
 *
 * @author ischommer
 * @see http://wiki.apache.org/solr/ExtractingRequestHandler
 */
class SolrCellTextExtractor extends FileTextExtractor
{
    /**
     * Base URL to use for Solr text extraction.
     * E.g. http://localhost:8983/solr/update/extract
     *
     * @config
     * @var string
     */
    private static $base_url;

    /**
     * Priority of this extractor.
     *
     * @var int
     * @config
     */
    private static $priority = 75;

    /**
     * HTTP client used to talk to Solr; created lazily by {@see getHttpClient()}
     * unless one was injected through {@see setHttpClient()}.
     *
     * @var Client
     */
    protected $httpClient;

    /**
     * Returns the HTTP client, instantiating a default Guzzle client on first use.
     *
     * @return Client
     */
    public function getHttpClient()
    {
        if (!$this->httpClient) {
            $this->httpClient = new Client();
        }

        return $this->httpClient;
    }

    /**
     * Replaces the HTTP client used for extraction requests.
     *
     * @param Client $client
     * @return $this
     */
    public function setHttpClient(Client $client)
    {
        $this->httpClient = $client;

        return $this;
    }

    /**
     * Reports whether this extractor can be used, i.e. whether a base URL
     * has been configured.
     *
     * @return bool
     */
    public function isAvailable()
    {
        $url = $this->config()->get('base_url');

        return (bool) $url;
    }

    /**
     * Checks the given file extension (case-insensitively) against the list
     * of extensions this extractor handles.
     *
     * @param string $extension
     * @return bool
     */
    public function supportsExtension($extension)
    {
        return in_array(
            strtolower($extension ?? ''),
            [
                'pdf', 'doc', 'docx', 'xls', 'xlsx',
                'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
                'ppt', 'pptx', 'odp', 'fodp', 'csv'
            ],
            // Needle and haystack are both strings, so compare strictly
            true
        );
    }

    /**
     * Always returns false: support is decided by file extension only.
     *
     * @param string $mime
     * @return bool
     */
    public function supportsMime($mime)
    {
        // Rely on supportsExtension
        return false;
    }

    /**
     * Posts the file to the configured Solr endpoint and returns the text
     * found in the response.
     *
     * @param File|string $file A File object, or a path to a file on disk
     * @return string|null Extracted text; an empty string when there is no file;
     *                     null when the request fails or the response holds no
     *                     matching text node
     * @throws InvalidArgumentException When no base URL has been configured
     */
    public function getContent($file)
    {
        if (!$file || (is_string($file) && !file_exists($file))) {
            // no file
            return '';
        }

        $fileName = $file instanceof File ? $file->getFilename() : basename($file);
        $client = $this->getHttpClient();

        // Get and validate base URL
        $baseUrl = $this->config()->get('base_url');
        if (!$baseUrl) {
            throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
        }

        try {
            $stream = $file instanceof File ? $file->getStream() : fopen($file, 'r');
            /** @var Response $response */
            $response = $client
                ->post($baseUrl, [
                    'multipart' => [
                        ['name' => 'extractOnly', 'contents' => 'true'],
                        ['name' => 'extractFormat', 'contents' => 'text'],
                        ['name' => 'myfile', 'contents' => $stream],
                    ]
                ]);
        } catch (InvalidArgumentException $e) {
            $msg = sprintf(
                'Error extracting text from "%s" (message: %s)',
                $fileName,
                $e->getMessage()
            );
            Injector::inst()->get(LoggerInterface::class)->notice($msg);
            return null;
        } catch (Exception $e) {
            // Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search
            // query in some cases.
            $msg = sprintf(
                'Tika server error attempting to extract from "%s" (message: %s)',
                $fileName,
                $e->getMessage()
            );
            Injector::inst()->get(LoggerInterface::class)->notice($msg);
            return null;
        }

        $matches = [];
        // Use preg match to avoid SimpleXML running out of memory on large text nodes.
        // The file name is quoted with the pattern delimiter so that a "/" in it
        // (e.g. a folder path) cannot terminate the expression early.
        preg_match(
            sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName ?? '', '/')),
            (string)$response->getBody(),
            $matches
        );

        return $matches ? $matches[1] : null;
    }
}
|