2013-02-01 15:35:16 +01:00
|
|
|
<?php
|
2017-12-20 22:24:39 +01:00
|
|
|
|
|
|
|
namespace SilverStripe\TextExtraction\Extractor;
|
|
|
|
|
2018-07-03 01:23:27 +02:00
|
|
|
use Exception;
|
|
|
|
use GuzzleHttp\Client;
|
|
|
|
use InvalidArgumentException;
|
|
|
|
use Psr\Log\LoggerInterface;
|
2018-07-03 05:55:02 +02:00
|
|
|
use SilverStripe\Assets\File;
|
2018-07-03 01:23:27 +02:00
|
|
|
use SilverStripe\Core\Injector\Injector;
|
2013-02-01 15:35:16 +01:00
|
|
|
|
|
|
|
/**
 * Text extractor that calls an Apache Solr instance
 * and extracts content via the "ExtractingRequestHandler" endpoint.
 * Does not alter the Solr index itself, but uses it purely
 * for its file parsing abilities.
 *
 * @author ischommer
 * @see http://wiki.apache.org/solr/ExtractingRequestHandler
 */
class SolrCellTextExtractor extends FileTextExtractor
{
    /**
     * Base URL to use for Solr text extraction.
     * E.g. http://localhost:8983/solr/update/extract
     *
     * @config
     * @var string
     */
    private static $base_url;

    /**
     * Priority value of this extractor.
     *
     * @var int
     * @config
     */
    private static $priority = 75;

    /**
     * HTTP client used to talk to Solr; created lazily by getHttpClient()
     * unless one has been injected through setHttpClient().
     *
     * @var Client
     */
    protected $httpClient;

    /**
     * Returns the HTTP client, creating a default Guzzle client on first use.
     *
     * @return Client
     */
    public function getHttpClient()
    {
        if (!$this->httpClient) {
            $this->httpClient = new Client();
        }

        return $this->httpClient;
    }

    /**
     * Replaces the HTTP client used for extraction requests.
     *
     * @param Client $client
     * @return $this
     */
    public function setHttpClient(Client $client)
    {
        $this->httpClient = $client;

        return $this;
    }

    /**
     * This extractor is usable as soon as a non-empty base_url is configured.
     *
     * @return bool
     */
    public function isAvailable()
    {
        $url = $this->config()->get('base_url');

        return (bool) $url;
    }

    /**
     * Checks the (case-insensitive) file extension against the list of
     * document formats this extractor accepts.
     *
     * @param string $extension
     * @return bool
     */
    public function supportsExtension($extension)
    {
        return in_array(
            strtolower($extension),
            [
                'pdf', 'doc', 'docx', 'xls', 'xlsx',
                'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
                'ppt', 'pptx', 'odp', 'fodp', 'csv',
            ],
            true
        );
    }

    /**
     * MIME types are never matched directly.
     *
     * @param string $mime
     * @return bool Always false
     */
    public function supportsMime($mime)
    {
        // Rely on supportsExtension
        return false;
    }

    /**
     * Posts the file to the configured Solr endpoint and returns the text
     * found in the response.
     *
     * @param File $file
     * @return string|null Empty string when no file is given; null when the request
     *                     fails (the failure is logged as a notice) or when the
     *                     response holds no <str> element named after the file;
     *                     otherwise the content of that element.
     * @throws InvalidArgumentException When base_url is not configured
     */
    public function getContent(File $file)
    {
        if (!$file) {
            // no file
            return '';
        }

        $fileName = $file->getFilename();
        $client = $this->getHttpClient();

        // Get and validate base URL
        $baseUrl = $this->config()->get('base_url');
        if (!$baseUrl) {
            throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
        }

        // Declared up front so the catch/finally branches can reference them
        // even when the failure happens before they are assigned.
        $path = null;
        $handle = null;

        try {
            $path = $this->getPathFromFile($file);

            $handle = fopen($path, 'rb');
            if ($handle === false) {
                throw new Exception(sprintf('Unable to open "%s" for reading', $path));
            }

            // GuzzleHttp\Client::post() sends the request straight away and returns the
            // response, so form fields and the upload go in as multipart request options.
            $response = $client->post($baseUrl, [
                'multipart' => [
                    ['name' => 'extractOnly', 'contents' => 'true'],
                    ['name' => 'extractFormat', 'contents' => 'text'],
                    ['name' => 'myfile', 'contents' => $handle],
                ],
            ]);
        } catch (InvalidArgumentException $e) {
            $msg = sprintf(
                'Error extracting text from "%s" (message: %s)',
                $fileName,
                $e->getMessage()
            );
            Injector::inst()->get(LoggerInterface::class)->notice($msg);

            return null;
        } catch (Exception $e) {
            // Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search
            // query in some cases.
            $msg = sprintf(
                'Tika server error attempting to extract from "%s" (message: %s)',
                // The path is unknown when resolving it was the step that failed
                $path !== null ? $path : $fileName,
                $e->getMessage()
            );
            Injector::inst()->get(LoggerInterface::class)->notice($msg);

            return null;
        } finally {
            // Release the upload handle unless it has already been closed
            if (is_resource($handle)) {
                fclose($handle);
            }
        }

        // Just initialise it, it doesn't take much.
        $matches = [];

        // Use preg match to avoid SimpleXML running out of memory on large text nodes.
        // The file name is quoted with the pattern delimiter so names containing "/"
        // cannot terminate the expression early.
        preg_match(
            sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName, '/')),
            (string) $response->getBody(),
            $matches
        );

        return $matches ? $matches[1] : null;
    }
}
|