2013-02-01 15:35:16 +01:00
|
|
|
<?php
|
|
|
|
use Guzzle\Http\Client;
|
|
|
|
|
|
|
|
/**
 * Text extractor that calls an Apache Solr instance
 * and extracts content via the "ExtractingRequestHandler" endpoint.
 * Does not alter the Solr index itself, but uses it purely
 * for its file parsing abilities.
 *
 * @author ischommer
 * @see http://wiki.apache.org/solr/ExtractingRequestHandler
 */
|
2015-11-18 05:07:31 +01:00
|
|
|
class SolrCellTextExtractor extends FileTextExtractor
{
    /**
     * Base URL to use for solr text extraction.
     * E.g. http://localhost:8983/solr/update/extract
     *
     * @config
     * @var string
     */
    private static $base_url;

    /**
     * Priority value of this extractor.
     *
     * NOTE(review): presumably read by the parent class / extractor lookup to
     * rank competing extractors - confirm against FileTextExtractor.
     *
     * @var int
     */
    private static $priority = 75;

    /**
     * HTTP client used to talk to Solr. Created lazily by getHttpClient()
     * unless one has been injected through setHttpClient().
     *
     * @var Client
     */
    protected $httpClient;

    /**
     * Returns the HTTP client, creating it from the configured base URL on
     * first use.
     *
     * @return Client
     * @throws InvalidArgumentException When the "base_url" config value is not set.
     */
    public function getHttpClient()
    {
        $baseUrl = $this->config()->get('base_url');
        if (!$baseUrl) {
            throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
        }

        if (!$this->httpClient) {
            $this->httpClient = new Client($baseUrl);
        }

        return $this->httpClient;
    }

    /**
     * Replaces the HTTP client used for extraction requests.
     *
     * @param Client $client
     */
    public function setHttpClient($client)
    {
        $this->httpClient = $client;
    }

    /**
     * Reports whether this extractor can be used, i.e. whether a base URL
     * has been configured. No request is made to check the URL.
     *
     * @return bool
     */
    public function isAvailable()
    {
        return (bool) $this->config()->get('base_url');
    }

    /**
     * Checks a file extension (case-insensitively) against the list of
     * extensions this extractor handles.
     *
     * @param string $extension File extension without the leading dot
     * @return bool
     */
    public function supportsExtension($extension)
    {
        return in_array(
            strtolower($extension),
            array(
                'pdf', 'doc', 'docx', 'xls', 'xlsx',
                'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
                'ppt', 'pptx', 'odp', 'fodp', 'csv'
            ),
            true
        );
    }

    /**
     * Mime types are never matched by this extractor.
     *
     * @param string $mime
     * @return bool Always false
     */
    public function supportsMime($mime)
    {
        // Rely on supportsExtension
        return false;
    }

    /**
     * Posts the file at $path to the configured endpoint in "extractOnly"
     * mode and returns the text found in the response.
     *
     * @param string $path Path of the file to extract text from
     * @return string|null Empty string when no path is given; the extracted
     *                     text on success; null when the request failed or the
     *                     response holds no entry named after the file.
     * @throws InvalidArgumentException When no base URL is configured
     *                                  (raised by getHttpClient()).
     */
    public function getContent($path)
    {
        if (!$path) {
            return ""; // no file
        }

        $fileName = basename($path);
        $client = $this->getHttpClient();

        try {
            $request = $client
                ->post()
                ->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text'))
                ->addPostFiles(array('myfile' => $path));
            $response = $request->send();
        } catch (InvalidArgumentException $e) {
            $this->logExtractionError($path, $e);
            return null;
        } catch (\Guzzle\Common\Exception\GuzzleException $e) {
            // HTTP error responses and transport failures surface as other
            // Guzzle exception types; handle them like any failed extraction
            // instead of letting them abort the caller.
            $this->logExtractionError($path, $e);
            return null;
        }

        return $this->findNamedString((string) $response->getBody(), $fileName);
    }

    /**
     * Logs a failed extraction attempt at NOTICE level.
     *
     * @param string $path Path of the file that could not be processed
     * @param Exception $e Exception raised while sending the request
     */
    private function logExtractionError($path, $e)
    {
        SS_Log::log(
            sprintf(
                'Error extracting text from "%s" (message: %s)',
                $path,
                $e->getMessage()
            ),
            SS_Log::NOTICE
        );
    }

    /**
     * Returns the content of the first <str name="$name">...</str> element
     * in $body, or null when there is none.
     *
     * Plain string searching is used instead of an XML parser (which can run
     * out of memory on large text nodes) and instead of a lazy regular
     * expression (which can exceed pcre.backtrack_limit on large bodies and
     * then fail without any result).
     *
     * NOTE(review): the slice is returned exactly as it appears in the
     * response; XML entities are not decoded, and $name is matched literally.
     *
     * @param string $body Raw response body
     * @param string $name Value of the "name" attribute to look for
     * @return string|null
     */
    private function findNamedString($body, $name)
    {
        $openTag = sprintf('<str name="%s">', $name);

        $start = strpos($body, $openTag);
        if ($start === false) {
            return null;
        }
        $start += strlen($openTag);

        $end = strpos($body, '</str>', $start);
        if ($end === false) {
            return null;
        }

        return substr($body, $start, $end - $start);
    }
}
|