2013-02-01 15:35:16 +01:00
|
|
|
<?php
|
|
|
|
use Guzzle\Http\Client;
|
|
|
|
|
|
|
|
/**
 * Text extractor that calls an Apache Solr instance
 * and extracts content via the "ExtractingRequestHandler" endpoint.
 * Does not alter the Solr index itself, but uses it purely
 * for its file parsing abilities.
 *
 * @author ischommer
 * @see http://wiki.apache.org/solr/ExtractingRequestHandler
 */
class SolrCellTextExtractor extends FileTextExtractor {

    /**
     * Base URL passed to the HTTP client when it is created lazily
     * in {@link getHttpClient()}. Must be configured before the
     * extractor can be used.
     *
     * @config
     * @var string
     */
    private static $base_url;

    /**
     * NOTE(review): presumably consumed by the parent FileTextExtractor
     * to rank this extractor against others - confirm against the parent class.
     *
     * @var int
     */
    private static $priority = 75;

    /**
     * HTTP client used to send files for extraction. Created on first use
     * by {@link getHttpClient()} unless one was injected
     * through {@link setHttpClient()}.
     *
     * @var Client|null
     */
    protected $httpClient;

    /**
     * Returns the HTTP client, creating one for the configured
     * base_url on first use.
     *
     * The base_url check runs on every call, including when a client
     * has already been injected via {@link setHttpClient()}.
     *
     * @return Client
     * @throws InvalidArgumentException If no base_url is configured
     */
    public function getHttpClient() {
        $baseUrl = $this->config()->get('base_url');
        if(!$baseUrl) {
            throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
        }

        if(!$this->httpClient) {
            $this->httpClient = new Client($baseUrl);
        }

        return $this->httpClient;
    }

    /**
     * Replaces the HTTP client, e.g. to inject a preconfigured
     * or mocked instance.
     *
     * @param Client $client
     */
    public function setHttpClient($client) {
        $this->httpClient = $client;
    }

    /**
     * Reports whether this extractor is usable, which only depends on
     * a base_url being configured. No request is made to verify that
     * the endpoint is actually reachable.
     *
     * @return boolean
     */
    public function isAvailable() {
        // Previously fell through without a return value (null) when
        // a URL was configured, so the extractor never reported as available.
        return (bool)$this->config()->get('base_url');
    }

    /**
     * @see http://tika.apache.org/1.3/formats.html
     * @return Array File extensions (lowercase, without dot) handled by this extractor
     */
    public function supportedExtensions() {
        return array(
            'pdf', 'doc', 'docx', 'xls', 'xlsx',
            'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
            'ppt', 'pptx', 'odp', 'fodp', 'csv'
        );
    }

    /**
     * Posts the file to the extraction endpoint (with extractOnly=true and
     * extractFormat=text) and returns the text found in the response's
     * <str name="{basename of $path}"> element.
     *
     * @param string $path Path of the file to extract text from
     * @return string|null Empty string when no path is given; null when an
     *  InvalidArgumentException is raised while building or sending the request
     *  (logged as a notice) or when the response holds no matching element;
     *  otherwise the extracted text
     * @throws InvalidArgumentException If no base_url is configured
     *  (raised by {@link getHttpClient()} outside of the try block)
     */
    public function getContent($path) {
        if (!$path) return ""; // no file

        $fileName = basename($path);
        $client = $this->getHttpClient();

        try {
            $request = $client
                ->post()
                ->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text'))
                ->addPostFiles(array('myfile' => $path));
            $response = $request->send();
        } catch(InvalidArgumentException $e) {
            SS_Log::log(
                sprintf(
                    'Error extracting text from "%s" (message: %s)',
                    $path,
                    $e->getMessage()
                ),
                SS_Log::NOTICE
            );
            return null;
        }

        // Use preg match to avoid SimpleXML running out of memory on large text nodes.
        // The delimiter is passed to preg_quote() so the file name can never
        // terminate the pattern early.
        $pattern = sprintf(
            '/\<str name\="%s"\>(.*?)\<\/str\>/s',
            preg_quote($fileName, '/')
        );
        $matched = preg_match($pattern, (string)$response->getBody(), $matches);

        // preg_match() yields 0 on no match and false on a PCRE error
        // (e.g. backtrack limit reached); both are treated as "no content".
        return ($matched === 1) ? $matches[1] : null;
    }

}
|