silverstripe-textextraction/code/extractors/SolrCellTextExtractor.php
Ingo Schommer f2c8df2348 BUG Exclude meta info from SolrCell content retrieval
Was matching </str> greedily, which included too much content
2013-03-11 00:56:44 +01:00

64 lines
1.6 KiB
PHP

<?php
use Guzzle\Http\Client;
/**
 * Text extractor that calls an Apache Solr instance
 * and extracts content via the "ExtractingRequestHandler" endpoint.
 * Does not alter the Solr index itself, but uses it purely
 * for its file parsing abilities.
 *
 * The endpoint is taken from the "base_url" configuration value; the
 * extractor reports itself as unavailable while that value is empty.
 *
 * @author ischommer
 * @see http://wiki.apache.org/solr/ExtractingRequestHandler
 */
class SolrCellTextExtractor extends FileTextExtractor {

    /**
     * Base URL of the Solr extraction endpoint. Read through
     * $this->config()->get('base_url') and handed to the HTTP client
     * as its base URL.
     *
     * @var string
     */
    public static $base_url;

    /**
     * NOTE(review): presumably used by the parent class / framework to rank
     * this extractor against others - confirm against FileTextExtractor.
     *
     * @var int
     */
    public static $priority = 75;

    /**
     * Lazily created HTTP client, see {@link getHttpClient()}.
     *
     * @var Client
     */
    protected $httpClient;

    /**
     * Returns the HTTP client used to talk to Solr, creating one bound to
     * the configured "base_url" on first use.
     *
     * @return Client
     */
    public function getHttpClient() {
        if(!$this->httpClient) {
            $this->httpClient = new Client($this->config()->get('base_url'));
        }

        return $this->httpClient;
    }

    /**
     * Replaces the HTTP client, e.g. to inject a stub.
     *
     * @param Client $client
     */
    public function setHttpClient($client) {
        $this->httpClient = $client;
    }

    /**
     * Whether this extractor can be used, i.e. whether a "base_url"
     * has been configured.
     *
     * @return boolean
     */
    public function isAvailable() {
        $url = $this->config()->get('base_url');

        // Previously only the "not configured" case returned a value; a
        // configured URL fell through and yielded null, so the extractor
        // never reported itself as available.
        return (bool)$url;
    }

    /**
     * File extensions this extractor accepts.
     *
     * @see http://tika.apache.org/1.3/formats.html
     * @return Array
     */
    public function supportedExtensions() {
        return array(
            'pdf', 'doc', 'docx', 'xls', 'xlsx',
            'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
            'ppt', 'pptx', 'odp', 'fodp', 'csv'
        );
    }

    /**
     * Posts the file to Solr with extractOnly=true / extractFormat=text and
     * returns the text found in the response's <str name="{file name}">
     * element.
     *
     * @param string $path Path of the file to upload
     * @return string|null Empty string when no path is given, the element
     *                     content when found, null when the response holds
     *                     no matching element
     */
    public function getContent($path) {
        if (!$path) return ""; // no file

        $fileName = basename($path);
        $client = $this->getHttpClient();
        $request = $client
            ->post('?extractOnly=true&extractFormat=text')
            ->addPostFiles(array('myfile' => $path));
        $response = $request->send();
        $body = (string)$response->getBody();

        // Plain string scanning instead of SimpleXML (which can run out of
        // memory on large text nodes) or a lazy regex (which can exceed
        // pcre.backtrack_limit on large documents and then silently yields
        // no match). Only the element named after the file is taken, so the
        // trailing metadata elements are excluded.
        // NOTE(review): the file name is compared verbatim; if Solr
        // XML-escapes characters such as "&" in the name attribute, such
        // files will not match - confirm against a real response.
        $openTag = sprintf('<str name="%s">', $fileName);
        $start = strpos($body, $openTag);
        if ($start === false) return null;

        $start += strlen($openTag);
        $end = strpos($body, '</str>', $start);
        if ($end === false) return null;

        // NOTE(review): content is returned exactly as it appears in the
        // response; any XML entities are not decoded here.
        return substr($body, $start, $end - $start);
    }

}