mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 11:06:00 +02:00
f2c8df2348
Was matching </str> greedily, which included too much content
64 lines
1.6 KiB
PHP
64 lines
1.6 KiB
PHP
<?php
|
|
use Guzzle\Http\Client;
|
|
|
|
/**
|
|
* Text extractor that calls an Apache Solr instance
|
|
* and extracts content via the "ExtractingRequestHandler" endpoint.
|
|
* Does not alter the Solr index itself, but uses it purely
|
|
* for its file parsing abilities.
|
|
*
|
|
* @author ischommer
|
|
* @see http://wiki.apache.org/solr/ExtractingRequestHandler
|
|
*/
|
|
class SolrCellTextExtractor extends FileTextExtractor {

	/**
	 * Base URL handed to the HTTP client; extraction requests are posted
	 * relative to it. The extractor reports itself as unavailable while
	 * this is empty.
	 *
	 * @var string
	 */
	public static $base_url;

	/**
	 * Priority of this extractor.
	 * NOTE(review): presumably consumed by the parent class to rank competing
	 * extractors - confirm against FileTextExtractor.
	 *
	 * @var int
	 */
	public static $priority = 75;

	/**
	 * Lazily created HTTP client, see {@link getHttpClient()}.
	 *
	 * @var Client
	 */
	protected $httpClient;

	/**
	 * Returns the HTTP client used to talk to Solr, creating one from the
	 * configured "base_url" on first use.
	 *
	 * @return Client
	 */
	public function getHttpClient() {
		if(!$this->httpClient) {
			$this->httpClient = new Client($this->config()->get('base_url'));
		}

		return $this->httpClient;
	}

	/**
	 * Overrides the HTTP client, e.g. to inject a preconfigured or stubbed one.
	 *
	 * @param Client $client
	 */
	public function setHttpClient($client) {
		$this->httpClient = $client;
	}

	/**
	 * Whether this extractor can be used, i.e. whether a "base_url" is configured.
	 *
	 * @return boolean True when "base_url" is set to a non-empty value
	 */
	public function isAvailable() {
		$url = $this->config()->get('base_url');

		// Always return an explicit boolean. Previously the method only returned
		// false for a missing URL and fell through (null) otherwise, so the
		// extractor never reported itself as available.
		return (bool)$url;
	}

	/**
	 * @see http://tika.apache.org/1.3/formats.html
	 * @return Array List of lowercase file extensions (without leading dot)
	 */
	public function supportedExtensions() {
		return array(
			'pdf', 'doc', 'docx', 'xls', 'xlsx',
			'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
			'ppt', 'pptx', 'odp', 'fodp', 'csv'
		);
	}

	/**
	 * Posts the file to the extraction endpoint and returns the extracted text.
	 *
	 * @param string $path Path of the file to extract text from
	 * @return string|null Empty string when no path is given, the content of the
	 *                     matching <str> node otherwise, or null if the response
	 *                     contains no such node
	 */
	public function getContent($path) {
		if (!$path) return ""; // no file

		$fileName = basename($path);
		$client = $this->getHttpClient();
		$request = $client
			->post('?extractOnly=true&extractFormat=text')
			->addPostFiles(array('myfile' => $path));
		$response = $request->send();

		// Use preg match to avoid SimpleXML running out of memory on large text nodes.
		// The capture is non-greedy so it stops at the first closing </str> tag.
		// The delimiter is passed to preg_quote() so a "/" in the name cannot
		// terminate the pattern early.
		$matches = array();
		preg_match(
			sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName, '/')),
			(string)$response->getBody(),
			$matches
		);

		return isset($matches[1]) ? $matches[1] : null;
	}

}