silverstripe-textextraction/src/Extractor/SolrCellTextExtractor.php

165 lines
4.3 KiB
PHP
Raw Normal View History

2013-02-01 15:35:16 +01:00
<?php
namespace SilverStripe\TextExtraction\Extractor;
use Exception;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Response;
use InvalidArgumentException;
use Psr\Log\LoggerInterface;
use SilverStripe\Assets\File;
use SilverStripe\Core\Injector\Injector;
2013-02-01 15:35:16 +01:00
/**
 * Text extractor that calls an Apache Solr instance
 * and extracts content via the "ExtractingRequestHandler" endpoint.
 * Does not alter the Solr index itself, but uses it purely
 * for its file parsing abilities.
 *
 * @author ischommer
 * @see http://wiki.apache.org/solr/ExtractingRequestHandler
 */
class SolrCellTextExtractor extends FileTextExtractor
{
    /**
     * Base URL to use for Solr text extraction.
     * E.g. http://localhost:8983/solr/update/extract
     *
     * @config
     * @var string
     */
    private static $base_url;

    /**
     * Priority of this extractor.
     *
     * @var int
     * @config
     */
    private static $priority = 75;

    /**
     * HTTP client used to talk to Solr; created lazily by getHttpClient().
     *
     * @var Client
     */
    protected $httpClient;

    /**
     * Returns the HTTP client, creating a default Guzzle client on first use.
     *
     * @return Client
     */
    public function getHttpClient()
    {
        if (!$this->httpClient) {
            $this->httpClient = new Client();
        }

        return $this->httpClient;
    }

    /**
     * Replaces the HTTP client used for extraction requests.
     *
     * @param Client $client
     * @return $this
     */
    public function setHttpClient(Client $client)
    {
        $this->httpClient = $client;

        return $this;
    }

    /**
     * This extractor is available whenever a base URL has been configured.
     *
     * @return bool
     */
    public function isAvailable()
    {
        return (bool) $this->config()->get('base_url');
    }

    /**
     * Checks (case-insensitively) whether the given file extension is one
     * this extractor handles.
     *
     * @param string $extension
     * @return bool
     */
    public function supportsExtension($extension)
    {
        return in_array(
            strtolower($extension ?? ''),
            [
                'pdf', 'doc', 'docx', 'xls', 'xlsx',
                'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
                'ppt', 'pptx', 'odp', 'fodp', 'csv',
            ],
            true
        );
    }

    /**
     * MIME type detection is not used by this extractor.
     *
     * @param string $mime
     * @return bool Always false
     */
    public function supportsMime($mime)
    {
        // Rely on supportsExtension
        return false;
    }

    /**
     * Posts the file to the configured Solr endpoint in extract-only mode and
     * returns the text found in the response.
     *
     * @param File|string $file A File object, or a path to a file on disk
     * @return string|null Empty string when no file is given or the path does not exist;
     *                     null when the request fails or the response contains no matching
     *                     node; otherwise the extracted text
     * @throws InvalidArgumentException If no base URL is configured
     */
    public function getContent($file)
    {
        if (!$file || (is_string($file) && !file_exists($file ?? ''))) {
            // no file
            return '';
        }

        $fileName = $file instanceof File ? $file->getFilename() : basename($file ?? '');
        $client = $this->getHttpClient();

        // Get and validate base URL
        $baseUrl = $this->config()->get('base_url');
        if (!$baseUrl) {
            throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
        }

        try {
            $stream = $file instanceof File ? $file->getStream() : fopen($file ?? '', 'r');
            if ($stream === false) {
                // fopen() returns false when the file cannot be opened; report it through
                // the same logging path as other extraction errors instead of posting it.
                throw new InvalidArgumentException('Unable to open file for reading');
            }

            /** @var Response $response */
            $response = $client->post($baseUrl, [
                'multipart' => [
                    ['name' => 'extractOnly', 'contents' => 'true'],
                    ['name' => 'extractFormat', 'contents' => 'text'],
                    ['name' => 'myfile', 'contents' => $stream],
                ],
            ]);
        } catch (InvalidArgumentException $e) {
            $msg = sprintf(
                'Error extracting text from "%s" (message: %s)',
                $fileName,
                $e->getMessage()
            );
            Injector::inst()->get(LoggerInterface::class)->notice($msg);

            return null;
        } catch (Exception $e) {
            // Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search
            // query in some cases.
            $msg = sprintf(
                'Tika server error attempting to extract from "%s" (message: %s)',
                $fileName,
                $e->getMessage()
            );
            Injector::inst()->get(LoggerInterface::class)->notice($msg);

            return null;
        }

        $matches = [];

        // Use preg match to avoid SimpleXML running out of memory on large text nodes.
        // The file name is quoted with the "/" delimiter so that names containing
        // slashes cannot terminate the pattern early.
        preg_match(
            sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName ?? '', '/')),
            (string) $response->getBody(),
            $matches
        );

        return $matches ? $matches[1] : null;
    }
}