2015-02-25 02:44:03 +01:00
|
|
|
<?php
|
|
|
|
|
2017-12-20 22:24:39 +01:00
|
|
|
namespace SilverStripe\TextExtraction\Rest;
|
|
|
|
|
2018-07-03 01:23:27 +02:00
|
|
|
use GuzzleHttp\Client;
|
|
|
|
use GuzzleHttp\Exception\RequestException;
|
2018-07-06 00:26:54 +02:00
|
|
|
use GuzzleHttp\Psr7\Response;
|
2018-07-03 01:23:27 +02:00
|
|
|
use Psr\Log\LoggerInterface;
|
2018-07-06 00:26:54 +02:00
|
|
|
use SilverStripe\Core\Convert;
|
2018-07-03 01:23:27 +02:00
|
|
|
use SilverStripe\Core\Environment;
|
|
|
|
use SilverStripe\Core\Injector\Injector;
|
2015-02-25 02:44:03 +01:00
|
|
|
|
2015-11-18 05:07:31 +01:00
|
|
|
/**
 * Guzzle-based HTTP client for talking to an Apache Tika REST server.
 *
 * Basic-auth credentials are picked up from the SS_TIKA_USERNAME and
 * SS_TIKA_PASSWORD environment variables when the client is constructed,
 * and are attached to every request built through getGuzzleOptions().
 */
class TikaRestClient extends Client
{
    /**
     * Authentication options to be sent to the Tika server
     *
     * @var array
     */
    protected $options = ['username' => null, 'password' => null];

    /**
     * Supported mime data as returned by the server; filled on the first
     * successful call to getSupportedMimes() and reused afterwards.
     *
     * @var array
     */
    protected $mimes = [];

    /**
     * @param string $baseUrl Base URI for all requests; always stored as the 'base_uri' Guzzle option
     * @param array $config Further Guzzle client configuration (any 'base_uri' key is overwritten)
     */
    public function __construct($baseUrl = '', $config = [])
    {
        // Credentials are only taken from the environment when a password is defined
        $password = Environment::getEnv('SS_TIKA_PASSWORD');
        if (!empty($password)) {
            $this->options = [
                'username' => Environment::getEnv('SS_TIKA_USERNAME'),
                'password' => $password,
            ];
        }

        $config['base_uri'] = $baseUrl;

        parent::__construct($config);
    }

    /**
     * Detect if the service is available
     *
     * A request failure is logged at info level and reported as "not available".
     *
     * @return bool True only when the server root answers with HTTP 200
     */
    public function isAvailable()
    {
        try {
            /** @var Response $result */
            $result = $this->get('/', $this->getGuzzleOptions());
        } catch (RequestException $ex) {
            $msg = sprintf("Tika unavailable - %s", $ex->getMessage());
            Injector::inst()->get(LoggerInterface::class)->info($msg);

            return false;
        }

        // Always hand back a real boolean; a non-200 response used to fall through and yield null
        return (int) $result->getStatusCode() === 200;
    }

    /**
     * Get version code
     *
     * @return string Version number parsed from the "version" endpoint, or "0" if it could not be determined
     */
    public function getVersion()
    {
        /** @var Response $response */
        $response = $this->get('version', $this->getGuzzleOptions());
        $version = 0;

        // Parse output. The body is a stream object, so convert it explicitly before matching.
        if ((int) $response->getStatusCode() === 200
            && preg_match('/Apache Tika (?<version>[\.\d]+)/', (string) $response->getBody(), $matches)
        ) {
            $version = $matches['version'];
        }

        return (string) $version;
    }

    /**
     * Gets supported mime data. May include aliased mime types.
     *
     * The decoded result is cached on the instance, so the server is only
     * queried again while the cache is still empty.
     *
     * @return array
     */
    public function getSupportedMimes()
    {
        if ($this->mimes) {
            return $this->mimes;
        }

        /** @var Response $response */
        $response = $this->get(
            'mime-types',
            $this->getGuzzleOptions([
                'headers' => [
                    'Accept' => 'application/json',
                ],
            ])
        );

        $decoded = Convert::json2array((string) $response->getBody());

        // Keep the documented array return type even if the body did not decode to an array
        $this->mimes = is_array($decoded) ? $decoded : [];

        return $this->mimes;
    }

    /**
     * Extract text content from a given file.
     * Logs an info-level message if the file can't be read or the document can't be parsed.
     *
     * @param string $file Full filesystem path to a file to post
     * @return string Content of the file extracted as plain text; empty string on failure
     */
    public function tika($file)
    {
        $contents = file_get_contents($file);
        if ($contents === false) {
            // Don't hand a boolean request body to Guzzle; report the unreadable file instead
            $msg = sprintf('TikaRestClient was not able to read %s.', $file);
            Injector::inst()->get(LoggerInterface::class)->info($msg);

            return '';
        }

        $text = null;
        try {
            /** @var Response $response */
            $response = $this->put(
                'tika',
                $this->getGuzzleOptions([
                    'headers' => [
                        'Accept' => 'text/plain',
                    ],
                    'body' => $contents,
                ])
            );
            $text = $response->getBody();
        } catch (RequestException $e) {
            $errorResponse = $e->getResponse();

            if ($errorResponse) {
                $msg = sprintf(
                    'TikaRestClient was not able to process %s. Response: %s %s.',
                    $file,
                    $errorResponse->getStatusCode(),
                    $errorResponse->getReasonPhrase()
                );

                // Only available if tika-server was started with --includeStack.
                // Compare the string content: the stream object itself is always truthy.
                $body = (string) $errorResponse->getBody();
                if ($body !== '') {
                    $msg .= ' Body: ' . $body;
                }
            } else {
                // The request failed before any response arrived, so there is no status to report
                $msg = sprintf(
                    'TikaRestClient was not able to process %s. No response received: %s',
                    $file,
                    $e->getMessage()
                );
            }

            Injector::inst()->get(LoggerInterface::class)->info($msg);
        }

        return (string) $text;
    }

    /**
     * Assembles an array of request options to pass to Guzzle
     *
     * The 'auth' entry is only added when both a username and a password are configured.
     *
     * @param array $options Authentication (etc) will be merged into this array and returned
     * @return array
     */
    protected function getGuzzleOptions($options = [])
    {
        if (empty($this->options['username']) || empty($this->options['password'])) {
            return $options;
        }

        $options['auth'] = [
            $this->options['username'],
            $this->options['password'],
        ];

        return $options;
    }
}
|