silverstripe-textextraction/code/tika/TikaRestClient.php

100 lines
2.4 KiB
PHP
Raw Permalink Normal View History

2015-02-25 02:44:03 +01:00
<?php
2015-03-05 01:52:41 +01:00
use Guzzle\Http\Client;
use Guzzle\Http\Exception\RequestException;
2015-02-25 02:44:03 +01:00
2015-11-18 05:07:31 +01:00
/**
 * REST client for an Apache Tika server, built on top of the Guzzle HTTP client.
 *
 * Exposes availability/version checks, the list of mime types reported by the
 * server, and plain-text extraction of a local file.
 */
class TikaRestClient extends Client
{
    /**
     * Cache of the decoded 'mime-types' response, filled by getSupportedMimes().
     *
     * @var array
     */
    protected $mimes = array();

    /**
     * Detect if the service is available.
     *
     * Issues a GET against the client's base URL and treats a 200 response as
     * "available". Any request exception is interpreted as "not available".
     *
     * @return bool
     */
    public function isAvailable()
    {
        try {
            $response = $this->get()->send();
            return $response->getStatusCode() == 200;
        } catch (RequestException $ex) {
            return false;
        }
    }

    /**
     * Get version code.
     *
     * Requests the 'version' endpoint and parses "Apache Tika <version>" out
     * of the response body.
     *
     * NOTE: the matched version string is cast to float, so only the leading
     * "major.minor" part survives and trailing zeros are lost (e.g. "1.10"
     * becomes 1.1). Kept as-is because callers expect a float.
     *
     * @return float Parsed version, or 0.0 if the response was not a 200 or
     *               did not contain a recognisable version string
     */
    public function getVersion()
    {
        $response = $this->get('version')->send();

        // Parse output
        if ($response->getStatusCode() == 200 &&
            preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)
        ) {
            return (float)$matches['version'];
        }

        return 0.0;
    }

    /**
     * Gets supported mime data. May include aliased mime types.
     *
     * The decoded JSON response is cached on the instance; an empty result is
     * not treated as cached, so it will be requested again on the next call.
     *
     * @return array
     */
    public function getSupportedMimes()
    {
        if ($this->mimes) {
            return $this->mimes;
        }

        $response = $this->get(
            'mime-types',
            array('Accept' => 'application/json')
        )->send();

        return $this->mimes = $response->json();
    }

    /**
     * Extract text content from a given file.
     * Logs a notice-level error if the document can't be parsed.
     *
     * @param string $file Full filesystem path to a file to post
     * @return string|null Content of the file extracted as plain text,
     *                     or null if the request failed
     */
    public function tika($file)
    {
        $text = null;
        try {
            $response = $this->put(
                'tika',
                array('Accept' => 'text/plain'),
                file_get_contents($file)
            )->send();
            $text = $response->getBody(true);
        } catch (RequestException $e) {
            // Not every RequestException carries a response: transport-level
            // failures (e.g. the server being unreachable) have none, so the
            // response must be looked up defensively before it is dereferenced.
            $errorResponse = method_exists($e, 'getResponse') ? $e->getResponse() : null;

            if ($errorResponse) {
                $msg = sprintf(
                    'TikaRestClient was not able to process %s. Response: %s %s.',
                    $file,
                    $errorResponse->getStatusCode(),
                    $errorResponse->getReasonPhrase()
                );

                // Only available if tika-server was started with --includeStack
                $body = $errorResponse->getBody(true);
                if ($body) {
                    $msg .= ' Body: ' . $body;
                }
            } else {
                // No HTTP response to report; fall back to the exception message
                $msg = sprintf(
                    'TikaRestClient was not able to process %s. Error: %s',
                    $file,
                    $e->getMessage()
                );
            }

            SS_Log::log($msg, SS_Log::NOTICE);
        }

        return $text;
    }
}