silverstripe-textextraction/code/tika/TikaRestClient.php

95 lines
1.9 KiB
PHP
Raw Normal View History

2015-02-25 02:44:03 +01:00
<?php
2015-03-05 01:52:41 +01:00
use Guzzle\Http\Client;
use Guzzle\Http\Exception\RequestException;
2015-02-25 02:44:03 +01:00
/**
 * HTTP client for a Tika REST endpoint, built on top of the Guzzle client.
 *
 * All request URIs used here ('version', 'mime-types', 'tika') are relative,
 * so they are resolved against whatever base URL this client was created with.
 */
class TikaRestClient extends Client {

	/**
	 * Cache for {@link getSupportedMimes()}, populated on the first successful lookup.
	 *
	 * @var array
	 */
	protected $mimes = array();

	/**
	 * Detect if the service is available
	 *
	 * Issues a GET against the client's base URL and treats a 200 response
	 * as "available". Any RequestException raised while sending is swallowed
	 * and reported as "not available".
	 *
	 * @return bool
	 */
	public function isAvailable() {
		try {
			$response = $this->get()->send();
			return $response->getStatusCode() == 200;
		} catch (RequestException $ex) {
			return false;
		}
	}

	/**
	 * Get version code
	 *
	 * Requests the 'version' resource and looks for "Apache Tika <digits and dots>"
	 * in the response body.
	 *
	 * @return float The matched version cast to float (the cast keeps only the
	 *               leading numeric part, so "1.7.1" yields 1.7), or 0.0 when
	 *               the response is not a 200 or does not contain a version.
	 */
	public function getVersion() {
		$response = $this->get('version')->send();

		// Parse output
		$isOk = $response->getStatusCode() == 200;
		if($isOk && preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches)) {
			return (float)$matches['version'];
		}

		return 0.0;
	}

	/**
	 * Gets supported mime data. May include aliased mime types.
	 *
	 * The decoded JSON of the 'mime-types' resource is cached on this instance;
	 * an empty result is not considered cached and triggers a new request on
	 * the next call.
	 *
	 * @return array
	 */
	public function getSupportedMimes() {
		if($this->mimes) {
			return $this->mimes;
		}

		$response = $this->get(
			'mime-types',
			array('Accept' => 'application/json')
		)->send();

		$this->mimes = $response->json();
		return $this->mimes;
	}

	/**
	 * Extract text content from a given file.
	 * Logs a notice-level error if the document can't be parsed.
	 *
	 * The file contents are sent with a PUT to the 'tika' resource, asking for
	 * a plain-text response.
	 *
	 * @param string $file Full filesystem path to a file to post
	 * @return string|null Content of the file extracted as plain text,
	 *                     or null if the request failed
	 */
	public function tika($file) {
		$text = null;

		try {
			$response = $this->put(
				'tika',
				array('Accept' => 'text/plain'),
				file_get_contents($file)
			)->send();
			$text = $response->getBody(true);
		} catch(RequestException $e) {
			// Only exceptions raised for an HTTP error response carry a response
			// object; transport-level failures (e.g. the server is unreachable)
			// do not, so the response must be fetched defensively.
			$failed = method_exists($e, 'getResponse') ? $e->getResponse() : null;

			if($failed) {
				$msg = sprintf(
					'TikaRestClient was not able to process %s. Response: %s %s.',
					$file,
					$failed->getStatusCode(),
					$failed->getReasonPhrase()
				);

				// Only available if tika-server was started with --includeStack
				$body = $failed->getBody(true);
				if($body) {
					$msg .= ' Body: ' . $body;
				}
			} else {
				// No response to report on: fall back to the exception's own message
				$msg = sprintf(
					'TikaRestClient was not able to process %s. Error: %s',
					$file,
					$e->getMessage()
				);
			}

			SS_Log::log($msg, SS_Log::NOTICE);
		}

		return $text;
	}
}