mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 11:06:00 +02:00
83 lines
1.5 KiB
PHP
83 lines
1.5 KiB
PHP
|
<?php
|
||
|
|
||
|
use GuzzleHttp\Client;
|
||
|
use GuzzleHttp\Exception\RequestException;
|
||
|
|
||
|
/**
 * Guzzle-based HTTP client for the REST endpoints of an Apache Tika server.
 *
 * All request paths used below ('version', 'mime-types', 'tika') are relative
 * and are resolved against the base URL handed to the constructor.
 */
class TikaRestClient extends Client {

    /**
     * Cached result of the 'mime-types' request made by getSupportedMimes().
     *
     * @var array
     */
    protected $mimes = array();

    /**
     * @param string $url Passed to the parent Guzzle client as its 'base_url' option
     */
    public function __construct($url) {
        parent::__construct(array(
            'base_url' => $url
        ));
    }

    /**
     * Detect if the service is available
     *
     * Issues a GET request without a path and reports whether the response
     * status is 200. A RequestException raised by the request is treated as
     * "not available" rather than being propagated.
     *
     * @return bool
     */
    public function isAvailable() {
        try {
            $response = $this->get();
            // Loose comparison kept on purpose: some Guzzle releases may hand the
            // status code back as a string rather than an int - TODO confirm.
            return $response->getStatusCode() == 200;
        } catch (RequestException $ex) {
            return false;
        }
    }

    /**
     * Get version code
     *
     * Requests 'version' and extracts the number following "Apache Tika" from
     * the response body. Unlike isAvailable(), request exceptions are not
     * caught here.
     *
     * Note that the version string is cast to float, so only the leading
     * "major.minor" part survives and e.g. "1.10" becomes 1.1.
     *
     * @return float Parsed version, or 0.0 if the status is not 200 or the body does not match
     */
    public function getVersion() {
        $response = $this->get('version');

        // Parse output
        if($response->getStatusCode() == 200) {
            $body = $response->getBody()->getContents();
            if(preg_match('/Apache Tika (?<version>[\.\d]+)/', $body, $matches)) {
                return (float)$matches['version'];
            }
        }

        return 0.0;
    }

    /**
     * Gets supported mime data. May include aliased mime types.
     *
     * The decoded JSON response is stored in $this->mimes and reused by later
     * calls. The cache check is a truthiness test, so an empty result is
     * requested again on the next call.
     *
     * @return array
     */
    public function getSupportedMimes() {
        if($this->mimes) {
            return $this->mimes;
        }

        $response = $this->get(
            'mime-types',
            array(
                'headers' => array("Accept" => "application/json")
            )
        );
        $this->mimes = $response->json();

        return $this->mimes;
    }

    /**
     * Extract text content from a given file
     *
     * Reads the whole file into memory and PUTs it to the 'tika' endpoint,
     * asking for a plain text response.
     *
     * @param string $file Full filesystem path to a file to post
     * @return string Content of the file extracted as plain text
     * @throws InvalidArgumentException If the file cannot be read
     */
    public function tika($file) {
        $contents = file_get_contents($file);

        // file_get_contents() signals failure with false; fail with a clear
        // message instead of handing false to Guzzle as the request body.
        // An empty file yields '' and is still sent as before.
        if($contents === false) {
            throw new InvalidArgumentException(
                sprintf('Unable to read file "%s" for text extraction', $file)
            );
        }

        $response = $this->put(
            'tika',
            array(
                'body' => $contents,
                'headers' => array("Accept" => "text/plain")
            )
        );

        return $response->getBody()->getContents();
    }

}
|