2015-02-25 02:44:03 +01:00
|
|
|
<?php
|
|
|
|
|
2017-12-20 22:24:39 +01:00
|
|
|
namespace SilverStripe\TextExtraction\Extractor;
|
|
|
|
|
2018-07-03 05:55:02 +02:00
|
|
|
use SilverStripe\Assets\File;
|
2018-07-03 01:23:27 +02:00
|
|
|
use SilverStripe\Core\Environment;
|
|
|
|
use SilverStripe\Core\Injector\Injector;
|
|
|
|
use SilverStripe\TextExtraction\Rest\TikaRestClient;
|
2017-12-20 22:24:39 +01:00
|
|
|
|
2015-02-25 02:44:03 +01:00
|
|
|
/**
 * Enables text extraction of file content via the Tika Rest Server
 *
 * {@link http://tika.apache.org/1.7/gettingstarted.html}
 */
class TikaServerTextExtractor extends FileTextExtractor
{
    /**
     * Tika server is pretty efficient so use it immediately if available
     *
     * @var int
     * @config
     */
    private static $priority = 80;

    /**
     * Server endpoint. Only used when the SS_TIKA_ENDPOINT environment
     * variable is not set (see {@link getServerEndpoint()}).
     *
     * @var string
     * @config
     */
    private static $server_endpoint;

    /**
     * Lazily created REST client (see {@link getClient()}).
     *
     * @var TikaRestClient
     */
    protected $client = null;

    /**
     * Cache of supported mime types, filled on the first call to
     * {@link supportsMime()}.
     *
     * @var array
     */
    protected $supportedMimes = [];

    /**
     * Get the REST client, creating it through the Injector on first use
     * with the current server endpoint as its only constructor argument.
     *
     * @return TikaRestClient
     */
    public function getClient()
    {
        if (!$this->client) {
            $this->client = Injector::inst()->createWithArgs(
                TikaRestClient::class,
                [$this->getServerEndpoint()]
            );
        }

        return $this->client;
    }

    /**
     * Get the Tika server endpoint.
     *
     * The SS_TIKA_ENDPOINT environment variable takes precedence; when it is
     * empty the `server_endpoint` config value is returned instead.
     *
     * @return string
     */
    public function getServerEndpoint()
    {
        $endpoint = Environment::getEnv('SS_TIKA_ENDPOINT');
        if ($endpoint) {
            return $endpoint;
        }

        // Default to configured endpoint
        return $this->config()->get('server_endpoint');
    }

    /**
     * Get the version of Tika as reported by the REST client.
     *
     * @return float version of Tika
     */
    public function getVersion()
    {
        return $this->getClient()->getVersion();
    }

    /**
     * Check whether this extractor can be used: an endpoint must be set, the
     * client must report itself available, and the Tika version must be at
     * least 1.7.
     *
     * @return bool
     */
    public function isAvailable()
    {
        if (!$this->getServerEndpoint() || !$this->getClient()->isAvailable()) {
            return false;
        }

        // The version may be reported as a float (e.g. 1.7). Cast it so that
        // version_compare() always receives a string, and compare against
        // "1.7" rather than "1.7.0": version_compare('1.7', '1.7.0') is -1,
        // which would wrongly reject a server reporting exactly "1.7".
        return version_compare((string) $this->getVersion(), '1.7') >= 0;
    }

    /**
     * Extension based detection is not used by this extractor.
     *
     * @param string $extension
     * @return bool Always false
     */
    public function supportsExtension($extension)
    {
        // Determine support via mime type only
        return false;
    }

    /**
     * Check whether the given mime type is supported, either as a key of the
     * supported mime list or as one of the aliases listed for an entry.
     *
     * @param string $mime
     * @return bool
     */
    public function supportsMime($mime)
    {
        if (!$this->supportedMimes) {
            $mimes = $this->getClient()->getSupportedMimes();
            // Normalise a non-array response so the lookups below cannot fail
            $this->supportedMimes = is_array($mimes) ? $mimes : [];
        }

        // Check if supported (most common / quickest lookup)
        if (isset($this->supportedMimes[$mime])) {
            return true;
        }

        // Check aliases
        foreach ($this->supportedMimes as $info) {
            if (isset($info['alias']) && in_array($mime, (array) $info['alias'], true)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Extract the content of a file by resolving a path for it and handing
     * that path to the REST client.
     *
     * @param File $file
     * @return mixed Whatever the client's tika() call returns
     */
    public function getContent(File $file)
    {
        // NOTE(review): the variable name suggests getPathFromFile() may create
        // a temporary copy; if so it is never removed here - confirm against
        // the parent class before adding any cleanup.
        $tempFile = $this->getPathFromFile($file);

        return $this->getClient()->tika($tempFile);
    }
}
|