silverstripe-textextraction/src/Extractor/TikaServerTextExtractor.php

138 lines
3.1 KiB
PHP
Raw Normal View History

2015-02-25 02:44:03 +01:00
<?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Assets\File;
use SilverStripe\Core\Environment;
use SilverStripe\Core\Injector\Injector;
use SilverStripe\TextExtraction\Rest\TikaRestClient;
2015-02-25 02:44:03 +01:00
/**
 * Enables text extraction of file content via the Tika Rest Server.
 *
 * The endpoint is taken from the SS_TIKA_ENDPOINT environment variable when set,
 * otherwise from the `server_endpoint` config value of this class.
 *
 * {@link http://tika.apache.org/1.7/gettingstarted.html}
 */
class TikaServerTextExtractor extends FileTextExtractor
{
    /**
     * Tika server is pretty efficient so use it immediately if available
     *
     * @var integer
     * @config
     */
    private static $priority = 80;

    /**
     * Server endpoint
     *
     * @var string
     * @config
     */
    private static $server_endpoint;

    /**
     * Lazily created REST client, see {@link getClient()}
     *
     * @var TikaRestClient
     */
    protected $client = null;

    /**
     * Cache of supported mime types, filled on the first call to {@link supportsMime()}
     *
     * @var array
     */
    protected $supportedMimes = [];

    /**
     * Get the REST client, creating it through the Injector on first use.
     *
     * The resolved server endpoint is passed as the only constructor argument.
     *
     * @return TikaRestClient
     */
    public function getClient()
    {
        if (!$this->client) {
            $this->client = Injector::inst()->createWithArgs(
                TikaRestClient::class,
                [$this->getServerEndpoint()]
            );
        }

        return $this->client;
    }

    /**
     * Resolve the Tika server endpoint.
     *
     * A non-empty SS_TIKA_ENDPOINT environment value takes precedence over the
     * `server_endpoint` config value.
     *
     * @return string
     */
    public function getServerEndpoint()
    {
        $endpoint = Environment::getEnv('SS_TIKA_ENDPOINT');
        if ($endpoint) {
            return $endpoint;
        }

        // Default to configured endpoint
        return $this->config()->get('server_endpoint');
    }

    /**
     * Get the version of Tika installed, as reported by the REST client.
     *
     * NOTE(review): previously documented as "float, or 0 if not installed". The value is
     * compared with version_compare() in {@link isAvailable()}, so a version string is
     * presumably what the client returns - confirm against TikaRestClient::getVersion().
     *
     * @return mixed version of Tika
     */
    public function getVersion()
    {
        return $this->getClient()->getVersion();
    }

    /**
     * Check that an endpoint is configured, the client reports the server as
     * available, and the reported version is at least 1.7.
     *
     * @return boolean
     */
    public function isAvailable()
    {
        if (!$this->getServerEndpoint()) {
            return false;
        }

        if (!$this->getClient()->isAvailable()) {
            return false;
        }

        // version_compare() needs strings; the cast also maps null to ''
        return version_compare((string) $this->getVersion(), '1.7', '>=');
    }

    /**
     * @param string $extension
     * @return boolean
     */
    public function supportsExtension($extension)
    {
        // Determine support via mime type only
        return false;
    }

    /**
     * Check whether the given mime type is one of the types reported by the client,
     * either as a top-level key or as one of an entry's aliases.
     *
     * @param string $mime
     * @return boolean
     */
    public function supportsMime($mime)
    {
        // An empty cache is treated as "not loaded yet" and is fetched again
        if (!$this->supportedMimes) {
            $this->supportedMimes = (array) $this->getClient()->getSupportedMimes();
        }

        // Check if supported (most common / quickest lookup)
        if (isset($this->supportedMimes[$mime])) {
            return true;
        }

        // Check aliases
        foreach ($this->supportedMimes as $info) {
            // Normalise to an array so a scalar alias cannot raise a TypeError in in_array()
            $aliases = isset($info['alias']) ? (array) $info['alias'] : [];
            if (in_array($mime, $aliases, true)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Extract the text content of a file through the Tika REST client.
     *
     * When a File object is given, its path is obtained from getPathFromFile() (defined
     * outside this class; the path is treated as a temporary file here) and that file is
     * removed again afterwards, including when the extraction throws.
     *
     * @param File|string $file File object, or a path that is handed to the client as-is
     * @return mixed Whatever the client's tika() call returns
     */
    public function getContent($file)
    {
        $isFileObject = $file instanceof File;
        $path = $isFileObject ? $this->getPathFromFile($file) : $file;

        try {
            return $this->getClient()->tika($path);
        } finally {
            // Cleanup temp file; only paths created here are removed, never a caller-supplied one
            if ($isFileObject && is_string($path) && $path !== '' && file_exists($path)) {
                unlink($path);
            }
        }
    }
}