2015-02-18 03:31:38 +01:00
|
|
|
<?php
|
|
|
|
|
2017-12-20 22:24:39 +01:00
|
|
|
namespace SilverStripe\TextExtraction\Extractor;
|
|
|
|
|
2018-07-03 05:55:02 +02:00
|
|
|
use SilverStripe\Assets\File;
|
|
|
|
|
2015-02-18 03:31:38 +01:00
|
|
|
/**
 * Enables text extraction of file content via the Tika CLI
 *
 * Every operation shells out to a `tika` executable, which therefore has to be
 * resolvable on the PATH of the user running PHP.
 *
 * {@link http://tika.apache.org/1.7/gettingstarted.html}
 */
class TikaTextExtractor extends FileTextExtractor
{
    /**
     * Text extraction mode. Defaults to -t (plain text)
     *
     * The value is inserted into the tika command line as-is (it is not shell
     * escaped), so it must only ever come from trusted configuration.
     *
     * @var string
     * @config
     */
    private static $output_mode = '-t';

    /**
     * Get the version of tika installed, or 0 if not installed
     *
     * The version is taken verbatim from the "Apache Tika x.y[.z]" banner printed
     * by `tika --version`, so a successful lookup yields a string such as "1.7".
     *
     * @return string|int Version string reported by tika, or int 0 when the command
     *                    fails or its output cannot be parsed
     */
    public function getVersion()
    {
        $code = $this->runShell('tika --version', $stdout);

        // Parse output
        if (!$code && preg_match('/Apache Tika (?<version>[\.\d]+)/', (string)$stdout, $matches)) {
            return $matches['version'];
        }

        return 0;
    }

    /**
     * Runs a shell command and captures its output
     *
     * The command string is handed to proc_open() unchanged: no escaping is applied
     * here, so callers must escape every argument themselves (e.g. escapeshellarg()).
     *
     * @param string $command Full command including arguments, already escaped
     * @param string &$stdout Standard output of the command (always reset to a string)
     * @param string &$stderr Standard error of the command (always reset to a string)
     * @param string $input Content to pass via standard input
     * @return int Exit code. 0 is success, 255 if the process could not be started
     */
    protected function runShell($command, &$stdout = '', &$stderr = '', $input = '')
    {
        // Guarantee string results for the by-reference outputs on every code path,
        // including the early return below
        $stdout = '';
        $stderr = '';

        // Collect stderr in a temporary file instead of a pipe when possible. Reading
        // stdout to EOF while the child is blocked writing to a full stderr pipe would
        // hang both processes; a file never fills up, so stdout can be drained safely.
        $stderrFile = tmpfile();
        $useStderrFile = is_resource($stderrFile);

        $descriptorSpecs = [
            0 => ['pipe', 'r'],
            1 => ['pipe', 'w'],
            2 => $useStderrFile ? $stderrFile : ['pipe', 'w'],
        ];

        // Invoke command
        $pipes = [];
        $proc = proc_open($command, $descriptorSpecs, $pipes);

        if (!is_resource($proc)) {
            if ($useStderrFile) {
                fclose($stderrFile);
            }
            return 255;
        }

        // Send content as input, then close stdin so the child sees EOF
        fwrite($pipes[0], $input);
        fclose($pipes[0]);

        // Get output
        $stdout = (string)stream_get_contents($pipes[1]);
        fclose($pipes[1]);

        if (!$useStderrFile) {
            // No temporary file available: fall back to reading stderr from its pipe
            $stderr = (string)stream_get_contents($pipes[2]);
            fclose($pipes[2]);

            return proc_close($proc);
        }

        // Wait for the process to end before reading the file, so that everything it
        // wrote to stderr is present
        $exitCode = proc_close($proc);

        // The file position is at the end of what the child wrote; go back to the start
        rewind($stderrFile);
        $stderr = (string)stream_get_contents($stderrFile);
        fclose($stderrFile);

        return $exitCode;
    }

    /**
     * Extracts the content of a file by running it through the tika CLI using the
     * configured output mode
     *
     * @param File|string $file A File record (resolved to a path via getPathFromFile())
     *                          or anything else, which is used directly as the path
     * @return string|null Output produced by tika, or null if tika exited with a
     *                     non-zero code
     */
    public function getContent($file)
    {
        $mode = $this->config()->get('output_mode');
        $path = $file instanceof File ? $this->getPathFromFile($file) : $file;

        // Only the path is escaped; the mode comes from configuration (see $output_mode)
        $command = sprintf('tika %s %s', $mode, escapeshellarg($path));
        $code = $this->runShell($command, $output);

        if ($code != 0) {
            // Extraction failed: report "no content" explicitly
            return null;
        }

        return $output;
    }

    /**
     * Tika counts as available when a version could be detected
     *
     * @return bool
     */
    public function isAvailable()
    {
        return $this->getVersion() > 0;
    }

    /**
     * File extensions are never used to decide support
     *
     * @param string $extension
     * @return bool Always false
     */
    public function supportsExtension($extension)
    {
        // Determine support via mime type only
        return false;
    }

    /**
     * Checks whether a mime type appears in the output of `tika --list-supported-types`
     *
     * The type has to appear as a complete token: preceded by whitespace (or the start
     * of the output) and followed by whitespace, a ";" or the end of the output. This
     * keeps e.g. "application/xml" from matching inside "application/xml-dtd".
     *
     * @param string $mime
     * @return bool False for an empty mime type or when the tika command fails
     */
    public function supportsMime($mime)
    {
        // An empty type would build a pattern that matches any output at all
        $mime = trim((string)$mime);
        if ($mime === '') {
            return false;
        }

        // Get list of supported mime types
        $code = $this->runShell('tika --list-supported-types', $supportedTypes, $error);

        if ($code) {
            // Error case
            return false;
        }

        // Check if the mime type is inside the result as a whole token
        $pattern = sprintf('/(?<!\S)%s(?![^\s;])/', preg_quote($mime, '/'));

        return (bool)preg_match($pattern, (string)$supportedTypes);
    }
}
|