2012-08-22 17:52:08 +02:00
|
|
|
<?php
|
|
|
|
|
2017-12-20 22:24:39 +01:00
|
|
|
namespace SilverStripe\TextExtraction\Extractor;
|
|
|
|
|
2018-07-03 05:55:02 +02:00
|
|
|
use SilverStripe\Assets\File;
|
2018-07-03 01:23:27 +02:00
|
|
|
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
|
2017-12-20 22:24:39 +01:00
|
|
|
|
2012-08-22 17:52:08 +02:00
|
|
|
/**
 * Text extractor that calls pdftotext to do the conversion.
 *
 * The location of the pdftotext binary is resolved through the
 * `binary_location` / `search_binary_locations` configuration values.
 *
 * @author mstephens
 */
class PDFTextExtractor extends FileTextExtractor
{
    /**
     * Set to bin path this extractor can execute.
     *
     * When set, {@see bin()} looks for the program in this location only and
     * ignores `search_binary_locations`.
     *
     * @config
     * @var string|null
     */
    private static $binary_location = null;

    /**
     * Used if binary_location isn't set.
     * List of locations to search for a given binary in
     *
     * @config
     * @var array
     */
    private static $search_binary_locations = [
        '/usr/bin',
        '/usr/local/bin',
    ];

    /**
     * Checks whether the pdftotext binary can be found and executed.
     *
     * @return bool
     */
    public function isAvailable()
    {
        $bin = $this->bin('pdftotext');

        return $bin && file_exists($bin) && is_executable($bin);
    }

    /**
     * Checks whether the given file extension is handled by this extractor.
     *
     * @param string $extension File extension without leading dot; compared case-insensitively
     * @return bool True only for "pdf"
     */
    public function supportsExtension($extension)
    {
        return strtolower($extension) === 'pdf';
    }

    /**
     * Checks whether the given mime type is handled by this extractor.
     *
     * @param string $mime Mime type; compared case-insensitively
     * @return bool
     */
    public function supportsMime($mime)
    {
        return in_array(
            strtolower($mime),
            [
                'application/pdf',
                'application/x-pdf',
                'application/x-bzpdf',
                'application/x-gzpdf',
            ],
            // Strict comparison: both sides are strings, no type juggling wanted
            true
        );
    }

    /**
     * Accessor to get the location of the binary
     *
     * Looks for `$program` (and, failing that, `$program.exe`) in each
     * configured location and returns the first path that exists.
     *
     * @param string $program Name of binary
     * @return string|null Path to the binary, or null if it was not found
     */
    protected function bin($program = '')
    {
        // Get list of allowed search paths
        if ($location = $this->config()->get('binary_location')) {
            $locations = [$location];
        } else {
            // Cast guards the loop below against a null or scalar config value
            $locations = (array) $this->config()->get('search_binary_locations');
        }

        // Find program in each path
        foreach ($locations as $location) {
            $path = "{$location}/{$program}";
            if (file_exists($path)) {
                return $path;
            }
            if (file_exists($path . '.exe')) {
                return $path . '.exe';
            }
        }

        // Not found
        return null;
    }

    /**
     * Extracts the text content of the given PDF.
     *
     * @param File|string $file File object, or a path to a file on disk
     * @return string Extracted text with ligatures expanded; an empty string when
     *                no file is given or the given path does not exist
     * @throws Exception If the extractor is unavailable or pdftotext fails
     */
    public function getContent($file)
    {
        if (!$file || (is_string($file) && !file_exists($file))) {
            // no file
            return '';
        }

        $content = $this->getRawOutput($file);

        return $this->cleanupLigatures($content);
    }

    /**
     * Invoke pdftotext with the given File object
     *
     * @param File|string $file File object, or a path to a file on disk
     * @return string Output
     * @throws Exception If the extractor is unavailable, or pdftotext exits with a non-zero status
     */
    protected function getRawOutput($file)
    {
        if (!$this->isAvailable()) {
            throw new Exception("getRawOutput called on unavailable extractor");
        }

        $path = $file instanceof File ? $this->getPathFromFile($file) : $file;

        // Both the binary and the file path are escaped, so that locations containing
        // spaces or shell metacharacters cannot break (or inject into) the command.
        // "-" makes pdftotext write the text to stdout. "2>&1" folds stderr into the
        // captured output so it can be reported in the exception below; note that this
        // also means anything pdftotext prints to stderr on a successful run ends up
        // in the returned text.
        $command = sprintf(
            '%s %s - 2>&1',
            escapeshellarg($this->bin('pdftotext')),
            escapeshellarg($path)
        );

        // exec() appends to an existing array, so start from a known-empty one
        $content = [];
        $err = 0;
        exec($command, $content, $err);

        if ($err) {
            throw new Exception(sprintf(
                'PDFTextExtractor->getContent() failed for %s: %s',
                $path,
                implode(PHP_EOL, $content)
            ));
        }

        return implode(PHP_EOL, $content);
    }

    /**
     * Removes utf-8 ligatures.
     *
     * Replaces the Latin ligature code points U+FB00 to U+FB06 with the plain
     * letters they stand for.
     *
     * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
     *
     * @param string $input
     * @return string
     */
    protected function cleanupLigatures($input)
    {
        // Keys are written as UTF-8 byte sequences so they cannot be silently
        // normalised to plain ASCII when the file is re-encoded.
        $mapping = [
            "\xEF\xAC\x80" => 'ff',  // U+FB00 LATIN SMALL LIGATURE FF
            "\xEF\xAC\x81" => 'fi',  // U+FB01 LATIN SMALL LIGATURE FI
            "\xEF\xAC\x82" => 'fl',  // U+FB02 LATIN SMALL LIGATURE FL
            "\xEF\xAC\x83" => 'ffi', // U+FB03 LATIN SMALL LIGATURE FFI
            "\xEF\xAC\x84" => 'ffl', // U+FB04 LATIN SMALL LIGATURE FFL
            "\xEF\xAC\x85" => 'st',  // U+FB05 LATIN SMALL LIGATURE LONG S T (long s reads as "s")
            "\xEF\xAC\x86" => 'st',  // U+FB06 LATIN SMALL LIGATURE ST
        ];

        return str_replace(array_keys($mapping), array_values($mapping), $input);
    }
}
|