silverstripe-textextraction/src/Extractor/PDFTextExtractor.php

147 lines
3.6 KiB
PHP
Raw Normal View History

2012-08-22 17:52:08 +02:00
<?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\Assets\File;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
2012-08-22 17:52:08 +02:00
/**
 * Text extractor that shells out to the `pdftotext` command line tool to
 * convert a PDF into plain text, then replaces typographic ligatures in the
 * result with their plain-letter equivalents.
 *
 * @author mstephens
 */
class PDFTextExtractor extends FileTextExtractor
{
    /**
     * Directory containing the binaries this extractor executes. The program
     * name is appended to this value (see {@link bin()}), so it must be the
     * containing directory rather than the path of the binary itself.
     *
     * @config
     * @var string|null
     */
    private static $binary_location = null;

    /**
     * Used if binary_location isn't set.
     * List of directories to search for a given binary in, in order.
     *
     * @config
     * @var array
     */
    private static $search_binary_locations = [
        '/usr/bin',
        '/usr/local/bin',
    ];

    /**
     * Whether the pdftotext binary can be found and executed.
     *
     * @return bool
     */
    public function isAvailable()
    {
        $bin = $this->bin('pdftotext');

        // is_file() rather than file_exists(): a directory is "executable"
        // too, but is never a usable binary.
        return $bin && is_file($bin) && is_executable($bin);
    }

    /**
     * Whether this extractor handles files with the given extension.
     *
     * @param string $extension File extension without the leading dot (case-insensitive)
     * @return bool
     */
    public function supportsExtension($extension)
    {
        return strtolower((string) $extension) === 'pdf';
    }

    /**
     * Whether this extractor handles files of the given MIME type.
     *
     * @param string $mime MIME type (case-insensitive)
     * @return bool
     */
    public function supportsMime($mime)
    {
        return in_array(
            strtolower((string) $mime),
            [
                'application/pdf',
                'application/x-pdf',
                'application/x-bzpdf',
                'application/x-gzpdf',
            ],
            true
        );
    }

    /**
     * Accessor to get the location of the binary.
     *
     * Looks in the configured binary_location only, if one is set; otherwise
     * in each of search_binary_locations in turn. In every directory the bare
     * program name is tried first, then the name with an ".exe" suffix.
     *
     * @param string $program Name of binary
     * @return string|null Path of the first matching file, or null if none was found
     */
    protected function bin($program = '')
    {
        // Get list of allowed search paths
        $configured = $this->config()->get('binary_location');
        if ($configured) {
            $locations = [$configured];
        } else {
            // Cast guards against the search list being nulled out in config
            $locations = (array) $this->config()->get('search_binary_locations');
        }

        // Find program in each path
        foreach ($locations as $location) {
            $path = "{$location}/{$program}";
            // Only accept regular files: with an empty program name the
            // candidate path is the directory itself.
            foreach ([$path, $path . '.exe'] as $candidate) {
                if (is_file($candidate)) {
                    return $candidate;
                }
            }
        }

        // Not found
        return null;
    }

    /**
     * Extract the text content of a PDF.
     *
     * @param File|string $file File object, or filesystem path of the PDF
     * @return string Extracted text with ligatures replaced; empty string when
     *                no file was given or the given path does not exist
     * @throws Exception When the extractor is unavailable or pdftotext fails
     */
    public function getContent($file)
    {
        if (!$file || (is_string($file) && !file_exists($file))) {
            // no file
            return '';
        }

        $content = $this->getRawOutput($file);

        return $this->cleanupLigatures($content);
    }

    /**
     * Invoke pdftotext with the given File object or path.
     *
     * @param File|string $file
     * @return string Output lines of the command joined with PHP_EOL
     * @throws Exception When the extractor is unavailable, or when the command
     *                   exits with a non-zero status
     */
    protected function getRawOutput($file)
    {
        if (!$this->isAvailable()) {
            throw new Exception("getRawOutput called on unavailable extractor");
        }

        $path = $file instanceof File ? $this->getPathFromFile($file) : $file;

        // Both the binary and the document path are shell-escaped so that
        // locations containing spaces or shell metacharacters cannot break
        // (or alter) the command. The trailing "-" is passed to pdftotext as
        // its output target, and "2>&1" folds stderr into the captured output
        // so diagnostics are available for the exception message below.
        $command = sprintf(
            '%s %s - 2>&1',
            escapeshellarg($this->bin('pdftotext')),
            escapeshellarg($path)
        );

        $content = [];
        $err = 0;
        exec($command, $content, $err);

        if ($err) {
            throw new Exception(sprintf(
                'PDFTextExtractor->getContent() failed for %s: %s',
                $path,
                implode(PHP_EOL, $content)
            ));
        }

        return implode(PHP_EOL, $content);
    }

    /**
     * Removes utf-8 ligatures by replacing each one with the plain letters it
     * stands for.
     *
     * The ligature keys are written as explicit UTF-8 byte sequences so the
     * mapping cannot be silently turned into a no-op if this file is ever
     * re-encoded or normalised.
     *
     * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
     *
     * @param string $input
     * @return string
     */
    protected function cleanupLigatures($input)
    {
        $mapping = [
            "\xEF\xAC\x80" => 'ff',  // U+FB00 LATIN SMALL LIGATURE FF
            "\xEF\xAC\x81" => 'fi',  // U+FB01 LATIN SMALL LIGATURE FI
            "\xEF\xAC\x82" => 'fl',  // U+FB02 LATIN SMALL LIGATURE FL
            "\xEF\xAC\x83" => 'ffi', // U+FB03 LATIN SMALL LIGATURE FFI
            "\xEF\xAC\x84" => 'ffl', // U+FB04 LATIN SMALL LIGATURE FFL
            // The long s is a form of the letter "s", so this ligature
            // decomposes to "st" (not "ft", despite the visual similarity).
            "\xEF\xAC\x85" => 'st',  // U+FB05 LATIN SMALL LIGATURE LONG S T
            "\xEF\xAC\x86" => 'st',  // U+FB06 LATIN SMALL LIGATURE ST
        ];

        return str_replace(array_keys($mapping), array_values($mapping), $input);
    }
}