108 lines
2.7 KiB
PHP
Raw Normal View History

2012-08-22 17:52:08 +02:00
<?php
/**
 * Text extractor that calls pdftotext to do the conversion.
 *
 * Shells out to the `pdftotext` command-line program, captures the text it
 * prints and replaces typographic ligatures with their plain-letter
 * equivalents.
 *
 * @author mstephens
 */
class PDFTextExtractor extends FileTextExtractor
{
    /**
     * Check whether the pdftotext binary can be used.
     *
     * @return bool True when the resolved binary path exists and is executable
     */
    public function isAvailable()
    {
        $bin = $this->bin('pdftotext');

        return (file_exists($bin) && is_executable($bin));
    }

    /**
     * Check whether the given file extension is handled by this extractor.
     *
     * @param string $extension File extension without the dot; compared case-insensitively
     * @return bool
     */
    public function supportsExtension($extension)
    {
        return strtolower($extension) === 'pdf';
    }

    /**
     * Check whether the given mime type is handled by this extractor.
     *
     * @param string $mime Mime type; compared case-insensitively
     * @return bool
     */
    public function supportsMime($mime)
    {
        $supported = array(
            'application/pdf',
            'application/x-pdf',
            'application/x-bzpdf',
            'application/x-gzpdf',
        );

        // Strict comparison: both sides are always strings here.
        return in_array(strtolower($mime), $supported, true);
    }

    /**
     * Accessor to get the location of the binary
     *
     * The directory is taken from the `binary_location` config value when it
     * is set; otherwise the first of /usr/bin and /usr/local/bin that contains
     * a `pdftotext` file is used.
     *
     * @param string $prog Name of binary
     * @return string Directory joined to $prog with a slash
     */
    protected function bin($prog = '')
    {
        $configured = $this->config()->binary_location;

        if ($configured) {
            // By config
            $path = $configured;
        } elseif (file_exists('/usr/bin/pdftotext')) {
            // By searching common directories
            $path = '/usr/bin';
        } elseif (file_exists('/usr/local/bin/pdftotext')) {
            $path = '/usr/local/bin';
        } else {
            // Last resort: the current working directory.
            // NOTE(review): './<prog>' is resolved relative to the working
            // directory; it is not looked up on PATH. Confirm this fallback
            // is what is intended.
            $path = '.';
        }

        return $path . '/' . $prog;
    }

    /**
     * Extract the text content of a PDF file.
     *
     * @param string $path Path to the PDF file
     * @return string Extracted text with ligatures replaced, or an empty
     *                string when no path is given
     * @throws FileTextExtractor_Exception When pdftotext reports a failure
     */
    public function getContent($path)
    {
        if (!$path) {
            // no file
            return "";
        }

        $content = $this->getRawOutput($path);

        return $this->cleanupLigatures($content);
    }

    /**
     * Invoke pdftotext with the given path
     *
     * The trailing `-` asks pdftotext to write the text to stdout, and stderr
     * is redirected into stdout so that diagnostics are captured as well.
     *
     * @param string $path
     * @return string Output lines joined with PHP_EOL
     * @throws FileTextExtractor_Exception When pdftotext exits with a non-zero status
     */
    protected function getRawOutput($path)
    {
        $command = sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path));

        $output = array();
        $exitCode = 0;
        exec($command, $output, $exitCode);

        if ($exitCode) {
            // exec() reports the exit status as an integer, so the readable
            // error text has to come from the captured output (which includes
            // stderr because of the redirect above).
            throw new FileTextExtractor_Exception(sprintf(
                'PDFTextExtractor->getContent() failed for %s: %s',
                $path,
                implode(PHP_EOL, $output)
            ));
        }

        // exec() strips the line terminator from every captured line; joining
        // with a newline keeps the last word of one line from fusing with the
        // first word of the next.
        return implode(PHP_EOL, $output);
    }

    /**
     * Removes utf-8 ligatures.
     *
     * Replaces the Latin ligature code points U+FB00 to U+FB06 with the plain
     * letters they stand for.
     *
     * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
     *
     * @param string $input
     * @return string
     */
    protected function cleanupLigatures($input)
    {
        // Keys are written as explicit UTF-8 byte sequences so the table does
        // not depend on how this source file is encoded or normalised.
        $mapping = array(
            "\xEF\xAC\x80" => 'ff',  // U+FB00 LATIN SMALL LIGATURE FF
            "\xEF\xAC\x81" => 'fi',  // U+FB01 LATIN SMALL LIGATURE FI
            "\xEF\xAC\x82" => 'fl',  // U+FB02 LATIN SMALL LIGATURE FL
            "\xEF\xAC\x83" => 'ffi', // U+FB03 LATIN SMALL LIGATURE FFI
            "\xEF\xAC\x84" => 'ffl', // U+FB04 LATIN SMALL LIGATURE FFL
            // The long s is a form of the letter "s", so this ligature reads
            // "st" even though its glyph resembles "ft".
            "\xEF\xAC\x85" => 'st',  // U+FB05 LATIN SMALL LIGATURE LONG S T
            "\xEF\xAC\x86" => 'st',  // U+FB06 LATIN SMALL LIGATURE ST
        );

        return str_replace(array_keys($mapping), array_values($mapping), $input);
    }
}