2012-08-22 17:52:08 +02:00
|
|
|
<?php
|
|
|
|
|
|
|
|
/**
 * Text extractor that shells out to the `pdftotext` command-line tool to
 * convert PDF files into plain text.
 *
 * @author mstephens
 */
class PDFTextExtractor extends FileTextExtractor {

    /**
     * Reports whether the pdftotext binary resolved by {@link bin()} exists
     * on disk and is executable.
     *
     * @return boolean
     */
    public function isAvailable() {
        $binary = $this->bin('pdftotext');

        return file_exists($binary) && is_executable($binary);
    }

    /**
     * Checks whether the given file extension is handled by this extractor.
     * The comparison is case-insensitive.
     *
     * @param string $extension Extension without a leading dot
     * @return boolean
     */
    public function supportsExtension($extension) {
        return strtolower($extension) === 'pdf';
    }

    /**
     * Checks whether the given MIME type is handled by this extractor.
     * The comparison is case-insensitive.
     *
     * @param string $mime
     * @return boolean
     */
    public function supportsMime($mime) {
        $supported = array(
            'application/pdf',
            'application/x-pdf',
            'application/x-bzpdf',
            'application/x-gzpdf'
        );

        // Both sides are always strings, so the strict flag does not change the result.
        return in_array(strtolower($mime), $supported, true);
    }

    /**
     * Accessor to get the location of the binary.
     *
     * The directory is resolved in this order:
     *  1. the `binary_location` config value, if set;
     *  2. /usr/bin, if /usr/bin/pdftotext exists;
     *  3. /usr/local/bin, if /usr/local/bin/pdftotext exists;
     *  4. the current working directory ('.').
     *
     * Note that the directory probes in steps 2 and 3 always look for
     * `pdftotext`, whatever $prog is.
     *
     * @param string $prog Name of binary
     * @return string Directory and program name joined with '/'
     */
    protected function bin($prog = '') {
        $configured = $this->config()->binary_location;

        if ($configured) {
            // By config
            $dir = $configured;
        } elseif (file_exists('/usr/bin/pdftotext')) {
            // By searching common directories
            $dir = '/usr/bin';
        } elseif (file_exists('/usr/local/bin/pdftotext')) {
            $dir = '/usr/local/bin';
        } else {
            // Last resort: "./<prog>" is resolved against the current working
            // directory; the shell does NOT search $PATH for it.
            $dir = '.';
        }

        return $dir . '/' . $prog;
    }

    /**
     * Extracts the text content of the PDF at the given path, with
     * typographic ligatures expanded to plain letters.
     *
     * @param string $path Path of the PDF file
     * @return string Extracted text, or an empty string when no path is given
     * @throws FileTextExtractor_Exception When pdftotext exits with an error
     */
    public function getContent($path) {
        if (!$path) {
            return ""; // no file
        }

        return $this->cleanupLigatures($this->getRawOutput($path));
    }

    /**
     * Invoke pdftotext with the given path.
     *
     * The text is written to stdout ("-" as the output file) and stderr is
     * merged into the same stream, so on failure the captured output holds
     * pdftotext's diagnostics.
     *
     * @param string $path
     * @return string Output, one entry per line joined with PHP_EOL
     * @throws FileTextExtractor_Exception When the command exits with a non-zero status
     */
    protected function getRawOutput($path) {
        $command = sprintf(
            '%s %s - 2>&1',
            $this->bin('pdftotext'),
            escapeshellarg($path)
        );

        $lines = array();
        $exitCode = 0;
        exec($command, $lines, $exitCode);

        if ($exitCode !== 0) {
            // exec() reports the exit status as an integer; the human-readable
            // reason is in the captured output because of the 2>&1 redirect.
            $detail = $lines
                ? implode(PHP_EOL, $lines)
                : 'exit code ' . $exitCode;

            throw new FileTextExtractor_Exception(sprintf(
                'PDFTextExtractor->getContent() failed for %s: %s',
                $path,
                $detail
            ));
        }

        // exec() strips the trailing newline from every line, so the lines must
        // be re-joined with a separator or adjacent words would run together.
        return implode(PHP_EOL, $lines);
    }

    /**
     * Removes utf-8 ligatures by expanding each one to its component letters.
     *
     * The ligatures are spelled as raw UTF-8 byte sequences so the mapping
     * cannot be silently normalised away when this file is re-encoded.
     *
     * @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
     *
     * @param string $input
     * @return string
     */
    protected function cleanupLigatures($input) {
        $mapping = array(
            "\xEF\xAC\x80" => 'ff',  // U+FB00 LATIN SMALL LIGATURE FF
            "\xEF\xAC\x81" => 'fi',  // U+FB01 LATIN SMALL LIGATURE FI
            "\xEF\xAC\x82" => 'fl',  // U+FB02 LATIN SMALL LIGATURE FL
            "\xEF\xAC\x83" => 'ffi', // U+FB03 LATIN SMALL LIGATURE FFI
            "\xEF\xAC\x84" => 'ffl', // U+FB04 LATIN SMALL LIGATURE FFL
            "\xEF\xAC\x85" => 'st',  // U+FB05 LATIN SMALL LIGATURE LONG S T (long s is an "s", not an "f")
            "\xEF\xAC\x86" => 'st'   // U+FB06 LATIN SMALL LIGATURE ST
        );

        return str_replace(array_keys($mapping), array_values($mapping), $input);
    }
}
|