Damian Mooyman 2977f85cb5 API Implement Tika support
API Implement support for detection via mime-type as well as file extension
API Implement FileContent property for safe usage in templates
API instead of returning the list of extensions / mime types supported, support is determined on a per-file bases
Marking dev-master as version 2.0 as this contains breaking changes
2015-02-20 15:12:20 +13:00

99 lines
2.2 KiB
PHP

<?php
/**
* Text extractor that calls pdftotext to do the conversion.
* @author mstephens
*
*/
class PDFTextExtractor extends FileTextExtractor {
public function isAvailable() {
$bin = $this->bin('pdftotext');
return (file_exists($bin) && is_executable($bin));
}
public function supportsExtension($extension) {
return strtolower($extension) === 'pdf';
}
public function supportsMime($mime) {
return in_array(
strtolower($mime),
array(
'application/pdf',
'application/x-pdf',
'application/x-bzpdf',
'application/x-gzpdf'
)
);
}
/**
* Accessor to get the location of the binary
*
* @param string $prog Name of binary
* @return string
*/
protected function bin($prog = '') {
if ($this->config()->binary_location) {
// By config
$path = $this->config()->binary_location;
} elseif (file_exists('/usr/bin/pdftotext')) {
// By searching common directories
$path = '/usr/bin';
} elseif (file_exists('/usr/local/bin/pdftotext')) {
$path = '/usr/local/bin';
} else {
$path = '.'; // Hope it's in path
}
return ( $path ? $path . '/' : '' ) . $prog;
}
public function getContent($path) {
if(!$path) return ""; // no file
$content = $this->getRawOutput($path);
return $this->cleanupLigatures($content);
}
/**
* Invoke pdftotext with the given path
*
* @param string $path
* @return string Output
* @throws FileTextExtractor_Exception
*/
protected function getRawOutput($path) {
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
if($err) {
throw new FileTextExtractor_Exception(sprintf(
'PDFTextExtractor->getContent() failed for %s: %s',
$path,
implode('', $err)
));
}
return implode('', $content);
}
/**
* Removes utf-8 ligatures.
*
* @link http://en.wikipedia.org/wiki/Typographic_ligature#Computer_typesetting
*
* @param string $input
* @return string
*/
protected function cleanupLigatures($input) {
$mapping = array(
'ff' => 'ff',
'fi' => 'fi',
'fl' => 'fl',
'ffi' => 'ffi',
'ffl' => 'ffl',
'ſt' => 'ft',
'st' => 'st'
);
return str_replace(array_keys($mapping), array_values($mapping), $input);
}
}