Merge pull request #27 from tractorcow/pulls/pdfpaths

API Whitelist bin paths for pdftotext
This commit is contained in:
Hamish Friedlander 2016-02-25 16:45:56 +13:00
commit bde4cf4536
2 changed files with 45 additions and 16 deletions

View File

@ -7,12 +7,31 @@
*/ */
class PDFTextExtractor extends FileTextExtractor class PDFTextExtractor extends FileTextExtractor
{ {
/**
* Set to bin path this extractor can execute
*
* @var string
*/
private static $binary_location = null;
/**
* Used if binary_location isn't set.
* List of locations to search for a given binary in
*
* @config
* @var array
*/
private static $search_binary_locations = array(
'/usr/bin',
'/usr/local/bin',
);
public function isAvailable() public function isAvailable()
{ {
$bin = $this->bin('pdftotext'); $bin = $this->bin('pdftotext');
return (file_exists($bin) && is_executable($bin)); return $bin && file_exists($bin) && is_executable($bin);
} }
public function supportsExtension($extension) public function supportsExtension($extension)
{ {
return strtolower($extension) === 'pdf'; return strtolower($extension) === 'pdf';
@ -34,26 +53,30 @@ class PDFTextExtractor extends FileTextExtractor
/** /**
* Accessor to get the location of the binary * Accessor to get the location of the binary
* *
* @param string $prog Name of binary * @param string $program Name of binary
* @return string * @return string
*/ */
protected function bin($prog = '') protected function bin($program = '')
{ {
if ($this->config()->binary_location) { // Get list of allowed search paths
// By config if ($location = $this->config()->binary_location) {
$path = $this->config()->binary_location; $locations = array($location);
} elseif (file_exists('/usr/bin/pdftotext')) {
// By searching common directories
$path = '/usr/bin';
} elseif (file_exists('/usr/local/bin/pdftotext')) {
$path = '/usr/local/bin';
} else { } else {
$path = '.'; // Hope it's in path $locations = $this->config()->search_binary_locations;
} }
return ($path ? $path . '/' : '') . $prog; // Find program in each path
foreach($locations as $location) {
$path = "{$location}/{$program}";
if(file_exists($path)) {
return $path;
}
}
// Not found
return null;
} }
public function getContent($path) public function getContent($path)
{ {
if (!$path) { if (!$path) {
@ -72,6 +95,9 @@ class PDFTextExtractor extends FileTextExtractor
*/ */
protected function getRawOutput($path) protected function getRawOutput($path)
{ {
if(!$this->isAvailable()) {
throw new FileTextExtractor_Exception("getRawOutput called on unavailable extractor");
}
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err); exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
if ($err) { if ($err) {
throw new FileTextExtractor_Exception(sprintf( throw new FileTextExtractor_Exception(sprintf(

View File

@ -5,7 +5,10 @@ class PDFTextExtractorTest extends SapphireTest
{ {
$extractor = new PDFTextExtractor(); $extractor = new PDFTextExtractor();
if (!$extractor->isAvailable()) { if (!$extractor->isAvailable()) {
$this->markTestSkipped('pdftotext not available'); $this->setExpectedException(
'FileTextExtractor_Exception',
'getRawOutput called on unavailable extractor'
);
} }
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf'); $content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf');