From f72ba3a9785c8add1fc7a51cf46b435dd6e0e45c Mon Sep 17 00:00:00 2001
From: Damian Mooyman
Date: Thu, 25 Feb 2016 16:28:36 +1300
Subject: [PATCH] API Whitelist bin paths for pdftotext

---
Notes (commentary only; this section is not part of the commit message):

* PDFTextExtractor::bin() now looks for the program only in the configured
  `binary_location`, or, when that is not set, in each directory listed in
  `search_binary_locations` (default: /usr/bin and /usr/local/bin). It
  returns null when the program is not found there; the previous fallback
  to './<program>' ("Hope it's in path") is removed.
* PDFTextExtractor::isAvailable() checks for that null before calling
  file_exists() / is_executable().
* PDFTextExtractor::getRawOutput() throws FileTextExtractor_Exception
  ("getRawOutput called on unavailable extractor") when isAvailable() is
  false, before exec() is reached.
* PDFTextExtractorTest no longer skips when pdftotext is unavailable; it
  expects the exception above instead.

 code/extractors/PDFTextExtractor.php | 56 ++++++++++++++++++++--------
 tests/PDFTextExtractorTest.php       |  5 ++-
 2 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/code/extractors/PDFTextExtractor.php b/code/extractors/PDFTextExtractor.php
index 2e96f35..0fdaf16 100644
--- a/code/extractors/PDFTextExtractor.php
+++ b/code/extractors/PDFTextExtractor.php
@@ -7,12 +7,31 @@
  */
 class PDFTextExtractor extends FileTextExtractor
 {
+    /**
+     * Set to bin path this extractor can execute
+     *
+     * @var string
+     */
+    private static $binary_location = null;
+
+    /**
+     * Used if binary_location isn't set.
+     * List of locations to search for a given binary in
+     *
+     * @config
+     * @var array
+     */
+    private static $search_binary_locations = array(
+        '/usr/bin',
+        '/usr/local/bin',
+    );
+
     public function isAvailable()
     {
         $bin = $this->bin('pdftotext');
-        return (file_exists($bin) && is_executable($bin));
+        return $bin && file_exists($bin) && is_executable($bin);
     }
-    
+
     public function supportsExtension($extension)
     {
         return strtolower($extension) === 'pdf';
@@ -34,26 +53,30 @@ class PDFTextExtractor extends FileTextExtractor
     /**
      * Accessor to get the location of the binary
      *
-     * @param string $prog Name of binary
+     * @param string $program Name of binary
      * @return string
      */
-    protected function bin($prog = '')
+    protected function bin($program = '')
     {
-        if ($this->config()->binary_location) {
-            // By config
-            $path = $this->config()->binary_location;
-        } elseif (file_exists('/usr/bin/pdftotext')) {
-            // By searching common directories
-            $path = '/usr/bin';
-        } elseif (file_exists('/usr/local/bin/pdftotext')) {
-            $path = '/usr/local/bin';
+        // Get list of allowed search paths
+        if ($location = $this->config()->binary_location) {
+            $locations = array($location);
         } else {
-            $path = '.'; // Hope it's in path
+            $locations = $this->config()->search_binary_locations;
         }
 
-        return ($path ? $path . '/' : '') . $prog;
+        // Find program in each path
+        foreach($locations as $location) {
+            $path = "{$location}/{$program}";
+            if(file_exists($path)) {
+                return $path;
+            }
+        }
+
+        // Not found
+        return null;
     }
-    
+
     public function getContent($path)
     {
         if (!$path) {
@@ -72,6 +95,9 @@ class PDFTextExtractor extends FileTextExtractor
      */
     protected function getRawOutput($path)
     {
+        if(!$this->isAvailable()) {
+            throw new FileTextExtractor_Exception("getRawOutput called on unavailable extractor");
+        }
         exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
         if ($err) {
             throw new FileTextExtractor_Exception(sprintf(
diff --git a/tests/PDFTextExtractorTest.php b/tests/PDFTextExtractorTest.php
index b99ff06..96ad1ec 100644
--- a/tests/PDFTextExtractorTest.php
+++ b/tests/PDFTextExtractorTest.php
@@ -5,7 +5,10 @@ class PDFTextExtractorTest extends SapphireTest
     {
         $extractor = new PDFTextExtractor();
         if (!$extractor->isAvailable()) {
-            $this->markTestSkipped('pdftotext not available');
+            $this->setExpectedException(
+                'FileTextExtractor_Exception',
+                'getRawOutput called on unavailable extractor'
+            );
         }
 
         $content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf');