mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 11:06:00 +02:00
Merge pull request #27 from tractorcow/pulls/pdfpaths
API Whitelist bin paths for pdftotext
This commit is contained in:
commit
bde4cf4536
@ -7,12 +7,31 @@
|
|||||||
*/
|
*/
|
||||||
class PDFTextExtractor extends FileTextExtractor
|
class PDFTextExtractor extends FileTextExtractor
|
||||||
{
|
{
|
||||||
|
/**
|
||||||
|
* Set to bin path this extractor can execute
|
||||||
|
*
|
||||||
|
* @var string
|
||||||
|
*/
|
||||||
|
private static $binary_location = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Used if binary_location isn't set.
|
||||||
|
* List of locations to search for a given binary in
|
||||||
|
*
|
||||||
|
* @config
|
||||||
|
* @var array
|
||||||
|
*/
|
||||||
|
private static $search_binary_locations = array(
|
||||||
|
'/usr/bin',
|
||||||
|
'/usr/local/bin',
|
||||||
|
);
|
||||||
|
|
||||||
public function isAvailable()
|
public function isAvailable()
|
||||||
{
|
{
|
||||||
$bin = $this->bin('pdftotext');
|
$bin = $this->bin('pdftotext');
|
||||||
return (file_exists($bin) && is_executable($bin));
|
return $bin && file_exists($bin) && is_executable($bin);
|
||||||
}
|
}
|
||||||
|
|
||||||
public function supportsExtension($extension)
|
public function supportsExtension($extension)
|
||||||
{
|
{
|
||||||
return strtolower($extension) === 'pdf';
|
return strtolower($extension) === 'pdf';
|
||||||
@ -34,26 +53,30 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
/**
|
/**
|
||||||
* Accessor to get the location of the binary
|
* Accessor to get the location of the binary
|
||||||
*
|
*
|
||||||
* @param string $prog Name of binary
|
* @param string $program Name of binary
|
||||||
* @return string
|
* @return string
|
||||||
*/
|
*/
|
||||||
protected function bin($prog = '')
|
protected function bin($program = '')
|
||||||
{
|
{
|
||||||
if ($this->config()->binary_location) {
|
// Get list of allowed search paths
|
||||||
// By config
|
if ($location = $this->config()->binary_location) {
|
||||||
$path = $this->config()->binary_location;
|
$locations = array($location);
|
||||||
} elseif (file_exists('/usr/bin/pdftotext')) {
|
|
||||||
// By searching common directories
|
|
||||||
$path = '/usr/bin';
|
|
||||||
} elseif (file_exists('/usr/local/bin/pdftotext')) {
|
|
||||||
$path = '/usr/local/bin';
|
|
||||||
} else {
|
} else {
|
||||||
$path = '.'; // Hope it's in path
|
$locations = $this->config()->search_binary_locations;
|
||||||
}
|
}
|
||||||
|
|
||||||
return ($path ? $path . '/' : '') . $prog;
|
// Find program in each path
|
||||||
|
foreach($locations as $location) {
|
||||||
|
$path = "{$location}/{$program}";
|
||||||
|
if(file_exists($path)) {
|
||||||
|
return $path;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Not found
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getContent($path)
|
public function getContent($path)
|
||||||
{
|
{
|
||||||
if (!$path) {
|
if (!$path) {
|
||||||
@ -72,6 +95,9 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
*/
|
*/
|
||||||
protected function getRawOutput($path)
|
protected function getRawOutput($path)
|
||||||
{
|
{
|
||||||
|
if(!$this->isAvailable()) {
|
||||||
|
throw new FileTextExtractor_Exception("getRawOutput called on unavailable extractor");
|
||||||
|
}
|
||||||
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
|
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
|
||||||
if ($err) {
|
if ($err) {
|
||||||
throw new FileTextExtractor_Exception(sprintf(
|
throw new FileTextExtractor_Exception(sprintf(
|
||||||
|
@ -5,7 +5,10 @@ class PDFTextExtractorTest extends SapphireTest
|
|||||||
{
|
{
|
||||||
$extractor = new PDFTextExtractor();
|
$extractor = new PDFTextExtractor();
|
||||||
if (!$extractor->isAvailable()) {
|
if (!$extractor->isAvailable()) {
|
||||||
$this->markTestSkipped('pdftotext not available');
|
$this->setExpectedException(
|
||||||
|
'FileTextExtractor_Exception',
|
||||||
|
'getRawOutput called on unavailable extractor'
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf');
|
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.pdf');
|
||||||
|
Loading…
Reference in New Issue
Block a user