diff --git a/docs/en/configuration.md b/docs/en/configuration.md index 97d9d04..a5ce063 100644 --- a/docs/en/configuration.md +++ b/docs/en/configuration.md @@ -36,7 +36,7 @@ SilverStripe\TextExtraction\Cache\FileTextCache\Database: ## XPDF -PDFs require special handling, for example through the [XPDF](http://www.foolabs.com/xpdf/) +PDFs require special handling, for example through the [XPDF](http://www.xpdfreader.com/) commandline utility. Follow their installation instructions, its presence will be automatically detected for \*nix operating systems. You can optionally set the binary path (required for Windows) in `mysite/_config/config.yml`: @@ -101,6 +101,10 @@ class MySolrIndex extends SolrIndex } ``` +Extractors return content with a newline character at the end of each extracted line. If you want to display +this content in HTML, consider wrapping the result in a `nl2br()` call so that the line breaks are preserved in +the rendered output. + Note: This isn't a terribly efficient way to process large amounts of files, since each HTTP request is run synchronously.
diff --git a/src/Extractor/FileTextExtractor.php b/src/Extractor/FileTextExtractor.php index 6d99802..bea066d 100644 --- a/src/Extractor/FileTextExtractor.php +++ b/src/Extractor/FileTextExtractor.php @@ -76,15 +76,23 @@ abstract class FileTextExtractor /** * Given a File object, decide which extractor instance to use to handle it * - * @param File $file + * @param File|string $file * @return FileTextExtractor|null */ - public static function for_file(File $file) + public static function for_file($file) { - if (!$file) { + if (!$file || (is_string($file) && !file_exists($file))) { return null; } + // Ensure we have a File instance to work with + if (is_string($file)) { + /** @var File $fileObject */ + $fileObject = File::create(); + $fileObject->setFromLocalFile($file); + $file = $fileObject; + } + $extension = $file->getExtension(); $mime = $file->getMimeType(); @@ -116,7 +124,7 @@ abstract class FileTextExtractor * @return string * @throws Exception */ - protected function getPathFromFile(File $file) + protected static function getPathFromFile(File $file) { $path = tempnam(TEMP_PATH, 'pdftextextractor_'); if (false === $path) { diff --git a/src/Extractor/TikaServerTextExtractor.php b/src/Extractor/TikaServerTextExtractor.php index e94eeb9..eb9be4b 100644 --- a/src/Extractor/TikaServerTextExtractor.php +++ b/src/Extractor/TikaServerTextExtractor.php @@ -106,7 +106,7 @@ class TikaServerTextExtractor extends FileTextExtractor public function supportsMime($mime) { if (!$this->supportedMimes) { - $this->supportedMimes = $this->getClient()->getSupportedMimes(); + $this->supportedMimes = (array) $this->getClient()->getSupportedMimes(); } // Check if supported (most common / quickest lookup) diff --git a/src/Rest/TikaRestClient.php b/src/Rest/TikaRestClient.php index 34ffbde..2a109a8 100644 --- a/src/Rest/TikaRestClient.php +++ b/src/Rest/TikaRestClient.php @@ -4,7 +4,9 @@ namespace SilverStripe\TextExtraction\Rest; use GuzzleHttp\Client; use 
GuzzleHttp\Exception\RequestException; +use GuzzleHttp\Psr7\Response; use Psr\Log\LoggerInterface; +use SilverStripe\Core\Convert; use SilverStripe\Core\Environment; use SilverStripe\Core\Injector\Injector; @@ -38,6 +40,8 @@ class TikaRestClient extends Client ]; } + $config['base_uri'] = $baseUrl; + parent::__construct($config); } @@ -49,11 +53,10 @@ class TikaRestClient extends Client public function isAvailable() { try { - $result = $this->get(null); - $result->setAuth($this->options['username'], $this->options['password']); - $result->send(); + /** @var Response $result */ + $result = $this->get('/', $this->getGuzzleOptions()); - if ($result->getResponse()->getStatusCode() == 200) { + if ($result->getStatusCode() == 200) { return true; } } catch (RequestException $ex) { @@ -71,14 +74,13 @@ class TikaRestClient extends Client */ public function getVersion() { - $response = $this->get('version'); - $response->setAuth($this->options['username'], $this->options['password']); - $response->send(); + /** @var Response $response */ + $response = $this->get('version', $this->getGuzzleOptions()); $version = 0.0; // Parse output - if ($response->getResponse()->getStatusCode() == 200 && - preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getResponse()->getBody(), $matches) + if ($response->getStatusCode() == 200 + && preg_match('/Apache Tika (?<version>[\.\d]+)/', $response->getBody(), $matches) ) { $version = (float)$matches['version']; } @@ -99,12 +101,14 @@ class TikaRestClient extends Client $response = $this->get( 'mime-types', - array('Accept' => 'application/json') + $this->getGuzzleOptions([ + 'headers' => [ + 'Accept' => 'application/json', + ], + ]) ); - $response->setAuth($this->options['username'], $this->options['password']); - $response->send(); - return $this->mimes = $response->getResponse()->json(); + return $this->mimes = Convert::json2array($response->getBody()); } /** @@ -118,14 +122,17 @@ class TikaRestClient extends Client { $text = null; try { + /** @var
Response $response */ $response = $this->put( 'tika', - ['Accept' => 'text/plain'], - file_get_contents($file) + $this->getGuzzleOptions([ + 'headers' => [ + 'Accept' => 'text/plain', + ], + 'body' => file_get_contents($file), + ]) ); - $response->setAuth($this->options['username'], $this->options['password']); - $response->send(); - $text = $response->getResponse()->getBody(true); + $text = $response->getBody(); } catch (RequestException $e) { $msg = sprintf( 'TikaRestClient was not able to process %s. Response: %s %s.', @@ -134,7 +141,7 @@ class TikaRestClient extends Client $e->getResponse()->getReasonPhrase() ); // Only available if tika-server was started with --includeStack - $body = $e->getResponse()->getBody(true); + $body = $e->getResponse()->getBody(); if ($body) { $msg .= ' Body: ' . $body; } @@ -144,4 +151,21 @@ class TikaRestClient extends Client return $text; } + + /** + * Assembles an array of request options to pass to Guzzle + * + * @param array $options Authentication (etc) will be merged into this array and returned + * @return array + */ + protected function getGuzzleOptions($options = []) + { + if (!empty($this->options['username']) && !empty($this->options['password'])) { + $options['auth'] = [ + $this->options['username'], + $this->options['password'] + ]; + } + return $options; + } } diff --git a/tests/TikaServerTextExtractor.php b/tests/TikaServerTextExtractorTest.php similarity index 100% rename from tests/TikaServerTextExtractor.php rename to tests/TikaServerTextExtractorTest.php