mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 09:06:00 +00:00
Merge pull request #9 from chillu/pulls/tika-logging
Improved Tika error logging
This commit is contained in:
commit
6cf09f26c8
12
README.md
12
README.md
@ -146,9 +146,9 @@ each HTTP request is run synchronously.
|
||||
|
||||
### Tika
|
||||
|
||||
Support for Apache Tika (1.7 and above) is included. This can be run in one of two ways: Server or CLI.
|
||||
Support for Apache Tika (1.8 and above) is included. This can be run in one of two ways: Server or CLI.
|
||||
|
||||
See [the Apache Tika home page](http://tika.apache.org/1.7/index.html) for instructions on installing and
|
||||
See [the Apache Tika home page](http://tika.apache.org/1.8/index.html) for instructions on installing and
|
||||
configuring this. Download the latest `tika-app` for running as a CLI script, or `tika-server` if you're planning
|
||||
to have it running constantly in the background. Starting tika as a CLI script for every extraction request
|
||||
is fairly slow, so we recommend running it as a server.
|
||||
@ -186,6 +186,14 @@ java -jar tika-server-1.8.jar --host=localhost --port=9998
|
||||
While you can run `tika-app-1.8.jar` in server mode as well (with the `--server` flag),
|
||||
it behaves differently and is not recommended.
|
||||
|
||||
The module will log extraction errors with `SS_Log::NOTICE` priority by default,
|
||||
for example a "422 Unprocessable Entity" HTTP response for an encrypted PDF.
|
||||
In case you want more information on why processing failed, you can increase
|
||||
the logging verbosity in the tika server instance by passing through
|
||||
a `--includeStack` flag. Logs can be passed on to files or external logging services,
|
||||
see [error handling](http://doc.silverstripe.org/en/developer_guides/debugging/error_handling)
|
||||
documentation for SilverStripe core.
|
||||
|
||||
## Usage
|
||||
|
||||
Manual extraction:
|
||||
|
@ -56,19 +56,39 @@ class TikaRestClient extends Client {
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract text content from a given file
|
||||
* Extract text content from a given file.
|
||||
* Logs a notice-level error if the document can't be parsed.
|
||||
*
|
||||
* @param string $file Full filesystem path to a file to post
|
||||
* @return string Content of the file extracted as plain text
|
||||
*/
|
||||
public function tika($file) {
|
||||
$response = $this->put(
|
||||
'tika',
|
||||
array('Accept' => 'text/plain'),
|
||||
file_get_contents($file)
|
||||
)->send();
|
||||
$text = null;
|
||||
try {
|
||||
$response = $this->put(
|
||||
'tika',
|
||||
array('Accept' => 'text/plain'),
|
||||
file_get_contents($file)
|
||||
)->send();
|
||||
$text = $response->getBody(true);
|
||||
} catch(RequestException $e) {
|
||||
$msg = sprintf(
|
||||
'TikaRestClient was not able to process %s. Response: %s %s.',
|
||||
$file,
|
||||
$e->getResponse()->getStatusCode(),
|
||||
$e->getResponse()->getReasonPhrase()
|
||||
);
|
||||
|
||||
return $response->getBody(true);
|
||||
// Only available if tika-server was started with --includeStack
|
||||
$body = $e->getResponse()->getBody(true);
|
||||
if($body) {
|
||||
$msg .= ' Body: ' . $body;
|
||||
}
|
||||
|
||||
SS_Log::log($msg, SS_Log::NOTICE);
|
||||
}
|
||||
|
||||
return $text;
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user