From b32bc08dc490ad1b7b4d43aa83a57f3195a9893d Mon Sep 17 00:00:00 2001 From: Ingo Schommer Date: Tue, 7 May 2013 19:27:06 +0200 Subject: [PATCH] More resilience in SolrCellTextExtractor Shouldn't outright fail the request if a file can't be found --- code/extractors/SolrCellTextExtractor.php | 25 +++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/code/extractors/SolrCellTextExtractor.php b/code/extractors/SolrCellTextExtractor.php index 103bfe8..6bd1b6d 100644 --- a/code/extractors/SolrCellTextExtractor.php +++ b/code/extractors/SolrCellTextExtractor.php @@ -23,6 +23,9 @@ class SolrCellTextExtractor extends FileTextExtractor { protected $httpClient; public function getHttpClient() { + if(!$this->config()->get('base_url')) { + throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified'); + } if(!$this->httpClient) $this->httpClient = new Client($this->config()->get('base_url')); return $this->httpClient; } @@ -53,16 +56,30 @@ class SolrCellTextExtractor extends FileTextExtractor { $fileName = basename($path); $client = $this->getHttpClient(); - $request = $client - ->post('?extractOnly=true&extractFormat=text') - ->addPostFiles(array('myfile' => $path)); - $response = $request->send(); + try { + $request = $client + ->post() + ->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text')) + ->addPostFiles(array('myfile' => $path)); + $response = $request->send(); + } catch(InvalidArgumentException $e) { + SS_Log::log( + sprintf( + 'Error extracting text from "%s" (message: %s)', + $path, + $e->getMessage() + ), + SS_Log::NOTICE + ); + return null; + } // Use preg match to avoid SimpleXML running out of memory on large text nodes preg_match( sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName)), (string)$response->getBody(), $matches ); + return $matches ? $matches[1] : null; } } \ No newline at end of file