config()->get('base_url')) {
            throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
        }
        if (!$this->httpClient) {
            $this->httpClient = new Client($this->config()->get('base_url'));
        }
        return $this->httpClient;
    }

    /**
     * Replace the HTTP client used to talk to the extraction endpoint.
     *
     * @param mixed $client Client instance; getContent() calls post() on it.
     */
    public function setHttpClient($client)
    {
        $this->httpClient = $client;
    }

    /**
     * Report whether this extractor can be used.
     *
     * @return bool True when the `base_url` config value is truthy.
     */
    public function isAvailable()
    {
        $url = $this->config()->get('base_url');
        return (bool) $url;
    }

    /**
     * Check whether a file extension is one this extractor handles.
     *
     * @param string $extension File extension without the leading dot; compared case-insensitively.
     * @return bool
     */
    public function supportsExtension($extension)
    {
        return in_array(
            strtolower($extension),
            array(
                'pdf', 'doc', 'docx', 'xls', 'xlsx',
                'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
                'ppt', 'pptx', 'odp', 'fodp', 'csv'
            ),
            // Strict comparison: both sides are strings, so no type juggling is wanted.
            true
        );
    }

    /**
     * MIME-based detection is not implemented; callers fall back to supportsExtension().
     *
     * @param string $mime
     * @return bool Always false.
     */
    public function supportsMime($mime)
    {
        // Rely on supportsExtension
        return false;
    }

    /**
     * Post a file to the extraction endpoint and return the extracted text.
     *
     * @param string $path Path of the file to upload.
     * @return string|null Empty string when no path is given; null when the request
     *                     fails or the response contains no matching text node;
     *                     otherwise the text captured from the response body.
     */
    public function getContent($path)
    {
        if (!$path) {
            return ""; // no file
        }

        $fileName = basename($path);
        $client = $this->getHttpClient();
        try {
            $request = $client
                ->post()
                ->addPostFields(array('extractOnly' => 'true', 'extractFormat' => 'text'))
                ->addPostFiles(array('myfile' => $path));
            $response = $request->send();
        } catch (InvalidArgumentException $e) {
            SS_Log::log(
                sprintf(
                    'Error extracting text from "%s" (message: %s)',
                    $path,
                    $e->getMessage()
                ),
                SS_Log::NOTICE
            );
            return null;
        } catch (Guzzle\Http\Exception\ServerErrorResponseException $e) {
            // Catch other errors that Tika can throw via Guzzle; uncaught, they
            // break the Solr search query in some cases.
            SS_Log::log(
                sprintf(
                    'Tika server error attempting to extract from "%s" (message: %s)',
                    $path,
                    $e->getMessage()
                ),
                SS_Log::NOTICE
            );
            return null;
        }

        // Use preg match to avoid SimpleXML running out of memory on large text nodes.
        // The previous pattern had no %s placeholder and an unbalanced ")", so it
        // never compiled and this method always returned null.
        // NOTE(review): assumes the text is returned in a <str> element whose name
        // attribute is the uploaded file's base name - confirm against the response.
        $pattern = sprintf(
            '/\<str name\="%s"\>(.*?)\<\/str\>/s',
            preg_quote($fileName, '/')
        );
        $matches = array();
        if (preg_match($pattern, (string)$response->getBody(), $matches) !== 1) {
            return null;
        }

        return $matches[1];
    }
}