httpClient) { $this->httpClient = new Client(); } return $this->httpClient; }

    /**
     * Replace the HTTP client used to talk to the extraction endpoint.
     *
     * @param Client $client
     * @return $this
     */
    public function setHttpClient(Client $client)
    {
        $this->httpClient = $client;
        return $this;
    }

    /**
     * Report whether this extractor is usable, i.e. whether a non-empty
     * `base_url` config value is set.
     *
     * @return bool
     */
    public function isAvailable()
    {
        $url = $this->config()->get('base_url');
        return (bool) $url;
    }

    /**
     * Check (case-insensitively) whether the given file extension is one of
     * the formats this extractor accepts.
     *
     * @param string $extension Extension without a leading dot, e.g. "pdf"
     * @return bool
     */
    public function supportsExtension($extension)
    {
        // strtolower() always yields a string, so the strict comparison is
        // safe and avoids PHP's loose-equality surprises.
        return in_array(
            strtolower($extension),
            [
                'pdf', 'doc', 'docx', 'xls', 'xlsx',
                'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
                'ppt', 'pptx', 'odp', 'fodp', 'csv',
            ],
            true
        );
    }

    /**
     * MIME types are never matched directly by this extractor.
     *
     * @param string $mime
     * @return bool Always false
     */
    public function supportsMime($mime)
    {
        // Rely on supportsExtension
        return false;
    }

    /**
     * POST the file at $path to the configured `base_url` (with
     * extractOnly=true, extractFormat=text) and pull the extracted text out
     * of the response body.
     *
     * @param string $path Path of the file to extract text from
     * @return string|null Empty string when no path is given; the extracted text on
     *                     success; null when the request fails (the error is logged
     *                     at notice level) or the response holds no matching node
     * @throws InvalidArgumentException When `base_url` is not configured
     */
    public function getContent($path)
    {
        if (!$path) {
            // no file
            return '';
        }

        $fileName = basename($path);
        $client = $this->getHttpClient();

        // Get and validate base URL
        $baseUrl = $this->config()->get('base_url');
        if (!$baseUrl) {
            throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
        }

        try {
            $request = $client
                ->post($baseUrl)
                ->addPostFields(['extractOnly' => 'true', 'extractFormat' => 'text'])
                ->addPostFiles(['myfile' => $path]);
            $response = $request->send();
        } catch (InvalidArgumentException $e) {
            $msg = sprintf(
                'Error extracting text from "%s" (message: %s)',
                $path,
                $e->getMessage()
            );
            Injector::inst()->get(LoggerInterface::class)->notice($msg);
            return null;
        } catch (Exception $e) {
            // Catch other errors that Tika can throw via Guzzle but are not caught
            // and break Solr search query in some cases.
            $msg = sprintf(
                'Tika server error attempting to extract from "%s" (message: %s)',
                $path,
                $e->getMessage()
            );
            Injector::inst()->get(LoggerInterface::class)->notice($msg);
            return null;
        }

        // Just initialise it, it doesn't take much.
        $matches = [];

        // Use preg match to avoid SimpleXML running out of memory on large text nodes.
        // The text is expected inside a <str name="{file name}"> node of the response;
        // the file name is quoted (including the "/" delimiter) so it matches literally.
        // NOTE(review): response shape assumed from the extractOnly request - confirm.
        preg_match(
            sprintf('/<str name="%s">(.*?)<\/str>/s', preg_quote($fileName, '/')),
            (string) $response->getBody(),
            $matches
        );

        return $matches ? $matches[1] : null;
    }
}