httpClient) {
            $this->httpClient = new Client();
        }
        return $this->httpClient;
    }

    /**
     * Replace the HTTP client used to talk to the extraction endpoint.
     *
     * @param Client $client
     * @return $this
     */
    public function setHttpClient(Client $client)
    {
        $this->httpClient = $client;
        return $this;
    }

    /**
     * Report whether this extractor is usable, i.e. whether a non-empty
     * `base_url` config value is present.
     *
     * @return bool
     */
    public function isAvailable()
    {
        $url = $this->config()->get('base_url');
        return (bool) $url;
    }

    /**
     * Check a file extension (case-insensitively) against the list of
     * document formats this extractor accepts.
     *
     * @param string $extension Extension without a leading dot; null is treated as ''
     * @return bool
     */
    public function supportsExtension($extension)
    {
        return in_array(
            strtolower($extension ?? ''),
            [
                'pdf', 'doc', 'docx', 'xls', 'xlsx',
                'epub', 'rtf', 'odt', 'fodt', 'ods', 'fods',
                'ppt', 'pptx', 'odp', 'fodp', 'csv'
            ],
            // Strict comparison: both sides are always strings
            true
        );
    }

    /**
     * MIME-based detection is not implemented; this always answers false.
     *
     * @param string $mime
     * @return bool
     */
    public function supportsMime($mime)
    {
        // Rely on supportsExtension
        return false;
    }

    /**
     * Post a file to the configured `base_url` and return the text found in
     * the response.
     *
     * @param File|string $file File record, or a path to a file on disk
     * @return string|null '' when there is no file; the extracted text on success;
     *                     null when the request fails (a notice is logged) or when
     *                     the response contains no matching text node
     * @throws InvalidArgumentException When `base_url` is not configured
     */
    public function getContent($file)
    {
        if (!$file || (is_string($file) && !file_exists($file ?? ''))) {
            // no file
            return '';
        }

        $fileName = $file instanceof File
            ? $file->getFilename()
            : basename($file ?? '');
        $client = $this->getHttpClient();

        // Get and validate base URL
        $baseUrl = $this->config()->get('base_url');
        if (!$baseUrl) {
            throw new InvalidArgumentException('SolrCellTextExtractor.base_url not specified');
        }

        try {
            $stream = $file instanceof File
                ? $file->getStream()
                : fopen($file ?? '', 'r');
            /** @var Response $response */
            $response = $client->post($baseUrl, [
                'multipart' => [
                    ['name' => 'extractOnly', 'contents' => 'true'],
                    ['name' => 'extractFormat', 'contents' => 'text'],
                    ['name' => 'myfile', 'contents' => $stream],
                ]
            ]);
        } catch (InvalidArgumentException $e) {
            $msg = sprintf(
                'Error extracting text from "%s" (message: %s)',
                $fileName,
                $e->getMessage()
            );
            Injector::inst()->get(LoggerInterface::class)->notice($msg);
            return null;
        } catch (Exception $e) {
            // Catch other errors that Tika can throw via Guzzle but are not caught and break Solr search
            // query in some cases.
            $msg = sprintf(
                'Tika server error attempting to extract from "%s" (message: %s)',
                $fileName,
                $e->getMessage()
            );
            Injector::inst()->get(LoggerInterface::class)->notice($msg);
            return null;
        }

        $matches = [];
        // Use preg match to avoid SimpleXML running out of memory on large text nodes.
        // NOTE(review): the pattern assumes the response wraps the extracted text in a
        // <str> element whose name attribute is the file name - confirm against the
        // endpoint's actual output.
        // The file name is quoted with the "/" delimiter so names containing slashes
        // or regex metacharacters cannot corrupt the pattern.
        preg_match(
            sprintf('/\<str name\="%s"\>(.*?)\<\/str\>/s', preg_quote($fileName ?? '', '/')),
            (string) $response->getBody(),
            $matches
        );

        return $matches ? $matches[1] : null;
    }
}