mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 11:06:00 +02:00
API FileTextExtractor::getContent now supports a File and a filename path string
This commit is contained in:
parent
40e4b05f5d
commit
397e7a5d40
@ -181,8 +181,8 @@ abstract class FileTextExtractor
|
||||
/**
|
||||
* Given a File instance or a file path, extract the contents as text.
|
||||
*
|
||||
* @param File $file
|
||||
* @param File|string $file Either the File instance, or a file path for a file to load
|
||||
* @return string
|
||||
*/
|
||||
abstract public function getContent(File $file);
|
||||
abstract public function getContent($file);
|
||||
}
|
||||
|
@ -54,9 +54,9 @@ class HTMLTextExtractor extends FileTextExtractor
|
||||
* @param File|string $file Either the File instance, or a file path for a file to load
|
||||
* @return string
|
||||
*/
|
||||
public function getContent(File $file)
|
||||
public function getContent($file)
|
||||
{
|
||||
$content = $file->getString();
|
||||
$content = $file instanceof File ? $file->getString() : file_get_contents($file);
|
||||
|
||||
// Yes, yes, regex'ing HTML is evil.
|
||||
// Since we don't care about well-formedness or markup here, it does the job.
|
||||
|
@ -84,9 +84,9 @@ class PDFTextExtractor extends FileTextExtractor
|
||||
return null;
|
||||
}
|
||||
|
||||
public function getContent(File $file)
|
||||
public function getContent($file)
|
||||
{
|
||||
if (!$file) {
|
||||
if (!$file || (is_string($file) && !file_exists($file))) {
|
||||
// no file
|
||||
return '';
|
||||
}
|
||||
@ -97,17 +97,17 @@ class PDFTextExtractor extends FileTextExtractor
|
||||
/**
|
||||
* Invoke pdftotext with the given File object or file path
|
||||
*
|
||||
* @param File $file
|
||||
* @param File|string $file
|
||||
* @return string Output
|
||||
* @throws Exception
|
||||
*/
|
||||
protected function getRawOutput(File $file)
|
||||
protected function getRawOutput($file)
|
||||
{
|
||||
if (!$this->isAvailable()) {
|
||||
throw new Exception("getRawOutput called on unavailable extractor");
|
||||
}
|
||||
|
||||
$path = $this->getPathFromFile($file);
|
||||
$path = $file instanceof File ? $this->getPathFromFile($file) : $file;
|
||||
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
|
||||
if ($err) {
|
||||
if (!is_array($err) && $err == 1) {
|
||||
|
@ -99,18 +99,18 @@ class SolrCellTextExtractor extends FileTextExtractor
|
||||
}
|
||||
|
||||
/**
|
||||
* @param File $file
|
||||
* @param File|string $file
|
||||
* @return string
|
||||
* @throws InvalidArgumentException
|
||||
*/
|
||||
public function getContent(File $file)
|
||||
public function getContent($file)
|
||||
{
|
||||
if (!$file) {
|
||||
if (!$file || (is_string($file) && !file_exists($file))) {
|
||||
// no file
|
||||
return '';
|
||||
}
|
||||
|
||||
$fileName = $file->getFilename();
|
||||
$fileName = $file instanceof File ? $file->getFilename() : basename($file);
|
||||
$client = $this->getHttpClient();
|
||||
|
||||
// Get and validate base URL
|
||||
|
@ -124,9 +124,9 @@ class TikaServerTextExtractor extends FileTextExtractor
|
||||
return false;
|
||||
}
|
||||
|
||||
public function getContent(File $file)
|
||||
public function getContent($file)
|
||||
{
|
||||
$tempFile = $this->getPathFromFile($file);
|
||||
$tempFile = $file instanceof File ? $this->getPathFromFile($file) : $file;
|
||||
return $this->getClient()->tika($tempFile);
|
||||
}
|
||||
}
|
||||
|
@ -74,10 +74,10 @@ class TikaTextExtractor extends FileTextExtractor
|
||||
return proc_close($proc);
|
||||
}
|
||||
|
||||
public function getContent(File $file)
|
||||
public function getContent($file)
|
||||
{
|
||||
$mode = $this->config()->get('output_mode');
|
||||
$path = $this->getPathFromFile($file);
|
||||
$path = $file instanceof File ? $this->getPathFromFile($file) : $file;
|
||||
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
|
||||
$code = $this->runShell($command, $output);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user