mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 11:06:00 +02:00
API: FileTextExtractor::getContent now accepts either a File instance or a file path string
This commit is contained in:
parent
40e4b05f5d
commit
397e7a5d40
@ -181,8 +181,8 @@ abstract class FileTextExtractor
|
|||||||
/**
|
/**
|
||||||
* Given a File instance, extract the contents as text.
|
* Given a File instance, extract the contents as text.
|
||||||
*
|
*
|
||||||
* @param File $file
|
* @param File|string $file Either the File instance, or a file path for a file to load
|
||||||
* @return string
|
* @return string
|
||||||
*/
|
*/
|
||||||
abstract public function getContent(File $file);
|
abstract public function getContent($file);
|
||||||
}
|
}
|
||||||
|
@ -54,9 +54,9 @@ class HTMLTextExtractor extends FileTextExtractor
|
|||||||
* @param File $file
|
* @param File|string $file Either the File instance, or a file path for a file to load
|
||||||
* @return string
|
* @return string
|
||||||
*/
|
*/
|
||||||
public function getContent(File $file)
|
public function getContent($file)
|
||||||
{
|
{
|
||||||
$content = $file->getString();
|
$content = $file instanceof File ? $file->getString() : file_get_contents($file);
|
||||||
|
|
||||||
// Yes, yes, regex'ing HTML is evil.
|
// Yes, yes, regex'ing HTML is evil.
|
||||||
// Since we don't care about well-formedness or markup here, it does the job.
|
// Since we don't care about well-formedness or markup here, it does the job.
|
||||||
|
@ -84,9 +84,9 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getContent(File $file)
|
public function getContent($file)
|
||||||
{
|
{
|
||||||
if (!$file) {
|
if (!$file || (is_string($file) && !file_exists($file))) {
|
||||||
// no file
|
// no file
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
@ -97,17 +97,17 @@ class PDFTextExtractor extends FileTextExtractor
|
|||||||
/**
|
/**
|
||||||
* Invoke pdftotext with the given File object
|
* Invoke pdftotext with the given File object or file path
|
||||||
*
|
*
|
||||||
* @param File $file
|
* @param File|string $file
|
||||||
* @return string Output
|
* @return string Output
|
||||||
* @throws Exception
|
* @throws Exception
|
||||||
*/
|
*/
|
||||||
protected function getRawOutput(File $file)
|
protected function getRawOutput($file)
|
||||||
{
|
{
|
||||||
if (!$this->isAvailable()) {
|
if (!$this->isAvailable()) {
|
||||||
throw new Exception("getRawOutput called on unavailable extractor");
|
throw new Exception("getRawOutput called on unavailable extractor");
|
||||||
}
|
}
|
||||||
|
|
||||||
$path = $this->getPathFromFile($file);
|
$path = $file instanceof File ? $this->getPathFromFile($file) : $file;
|
||||||
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
|
exec(sprintf('%s %s - 2>&1', $this->bin('pdftotext'), escapeshellarg($path)), $content, $err);
|
||||||
if ($err) {
|
if ($err) {
|
||||||
if (!is_array($err) && $err == 1) {
|
if (!is_array($err) && $err == 1) {
|
||||||
|
@ -99,18 +99,18 @@ class SolrCellTextExtractor extends FileTextExtractor
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param File $file
|
* @param File|string $file
|
||||||
* @return string
|
* @return string
|
||||||
* @throws InvalidArgumentException
|
* @throws InvalidArgumentException
|
||||||
*/
|
*/
|
||||||
public function getContent(File $file)
|
public function getContent($file)
|
||||||
{
|
{
|
||||||
if (!$file) {
|
if (!$file || (is_string($file) && !file_exists($file))) {
|
||||||
// no file
|
// no file
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
|
|
||||||
$fileName = $file->getFilename();
|
$fileName = $file instanceof File ? $file->getFilename() : basename($file);
|
||||||
$client = $this->getHttpClient();
|
$client = $this->getHttpClient();
|
||||||
|
|
||||||
// Get and validate base URL
|
// Get and validate base URL
|
||||||
|
@ -124,9 +124,9 @@ class TikaServerTextExtractor extends FileTextExtractor
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getContent(File $file)
|
public function getContent($file)
|
||||||
{
|
{
|
||||||
$tempFile = $this->getPathFromFile($file);
|
$tempFile = $file instanceof File ? $this->getPathFromFile($file) : $file;
|
||||||
return $this->getClient()->tika($tempFile);
|
return $this->getClient()->tika($tempFile);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -74,10 +74,10 @@ class TikaTextExtractor extends FileTextExtractor
|
|||||||
return proc_close($proc);
|
return proc_close($proc);
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getContent(File $file)
|
public function getContent($file)
|
||||||
{
|
{
|
||||||
$mode = $this->config()->get('output_mode');
|
$mode = $this->config()->get('output_mode');
|
||||||
$path = $this->getPathFromFile($file);
|
$path = $file instanceof File ? $this->getPathFromFile($file) : $file;
|
||||||
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
|
$command = sprintf('tika %s %s', $mode, escapeshellarg($path));
|
||||||
$code = $this->runShell($command, $output);
|
$code = $this->runShell($command, $output);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user