2012-08-22 17:52:08 +02:00
|
|
|
<?php
|
|
|
|
|
2017-12-20 22:24:39 +01:00
|
|
|
namespace SilverStripe\TextExtraction\Extractor;
|
|
|
|
|
2018-07-03 05:55:02 +02:00
|
|
|
use SilverStripe\Assets\File;
|
2018-07-03 01:23:27 +02:00
|
|
|
use SilverStripe\Core\ClassInfo;
|
|
|
|
use SilverStripe\Core\Config\Config;
|
|
|
|
use SilverStripe\Core\Config\Configurable;
|
2018-07-03 07:15:16 +02:00
|
|
|
use SilverStripe\Core\Injector\Injectable;
|
2018-07-03 01:23:27 +02:00
|
|
|
use SilverStripe\Core\Injector\Injector;
|
2018-07-03 05:55:02 +02:00
|
|
|
use SilverStripe\TextExtraction\Extractor\FileTextExtractor\Exception;
|
2017-12-20 22:24:39 +01:00
|
|
|
|
2012-08-22 17:52:08 +02:00
|
|
|
/**
 * Abstract base class for text extractors, which pull the full-text content out of a file.
 *
 * Concrete subclasses declare which file extensions / mime types they support and whether they are usable
 * in the current environment. {@see FileTextExtractor::for_file()} walks the known subclasses in priority
 * order and returns the first available extractor that supports the given file.
 *
 * @author mstephens
 */
abstract class FileTextExtractor
{
    use Configurable;
    use Injectable;

    /**
     * Set priority from 0-100.
     * The highest priority extractor for a given content type will be selected.
     *
     * @config
     * @var integer
     */
    private static $priority = 50;

    /**
     * Cache of extractor class names, sorted by priority (highest first).
     * Null until the list has been generated for the first time.
     *
     * @var array|null
     */
    protected static $sorted_extractor_classes = null;

    /**
     * Gets the list of prioritised extractor classes, highest priority first.
     * The list is built once and then served from a static cache.
     *
     * @return array
     */
    protected static function get_extractor_classes()
    {
        // Check cache. Compare against null (the "not generated yet" marker) rather than relying on
        // truthiness, so that an empty list is cached as well instead of being rebuilt on every call.
        if (self::$sorted_extractor_classes !== null) {
            return self::$sorted_extractor_classes;
        }

        // Generate the sorted list of extractors on demand.
        $classes = ClassInfo::subclassesFor(__CLASS__);

        // Drop the first entry.
        // NOTE(review): this relies on ClassInfo::subclassesFor() listing the queried (abstract base)
        // class itself first - confirm against the framework version in use.
        array_shift($classes);

        $classPriorities = [];
        foreach ($classes as $class) {
            $classPriorities[$class] = Config::inst()->get($class, 'priority');
        }

        // Sort by priority value, descending, keeping the class names as keys
        arsort($classPriorities);

        // Save classes
        self::$sorted_extractor_classes = array_keys($classPriorities);

        return self::$sorted_extractor_classes;
    }

    /**
     * Get the text file extractor for the given class
     *
     * @param string $class
     * @return FileTextExtractor
     */
    protected static function get_extractor($class)
    {
        return Injector::inst()->get($class);
    }

    /**
     * Given a File object (or a path to a local file), decide which extractor instance to use to handle it.
     *
     * Extractors are tried in priority order. Unavailable extractors are skipped; the first one that
     * supports the file's extension or, failing that, its mime type is returned.
     *
     * @param File|string $file
     * @return FileTextExtractor|null Null if no file was given, the path does not exist, or no available
     *                                extractor supports the file
     */
    public static function for_file($file)
    {
        if (!$file || (is_string($file) && !file_exists($file))) {
            return null;
        }

        // Ensure we have a File instance to work with
        if (is_string($file)) {
            /** @var File $fileObject */
            $fileObject = File::create();
            $fileObject->setFromLocalFile($file);
            $file = $fileObject;
        }

        $extension = $file->getExtension();
        $mime = $file->getMimeType();

        foreach (self::get_extractor_classes() as $className) {
            $extractor = self::get_extractor($className);

            // Skip unavailable extractors
            if (!$extractor->isAvailable()) {
                continue;
            }

            // Check extension
            if ($extension && $extractor->supportsExtension($extension)) {
                return $extractor;
            }

            // Check mime
            if ($mime && $extractor->supportsMime($mime)) {
                return $extractor;
            }
        }

        // No available extractor supports this file
        return null;
    }

    /**
     * Some text extractors (like pdftotext) may require a physical file to read from, so write the current
     * file contents to a temp file and return its path.
     *
     * This method does not delete the returned file; cleaning it up is left to the caller.
     *
     * @param File $file
     * @return string Path of the temporary file, carrying the same extension as $file if it has one
     * @throws Exception If a temporary file name cannot be allocated or the contents cannot be written
     */
    protected static function getPathFromFile(File $file)
    {
        $placeholder = tempnam(TEMP_PATH, 'pdftextextractor_');
        if (false === $placeholder) {
            throw new Exception(static::class . '->getPathFromFile() could not allocate temporary file name');
        }

        // Append extension to temp file if one is set
        $path = $placeholder;
        $extension = $file->getExtension();
        if ($extension) {
            $path .= '.' . $extension;

            // tempnam() does not just reserve a name, it creates the (empty) file on disk. Since the
            // contents are written to the extended path instead, remove the placeholder so that it is
            // not left behind in the temp directory on every call.
            if (file_exists($placeholder)) {
                unlink($placeholder);
            }
        }

        // Remove any existing temp files with this name
        if (file_exists($path)) {
            unlink($path);
        }

        $bytesWritten = file_put_contents($path, $file->getStream());
        if (false === $bytesWritten) {
            throw new Exception(static::class . '->getPathFromFile() failed to write temporary file');
        }

        return $path;
    }

    /**
     * Checks if the extractor is supported on the current environment,
     * for example if the correct binaries or libraries are available.
     *
     * @return boolean
     */
    abstract public function isAvailable();

    /**
     * Determine if this extractor supports the given extension.
     * If support is determined by mime/type only, then this should return false.
     *
     * @param string $extension
     * @return boolean
     */
    abstract public function supportsExtension($extension);

    /**
     * Determine if this extractor supports the given mime type.
     * Only consulted by for_file() when the extension check did not match, i.e. the file has no
     * extension or supportsExtension() returned false.
     *
     * @param string $mime
     * @return boolean
     */
    abstract public function supportsMime($mime);

    /**
     * Given a File instance, extract the contents as text.
     *
     * @param File|string $file Either the File instance, or a file path for a file to load
     * @return string
     */
    abstract public function getContent($file);
}
|