2012-08-22 17:52:08 +02:00
|
|
|
<?php
|
|
|
|
|
|
|
|
/**
 * Base class for full-text extractors: each concrete subclass knows how to pull the
 * textual content out of files of particular extensions and/or mime types.
 *
 * Use {@link FileTextExtractor::for_file()} to find the highest-priority extractor that
 * is available in the current environment and supports a given file.
 *
 * @author mstephens
 */
abstract class FileTextExtractor extends Object {

	/**
	 * Set priority from 0-100.
	 * The highest priority extractor for a given content type will be selected.
	 *
	 * @config
	 * @var integer
	 */
	private static $priority = 50;

	/**
	 * Cache of extractor class names, sorted by priority (highest first).
	 * Null until the list has been generated for the first time.
	 *
	 * @var array|null
	 */
	protected static $sorted_extractor_classes = null;

	/**
	 * Gets the list of prioritised extractor classes
	 *
	 * The list is built once on demand and then served from
	 * {@link $sorted_extractor_classes}.
	 *
	 * @return array Extractor class names, highest priority first
	 */
	protected static function get_extractor_classes() {
		// Check cache. Compare against null rather than testing truthiness so that an
		// empty list is cached as well instead of being regenerated on every call.
		if(self::$sorted_extractor_classes !== null) {
			return self::$sorted_extractor_classes;
		}

		// Generate the sorted list of extractors on demand.
		$classes = ClassInfo::subclassesFor(__CLASS__);
		// The first entry is expected to be this abstract base class itself; drop it.
		array_shift($classes);

		// Map each class to its configured priority, then order descending by priority
		$classPriorities = array();
		foreach($classes as $class) {
			$classPriorities[$class] = Config::inst()->get($class, 'priority');
		}
		arsort($classPriorities);

		// Save classes
		self::$sorted_extractor_classes = array_keys($classPriorities);
		return self::$sorted_extractor_classes;
	}

	/**
	 * Get the text file extractor for the given class
	 *
	 * @param string $class Name of the extractor class
	 * @return FileTextExtractor
	 */
	protected static function get_extractor($class) {
		return Injector::inst()->get($class);
	}

	/**
	 * Attempt to detect mime type for given file
	 *
	 * @param string $path
	 * @return string|null Mime type if found, null if the fileinfo extension is
	 * missing or the type could not be determined
	 */
	protected static function get_mime($path) {
		if(!class_exists('finfo')) {
			return null;
		}

		// Check mime of file
		$finfo = new finfo(FILEINFO_MIME_TYPE);
		$mime = $finfo->file($path);

		// finfo::file() reports failure as false; normalise to the documented null
		return $mime ? $mime : null;
	}

	/**
	 * Find the best extractor for the file at the given path.
	 *
	 * Extractors are tried in priority order; the first one that is available and
	 * supports either the file's extension or its detected mime type is returned.
	 *
	 * @param string $path
	 * @return FileTextExtractor|null Null if the path is not an existing regular
	 * file, or if no available extractor supports it
	 */
	public static function for_file($path) {
		if(!file_exists($path) || is_dir($path)) {
			return null;
		}

		$extension = pathinfo($path, PATHINFO_EXTENSION);
		$mime = self::get_mime($path);
		foreach(self::get_extractor_classes() as $className) {
			$extractor = self::get_extractor($className);

			// Skip unavailable extractors
			if(!$extractor->isAvailable()) {
				continue;
			}

			// Check extension
			if($extension && $extractor->supportsExtension($extension)) {
				return $extractor;
			}

			// Check mime
			if($mime && $extractor->supportsMime($mime)) {
				return $extractor;
			}
		}

		// No available extractor supports this file
		return null;
	}

	/**
	 * Checks if the extractor is supported on the current environment,
	 * for example if the correct binaries or libraries are available.
	 *
	 * @return boolean
	 */
	abstract public function isAvailable();

	/**
	 * Determine if this extractor supports the given extension.
	 * If support is determined by mime/type only, then this should return false.
	 *
	 * @param string $extension
	 * @return boolean
	 */
	abstract public function supportsExtension($extension);

	/**
	 * Determine if this extractor supports the given mime type.
	 * {@link for_file()} only consults this when the extension check did not
	 * already select this extractor.
	 *
	 * @param string $mime
	 * @return boolean
	 */
	abstract public function supportsMime($mime);

	/**
	 * Given a file path, extract the contents as text.
	 *
	 * @param string $path
	 * @return string
	 */
	abstract public function getContent($path);

}
|
|
|
|
|
2012-08-27 11:31:53 +02:00
|
|
|
/**
 * Exception type associated with {@link FileTextExtractor}.
 *
 * Adds nothing to the base Exception; it only provides a distinct class to catch.
 * Presumably thrown by concrete extractors when extraction fails (no throw site is
 * visible here - confirm against the extractor implementations).
 */
class FileTextExtractor_Exception extends Exception
{
}
|