API Using paths instead of File objects in extractors

Makes coupling to File objects optional: extractors now accept a file path, and File objects are only involved if you choose to use the FileTextExtractable extension.
2024-10-22 09:06:00 +00:00 · 2012-08-22 18:25:12 +02:00 · 2012-08-22 18:25:12 +02:00 · 977c4e49c9
commit 977c4e49c9
parent 7de717b0bd
4 changed files with 22 additions and 15 deletions
--- a/code/extensions/FileTextExtractable.php
+++ b/code/extensions/FileTextExtractable.php
@ -27,10 +27,10 @@ class FileTextExtractable extends DataExtension {
 		if (!$forceParse && $this->owner->FileContentCache) return $this->owner->FileContentCache;

 		// Determine which extractor can process this file.
-		$extractor = FileTextExtractor::for_file($this->owner);
+		$extractor = FileTextExtractor::for_file($this->owner->FullPath);
 		if (!$extractor) return null;

-		$text = $extractor->getContent($this->owner);
+		$text = $extractor->getContent($this->owner->FullPath);
 		if (!$text) return null;

 		$this->owner->FileContentCache = $text;
--- a/code/extractors/FileTextExtractor.php
+++ b/code/extractors/FileTextExtractor.php
@ -17,11 +17,11 @@ abstract class FileTextExtractor extends Object {
 	protected static $sorted_extractor_classes = null;

 	/**
-	 * @param  DataObject $file
+	 * @param  String $path
 	 * @return FileTextExtractor
 	 */
-	static function for_file($file) {
-		$extension = strtolower($file->getExtension());
+	static function for_file($path) {
+		$extension = pathinfo($path, PATHINFO_EXTENSION);

 		if (!self::$sorted_extractor_classes) {
 			// Generate the sorted list of extractors on demand.
@ -41,6 +41,14 @@ abstract class FileTextExtractor extends Object {
 		}
 	}

+	/**
+	 * Checks if the extractor is supported on the current environment,
+	 * for example if the correct binaries or libraries are available.
+	 * 
+	 * @return boolean
+	 */
+	abstract function isAvailable();
+
 	/**
 	 * Return an array of content types that the extractor can handle.
 	 * @return unknown_type
@ -48,11 +56,12 @@ abstract class FileTextExtractor extends Object {
 	abstract function supportedExtensions();

 	/**
-	 * Given a file object, extract the contents as text
-	 * @param $file
+	 * Given a file path, extract the contents as text.
+	 * 
+	 * @param $path
 	 * @return unknown_type
 	 */
-	abstract function getContent($file);
+	abstract function getContent($path);
 }

 ?>
--- a/code/extractors/HTMLTextExtractor.php
+++ b/code/extractors/HTMLTextExtractor.php
@ -16,9 +16,8 @@ class HTMLTextExtractor extends FileTextExtractor {
 	 */
 	public static $priority = 10;

-	function getContent($file) {
-		$filename = Director::baseFolder() . "/" . $file->Filename;
-		$content = file_get_contents($filename);
+	function getContent($path) {
+		$content = file_get_contents($path);
 		return strip_tags($content);
 	}
 }
--- a/code/extractors/PDFTextExtractor.php
+++ b/code/extractors/PDFTextExtractor.php
@ -24,10 +24,9 @@ class PDFTextExtractor extends FileTextExtractor {
 		return ( $path ? $path . '/' : '' ) . $prog;
 	}
 	
-	function getContent($file) {
-		$filename = Director::baseFolder() . "/" . $file->Filename;
-		if (!$filename) return ""; // no file
-		$content = `{$this->bin('pdftotext')} "$filename" -`;
+	function getContent($path) {
+		if (!$path) return ""; // no file
+		$content = `{$this->bin('pdftotext')} "$path" -`;
 		return $content;
 	}
 }