diff --git a/README.md b/README.md index ddaaf20..f943ef6 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,24 @@ File: - FileTextExtractable ``` +By default any extracted content will be cached against the database row. + +Alternatively, extracted content can be cached using SS_Cache to prevent excessive database growth. +In order to swap out the cache backend you can use the following yaml configuration. + + +```yaml +--- +Name: mytextextraction +After: '#textextraction' +--- +Injector: + FileTextCache: FileTextCache_SSCache +FileTextCache_SSCache: + lifetime: 3600 # Number of seconds to cache content for + +``` + ### XPDF PDFs require special handling, for example through the [XPDF](http://www.foolabs.com/xpdf/) diff --git a/_config/config.yml b/_config/config.yml index 7380657..4073835 100644 --- a/_config/config.yml +++ b/_config/config.yml @@ -1,2 +1,8 @@ +--- +Name: textextraction +--- +Injector: + FileTextCache: FileTextCache_Database + SolrCellTextExtractor: -# base_url: 'http://localhost:8983/solr/update/extract' \ No newline at end of file +# base_url: 'http://localhost:8983/solr/update/extract' diff --git a/code/extensions/FileTextCache.php b/code/extensions/FileTextCache.php new file mode 100644 index 0000000..cbc0750 --- /dev/null +++ b/code/extensions/FileTextCache.php @@ -0,0 +1,81 @@ +FileContentCache; + } + + public function save(File $file, $content) { + $file->FileContentCache = $content; + $file->write(); + } + +} + +/** + * Uses SS_Cache with a configurable lifetime to cache extracted content + */ +class FileTextCache_SSCache implements FileTextCache, Flushable { + + /** + * Default cache lifetime in seconds (3600 = 1 hour) + * + * @var int + * @config + */ + private static $lifetime = 3600; + + /** + * @return Zend_Cache_Core Cache built by SS_Cache::factory() with the configured lifetime applied + */ + protected static function get_cache() { + $lifetime = Config::inst()->get(__CLASS__, 'lifetime'); + $cache = SS_Cache::factory(__CLASS__); + $cache->setLifetime($lifetime); + return $cache; + } + + protected function getKey(File $file) { + return 
md5($file->getFullPath()); /* key entries by the MD5 hash of the value returned by getFullPath() */ + } + + public function load(File $file) { + $key = $this->getKey($file); + $cache = self::get_cache(); + return $cache->load($key); + } + + public function save(File $file, $content) { + $key = $this->getKey($file); + $cache = self::get_cache(); + return $cache->save($content, $key); + } + + public static function flush() { + $cache = self::get_cache(); + $cache->clean(); + } + +} diff --git a/code/extensions/FileTextExtractable.php b/code/extensions/FileTextExtractable.php index 16a1756..453d876 100644 --- a/code/extensions/FileTextExtractable.php +++ b/code/extensions/FileTextExtractable.php @@ -19,6 +19,30 @@ class FileTextExtractable extends DataExtension { 'FileContent' => 'Text' ); + private static $dependencies = array( + 'TextCache' => '%$FileTextCache' + ); + + /** + * @var FileTextCache + */ + protected $fileTextCache = null; + + /** + * Assign the cache backend used by extractFileAsText() to load and save extracted text + * @param FileTextCache $cache + */ + public function setTextCache(FileTextCache $cache) { + $this->fileTextCache = $cache; + } + + /** + * @return FileTextCache The backend assigned through setTextCache(), or null if none was set + */ + public function getTextCache() { + return $this->fileTextCache; + } + /** * Helper function for template * @@ -37,17 +61,25 @@ class FileTextExtractable extends DataExtension { * @return string */ public function extractFileAsText($disableCache = false) { - if (!$disableCache && $this->owner->FileContentCache) return $this->owner->FileContentCache; + if (!$disableCache) { + $text = $this->getTextCache()->load($this->owner); + if($text) { + return $text; + } + } // Determine which extractor can process this file. $extractor = FileTextExtractor::for_file($this->owner->FullPath); - if (!$extractor) return null; + if (!$extractor) { + return null; + } $text = $extractor->getContent($this->owner->FullPath); - if (!$text) return null; + if (!$text) { + return null; + } - $this->owner->FileContentCache = $text; - $this->owner->write(); + $this->getTextCache()->save($this->owner, $text); return $text; }