Provide alternative backends for caching of extracted content

Implement Flushable for clearing the cache
This commit is contained in:
Damian Mooyman 2015-05-05 15:52:01 +12:00
parent 98a83a5bca
commit 98fd4228f9
4 changed files with 143 additions and 6 deletions

View File

@ -63,6 +63,24 @@ File:
- FileTextExtractable
```
By default, any extracted content is cached on the database record of the file it was extracted from.
Alternatively, extracted content can be cached using SS_Cache to prevent excessive database growth.
To swap out the cache backend, use the following YAML configuration.
```yaml
---
Name: mytextextraction
After: '#textextraction'
---
Injector:
  FileTextCache: FileTextCache_SSCache
FileTextCache_SSCache:
  lifetime: 3600 # Number of seconds to cache content for
```
### XPDF
PDFs require special handling, for example through the [XPDF](http://www.foolabs.com/xpdf/)

View File

@ -1,2 +1,8 @@
---
Name: textextraction
---
Injector:
  FileTextCache: FileTextCache_Database
SolrCellTextExtractor:
# base_url: 'http://localhost:8983/solr/update/extract'
# base_url: 'http://localhost:8983/solr/update/extract'

View File

@ -0,0 +1,81 @@
<?php
/**
 * Contract for a backend that stores text extracted from File records.
 */
interface FileTextCache {

    /**
     * Look up previously stored extracted content for a file.
     *
     * @param File $file File whose cached content is requested
     * @return mixed The cached content; the exact "miss" value depends on the
     *               implementation, so callers should treat any falsy result
     *               as "nothing cached"
     */
    public function load(File $file);

    /**
     * Store extracted content for a file.
     *
     * @param File $file File the content was extracted from
     * @param string $content Extracted text to store
     */
    public function save(File $file, $content);

}
/**
 * Cache backend that keeps the extracted text on the File record itself,
 * in its FileContentCache field.
 */
class FileTextCache_Database implements FileTextCache {

    /**
     * Store the extracted text on the file record and write the record.
     *
     * @param File $file Record to update
     * @param string $content Extracted text to store
     */
    public function save(File $file, $content) {
        $file->FileContentCache = $content;
        $file->write();
    }

    /**
     * Read the extracted text back from the file record.
     *
     * @param File $file Record to read from
     * @return mixed Current value of the record's FileContentCache field
     */
    public function load(File $file) {
        $cached = $file->FileContentCache;
        return $cached;
    }

}
/**
 * Uses SS_Cache with a lifetime to cache extracted content
 */
class FileTextCache_SSCache implements FileTextCache, Flushable {

    /**
     * Lifetime of cached entries in seconds (defaults to 1 hour)
     *
     * @var int
     * @config
     */
    private static $lifetime = 3600;

    /**
     * Build the cache object for this class and apply the configured lifetime.
     *
     * NOTE(review): the previous annotation declared the return type as SS_Cache;
     * the SilverStripe 3 API documents SS_Cache::factory() as returning a
     * Zend_Cache_Core frontend - confirm against the framework version in use.
     *
     * @return Zend_Cache_Core
     */
    protected static function get_cache() {
        $lifetime = Config::inst()->get(__CLASS__, 'lifetime');
        $cache = SS_Cache::factory(__CLASS__);
        $cache->setLifetime($lifetime);
        return $cache;
    }

    /**
     * Derive the cache key for a file from its full path.
     *
     * @param File $file
     * @return string md5 hash of the file's full path
     */
    protected function getKey(File $file) {
        // getFullPath() has to be invoked as a method. Reading it as a property
        // (`$file->getFullPath`) does not resolve to the path, so every file
        // hashed to the same key and shared a single cache entry.
        return md5($file->getFullPath());
    }

    /**
     * Return any cached extracted content for the given file.
     *
     * @param File $file
     * @return mixed Whatever the cache returns for this file's key
     */
    public function load(File $file) {
        $key = $this->getKey($file);
        $cache = self::get_cache();
        return $cache->load($key);
    }

    /**
     * Save extracted content for the given file under its path-derived key.
     *
     * @param File $file
     * @param string $content
     * @return mixed Result of the underlying cache save() call
     */
    public function save(File $file, $content) {
        $key = $this->getKey($file);
        $cache = self::get_cache();
        return $cache->save($content, $key);
    }

    /**
     * Flushable hook: cleans this class's cache.
     */
    public static function flush() {
        $cache = self::get_cache();
        $cache->clean();
    }

}

View File

@ -19,6 +19,30 @@ class FileTextExtractable extends DataExtension {
'FileContent' => 'Text'
);
/**
 * Declares a 'TextCache' dependency on the service named FileTextCache
 * ('%$' is the Injector service-reference prefix).
 *
 * NOTE(review): presumably the Injector satisfies this by calling
 * setTextCache() with the configured backend - confirm.
 *
 * @var array
 */
private static $dependencies = array(
'TextCache' => '%$FileTextCache'
);
/**
 * Cache backend used to load and save extracted text.
 * Null until a backend is assigned through setTextCache().
 *
 * @var FileTextCache
 */
protected $fileTextCache = null;
/**
 * Assign the cache backend used for extracted text.
 *
 * @param FileTextCache $cache Backend that subsequent load/save calls go through
 */
public function setTextCache(FileTextCache $cache) {
    $this->fileTextCache = $cache;
}
/**
 * Get the cache backend currently assigned to this extension.
 *
 * @return FileTextCache The assigned backend (null if none has been set)
 */
public function getTextCache() {
    return $this->fileTextCache;
}
/**
* Helper function for template
*
@ -37,17 +61,25 @@ class FileTextExtractable extends DataExtension {
* @return string
*/
public function extractFileAsText($disableCache = false) {
    // Unless the caller opted out, consult the configured cache backend first.
    // Any falsy result is treated as a cache miss.
    if (!$disableCache) {
        $text = $this->getTextCache()->load($this->owner);
        if ($text) {
            return $text;
        }
    }

    // Determine which extractor can process this file.
    $path = $this->owner->FullPath;
    $extractor = FileTextExtractor::for_file($path);
    if (!$extractor) {
        return null;
    }

    $text = $extractor->getContent($path);
    if (!$text) {
        return null;
    }

    // Store through the cache backend only. Reading or writing FileContentCache
    // directly here would bypass a non-database backend (growing the database
    // regardless of configuration) and, with the database backend, would write
    // the record twice. FileTextCache_Database performs the same field
    // assignment and write(), so default behaviour is unchanged.
    $this->getTextCache()->save($this->owner, $text);

    return $text;
}