silverstripe-textextraction/src/Extractor/HTMLTextExtractor.php
Russell Michell f341010d7a FIX: First-pass SS4 compatibility.
- Added namespaces, use statements
- Added missing docblocks etc
- Uses SS4's new Cache system
- Uses proper environment vars
- Cannot instantiate 'FileTextCache' (interface) as a service. This can be configured through YML, so default to FileTextCache_Cache
- Modded YML config to make it run.
- Fixes to allow TIKA to actually get file contents.
- Addresses issues raised by @robbieaverill
- Rebased against github.com/silverstripe/silverstripe-textextraction:master
- Replaced `SS_Log` with Monolog.
2017-12-21 10:41:06 +13:00

95 lines
2.7 KiB
PHP

<?php
namespace SilverStripe\TextExtraction\Extractor;
use SilverStripe\TextExtraction\Extractor\FileTextExtractor;
/**
 * Text extractor for HTML files that relies on PHP's strip_tags() to get just the text.
 * OK for indexing, not the best for producing readable text.
 *
 * @author mstephens
 */
class HTMLTextExtractor extends FileTextExtractor
{
    /**
     * Lower priority because it's not the most clever HTML extraction. If there is something better, use it.
     *
     * @config
     * @var int
     */
    private static $priority = 10;

    /**
     * This extractor only uses built-in PHP string/regex functions, so it is always usable.
     *
     * @return boolean Always true
     */
    public function isAvailable()
    {
        return true;
    }

    /**
     * Checks (case-insensitively) whether the given file extension is one of the HTML extensions
     * handled by this extractor.
     *
     * @param string $extension File extension without the leading dot, e.g. "html"
     * @return boolean
     */
    public function supportsExtension($extension)
    {
        return in_array(strtolower($extension), array('html', 'htm', 'xhtml'), true);
    }

    /**
     * Checks (case-insensitively) whether the given MIME type is "text/html".
     *
     * @param string $mime
     * @return boolean
     */
    public function supportsMime($mime)
    {
        return strtolower($mime) === 'text/html';
    }

    /**
     * Extracts the text content of an HTML file, by using strip_tags()
     * combined with regular expressions to remove non-content elements like <style> or <script>,
     * as well as adding a line break in front of block-level tags so that the text of
     * adjacent blocks does not run together once the tags are stripped.
     *
     * @param string $path Path of the HTML file to read
     * @return string The extracted text; an empty string if the file could not be read
     */
    public function getContent($path)
    {
        $content = file_get_contents($path);
        if ($content === false || $content === '') {
            return '';
        }

        // Yes, yes, regex'ing HTML is evil.
        // Since we don't care about well-formedness or markup here, it does the job.
        //
        // The patterns deliberately do NOT use the "u" modifier: they only contain ASCII, so they
        // match valid UTF-8 exactly the same way, but with "u" preg_replace() returns null for any
        // file that is not valid UTF-8 and the whole document would be lost.
        $patterns = array();
        $replacements = array();

        // Remove invisible content: the element and everything inside it becomes a single space
        $invisibleTags = array(
            'head', 'style', 'script', 'object', 'embed', 'applet', 'noframes', 'noscript', 'noembed',
        );
        foreach ($invisibleTags as $tag) {
            $patterns[] = '@<' . $tag . '[^>]*>.*?</' . $tag . '>@si';
            $replacements[] = ' ';
        }

        // Add a line break in front of opening and closing block tags.
        // The names are matched as prefixes of the tag name, as the original separate patterns did.
        $blockTags = array(
            'address', 'blockquote', 'center', 'del',
            'div', 'h[1-9]', 'ins', 'isindex', 'p', 'pre',
            'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
            'table', 'th', 'td', 'caption',
            'form', 'button', 'fieldset', 'legend', 'input',
            'label', 'select', 'optgroup', 'option', 'textarea',
            'frameset', 'frame', 'iframe',
        );
        $patterns[] = '@</?(?:' . implode('|', $blockTags) . ')@i';
        // "\$0" is the PCRE back-reference to the whole match, i.e. the tag is kept after the newline
        $replacements[] = "\n\$0";

        $cleaned = preg_replace($patterns, $replacements, $content);
        if ($cleaned === null) {
            // A PCRE error occurred (e.g. backtrack limit reached on a huge file):
            // fall back to stripping tags from the untouched content rather than returning nothing.
            $cleaned = $content;
        }

        return strip_tags($cleaned);
    }
}