2012-08-22 17:52:08 +02:00
|
|
|
<?php
|
|
|
|
|
|
|
|
/**
 * Text extractor that uses PHP's strip_tags() function to get just the text.
 * OK for indexing, not the best for producing readable text.
 *
 * @author mstephens
 */
|
|
|
|
class HTMLTextExtractor extends FileTextExtractor {

	/**
	 * This extractor only uses built-in PHP functions, so it reports itself
	 * as available unconditionally.
	 *
	 * @return boolean Always true.
	 */
	public function isAvailable() {
		return true;
	}

	/**
	 * File extensions handled by this extractor.
	 *
	 * @return array List of lower-case extensions without the leading dot.
	 */
	public function supportedExtensions() {
		return array("html", "htm", "xhtml");
	}

	/**
	 * Lower priority because it's not the most clever HTML extraction.
	 * If there is something better, use it.
	 *
	 * @var int
	 */
	public static $priority = 10;

	/**
	 * Extracts the text content of an HTML file by using strip_tags(),
	 * combined with regular expressions to remove non-content elements like
	 * <style> or <script>, and to insert a line break in front of every
	 * block-level opening and closing tag so that the text of adjacent blocks
	 * does not run together once the tags are stripped.
	 *
	 * @param string $path Path of the HTML file; passed to file_get_contents().
	 * @return string The text with all markup removed. Empty string if the
	 *                file could not be read.
	 */
	public function getContent($path) {
		$content = file_get_contents($path);
		if ($content === false) {
			// Unreadable file: nothing to extract.
			return '';
		}

		// Yes, yes, regex'ing HTML is evil.
		// Since we don't care about well-formedness or markup here, it does the job.

		// Elements whose content is never visible text: removed entirely
		// (replaced by a single space so neighbouring words stay separated).
		$invisiblePatterns = array(
			'@<head[^>]*?>.*?</head>@siu',
			'@<style[^>]*?>.*?</style>@siu',
			'@<script[^>]*?>.*?</script>@siu',
			'@<object[^>]*?>.*?</object>@siu',
			'@<embed[^>]*?>.*?</embed>@siu',
			'@<applet[^>]*?>.*?</applet>@siu',
			'@<noframes[^>]*?>.*?</noframes>@siu',
			'@<noscript[^>]*?>.*?</noscript>@siu',
			'@<noembed[^>]*?>.*?</noembed>@siu',
		);

		// Start of opening/closing block-level tags: kept (strip_tags() removes
		// them later) but prefixed with a line break.
		$blockPatterns = array(
			'@</?((address)|(blockquote)|(center)|(del))@iu',
			'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
			'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
			'@</?((table)|(th)|(td)|(caption))@iu',
			'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
			'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
			'@</?((frameset)|(frame)|(iframe))@iu',
		);

		$patterns = array_merge($invisiblePatterns, $blockPatterns);

		// Build the replacements from the pattern lists so both arrays always
		// have the same length. "\n$0" = line break followed by the matched text.
		$replacements = array_merge(
			array_fill(0, count($invisiblePatterns), ' '),
			array_fill(0, count($blockPatterns), "\n\$0")
		);

		$text = preg_replace($patterns, $replacements, $content);

		if ($text === null) {
			// preg_replace() returns null on error, typically because the file
			// is not valid UTF-8 while the patterns carry the "u" modifier.
			// The patterns are pure ASCII, so retry in byte mode by dropping
			// the trailing "u" from each of them.
			$bytePatterns = array();
			foreach ($patterns as $pattern) {
				$bytePatterns[] = substr($pattern, 0, -1);
			}
			$text = preg_replace($bytePatterns, $replacements, $content);
		}

		if ($text === null) {
			// Still failing (e.g. PCRE limits hit): fall back to the raw
			// markup rather than returning nothing at all.
			$text = $content;
		}

		return strip_tags($text);
	}

}
|
|
|
|
|
|
|
|
?>
|