BUG Improved HTMLTextExtractor, remove non-content tags

This commit is contained in:
Ingo Schommer 2012-09-06 13:41:21 +02:00
parent 733644d6bb
commit 788a49bf9f
3 changed files with 48 additions and 2 deletions

View File

@ -17,12 +17,47 @@ class HTMLTextExtractor extends FileTextExtractor {
/** /**
* Lower priority because it's not the most clever HTML extraction. If there is something better, use it * Lower priority because it's not the most clever HTML extraction. If there is something better, use it
* @var unknown_type
*/ */
public static $priority = 10; public static $priority = 10;
/**
 * Extracts the plain-text content of an HTML file.
 *
 * Non-content elements (<head>, <style>, <script>, ...) are removed together
 * with everything inside them, a line break is inserted in front of every
 * block-level tag so that the text of adjacent blocks does not run together,
 * and the remaining markup (including HTML comments) is dropped by strip_tags().
 *
 * @param string $path Path of the HTML file to read.
 * @return string The extracted text; an empty string if the file is empty or cannot be read.
 */
function getContent($path) {
    $content = file_get_contents($path);
    if ($content === false || $content === '') {
        return '';
    }

    // Yes, yes, regex'ing HTML is evil.
    // Since we don't care about well-formedness or markup here, it does the job.

    // Elements whose contents are never visible text.
    $invisibleTags = array(
        'head', 'style', 'script', 'object', 'embed',
        'applet', 'noframes', 'noscript', 'noembed',
    );
    $invisiblePatterns = array();
    foreach ($invisibleTags as $tag) {
        // \b keeps "<head" from matching "<header ...>". The "u" modifier is
        // deliberately absent: the patterns are pure ASCII, and with "u" a
        // file that is not valid UTF-8 makes preg_replace() return NULL,
        // which would discard the whole document.
        $invisiblePatterns[] = '@<' . $tag . '\b[^>]*>.*?</' . $tag . '\s*>@si';
    }

    // Tags that start or end a block of text.
    $blockPattern = '@</?(?:'
        . 'address|blockquote|center|del'
        . '|div|h[1-9]|ins|isindex|p|pre'
        . '|dir|dl|dt|dd|li|menu|ol|ul'
        . '|table|th|td|caption'
        . '|form|button|fieldset|legend|input'
        . '|label|select|optgroup|option|textarea'
        . '|frameset|frame|iframe'
        . ')\b@i';

    // preg_replace() returns NULL on a PCRE error (e.g. backtrack limit hit on
    // a huge file); in that case keep the text from the previous step rather
    // than losing everything.
    $withoutInvisible = preg_replace($invisiblePatterns, ' ', $content);
    if ($withoutInvisible !== null) {
        $content = $withoutInvisible;
    }

    // Put a line break in front of each opening and closing block tag, so
    // "<p>One</p><p>Two</p>" does not collapse into "OneTwo" once tags are gone.
    $withBreaks = preg_replace($blockPattern, "\n\$0", $content);
    if ($withBreaks !== null) {
        $content = $withBreaks;
    }

    return strip_tags($content);
}
} }

View File

@ -6,6 +6,9 @@ class HTMLTextExtractorTest extends SapphireTest {
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html'); $content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html');
$this->assertContains('Test Headline', $content); $this->assertContains('Test Headline', $content);
$this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
$this->assertNotContains('Test Style', $content, 'Strips non-content style tags');
$this->assertNotContains('Test Script', $content, 'Strips non-content script tags');
} }
} }

View File

@ -1,8 +1,16 @@
<!doctype html> <!doctype html>
<html> <html>
<head></head> <head>
<style type="text/css">
/* Test Style */
</style>
<script type="text/javascript">
/* Test Script */
</script>
</head>
<body> <body>
<h1>Test Headline</h1> <h1>Test Headline</h1>
<p>Test Text</p> <p>Test Text</p>
<!-- Test Comment -->
</body> </body>
</html> </html>