mirror of
https://github.com/silverstripe/silverstripe-textextraction
synced 2024-10-22 11:06:00 +02:00
BUG Improved HTMLTextExtractor, remove non-content tags
This commit is contained in:
parent
733644d6bb
commit
788a49bf9f
@ -17,12 +17,47 @@ class HTMLTextExtractor extends FileTextExtractor {
|
||||
|
||||
/**
|
||||
* Lower priority because it's not the most clever HTML extraction. If there is something better, use it
|
||||
* @var int
|
||||
*/
|
||||
public static $priority = 10;
|
||||
|
||||
/**
 * Extracts the readable text of an HTML file.
 *
 * Non-content elements (<head>, <style>, <script>, <object>, ...) are removed
 * together with everything they contain, a line break is inserted in front of
 * every opening and closing block-level tag so that text from neighbouring
 * blocks does not run together, and the remaining markup (including HTML
 * comments) is dropped with strip_tags().
 *
 * @param string $path Path of the HTML file to read
 * @return string Text content of the file; empty string if the file could not be read or is empty
 */
function getContent($path) {
	$content = file_get_contents($path);
	// Unreadable or empty file: there is nothing to extract.
	if($content === false || $content === '') return '';

	// With the "u" modifier PCRE refuses subjects that are not valid UTF-8 and
	// preg_replace() returns null, which would silently discard the whole document.
	// Only request UTF-8 matching when the file really is UTF-8.
	$utf8 = (preg_match('//u', $content) === 1) ? 'u' : '';

	// Elements whose content is never shown to the reader.
	$invisibleTags = array(
		'head', 'style', 'script', 'object', 'embed', 'applet', 'noframes', 'noscript', 'noembed',
	);
	// Tag name prefixes treated as block-level.
	$blockTags = array(
		'address', 'blockquote', 'center', 'del',
		'div', 'h[1-9]', 'ins', 'isindex', 'p', 'pre',
		'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
		'table', 'th', 'td', 'caption',
		'form', 'button', 'fieldset', 'legend', 'input',
		'label', 'select', 'optgroup', 'option', 'textarea',
		'frameset', 'frame', 'iframe',
	);

	// Yes, yes, regex'ing HTML is evil.
	// Since we don't care about well-formedness or markup here, it does the job.
	$patterns = array();
	$replacements = array();
	foreach($invisibleTags as $tag) {
		// \b stops "<head" from matching "<header"; "s" lets "." span line breaks.
		$patterns[] = '@<' . $tag . '\b[^>]*>.*?</' . $tag . '>@si' . $utf8;
		$replacements[] = ' ';
	}
	// Put a line break in front of opening and closing block tags, keeping the tag
	// itself ($0) in place for strip_tags() to remove afterwards.
	$patterns[] = '@</?(?:' . implode('|', $blockTags) . ')@i' . $utf8;
	$replacements[] = "\n" . '$0';

	$stripped = preg_replace($patterns, $replacements, $content);
	// preg_replace() returns null on PCRE errors (e.g. backtrack limit exceeded);
	// fall back to the raw markup so that at least the tags get stripped.
	if($stripped === null) $stripped = $content;

	return strip_tags($stripped);
}
|
||||
}
|
||||
|
@ -6,6 +6,9 @@ class HTMLTextExtractorTest extends SapphireTest {
|
||||
|
||||
$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html');
|
||||
$this->assertContains('Test Headline', $content);
|
||||
$this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
|
||||
$this->assertNotContains('Test Style', $content, 'Strips non-content style tags');
|
||||
$this->assertNotContains('Test Script', $content, 'Strips non-content script tags');
|
||||
}
|
||||
|
||||
}
|
10
tests/fixtures/test1.html
vendored
10
tests/fixtures/test1.html
vendored
@ -1,8 +1,16 @@
|
||||
<!doctype html>
|
||||
<html>
|
||||
<head></head>
|
||||
<head>
|
||||
<style type="text/css">
|
||||
/* Test Style */
|
||||
</style>
|
||||
<script type="text/javascript">
|
||||
/* Test Script */
|
||||
</script>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Test Headline</h1>
|
||||
<p>Test Text</p>
|
||||
<!-- Test Comment -->
|
||||
</body>
|
||||
</html>
|
Loading…
Reference in New Issue
Block a user