From 788a49bf9f5422a1d8a5f415e506f70efe0ea0ec Mon Sep 17 00:00:00 2001 From: Ingo Schommer Date: Thu, 6 Sep 2012 13:41:21 +0200 Subject: [PATCH] BUG Improved HTMLTextExtractor, remove non-content tags --- code/extractors/HTMLTextExtractor.php | 37 ++++++++++++++++++++++++++- tests/HTMLTextExtractorTest.php | 3 +++ tests/fixtures/test1.html | 10 +++++++- 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/code/extractors/HTMLTextExtractor.php b/code/extractors/HTMLTextExtractor.php index e9efead..76a9918 100644 --- a/code/extractors/HTMLTextExtractor.php +++ b/code/extractors/HTMLTextExtractor.php @@ -17,12 +17,47 @@ class HTMLTextExtractor extends FileTextExtractor { /** * Lower priority because its not the most clever HTML extraction. If there is something better, use it - * @var unknown_type */ public static $priority = 10; + /** + * Extracts content from regex, by using strip_tags() + * combined with regular expressions to remove non-content tags like @siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + '@]*?.*?@siu', + // Add line breaks before and after blocks + '@getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html'); $this->assertContains('Test Headline', $content); + $this->assertNotContains('Test Comment', $content, 'Strips HTML comments'); + $this->assertNotContains('Test Style', $content, 'Strips non-content style tags'); + $this->assertNotContains('Test Script', $content, 'Strips non-content script tags'); } } \ No newline at end of file diff --git a/tests/fixtures/test1.html b/tests/fixtures/test1.html index db00beb..134dcfb 100644 --- a/tests/fixtures/test1.html +++ b/tests/fixtures/test1.html @@ -1,8 +1,16 @@ - + + + +

Test Headline

Test Text

+ \ No newline at end of file