From 788a49bf9f5422a1d8a5f415e506f70efe0ea0ec Mon Sep 17 00:00:00 2001
From: Ingo Schommer <ingo@silverstripe.com>
Date: Thu, 6 Sep 2012 13:41:21 +0200
Subject: [PATCH] BUG Improved HTMLTextExtractor, remove non-content tags

---
 code/extractors/HTMLTextExtractor.php | 37 ++++++++++++++++++++++++++-
 tests/HTMLTextExtractorTest.php       |  3 +++
 tests/fixtures/test1.html             | 10 +++++++-
 3 files changed, 48 insertions(+), 2 deletions(-)
diff --git a/code/extractors/HTMLTextExtractor.php b/code/extractors/HTMLTextExtractor.php
index e9efead..76a9918 100644
--- a/code/extractors/HTMLTextExtractor.php
+++ b/code/extractors/HTMLTextExtractor.php
@@ -17,12 +17,47 @@ class HTMLTextExtractor extends FileTextExtractor {
 
 	/**
 	 * Lower priority because its not the most clever HTML extraction. If there is something better, use it
-	 * @var unknown_type
 	 */
 	public static $priority = 10;
 
+	/**
+	 * Extracts the text content of an HTML file, by using strip_tags()
+	 * combined with regular expressions to remove non-content tags like <style> or <script>,
+	 * as well as adding line breaks before block-level opening and closing tags.
+	 *
+	 * @param string $path Path of the HTML file to read
+	 * @return string Content of the file with non-content elements and all markup removed
+	 */
 	function getContent($path) {
 		$content = file_get_contents($path);
+		// Yes, yes, regex'ing HTML is evil.
+		// Since we don't care about well-formedness or markup here, it does the job.
+		$stripped = preg_replace(
+			array(
+				// Remove invisible content (each match is replaced by a single space)
+					'@<head[^>]*?>.*?</head>@siu',
+					'@<style[^>]*?>.*?</style>@siu',
+					'@<script[^>]*?.*?</script>@siu',
+					'@<object[^>]*?.*?</object>@siu',
+					'@<embed[^>]*?.*?</embed>@siu',
+					'@<applet[^>]*?.*?</applet>@siu',
+					'@<noframes[^>]*?.*?</noframes>@siu',
+					'@<noscript[^>]*?.*?</noscript>@siu',
+					'@<noembed[^>]*?.*?</noembed>@siu',
+				// Add a line break before block tags, so adjacent blocks don't run together once tags are stripped
+					'@</?((address)|(blockquote)|(center)|(del))@iu',
+					'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
+					'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
+					'@</?((table)|(th)|(td)|(caption))@iu',
+					'@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
+					'@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
+					'@</?((frameset)|(frame)|(iframe))@iu',
+			),
+			array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0"),
+			$content
+		);
+		// preg_replace() returns null on failure (e.g. invalid UTF-8 with the /u modifier): keep the raw markup then
+		if($stripped !== null) $content = $stripped;
 		return strip_tags($content);
 	}
 }
diff --git a/tests/HTMLTextExtractorTest.php b/tests/HTMLTextExtractorTest.php
index 5a42e93..a1f2429 100644
--- a/tests/HTMLTextExtractorTest.php
+++ b/tests/HTMLTextExtractorTest.php
@@ -6,6 +6,9 @@ class HTMLTextExtractorTest extends SapphireTest {
 
 		$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html');
 		$this->assertContains('Test Headline', $content);
+		$this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
+		$this->assertNotContains('Test Style', $content, 'Strips non-content style tags');
+		$this->assertNotContains('Test Script', $content, 'Strips non-content script tags');
 	}
 
 }
\ No newline at end of file
diff --git a/tests/fixtures/test1.html b/tests/fixtures/test1.html
index db00beb..134dcfb 100644
--- a/tests/fixtures/test1.html
+++ b/tests/fixtures/test1.html
@@ -1,8 +1,16 @@
 <!doctype html>
 <html>
-<head></head>
+<head>
+	<style type="text/css">
+		/* Test Style */
+	</style>
+	<script type="text/javascript">
+		/* Test Script */
+	</script>
+</head>
 <body>
 	<h1>Test Headline</h1>
 	<p>Test Text</p>
+	<!-- Test Comment -->
 </body>
 </html>
\ No newline at end of file