BUG Improved HTMLTextExtractor, remove non-content tags

2024-10-22 11:06:00 +02:00 · 2012-09-06 13:41:21 +02:00 · 2012-09-06 13:41:21 +02:00 · 788a49bf9f
commit 788a49bf9f
parent 733644d6bb
3 changed files with 48 additions and 2 deletions
--- a/code/extractors/HTMLTextExtractor.php
+++ b/code/extractors/HTMLTextExtractor.php
@ -17,12 +17,47 @@ class HTMLTextExtractor extends FileTextExtractor {

 	/**
 	 * Lower priority because it's not the most clever HTML extraction. If there is something better, use it
-	 * @var unknown_type
 	 */
 	public static $priority = 10;

+	/**
+	 * Extracts the text content of an HTML file by using strip_tags()
+	 * combined with regular expressions to remove non-content tags like <style> or <script>,
+	 * as well as adding line breaks around block-level tags.
+	 * 
+	 * @param  string $path Path of the HTML file to read
+	 * @return string       The file's content with HTML tags stripped
+	 */
 	function getContent($path) {
 		$content = file_get_contents($path);
+		// Yes, yes, regex'ing HTML is evil.
+		// Since we don't care about well-formedness or markup here, it does the job.
+		// The patterns are pure ASCII, so they deliberately omit the "u" modifier:
+		// with it, preg_replace() fails (returns null) on any file that is not
+		// valid UTF-8 and the whole document would be lost.
+		// Elements whose entire content is invisible; each is replaced by a space.
+		$invisiblePatterns = array(
+			'@<head[^>]*?>.*?</head>@si',
+			'@<style[^>]*?>.*?</style>@si',
+			'@<script[^>]*?>.*?</script>@si',
+			'@<object[^>]*?>.*?</object>@si',
+			'@<embed[^>]*?>.*?</embed>@si',
+			'@<applet[^>]*?>.*?</applet>@si',
+			'@<noframes[^>]*?>.*?</noframes>@si',
+			'@<noscript[^>]*?>.*?</noscript>@si',
+			'@<noembed[^>]*?>.*?</noembed>@si',
+		);
+		// Opening/closing block-level tags; each gets a line break put in front of it
+		// so adjacent blocks don't run together once strip_tags() removes the markup.
+		$blockPatterns = array(
+			'@</?((address)|(blockquote)|(center)|(del))@i',
+			'@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@i',
+			'@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@i',
+			'@</?((table)|(th)|(td)|(caption))@i',
+			'@</?((form)|(button)|(fieldset)|(legend)|(input))@i',
+			'@</?((label)|(select)|(optgroup)|(option)|(textarea))@i',
+			'@</?((frameset)|(frame)|(iframe))@i',
+		);
+		// Build the replacement list from the pattern lists so both always line up.
+		$filtered = preg_replace(
+			array_merge($invisiblePatterns, $blockPatterns),
+			array_merge(
+				array_fill(0, count($invisiblePatterns), ' '),
+				array_fill(0, count($blockPatterns), "\n\$0")
+			),
+			$content
+		);
+		// preg_replace() returns null on a PCRE error (e.g. backtrack limit hit);
+		// fall back to the unfiltered markup rather than returning nothing.
+		if($filtered !== null) $content = $filtered;
 		return strip_tags($content);
 	}
 }
--- a/tests/HTMLTextExtractorTest.php
+++ b/tests/HTMLTextExtractorTest.php
@ -6,6 +6,9 @@ class HTMLTextExtractorTest extends SapphireTest {

 		$content = $extractor->getContent(Director::baseFolder() . '/textextraction/tests/fixtures/test1.html');
 		$this->assertContains('Test Headline', $content);
+		$this->assertNotContains('Test Comment', $content, 'Strips HTML comments');
+		$this->assertNotContains('Test Style', $content, 'Strips non-content style tags');
+		$this->assertNotContains('Test Script', $content, 'Strips non-content script tags');
 	}

 }
--- a/tests/fixtures/test1.html
+++ b/tests/fixtures/test1.html
@ -1,8 +1,16 @@
 <!doctype html>
 <html>
-<head></head>
+<head>
+	<style type="text/css">
+		/* Test Style */
+	</style>
+	<script type="text/javascript">
+		/* Test Script */
+	</script>
+</head>
 <body>
 	<h1>Test Headline</h1>
 	<p>Test Text</p>
+	<!-- Test Comment -->
 </body>
 </html>