MINOR: Tests for previous HTMLText#Summary and HTMLText#FirstSentence patch

BUGFIX: A couple of bugfixes on HTMLText#Summary and HTMLText#FirstSentence so the trickiest tests pass.

git-svn-id: svn://svn.silverstripe.com/silverstripe/open/modules/sapphire/branches/2.3@78728 467b73ca-7a2a-4603-9d3b-597d59a354a9
2024-10-22 12:05:37 +00:00 · 2009-06-09 20:56:24 +00:00 · 2009-06-09 20:56:24 +00:00 · 98a75735ee
commit 98a75735ee
parent 8aea22c283
2 changed files with 102 additions and 14 deletions
--- a/core/model/fieldtypes/HTMLText.php
+++ b/core/model/fieldtypes/HTMLText.php
@@ -41,32 +41,39 @@ class HTMLText extends Text {
 		/* First we need the text of the first paragraph, without tags. Try using SimpleXML first */
 		if (class_exists('SimpleXMLElement')) {
 			$doc = new DOMDocument();
-			$doc->strictErrorChecking = FALSE;
-			if ($doc->loadHTML('<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>' . $this->value)) {
+			
+			/* Catch warnings thrown by loadHTML and turn them into a failure boolean rather than a SilverStripe error */
+			set_error_handler(create_function('$no, $str', 'throw new Exception("HTML Parse Error: ".$str);'), E_ALL);
+			try { $res = $doc->loadHTML('<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>' . $this->value); }
+			catch (Exception $e) { $res = false; }
+			restore_error_handler();
+			
+			if ($res) {
 				$xml = simplexml_import_dom($doc);
 				$res = $xml->xpath('//p');
 				if (!empty($res)) $str = strip_tags($res[0]->asXML());
 			}
 		}
 		
+		/* If that failed, the passed HTML is most likely broken. Use a simple regex plus a custom, more brutal tag stripper. We don't use strip_tags()
+		 * because it does very badly on broken HTML. */
 		if (!$str) {
-			/* If that failed, use a simple regex + a strip_tags. We look for the first paragraph with some words in it, not just the first paragraph. 
-			 * Not as good on broken HTML, and doesn't understand escaping or cdata blocks, but will probably work on even very malformed HTML */
-			if (preg_match('{<p[^>]*>(.*[A-Za-z]+.*)</p>}', $this->value, $matches)) {
-				$str = strip_tags($matches[1]);
-			}
-			/* If _that_ failed, just use the whole text with strip_tags */
-			else {
-				$str = strip_tags($this->value);
-			}
+			/* See if we can pull a paragraph out*/
+			if (preg_match('{<p(\s[^<>]*)?>(.*[A-Za-z]+.*)</p>}', $this->value, $matches)) $str = $matches[2];
+			/* If _that_ failed, just use the whole text */
+			else $str = $this->value;
+			
+			/* Now pull out all the html-alike stuff */
+			$str = preg_replace('{</?[a-zA-Z]+[^<>]*>}', '', $str); /* Take out anything that is obviously a tag */
+			$str = preg_replace('{</|<|>}', '', $str); /* Strip out any left over looking bits. Textual < or > should already be encoded to &lt; or &gt; */
 		}
 		
-		/* Now split into words. If we are under the maxWords limit, just return the whole string */
+		/* Now split into words. If we are under the maxWords limit, just return the whole string (re-implode for whitespace normalization) */
 		$words = preg_split('/\s+/', $str);
-		if ($maxWords == -1 || count($words) <= $maxWords) return $str;
+		if ($maxWords == -1 || count($words) <= $maxWords) return implode(' ', $words);

 		/* Otherwise work backwards for a looking for a sentence ending (we try to avoid abbreviations, but aren't very good at it) */
-		for ($i = $maxWords; $i > $maxWords - $flex; $i--) {
+		for ($i = $maxWords; $i >= $maxWords - $flex && $i >= 0; $i--) {
 			if (preg_match('/\.$/', $words[$i]) && !preg_match('/(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $words[$i])) {
 				return implode(' ', array_slice($words, 0, $i+1));
 			}
--- a/tests/fieldtypes/HTMLTextTest.php
+++ b/tests/fieldtypes/HTMLTextTest.php
@@ -21,5 +21,86 @@ class HTMLTextTest extends SapphireTest {
 		}
 	}
 	
+	/**
+	 * Summary() called with no arguments should yield the tag-free text of the
+	 * first paragraph: headings are skipped, inline tags are stripped, unclosed
+	 * tags are tolerated and later paragraphs are left out.
+	 */
+	function testSummaryBasics() {
+		/* Map of input HTML => summary text we expect back */
+		$expectations = array(
+			'<h1>Should not take header</h1><p>Should take paragraph</p>' => 'Should take paragraph',
+			'<p>Should strip <b>tags, but leave</b> text</p>' => 'Should strip tags, but leave text',
+			'<p>Unclosed tags <br>should not phase it</p>' => 'Unclosed tags should not phase it',
+			'<p>Second paragraph</p><p>should not cause errors or appear in output</p>' => 'Second paragraph'
+		);
+		
+		foreach($expectations as $html => $summary) {
+			$field = new HTMLText('Test');
+			$field->setValue($html);
+			$actual = $field->Summary();
+			$this->assertEquals($summary, $actual);
+		}
+	}
+
+	/**
+	 * Summary(5, 3, '...') should truncate long paragraphs and append the
+	 * ending, preferring to stop at a sentence end when one falls close enough
+	 * to the word limit.
+	 */
+	function testSummaryLimits() {
+		/* Map of input HTML => summary text we expect back */
+		$expectations = array(
+			'<p>A long paragraph should be cut off if limit is set</p>' => 'A long paragraph should be...',
+			'<p>No matter <i>how many <b>tags</b></i> are in it</p>' => 'No matter how many tags...',
+			'<p>A sentence is. nicer than hard limits</p>' => 'A sentence is.',
+			'<p>But not. If it\'s too short</p>' => 'But not. If it\'s too...'
+		);
+		
+		foreach($expectations as $html => $summary) {
+			$field = new HTMLText('Test');
+			$field->setValue($html);
+			$actual = $field->Summary(5, 3, '...');
+			$this->assertEquals($summary, $actual);
+		}
+	}
+
+	/**
+	 * Whatever string is passed as the third argument of Summary() should be
+	 * appended verbatim to the truncated text, including an empty string.
+	 */
+	function testSummaryEndings() {
+		$html = '<p>Cut it off, cut it off</p>';
+		$truncated = 'Cut it off, cut';
+		
+		/* Endings to try: an ellipsis, an arbitrary marker, and nothing at all */
+		$suffixes = array(
+			'...', ' -> more', ''
+		);
+		
+		foreach($suffixes as $suffix) {
+			$field = new HTMLText();
+			$field->setValue($html);
+			$expected = $truncated.$suffix;
+			$this->assertEquals($expected, $field->Summary(4, 0, $suffix));
+		}
+	}
+
+	/**
+	 * A flex value (10) larger than the word limit (4) must not cause an error;
+	 * the text is still expected to be cut at the word limit.
+	 */
+	function testSummaryFlexTooBigShouldNotCauseError() {
+		$field = new HTMLText();
+		$field->setValue('<p>Cut it off, cut it off</p>');
+		
+		$actual = $field->Summary(4, 10, '');
+		$this->assertEquals('Cut it off, cut', $actual);
+	}
+	
+	/**
+	 * Badly malformed markup (stray angle brackets, mismatched closing tags,
+	 * an unterminated attribute, a missing paragraph) should still produce a
+	 * clean four-word summary.
+	 */
+	function testSummaryInvalidHTML() {
+		/* Map of broken input HTML => summary text we expect back */
+		$expectations = array(
+			'It\'s got a <p<> tag, but<p junk true>This doesn\'t <a id="boo">make</b class="wa"> < ><any< sense</p>' => 'This doesn\'t make any',
+			'This doesn\'t <a style="much horray= true>even</b> < ><have< a <i>p tag' => 'This doesn\'t even have'
+		);
+		
+		foreach($expectations as $brokenHtml => $summary) {
+			$field = new HTMLText();
+			$field->setValue($brokenHtml);
+			$actual = $field->Summary(4, 0, '');
+			$this->assertEquals($summary, $actual);
+		}
+	}
+
+	/**
+	 * FirstSentence() should return the first sentence of the first paragraph:
+	 * headings are ignored, "Mr." does not end the sentence, and a sentence of
+	 * over a hundred words is returned in full.
+	 */
+	function testFirstSentence() {
+		/* Filler used to build a very long first sentence */
+		$many = str_repeat('many ', 100);
+		
+		/* Map of input HTML => first sentence we expect back */
+		$expectations = array(
+			'<h1>should ignore</h1><p>First sentence. Second sentence.</p>' => 'First sentence.',
+			'<h1>should ignore</h1><p>First Mr. sentence. Second sentence.</p>' => 'First Mr. sentence.',
+			"<h1>should ignore</h1><p>Sentence with {$many}words. Second sentence.</p>" => "Sentence with {$many}words.",
+		);
+		
+		foreach($expectations as $html => $sentence) {
+			$field = new HTMLText();
+			$field->setValue($html);
+			$actual = $field->FirstSentence();
+			$this->assertEquals($sentence, $actual);
+		}
+	}	
 }
 ?>