MINOR: Tests for previous HTMLText#Summary and HTMLText#FirstSentence patch

BUGFIX: A couple of bugfixes on HTMLText#Summary and HTMLText#FirstSentence so the trickiest tests pass 

git-svn-id: svn://svn.silverstripe.com/silverstripe/open/modules/sapphire/branches/2.3@78728 467b73ca-7a2a-4603-9d3b-597d59a354a9
This commit is contained in:
Hamish Friedlander 2009-06-09 20:56:24 +00:00 committed by Sam Minnee
parent 8aea22c283
commit 98a75735ee
2 changed files with 102 additions and 14 deletions

View File

@ -41,32 +41,39 @@ class HTMLText extends Text {
/* First we need the text of the first paragraph, without tags. Try using SimpleXML first */
if (class_exists('SimpleXMLElement')) {
$doc = new DOMDocument();
$doc->strictErrorChecking = FALSE;
if ($doc->loadHTML('<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>' . $this->value)) {
/* Catch warnings thrown by loadHTML and turn them into a failure boolean rather than a SilverStripe error */
set_error_handler(create_function('$no, $str', 'throw new Exception("HTML Parse Error: ".$str);'), E_ALL);
try { $res = $doc->loadHTML('<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>' . $this->value); }
catch (Exception $e) { $res = false; }
restore_error_handler();
if ($res) {
$xml = simplexml_import_dom($doc);
$res = $xml->xpath('//p');
if (!empty($res)) $str = strip_tags($res[0]->asXML());
}
}
/* If that failed, most likely the passed HTML is broken. use a simple regex + a custom more brutal strip_tags. We don't use strip_tags because
* that does very badly on broken HTML*/
if (!$str) {
/* If that failed, use a simple regex + a strip_tags. We look for the first paragraph with some words in it, not just the first paragraph.
* Not as good on broken HTML, and doesn't understand escaping or cdata blocks, but will probably work on even very malformed HTML */
if (preg_match('{<p[^>]*>(.*[A-Za-z]+.*)</p>}', $this->value, $matches)) {
$str = strip_tags($matches[1]);
}
/* If _that_ failed, just use the whole text with strip_tags */
else {
$str = strip_tags($this->value);
}
/* See if we can pull a paragraph out*/
if (preg_match('{<p(\s[^<>]*)?>(.*[A-Za-z]+.*)</p>}', $this->value, $matches)) $str = $matches[2];
/* If _that_ failed, just use the whole text */
else $str = $this->value;
/* Now pull out all the html-alike stuff */
$str = preg_replace('{</?[a-zA-Z]+[^<>]*>}', '', $str); /* Take out anything that is obviously a tag */
$str = preg_replace('{</|<|>}', '', $str); /* Strip out any left over looking bits. Textual < or > should already be encoded to &lt; or &gt; */
}
/* Now split into words. If we are under the maxWords limit, just return the whole string */
/* Now split into words. If we are under the maxWords limit, just return the whole string (re-implode for whitespace normalization) */
$words = preg_split('/\s+/', $str);
if ($maxWords == -1 || count($words) <= $maxWords) return $str;
if ($maxWords == -1 || count($words) <= $maxWords) return implode(' ', $words);
/* Otherwise work backwards looking for a sentence ending (we try to avoid abbreviations, but aren't very good at it) */
for ($i = $maxWords; $i > $maxWords - $flex; $i--) {
for ($i = $maxWords; $i >= $maxWords - $flex && $i >= 0; $i--) {
if (preg_match('/\.$/', $words[$i]) && !preg_match('/(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $words[$i])) {
return implode(' ', array_slice($words, 0, $i+1));
}

View File

@ -21,5 +21,86 @@ class HTMLTextTest extends SapphireTest {
}
}
/**
 * Summary() called with no arguments: each input HTML snippet must yield the
 * expected tag-free text listed next to it.
 */
function testSummaryBasics() {
	/* Input HTML => text Summary() is expected to return */
	$expectedByHtml = array(
		'<h1>Should not take header</h1><p>Should take paragraph</p>' => 'Should take paragraph',
		'<p>Should strip <b>tags, but leave</b> text</p>' => 'Should strip tags, but leave text',
		'<p>Unclosed tags <br>should not phase it</p>' => 'Unclosed tags should not phase it',
		'<p>Second paragraph</p><p>should not cause errors or appear in output</p>' => 'Second paragraph'
	);

	foreach (array_keys($expectedByHtml) as $html) {
		/* Fresh field per case so no value carries over between iterations */
		$field = new HTMLText('Test');
		$field->setValue($html);

		$expected = $expectedByHtml[$html];
		$this->assertEquals($expected, $field->Summary());
	}
}
/**
 * Summary(5, 3, '...'): checks truncation against the expected strings below,
 * including the case that stops at an early sentence ending and the case
 * where the sentence ending is too early to be used.
 */
function testSummaryLimits() {
	/* Each entry is array(input HTML, expected summary) */
	$scenarios = array(
		array('<p>A long paragraph should be cut off if limit is set</p>', 'A long paragraph should be...'),
		array('<p>No matter <i>how many <b>tags</b></i> are in it</p>', 'No matter how many tags...'),
		array('<p>A sentence is. nicer than hard limits</p>', 'A sentence is.'),
		array('<p>But not. If it\'s too short</p>', 'But not. If it\'s too...'),
	);

	foreach ($scenarios as $scenario) {
		list($html, $expected) = $scenario;

		$field = new HTMLText('Test');
		$field->setValue($html);

		$this->assertEquals($expected, $field->Summary(5, 3, '...'));
	}
}
/**
 * Summary(4, 0, $suffix): the truncated text must be followed by exactly the
 * suffix that was passed in, for several suffixes including the empty string.
 */
function testSummaryEndings() {
	$html = '<p>Cut it off, cut it off</p>';
	$truncated = 'Cut it off, cut';

	foreach (array('...', ' -> more', '') as $suffix) {
		$field = new HTMLText();
		$field->setValue($html);

		$expected = $truncated . $suffix;
		$this->assertEquals($expected, $field->Summary(4, 0, $suffix));
	}
}
/**
 * Summary(4, 10, ''): the second argument is larger than the first; the call
 * must still return the plain four-word truncation.
 */
function testSummaryFlexTooBigShouldNotCauseError() {
	$field = new HTMLText();
	$field->setValue('<p>Cut it off, cut it off</p>');

	$summary = $field->Summary(4, 10, '');

	$this->assertEquals('Cut it off, cut', $summary);
}
/**
 * Summary(4, 0, '') on malformed markup: each broken snippet must still
 * produce the expected four words of plain text.
 */
function testSummaryInvalidHTML() {
	/* Broken HTML => expected summary */
	$brokenSamples = array(
		'It\'s got a <p<> tag, but<p junk true>This doesn\'t <a id="boo">make</b class="wa"> < ><any< sense</p>' => 'This doesn\'t make any',
		'This doesn\'t <a style="much horray= true>even</b> < ><have< a <i>p tag' => 'This doesn\'t even have'
	);

	foreach ($brokenSamples as $brokenHtml => $expected) {
		$field = new HTMLText();
		$field->setValue($brokenHtml);

		$summary = $field->Summary(4, 0, '');
		$this->assertEquals($expected, $summary);
	}
}
/**
 * FirstSentence(): the heading is ignored, "Mr." does not end the sentence,
 * and a sentence containing a hundred extra words is returned in full.
 */
function testFirstSentence() {
	/* Filler used to build one very long sentence */
	$many = str_repeat('many ', 100);

	/* Each entry is array(input HTML, expected first sentence) */
	$scenarios = array(
		array('<h1>should ignore</h1><p>First sentence. Second sentence.</p>', 'First sentence.'),
		array('<h1>should ignore</h1><p>First Mr. sentence. Second sentence.</p>', 'First Mr. sentence.'),
		array("<h1>should ignore</h1><p>Sentence with {$many}words. Second sentence.</p>", "Sentence with {$many}words."),
	);

	foreach ($scenarios as $scenario) {
		list($html, $expected) = $scenario;

		$field = new HTMLText();
		$field->setValue($html);

		$this->assertEquals($expected, $field->FirstSentence());
	}
}
}
?>