BUGFIX: Replace HTMLText#Summary with one that works.

API CHANGE: Added two arguments to HTMLText#Summary. Minimal impact since previously any usage of this function threw an error. ENHANCEMENT: Add HTMLText#FirstSentence based on new HTMLText#Summary git-svn-id: svn://svn.silverstripe.com/silverstripe/open/modules/sapphire/branches/2.3@78618 467b73ca-7a2a-4603-9d3b-597d59a354a9
2024-10-22 12:05:37 +00:00 · 2009-06-08 05:38:54 +00:00 · 2009-06-08 05:38:54 +00:00 · f74069f060
commit f74069f060
parent b38315bdba
1 changed files with 71 additions and 50 deletions
--- a/core/model/fieldtypes/HTMLText.php
+++ b/core/model/fieldtypes/HTMLText.php
@ -23,62 +23,83 @@ class HTMLText extends Text {
 	}

 	/**
-	 * Create a summary of the content. This will either be the first paragraph, or the first $maxWords 
-	 * words, whichever is shorter
+	 * Create a summary of the content. This will be some section of the first paragraph, limited by
+	 * $maxWords. All internal tags are stripped out - the return value is a string
+	 * 
+	 * This is sort of the HTML aware equivalent to Text#Summary, although the logic for summarising is not exactly the same
+	 * 
+	 * @param int $maxWords Maximum number of words to return - may return less, but never more. Pass -1 for no limit
+	 * @param int $flex Number of words to search through when looking for a nice cut point 
+	 * @param string $add What to add to the end of the summary if we cut at a less-than-ideal cut point
+	 * @return string A nice(ish) summary with no html tags (but possibly still some html entities)
+	 * 
+	 * @see sapphire/core/model/fieldtypes/Text#Summary($maxWords)
 	 */
-	public function Summary( $maxWords = 50 ) {
-		// split the string into tags and words
-		$parts = Convert::xml2array( $this->value );
-		
-		// store any unmatched tags
-		$tagStack = array();
-		
-		$pIndex = 0;
-		
-		// find the first paragraph tag
-		for( $i = 0; $i < count( $parts ); $i++ )
-			if( strpos( $parts[$i], '<p' ) === 0 ) {
-				$pIndex = $i;
-				break;
-			}
-				
-		$summary = '';
-		$words = 0;
-		
-		// create the summary, keeping track of opening and closing tags
-		while( $words <= $maxWords && $pIndex < count( $parts ) ) {
-			if( $parts[$pIndex] == '</p>' ) {
-				$summary .= $parts[$pIndex];
-				break;
-			}
-			elseif( preg_match( '/<\/(\w+)>/', $parts[$pIndex], $endTag ) && $endTag[1] == substr( $tagStack[count($tagStack) - 1], 1, strlen( $endTag[1] ) ) ) {
-				array_pop( $tagStack );
-				$words++;
-				$summary .= $parts[$pIndex++];
-			} elseif( preg_match( '/^<\w+/', $parts[$pIndex] ) ) {
-				array_push( $tagStack, $parts[$pIndex] );
-				$words++;
-				$summary .= $parts[$pIndex++];
-			} else
-				$summary .= $parts[$pIndex++] . ' ';
-		}
-		
-		// Tags that shouldn't be closed
-		$noClose = array("br", "img");
-		
-		// make sure that the summary is well formed XHTML by closing tags
-		while( $openTag = array_pop( $tagStack ) ) {
-			preg_match( '/^<(\w+)\s+/', $openTag, $tagName );
-			if(sizeof($tagName) > 0) {
-			    if(!in_array($tagName[1], $noClose)) {
-					$summary .= "</{$tagName[1]}>";
-			    }
+	public function Summary($maxWords = 50, $flex = 15, $add = '...') {
+		$str = false;
+		$html = (string)$this->value;
+
+		/* First we need the text of the first paragraph, without tags. Try the DOM + SimpleXML extensions first.
+		 * Both are needed: DOMDocument parses the (possibly sloppy) HTML, SimpleXML runs the xpath query */
+		if (class_exists('DOMDocument') && function_exists('simplexml_import_dom')) {
+			/* Collect libxml parse errors internally so malformed HTML doesn't raise PHP warnings, then restore the old setting */
+			$previousErrors = libxml_use_internal_errors(true);
+			$doc = new DOMDocument();
+			$doc->strictErrorChecking = FALSE;
+			/* The meta tag tells the parser that the fragment is UTF-8 encoded */
+			$loaded = $doc->loadHTML('<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>' . $html);
+			libxml_clear_errors();
+			libxml_use_internal_errors($previousErrors);
+
+			if ($loaded) {
+				$xml = simplexml_import_dom($doc);
+				/* simplexml_import_dom does not return an element on failure, so only query a real one */
+				$res = ($xml instanceof SimpleXMLElement) ? $xml->xpath('//p') : false;
+				if (!empty($res)) $str = strip_tags($res[0]->asXML());
 			}
 		}
 		
-		return $summary;
+		if (!$str) {
+			/* If that failed, use a simple regex + a strip_tags. We look for the first paragraph with some words in it, not just the first paragraph. 
+			 * Not as good on broken HTML, and doesn't understand escaping or cdata blocks, but will probably work on even very malformed HTML.
+			 * The match is non-greedy and dot-all so that each <p>...</p> pair is considered on its own, even when several paragraphs share
+			 * a line or one paragraph spans several lines. Requiring whitespace after "<p" stops <pre>, <param> etc from matching */
+			if (preg_match_all('{<p(?:\s[^>]*)?>(.*?)</p>}is', $html, $matches)) {
+				foreach ($matches[1] as $inner) {
+					$text = strip_tags($inner);
+					/* Test the stripped text, so that letters inside tag names don't count as words */
+					if (preg_match('/[A-Za-z]/', $text)) {
+						$str = $text;
+						break;
+					}
+				}
+			}
+			/* If _that_ failed, just use the whole text with strip_tags */
+			if (!$str) $str = strip_tags($html);
+		}
+		
+		/* Now split into words (ignoring leading / trailing whitespace). If we are under the maxWords limit, just return the whole string */
+		$words = preg_split('/\s+/', $str, -1, PREG_SPLIT_NO_EMPTY);
+		if ($maxWords == -1 || count($words) <= $maxWords) return $str;
+
+		/* Otherwise work backwards looking for a sentence ending (we try to avoid abbreviations, but aren't very good at it).
+		 * The last word we may keep is at index $maxWords - 1, so start there to honour the "never more than $maxWords" promise,
+		 * and never look before the first word, even if $flex is bigger than $maxWords */
+		$lowest = max(0, $maxWords - $flex);
+		for ($i = $maxWords - 1; $i >= $lowest; $i--) {
+			/* \b anchors the abbreviation to the start of a word, so "items." or "piano." aren't mistaken for "Ms." or "No." */
+			if (preg_match('/\.$/', $words[$i]) && !preg_match('/\b(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $words[$i])) {
+				return implode(' ', array_slice($words, 0, $i+1));
+			}
+		}
+		
+		/* If we didn't find a sentence ending quickly enough, just cut at the maxWords point and add $add ('...' by default) to the end */
+		return implode(' ', array_slice($words, 0, $maxWords)) . $add;
+	}
 	
+	/**
+	 * Returns the first sentence of the text that Summary() extracts (normally the first paragraph). If no sentence
+	 * ending can be found in that text it returns the same as Summary() called with its default arguments
+	 * 
+	 * A sentence ending is a word that finishes with a full stop and isn't one of a few known abbreviations (Dr. Mr. No. etc)
+	 * 
+	 * This is the HTML aware equivalent to Text#FirstSentence
+	 * 
+	 * @return string The first sentence, or the default Summary() if no sentence ending was found
+	 * 
+	 * @see sapphire/core/model/fieldtypes/Text#FirstSentence()
+	 */
+	public function FirstSentence() {
+		/* Use summary's html processing logic to get the first paragraph. -1 means no word limit */
+		$paragraph = $this->Summary(-1);
+		
+		/* Then look for the first sentence ending. We could probably use a nice regex, but for now this will do */
+		$words = preg_split('/\s+/', $paragraph, -1, PREG_SPLIT_NO_EMPTY);
+		foreach ($words as $i => $word) {
+			/* \b anchors the abbreviation to the start of a word, so "items." or "piano." aren't mistaken for "Ms." or "No." */
+			if (preg_match('/\.$/', $word) && !preg_match('/\b(Dr|Mr|Mrs|Ms|Miss|Sr|Jr|No)\.$/i', $word)) {
+				return implode(' ', array_slice($words, 0, $i+1));
+			}
+		}
+		
+		/* If we didn't find a sentence ending, use the summary. We re-call rather than using paragraph so that Summary will limit the result this time */
+		return $this->Summary();
+	}
+	
 	public function scaffoldFormField($title = null, $params = null) {
 		return new HtmlEditorField($this->name, $title);
 	}