diff --git a/integration/HTMLValue.php b/integration/HTMLValue.php
index ee38a6535..3ec0a70f8 100755
--- a/integration/HTMLValue.php
+++ b/integration/HTMLValue.php
@@ -29,6 +29,7 @@ class SS_HTMLValue extends ViewableData {
 	 * @return string
 	 */
 	public function getContent() {
+		$content = $this->cleanContent();
 		// strip the body tags from the output (which are automatically added by DOMDocument)
 		return preg_replace (
 			array (
@@ -36,7 +37,7 @@ class SS_HTMLValue extends ViewableData {
 				'/<\/body[^>]*>\s*$/i'
 			),
 			null,
-			$this->getDocument()->saveXML($this->getDocument()->documentElement->lastChild)
+			$content
 		);
 	}
 
@@ -50,7 +51,53 @@ class SS_HTMLValue extends ViewableData {
 			"$content"
 		);
 	}
-	
+
+	/**
+	 * Clean up invalid HTML, which would otherwise mess up diffs.
+	 *
+	 * The body is always serialised with DOMDocument::saveXML, which only
+	 * performs extremely simple tidying. For a more thorough clean-up either
+	 * install the php_tidy module
+	 * (see http://www.php.net/manual/en/tidy.installation.php)
+	 * or else the SilverStripe module for HTMLPurifier from
+	 * http://svn.silverstripe.com/open/modules/htmlpurifier/trunk
+	 * (see also http://htmlpurifier.org). Tidy is preferred when both exist.
+	 *
+	 * @return string
+	 */
+	protected function cleanContent() {
+		$doc = $this->getDocument();
+		// Baseline cleaning: let DOMDocument serialise the content as valid XML.
+		$content = $doc->saveXML($doc->documentElement->lastChild);
+
+		if (class_exists('Tidy')) {
+			// Prefer the Tidy class, provided by php-tidy.
+			$tidy = tidy_parse_string(
+				$content,
+				array(
+					'clean' => true,
+					'output-xhtml' => true,
+					'show-body-only' => true,
+					'wrap' => 0,
+					'input-encoding' => 'utf8',
+					'output-encoding' => 'utf8'
+				)
+			);
+			// tidy_parse_string() returns false on failure; in that case keep
+			// the DOMDocument output instead of calling a method on false.
+			if ($tidy instanceof tidy) {
+				$tidy->cleanRepair();
+				$content = (string) $tidy;
+			}
+		} else if (class_exists('HTMLPurifier')) {
+			// Otherwise look for HTMLPurifier, provided by the module.
+			$html = new HTMLPurifier();
+			$content = $html->purify($content);
+		}
+
+		return $content;
+	}
+
 	/**
 	 * @return DOMDocument
 	 */