diff --git a/_config/html.yml b/_config/html.yml new file mode 100644 index 000000000..940e3165a --- /dev/null +++ b/_config/html.yml @@ -0,0 +1,3 @@ +Injector: + HTMLValue: + class: SS_HTML4Value diff --git a/core/Diff.php b/core/Diff.php index 7fd94771a..2ae9f78ee 100644 --- a/core/Diff.php +++ b/core/Diff.php @@ -686,7 +686,7 @@ class Diff $content = $cleaner->cleanHTML($content); } else { // At most basic level of cleaning, use DOMDocument to save valid XML. - $doc = new SS_HTMLValue($content); + $doc = Injector::inst()->create('HTMLValue', $content); $content = $doc->getContent(); } diff --git a/core/HTMLCleaner.php b/core/HTMLCleaner.php index 53cf02210..d65cb411e 100644 --- a/core/HTMLCleaner.php +++ b/core/HTMLCleaner.php @@ -69,7 +69,7 @@ class PurifierHTMLCleaner extends HTMLCleaner { public function cleanHTML($content) { $html = new HTMLPurifier(); - $doc = new SS_HTMLValue($html->purify($content)); + $doc = Injector::inst()->create('HTMLValue', $html->purify($content)); return $doc->getContent(); } } diff --git a/forms/HtmlEditorField.php b/forms/HtmlEditorField.php index ab5467e6d..a2bd3d172 100644 --- a/forms/HtmlEditorField.php +++ b/forms/HtmlEditorField.php @@ -56,8 +56,8 @@ class HtmlEditorField extends TextareaField { */ public function Field($properties = array()) { // mark up broken links - $value = new SS_HTMLValue($this->value); - + $value = Injector::inst()->create('HTMLValue', $this->value); + if($links = $value->getElementsByTagName('a')) foreach($links as $link) { $matches = array(); @@ -103,7 +103,7 @@ class HtmlEditorField extends TextareaField { $linkedPages = array(); $linkedFiles = array(); - $htmlValue = new SS_HTMLValue($this->value); + $htmlValue = Injector::inst()->create('HTMLValue', $this->value); if(class_exists('SiteTree')) { // Populate link tracking for internal links & links to asset files. diff --git a/model/HTMLValue.php b/model/HTMLValue.php index e06d3e5d7..9c660ffea 100644 --- a/model/HTMLValue.php +++ b/model/HTMLValue.php @@ -1,69 +1,142 @@ setDocument(new DOMDocument('1.0', 'UTF-8')); - $this->setScrictErrorChecking(false); - $this->setOutputFormatting(false); - $this->setContent($content); +abstract class SS_HTMLValue extends ViewableData { + public function __construct($fragment = null) { + if ($fragment) $this->setContent($fragment); parent::__construct(); } - /** - * Should strict error checking be used? - * @param boolean $bool - */ - public function setScrictErrorChecking($bool) { - $this->getDocument()->scrictErrorChecking = $bool; - } - - /** - * Should the output be formatted? - * @param boolean $bool - */ - public function setOutputFormatting($bool) { - $this->getDocument()->formatOutput = $bool; - } + abstract public function setContent($fragment); /** + * @param string $content * @return string */ public function getContent() { - // strip any surrounding tags before the and after the which are automatically added by - // DOMDocument. Note that we can't use the argument to saveHTML() as it's only supported in PHP 5.3.6+, - // we support 5.3.2 as a minimum in addition to the above, trim any surrounding newlines from the output + $doc = clone $this->getDocument(); + $xp = new DOMXPath($doc); - // shortcodes use square brackets which get escaped into HTML entities by saveHTML() - // this manually replaces them back to square brackets so that the shortcodes still work correctly - // we can't use urldecode() here, as valid characters like "+" will be incorrectly replaced with spaces - return trim( - preg_replace( - array( - '/(.*)/is', - '/<\/body>(.*)/is', - ), - '', - str_replace(array('%5B', '%5D'), array('[', ']'), $this->getDocument()->saveHTML()) - ) + // If there's no body, the content is empty string + if (!$doc->getElementsByTagName('body')->length) return ''; + + // saveHTML Percentage-encodes any URI-based attributes. We don't want this, since it interferes with + // shortcodes. So first, save all the attribute values for later restoration. + $attrs = array(); $i = 0; + + foreach ($xp->query('//body//@*') as $attr) { + $key = "__HTMLVALUE_".($i++); + $attrs[$key] = $attr->value; + $attr->value = $key; + } + + // Then, call saveHTML & extract out the content from the body tag + $res = preg_replace( + array( + '/^(.*?)/is', + '/<\/body>(.*?)$/isD', + ), + '', + $doc->saveHTML() ); + + // Then replace the saved attributes with their original versions + $res = preg_replace_callback('/__HTMLVALUE_(\d+)/', function($matches) use ($attrs) { + return $attrs[$matches[0]]; + }, $res); + + return $res; } + /** @see HTMLValue::getContent() */ + public function forTemplate() { + return $this->getContent(); + } + + /** @var DOMDocument */ + private $document = null; + /** @var bool */ + private $valid = true; + + /** + * Get the DOMDocument for the passed content + * @return DOMDocument | false - Return false if HTML not valid, the DOMDocument instance otherwise + */ + public function getDocument() { + if (!$this->valid) { + return false; + } + else if ($this->document) { + return $this->document; + } + else { + $this->document = new DOMDocument('1.0', 'UTF-8'); + $this->document->strictErrorChecking = false; + $this->document->formatOutput = false; + + return $this->document; + } + } + + /** + * Is this HTMLValue in an errored state? + * @return bool + */ + public function isValid() { + return $this->valid; + } + + /** + * @param DOMDocument $document + */ + public function setDocument($document) { + $this->document = $document; + $this->valid = true; + } + + public function setInvalid() { + $this->document = $this->valid = false; + } + + /** + * Pass through any missed method calls to DOMDocument (if they exist) + * so that HTMLValue can be treated mostly like an instance of DOMDocument + */ + public function __call($method, $arguments) { + $doc = $this->getDocument(); + + if(method_exists($doc, $method)) { + return call_user_func_array(array($doc, $method), $arguments); + } + else { + return parent::__call($method, $arguments); + } + } + + /** + * Make an xpath query against this HTML + * + * @param $query string - The xpath query string + * @return DOMNodeList + */ + public function query($query) { + $xp = new DOMXPath($this->getDocument()); + return $xp->query($query); + } +} + +class SS_HTML4Value extends SS_HTMLValue { + /** * @param string $content * @return bool @@ -73,41 +146,12 @@ class SS_HTMLValue extends ViewableData { // This behaviour is apparently XML spec, but we don't want this because it messes up the HTML $content = str_replace(chr(13), '', $content); + // Reset the document if we're in an invalid state for some reason + if (!$this->isValid()) $this->setDocument(null); + return @$this->getDocument()->loadHTML( '' . "$content" ); } - - /** - * @return DOMDocument - */ - public function getDocument() { - return $this->document; - } - - /** - * @param DOMDocument $document - */ - public function setDocument($document) { - $this->document = $document; - } - - /** - * A simple convenience wrapper around DOMDocument::getElementsByTagName(). - * - * @param string $name - * @return DOMNodeList - */ - public function getElementsByTagName($name) { - return $this->getDocument()->getElementsByTagName($name); - } - - /** - * @see HTMLValue::getContent() - */ - public function forTemplate() { - return $this->getContent(); - } - } diff --git a/tests/integration/HTMLValueTest.php b/tests/integration/HTML4ValueTest.php similarity index 78% rename from tests/integration/HTMLValueTest.php rename to tests/integration/HTML4ValueTest.php index aa2beadbb..b29e99d33 100644 --- a/tests/integration/HTMLValueTest.php +++ b/tests/integration/HTML4ValueTest.php @@ -3,10 +3,10 @@ * @package framework * @subpackage tests */ -class SS_HTMLValueTest extends SapphireTest { - +class SS_HTML4ValueTest extends SapphireTest { public function testInvalidHTMLSaving() { - $value = new SS_HTMLValue(); + $value = new SS_HTML4Value(); + $invalid = array ( '

Enclosed Value

' => '

Enclosed Value

', '' => '', @@ -22,20 +22,15 @@ class SS_HTMLValueTest extends SapphireTest { } public function testUtf8Saving() { - $value = new SS_HTMLValue(); + $value = new SS_HTML4Value(); + $value->setContent('

ö ß ā い 家

'); $this->assertEquals('

ö ß ā い 家

', $value->getContent()); } - public function testOutputFormatting() { - $value = new SS_HTMLValue(); - $value->setOutputFormatting(true); - $value->setContent(''); - $this->assertEquals('', $value->getContent(), 'Formatted output works'); - } - public function testInvalidHTMLTagNames() { - $value = new SS_HTMLValue(); + $value = new SS_HTML4Value(); + $invalid = array( '

', '
', @@ -53,7 +48,8 @@ class SS_HTMLValueTest extends SapphireTest { } public function testMixedNewlines() { - $value = new SS_HTMLValue(); + $value = new SS_HTML4Value(); + $value->setContent("

paragraph

\n"); $this->assertEquals( "

paragraph

\n",