From 168f07149975e76de6dbf17039f3a5279da05e65 Mon Sep 17 00:00:00 2001
From: Hamish Friedlander
Date: Tue, 12 Mar 2013 09:06:20 +1300
Subject: [PATCH] API Make HTMLValue replace-able via DI
Extracted common code out to SS_HTMLValue and made abstract, then
put HTML 4 specific code in SS_HTML4Value. Its now possible to
replace HTMLValue with one designed for HTML 5 or XHTML
Requires a code change from new SS_HTMLValue to
Injector::inst()->create(HTMLValue)
---
_config/html.yml | 3 +
core/Diff.php | 2 +-
core/HTMLCleaner.php | 2 +-
forms/HtmlEditorField.php | 6 +-
model/HTMLValue.php | 202 +++++++++++-------
.../{HTMLValueTest.php => HTML4ValueTest.php} | 22 +-
6 files changed, 140 insertions(+), 97 deletions(-)
create mode 100644 _config/html.yml
rename tests/integration/{HTMLValueTest.php => HTML4ValueTest.php} (78%)
diff --git a/_config/html.yml b/_config/html.yml
new file mode 100644
index 000000000..940e3165a
--- /dev/null
+++ b/_config/html.yml
@@ -0,0 +1,3 @@
+Injector:
+ HTMLValue:
+ class: SS_HTML4Value
diff --git a/core/Diff.php b/core/Diff.php
index 7fd94771a..2ae9f78ee 100644
--- a/core/Diff.php
+++ b/core/Diff.php
@@ -686,7 +686,7 @@ class Diff
$content = $cleaner->cleanHTML($content);
} else {
// At most basic level of cleaning, use DOMDocument to save valid XML.
- $doc = new SS_HTMLValue($content);
+ $doc = Injector::inst()->create('HTMLValue', $content);
$content = $doc->getContent();
}
diff --git a/core/HTMLCleaner.php b/core/HTMLCleaner.php
index 53cf02210..d65cb411e 100644
--- a/core/HTMLCleaner.php
+++ b/core/HTMLCleaner.php
@@ -69,7 +69,7 @@ class PurifierHTMLCleaner extends HTMLCleaner {
public function cleanHTML($content) {
$html = new HTMLPurifier();
- $doc = new SS_HTMLValue($html->purify($content));
+ $doc = Injector::inst()->create('HTMLValue', $html->purify($content));
return $doc->getContent();
}
}
diff --git a/forms/HtmlEditorField.php b/forms/HtmlEditorField.php
index ab5467e6d..a2bd3d172 100644
--- a/forms/HtmlEditorField.php
+++ b/forms/HtmlEditorField.php
@@ -56,8 +56,8 @@ class HtmlEditorField extends TextareaField {
*/
public function Field($properties = array()) {
// mark up broken links
- $value = new SS_HTMLValue($this->value);
-
+ $value = Injector::inst()->create('HTMLValue', $this->value);
+
if($links = $value->getElementsByTagName('a')) foreach($links as $link) {
$matches = array();
@@ -103,7 +103,7 @@ class HtmlEditorField extends TextareaField {
$linkedPages = array();
$linkedFiles = array();
- $htmlValue = new SS_HTMLValue($this->value);
+ $htmlValue = Injector::inst()->create('HTMLValue', $this->value);
if(class_exists('SiteTree')) {
// Populate link tracking for internal links & links to asset files.
diff --git a/model/HTMLValue.php b/model/HTMLValue.php
index e06d3e5d7..9c660ffea 100644
--- a/model/HTMLValue.php
+++ b/model/HTMLValue.php
@@ -1,69 +1,142 @@
setDocument(new DOMDocument('1.0', 'UTF-8'));
- $this->setScrictErrorChecking(false);
- $this->setOutputFormatting(false);
- $this->setContent($content);
+abstract class SS_HTMLValue extends ViewableData {
+ public function __construct($fragment = null) {
+ if ($fragment) $this->setContent($fragment);
parent::__construct();
}
- /**
- * Should strict error checking be used?
- * @param boolean $bool
- */
- public function setScrictErrorChecking($bool) {
- $this->getDocument()->scrictErrorChecking = $bool;
- }
-
- /**
- * Should the output be formatted?
- * @param boolean $bool
- */
- public function setOutputFormatting($bool) {
- $this->getDocument()->formatOutput = $bool;
- }
+ abstract public function setContent($fragment);
/**
+ * @param string $content
* @return string
*/
public function getContent() {
- // strip any surrounding tags before the and after the which are automatically added by
- // DOMDocument. Note that we can't use the argument to saveHTML() as it's only supported in PHP 5.3.6+,
- // we support 5.3.2 as a minimum in addition to the above, trim any surrounding newlines from the output
+ $doc = clone $this->getDocument();
+ $xp = new DOMXPath($doc);
- // shortcodes use square brackets which get escaped into HTML entities by saveHTML()
- // this manually replaces them back to square brackets so that the shortcodes still work correctly
- // we can't use urldecode() here, as valid characters like "+" will be incorrectly replaced with spaces
- return trim(
- preg_replace(
- array(
- '/(.*)/is',
- '/<\/body>(.*)/is',
- ),
- '',
- str_replace(array('%5B', '%5D'), array('[', ']'), $this->getDocument()->saveHTML())
- )
+ // If there's no body, the content is empty string
+ if (!$doc->getElementsByTagName('body')->length) return '';
+
+ // saveHTML Percentage-encodes any URI-based attributes. We don't want this, since it interferes with
+ // shortcodes. So first, save all the attribute values for later restoration.
+ $attrs = array(); $i = 0;
+
+ foreach ($xp->query('//body//@*') as $attr) {
+ $key = "__HTMLVALUE_".($i++);
+ $attrs[$key] = $attr->value;
+ $attr->value = $key;
+ }
+
+ // Then, call saveHTML & extract out the content from the body tag
+ $res = preg_replace(
+ array(
+ '/^(.*?)/is',
+ '/<\/body>(.*?)$/isD',
+ ),
+ '',
+ $doc->saveHTML()
);
+
+ // Then replace the saved attributes with their original versions
+ $res = preg_replace_callback('/__HTMLVALUE_(\d+)/', function($matches) use ($attrs) {
+ return $attrs[$matches[0]];
+ }, $res);
+
+ return $res;
}
+ /** @see HTMLValue::getContent() */
+ public function forTemplate() {
+ return $this->getContent();
+ }
+
+ /** @var DOMDocument */
+ private $document = null;
+ /** @var bool */
+ private $valid = true;
+
+ /**
+ * Get the DOMDocument for the passed content
+ * @return DOMDocument | false - Return false if HTML not valid, the DOMDocument instance otherwise
+ */
+ public function getDocument() {
+ if (!$this->valid) {
+ return false;
+ }
+ else if ($this->document) {
+ return $this->document;
+ }
+ else {
+ $this->document = new DOMDocument('1.0', 'UTF-8');
+ $this->document->strictErrorChecking = false;
+ $this->document->formatOutput = false;
+
+ return $this->document;
+ }
+ }
+
+ /**
+ * Is this HTMLValue in an errored state?
+ * @return bool
+ */
+ public function isValid() {
+ return $this->valid;
+ }
+
+ /**
+ * @param DOMDocument $document
+ */
+ public function setDocument($document) {
+ $this->document = $document;
+ $this->valid = true;
+ }
+
+ public function setInvalid() {
+ $this->document = $this->valid = false;
+ }
+
+ /**
+ * Pass through any missed method calls to DOMDocument (if they exist)
+ * so that HTMLValue can be treated mostly like an instance of DOMDocument
+ */
+ public function __call($method, $arguments) {
+ $doc = $this->getDocument();
+
+ if(method_exists($doc, $method)) {
+ return call_user_func_array(array($doc, $method), $arguments);
+ }
+ else {
+ return parent::__call($method, $arguments);
+ }
+ }
+
+ /**
+ * Make an xpath query against this HTML
+ *
+ * @param $query string - The xpath query string
+ * @return DOMNodeList
+ */
+ public function query($query) {
+ $xp = new DOMXPath($this->getDocument());
+ return $xp->query($query);
+ }
+}
+
+class SS_HTML4Value extends SS_HTMLValue {
+
/**
* @param string $content
* @return bool
@@ -73,41 +146,12 @@ class SS_HTMLValue extends ViewableData {
// This behaviour is apparently XML spec, but we don't want this because it messes up the HTML
$content = str_replace(chr(13), '', $content);
+ // Reset the document if we're in an invalid state for some reason
+ if (!$this->isValid()) $this->setDocument(null);
+
return @$this->getDocument()->loadHTML(
'' .
"$content"
);
}
-
- /**
- * @return DOMDocument
- */
- public function getDocument() {
- return $this->document;
- }
-
- /**
- * @param DOMDocument $document
- */
- public function setDocument($document) {
- $this->document = $document;
- }
-
- /**
- * A simple convenience wrapper around DOMDocument::getElementsByTagName().
- *
- * @param string $name
- * @return DOMNodeList
- */
- public function getElementsByTagName($name) {
- return $this->getDocument()->getElementsByTagName($name);
- }
-
- /**
- * @see HTMLValue::getContent()
- */
- public function forTemplate() {
- return $this->getContent();
- }
-
}
diff --git a/tests/integration/HTMLValueTest.php b/tests/integration/HTML4ValueTest.php
similarity index 78%
rename from tests/integration/HTMLValueTest.php
rename to tests/integration/HTML4ValueTest.php
index aa2beadbb..b29e99d33 100644
--- a/tests/integration/HTMLValueTest.php
+++ b/tests/integration/HTML4ValueTest.php
@@ -3,10 +3,10 @@
* @package framework
* @subpackage tests
*/
-class SS_HTMLValueTest extends SapphireTest {
-
+class SS_HTML4ValueTest extends SapphireTest {
public function testInvalidHTMLSaving() {
- $value = new SS_HTMLValue();
+ $value = new SS_HTML4Value();
+
$invalid = array (
'Enclosed Value
' => 'Enclosed Value
',
'' => '',
@@ -22,20 +22,15 @@ class SS_HTMLValueTest extends SapphireTest {
}
public function testUtf8Saving() {
- $value = new SS_HTMLValue();
+ $value = new SS_HTML4Value();
+
$value->setContent('ö ß ā い 家
');
$this->assertEquals('ö ß ā い 家
', $value->getContent());
}
- public function testOutputFormatting() {
- $value = new SS_HTMLValue();
- $value->setOutputFormatting(true);
- $value->setContent('');
- $this->assertEquals('', $value->getContent(), 'Formatted output works');
- }
-
public function testInvalidHTMLTagNames() {
- $value = new SS_HTMLValue();
+ $value = new SS_HTML4Value();
+
$invalid = array(
'',
'',
@@ -53,7 +48,8 @@ class SS_HTMLValueTest extends SapphireTest {
}
public function testMixedNewlines() {
- $value = new SS_HTMLValue();
+ $value = new SS_HTML4Value();
+
$value->setContent("
paragraph
\n
");
$this->assertEquals(
"
paragraph
\n
",