Merge pull request #10647 from creative-commoners/pulls/5/dom-crawler

ENH Use masterminds/html5 for HTMLValue
This commit is contained in:
Maxime Rainville 2023-01-18 11:38:07 +13:00 committed by GitHub
commit a65d470e93
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 192 additions and 145 deletions

View File

@ -2,10 +2,6 @@
Name: corehtml Name: corehtml
--- ---
SilverStripe\Core\Injector\Injector: SilverStripe\Core\Injector\Injector:
SilverStripe\View\Parsers\HTMLValue:
class: SilverStripe\View\Parsers\HTML4Value
# Shorthand
HTMLValue: '%$SilverStripe\View\Parsers\HTMLValue'
SilverStripe\Forms\HTMLEditor\HTMLEditorConfig: SilverStripe\Forms\HTMLEditor\HTMLEditorConfig:
class: SilverStripe\Forms\HTMLEditor\TinyMCEConfig class: SilverStripe\Forms\HTMLEditor\TinyMCEConfig
SilverStripe\Forms\HTMLEditor\TinyMCEScriptGenerator: '%$SilverStripe\Forms\HTMLEditor\TinyMCECombinedGenerator' SilverStripe\Forms\HTMLEditor\TinyMCEScriptGenerator: '%$SilverStripe\Forms\HTMLEditor\TinyMCECombinedGenerator'

View File

@ -29,6 +29,7 @@
"embed/embed": "^4.4.7", "embed/embed": "^4.4.7",
"league/csv": "^9.8.0", "league/csv": "^9.8.0",
"m1/env": "^2.2.0", "m1/env": "^2.2.0",
"masterminds/html5": "^2.7",
"monolog/monolog": "^3.2.0", "monolog/monolog": "^3.2.0",
"nikic/php-parser": "^4.15.0", "nikic/php-parser": "^4.15.0",
"psr/container": "^1.1 || ^2.0", "psr/container": "^1.1 || ^2.0",

View File

@ -82,7 +82,7 @@ class HTML
if ($content) { if ($content) {
throw new InvalidArgumentException("Void element \"{$tag}\" cannot have content"); throw new InvalidArgumentException("Void element \"{$tag}\" cannot have content");
} }
return "<{$tag}{$preparedAttributes} />"; return "<{$tag}{$preparedAttributes}>";
} }
// Closed tag type // Closed tag type

View File

@ -1,31 +0,0 @@
<?php
namespace SilverStripe\View\Parsers;
class HTML4Value extends HTMLValue
{
    /**
     * Load an HTML fragment into this value's document via DOMDocument::loadHTML().
     *
     * @param string $content HTML fragment; null is treated as an empty string
     * @return bool The result reported by loadHTML() for the wrapped fragment
     */
    public function setContent($content)
    {
        // DOMDocument would turn carriage returns into "&#13;" entities (apparently per the XML spec),
        // which messes up the HTML, so they are dropped before parsing.
        $withoutCarriageReturns = str_replace(chr(13), '', $content ?? '');

        // If the value is currently flagged invalid, reset its document before loading.
        if (!$this->isValid()) {
            $this->setDocument(null);
        }

        // Wrap the fragment in a full document that declares UTF-8 as its charset.
        $documentHead = '<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8"></head>';
        $markup = $documentHead . "<body>$withoutCarriageReturns</body></html>";

        // Collect libxml errors internally while loading, then discard them and
        // restore whichever error mode was active before.
        $previousErrorMode = libxml_use_internal_errors(true);
        $loaded = $this->getDocument()->loadHTML($markup);
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);

        return $loaded;
    }
}

View File

@ -4,22 +4,20 @@ namespace SilverStripe\View\Parsers;
use SilverStripe\Core\Convert; use SilverStripe\Core\Convert;
use SilverStripe\View\ViewableData; use SilverStripe\View\ViewableData;
use Masterminds\HTML5;
use DOMNodeList; use DOMNodeList;
use DOMXPath; use DOMXPath;
use DOMDocument; use DOMDocument;
use SilverStripe\View\HTML;
/** /**
* This class handles the converting of HTML fragments between a string and a DOMDocument based * This class handles the converting of HTML fragments between a string and a DOMDocument based
* representation. * representation.
* *
* It's designed to allow dependency injection to replace the standard HTML4 version with one that
* handles XHTML or HTML5 instead
*
* @mixin DOMDocument * @mixin DOMDocument
*/ */
abstract class HTMLValue extends ViewableData class HTMLValue extends ViewableData
{ {
public function __construct($fragment = null) public function __construct($fragment = null)
{ {
if ($fragment) { if ($fragment) {
@ -28,7 +26,25 @@ abstract class HTMLValue extends ViewableData
parent::__construct(); parent::__construct();
} }
abstract public function setContent($fragment); /**
* @param string $content
* @return bool
*/
public function setContent($content)
{
    // Treat null as an empty fragment: handing null to preg_replace() as the subject
    // is deprecated since PHP 8.1.
    $content = $content ?? '';

    // Strip <html>, <head> and <body> tags from the fragment, because it is wrapped in its own
    // copy of those elements below. The negative lookahead rejects a match when the tag name
    // carries on with further name characters, so elements such as <header> stay in place
    // instead of being stripped as if they were <head>.
    $stripped = preg_replace('#</?(html|head|body)(?![a-z0-9-])[^>]*>#si', '', $content);

    // preg_replace() returns null when PCRE fails; keep the unstripped fragment in that case
    // rather than silently parsing an empty body.
    if ($stripped !== null) {
        $content = $stripped;
    }

    $html5 = new HTML5(['disable_html_ns' => true]);
    $document = $html5->loadHTML(
        '<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8"></head>' .
        "<body>$content</body></html>"
    );

    if ($document) {
        $this->setDocument($document);
        return true;
    }

    // No document came back from the parser: flag this value as invalid.
    $this->valid = false;
    return false;
}
/** /**
* @return string * @return string

View File

@ -659,7 +659,7 @@ class ShortcodeParser
list($content, $tags) = $this->replaceElementTagsWithMarkers($content); list($content, $tags) = $this->replaceElementTagsWithMarkers($content);
/** @var HTMLValue $htmlvalue */ /** @var HTMLValue $htmlvalue */
$htmlvalue = Injector::inst()->create('HTMLValue', $content); $htmlvalue = Injector::inst()->create(HTMLValue::class, $content);
// Now parse the result into a DOM // Now parse the result into a DOM
if (!$htmlvalue->isValid()) { if (!$htmlvalue->isValid()) {

View File

@ -14,7 +14,7 @@ class HTMLTest extends SapphireTest
'name' => 'description', 'name' => 'description',
'content' => 'test tag', 'content' => 'test tag',
]); ]);
$this->assertEquals('<meta name="description" content="test tag" />', $tag); $this->assertEquals('<meta name="description" content="test tag">', $tag);
} }
public function testEmptyAttributes() public function testEmptyAttributes()
@ -27,7 +27,7 @@ class HTMLTest extends SapphireTest
'disabled' => false, 'disabled' => false,
'readonly' => true, 'readonly' => true,
]); ]);
$this->assertEquals('<meta value="0" max="3" readonly="1" />', $tag); $this->assertEquals('<meta value="0" max="3" readonly="1">', $tag);
} }
public function testNormalTag() public function testNormalTag()
@ -52,7 +52,7 @@ class HTMLTest extends SapphireTest
'alt' => '', 'alt' => '',
]); ]);
$this->assertEquals('<img src="example.png" alt="" />', $tag); $this->assertEquals('<img src="example.png" alt="">', $tag);
} }
public function testVoidContentError() public function testVoidContentError()

View File

@ -1,98 +0,0 @@
<?php
namespace SilverStripe\View\Tests\Parsers;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\View\Parsers\HTML4Value;
class HTML4ValueTest extends SapphireTest
{
    /**
     * Malformed markup is normalised once it has been set and read back.
     */
    public function testInvalidHTMLSaving()
    {
        $htmlValue = new HTML4Value();

        // Malformed input => content expected back from getContent()
        $expectedByInput = [
            '<p>Enclosed Value</p></p>'
                => '<p>Enclosed Value</p>',
            '<meta content="text/html"></meta>'
                => '<meta content="text/html">',
            '<p><div class="example"></div></p>'
                => '<p></p><div class="example"></div>',
            '<html><html><body><falsetag "attribute=""attribute""">'
                => '<falsetag></falsetag>',
            '<body<body<body>/bodu>/body>'
                => '/bodu&gt;/body&gt;'
        ];

        foreach (array_keys($expectedByInput) as $malformed) {
            $htmlValue->setContent($malformed);
            $this->assertEquals(
                $expectedByInput[$malformed],
                $htmlValue->getContent(),
                'Invalid HTML can be saved'
            );
        }
    }

    /**
     * Multi-byte UTF-8 characters come back exactly as they went in.
     */
    public function testUtf8Saving()
    {
        $markup = '<p>ö ß ā い 家</p>';

        $htmlValue = new HTML4Value();
        $htmlValue->setContent($markup);

        $this->assertEquals($markup, $htmlValue->getContent());
    }

    /**
     * The href of the first link can still be read from badly broken markup.
     */
    public function testInvalidHTMLTagNames()
    {
        $htmlValue = new HTML4Value();
        $brokenFragments = [
            '<p><div><a href="test-link"></p></div>',
            '<html><div><a href="test-link"></a></a></html_>',
            '""\'\'\'"""\'""<<<>/</<htmlbody><a href="test-link"<<>'
        ];

        foreach ($brokenFragments as $fragment) {
            $htmlValue->setContent($fragment);
            $firstLink = $htmlValue->getElementsByTagName('a')->item(0);
            $this->assertEquals(
                'test-link',
                $firstLink->getAttribute('href'),
                'Link data can be extracted from malformed HTML'
            );
        }
    }

    /**
     * A "\r\n" line ending in the input is returned as a plain "\n".
     */
    public function testMixedNewlines()
    {
        $htmlValue = new HTML4Value();
        $htmlValue->setContent("<p>paragraph</p>\n<ul><li>1</li>\r\n</ul>");

        $normalised = "<p>paragraph</p>\n<ul><li>1</li>\n</ul>";
        $this->assertEquals($normalised, $htmlValue->getContent(), 'Newlines get converted');
    }

    /**
     * Square brackets in attributes stay literal; an escaped double quote stays escaped.
     */
    public function testAttributeEscaping()
    {
        $htmlValue = new HTML4Value();

        $bracketLink = '<a href="[]"></a>';
        $htmlValue->setContent($bracketLink);
        $this->assertEquals($bracketLink, $htmlValue->getContent(), "'[' character isn't escaped");

        $quoteLink = '<a href="&quot;"></a>';
        $htmlValue->setContent($quoteLink);
        $this->assertEquals($quoteLink, $htmlValue->getContent(), "'\"' character is escaped");
    }

    /**
     * getContent() returns the markup for a valid value and a blank string for an invalid one.
     */
    public function testGetContent()
    {
        $htmlValue = new HTML4Value();

        $validMarkup = '<p>This is valid</p>';
        $htmlValue->setContent($validMarkup);
        $this->assertEquals($validMarkup, $htmlValue->getContent(), "Valid content is returned");

        $htmlValue->setContent('<p?< This is not really valid but it will get parsed into something valid');

        // A value can sometimes end up in a state where HTMLValue->valid is false, for instance
        // when a content editor saves something really weird in a LiteralField.
        // That state is reproduced manually here via ->setInvalid().
        $htmlValue->setInvalid();
        $this->assertEquals('', $htmlValue->getContent(), "Blank string is returned when invalid");
    }
}

View File

@ -0,0 +1,163 @@
<?php
namespace SilverStripe\View\Tests\Parsers;
use SilverStripe\Dev\SapphireTest;
use SilverStripe\View\Parsers\HTMLValue;
use SilverStripe\ORM\FieldType\DBHTMLText;
use SilverStripe\View\Parsers\ShortcodeParser;
use SilverStripe\Core\Convert;
class HTMLValueTest extends SapphireTest
{
    /**
     * Malformed markup is corrected by the parser: each input is expected to come back
     * from getContent() as the paired output.
     */
    public function testInvalidHTMLParsing()
    {
        $value = new HTMLValue();

        // Malformed input => expected content after parsing
        $invalid = [
            '<p>Enclosed Value</p><p>a' => '<p>Enclosed Value</p><p>a</p>',
            '<meta content="text/html"></meta>' => '<meta content="text/html">',
            '<p><div class="example"></div><p>' => '<p></p><div class="example"></div><p></p>',
            '<html><html><body><falsetag "attribute=""attribute""">' => '<falsetag></falsetag>',
            '<body<body<body>/bodu>/body>' => '/bodu&gt;/body&gt;'
        ];

        foreach ($invalid as $input => $expected) {
            $value->setContent($input);
            $this->assertEquals($expected, $value->getContent(), 'Invalid HTML can be parsed');
        }
    }

    /**
     * Multi-byte UTF-8 characters are returned unchanged.
     */
    public function testUtf8Saving()
    {
        $value = new HTMLValue();
        $value->setContent('<p>ö ß ā い 家</p>');
        $this->assertEquals('<p>ö ß ā い 家</p>', $value->getContent());
    }

    /**
     * Whitespace between two elements is kept as-is.
     */
    public function testWhitespaceHandling()
    {
        $value = new HTMLValue();
        $value->setContent('<p></p> <p></p>');
        $this->assertEquals('<p></p> <p></p>', $value->getContent());
    }

    /**
     * The href of the first link can still be read from markup with mismatched
     * or invalid tags.
     */
    public function testInvalidHTMLTagNames()
    {
        $value = new HTMLValue();
        $invalid = [
            '<p><div><a href="test-link"></p></div>',
            '<html><div><a href="test-link"></a></a></html_>'
        ];

        foreach ($invalid as $input) {
            $value->setContent($input);
            $this->assertEquals(
                'test-link',
                $value->getElementsByTagName('a')->item(0)->getAttribute('href'),
                'Link data can be extracted from malformed HTML'
            );
        }
    }

    /**
     * A "\r\n" line ending in the input is returned as a plain "\n".
     */
    public function testMixedNewlines()
    {
        $value = new HTMLValue();
        $value->setContent("<p>paragraph</p>\n<ul><li>1</li>\r\n</ul>");
        $this->assertEquals(
            "<p>paragraph</p>\n<ul><li>1</li>\n</ul>",
            $value->getContent(),
            'Newlines get converted'
        );
    }

    /**
     * Square brackets in attribute values are left literal, while an escaped
     * double quote stays escaped.
     */
    public function testAttributeEscaping()
    {
        $value = new HTMLValue();

        $value->setContent('<a href="[]"></a>');
        $this->assertEquals('<a href="[]"></a>', $value->getContent(), "'[' character isn't escaped");

        $value->setContent('<a href="&quot;"></a>');
        $this->assertEquals('<a href="&quot;"></a>', $value->getContent(), "'\"' character is escaped");
    }

    /**
     * Shortcodes inside an HTML text field are expanded, and the surrounding markup
     * is corrected in the rendered output.
     */
    public function testShortcodeValue()
    {
        ShortcodeParser::get('default')->register(
            'test_shortcode',
            function () {
                return 'bit of test shortcode output';
            }
        );

        $content = DBHTMLText::create('Test', ['shortcodes' => true])
            ->setValue('<p>Some content with a [test_shortcode] and a <br /> followed by an <hr> in it.</p>')
            ->forTemplate();

        $this->assertStringContainsString(
            // hr is flow content, not phrasing content, so must be corrected to be outside the p tag.
            '<p>Some content with a bit of test shortcode output and a <br> followed by an </p><hr> in it.',
            $content
        );
    }

    /**
     * Bare ampersands in attribute values and text are returned as "&amp;", and an
     * existing "&amp;" is not encoded a second time.
     */
    public function testEntities()
    {
        $content = '<a href="http://domain.test/path?two&vars">ampersand &amp; test & link</a>';
        $output = new HTMLValue($content);
        $output = $output->getContent();
        $this->assertEquals(
            '<a href="http://domain.test/path?two&amp;vars">ampersand &amp; test &amp; link</a>',
            $output
        );
    }

    /**
     * Ampersands in link URLs are rendered as "&amp;" both when the link comes from a
     * shortcode handler and when it is plain markup that passes through the shortcode parser.
     */
    public function testShortcodeEntities()
    {
        ShortcodeParser::get('default')->register(
            'sitetree_link_test',
            // A mildly stubbed copy from SilverStripe\CMS\Model\SiteTree::link_shortcode_handler
            function ($arguments, $content = null, $parser = null) {
                $link = Convert::raw2att('https://google.com/search?q=unit&test');
                if ($content) {
                    $link = sprintf('<a href="%s">%s</a>', $link, $parser->parse($content));
                }
                return $link;
            }
        );

        // Field value => expected rendered output
        $content = [
            '[sitetree_link_test,id=2]' => 'https://google.com/search?q=unit&amp;test',
            // the random [ triggers the shortcode parser, which seems to be where problems arise.
            '<a href="https://google.com/search?q=unit&test"> [ non shortcode link</a>' =>
                '<a href="https://google.com/search?q=unit&amp;test"> [ non shortcode link</a>',
            '[sitetree_link_test,id=1]test link[/sitetree_link_test]' =>
                '<a href="https://google.com/search?q=unit&amp;test">test link</a>'
        ];

        foreach ($content as $input => $expected) {
            $output = DBHTMLText::create('Test', ['shortcodes' => true])
                ->setValue($input)
                ->forTemplate();
            $this->assertEquals($expected, $output);
        }
    }

    /**
     * Valid child elements inside a noscript tag come back unchanged.
     */
    public function testValidHTMLInNoscriptTags()
    {
        $value = new HTMLValue();
        $noscripts = [
            '<noscript><p>Enclosed Value</p></noscript>',
            '<noscript><span class="test">Enclosed Value</span></noscript>',
            '<noscript><img src="/test.jpg" alt="test"></noscript>',
        ];

        foreach ($noscripts as $noscript) {
            $value->setContent($noscript);
            $this->assertEquals($noscript, $value->getContent(), 'Child tags are left untouched in noscript tags.');
        }
    }
}