silverstripe-framework/dev/CSSContentParser.php
Sean Harvey fe42abce69 BUG CSSContentParser fails if CLI tidy doesn't output anything.
In cases where CLI tidy fails to recognise HTML with errors or
unknown HTML5 elements, it doesn't output any content and
CSSContentParser fails.

This adds the --force-output parameter so that it always outputs
the input content, even if it wasn't completely fixed by tidy.

It seems that tidy class in PHP already does this, so this just
fixes the CLI use of tidy in CSSContentParser to be consistent.

This fix can be observed on the HHVM travis builds, as tidy isn't
compiled into HHVM as an extension.
2014-09-03 23:08:53 +12:00

114 lines
3.5 KiB
PHP

<?php
/**
 * CSSContentParser enables parsing & assertion running of HTML content via CSS selectors.
 * It works by converting the content to XHTML using tidy, rewriting the CSS selectors as XPath queries, and executing
 * those using SimpleXML.
 *
 * It was built to facilitate testing using PHPUnit and contains a number of assert methods that will throw PHPUnit
 * assertion exceptions when applicable.
 *
 * Tries to use the PHP tidy extension (http://php.net/tidy),
 * and falls back to the "tidy" CLI tool. If none of those exists,
 * the string is parsed directly without sanitization.
 *
 * Caution: Doesn't fully support HTML elements like <header>
 * due to them being declared illegal by the "tidy" preprocessing step.
 *
 * @package framework
 * @subpackage core
 */
class CSSContentParser extends Object {

	/**
	 * Parsed document that every selector / XPath query is executed against.
	 *
	 * @var SimpleXMLElement
	 */
	protected $simpleXML = null;

	/**
	 * Sanitises the given markup with tidy (PHP extension first, CLI tool second, nothing as a last resort)
	 * and loads the result into SimpleXML.
	 *
	 * @param String $content HTML or XHTML markup to parse
	 * @throws Exception If the (sanitised) content cannot be loaded as XML
	 */
	public function __construct($content) {
		if(extension_loaded('tidy')) {
			// using the tidy php extension
			$tidy = new tidy();
			$tidy->parseString(
				$content,
				array(
					'output-xhtml' => true,
					'numeric-entities' => true,
					'wrap' => 0, // We need this to be consistent for functional test string comparisons
				),
				'utf8'
			);
			$tidy->cleanRepair();
			// Strip the default XHTML namespace so that un-prefixed XPath queries match elements
			$tidy = str_replace('xmlns="http://www.w3.org/1999/xhtml"','',$tidy);
			// Drop numeric non-breaking-space entities from the output
			$tidy = str_replace('&#160;','',$tidy);
		} elseif(@shell_exec('which tidy')) {
			// using tidy through cli
			$CLI_content = escapeshellarg($content);
			// --force-output makes tidy emit markup even when it could not fully repair the input
			$tidy = `echo $CLI_content | tidy --force-output 1 -n -q -utf8 -asxhtml -w 0 2> /dev/null`;
			$tidy = str_replace('xmlns="http://www.w3.org/1999/xhtml"','',$tidy);
			$tidy = str_replace('&#160;','',$tidy);
		} else {
			// no tidy library found, hence no sanitizing
			$tidy = $content;
		}

		$this->simpleXML = @simplexml_load_string($tidy, 'SimpleXMLElement', LIBXML_NOWARNING);

		// Check the type rather than truthiness: a successfully parsed but empty element
		// (e.g. "<p/>") evaluates to false and must not be reported as a parse failure.
		if(!($this->simpleXML instanceof SimpleXMLElement)) {
			throw new Exception('CSSContentParser::__construct(): Could not parse content.'
				. ' Please check the PHP extension tidy is installed.');
		}

		parent::__construct();
	}

	/**
	 * Returns the SimpleXML elements that match the given CSS selector.
	 * Currently the selector engine only supports querying by tag, id, and class.
	 * See {@link getByXpath()} for a more direct selector syntax.
	 *
	 * @param String $selector
	 * @return SimpleXMLElement[]|false Whatever {@link getByXpath()} returns for the translated selector
	 */
	public function getBySelector($selector) {
		$xpath = $this->selector2xpath($selector);
		return $this->getByXpath($xpath);
	}

	/**
	 * Allows querying the content through XPATH selectors.
	 *
	 * @param String $xpath SimpleXML compatible XPATH statement
	 * @return SimpleXMLElement[]|false Result of SimpleXMLElement::xpath() on the parsed document
	 */
	public function getByXpath($xpath) {
		return $this->simpleXML->xpath($xpath);
	}

	/**
	 * Converts a CSS selector into an equivalent xpath expression.
	 * Currently the selector engine only supports querying by tag, id, and class.
	 *
	 * Each whitespace-separated part of the selector becomes one descendant step ("//tag" or "//*"),
	 * optionally followed by a predicate. Class matching is a substring test on the class attribute.
	 *
	 * Example: "div#main .item.active" becomes
	 * "//div[@id='main']//*[contains(@class,'item') and contains(@class,'active')]"
	 *
	 * @param String $selector See {@link getBySelector()}
	 * @return String XPath expression
	 */
	public function selector2xpath($selector) {
		$parts = preg_split('/\\s+/', $selector);
		$xpath = "";

		foreach($parts as $part) {
			// Element name, or a wildcard if the part starts with "#" or "."
			if(preg_match('/^([A-Za-z][A-Za-z0-9]*)/', $part, $matches)) {
				$xpath .= "//$matches[1]";
			} else {
				$xpath .= "//*";
			}

			$xfilters = array();

			if(preg_match('/#([^#.\[]+)/', $part, $matches)) {
				$xfilters[] = "@id='$matches[1]'";
			}

			// A part may carry several classes (".a.b"); every one of them has to match
			if(preg_match_all('/\.([^#.\[]+)/', $part, $classMatches)) {
				foreach($classMatches[1] as $class) {
					$xfilters[] = "contains(@class,'$class')";
				}
			}

			// Conditions inside an XPath predicate must be combined with "and";
			// a comma-separated list is not valid XPath.
			if($xfilters) $xpath .= '[' . implode(' and ', $xfilters) . ']';
		}

		return $xpath;
	}
}