2011-03-30 17:46:13 +13:00
|
|
|
<?php
|
|
|
|
/**
 * @package framework
 * @subpackage misc
 */
|
|
|
|
|
|
|
|
/**
 * Common foundation for HTML cleaning implementations.
 *
 * Stores a configuration array (subclass defaults merged with any overrides
 * given at construction time) and leaves the actual cleaning work to the
 * {@link cleanHTML()} method of each concrete subclass.
 */
abstract class HTMLCleaner extends Object {

    /**
     * Baseline configuration used when the caller supplies none.
     * Subclasses redeclare this property to provide their own defaults.
     *
     * @var array
     */
    protected $defaultConfig = array();

    /**
     * Active configuration, for cleaners that support configuration (like Tidy).
     *
     * @var array
     */
    public $config;

    /**
     * Builds the active configuration from the defaults plus optional overrides.
     *
     * @param array $config Optional overrides for the cleaner; keys given here
     *                      win over the same keys in $defaultConfig. Any falsy
     *                      value (null, empty array) means "defaults only".
     */
    public function __construct($config = null) {
        $this->config = $config
            ? array_merge($this->defaultConfig, $config)
            : $this->defaultConfig;
    }

    /**
     * Replaces the active configuration wholesale.
     *
     * Unlike the constructor, the defaults are NOT merged back in here.
     *
     * @param array $config The new configuration
     */
    public function setConfig($config) {
        $this->config = $config;
    }

    /**
     * Returns the active configuration.
     *
     * @return array
     */
    public function getConfig() {
        return $this->config;
    }

    /**
     * Takes a string of HTML and returns a tidied version of it.
     *
     * @param string $content HTML to clean
     * @return string The tidied HTML
     */
    abstract public function cleanHTML($content);

    /**
     * Experimental factory that picks a default cleaner implementation.
     *
     * HTMLPurifier is preferred when its class is available, Tidy is the
     * fallback. When neither class exists no cleaner can be built and the
     * result is null.
     *
     * @return PurifierHTMLCleaner|TidyHTMLCleaner|null
     */
    public static function inst() {
        if (class_exists('HTMLPurifier')) {
            return new PurifierHTMLCleaner();
        }

        if (class_exists('tidy')) {
            return new TidyHTMLCleaner();
        }

        // Neither backend is installed.
        return null;
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * HTML cleaner backed by the HTMLPurifier package.
 *
 * @see http://htmlpurifier.org/
 */
class PurifierHTMLCleaner extends HTMLCleaner {

    /**
     * Purifies the given HTML with a fresh HTMLPurifier instance, loads the
     * result into an SS_HTMLValue and returns that object's content.
     *
     * @param string $content HTML to clean
     * @return string The content reported by SS_HTMLValue for the purified HTML
     */
    public function cleanHTML($content) {
        $purifier = new HTMLPurifier();
        $purified = $purifier->purify($content);

        // Round-trip the purified markup through SS_HTMLValue before returning it.
        $value = new SS_HTMLValue($purified);

        return $value->getContent();
    }
}
|
|
|
|
|
|
|
|
/**
 * HTML cleaner backed by PHP's Tidy extension.
 *
 * @see http://php.net/manual/en/book.tidy.php
 */
class TidyHTMLCleaner extends HTMLCleaner {

    /**
     * Default Tidy options, handed to tidy::repairString() unless overridden
     * through the constructor or setConfig().
     *
     * @var array
     */
    protected $defaultConfig = array(
        'clean'           => true,
        'output-xhtml'    => true,
        'show-body-only'  => true,
        'wrap'            => 0,
        'doctype'         => 'omit',
        'input-encoding'  => 'utf8',
        'output-encoding' => 'utf8'
    );

    /**
     * Repairs the given HTML with Tidy using the active configuration and
     * returns the result without leading or trailing whitespace.
     *
     * @param string $content HTML to clean
     * @return string The repaired HTML, trimmed at both ends
     */
    public function cleanHTML($content) {
        $repairer = new tidy();
        $repaired = $repairer->repairString($content, $this->config);

        // Drop whitespace at the very start and very end of the repaired markup
        return preg_replace('/(^\s+)|(\s+$)/', '', $repaired);
    }
}
|