From 341245dd4aaf013d33b292e1ac28494a9cd689b2 Mon Sep 17 00:00:00 2001 From: Julian Seidenberg Date: Wed, 30 Mar 2011 17:46:13 +1300 Subject: [PATCH] ENHANCEMENT Allowing custom HTMLCleaner implementations via Diff:: and new TidyHTMLCleaner and PurifierHTMLCleaner classes --- core/Diff.php | 3 ++ core/HTMLCleaner.php | 98 ++++++++++++++++++++++++++++++++++ tests/core/HTMLCleanerTest.php | 28 ++++++++++ 3 files changed, 129 insertions(+) create mode 100644 core/HTMLCleaner.php create mode 100644 tests/core/HTMLCleanerTest.php diff --git a/core/Diff.php b/core/Diff.php index 333c04b88..ec866fcfd 100755 --- a/core/Diff.php +++ b/core/Diff.php @@ -677,8 +677,11 @@ class Diff if (!$cleaner) { if (class_exists(self::$html_cleaner_class)) { $cleaner = new self::$html_cleaner_class; + } else { + $cleaner = HTMLCleaner::inst(); //load cleaner if the dependent class is available } } + if ($cleaner) { $content = $cleaner->cleanHTML($content); } else { diff --git a/core/HTMLCleaner.php b/core/HTMLCleaner.php new file mode 100644 index 000000000..ca2c40839 --- /dev/null +++ b/core/HTMLCleaner.php @@ -0,0 +1,98 @@ +config = array_merge($this->defaultConfig, $config); + else $this->config = $this->defaultConfig; + } + + /** + * @param Array + */ + public function setConfig($config) { + $this->config = $config; + } + + /** + * @return Array + */ + public function getConfig() { + return $this->config; + } + + /** + * Passed a string, return HTML that has been tidied. + * + * @param String HTML + * @return String HTML, tidied + */ + public abstract function cleanHTML($content); + + /** + * Experimental inst class to create a default html cleaner class + * + * @return PurifierHTMLCleaner|TidyHTMLCleaner + */ + public static function inst() { + if (class_exists('HTMLPurifier')) return new PurifierHTMLCleaner(); + elseif (class_exists('tidy')) return new TidyHTMLCleaner(); + } +} + + +/** + * Cleans HTML using the HTMLPurifier package + * http://htmlpurifier.org/ + */ +class PurifierHTMLCleaner extends HTMLCleaner { + + public function cleanHTML($content) { + $html = new HTMLPurifier(); + $doc = new SS_HTMLValue($html->purify($content)); + return $doc->getContent(); + } +} + +/** + * Cleans HTML using the Tidy package + * http://php.net/manual/en/book.tidy.php + */ +class TidyHTMLCleaner extends HTMLCleaner { + + protected $defaultConfig = array( + 'clean' => true, + 'output-xhtml' => true, + 'show-body-only' => true, + 'wrap' => 0, + 'doctype' => 'omit', + 'input-encoding' => 'utf8', + 'output-encoding' => 'utf8' + ); + + public function cleanHTML($content) { + $tidy = new tidy(); + $output = $tidy->repairString($content, $this->config); + return $output; + } +} \ No newline at end of file diff --git a/tests/core/HTMLCleanerTest.php b/tests/core/HTMLCleanerTest.php new file mode 100644 index 000000000..5ddd9ba46 --- /dev/null +++ b/tests/core/HTMLCleanerTest.php @@ -0,0 +1,28 @@ +assertEquals( + $cleaner->cleanHTML('

wrong nesting

' . "\n"), + '

wrong nesting

' . "\n", + "HTML cleaned properly" + ); + $this->assertEquals( + $cleaner->cleanHTML('

unclosed paragraph' . "\n"), + '

unclosed paragraph

' . "\n", + "HTML cleaned properly" + ); + } else { + $this->markTestSkipped('No HTMLCleaner library available (tidy or HTMLBeautifier)'); + } + } + +} \ No newline at end of file