create(self::$html_cleaner_class);
} else {
//load cleaner if the dependent class is available
$cleaner = HTMLCleaner::inst();
}
}
if ($cleaner) {
$content = $cleaner->cleanHTML($content);
} else {
// At most basic level of cleaning, use DOMDocument to save valid XML.
$doc = Injector::inst()->create('HTMLValue', $content);
$content = $doc->getContent();
}
// Remove empty <ins /> and <del /> tags because browsers hate them
$content = preg_replace('/<(ins|del)[^>]*\/>/','', $content);
return $content;
}
/**
 * Build an HTML rendering of the differences between two HTML fragments.
 *
 * Both fragments are split into words and tags and diffed. The result is then re-grouped so
 * that an edited tag and everything nested inside it becomes a single chunk, and the
 * re-grouped lists are diffed again. Added content is wrapped in <ins>, removed content in
 * <del>, and the assembled markup is passed through cleanHTML() before being returned.
 *
 * @param string $from The original content.
 * @param string $to The updated content.
 * @param bool $escape If true, each edit's chunks are passed through Convert::raw2xml()
 *                     before being written to the output.
 * @return string
 */
public static function compareHTML($from, $to, $escape = false) {
    // First split up the content into words and tags
    $set1 = self::getHTMLChunks($from);
    $set2 = self::getHTMLChunks($to);

    // Diff that
    $diff = new Diff($set1, $set2);

    // Per-side state, keyed 1 (the "from" list) and 2 (the "to" list).
    // $tagStack holds the current nesting depth inside an edited tag; $rechunked collects
    // the re-grouped chunks for that side.
    $tagStack = array(1 => 0, 2 => 0);
    $rechunked = array(1 => array(), 2 => array());

    // Go through everything, converting edited tags (and their content) into single chunks. Otherwise
    // the generated HTML gets crusty
    foreach($diff->edits as $edit) {
        $lookForTag = false;
        $stuffFor = [];

        switch($edit->type) {
            case 'copy':
                $lookForTag = false;
                $stuffFor[1] = $edit->orig;
                $stuffFor[2] = $edit->orig;
                break;

            case 'change':
                $lookForTag = true;
                $stuffFor[1] = $edit->orig;
                $stuffFor[2] = $edit->final;
                break;

            case 'add':
                $lookForTag = true;
                $stuffFor[1] = null;
                $stuffFor[2] = $edit->final;
                break;

            case 'delete':
                $lookForTag = true;
                $stuffFor[1] = $edit->orig;
                $stuffFor[2] = null;
                break;
        }

        foreach($stuffFor as $listName => $chunks) {
            if(!$chunks) {
                continue;
            }

            foreach($chunks as $item) {
                $isTag = isset($item[0]) && $item[0] == "<";
                $isClosingTag = substr($item, 0, 2) == "</";

                // $tagStack > 0 indicates that we should be tag-building: glue this item onto
                // the chunk that was started by the edited opening tag.
                if($tagStack[$listName]) {
                    $last = count($rechunked[$listName]) - 1;
                    $rechunked[$listName][$last] .= ' ' . $item;
                } else {
                    $rechunked[$listName][] = $item;
                }

                if($lookForTag && !$tagStack[$listName] && $isTag && !$isClosingTag) {
                    // An edited opening tag starts a new tag-building run.
                    $tagStack[$listName] = 1;
                } else if($tagStack[$listName]) {
                    // Track nesting so the run ends at the matching closing tag.
                    // NOTE(review): any tag not starting with "</" counts as an opener, so
                    // void/self-closing tags (e.g. <br />) raise the depth with no matching
                    // close - confirm whether that is acceptable for the expected input.
                    if($isClosingTag) {
                        $tagStack[$listName]--;
                    } else if($isTag) {
                        $tagStack[$listName]++;
                    }
                }
            }
        }
    }

    // Diff the re-chunked data, turning it into marked-up HTML
    $diff = new Diff($rechunked[1], $rechunked[2]);
    $content = '';
    foreach($diff->edits as $edit) {
        $orig = ($escape) ? Convert::raw2xml($edit->orig) : $edit->orig;
        $final = ($escape) ? Convert::raw2xml($edit->final) : $edit->final;

        switch($edit->type) {
            case 'copy':
                $content .= " " . implode(" ", $orig) . " ";
                break;

            case 'change':
                $content .= " <ins>" . implode(" ", $final) . "</ins> ";
                $content .= " <del>" . implode(" ", $orig) . "</del> ";
                break;

            case 'add':
                $content .= " <ins>" . implode(" ", $final) . "</ins> ";
                break;

            case 'delete':
                $content .= " <del>" . implode(" ", $orig) . "</del> ";
                break;
        }
    }

    return self::cleanHTML($content);
}
/**
 * Split HTML (or plain text) into a flat list of chunks suitable for diffing.
 *
 * The content is split on whitespace, except that a tag is always kept together as a single
 * chunk even when its attributes contain whitespace. Non-breaking spaces are treated as
 * ordinary spaces. Leading/trailing separators can produce empty-string chunks.
 *
 * @param string|array $content If passed as an array, values will be concatenated with a comma.
 * @return array Words and complete tags, in document order.
 * @throws InvalidArgumentException If $content is truthy but not a string, array or number.
 */
public static function getHTMLChunks($content) {
    if($content && !is_string($content) && !is_array($content) && !is_numeric($content)) {
        throw new InvalidArgumentException('$content parameter needs to be a string or array');
    }
    if(is_array($content)) {
        $content = implode(',', $content);
    }

    // Normalise non-breaking spaces (entity and UTF-8 character) to plain spaces, and pad the
    // angle brackets so tags are separated from adjacent text by the whitespace split below.
    // The cast keeps null/false/numeric input working without deprecation notices.
    $content = str_replace(
        array("&nbsp;", "\xC2\xA0", "<", ">"),
        array(" ", " ", " <", "> "),
        (string)$content
    );
    $candidateChunks = preg_split("/[\t\r\n ]+/", $content);
    $total = count($candidateChunks);

    $chunks = [];
    for($i = 0; $i < $total; $i++) {
        $item = $candidateChunks[$i];

        if(isset($item[0]) && $item[0] == "<") {
            // A tag with attributes was split on its inner whitespace; re-join the pieces
            // until the one ending in ">". Stop at the end of input so an unterminated tag
            // cannot loop forever.
            $newChunk = $item;
            while(substr($item, -1) != ">" && $i + 1 < $total) {
                $item = $candidateChunks[++$i];
                $newChunk .= ' ' . $item;
            }
            $chunks[] = $newChunk;
        } else {
            $chunks[] = $item;
        }
    }

    return $chunks;
}
}