create(self::$html_cleaner_class); } else { //load cleaner if the dependent class is available $cleaner = HTMLCleaner::inst(); } } if ($cleaner) { $content = $cleaner->cleanHTML($content); } else { // At most basic level of cleaning, use DOMDocument to save valid XML. $doc = Injector::inst()->create('HTMLValue', $content); $content = $doc->getContent(); } // Remove empty and tags because browsers hate them $content = preg_replace('/<(ins|del)[^>]*\/>/','', $content); return $content; } /** * @param string $from * @param string $to * @param bool $escape * @return string */ public static function compareHTML($from, $to, $escape = false) { // First split up the content into words and tags $set1 = self::getHTMLChunks($from); $set2 = self::getHTMLChunks($to); // Diff that $diff = new Diff($set1, $set2); $tagStack[1] = $tagStack[2] = 0; $rechunked[1] = $rechunked[2] = array(); // Go through everything, converting edited tags (and their content) into single chunks. Otherwise // the generated HTML gets crusty foreach($diff->edits as $edit) { $lookForTag = false; $stuffFor = []; switch($edit->type) { case 'copy': $lookForTag = false; $stuffFor[1] = $edit->orig; $stuffFor[2] = $edit->orig; break; case 'change': $lookForTag = true; $stuffFor[1] = $edit->orig; $stuffFor[2] = $edit->final; break; case 'add': $lookForTag = true; $stuffFor[1] = null; $stuffFor[2] = $edit->final; break; case 'delete': $lookForTag = true; $stuffFor[1] = $edit->orig; $stuffFor[2] = null; break; } foreach($stuffFor as $listName => $chunks) { if($chunks) { foreach($chunks as $item) { // $tagStack > 0 indicates that we should be tag-building if ($tagStack[$listName]) { $rechunked[$listName][sizeof($rechunked[$listName])-1] .= ' ' . $item; } else { $rechunked[$listName][] = $item; } if ($lookForTag && !$tagStack[$listName] && isset($item[0]) && $item[0] == "<" && substr($item,0,2) != "edits as $edit) { $orig = ($escape) ? Convert::raw2xml($edit->orig) : $edit->orig; $final = ($escape) ? 
Convert::raw2xml($edit->final) : $edit->final;

            // NOTE(review): every branch below emits the same unmarked text, so a 'change' edit
            // outputs the new and the old words back to back with nothing to tell them apart.
            // The cleanup regex earlier in this file targets <ins>/<del> elements, so wrapper
            // tags were presumably lost from these string literals - confirm against upstream.
            switch($edit->type) {
                case 'copy':
                    $content .= " " . implode(" ", $orig) . " ";
                    break;
                case 'change':
                    $content .= " " . implode(" ", $final) . " ";
                    $content .= " " . implode(" ", $orig) . " ";
                    break;
                case 'add':
                    $content .= " " . implode(" ", $final) . " ";
                    break;
                case 'delete':
                    $content .= " " . implode(" ", $orig) . " ";
                    break;
            }
        }

        return self::cleanHTML($content);
    }

    /**
     * Split HTML content into a flat list of chunks suitable for diffing.
     *
     * Text is split on whitespace into words. A tag is kept together as one chunk, from its
     * opening "<" up to and including the candidate that ends with ">", with its parts joined
     * by single spaces. A tag that is never closed swallows the remaining candidates and ends
     * the list instead of looping forever.
     *
     * Non-breaking spaces (the "&nbsp;" entity or the UTF-8 encoded U+00A0 character) are
     * treated as ordinary word separators. Empty strings produced at the start or end of the
     * split are kept in the result, so callers see the same chunk positions as before.
     *
     * @param string|array $content If passed as an array, values will be concatenated with a comma.
     * @return array List of string chunks (words and whole tags).
     * @throws InvalidArgumentException If $content is truthy but not a string, array or number.
     */
    public static function getHTMLChunks($content)
    {
        if ($content && !is_string($content) && !is_array($content) && !is_numeric($content)) {
            throw new InvalidArgumentException('$content parameter needs to be a string or array');
        }
        if (is_array($content)) {
            $content = implode(',', $content);
        }

        // Falsy non-strings (null, false, 0) pass the validation above; normalise them so the
        // string functions below never receive null.
        $content = (string) $content;

        // Pad angle brackets with spaces so that tags always start and end their own candidate.
        $content = str_replace(
            ['&nbsp;', "\xC2\xA0", '<', '>'],
            [' ', ' ', ' <', '> '],
            $content
        );

        // preg_split() only returns false on a PCRE failure; fall back to an empty list then.
        $candidateChunks = preg_split("/[\t\r\n ]+/", $content) ?: [];
        $total = count($candidateChunks);

        $chunks = [];
        for ($i = 0; $i < $total; $i++) {
            $item = $candidateChunks[$i];

            if (isset($item[0]) && $item[0] === '<') {
                // Glue the following candidates onto the tag until one ends with ">" or the
                // input runs out (unterminated tag).
                $newChunk = $item;
                while (substr($item, -1) !== '>' && ++$i < $total) {
                    $item = $candidateChunks[$i];
                    $newChunk .= ' ' . $item;
                }
                $chunks[] = $newChunk;
            } else {
                $chunks[] = $item;
            }
        }

        return $chunks;
    }
}