Building XML parsers with the XML class
*
* To use the XML parser, you need to create a subclass. Then you define process_XXX functions to process different tags. The parser walks sequentially through the file and calls the process_XXX functions when it hits different tags.
*
* * **process_(tagname):** This will be called when the parser finds the start tag. It will be passed the attributes of the tag.
* * **process_(tagname)_end:** This will be called when the parser finds the closing tag. It will be passed the content of the tag and then the attributes of the tag.
*
* * **process_tag :** This will be called if it is implemented and a method has not been created already for the tag being parsed. It is passed the tag name and attributes of the tag.
*
* * **process_tag_end:** This will be called if it is implemented and a method has not been created already for the tag being parsed. It is passed the tag name, the content of the tag and the attributes of the tag.
*
*
* The idea is that within these functions, you build up $this->parsed with a useful representation of the XML data. It could be an array structure, an object, or something else.
*
* There are helper methods on the XML object that will help with this:
*
* * **$this->inContext('(tag)', '(tag).(class)'):** This will return true if the current tag has the specified tags as ancestors, in the order that you've specified.
*
* Finally, there are public methods that can be called on an instantiated XML subclass. This is how you will make use of your new parser.
*
* * **$parser->tidyXHTML($content):** This will run "tidy -asxhtml" on your content. This is useful if you're wanting to use the XML parser to parse HTML that may or may not be XML compliant.
* * **$parser->parse($content):** This will call the parser on the given XML content, and return the $this->parsed value that gets built.
*
* Example
*
*
* class DeliciousHtmlParser extends XML {
* protected $currentItem = 0;
*
* function process_li($attributes) {
* if($attributes['class'] == "post") {
* $this->currentItem = sizeof($this->parsed);
* }
* }
*
* function process_a_end($content, $attributes) {
* if($this->inContext('li.post','h4.desc')) {
* $this->parsed[$this->currentItem]['link'] = $attributes['href'];
* $this->parsed[$this->currentItem]['title'] = $content;
*
* } else if($this->inContext('li.post','div.meta') && $attributes['class'] == 'tag') {
* $this->parsed[$this->currentItem]['tags'][] = $content;
* }
* }
* }
*
* $html = file_get_contents("http://del.icio.us/$user/?setcount=100");
* $parser = new DeliciousHtmlParser();
* $tidyHtml = $parser->tidyXHTML($html);
* $result = $parser->parse($tidyHtml);
*
*
* @package sapphire
* @subpackage misc
*/
class XML extends Object {
    /**
     * Parser handle created by the most recent call to parse().
     */
    protected $parser;

    /**
     * Stack of the currently open elements, outermost first. Each entry is
     * either "tag" or "tag.class" (when the element has a class attribute).
     */
    protected $context = array();

    /**
     * Stack of attribute arrays, pushed in tag_open() and popped in tag_close().
     */
    protected $attributeStore = array();

    /**
     * Stack read by stackActionFor(); each entry is expected to be an array
     * with 'tag' and 'action' keys. Nothing in this class pushes onto it.
     * NOTE(review): presumably maintained by subclasses - confirm.
     */
    protected $contextStack = array();

    /**
     * The value returned by parse(); reset to null at the start of each parse
     * and built up by the subclass's process_* handlers.
     */
    protected $parsed;

    /**
     * Character data collected since the most recent opening or closing tag.
     */
    protected $collatedText;

    /**
     * Run "tidy -asxhtml" over the given markup and return tidy's output.
     *
     * The content is written to a uniquely named temporary file (so that
     * concurrent calls cannot overwrite each other's input), which is removed
     * again once tidy has run.
     *
     * @param string $content Markup to clean up
     * @return string|null Output of the tidy command, or null if the temporary
     *                     file could not be written or tidy produced no output
     */
    public function tidyXHTML($content) {
        $cleanFile = tempnam(TEMP_FOLDER, 'cleaner');
        if($cleanFile === false) return null;

        if(file_put_contents($cleanFile, $content) === false) {
            unlink($cleanFile);
            return null;
        }

        // The path is escaped so that unusual characters in TEMP_FOLDER cannot break the command line.
        $result = shell_exec('tidy -asxhtml ' . escapeshellarg($cleanFile));
        unlink($cleanFile);
        return $result;
    }

    /**
     * Parse the given XML, firing the process_* handlers as tags are encountered.
     *
     * @param string $content XML source
     * @param bool $recursive Internal flag: true when this call is the single retry after error 32
     * @return mixed Whatever the handlers stored in $this->parsed (null if they stored nothing)
     */
    public function parse($content, $recursive = false) {
        $this->parser = xml_parser_create('UTF-8');

        // Rewrite every encoding="..." declaration in the content to utf-8 before parsing.
        $content = preg_replace('/encoding="[^"]+"/', 'encoding="utf-8"', $content);

        xml_parser_set_option($this->parser, XML_OPTION_CASE_FOLDING, false);
        xml_set_object($this->parser, $this);
        xml_set_element_handler($this->parser, "tag_open", "tag_close");
        xml_set_character_data_handler($this->parser, "cdata");

        $this->parsed = null;
        $this->context = array();
        $this->attributeStore = array();

        xml_parse($this->parser, $content);

        // Error code 32 is retried once.
        // NOTE(review): the original comment claimed this forces ISO-8859-15, but the
        // substitution has always written utf-8 - identical to the one above - so the
        // retry re-parses the same content. Behaviour kept; confirm the intended fallback.
        if(xml_get_error_code($this->parser) == 32 && !$recursive) {
            $content = preg_replace('/encoding="[^"]+"/', 'encoding="utf-8"', $content);
            return $this->parse($content, true);
        }

        if($err = xml_get_error_code($this->parser)) {
            user_error("XML parser broke with error $err:" . xml_error_string($err), E_USER_ERROR);
        }

        return $this->parsed;
    }

    /**
     * Test whether the given entries appear in the context stack in the given order.
     *
     * Accepts any number of arguments, each either "tag" or "tag.class". Other
     * entries may sit between the matched ones.
     *
     * @return bool True if every argument was found, in order; false otherwise
     */
    public function inContext() {
        $items = func_get_args();
        $depth = sizeof($this->context);
        $i = 0;

        foreach($items as $item) {
            // Scan forward from the previous match position.
            while($i < $depth && $this->context[$i] != $item) $i++;

            // Reached the end of the stack without finding this item.
            if($i >= $depth) return false;
        }

        return true;
    }

    /**
     * Find the 'action' of the innermost $this->contextStack entry whose 'tag' equals $tag.
     *
     * @param string $tag Tag name to look for
     * @return mixed The entry's 'action' value, or null if no entry matches
     */
    public function stackActionFor($tag) {
        for($i = sizeof($this->contextStack) - 1; $i >= 0; $i--) {
            $entry = $this->contextStack[$i];
            if(isset($entry['tag']) && $entry['tag'] == $tag) {
                return isset($entry['action']) ? $entry['action'] : null;
            }
        }

        return null;
    }

    /**
     * Element-start handler registered with the parser in parse().
     *
     * Pushes the tag onto the context and attribute stacks, then calls
     * process_(tag)($attributes) if it exists, otherwise process_tag($tag, $attributes)
     * if that exists. Afterwards, processatt_(name)($tag, $attributes) is called
     * for each attribute that has such a method.
     *
     * @param resource $parser Parser handle (unused)
     * @param string $tag Element name, possibly namespace-prefixed
     * @param array $attributes Attribute name => value map, names possibly namespace-prefixed
     */
    public function tag_open($parser, $tag, $attributes) {
        // Strip namespaces out of tags and attributes
        $tag = self::stripNamespace($tag);
        if($attributes) {
            $stripped = array();
            foreach($attributes as $k => $v) $stripped[self::stripNamespace($k)] = $v;
            $attributes = $stripped;
        }

        if(isset($attributes['class'])) {
            $this->context[] = "$tag.{$attributes['class']}";
        } else {
            $this->context[] = $tag;
        }
        $this->attributeStore[] = $attributes;

        // Any text collected before this opening tag is discarded.
        $this->collatedText = "";

        $tagProcessorFunc = "process_$tag";
        if($this->hasMethod($tagProcessorFunc)) {
            $this->$tagProcessorFunc($attributes);
        } elseif($this->hasMethod($tagProcessorFunc = "process_tag")) {
            $this->$tagProcessorFunc($tag, $attributes);
        }

        if($attributes) foreach($attributes as $k => $v) {
            $attProcessorFunc = "processatt_$k";
            if($this->hasMethod($attProcessorFunc)) {
                $this->$attProcessorFunc($tag, $attributes);
            }
        }
    }

    /**
     * Element-end handler registered with the parser in parse().
     *
     * Pops the tag off the context and attribute stacks, then calls
     * process_(tag)_end($content, $attributes) if it exists, otherwise
     * process_tag_end($tag, $content, $attributes) if that exists.
     *
     * @param resource $parser Parser handle (unused)
     * @param string $tag Element name, possibly namespace-prefixed
     */
    public function tag_close($parser, $tag) {
        $tag = self::stripNamespace($tag);

        // The stacks are popped before the handler runs, so inContext() sees only ancestors.
        array_pop($this->context);
        $attributes = array_pop($this->attributeStore);

        if(method_exists($this, $funcName = "process_{$tag}_end")) {
            $this->$funcName($this->collatedText, $attributes);
        } elseif(method_exists($this, $funcName = "process_tag_end")) {
            // else run default method
            $this->$funcName($tag, $this->collatedText, $attributes);
        }

        $this->collatedText = "";
    }

    /**
     * Character-data handler registered with the parser in parse(); appends to the text buffer.
     *
     * @param resource $parser Parser handle (unused)
     * @param string $cdata Chunk of character data
     */
    public function cdata($parser, $cdata) {
        $this->collatedText .= $cdata;
    }

    /**
     * Remove every "prefix:" segment from an element or attribute name.
     *
     * @param string $name e.g. "dc:title"
     * @return string e.g. "title"
     */
    private static function stripNamespace($name) {
        return preg_replace('/[^:]+:/', '', $name);
    }
}