ENHANCEMENT: Piston in php-peg library

This commit is contained in:
Hamish Friedlander 2011-02-09 16:53:59 +13:00
parent 66bc92892e
commit 33d102d674
12 changed files with 1729 additions and 0 deletions

9
thirdparty/php-peg/.piston.yml vendored Normal file
View File

@ -0,0 +1,9 @@
---
format: 1
handler:
commit: afb28caf712815da82cd6bb8ece8ac12ab9b2188
branch: master
lock: false
repository_class: Piston::Git::Repository
repository_url: git://github.com/hafriedlander/php-peg.git
exported_to: 0669b742904f8c9f7cd97569166663396897dab9

783
thirdparty/php-peg/Compiler.php vendored Normal file
View File

@ -0,0 +1,783 @@
<?php
/**
* PEG Generator - A PEG Parser for PHP
*
* @author Hamish Friedlander / SilverStripe
*
* See README.md for documentation
*
*/
require 'PHPBuilder.php' ;
/**
 * A scoped bag of named flags.
 *
 * Flags are written and read as if they were properties ( $flags->name ). A flag that is not
 * set on this instance is looked up on the parent scope, if one was supplied, and so on up the chain.
 * Reading a flag that is not set anywhere yields NULL.
 */
class Flags {
    /*
     * These must be declared: because this class defines __set / __get, assigning to an
     * undeclared $this->parent or $this->flags would be routed through the magic methods
     * instead of storing the value, and the parent scope would be lost.
     */
    /** @var Flags|null Enclosing scope consulted when a flag is not set locally */
    protected $parent = NULL ;
    /** @var array Flags set directly on this scope, keyed by flag name */
    protected $flags = array() ;

    /**
     * @param Flags|null $parent Optional enclosing scope to inherit flags from
     */
    function __construct( $parent = NULL ) {
        $this->parent = $parent ;
        $this->flags = array() ;
    }

    /** Set flag $k to $v on this scope only (never on the parent) */
    function __set( $k, $v ) {
        $this->flags[$k] = $v ;
        return $v ;
    }

    /** Read flag $k from this scope, falling back to the parent chain, then to NULL */
    function __get( $k ) {
        if ( isset( $this->flags[$k] ) ) return $this->flags[$k] ;
        if ( isset( $this->parent ) ) return $this->parent->$k ;
        return NULL ;
    }
}
/**
* PHPWriter contains several code generation snippets that are used both by the Token and the Rule compiler
*/
class PHPWriter {
    /* Counter shared by every writer; each call to varid() consumes one value */
    static $varid = 0 ;

    /**
     * Hand out the next unique identifier suffix ( "_0", "_1", ... ) for generated variables
     */
    function varid() {
        $next = self::$varid ;
        self::$varid = $next + 1 ;
        return '_' . $next ;
    }

    /**
     * Turn a rule or tag name into something usable inside a PHP function name:
     * "-" becomes "_", "$" becomes "DLR", "*" becomes "STR", and any other non-word character is dropped
     */
    function function_name( $str ) {
        $mapped = str_replace( array( '-', '$', '*' ), array( '_', 'DLR', 'STR' ), $str ) ;
        return preg_replace( '/[^\w]+/', '', $mapped ) ;
    }

    /**
     * Code that snapshots the current result and position into variables suffixed with $id
     */
    function save($id) {
        $snapshot = PHPBuilder::build() ;
        $snapshot->l( '$res'.$id.' = $result;' ) ;
        $snapshot->l( '$pos'.$id.' = $this->pos;' ) ;
        return $snapshot ;
    }

    /**
     * Code that rolls the result and position back to the snapshot taken by save($id).
     * When $remove is true the snapshot variables are unset afterwards.
     */
    function restore( $id, $remove = FALSE ) {
        $rollback = PHPBuilder::build() ;
        $rollback->l( '$result = $res'.$id.';' ) ;
        $rollback->l( '$this->pos = $pos'.$id.';' ) ;

        if ( !$remove ) return $rollback ;

        $rollback->l( 'unset( $res'.$id.' );' ) ;
        $rollback->l( 'unset( $pos'.$id.' );' ) ;
        return $rollback ;
    }

    /**
     * An if / else pair on the expression $on. The "if" branch holds $match followed by the
     * MATCH marker, the "else" branch holds $fail followed by the FAIL marker.
     */
    function match_fail_conditional( $on, $match = NULL, $fail = NULL ) {
        $conditional = PHPBuilder::build() ;
        $conditional->b( 'if (' . $on . ')', $match, 'MATCH' ) ;
        $conditional->b( 'else', $fail, 'FAIL' ) ;
        return $conditional ;
    }

    /**
     * Wrap $code in a do { } while(0) block. MBREAK / FBREAK markers inside $code become
     * "record outcome in a flag variable and break"; after the block the flag is turned
     * back into a MATCH or FAIL marker.
     */
    function match_fail_block( $code ) {
        $flag = '$' . $this->varid() ;

        $body = $code->replace( array(
            'MBREAK' => $flag.' = TRUE; break;',
            'FBREAK' => $flag.' = FALSE; break;'
        )) ;

        $block = PHPBuilder::build() ;
        $block->l( $flag.' = NULL;' ) ;
        $block->b( 'do', $body ) ;
        $block->l( 'while(0);' ) ;
        $block->b( 'if( '.$flag.' === TRUE )', 'MATCH' ) ;
        $block->b( 'if( '.$flag.' === FALSE)', 'FAIL' ) ;
        return $block ;
    }
}
/**
* A Token is any portion of a match rule. Tokens are responsible for generating the code to match against them.
*
* This base class provides the compile() function, which handles the token modifiers ( ? * + & ! )
*
* Each child class should provide the function match_code() which will generate the code to match against that specific token type.
* In that generated code they should include the lines MATCH or FAIL when a match or a decisive failure occurs. These will
* be overwritten when they are injected into parent Tokens or Rules. There is no requirement on where MATCH and FAIL can occur.
* The tokens are also responsible for storing and restoring state when necessary to handle a non-decisive failure.
*
* @author hamish
*
*/
abstract class Token extends PHPWriter {
/* Modifier flags. compile() wraps the code from match_code() once for every flag that is set. */
public $optional = FALSE ;
public $zero_or_more = FALSE ;
public $one_or_more = FALSE ;
public $positive_lookahead = FALSE ;
public $negative_lookahead = FALSE ;
/* When set, terminal tokens do not append their matched text to the result (see TokenTerminal::set_text) */
public $silent = FALSE ;
/* FALSE, or the label under which the sub-result of this token is stored */
public $tag = FALSE ;
/* Kind of token, as passed to the constructor by the subclass (e.g. 'literal', 'rx', 'sequence') */
public $type ;
/* Token payload; its meaning depends on the subclass */
public $value ;
function __construct( $type, $value = NULL ) {
$this->type = $type ;
$this->value = $value ;
}
// abstract protected function match_code() ;
/**
* Generate the code to match this token, including the effect of every modifier set on it.
*
* The returned PHPBuilder still contains MATCH and FAIL marker lines; the caller replaces them.
* The wrappers are applied in a fixed order, each around the result of the previous one:
* optional, zero_or_more, one_or_more, positive_lookahead, negative_lookahead, tag.
*/
function compile() {
$code = $this->match_code() ;
/* One id is shared by all save() / restore() pairs generated below */
$id = $this->varid() ;
/* ? - a failure restores the saved state instead of emitting FAIL; MATCH markers are left untouched */
if ( $this->optional ) {
$code = PHPBuilder::build()
->l(
$this->save($id),
$code->replace( array( 'FAIL' => $this->restore($id,true) ))
);
}
/* * - loop until the inner code fails, then restore to the start of the failed iteration and MATCH */
if ( $this->zero_or_more ) {
$code = PHPBuilder::build()
->b( 'while (true)',
$this->save($id),
$code->replace( array(
'MATCH' => NULL,
'FAIL' =>
$this->restore($id,true)
->l( 'break;' )
))
)
->l(
'MATCH'
);
}
/* + - as for *, but count the successful iterations and FAIL if there were none */
if ( $this->one_or_more ) {
$code = PHPBuilder::build()
->l(
'$count = 0;'
)
->b( 'while (true)',
$this->save($id),
$code->replace( array(
'MATCH' => NULL,
'FAIL' =>
$this->restore($id,true)
->l( 'break;' )
)),
'$count += 1;'
)
->b( 'if ($count > 0)', 'MATCH' )
->b( 'else', 'FAIL' );
}
/* & - keep the outcome of the inner code, but always restore the saved state first */
if ( $this->positive_lookahead ) {
$code = PHPBuilder::build()
->l(
$this->save($id),
$code->replace( array(
'MATCH' =>
$this->restore($id)
->l( 'MATCH' ),
'FAIL' =>
$this->restore($id)
->l( 'FAIL' )
)));
}
/* ! - always restore the saved state, and invert the outcome of the inner code */
if ( $this->negative_lookahead ) {
$code = PHPBuilder::build()
->l(
$this->save($id),
$code->replace( array(
'MATCH' =>
$this->restore($id)
->l( 'FAIL' ),
'FAIL' =>
$this->restore($id)
->l( 'MATCH' )
)));
}
/*
* Tagged token: push the current result onto $substack, match into a fresh result built by
* construct(), then pop the outer result again. On a match the sub-result is stored on the outer
* result under the tag. TokenRecurse is excluded here; its match_code() passes the tag to store() itself.
*/
if ( $this->tag && !($this instanceof TokenRecurse ) ) {
/* $rid is not used below; the varid() call does still consume a value from the shared counter */
$rid = $this->varid() ;
$code = PHPBuilder::build()
->l(
'$substack[] = $result;',
'$result = $this->construct( "'.$this->tag.'" );',
$code->replace(array(
'MATCH' => PHPBuilder::build()
->l(
'$subres = $result ;',
'$result = array_pop( $substack ) ;',
'$this->store( $result, $subres, \''.$this->tag.'\' );',
'MATCH'
),
'FAIL' => PHPBuilder::build()
->l(
'$result = array_pop( $substack ) ;',
'FAIL'
)
)));
}
return $code ;
}
}
/**
 * Base for tokens that consume text directly by calling one of the parser's matcher methods
 */
abstract class TokenTerminal extends Token {
    /**
     * The statement that appends $text ( a PHP expression, as source ) to the result text,
     * or NULL when this token is silent
     */
    function set_text( $text ) {
        if ( $this->silent ) return NULL ;
        return '$result["text"] .= ' . $text . ';' ;
    }

    /**
     * Call the parser method named after this token's type with $value ( PHP source for the argument ),
     * treating any return value other than FALSE as a match
     */
    protected function match_code( $value ) {
        $condition = '( $subres = $this->'.$this->type.'( '.$value.' ) ) !== FALSE' ;
        $on_match = $this->set_text('$subres') ;
        return $this->match_fail_conditional( $condition, $on_match ) ;
    }
}
/**
 * Base for terminal tokens whose value may contain $name expressions that have to be
 * expanded at parse time before matching
 */
abstract class TokenExpressionable extends TokenTerminal {
    /* Matches a $name expression inside a token value */
    static $expression_rx = '/\$(\w+)/' ;

    /** Does this token's value contain at least one $name expression? ( preg_match result: 1 or 0 ) */
    function contains_expression(){
        return preg_match(self::$expression_rx, $this->value);
    }

    /**
     * Generate the match code for $value ( PHP source for the value to match ).
     *
     * Values without expressions are matched directly. Otherwise the generated code creates a
     * ParserExpression and matches against the expanded value.
     */
    function match_code( $value ) {
        // The plain path has to return here, otherwise every token pays for the expression wrapper below
        if (!$this->contains_expression()) return parent::match_code($value);

        $id = $this->varid() ;
        return PHPBuilder::build()->l(
            '$'.$id.' = new ParserExpression( $this, $substack, $result );',
            parent::match_code('$'.$id.'->expand('.$value.')')
        );
    }
}
/**
 * A quoted literal. The token value is the literal exactly as written in the grammar, including its quotes.
 */
class TokenLiteral extends TokenExpressionable {
    function __construct( $value ) {
        parent::__construct( 'literal', $value );
    }

    /**
     * Generate the code to match this literal.
     *
     * @param mixed $value Ignored - the literal always matches against its own value. The parameter exists
     *                     only so that the signature stays compatible with TokenExpressionable::match_code(),
     *                     which PHP 8 enforces with a fatal error.
     */
    function match_code( $value = NULL ) {
        // We inline single-character matches for speed. Literals containing $expressions are never inlined:
        // they have to be expanded at parse time, and their length is not known while compiling.
        // NOTE(review): eval() runs the literal exactly as written in the grammar file, so grammar files
        // must be trusted input.
        if ( !$this->contains_expression() && strlen( eval( 'return '. $this->value . ';' ) ) == 1 ) {
            return $this->match_fail_conditional( 'substr($this->string,$this->pos,1) == '.$this->value,
                PHPBuilder::build()->l(
                    '$this->pos += 1;',
                    $this->set_text( $this->value )
                )
            );
        }
        return parent::match_code($this->value);
    }
}
/**
 * A regular expression token. The value is stored escaped, ready to be placed inside a
 * single-quoted PHP string in the generated code.
 */
class TokenRegex extends TokenExpressionable {
    /**
     * Escape $rx for embedding in a single-quoted PHP string: single quotes are backslash-escaped,
     * and every pair of backslashes is doubled
     */
    static function escape( $rx ) {
        $rx = str_replace( "'", "\\'", $rx ) ;
        $rx = str_replace( '\\\\', '\\\\\\\\', $rx ) ;
        return $rx ;
    }

    function __construct( $value ) {
        parent::__construct('rx', self::escape($value));
    }

    /**
     * Generate the code to match this regex.
     *
     * @param mixed $value Ignored - the regex always matches against its own value. The parameter exists
     *                     only so that the signature stays compatible with TokenExpressionable::match_code(),
     *                     which PHP 8 enforces with a fatal error.
     */
    function match_code( $value = NULL ) {
        return parent::match_code("'{$this->value}'");
    }
}
/**
 * A whitespace token. The token value is the $optional flag passed to the constructor.
 */
class TokenWhitespace extends TokenTerminal {
    /**
     * @param bool $optional When true, failing to find whitespace is not a failure
     */
    function __construct( $optional ) {
        parent::__construct( 'whitespace', $optional ) ;
    }

    /**
     * Generate the code to match whitespace. For optional whitespace the FAIL marker is removed.
     *
     * @param mixed $value Ignored. The parameter exists only so that the signature stays compatible
     *                     with TokenTerminal::match_code(), which PHP 8 enforces with a fatal error.
     */
    function match_code( $value = NULL ) {
        /* The parser's whitespace() method takes no arguments, hence the empty argument source */
        $code = parent::match_code( '' ) ;
        return $this->value ? $code->replace( array( 'FAIL' => NULL )) : $code ;
    }
}
/**
 * A $name token: the value to match is looked up at parse time through a ParserExpression
 */
class TokenPHP extends TokenTerminal {
    function __construct( $value ) {
        parent::__construct( 'php', $value ) ;
    }

    /**
     * Generate the code to look the expression up and use what was found.
     *
     * A string result is appended to the result text ( unless this token is silent ); any other
     * result that is not FALSE is stored as a sub-result. FALSE is a failure.
     *
     * @param mixed $value Ignored. The parameter exists only so that the signature stays compatible
     *                     with TokenTerminal::match_code(), which PHP 8 enforces with a fatal error.
     */
    function match_code( $value = NULL ) {
        $id = $this->varid() ;
        return PHPBuilder::build()
            ->l(
                '$'.$id.' = new ParserExpression( $this, $substack, $result );',
                // This is a condition plus code to run on a match, which is what match_fail_conditional()
                // takes. match_fail_block() expects a single PHPBuilder and cannot accept a string.
                $this->match_fail_conditional( '( $subres = $'.$id.'->match( \''.$this->value.'\' ) ) !== FALSE',
                    PHPBuilder::build()
                        ->b( 'if ( is_string( $subres ) )',
                            $this->set_text('$subres')
                        )
                        ->b( 'else',
                            '$this->store($result, $subres);'
                        )
                ));
    }
}
/**
 * A reference to another rule: matching means calling that rule's generated match_ function
 */
class TokenRecurse extends Token {
    function __construct( $value ) {
        parent::__construct( 'recurse', $value ) ;
    }

    /**
     * Generate the code that calls ( or reads the memoised result of ) the referenced rule's match
     * function and stores the sub-result on a match. When ParserCompiler::$debug is on, the generated
     * code also prints an indented trace.
     */
    function match_code() {
        $function = $this->function_name( $this->value ) ;
        $storetag = $this->function_name( $this->tag ? $this->tag : $this->value ) ;

        /* Trace snippets stay NULL ( and so generate nothing ) unless debugging is on */
        $debug_header = NULL ;
        $debug_match = NULL ;
        $debug_fail = NULL ;

        if ( ParserCompiler::$debug ) {
            $debug_header = PHPBuilder::build() ;
            $debug_header->l( '$indent = str_repeat( " ", $this->depth );' ) ;
            $debug_header->l( '$this->depth += 2;' ) ;
            $debug_header->l( '$sub = ( strlen( $this->string ) - $this->pos > 20 ) ? ( substr( $this->string, $this->pos, 20 ) . "..." ) : substr( $this->string, $this->pos );' ) ;
            $debug_header->l( '$sub = preg_replace( \'/(\r|\n)+/\', " {NL} ", $sub );' ) ;
            $debug_header->l( 'print( $indent."Matching against '.$function.' (".$sub.")\n" );' ) ;

            $debug_match = PHPBuilder::build() ;
            $debug_match->l( 'print( $indent."MATCH\n" );' ) ;
            $debug_match->l( '$this->depth -= 2;' ) ;

            $debug_fail = PHPBuilder::build() ;
            $debug_fail->l( 'print( $indent."FAIL\n" );' ) ;
            $debug_fail->l( '$this->depth -= 2;' ) ;
        }

        /* Untagged references let store() pick the callback from the sub-result's own name */
        if ( $this->tag === FALSE ) {
            $store = '$this->store( $result, $subres );' ;
        }
        else {
            $store = '$this->store( $result, $subres, "'.$storetag.'" );' ;
        }

        $on_match = PHPBuilder::build()->l( $debug_match, $store ) ;
        $on_fail = PHPBuilder::build()->l( $debug_fail ) ;

        $code = PHPBuilder::build() ;
        $code->l( $debug_header ) ;
        $code->l( '$key = "'.$function.'"; $pos = $this->pos;' ) ;
        $code->l( '$subres = ( $this->packhas( $key, $pos ) ? $this->packread( $key, $pos ) : $this->packwrite( $key, $pos, $this->match_'.$function.'(array_merge($substack, array($result))) ) );' ) ;
        $code->l( $this->match_fail_conditional( '$subres !== FALSE', $on_match, $on_fail ) ) ;
        return $code ;
    }
}
/**
 * A list of tokens that all have to match, one after the other. The token value is the array of tokens.
 */
class TokenSequence extends Token {
    function __construct( $value ) {
        parent::__construct( 'sequence', $value ) ;
    }

    /**
     * Chain the code of every token inside one match / fail block: a failing token breaks out
     * with a failure, and reaching the end of the chain is a match
     */
    function match_code() {
        /* Inside the sequence a match just falls through to the next token */
        $markers = array(
            'MATCH' => NULL,
            'FAIL' => 'FBREAK'
        ) ;

        $sequence = PHPBuilder::build() ;
        foreach ( $this->value as $part ) {
            $compiled = $part->compile() ;
            $sequence->l( $compiled->replace( $markers ) ) ;
        }
        $sequence->l( 'MBREAK' ) ;

        return $this->match_fail_block( $sequence ) ;
    }
}
/**
 * A choice between two alternatives, tried in order. The token value is array( $opt1, $opt2 ).
 */
class TokenOption extends Token {
    function __construct( $opt1, $opt2 ) {
        parent::__construct( 'option', array( $opt1, $opt2 ) ) ;
    }

    /**
     * Try each alternative in turn inside one match / fail block. The first alternative to match
     * breaks out with a match; after a failed alternative the state saved at the start is restored.
     * Running out of alternatives is a failure.
     */
    function match_code() {
        $id = $this->varid() ;

        /* Inside the option a failure just falls through to the restore and the next alternative */
        $markers = array(
            'MATCH' => 'MBREAK',
            'FAIL' => NULL
        ) ;

        $choice = PHPBuilder::build() ;
        $choice->l( $this->save($id) ) ;

        foreach ( $this->value as $alternative ) {
            $attempt = $alternative->compile() ;
            $choice->l( $attempt->replace( $markers ) ) ;
            $choice->l( $this->restore($id) ) ;
        }
        $choice->l( 'FBREAK' ) ;

        return $this->match_fail_block( $choice ) ;
    }
}
/**
* Handles storing of information for an expression that applies to the <i>next</i> token, and deletion of that
* information after applying
*
* @author Hamish Friedlander
*/
class Pending {
    /** Start out with nothing pending */
    function __construct() {
        $this->what = NULL ;
    }

    /**
     * Remember that property $what should be set to $val on the next object given to apply_if_present().
     * A later call replaces anything still pending.
     */
    function set( $what, $val = TRUE ) {
        $this->val = $val ;
        $this->what = $what ;
    }

    /**
     * If something is pending, set it on $on and forget it; otherwise leave $on alone
     */
    function apply_if_present( $on ) {
        if ( $this->what === NULL ) return ;

        $property = $this->what ;
        $on->$property = $this->val ;
        $this->what = NULL ;
    }
}
/**
* Rule parsing and code generation
*
* A rule is the basic unit of a PEG. This parses one rule, and generates a function that will match on a string
*
* @author Hamish Friedlander
*/
class Rule extends PHPWriter {
/* NOTE(review): these two statics are not referenced by the code in this class; the constructor builds indent-aware local patterns of the same name instead */
static $rule_rx = '@^[\x20\t]+(.*)@' ;
static $func_rx = '@^[\x20\t]+function\s+([^\s(]+)\s*\(([^)]*)\)@' ;
/**
* Read one rule out of a rule block and parse it.
*
* @param string $indent The indent of the rule block; continuation lines must be indented further than this
* @param string $rules The text of the whole rule block
* @param array $match Offset-capturing regex match for the rule's first line: [1][0] is the rule name,
* [2][0] the rule text on that line, [0][1] the offset of the line within $rules
*/
function __construct( $indent, $rules, $match ) {
$this->indent = $indent;
$this->name = $match[1][0] ;
$this->rule = $match[2][0] ;
/* Attached functions, keyed by name; each entry is array( argument list source, body source ) */
$this->functions = array() ;
$active_function = NULL ;
/* Find all the lines following the rule start which are indented */
$offset = $match[0][1] + strlen( $match[0][0] ) ;
$lines = preg_split( '/\r\n|\r|\n/', substr( $rules, $offset ) ) ;
$rule_rx = '@^'.preg_quote($indent).'[\x20\t]+(.*)@' ;
$func_rx = '@^'.preg_quote($indent).'[\x20\t]+function\s+([^\s(]+)\s*\(([^)]*)\)@' ;
foreach( $lines as $line ) {
/* Blank lines are skipped; the first line that is not indented past $indent ends the rule */
if ( !trim( $line ) ) continue ;
if ( !preg_match( $rule_rx, $line, $match ) ) break ;
/* Handle function definitions */
if ( preg_match( $func_rx, $line, $func_match, 0 ) ) {
$active_function = $func_match[1] ;
$this->functions[$active_function] = array( $func_match[2], "" ) ;
}
else {
/* Once a function has started every further line belongs to its body; before that, lines continue the rule text */
if ( $active_function ) $this->functions[$active_function][1] .= $line . PHP_EOL ;
else $this->rule .= PHP_EOL . trim($line) ;
}
}
$this->parse_rule() ;
}
/* Manual parsing, because we can't bootstrap ourselves yet */
/**
* Turn the rule text into a token tree in $this->parsed: a TokenRegex for rules starting with "/",
* otherwise the single token found by tokenize(), or a TokenSequence when there are several
*/
function parse_rule() {
$rule = trim( $this->rule ) ;
/* If this is a regex end-token, just mark it and return */
if ( substr( $rule, 0, 1 ) == '/' ) {
$this->parsed = new TokenRegex( $rule ) ;
}
else {
$tokens = array() ;
$this->tokenize( $rule, $tokens ) ;
$this->parsed = ( count( $tokens ) == 1 ? array_pop( $tokens ) : new TokenSequence( $tokens ) ) ;
}
}
/* Matches a complete /regex/ at the start of the string, allowing escaped slashes inside it */
static $rx_rx = '{^/(
((\\\\\\\\)*\\\\/) # Escaped \/, making sure to catch all the \\ first, so that we dont think \\/ is an escaped /
|
[^/] # Anything except /
)*/}xu' ;
/**
* Tokenize $str from offset $o onwards, appending the tokens found to $tokens.
*
* Calls itself for "(" groups and for the right-hand side of "|"; a ")" ends the current call.
*
* @param string $str The rule text
* @param array $tokens Receives the tokens (passed by reference)
* @param int $o Offset in $str to start from
* @return int The offset at which tokenizing stopped
*/
function tokenize( $str, &$tokens, $o = 0 ) {
/* Modifiers written before a token ( tag, lookaheads, silent ) wait here until that token has been created */
$pending = new Pending() ;
while ( $o < strlen( $str ) ) {
$sub = substr( $str, $o ) ;
/* Absorb white-space */
if ( preg_match( '/^\s+/', $sub, $match ) ) {
$o += strlen( $match[0] ) ;
}
/* Handle expression labels */
elseif ( preg_match( '/^(\w*):/', $sub, $match ) ) {
$pending->set( 'tag', isset( $match[1] ) ? $match[1] : '' ) ;
$o += strlen( $match[0] ) ;
}
/* Handle descent token */
elseif ( preg_match( '/^[\w-]+/', $sub, $match ) ) {
$tokens[] = $t = new TokenRecurse( $match[0] ) ; $pending->apply_if_present( $t ) ;
$o += strlen( $match[0] ) ;
}
/* Handle " quoted literals */
elseif ( preg_match( '/^"[^"]*"/', $sub, $match ) ) {
$tokens[] = $t = new TokenLiteral( $match[0] ) ; $pending->apply_if_present( $t ) ;
$o += strlen( $match[0] ) ;
}
/* Handle ' quoted literals */
elseif ( preg_match( "/^'[^']*'/", $sub, $match ) ) {
$tokens[] = $t = new TokenLiteral( $match[0] ) ; $pending->apply_if_present( $t ) ;
$o += strlen( $match[0] ) ;
}
/* Handle regexs */
elseif ( preg_match( self::$rx_rx, $sub, $match ) ) {
$tokens[] = $t = new TokenRegex( $match[0] ) ; $pending->apply_if_present( $t ) ;
$o += strlen( $match[0] ) ;
}
/* Handle $ call literals */
elseif ( preg_match( '/^\$(\w+)/', $sub, $match ) ) {
$tokens[] = $t = new TokenPHP( $match[1] ) ; $pending->apply_if_present( $t ) ;
$o += strlen( $match[0] ) ;
}
/* Handle flags */
elseif ( preg_match( '/^\@(\w+)/', $sub, $match ) ) {
$l = count( $tokens ) - 1 ;
$o += strlen( $match[0] ) ;
user_error( "TODO: Flags not currently supported", E_USER_WARNING ) ;
}
/* Handle control tokens */
else {
$c = substr( $sub, 0, 1 ) ;
/* Index of the most recent token; -1 when no token has been seen yet in this call */
$l = count( $tokens ) - 1 ;
$o += 1 ;
switch( $c ) {
/* ? * + are written after a token, so they modify the most recent one */
case '?':
$tokens[$l]->optional = TRUE ;
break ;
case '*':
$tokens[$l]->zero_or_more = TRUE ;
break ;
case '+':
$tokens[$l]->one_or_more = TRUE ;
break ;
/* & ! . are written before a token, so they are held until the next one is created */
case '&':
$pending->set( 'positive_lookahead' ) ;
break ;
case '!':
$pending->set( 'negative_lookahead' ) ;
break ;
case '.':
$pending->set( 'silent' );
break;
/* [ and ] add a whitespace token that has to match; < and > add one that may be absent */
case '[':
case ']':
$tokens[] = new TokenWhitespace( FALSE ) ;
break ;
case '<':
case '>':
$tokens[] = new TokenWhitespace( TRUE ) ;
break ;
/* Everything up to the matching ")" becomes one TokenSequence */
case '(':
$subtokens = array() ;
$o = $this->tokenize( $str, $subtokens, $o ) ;
$tokens[] = $t = new TokenSequence( $subtokens ) ; $pending->apply_if_present( $t ) ;
break ;
case ')':
return $o ;
/* Everything tokenized so far in this call is the first option; the rest of this group is the second */
case '|':
$option1 = $tokens ;
$option2 = array() ;
$o = $this->tokenize( $str, $option2, $o ) ;
$option1 = (count($option1) == 1) ? $option1[0] : new TokenSequence( $option1 );
$option2 = (count($option2) == 1) ? $option2[0] : new TokenSequence( $option2 );
$pending->apply_if_present( $option2 ) ;
$tokens = array( new TokenOption( $option1, $option2 ) ) ;
return $o ;
default:
user_error( "Can't parser $c - attempting to skip", E_USER_WARNING ) ;
}
}
}
return $o ;
}
/**
* Generate the PHP code for a function to match against a string for this rule
*
* @return string Source of the match_ function for this rule, followed by the source of its attached functions
*/
function compile() {
$function_name = $this->function_name( $this->name ) ;
$match = PHPBuilder::build() ;
/* A rule that is a single regex builds its result array inline and returns it as is */
if ( $this->parsed instanceof TokenRegex ) {
$match->b( "function match_{$function_name} (\$substack = array())",
'$result = array("name"=>"'.$function_name.'", "text"=>"");',
$this->parsed->compile()->replace(array(
'MATCH' => 'return $result;',
'FAIL' => 'return FALSE;'
))
);
}
/* Every other rule creates its result with construct() and passes it through finalise() on a match */
else {
$match->b( "function match_{$function_name} (\$substack = array())",
'$result = $this->construct( "'.$function_name.'" );',
$this->parsed->compile()->replace(array(
'MATCH' => 'return $this->finalise( "'.$function_name.'", $result );',
'FAIL' => 'return FALSE;'
))
);
}
$functions = array() ;
foreach( $this->functions as $name => $function ) {
/* Attached functions are prefixed with the rule name, joined by "_" unless the function name already starts with one */
$function_name = $this->function_name( preg_match( '/^_/', $name ) ? $this->name.$name : $this->name.'_'.$name ) ;
/* Only the opening line is generated; the captured body lines are emitted unchanged after it */
$functions[] = implode( PHP_EOL, array(
'function ' . $function_name . ' ( ' . $function[0] . ' ) { ',
$function[1],
));
}
// print_r( $match ) ; return '' ;
return $match->render(NULL, $this->indent) . PHP_EOL . PHP_EOL . implode( PHP_EOL, $functions ) ;
}
}
/**
 * Entry point of the compiler: finds the special grammar comment blocks in a PHP source string
 * and replaces each of them with generated matching code
 */
class ParserCompiler {
    /* Set by the !debug pragma; makes rule references generate tracing code */
    static $debug = false;
    /* Name given on the most recent comment block that had one; blocks without a name reuse it */
    static $currentClass = null;

    /**
     * preg_replace_callback handler for a single grammar comment block.
     *
     * @param array $match [1] is the indent before the comment, [2] the optional name or !pragma,
     *                     [3] the body of the comment
     * @return string The code that replaces the comment block
     * @throws Exception When the block names a pragma that is not known
     */
    static function create_parser( $match ) {
        /* We allow indenting of the whole rule block, but only to the level of the comment start's indent */
        $indent = $match[1];

        /* The regex to match a rule */
        $rx = '@^'.preg_quote($indent).'([\w\-]+):(.*)$@m' ;

        /* Class isn't actually used ATM. Eventually it might be used for rule inlineing optimization */
        if ($class = trim($match[2])) self::$currentClass = $class;
        elseif (self::$currentClass) $class = self::$currentClass;
        else $class = self::$currentClass = 'Anonymous Parser';

        /* Get the actual body of the parser rule set */
        $rulestr = $match[3] ;

        /* Check for pragmas */
        if (strpos($class, '!') === 0) {
            switch ($class) {
                case '!silent':
                    // NOP - dont output
                    return '';
                case '!insert_autogen_warning':
                    // Every line of the warning is indented like the comment block it replaces
                    return $indent . implode(PHP_EOL.$indent, array(
                        '/*',
                        'WARNING: This file has been machine generated. Do not edit it, or your changes will be overwritten next time it is compiled.',
                        '*/'
                    )) . PHP_EOL;
                case '!debug':
                    self::$debug = true;
                    return '';
            }
            throw new Exception("Unknown pragma $class encountered when compiling parser");
        }

        /* Each match of the rule regex starts a rule; Rule picks up the indented lines that follow it */
        $rules = array();
        preg_match_all( $rx, $rulestr, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE ) ;
        foreach ( $matches as $match ) {
            $rules[] = new Rule( $indent, $rulestr, $match ) ;
        }

        /* Every rule is emitted as a comment quoting its source, followed by the generated code */
        $out = array() ;
        foreach ( $rules as $rule ) {
            $out[] = $indent . '/* ' . $rule->name . ':' . $rule->rule . ' */' . PHP_EOL ;
            $out[] = $rule->compile() ;
            $out[] = PHP_EOL ;
        }
        return implode( '', $out ) ;
    }

    /**
     * Replace every grammar comment block in $string with its generated code.
     *
     * @param string $string PHP source containing grammar comment blocks
     * @return string The source with the blocks replaced
     */
    static function compile( $string ) {
        static $rx = '@
            ^([\x20\t]*)/\*!\* (?:[\x20\t]*(!?\w*))? # Start with some indent, a comment with the special marker, then an optional name
            ((?:[^*]|\*[^/])*) # Any amount of "a character that isnt a star, or a star not followed by a /
            \*/ # The comment end
        @mx';
        return preg_replace_callback( $rx, array( 'ParserCompiler', 'create_parser' ), $string ) ;
    }

    /**
     * Command line front end.
     *
     * @param array $args Script name, input file ( "-" for stdin ) and optional output file
     *                    ( absent or "-" for stdout ). Prints usage when only the script name is given.
     */
    static function cli( $args ) {
        if ( count( $args ) == 1 ) {
            print "Parser Compiler: A compiler for PEG parsers in PHP \n" ;
            print "(C) 2009 SilverStripe. See COPYING for redistribution rights. \n" ;
            print "\n" ;
            print "Usage: {$args[0]} infile [ outfile ]\n" ;
            print "\n" ;
        }
        else {
            $fname = ( $args[1] == '-' ? 'php://stdin' : $args[1] ) ;
            $string = file_get_contents( $fname ) ;
            $string = self::compile( $string ) ;

            if ( !empty( $args[2] ) && $args[2] != '-' ) {
                file_put_contents( $args[2], $string ) ;
            }
            else {
                print $string ;
            }
        }
    }
}

10
thirdparty/php-peg/LICENSE vendored Normal file
View File

@ -0,0 +1,10 @@
Copyright (C) 2009 Hamish Friedlander (hamish@silverstripe.com) and SilverStripe Limited (www.silverstripe.com)
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
* Neither the name of Hamish Friedlander nor SilverStripe nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

127
thirdparty/php-peg/PHPBuilder.php vendored Normal file
View File

@ -0,0 +1,127 @@
<?php
/**
 * Builds PHP source as a tree of lines.
 *
 * An entry in $lines is either a string ( one line of code ) or array( $entry, $block ), which is a
 * braced block: $entry is the text before the opening brace and $block is a nested list of entries.
 */
class PHPBuilder {
    /** @var array The entries collected so far */
    public $lines = array() ;

    /** Shorthand that allows PHPBuilder::build()->l( ... ) chaining */
    static function build () {
        return new PHPBuilder() ;
    }

    function __construct() {
        $this->lines = array() ;
    }

    /**
     * Append lines. Every argument may be a string ( split on line breaks ), an array of strings, or
     * another PHPBuilder whose entries are taken over as they are. Empty arguments are ignored.
     * Leading whitespace is trimmed from strings; indentation is added again by render().
     *
     * @return PHPBuilder $this
     */
    function l() {
        foreach ( func_get_args() as $lines ) {
            if ( !$lines ) continue ;

            if ( is_string( $lines ) ) $lines = preg_split( '/\r\n|\r|\n/', $lines ) ;
            if ( !$lines ) continue ;

            if ( $lines instanceof PHPBuilder ) $lines = $lines->lines ;
            else $lines = array_map( 'ltrim', $lines ) ;
            if ( !$lines ) continue ;

            $this->lines = array_merge( $this->lines, $lines ) ;
        }
        return $this ;
    }

    /**
     * Append a braced block. The first argument is the text before the opening brace; the remaining
     * arguments make up the block body and are handled like the arguments of l().
     *
     * @return PHPBuilder $this
     */
    function b() {
        $args = func_get_args() ;
        $entry = array_shift( $args ) ;

        $block = new PHPBuilder() ;
        call_user_func_array( array( $block, 'l' ), $args ) ;

        $this->lines[] = array( $entry, $block->lines ) ;
        return $this ;
    }

    /**
     * Replace marker lines, in place and in nested blocks too.
     *
     * @param array $replacements Map from a complete line to its replacement: NULL removes the line, a
     *                            string replaces it, an array of entries or a PHPBuilder is spliced in
     * @param array|null $array Used for recursion only: the list of entries to work on
     * @return PHPBuilder $this
     * @throws Exception When a replacement is of any other type
     */
    function replace( $replacements, &$array = NULL ) {
        if ( $array === NULL ) {
            // Break the binding to the caller's argument before binding to our own lines
            unset( $array ) ;
            $array =& $this->lines ;
        }

        $i = 0 ;
        while ( $i < count( $array ) ) {

            /* Recurse into blocks */
            if ( is_array( $array[$i] ) ) {
                $this->replace( $replacements, $array[$i][1] ) ;

                /* A block that ended up empty is dropped if it is an "else", or an "if" with no "else" after it */
                if ( count( $array[$i][1] ) == 0 ) {
                    $nextelse = isset( $array[$i+1] ) && is_array( $array[$i+1] ) && preg_match( '/^\s*else\s*$/i', $array[$i+1][0] ) ;

                    $delete = preg_match( '/^\s*else\s*$/i', $array[$i][0] ) ;
                    $delete = $delete || ( preg_match( '/^\s*if\s*\(/i', $array[$i][0] ) && !$nextelse ) ;

                    if ( $delete ) {
                        // Is this always safe? Not if the expression has side-effects.
                        // print "/* REMOVING EMPTY BLOCK: " . $array[$i][0] . "*/\n" ;
                        array_splice( $array, $i, 1 ) ;
                        continue ;
                    }
                }
            }

            /* Handle replacing lines with NULL to remove, or string, array of strings or PHPBuilder to replace */
            else {
                if ( array_key_exists( $array[$i], $replacements ) ) {
                    $rep = $replacements[$array[$i]] ;

                    if ( $rep === NULL ) {
                        array_splice( $array, $i, 1 ) ;
                        continue ;
                    }

                    if ( is_string( $rep ) ) {
                        $array[$i] = $rep ;
                        $i++ ;
                        continue ;
                    }

                    if ( $rep instanceof PHPBuilder ) $rep = $rep->lines ;

                    if ( is_array( $rep ) ) {
                        // Step over exactly the entries just spliced in, so that they are not scanned
                        // again and the entry that follows them is not skipped
                        array_splice( $array, $i, 1, $rep ) ;
                        $i += count( $rep ) ;
                        continue ;
                    }

                    // Only objects can be thrown
                    throw new Exception( 'Unknown type passed to PHPBuilder#replace' ) ;
                }
            }

            $i++ ;
        }

        return $this ;
    }

    /**
     * Render the entries as source text.
     *
     * @param array|null $array Used for recursion only: the entries to render
     * @param string $indent Prefix for every line; nested blocks get one more tab
     * @return string The lines joined with PHP_EOL. Blocks whose rendered body is shorter than
     *                40 bytes are put on a single line.
     */
    function render( $array = NULL, $indent = "" ) {
        if ( $array === NULL ) $array = $this->lines ;

        $out = array() ;
        foreach( $array as $line ) {
            if ( is_array( $line ) ) {
                list( $entry, $block ) = $line ;
                $str = $this->render( $block, $indent . "\t" ) ;

                if ( strlen( $str ) < 40 ) {
                    $out[] = $indent . $entry . ' { ' . ltrim( $str ) . ' }' ;
                }
                else {
                    $out[] = $indent . $entry . ' {' ;
                    $out[] = $str ;
                    $out[] = $indent . '}' ;
                }
            }
            else {
                $out[] = $indent . $line ;
            }
        }

        return implode( PHP_EOL, $out ) ;
    }
}

323
thirdparty/php-peg/Parser.php vendored Normal file
View File

@ -0,0 +1,323 @@
<?php
/**
 * Resolves $name expressions used inside a grammar while parsing, in the context of the result
 * being built and the stack of results enclosing it
 */
class ParserExpression {
    /** @var object The parser; also searched for callback methods */
    public $parser ;
    /** @var array Results enclosing the current one, outermost first */
    public $substack ;
    /** @var array The result currently being built */
    public $result ;

    function __construct( $parser, $substack, $result ) {
        $this->parser = $parser ;
        $this->substack = $substack ;
        $this->result = $result ;
    }

    /**
     * Look up the expression named $exp. The first of these that exists wins:
     * a key on the current result; a key on an enclosing result ( innermost first ); a parser method
     * named {rule}_DLR{exp}; a parser method named DLR{exp}; a global function; a constant.
     * Methods and functions are called with the current result as their only argument.
     *
     * @param string $exp Expression name, without the leading $
     * @return mixed Whatever was found, or FALSE when nothing matched
     */
    function find( $exp ) {
        $rule_callback = array( $this->parser, "{$this->result['name']}_DLR{$exp}" ) ;
        $pars_callback = array( $this->parser, "DLR{$exp}" ) ;

        /* If the current result has that expression, return it */
        if ( isset( $this->result[$exp] ) ) return $this->result[$exp] ;

        /* Search backwards through the sub-expression stacks */
        for ( $i = count( $this->substack ) - 1 ; $i >= 0 ; $i-- ) {
            if ( isset( $this->substack[$i][$exp] ) ) return $this->substack[$i][$exp] ;
        }

        /* If we have a rule-attached method, call that */
        if ( is_callable( $rule_callback ) ) return call_user_func( $rule_callback, $this->result ) ;

        /* If we have a class-wide method, call that */
        if ( is_callable( $pars_callback ) ) return call_user_func( $pars_callback, $this->result ) ;

        /* If we have a global function, call that */
        if ( function_exists( $exp ) ) return call_user_func( $exp, $this->result ) ;

        /* If we have a global constant, use its value */
        if ( defined( $exp ) ) return constant( $exp ) ;

        return FALSE ;
    }

    /**
     * preg_replace_callback handler used by expand().
     *
     * @param array $m Regex match: [0] is the whole "$name", [1] the name
     * @return string The replacement: "" when the lookup yields FALSE, a string result as it is,
     *                the 'text' of a result array, or else the original "$name" unchanged
     */
    function callback( $m ) {
        $res = $this->find( $m[1] ) ;
        if ( $res === FALSE ) return "" ;
        if ( is_string( $res ) ) return $res ;
        if ( isset( $res['text'] ) ) return $res['text'] ;

        // If we find no matches, assume we don't want a replacement, and replace it with itself
        return $m[0] ;
    }

    /** Replace every $name in $var with the text it resolves to ( see callback() ) */
    function expand( $var ) {
        return preg_replace_callback( '/\$(\w+)/', array( $this, 'callback' ), $var ) ;
    }

    /** Look up the expression named $var; same as find() */
    function match( $var ) {
        return $this->find( $var ) ;
    }
}
/**
* We cache the last regex result. This is a low-cost optimization, because we have to do an un-anchored match + check match position anyway
* (alternative is to do an anchored match on a string cut with substr, but that is very slow for long strings). We then don't need to recheck
* for any position between current position and eventual match position - result will be the same
*
* Of course, the next regex might be outside that bracket - after the bracket if other matches have progressed beyond the match position, or before
* the bracket if a failed match + restore has moved the current position backwards - so we have to check that too.
*/
class ParserRegexp {
    /**
     * @param object $parser The parser whose string and pos this regex works against
     * @param string $rx The regex, delimiters included; the S and x modifiers are appended to it
     */
    function __construct( $parser, $rx ) {
        $this->parser = $parser ;
        $this->rx = $rx . 'Sx' ;

        $this->matches = NULL ;
        $this->match_pos = NULL ; // NULL is no-match-to-end-of-string, unless check_pos also == NULL, in which case means undefined
        $this->check_pos = NULL ;
    }

    /**
     * Try to match the regex exactly at the parser's current position.
     *
     * @return string|false The matched text, with the parser's position moved past it, or FALSE
     */
    function match() {
        $here = $this->parser->pos ;

        /*
         * The cached search still answers for $here when it started at or before $here, and either
         * found nothing up to the end of the string or found its match at or after $here
         */
        $cache_valid = $this->check_pos !== NULL
            && $this->check_pos <= $here
            && ( $this->match_pos === NULL || $this->match_pos >= $here ) ;

        if ( !$cache_valid ) {
            $this->check_pos = $here ;
            $found = preg_match( $this->rx, $this->parser->string, $this->matches, PREG_OFFSET_CAPTURE, $this->check_pos ) ;
            $this->match_pos = $found ? $this->matches[0][1] : NULL ;
        }

        /* The search is not anchored, so a match further along in the string does not count */
        if ( $this->match_pos !== $here ) return FALSE ;

        $text = $this->matches[0][0] ;
        $this->parser->pos += strlen( $text ) ;
        return $text ;
    }
}
/**
* Parser base class
* - handles current position in string
* - handles matching that position against literal or rx
* - some abstraction of code that would otherwise be repeated many times in a compiled grammar, mostly related to calling user functions
* for result construction and building
*/
class Parser {
    /** @var string The text being parsed */
    public $string ;
    /** @var int Current byte offset into $string */
    public $pos = 0 ;
    /** @var int Current indent of the trace output that debug-compiled parsers print */
    public $depth = 0 ;
    /** @var array Regex matchers created so far, keyed by regex source */
    public $regexps = array() ;

    /**
     * @param string $string The text to parse
     */
    function __construct( $string ) {
        $this->string = $string ;
        $this->pos = 0 ;

        $this->depth = 0 ;

        $this->regexps = array() ;
    }

    /**
     * Consume a run of spaces and tabs at the current position.
     *
     * @return string|false A single space ( however long the run was ), or FALSE when there is none
     */
    function whitespace() {
        $matched = preg_match( '/[ \t]+/', $this->string, $matches, PREG_OFFSET_CAPTURE, $this->pos ) ;
        if ( $matched && $matches[0][1] == $this->pos ) {
            $this->pos += strlen( $matches[0][0] );
            return ' ' ;
        }

        return FALSE ;
    }

    /**
     * Consume $token if the string continues with exactly that text at the current position.
     *
     * @return string|false $token, or FALSE when the text at the current position is different
     */
    function literal( $token ) {
        /* Debugging: * / print( "Looking for token '$token' @ '" . substr( $this->string, $this->pos ) . "'\n" ) ; /* */
        $toklen = strlen( $token ) ;
        $substr = substr( $this->string, $this->pos, $toklen ) ;
        // Compare as strings, byte for byte: a loose == treats numeric looking text such as
        // "000" and "0.0" as equal
        if ( (string) $substr === (string) $token ) {
            $this->pos += $toklen ;
            return $token ;
        }
        return FALSE ;
    }

    /**
     * Consume a match of the regex $rx at the current position. One matcher is kept per regex so
     * that it can cache its last search.
     *
     * @return string|false The matched text, or FALSE
     */
    function rx( $rx ) {
        if ( !isset( $this->regexps[$rx] ) ) $this->regexps[$rx] = new ParserRegexp( $this, $rx ) ;
        return $this->regexps[$rx]->match() ;
    }

    /*
     * NOTE(review): expand() and php() depend on Parser_ExpressionCallback and get_expression(), neither
     * of which is defined in this class, and both print debugging output. They look like leftovers from
     * before ParserExpression - confirm whether they can be removed.
     */
    function expand( $var, $substack, $result ) {
        $cb = new Parser_ExpressionCallback( $this, $substack, $result ) ;
        $v = preg_replace_callback( '/\$(\w+)/', array( $cb, 'callback' ), $var ) ;
        print "Expanded var: $v" ;
        return $v ;
    }

    function php( $var, $substack, $result ) {
        $ex = $this->get_expression( $var, $substack, $result ) ;
        print_r( $result ) ;

        if ( is_string( $ex ) ) {
            return ( preg_match( '{^\s*/}', $ex ) ? $this->rx( $ex ) : $this->literal( $ex ) ) ;
        }
        return $ex ;
    }

    /* The three pack* methods are the memoisation hooks. This base class does not memoise anything. */

    /** Is there a remembered outcome for rule $key at position $pos? Never, in this class. */
    function packhas( $key, $pos ) {
        return false ;
    }

    /**
     * Fetch the remembered outcome for rule $key at position $pos.
     *
     * @throws Exception Always, in this class - packhas() never reports anything to read
     */
    function packread( $key, $pos ) {
        throw new Exception( 'PackRead after PackHas=>false in Parser.php' ) ;
    }

    /** Remember $res as the outcome of rule $key at position $pos ( a no-op here ), and return it */
    function packwrite( $key, $pos, $res ) {
        return $res ;
    }

    /**
     * Create the initial result for rule $name. When a method called {name}__construct exists it is
     * called with the result by reference, so that it can add to it.
     *
     * @return array The result: 'type' => 'node', 'name' => $name, 'text' => '', plus any additions
     */
    function construct( $name ) {
        $result = array( 'type' => 'node', 'name' => $name, 'text' => '' ) ;

        $callback = array( $this, "{$name}__construct" ) ;
        if ( is_callable( $callback ) ) {
            call_user_func_array( $callback, array( &$result ) ) ;
        }

        return $result ;
    }

    /**
     * Finish the result of rule $name once the rule has matched. When a method called
     * {name}__finalise exists it is called with the result by reference.
     *
     * @return array The result
     */
    function finalise( $name, &$result ) {
        $callback = array( $this, "{$name}__finalise" ) ;
        if ( is_callable( $callback ) ) {
            call_user_func_array( $callback, array( &$result ) ) ;
        }

        return $result ;
    }

    /**
     * Merge the sub-result $subres into $result.
     *
     * The text of the sub-result is always appended. After that the first of these that applies is used:
     * a method called {result name}_{storetag} ( or {result name}_{subresult name} when there is no tag );
     * a method called {result name}_STR; or, when there is a tag, storing the sub-result under
     * $result[$storetag] - as the value itself the first time, as a list from the second time on.
     */
    function store ( &$result, $subres, $storetag = NULL ) {
        $result['text'] .= $subres['text'] ;

        $globalcb = array( $this, "{$result['name']}_STR" ) ;
        $callback = array( $this, $storetag ? "{$result['name']}_{$storetag}" : "{$result['name']}_{$subres['name']}" ) ;

        if ( is_callable( $callback ) ) {
            call_user_func_array( $callback, array( &$result, $subres ) ) ;
        }
        elseif ( is_callable( $globalcb ) ) {
            call_user_func_array( $globalcb, array( &$result, $subres ) ) ;
        }
        elseif ( $storetag ) {
            if ( !isset( $result[$storetag] ) ) $result[$storetag] = $subres ;
            else {
                // A 'text' key means that a single sub-result is stored here; turn it into a list first
                if ( isset( $result[$storetag]['text'] ) ) $result[$storetag] = array( $result[$storetag] ) ;
                $result[$storetag][] = $subres ;
            }
        }
    }
}
/**
* By inheriting from Packrat instead of Parser, the parser will run in linear time (instead of exponential like
* Parser), but will require a lot more memory, since every match-attempt at every position is memorised.
*
* We now use a string as a byte-array to store position information rather than a straight array for memory reasons. This
* means there is a (roughly) 8MB limit on the size of the string we can parse
*
* @author Hamish Friedlander
*/
class Packrat extends Parser {
    /** @var string Template for a fresh per-rule state string: three 0xFF bytes per position */
    public $packstatebase ;
    /** @var array Per-rule state strings, keyed by rule key */
    public $packstate = array() ;
    /** @var array Remembered results of successful matches */
    public $packres = array() ;

    /*
     * Layout of a state string: every position owns three bytes. A first byte of 0xFF means nothing
     * is remembered for that position, 0xFE means the rule failed there; anything else is the start
     * of the 3 byte, big-endian position the parser was at after the rule matched.
     */

    function __construct( $string ) {
        parent::__construct( $string ) ;

        /* End positions must fit in three bytes without their first byte colliding with the two marker values */
        $max = unpack( 'N', "\x00\xFD\xFF\xFF" ) ;
        if ( strlen( $string ) > $max[1] ) user_error( 'Attempting to parse string longer than Packrat Parser can handle', E_USER_ERROR ) ;

        // One slot more than there are bytes: rules are also tried at the end of the string
        // ( pos == strlen ). Without a slot there, packhas() would read past the end of the state
        // string, see something other than 0xFF, and report a result that was never stored.
        $this->packstatebase = str_repeat( "\xFF", ( strlen( $string ) + 1 ) * 3 ) ;
        $this->packstate = array() ;
        $this->packres = array() ;
    }

    /** Is there a remembered outcome ( match or failure ) for rule $key at position $pos? */
    function packhas( $key, $pos ) {
        $pos *= 3 ;
        return isset( $this->packstate[$key] ) && $this->packstate[$key][$pos] != "\xFF" ;
    }

    /**
     * Replay the remembered outcome for rule $key at position $pos.
     *
     * @return mixed FALSE for a remembered failure; otherwise the remembered result, with the parser
     *               moved to the position it was at when the result was remembered
     */
    function packread( $key, $pos ) {
        $pos *= 3 ;
        if ( $this->packstate[$key][$pos] == "\xFE" ) return FALSE ;

        $this->pos = ord($this->packstate[$key][$pos]) << 16 | ord($this->packstate[$key][$pos+1]) << 8 | ord($this->packstate[$key][$pos+2]) ;
        return $this->packres["$key:$pos"] ;
    }

    /**
     * Remember $res as the outcome of rule $key at position $pos, together with the parser's
     * current position when $res is a match.
     *
     * @return mixed $res
     */
    function packwrite( $key, $pos, $res ) {
        if ( !isset( $this->packstate[$key] ) ) $this->packstate[$key] = $this->packstatebase ;

        $pos *= 3 ;

        if ( $res !== FALSE ) {
            /* pack( 'N' ) gives four big-endian bytes; the first one is always zero here and is dropped */
            $i = pack( 'N', $this->pos ) ;

            $this->packstate[$key][$pos] = $i[1] ;
            $this->packstate[$key][$pos+1] = $i[2] ;
            $this->packstate[$key][$pos+2] = $i[3] ;

            $this->packres["$key:$pos"] = $res ;
        }
        else {
            $this->packstate[$key][$pos] = "\xFE" ;
        }

        return $res ;
    }
}
/**
* FalseOnlyPackrat only remembers which results were false. Experimental.
*
* @author Hamish Friedlander
*/
class FalseOnlyPackrat extends Parser {
    function __construct( $string ) {
        parent::__construct( $string ) ;

        /* Per-rule state strings, keyed by rule key; created on first write from the template below */
        $this->packstate = array() ;
        /* Template: one '.' per byte of input. A byte is changed to 'F' once the rule has failed there. */
        $this->packstatebase = str_repeat( '.', strlen( $string ) ) ;
    }

    /** Has rule $key already been seen to fail at position $pos? */
    function packhas( $key, $pos ) {
        if ( !isset( $this->packstate[$key] ) ) return false ;
        return $this->packstate[$key][$pos] == 'F' ;
    }

    /** Only failures are remembered, so the remembered outcome is always FALSE */
    function packread( $key, $pos ) {
        return FALSE ;
    }

    /** Note a failure of rule $key at position $pos; matches are passed through without being recorded */
    function packwrite( $key, $pos, $res ) {
        $known = isset( $this->packstate[$key] ) ;
        if ( !$known ) {
            $this->packstate[$key] = $this->packstatebase ;
        }

        $failed = ( $res === FALSE ) ;
        if ( $failed ) {
            $this->packstate[$key][$pos] = 'F' ;
        }

        return $res ;
    }
}
/**
* Conservative Packrat will only memo-ize a result on the second hit, making it more memory-lean than Packrat,
* but less likely to go exponential than Parser. Because the store logic is much more complicated this is a net
* loss over Parser for many simple grammars.
*
* @author Hamish Friedlander
*/
class ConservativePackrat extends Parser {
    /** @var array Per "rule:position" slot: NULL once the slot has been seen, the result from the second time on */
    public $packres = array() ;
    /** @var array Per memoised slot: the position the parser was at after the remembered attempt */
    public $packpos = array() ;

    /*
     * The signatures have to match Parser's ( $key, $pos [, $res] ). Generated parsers call these
     * methods with the rule name and the position as separate arguments, so the cache slot is
     * built from both here.
     */

    /** Is there a memoised result for rule $key at position $pos? */
    function packhas( $key, $pos ) {
        // isset() is false for the NULL "seen once" marker, so only real results count
        return isset( $this->packres["$key:$pos"] ) ;
    }

    /**
     * Replay the memoised result for rule $key at position $pos.
     *
     * @return mixed The remembered result, with the parser moved to the position remembered with it
     */
    function packread( $key, $pos ) {
        $slot = "$key:$pos" ;
        $this->pos = $this->packpos[$slot] ;
        return $this->packres[$slot] ;
    }

    /**
     * Record an attempt at rule $key at position $pos. The first attempt only marks the slot as
     * seen; from the second attempt on the result and the parser's position are stored.
     *
     * @return mixed $res
     */
    function packwrite( $key, $pos, $res ) {
        $slot = "$key:$pos" ;

        // array_key_exists() rather than isset(): the "seen once" marker is NULL, which isset() cannot see
        if ( array_key_exists( $slot, $this->packres ) ) {
            $this->packres[$slot] = $res ;
            $this->packpos[$slot] = $this->pos ;
        }
        else {
            $this->packres[$slot] = NULL ;
        }
        return $res ;
    }
}

230
thirdparty/php-peg/README.md vendored Normal file
View File

@ -0,0 +1,230 @@
# PHP PEG - A PEG compiler for parsing text in PHP
This is a Parsing Expression Grammar compiler for PHP. PEG parsers are an alternative to other CFG grammars that includes both tokenization
and lexing in a single top down grammar. For a basic overview of the subject, see http://en.wikipedia.org/wiki/Parsing_expression_grammar
## Quick start
- Write a parser. A parser is a PHP class with a grammar contained within it in a special syntax. The filetype is .peg.inc. See the examples directory.
- Compile the parser. php ./cli.php ExampleParser.peg.inc > ExampleParser.php
- Use the parser (you can also include code to do this in the input parser - again see the examples directory):
<pre><code>
$x = new ExampleParser( 'string to parse' ) ;
$res = $x->match_Expr() ;
</code></pre>
### Parser Format
Parsers are contained within a PHP file, in one or more special comment blocks that start with `/*!* [name | !pragma]` (like a docblock, but with an
exclamation mark in the middle of the stars)
Lexically, these blocks are a set of rules, each consisting of a name token, a matching rule and a set of attached functions.
The name token must contain no whitespace and end with a `:` character. The matching rule and functions are on the same line or on the indented lines below.
You can have multiple comment blocks, all of which are treated as contiguous for the purpose of compiling. During compilation these blocks will be replaced
with a set of "matching" functions (functions which match a string against their rules) for each rule in the block.
The optional name marks the start of a new set of parser rules. This is currently unused, but might be used in future for optimization & debugging purposes.
If unspecified, it defaults to the same name as the previous parser comment block, or 'Anonymous Parser' if no name has ever been set.
If the name starts with an '!' symbol, that comment block is a pragma, and is treated not as some part of the parser, but as a special block of meta-data
##### Tricks and traps
We allow indenting a parser block, but only in a consistent manner - whatever the indent of the /*!* marker becomes the "base" indent, and needs to be used
for all lines. You can mix tabs and spaces, but the indent must always be an exact match - if the "base" indent is a tab then two spaces, every line within the
block also needs indenting with a tab then two spaces, not two tabs (even if in your editor, that gives the same indent).
Any line with more than the "base" indent is considered a continuation of the previous rule
Any line with less than the "base" indent is an error
This might get looser if I get around to re-writing the internal "parser parser" in php-peg, bootstrapping the whole thing
### Rules
PEG matching rules try to follow standard PEG format, summarised thusly:
<pre><code>
token* - Token is optionally repeated
token+ - Token is repeated at least once
token? - Token is optionally present
tokena tokenb - Token tokenb follows tokena, both of which are present
tokena | tokenb - One of tokena or tokenb is present, preferring tokena
&token - Token is present next (but not consumed by parse)
!token - Token is not present next (but not consumed by parse)
( expression ) - Grouping for priority
</code></pre>
But with these extensions:
<pre><code>
< or > - Optionally match whitespace
[ or ] - Require some whitespace
</code></pre>
### Tokens
Tokens may be
- bare-words, which are recursive matchers - references to token rules defined elsewhere in the grammar,
- literals, surrounded by `"` or `'` quote pairs. No escaping support is provided in literals.
- regexs, surrounded by `/` pairs.
- expressions - single words (match \w+) starting with `$` or more complex surrounded by `${ }` which call a user defined function to perform the match
##### Regular expression tokens
Automatically anchored to the current string start - do not include a string start anchor (`^`) anywhere. Always acts as when the 'x' flag is enabled in PHP -
whitespace is ignored unless escaped, and '#' starts a comment.
Be careful when ending a regular expression token - the '*/' pattern (as in /foo\s*/) will end a PHP comment. Since the 'x' flag is always active,
just split with a space (as in / foo \s* /)
### Expressions
Expressions allow run-time calculated matching. You can embed an expression within a literal or regex token to
match against a calculated value, or simply specify the expression as a token to (optionally) internally handle matching
and generate a result.
Expressions will try a variety of scopes to find a value. It will look for variables already set in the current result,
rule-attached functions and a variety of other functions and constants.
Tried in this order
- against current result
- against containing expression stack in order (for sub-expressions only)
- against parser instance as variable
- against parser instance as rule-attached method INCLUDING `$` ( i.e. `function $foo()` )
- against parser instance as method INCLUDING `$`
- as global method
- as constant
##### Tricks and traps
Be careful when matching against results
<pre><code>
quoted_good: q:/['"]/ string "$q"
quoted_bad: q:/['"]/ string $q
</code></pre>
`"$q"` matches against the value of q again. `$q` simply returns the value of q, without doing any matching
### Named matching rules
Tokens and groups can be given names by prepending name and `:`, e.g.,
<pre><code>
rulea: "'" name:( tokena tokenb )* "'"
</code></pre>
There must be no space between the name and the `:`
<pre><code>
badrule: "'" name : ( tokena tokenb )* "'"
</code></pre>
Recursive matchers can be given a name the same as their rule name by prepending with just a `:`. These next two rules are equivalent
<pre><code>
rulea: tokena tokenb:tokenb
rulea: tokena :tokenb
</code></pre>
### Rule-attached functions
Each rule can have a set of functions attached to it. These functions can be defined
- in-grammar by indenting the function body after the rule
- in-class after close of grammar comment by defining a regular method whose name is `{$rulename}_{$functionname}`, or `{$rulename}{$functionname}` if function name starts with `_`
- in a sub class
All functions that are not in-grammar must have PHP compatible names (see PHP name mapping). In-grammar functions will have their names converted if needed.
All these definitions define the same rule-attached function
<pre><code>
class A extends Parser {
/*!* Parser
foo: bar baz
function bar() {}
* /
function foo_bar() {}
}
class B extends A {
function foo_bar() {}
}
</code></pre>
### PHP name mapping
Rules in the grammar map to php functions named `match_{$rulename}`. However rule names can contain characters that php functions can't.
These characters are remapped:
<pre><code>
'-' => '_'
'$' => 'DLR'
'*' => 'STR'
</code></pre>
Other dis-allowed characters are removed.
## Results
Results are a tree of nested arrays.
Without any specific control, each rule's result will just be the text it matched against in a `['text']` member. This member must always exist.
Marking a subexpression, literal, regex or recursive match with a name (see Named matching rules) will insert a member into the
result array named that name. If there is only one match it will be a single result array. If there is more than one match it will be an array of arrays.
You can override result storing by specifying a rule-attached function with the given name. It will be called with a reference to the current result array
and the sub-match - in this case the default storage action will not occur.
If you specify a rule-attached function for a recursive match, you do not need to name that token at all - it will be called automatically. E.g.
<pre><code>
rulea: tokena tokenb
function tokenb ( &$res, $sub ) { print 'Will be called, even though tokenb is not named or marked with a :' ; }
</code></pre>
You can also specify a rule-attached function called `*`, which will be called with every recursive match made
<pre><code>
rulea: tokena tokenb
function * ( &$res, $sub ) { print 'Will be called for both tokena and tokenb' ; }
</code></pre>
### Silent matches
By default all matches are added to the 'text' property of a result. By prepending a member with `.` that match will not be added to the ['text'] member. This
doesn't affect the other result properties that named rules add.
### Pragmas
When opening a parser comment block, if instead of a name (or no name) you put a word starting with '!', that comment block is treated as a pragma - not
part of the parser language itself, but some other instruction to the compiler. These pragmas are currently understood:
!silent
This is a comment that should only appear in the source code. Don't output it in the generated code
!insert_autogen_warning
Insert a warning comment into the generated code at this point, warning that the file is autogenerated and not to edit it
## TODO
- Allow configuration of whitespace - specify what matches, and whether it should be injected into results as-is, collapsed, or not at all
- Allow inline-ing of rules into other rules for speed
- More optimisation
- Make Parser-parser be self-generated, instead of a bad hand rolled parser like it is now.
- Slightly more powerful expressions: `${parent.q}`, `${foo()->bar}`, etc.
- Need to properly escape all literals. Expressions currently need to be in '', not ""
- PHP token parser, and other token streams, instead of strings only like now

5
thirdparty/php-peg/cli.php vendored Normal file
View File

@ -0,0 +1,5 @@
<?php
// Command line entry point: load the compiler, then hand it the script's argument vector.
// Usage as given in the README: php ./cli.php ExampleParser.peg.inc > ExampleParser.php
require 'Compiler.php' ;
ParserCompiler::cli( $_SERVER['argv'] ) ;

View File

@ -0,0 +1,25 @@
<?php
require '../Parser.php' ;
/**
 * Example grammar for quoted strings whose closing delimiter is calculated from the opening one.
 *
 * The class body is only a parser comment block; the PEG compiler replaces it with the generated matching
 * functions. Each quote rule captures the opening delimiter as `q` and then requires a closing literal
 * built from an expression: `'$q'` repeats the captured delimiter, while `'$matched'` calls the
 * rule-attached function `$matched`, which maps an opening bracket to its closing counterpart.
 *
 * NOTE(review): the `string` rule uses `${parent.q}`, which the README lists under TODO - confirm the
 * compiler supports it before treating this example as runnable.
 */
class CalculatedLiterals extends Parser {
/*!* CalculatedLiterals
string: ( /\\./ | /[^${parent.q}]/ )*
simplequote: q:/['"]/ string '$q'
freequote-matched: "qq" q:/[{\[(<]/ string '$matched'
function $matched( $res ) {
$a = array( '{' => '}', '[' => ']', '(' => ')', '<' => '>' ) ;
return $a[$res['q']] ;
}
freequote-unmatched: "qq" q:/./ string '$q'
quoted-string: freequote-matched | freequote-unmatched | simplequote
*/
}

View File

@ -0,0 +1,63 @@
<?php
require '../Parser.php' ;
/**
 * Example grammar that evaluates an arithmetic expression while parsing it.
 *
 * The class body is only a parser comment block; the PEG compiler replaces it with the generated matching
 * functions. The rule-attached functions keep a running value in `$result['val']`: Number stores the matched
 * text, Value / Product / Sum / Expr copy the value up from the sub-match, and Times / Div / Plus / Minus
 * apply their operator using the value of the sub-match captured as `operand`.
 */
class Calculator extends Parser {
/*!* Calculator
Number: /[0-9]+/
Value: Number > | '(' > Expr > ')' >
function Number( &$result, $sub ) {
$result['val'] = $sub['text'] ;
}
function Expr( &$result, $sub ) {
$result['val'] = $sub['val'] ;
}
Times: '*' > operand:Value >
Div: '/' > operand:Value >
Product: Value > ( Times | Div ) *
function Value( &$result, $sub ) {
$result['val'] = $sub['val'] ;
}
function Times( &$result, $sub ) {
$result['val'] *= $sub['operand']['val'] ;
}
function Div( &$result, $sub ) {
$result['val'] /= $sub['operand']['val'] ;
}
Plus: '+' > operand:Product >
Minus: '-' > operand:Product >
Sum: Product > ( Plus | Minus ) *
function Product( &$result, $sub ) {
$result['val'] = $sub['val'] ;
}
function Plus( &$result, $sub ) {
$result['val'] += $sub['operand']['val'] ;
}
function Minus( &$result, $sub ) {
$result['val'] -= $sub['operand']['val'] ;
}
Expr: Sum
function Sum( &$result, $sub ) {
$result['val'] = $sub['val'] ;
}
*/
}
// Demo: parse a sample expression, then dump the result tree or report that nothing matched
$x = new Calculator( '(2 + 4) * 3 - 10' ) ;
$res = $x->match_Expr() ;
if ( $res !== FALSE ) {
	print_r( $res ) ;
}
else {
	print "No Match\n" ;
}

View File

@ -0,0 +1,36 @@
<?php
require '../Parser.php' ;
/**
 * Example Packrat-based grammar for the language described in the comment below.
 *
 * The class body is only a parser comment block; the PEG compiler replaces it with the generated matching
 * functions. Rule X is the entry point used by the script that follows the class.
 */
class EqualRepeat extends Packrat {
/* Any number of a followed by the same number of b and the same number of c characters
* aabbcc - good
* aaabbbccc - good
* aabbc - bad
* aabbacc - bad
*/
/*!* Grammar1
A: "a" A? "b"
B: "b" B? "c"
T: !"b"
X: &(A !"b") "a"+ B !("a" | "b" | "c")
*/
}
/**
 * Run the EqualRepeat parser's rule X over $str and print the input followed by either the result tree
 * or 'No Match'.
 *
 * Named show_match() rather than match() because `match` is a reserved keyword from PHP 8.0 onwards and
 * declaring a function with that name is a parse error there.
 *
 * @param string $str Input to test against rule X
 */
function show_match( $str ) {
	$p = new EqualRepeat( $str ) ;
	$r = $p->match_X() ;
	print "$str\n" ;
	// print_r( ..., true ) returns the dump as a string instead of printing it directly
	print $r ? print_r( $r, true ) : 'No Match' ;
	print "\n\n" ;
}
show_match( 'aabbcc' ) ; // Should match
show_match( 'aaabbbccc' ) ; // Should match
show_match( 'aabbbccc' ) ; // Should not match
show_match( 'aaabbccc' ) ; // Should not match
show_match( 'aaabbbcc' ) ; // Should not match
show_match( 'aaabbbcccc' ) ; // Should not match

View File

@ -0,0 +1,88 @@
<?php
require '../Parser.php';
/**
 * This parser strictly matches the RFC822 standard. No characters outside the ASCII range 0-127 are allowed
 *
 * The class body is only a parser comment block; the PEG compiler replaces it with the generated matching
 * functions. The rule-attached functions shape the results: a `mailbox` result starts with 'phrase' and
 * 'address' set to NULL and fills them from its phrase / addr-spec / route-addr sub-matches, and an
 * `address-header` result collects the 'mailbox' member of each matched address into 'addresses'.
 *
 * @author Hamish Friedlander
 */
class Rfc822 extends Parser {
/*!* Rfc822
crlf: /\r\n/
lwsp-char: " " | "\t"
linear-white-space: (crlf? lwsp-char)+
atom: /[^\x00-\x1F\x20()<>@,;:\\".\[\]\x80-\xFF]+/
qtext-chars: /[^"\\\x0D]+/
qtext: linear-white-space | qtext-chars
quoted-pair: /\\[\x00-\x7F]/
quoted-string: .'"' ( quoted-pair | qtext )* .'"'
word: atom | quoted-string
phrase: (word >)+
dtext-chars: /[^\[\]\\\r]+/
dtext: linear-white-space | dtext-chars
domain-literal: "[" ( dtext | quoted-pair )* "]"
domain-ref: atom
sub-domain: domain-ref | domain-literal
domain: sub-domain ("." sub-domain)*
route: "@" domain ("," "@" domain)* ":"
route-addr: "<" route? addr-spec ">"
function addr_spec ( &$self, $sub ) {
$self['addr_spec'] = $sub['text'] ;
}
local-part: word ("." word)*
addr-spec: local-part "@" domain
mailbox: ( addr-spec | phrase route-addr ) >
function __construct( &$self ) {
$self['phrase'] = NULL ;
$self['address'] = NULL ;
}
function phrase ( &$self, $sub ) {
$self['phrase'] = $sub['text'] ;
}
function addr_spec ( &$self, $sub ) {
$self['address'] = $sub['text'] ;
}
function route_addr ( &$self, $sub ) {
$self['address'] = $sub['addr_spec'] ;
}
group: phrase ":" ( mailbox ("," mailbox)* )? ";"
address: :mailbox | group
address-header: address (<","> address)*
function __construct( &$self ) {
$self['addresses'] = array() ;
}
function address( &$self, $sub ) {
$self['addresses'][] = $sub['mailbox'] ;
}
*/
}
// Demo: parse a two-address header (one plain phrase, one quoted phrase with escaped quotes) and dump the result
$p = new Rfc822(
	'John Byorgson <byorn@again.com>, "Akira \"Bad Boy\" Kenada" <akira@neotokyo.com>'
) ;
print_r(
	$p->match_address_header()
) ;

View File

@ -0,0 +1,30 @@
<?php
require 'Rfc822.php';
/**
 * This parser extends the RFC822 standard to allow XML entities and UTF-8 characters in atoms and quoted-strings
 *
 * The class body is only a parser comment block; the PEG compiler replaces it with the generated matching
 * functions. It redefines four rules of Rfc822 with the regex `u` flag. The `atom` rule accepts named
 * entities (`&name;`), numeric entities with an optional `x` or `X` hex prefix (`&#65;`, `&#x41;`), and any
 * single character outside the excluded set. The hex prefix is written as the character class `[xX]`; a
 * group `(xX)` would only match the two-character sequence "xX".
 *
 * @author Hamish Friedlander
 */
class Rfc822UTF8 extends Rfc822 {
/*!* Rfc822UTF8
crlf: /\r\n/u
atom: /((&[A-Za-z]+;)|(&#[xX]?[A-Fa-f0-9]+;)|([^\x00-\x1F\x20()<>@,;:\\".\[\]]))+/u
qtext-chars: /[^"\\\x0D]+/u
quoted-pair: /\\./u
*/
}
/**
* Some trial code. Remove soon
*/
// Parse a header whose display names contain non-ASCII characters and dump the result
$p = new Rfc822UTF8(
	'JØhn ByØrgsØn <byorn@again.com>, "アキラ" <akira@neotokyo.com>'
) ;
print_r(
	$p->match_address_header()
) ;
/* */