MINOR: Pull in latest changes from thirdparty tool php-peg

This commit is contained in:
Hamish Friedlander 2011-03-10 12:00:49 +13:00
parent 0b7d396ab8
commit fdf81c80cb
8 changed files with 652 additions and 233 deletions

View File

@ -1,9 +1,9 @@
---
format: 1
handler:
commit: afb28caf712815da82cd6bb8ece8ac12ab9b2188
commit: 2045d5fbfa3ed857a9eac3722e6f9ecc301593c6
branch: master
lock: false
repository_class: Piston::Git::Repository
repository_url: git://github.com/hafriedlander/php-peg.git
exported_to: 0669b742904f8c9f7cd97569166663396897dab9
exported_to: 654df253d884db2cd397016de1a5105455ba1631

View File

@ -215,22 +215,19 @@ abstract class Token extends PHPWriter {
}
if ( $this->tag && !($this instanceof TokenRecurse ) ) {
$rid = $this->varid() ;
$code = PHPBuilder::build()
->l(
'$substack[] = $result;',
'$result = $this->construct( "'.$this->tag.'" );',
'$stack[] = $result; $result = $this->construct( $matchrule, "'.$this->tag.'" ); ',
$code->replace(array(
'MATCH' => PHPBuilder::build()
->l(
'$subres = $result ;',
'$result = array_pop( $substack ) ;',
'$subres = $result; $result = array_pop($stack);',
'$this->store( $result, $subres, \''.$this->tag.'\' );',
'MATCH'
),
'FAIL' => PHPBuilder::build()
->l(
'$result = array_pop( $substack ) ;',
'$result = array_pop($stack);',
'FAIL'
)
)));
@ -255,31 +252,30 @@ abstract class TokenTerminal extends Token {
abstract class TokenExpressionable extends TokenTerminal {
static $expression_rx = '/\$(\w+)/' ;
static $expression_rx = '/ \$(\w+) | { \$(\w+) } /x';
function contains_expression(){
return preg_match(self::$expression_rx, $this->value);
}
function expression_replace($matches) {
return '\'.$this->expression($result, $stack, \'' . (!empty($matches[1]) ? $matches[1] : $matches[2]) . "').'";
}
function match_code( $value ) {
if (!$this->contains_expression()) parent::match_code($value);
$id = $this->varid() ;
return PHPBuilder::build()->l(
'$'.$id.' = new ParserExpression( $this, $substack, $result );',
parent::match_code('$'.$id.'->expand('.$value.')')
);
$value = preg_replace_callback(self::$expression_rx, array($this, 'expression_replace'), $value);
return parent::match_code($value);
}
}
class TokenLiteral extends TokenExpressionable {
function __construct( $value ) {
parent::__construct( 'literal', $value );
parent::__construct( 'literal', "'" . substr($value,1,-1) . "'" );
}
function match_code() {
// We inline single-character matches for speed
if ( strlen( eval( 'return '. $this->value . ';' ) ) == 1 ) {
if ( !$this->contains_expression() && strlen( eval( 'return '. $this->value . ';' ) ) == 1 ) {
return $this->match_fail_conditional( 'substr($this->string,$this->pos,1) == '.$this->value,
PHPBuilder::build()->l(
'$this->pos += 1;',
@ -319,37 +315,18 @@ class TokenWhitespace extends TokenTerminal {
}
}
class TokenPHP extends TokenTerminal {
function __construct( $value ) {
parent::__construct( 'php', $value ) ;
}
/* Call recursion indirectly */
function match_code() {
$id = $this->varid() ;
return PHPBuilder::build()
->l(
'$'.$id.' = new ParserExpression( $this, $substack, $result );',
$this->match_fail_block( '( $subres = $'.$id.'->match( \''.$this->value.'\' ) ) !== FALSE',
PHPBuilder::build()
->b( 'if ( is_string( $subres ) )',
$this->set_text('$subres')
)
->b( 'else',
'$this->store($result, $subres);'
)
));
}
}
class TokenRecurse extends Token {
function __construct( $value ) {
parent::__construct( 'recurse', $value ) ;
}
function match_function() {
return "'".$this->function_name($this->value)."'";
}
function match_code() {
$function = $this->function_name( $this->value ) ;
$storetag = $this->function_name( $this->tag ? $this->tag : $this->value ) ;
$function = $this->match_function() ;
$storetag = $this->function_name( $this->tag ? $this->tag : $this->match_function() ) ;
if ( ParserCompiler::$debug ) {
$debug_header = PHPBuilder::build()
@ -358,7 +335,7 @@ class TokenRecurse extends Token {
'$this->depth += 2;',
'$sub = ( strlen( $this->string ) - $this->pos > 20 ) ? ( substr( $this->string, $this->pos, 20 ) . "..." ) : substr( $this->string, $this->pos );',
'$sub = preg_replace( \'/(\r|\n)+/\', " {NL} ", $sub );',
'print( $indent."Matching against '.$function.' (".$sub.")\n" );'
'print( $indent."Matching against $matcher (".$sub.")\n" );'
);
$debug_match = PHPBuilder::build()
@ -378,9 +355,9 @@ class TokenRecurse extends Token {
}
return PHPBuilder::build()->l(
'$matcher = \'match_\'.'.$function.'; $key = $matcher; $pos = $this->pos;',
$debug_header,
'$key = "'.$function.'"; $pos = $this->pos;', // :{$this->pos}";',
'$subres = ( $this->packhas( $key, $pos ) ? $this->packread( $key, $pos ) : $this->packwrite( $key, $pos, $this->match_'.$function.'(array_merge($substack, array($result))) ) );',
'$subres = ( $this->packhas( $key, $pos ) ? $this->packread( $key, $pos ) : $this->packwrite( $key, $pos, $this->$matcher(array_merge($stack, array($result))) ) );',
$this->match_fail_conditional( '$subres !== FALSE',
PHPBuilder::build()->l(
$debug_match,
@ -395,6 +372,12 @@ class TokenRecurse extends Token {
}
}
/**
 * A recurse token whose target rule is not known at compile time.
 *
 * Where TokenRecurse::match_function() yields a quoted rule name, this
 * variant yields PHP source text that looks the name up while parsing,
 * through a call to `$this->expression()` in the generated parser.
 */
class TokenExpressionedRecurse extends TokenRecurse {
    /**
     * Build the source fragment that evaluates to the rule name to match.
     *
     * The fragment is emitted verbatim into generated code, so `$result` and
     * `$stack` refer to variables of the generated matcher, not of this class.
     *
     * @return string PHP expression source embedding this token's value as a
     *                single-quoted literal.
     */
    function match_function() {
        $lookupKey = $this->value;
        return sprintf('$this->expression($result, $stack, \'%s\')', $lookupKey);
    }
}
class TokenSequence extends Token {
function __construct( $value ) {
parent::__construct( 'sequence', $value ) ;
@ -479,40 +462,115 @@ class Pending {
*/
class Rule extends PHPWriter {
static $rule_rx = '@^[\x20\t]+(.*)@' ;
static $func_rx = '@^[\x20\t]+function\s+([^\s(]+)\s*\(([^)]*)\)@' ;
static $rule_rx = '@
(?<name> \w+) # The name of the rule
( \s+ extends \s+ (?<extends>\w+) )? # The extends word
( \s* \( (?<arguments>.*) \) )? # Any variable setters
(
\s*(?<matchmark>:) | # Marks the matching rule start
\s*(?<replacemark>;) | # Marks the replacing rule start
\s*$
)
(?<rule>[\s\S]*)
@x';
function __construct( $indent, $rules, $match ) {
$this->indent = $indent;
$this->name = $match[1][0] ;
$this->rule = $match[2][0] ;
static $argument_rx = '@
( [^=]+ ) # Name
= # Seperator
( [^=,]+ ) # Variable
(,|$)
@x';
static $replacement_rx = '@
( ([^=]|=[^>])+ ) # What to replace
=> # The replacement mark
( [^,]+ ) # What to replace it with
(,|$)
@x';
static $function_rx = '@^\s+function\s+([^\s(]+)\s*(.*)@' ;
protected $parser;
protected $lines;
public $name;
public $extends;
public $mode;
public $rule;
function __construct($parser, $lines) {
$this->parser = $parser;
$this->lines = $lines;
// Find the first line (if any) that's an attached function definition. Can skip first line (unless this block is malformed)
for ($i = 1; $i < count($lines); $i++) {
if (preg_match(self::$function_rx, $lines[$i])) break;
}
// Then split into the two parts
$spec = array_slice($lines, 0, $i);
$funcs = array_slice($lines, $i);
// Parse out the spec
$spec = implode("\n", $spec);
if (!preg_match(self::$rule_rx, $spec, $specmatch)) user_error('Malformed rule spec ' . $spec, E_USER_ERROR);
$this->name = $specmatch['name'];
if ($specmatch['extends']) {
$this->extends = $this->parser->rules[$specmatch['extends']];
if (!$this->extends) user_error('Extended rule '.$specmatch['extends'].' is not defined before being extended', E_USER_ERROR);
}
$this->arguments = array();
if ($specmatch['arguments']) {
preg_match_all(self::$argument_rx, $specmatch['arguments'], $arguments, PREG_SET_ORDER);
foreach ($arguments as $argument){
$this->arguments[trim($argument[1])] = trim($argument[2]);
}
}
$this->mode = $specmatch['matchmark'] ? 'rule' : 'replace';
if ($this->mode == 'rule') {
$this->rule = $specmatch['rule'];
$this->parse_rule() ;
}
else {
if (!$this->extends) user_error('Replace matcher, but not on an extends rule', E_USER_ERROR);
$this->replacements = array();
preg_match_all(self::$replacement_rx, $specmatch['rule'], $replacements, PREG_SET_ORDER);
$rule = $this->extends->rule;
foreach ($replacements as $replacement) {
$search = trim($replacement[1]);
$replace = trim($replacement[3]); if ($replace == "''" || $replace == '""') $replace = "";
$rule = str_replace($search, ' '.$replace.' ', $rule);
}
$this->rule = $rule;
$this->parse_rule() ;
}
// Parse out the functions
$this->functions = array() ;
$active_function = NULL ;
/* Find all the lines following the rule start which are indented */
$offset = $match[0][1] + strlen( $match[0][0] ) ;
$lines = preg_split( '/\r\n|\r|\n/', substr( $rules, $offset ) ) ;
$rule_rx = '@^'.preg_quote($indent).'[\x20\t]+(.*)@' ;
$func_rx = '@^'.preg_quote($indent).'[\x20\t]+function\s+([^\s(]+)\s*\(([^)]*)\)@' ;
foreach( $lines as $line ) {
if ( !trim( $line ) ) continue ;
if ( !preg_match( $rule_rx, $line, $match ) ) break ;
foreach( $funcs as $line ) {
/* Handle function definitions */
if ( preg_match( $func_rx, $line, $func_match, 0 ) ) {
$active_function = $func_match[1] ;
$this->functions[$active_function] = array( $func_match[2], "" ) ;
}
else {
if ( $active_function ) $this->functions[$active_function][1] .= $line . PHP_EOL ;
else $this->rule .= PHP_EOL . trim($line) ;
if ( preg_match( self::$function_rx, $line, $func_match, 0 ) ) {
$active_function = $func_match[1];
$this->functions[$active_function] = $func_match[2] . PHP_EOL;
}
else $this->functions[$active_function] .= $line . PHP_EOL ;
}
$this->parse_rule() ;
}
/* Manual parsing, because we can't bootstrap ourselves yet */
@ -528,6 +586,7 @@ class Rule extends PHPWriter {
$this->tokenize( $rule, $tokens ) ;
$this->parsed = ( count( $tokens ) == 1 ? array_pop( $tokens ) : new TokenSequence( $tokens ) ) ;
}
}
static $rx_rx = '{^/(
@ -574,7 +633,7 @@ class Rule extends PHPWriter {
}
/* Handle $ call literals */
elseif ( preg_match( '/^\$(\w+)/', $sub, $match ) ) {
$tokens[] = $t = new TokenPHP( $match[1] ) ; $pending->apply_if_present( $t ) ;
$tokens[] = $t = new TokenExpressionedRecurse( $match[1] ) ; $pending->apply_if_present( $t ) ;
$o += strlen( $match[0] ) ;
}
/* Handle flags */
@ -652,46 +711,105 @@ class Rule extends PHPWriter {
/**
* Generate the PHP code for a function to match against a string for this rule
*/
function compile() {
function compile($indent) {
$function_name = $this->function_name( $this->name ) ;
// Build the typestack
$typestack = array(); $class=$this;
do {
$typestack[] = $this->function_name($class->name);
}
while($class = $class->extends);
$match = PHPBuilder::build() ;
if ( $this->parsed instanceof TokenRegex ) {
$match->b( "function match_{$function_name} (\$substack = array())",
'$result = array("name"=>"'.$function_name.'", "text"=>"");',
$this->parsed->compile()->replace(array(
'MATCH' => 'return $result;',
'FAIL' => 'return FALSE;'
))
);
$typestack = "array('" . implode("','", $typestack) . "')";
// Build an array of additional arguments to add to result node (if any)
if (empty($this->arguments)) {
$arguments = 'null';
}
else {
$match->b( "function match_{$function_name} (\$substack = array())",
'$result = $this->construct( "'.$function_name.'" );',
$this->parsed->compile()->replace(array(
'MATCH' => 'return $this->finalise( "'.$function_name.'", $result );',
'FAIL' => 'return FALSE;'
))
);
$arguments = "array(";
foreach ($this->arguments as $k=>$v) { $arguments .= "'$k' => '$v'"; }
$arguments .= ")";
}
$match = PHPBuilder::build() ;
$match->l("protected \$match_{$function_name}_typestack = $typestack;");
$match->b( "function match_{$function_name} (\$stack = array())",
'$matchrule = "'.$function_name.'"; $result = $this->construct($matchrule, $matchrule, '.$arguments.');',
$this->parsed->compile()->replace(array(
'MATCH' => 'return $this->finalise($result);',
'FAIL' => 'return FALSE;'
))
);
$functions = array() ;
foreach( $this->functions as $name => $function ) {
$function_name = $this->function_name( preg_match( '/^_/', $name ) ? $this->name.$name : $this->name.'_'.$name ) ;
$functions[] = implode( PHP_EOL, array(
'function ' . $function_name . ' ( ' . $function[0] . ' ) { ',
$function[1],
'function ' . $function_name . ' ' . $function
));
}
// print_r( $match ) ; return '' ;
return $match->render(NULL, $this->indent) . PHP_EOL . PHP_EOL . implode( PHP_EOL, $functions ) ;
return $match->render(NULL, $indent) . PHP_EOL . PHP_EOL . implode( PHP_EOL, $functions ) ;
}
}
/**
 * The collection of rules that make up one named parser.
 *
 * A RuleSet splits the text of a parser comment block into per-rule blocks of
 * lines, builds a Rule from each block, and concatenates the compiled output.
 * Rules are kept in $rules by name so that later rules can look up earlier ones.
 */
class RuleSet {
    /** @var array Rule objects added so far, keyed by rule name. */
    public $rules = array();

    /**
     * Build a Rule from one block of lines, register it, and append its output.
     *
     * Three entries are appended to $out: a comment line naming the rule and
     * showing its matching rule, the compiled code, and a trailing newline.
     *
     * @param string $indent Indent prefixed to the comment line and passed to Rule::compile().
     * @param array  $lines  Lines of the block: the rule spec first, then any indented lines.
     * @param array  $out    Output fragments, appended to by reference.
     */
    function addRule($indent, $lines, &$out) {
        $rule = new Rule($this, $lines) ;
        $this->rules[$rule->name] = $rule;

        $out[] = $indent . '/* ' . $rule->name . ':' . $rule->rule . ' */' . PHP_EOL ;
        $out[] = $rule->compile($indent) ;
        $out[] = PHP_EOL ;
    }

    /**
     * Compile the body of a parser comment block.
     *
     * Blank lines and comment lines (whitespace followed by `#`) are dropped.
     * When $indent is non-empty it is stripped from the start of every other
     * line; a line that does not start with it raises E_USER_ERROR. After
     * stripping, a line that starts with a space or tab continues the current
     * block, and any other line starts a new block.
     *
     * @param string $indent  Base indent of the comment block.
     * @param string $rulestr Raw text of the rule set.
     * @return string Concatenated output of every rule in the block.
     */
    function compile($indent, $rulestr) {
        $out = array();
        $block = array();

        foreach (preg_split('/\r\n|\r|\n/', $rulestr) as $line) {
            // Ignore blank lines. Compared against '' explicitly so a line holding just "0" is kept.
            if (trim($line) === '') continue;
            // Ignore comments
            if (preg_match('/^[\x20\t]+#/', $line)) continue;

            // Strip off indent
            if (!empty($indent)) {
                if (strpos($line, $indent) === 0) $line = substr($line, strlen($indent));
                else user_error('Non-blank line with inconsistent indent in parser block', E_USER_ERROR);
            }

            // Any indented line, add to current set of lines. The character class keeps the
            // anchor applying to both space and tab; '/^\x20|\t/' would match a tab anywhere.
            if (preg_match('/^[\x20\t]/', $line)) {
                $block[] = $line;
            }
            // Any non-indented line marks a new block. Add a rule for the current block, then start a new block
            else {
                if (count($block)) $this->addRule($indent, $block, $out);
                $block = array($line);
            }
        }

        // Any unfinished block add a rule for
        if (count($block)) $this->addRule($indent, $block, $out);

        // And return the compiled version
        return implode( '', $out ) ;
    }
}
class ParserCompiler {
static $parsers = array();
static $debug = false;
static $currentClass = null;
@ -700,17 +818,11 @@ class ParserCompiler {
/* We allow indenting of the whole rule block, but only to the level of the comment start's indent */
$indent = $match[1];
/* The regex to match a rule */
$rx = '@^'.preg_quote($indent).'([\w\-]+):(.*)$@m' ;
/* Class isn't actually used ATM. Eventually it might be used for rule inlineing optimization */
/* Get the parser name for this block */
if ($class = trim($match[2])) self::$currentClass = $class;
elseif (self::$currentClass) $class = self::$currentClass;
else $class = self::$currentClass = 'Anonymous Parser';
/* Get the actual body of the parser rule set */
$rulestr = $match[3] ;
/* Check for pragmas */
if (strpos($class, '!') === 0) {
switch ($class) {
@ -718,7 +830,7 @@ class ParserCompiler {
// NOP - dont output
return '';
case '!insert_autogen_warning':
return $ident . implode(PHP_EOL.$ident, array(
return $indent . implode(PHP_EOL.$indent, array(
'/*',
'WARNING: This file has been machine generated. Do not edit it, or your changes will be overwritten next time it is compiled.',
'*/'
@ -731,22 +843,9 @@ class ParserCompiler {
throw new Exception("Unknown pragma $class encountered when compiling parser");
}
$rules = array();
preg_match_all( $rx, $rulestr, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE ) ;
foreach ( $matches as $match ) {
$rules[] = new Rule( $indent, $rulestr, $match ) ;
}
$out = array() ;
foreach ( $rules as $rule ) {
$out[] = $indent . '/* ' . $rule->name . ':' . $rule->rule . ' */' . PHP_EOL ;
$out[] = $rule->compile() ;
$out[] = PHP_EOL ;
}
return implode( '', $out ) ;
if (!isset(self::$parsers[$class])) self::$parsers[$class] = new RuleSet();
return self::$parsers[$class]->compile($indent, $match[3]);
}
static function compile( $string ) {

View File

@ -1,59 +1,5 @@
<?php
class ParserExpression {
function __construct( $parser, $substack, $result ) {
$this->parser = $parser ;
$this->substack = $substack ;
$this->result = $result ;
}
function find( $exp ) {
$rule_callback = array( $this->parser, "{$this->result['name']}_DLR{$exp}" ) ;
$pars_callback = array( $this->parser, "DLR{$exp}" ) ;
/* If the current result has that expression, return it */
if ( isset( $this->result[$exp] ) ) return $this->result[$exp] ;
/* Search backwards through the sub-expression stacks */
for ( $i = count( $this->substack ) - 1 ; $i >= 0 ; $i-- ) {
if ( isset( $this->substack[$i][$exp] ) ) return $this->substack[$i][$exp] ;
}
/* If we have a rule-attached method, call that */
if ( is_callable( $rule_callback ) ) return call_user_func( $rule_callback, $result ) ;
/* If we have a class-wide method, call that */
if ( is_callable( $pars_callback ) ) return call_user_func( $pars_callback, $result ) ;
/* If we have a global function, call that */
if ( function_exists( $exp ) ) return call_user_func( $exp, $result ) ;
/* If we have a global constant, call that */
if ( defined( $exp ) ) return constant( $expression ) ;
return FALSE ;
}
function callback( $m ) {
$res = $this->find( $m[1] ) ;
if ( $res === FALSE ) return "" ;
if ( is_string( $res ) ) return $res ;
if ( isset( $res['text'] ) ) return $res['text'] ;
// If we find no matches, assume we don't want a replacement, and replace it with itself
return $m[0] ;
}
function expand( $var ) {
return preg_replace_callback( '/\$(\w+)/', array( $this, 'callback' ), $var ) ;
}
function match( $var ) {
return $this->find( $var ) ;
}
}
/**
* We cache the last regex result. This is a low-cost optimization, because we have to do an un-anchored match + check match position anyway
* (alternative is to do an anchored match on a string cut with substr, but that is very slow for long strings). We then don't need to recheck
@ -133,23 +79,27 @@ class Parser {
return $this->regexps[$rx]->match() ;
}
function expand( $var, $substack, $result ) {
$cb = new Parser_ExpressionCallback( $this, $substack, $result ) ;
$v = preg_replace_callback( '/\$(\w+)/', array( $cb, 'callback' ), $var ) ;
print "Expanded var: $v" ;
return $v ;
}
function php( $var, $substack, $result ) {
$ex = $this->get_expression( $var, $substack, $result ) ;
print_r( $result ) ;
if ( is_string( $ex ) ) {
return ( preg_match( '{^\s*/}', $ex ) ? $this->rx( $ex ) : $this->literal( $ex ) ) ;
function expression( $result, $stack, $value ) {
$stack[] = $result; $rv = false;
/* Search backwards through the sub-expression stacks */
for ( $i = count($stack) - 1 ; $i >= 0 ; $i-- ) {
$node = $stack[$i];
if ( isset($node[$value]) ) { $rv = $node[$value]; break; }
foreach ($this->typestack($node['_matchrule']) as $type) {
$callback = array($this, "{$type}_DLR{$value}");
if ( is_callable( $callback ) ) { $rv = call_user_func( $callback ) ; if ($rv !== FALSE) break; }
}
}
return $ex ;
}
if ($rv === false) $rv = @$this->$value;
if ($rv === false) $rv = @$this->$value();
return is_array($rv) ? $rv['text'] : ($rv ? $rv : '');
}
function packhas( $key, $pos ) {
return false ;
}
@ -162,21 +112,33 @@ class Parser {
return $res ;
}
function construct( $name ) {
$result = array( 'type' => 'node', 'name' => $name, 'text' => '' ) ;
/**
 * Return the type stack recorded for a rule.
 *
 * Reads the instance property named `match_{$name}_typestack`.
 * NOTE(review): presumably this is the property the compiler emits next to
 * each generated match function — confirm against the generated parser.
 *
 * @param string $name Rule name as used in the property name.
 * @return mixed Whatever that property holds.
 */
function typestack( $name ) {
    return $this->{"match_{$name}_typestack"};
}
function construct( $matchrule, $name, $arguments = null ) {
$result = array( '_matchrule' => $matchrule, 'name' => $name, 'text' => '' );
if ($arguments) $result = array_merge($result, $arguments) ;
$callback = array( $this, "{$name}__construct" ) ;
if ( is_callable( $callback ) ) {
call_user_func_array( $callback, array( &$result ) ) ;
foreach ($this->typestack($matchrule) as $type) {
$callback = array( $this, "{$type}__construct" ) ;
if ( is_callable( $callback ) ) {
call_user_func_array( $callback, array( &$result ) ) ;
break;
}
}
return $result ;
}
function finalise( $name, &$result ) {
$callback = array( $this, "{$name}__finalise" ) ;
if ( is_callable( $callback ) ) {
call_user_func_array( $callback, array( &$result ) ) ;
function finalise( &$result ) {
foreach ($this->typestack($result['_matchrule']) as $type) {
$callback = array( $this, "{$type}__finalise" ) ;
if ( is_callable( $callback ) ) {
call_user_func_array( $callback, array( &$result ) ) ;
break;
}
}
return $result ;
@ -185,16 +147,23 @@ class Parser {
function store ( &$result, $subres, $storetag = NULL ) {
$result['text'] .= $subres['text'] ;
$globalcb = array( $this, "{$result['name']}_STR" ) ;
$callback = array( $this, $storetag ? "{$result['name']}_{$storetag}" : "{$result['name']}_{$subres['name']}" ) ;
$storecalled = false;
if ( is_callable( $callback ) ) {
call_user_func_array( $callback, array( &$result, $subres ) ) ;
}
elseif ( is_callable( $globalcb ) ) {
call_user_func_array( $globalcb, array( &$result, $subres ) ) ;
}
elseif ( $storetag ) {
foreach ($this->typestack($result['_matchrule']) as $type) {
$callback = array( $this, $storetag ? "{$type}_{$storetag}" : "{$type}_{$subres['name']}" ) ;
if ( is_callable( $callback ) ) {
call_user_func_array( $callback, array( &$result, $subres ) ) ;
$storecalled = true; break;
}
$globalcb = array( $this, "{$type}_STR" ) ;
if ( is_callable( $globalcb ) ) {
call_user_func_array( $globalcb, array( &$result, $subres ) ) ;
$storecalled = true; break;
}
}
if ( $storetag && !$storecalled ) {
if ( !isset( $result[$storetag] ) ) $result[$storetag] = $subres ;
else {
if ( isset( $result[$storetag]['text'] ) ) $result[$storetag] = array( $result[$storetag] ) ;

View File

@ -19,9 +19,6 @@ and lexing in a single top down grammar. For a basic overview of the subject, se
Parsers are contained within a PHP file, in one or more special comment blocks that start with `/*!* [name | !pragma]` (like a docblock, but with an
exclamation mark in the middle of the stars)
Lexically, these blocks are a set of rules, each consisting of a name token, a matching rule and a set of attached functions.
The name token must contain no whitespace and end with a `:` character. The matching rule and functions are on the same line or on the indented lines below.
You can have multiple comment blocks, all of which are treated as contiguous for the purpose of compiling. During compilation these blocks will be replaced
with a set of "matching" functions (functions which match a string against their rules) for each rule in the block.
@ -30,6 +27,30 @@ If unspecified, it defaults to the same name as the previous parser comment bloc
If the name starts with an '!' symbol, that comment block is a pragma, and is treated not as some part of the parser, but as a special block of meta-data
Lexically, these blocks are a set of rules & comments. A rule can be a base rule or an extension rule
##### Base rules
Base rules consist of a name for the rule, some optional arguments, the matching rule itself, and an optional set of attached functions
NAME ( "(" ARGUMENT, ... ")" )? ":" MATCHING_RULE
ATTACHED_FUNCTIONS?
Names must be the characters a-z, A-Z, 0-9 and _ only, and must not start with a number
Base rules can be split over multiple lines as long as subsequent lines are indented
##### Extension rules
Extension rules are either the same as a base rule but with the additional name of the rule to extend, or, as a replacing extension, consist of
a name for the rule, the name of the rule to extend, and optionally: some arguments, some replacements, and a set of attached functions
NAME extends BASENAME ( "(" ARGUMENT, ... ")" )? ":" MATCHING_RULE
ATTACHED_FUNCTIONS?
NAME extends BASENAME ( "(" ARGUMENT, ... ")" )? ( ";" REPLACE "=>" REPLACE_WITH, ... )?
ATTACHED_FUNCTIONS?
##### Tricks and traps
We allow indenting a parser block, but only in a consistent manner - whatever the indent of the /*!* marker is becomes the "base" indent, and needs to be used
@ -46,7 +67,7 @@ This might get looser if I get around to re-writing the internal "parser parser"
PEG matching rules try to follow standard PEG format, summarised thusly:
<pre>
<pre><code>
token* - Token is optionally repeated
token+ - Token is repeated at least once
token? - Token is optionally present
@ -62,7 +83,7 @@ PEG matching rules try to follow standard PEG format, summarised thusly:
But with these extensions:
<pre>
<pre><code>
< or > - Optionally match whitespace
[ or ] - Require some whitespace
</code></pre>
@ -87,32 +108,56 @@ just split with a space (as in / foo \s* /)
### Expressions
Expressions allow run-time calculated matching. You can embed an expression within a literal or regex token to
match against a calculated value, or simply specify the expression as a token to (optionally) internally handle matching
and generate a result.
match against a calculated value, or simply specify the expression as a token to match against a dynamic rule.
Expressions will try a variety of scopes to find a value. It will look for variables already set in the current result,
rule-attached functions and a variety of other functions and constants.
#### Expression stack
Tried in this order
When getting a value to use for an expression, the parser will travel up the stack looking for a set value. The expression
stack is a list of all the rules passed through to get to this point. For example, given the parser
- against current result
- against containing expression stack in order (for sub-expressions only)
- against parser instance as variable
- against parser instance as rule-attached method INCLUDING `$` ( i.e. `function $foo()` )
- against parser instance as method INCLUDING `$`
- as global method
- as constant
<pre><code>
A: $a
B: A
C: B
</code></pre>
The expression stack for finding $a will be C, B, A - in other words, the A rule will be checked first, followed by B, followed by C
##### Tricks and traps
#### In terminals (literals and regexes)
Be careful against matching against results
The token will be replaced by the looked up value. To find the value for the token, the expression stack will be
travelled up checking for one of the following:
- A key / value pair in the result array node
- A rule-attached method INCLUDING `$` ( i.e. `function $foo()` )
If no value is found it will then check if a method or a property excluding the $ exists on the parser. If neither of those is found
the expression will be replaced with an empty string.
#### As tokens
The token will be looked up to find a value, which must be the name of a matching rule. That rule will then be matched
against as if the token was a recurse token for that rule.
To find the name of the rule to match against, the expression stack will be travelled up checking for one of the following:
- A key / value pair in the result array node
- A rule-attached method INCLUDING `$` ( i.e. `function $foo()` )
If no value is found it will then check if a method or a property excluding the $ exists on the parser. If neither of those is found
the rule will fail to match.
#### Tricks and traps
Be careful not to use a token expression when you meant to use a terminal expression
<pre><code>
quoted_good: q:/['"]/ string "$q"
quoted_bad: q:/['"]/ string $q
</code></pre>
`"$q"` matches against the value of q again. `$q` simply returns the value of q, without doing any matching
`"$q"` matches against the value of q again. `$q` tries to match against a rule named `"` or `'` (both of which are illegal rule
names, and will therefore fail)
### Named matching rules
@ -149,16 +194,16 @@ All these definitions define the same rule-attached function
<pre><code>
class A extends Parser {
/**Parser
foo: bar baz
function bar() {}
* /
/*!* Parser
foo: bar baz
function bar() {}
*/
function foo_bar() {}
function foo_bar() {}
}
class B extends A {
function foo_bar() {}
function foo_bar() {}
}
</code></pre>
@ -206,6 +251,62 @@ You can also specify a rule-attached function called `*`, which will be called w
By default all matches are added to the 'text' property of a result. By prepending a member with `.` that match will not be added to the ['text'] member. This
doesn't affect the other result properties that named rules' add.
### Inheritance
Rules can inherit off other rules using the keyword extends. There are several ways to change the matching of the rule, but
they all share a common feature - when building a result set the rule will also check the inherited-from rule's rule-attached
functions for storage handlers. This lets you do something like
<pre><code>
A: Foo Bar Baz
function *(){ /* Generic store handler */ }
B extends A
function Bar(){ /* Custom handling for Bar - Foo and Baz will still fall through to the A#* function defined above */ }
</code></pre>
The actual matching rule can be specified in three ways:
#### Duplication
If you don't specify a new rule or a replacement set the matching rule is copied as is. This is useful when you want to
override some storage logic but not the rule itself
#### Text replacement
You can replace some parts of the inherited rule using text replacement by using a ';' instead of a ':' after the name
of the extended rule. You can then put replacements in a comma separated list. An example might help
<pre><code>
A: Foo | Bar | Baz
# Makes B the equivalent of Foo | Bar | (Baz | Qux)
B extends A; Baz => (Baz | Qux)
</code></pre>
Note that the replacements are not quoted. The exception is when you want to replace with the empty string, e.g.
<pre><code>
A: Foo | Bar | Baz
# Makes B the equivalent of Foo | Bar
B extends A; | Baz => ""
</code></pre>
Currently there is no escaping supported - if you want to replace "," or "=>" characters you'll have to use full replacement
#### Full replacement
You can specify an entirely new rule in the same format as a non-inheriting rule, eg.
<pre><code>
A: Foo | Bar | Baz
B extends A: Foo | Bar | (Baz Qux)
</code></pre>
This is useful if the rule changes too much for text replacement to be readable, but you want to keep the storage logic
### Pragmas
When opening a parser comment block, if instead of a name (or no name) you put a word starting with '!', that comment block is treated as a pragma - not
@ -225,6 +326,4 @@ part of the parser language itself, but some other instruction to the compiler.
- Allow inline-ing of rules into other rules for speed
- More optimisation
- Make Parser-parser be self-generated, instead of a bad hand rolled parser like it is now.
- Slighly more powerfull expressions: `${parent.q}`, `${foo()->bar}`, etc.
- Need to properly escape all literals. Expressions currently need to be in '', not ""
- PHP token parser, and other token streams, instead of strings only like now

View File

@ -0,0 +1,123 @@
<?php
require_once "ParserTestBase.php";
/**
 * Tests for rule inheritance (`Bar extends Foo`).
 *
 * Inside each grammar string the rule lines share the indent of the opening
 * comment marker, and rule-attached functions are indented one level further.
 * That extra indent is significant: the compiler only attaches a line to the
 * preceding rule when it is indented relative to the block's base indent.
 */
class ParserInheritanceTest extends ParserTestBase {

    /**
     * A rule that extends another without a rule of its own matches the same input.
     */
    public function testBasicInheritance() {
        $parser = $this->buildParser('
            /*!* BasicInheritanceTestParser
            Foo: "a"
            Bar extends Foo
            */
        ');

        $this->assertTrue($parser->matches('Foo', 'a'));
        $this->assertTrue($parser->matches('Bar', 'a'));

        $this->assertFalse($parser->matches('Foo', 'b'));
        $this->assertFalse($parser->matches('Bar', 'b'));
    }

    /**
     * An extending rule falls back to the parent's __construct handler, unless
     * it defines its own, in which case only its own handler runs.
     */
    public function testBasicInheritanceConstructFallback() {
        $parser = $this->buildParser('
            /*!* BasicInheritanceConstructFallbackParser
            Foo: "a"
                function __construct(&$res){ $res["test"] = "test"; }
            Bar extends Foo
            */
        ');

        $res = $parser->match('Foo', 'a');
        $this->assertEquals('test', $res['test']);

        $res = $parser->match('Bar', 'a');
        $this->assertEquals('test', $res['test']);

        $parser = $this->buildParser('
            /*!* BasicInheritanceConstructFallbackParser2
            Foo: "a"
                function __construct(&$res){ $res["testa"] = "testa"; }
            Bar extends Foo
                function __construct(&$res){ $res["testb"] = "testb"; }
            */
        ');

        $res = $parser->match('Foo', 'a');
        $this->assertArrayHasKey('testa', $res);
        $this->assertEquals('testa', $res['testa']);
        $this->assertArrayNotHasKey('testb', $res);

        $res = $parser->match('Bar', 'a');
        $this->assertArrayHasKey('testb', $res);
        $this->assertEquals('testb', $res['testb']);
        $this->assertArrayNotHasKey('testa', $res);
    }

    /**
     * Store handlers are looked up on the extending rule first and then on the
     * rule it extends, for each stored sub-match.
     */
    public function testBasicInheritanceStoreFallback() {
        $parser = $this->buildParser('
            /*!* BasicInheritanceStoreFallbackParser
            Foo: Pow:"a"
                function *(&$res, $sub){ $res["test"] = "test"; }
            Bar extends Foo
            */
        ');

        $res = $parser->match('Foo', 'a');
        $this->assertEquals('test', $res['test']);

        $res = $parser->match('Bar', 'a');
        $this->assertEquals('test', $res['test']);

        $parser = $this->buildParser('
            /*!* BasicInheritanceStoreFallbackParser2
            Foo: Pow:"a" Zap:"b"
                function *(&$res, $sub){ $res["testa"] = "testa"; }
            Bar extends Foo
                function *(&$res, $sub){ $res["testb"] = "testb"; }
            Baz extends Foo
                function Zap(&$res, $sub){ $res["testc"] = "testc"; }
            */
        ');

        $res = $parser->match('Foo', 'ab');
        $this->assertArrayHasKey('testa', $res);
        $this->assertEquals('testa', $res['testa']);
        $this->assertArrayNotHasKey('testb', $res);

        $res = $parser->match('Bar', 'ab');
        $this->assertArrayHasKey('testb', $res);
        $this->assertEquals('testb', $res['testb']);
        $this->assertArrayNotHasKey('testa', $res);

        // Baz only handles Zap itself; Pow falls through to Foo's generic handler.
        $res = $parser->match('Baz', 'ab');
        $this->assertArrayHasKey('testa', $res);
        $this->assertEquals('testa', $res['testa']);
        $this->assertArrayHasKey('testc', $res);
        $this->assertEquals('testc', $res['testc']);
        $this->assertArrayNotHasKey('testb', $res);
    }

    /**
     * `;` after the extended rule's name introduces text replacements that are
     * applied to the inherited matching rule.
     */
    public function testInheritanceByReplacement() {
        $parser = $this->buildParser('
            /*!* InheritanceByReplacementParser
            A: "a"
            B: "b"
            Foo: A B
            Bar extends Foo; B => A
            Baz extends Foo; A => ""
            */
        ');

        $parser->assertMatches('Foo', 'ab');
        $parser->assertMatches('Bar', 'aa');
        $parser->assertMatches('Baz', 'b');
    }
}

View File

@ -0,0 +1,26 @@
<?php
require_once "ParserTestBase.php";
/**
 * Tests for the lexical layout of rules inside a parser comment block.
 */
class ParserSyntaxTest extends ParserTestBase {

    /**
     * A matching rule may sit on the rule's own line, be continued on indented
     * lines below it, or start on the line after the name. All four layouts
     * must compile to a rule that matches "ab".
     *
     * The continuation lines are indented relative to the rule names on
     * purpose: a non-indented line would start a new rule instead.
     */
    public function testBasicRuleSyntax() {
        $parser = $this->buildParser('
            /*!* BasicRuleSyntax
            Foo: "a" "b"
            Bar: "a"
                "b"
            Baz:
                "a" "b"
            Qux:
                "a"
                "b"
            */
        ');

        $parser->assertMatches('Foo', 'ab');
        $parser->assertMatches('Bar', 'ab');
        $parser->assertMatches('Baz', 'ab');
        $parser->assertMatches('Qux', 'ab');
    }
}

View File

@ -0,0 +1,48 @@
<?php
$base = dirname(dirname(__FILE__));
include "$base/Compiler.php";
include "$base/Parser.php";
/**
 * Convenience wrapper around a compiled parser class, for use in tests.
 *
 * A fresh parser instance is created for every match, so no state is shared
 * between calls.
 */
class ParserTestWrapper {

    /** @var object Test case whose assertTrue() / assertFalse() the assert* helpers call. */
    public $testcase;

    /** @var string Name of the parser class to instantiate. */
    public $class;

    /**
     * @param object $testcase Object providing assertTrue() and assertFalse().
     * @param string $class    Name of the parser class to instantiate for each match.
     */
    function __construct($testcase, $class){
        $this->testcase = $testcase;
        $this->class = $class;
    }

    /**
     * Run the `match_{$method}` function of a new parser over $string.
     *
     * @param string $method       Rule name, without the `match_` prefix.
     * @param string $string       Input handed to the parser constructor.
     * @param bool   $allowPartial When false, a match that stops before the end
     *                             of $string is reported as a failure.
     * @return mixed The matcher's result, or false when the whole string had to
     *               be consumed but was not.
     */
    function match($method, $string, $allowPartial = false){
        $class = $this->class;
        $func = 'match_'.$method;

        $parser = new $class($string);
        $res = $parser->$func();
        return ($allowPartial || $parser->pos == strlen($string)) ? $res : false;
    }

    /**
     * Boolean form of match().
     *
     * @param string $method       Rule name, without the `match_` prefix.
     * @param string $string       Input to parse.
     * @param bool   $allowPartial See match().
     * @return bool True unless match() returned exactly false.
     */
    function matches($method, $string, $allowPartial = false){
        return $this->match($method, $string, $allowPartial) !== false;
    }

    /**
     * Assert, through the wrapped test case, that the rule matches all of $string.
     *
     * @param string      $method  Rule name, without the `match_` prefix.
     * @param string      $string  Input to parse.
     * @param string|null $message Failure message; a default is generated when empty.
     */
    function assertMatches($method, $string, $message = null){
        $this->testcase->assertTrue($this->matches($method, $string), $message ? $message : "Assert parser method $method matches string $string");
    }

    /**
     * Assert, through the wrapped test case, that the rule does not match all of $string.
     *
     * @param string      $method  Rule name, without the `match_` prefix.
     * @param string      $string  Input to parse.
     * @param string|null $message Failure message; a default is generated when empty.
     */
    function assertDoesntMatch($method, $string, $message = null){
        $this->testcase->assertFalse($this->matches($method, $string), $message ? $message : "Assert parser method $method doesn't match string $string");
    }
}
/**
 * Base class for the parser tests: compiles a grammar into a throw-away
 * parser class and hands back a wrapper for matching against it.
 */
class ParserTestBase extends PHPUnit_Framework_TestCase {

    /**
     * Compile a parser comment block into a new Parser subclass.
     *
     * The class name is derived from the SHA-1 of the grammar text. The grammar
     * is compiled once and that single result is both echoed (for inspection)
     * and evaluated, so what is printed is exactly what runs. A grammar whose
     * class has already been defined is not compiled again, because declaring
     * the same class twice is a fatal error.
     *
     * @param string $parser Source containing the parser comment block.
     * @return ParserTestWrapper Wrapper bound to this test case and the generated class.
     */
    function buildParser($parser) {
        $class = 'Parser'.sha1($parser);

        if (!class_exists($class, false)) {
            $code = ParserCompiler::compile("class $class extends Parser {\n $parser\n}");

            echo $code . "\n\n\n";
            eval($code);
        }

        return new ParserTestWrapper($this, $class);
    }
}

View File

@ -0,0 +1,55 @@
<?php
require_once "ParserTestBase.php";
/**
 * Tests for `$name` expressions, both embedded in literals and used as tokens.
 *
 * Inside each grammar string the rule lines share the indent of the opening
 * comment marker, and rule-attached functions are indented one level further
 * so that the compiler attaches them to the rule above.
 */
class ParserVariablesTest extends ParserTestBase {

    /**
     * `$Letter` and `{$Letter}` inside a literal are replaced by the text of
     * the earlier match tagged `Letter`.
     */
    public function testBasicLiteralVariables() {
        $parser = $this->buildParser('
            /*!* BasicVariables
            Foo: Letter:"a" "$Letter"
            Bar: Letter:"b" "$Letter $Letter"
            Baz: Letter:"c" "$Letter a $Letter a"
            Qux: Letter:"d" "{$Letter}a{$Letter}a"
            */
        ');

        $parser->assertMatches('Foo', 'aa');
        $parser->assertMatches('Bar', 'bb b');
        $parser->assertMatches('Baz', 'cc a c a');
        $parser->assertMatches('Qux', 'ddada');
    }

    /**
     * A bare `$Template` token matches against the rule whose name is found by
     * looking up `Template`; here each outer rule sets it in __construct.
     */
    public function testRecurseOnVariables() {
        $parser = $this->buildParser('
            /*!* RecurseOnVariablesParser
            A: "a"
            B: "b"
            Foo: $Template
            Bar: Foo
                function __construct(&$res){ $res["Template"] = "A"; }
            Baz: Foo
                function __construct(&$res){ $res["Template"] = "B"; }
            */
        ');

        $parser->assertMatches('Bar', 'a');
        $parser->assertDoesntMatch('Bar', 'b');

        $parser->assertMatches('Baz', 'b');
        $parser->assertDoesntMatch('Baz', 'a');
    }

    /**
     * The same lookup works when the value is supplied as a rule argument,
     * `Bar (Template = A)`, instead of through a __construct handler.
     */
    public function testSetOnRuleVariables() {
        $parser = $this->buildParser('
            /*!* SetOnRuleVariablesParser
            A: "a"
            B: "b"
            Foo: $Template
            Bar (Template = A): Foo
            Baz (Template = B): Foo
            */
        ');

        $parser->assertMatches('Bar', 'a');
        $parser->assertMatches('Baz', 'b');
    }
}