From b1203363d432105a59a5118cf04e09ea90dc128e Mon Sep 17 00:00:00 2001 From: Torsten Ruger Date: Sun, 27 Apr 2014 15:34:35 +0300 Subject: [PATCH] vendored parslet, deemed stable enough and better without dependency --- lib/crystal.rb | 2 - lib/parslet.rb | 302 +++++++++++++++++++++++++ lib/parslet/accelerator.rb | 161 +++++++++++++ lib/parslet/accelerator/application.rb | 62 +++++ lib/parslet/accelerator/engine.rb | 112 +++++++++ lib/parslet/atoms.rb | 35 +++ lib/parslet/atoms/alternative.rb | 50 ++++ lib/parslet/atoms/base.rb | 151 +++++++++++++ lib/parslet/atoms/can_flatten.rb | 137 +++++++++++ lib/parslet/atoms/capture.rb | 38 ++++ lib/parslet/atoms/context.rb | 91 ++++++++ lib/parslet/atoms/dsl.rb | 109 +++++++++ lib/parslet/atoms/dynamic.rb | 32 +++ lib/parslet/atoms/entity.rb | 41 ++++ lib/parslet/atoms/infix.rb | 121 ++++++++++ lib/parslet/atoms/lookahead.rb | 49 ++++ lib/parslet/atoms/named.rb | 32 +++ lib/parslet/atoms/re.rb | 38 ++++ lib/parslet/atoms/repetition.rb | 83 +++++++ lib/parslet/atoms/scope.rb | 26 +++ lib/parslet/atoms/sequence.rb | 45 ++++ lib/parslet/atoms/str.rb | 39 ++++ lib/parslet/atoms/visitor.rb | 89 ++++++++ lib/parslet/cause.rb | 94 ++++++++ lib/parslet/context.rb | 33 +++ lib/parslet/convenience.rb | 33 +++ lib/parslet/error_reporter.rb | 7 + lib/parslet/error_reporter/deepest.rb | 95 ++++++++ lib/parslet/error_reporter/tree.rb | 57 +++++ lib/parslet/export.rb | 162 +++++++++++++ lib/parslet/expression.rb | 51 +++++ lib/parslet/expression/treetop.rb | 92 ++++++++ lib/parslet/graphviz.rb | 97 ++++++++ lib/parslet/parser.rb | 67 ++++++ lib/parslet/pattern.rb | 114 ++++++++++ lib/parslet/pattern/binding.rb | 49 ++++ lib/parslet/rig/rspec.rb | 59 +++++ lib/parslet/scope.rb | 42 ++++ lib/parslet/slice.rb | 101 +++++++++ lib/parslet/source.rb | 87 +++++++ lib/parslet/source/line_cache.rb | 96 ++++++++ lib/parslet/transform.rb | 236 +++++++++++++++++++ 42 files changed, 3415 insertions(+), 2 deletions(-) create mode 100644 lib/parslet.rb 
create mode 100644 lib/parslet/accelerator.rb create mode 100644 lib/parslet/accelerator/application.rb create mode 100644 lib/parslet/accelerator/engine.rb create mode 100644 lib/parslet/atoms.rb create mode 100644 lib/parslet/atoms/alternative.rb create mode 100644 lib/parslet/atoms/base.rb create mode 100644 lib/parslet/atoms/can_flatten.rb create mode 100644 lib/parslet/atoms/capture.rb create mode 100644 lib/parslet/atoms/context.rb create mode 100644 lib/parslet/atoms/dsl.rb create mode 100644 lib/parslet/atoms/dynamic.rb create mode 100644 lib/parslet/atoms/entity.rb create mode 100644 lib/parslet/atoms/infix.rb create mode 100644 lib/parslet/atoms/lookahead.rb create mode 100644 lib/parslet/atoms/named.rb create mode 100644 lib/parslet/atoms/re.rb create mode 100644 lib/parslet/atoms/repetition.rb create mode 100644 lib/parslet/atoms/scope.rb create mode 100644 lib/parslet/atoms/sequence.rb create mode 100644 lib/parslet/atoms/str.rb create mode 100644 lib/parslet/atoms/visitor.rb create mode 100644 lib/parslet/cause.rb create mode 100644 lib/parslet/context.rb create mode 100644 lib/parslet/convenience.rb create mode 100644 lib/parslet/error_reporter.rb create mode 100644 lib/parslet/error_reporter/deepest.rb create mode 100644 lib/parslet/error_reporter/tree.rb create mode 100644 lib/parslet/export.rb create mode 100644 lib/parslet/expression.rb create mode 100644 lib/parslet/expression/treetop.rb create mode 100644 lib/parslet/graphviz.rb create mode 100644 lib/parslet/parser.rb create mode 100644 lib/parslet/pattern.rb create mode 100644 lib/parslet/pattern/binding.rb create mode 100644 lib/parslet/rig/rspec.rb create mode 100644 lib/parslet/scope.rb create mode 100644 lib/parslet/slice.rb create mode 100644 lib/parslet/source.rb create mode 100644 lib/parslet/source/line_cache.rb create mode 100644 lib/parslet/transform.rb diff --git a/lib/crystal.rb b/lib/crystal.rb index 9cc2169f..9911681c 100644 --- a/lib/crystal.rb +++ b/lib/crystal.rb @@ -1,5 +1,3 
@@ -# parslet is assumed to be checked out at the same level as crystal for now -$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', ".." , "parslet",'lib')) require 'parslet' require "asm/program" diff --git a/lib/parslet.rb b/lib/parslet.rb new file mode 100644 index 00000000..2fef5ce9 --- /dev/null +++ b/lib/parslet.rb @@ -0,0 +1,302 @@ +# A simple parser generator library. Typical usage would look like this: +# +# require 'parslet' +# +# class MyParser < Parslet::Parser +# rule(:a) { str('a').repeat } +# root(:a) +# end +# +# pp MyParser.new.parse('aaaa') # => 'aaaa'@0 +# pp MyParser.new.parse('bbbb') # => Parslet::Atoms::ParseFailed: +# # Don't know what to do with bbbb at line 1 char 1. +# +# The simple DSL allows you to define grammars in PEG-style. This kind of +# grammar construction does away with the ambiguities that usually comes with +# parsers; instead, it allows you to construct grammars that are easier to +# debug, since less magic is involved. +# +# Parslet is typically used in stages: +# +# +# * Parsing the input string; this yields an intermediary tree, see +# Parslet.any, Parslet.match, Parslet.str, Parslet::ClassMethods#rule and +# Parslet::ClassMethods#root. +# * Transformation of the tree into something useful to you, see +# Parslet::Transform, Parslet.simple, Parslet.sequence and Parslet.subtree. +# +# The first stage is traditionally intermingled with the second stage; output +# from the second stage is usually called the 'Abstract Syntax Tree' or AST. +# +# The stages are completely decoupled; You can change your grammar around and +# use the second stage to isolate the rest of your code from the changes +# you've effected. +# +# == Further reading +# +# All parslet atoms are subclasses of {Parslet::Atoms::Base}. You might want to +# look at all of those: {Parslet::Atoms::Re}, {Parslet::Atoms::Str}, +# {Parslet::Atoms::Repetition}, {Parslet::Atoms::Sequence}, +# {Parslet::Atoms::Alternative}. 
+# +# == When things go wrong +# +# A parse that fails will raise {Parslet::ParseFailed}. This exception contains +# all the details of what went wrong, including a detailed error trace that +# can be printed out as an ascii tree. ({Parslet::Cause}) +# +module Parslet + # Extends classes that include Parslet with the module + # {Parslet::ClassMethods}. + # + def self.included(base) + base.extend(ClassMethods) + end + + # Raised when the parse failed to match. It contains the message that should + # be presented to the user. More details can be extracted from the + # exceptions #cause member: It contains an instance of {Parslet::Cause} that + # stores all the details of your failed parse in a tree structure. + # + # begin + # parslet.parse(str) + # rescue Parslet::ParseFailed => failure + # puts failure.cause.ascii_tree + # end + # + # Alternatively, you can just require 'parslet/convenience' and call the + # method #parse_with_debug instead of #parse. This method will never raise + # and print error trees to stdout. + # + # require 'parslet/convenience' + # parslet.parse_with_debug(str) + # + class ParseFailed < StandardError + def initialize(message, cause=nil) + super(message) + @cause = cause + end + + # Why the parse failed. + # + # @return [Parslet::Cause] + attr_reader :cause + end + + module ClassMethods + # Define an entity for the parser. This generates a method of the same + # name that can be used as part of other patterns. Those methods can be + # freely mixed in your parser class with real ruby methods. + # + # class MyParser + # include Parslet + # + # rule(:bar) { str('bar') } + # rule(:twobar) do + # bar >> bar + # end + # + # root :twobar + # end + # + def rule(name, &definition) + define_method(name) do + @rules ||= {} # memoization + return @rules[name] if @rules.has_key?(name) + + # Capture the self of the parser class along with the definition. 
+ definition_closure = proc { + self.instance_eval(&definition) + } + + @rules[name] = Atoms::Entity.new(name, &definition_closure) + end + end + end + + # Allows for delayed construction of #match. See also Parslet.match. + # + # @api private + class DelayedMatchConstructor + def [](str) + Atoms::Re.new("[" + str + "]") + end + end + + # Returns an atom matching a character class. All regular expressions can be + # used, as long as they match only a single character at a time. + # + # match('[ab]') # will match either 'a' or 'b' + # match('[\n\s]') # will match newlines and spaces + # + # There is also another (convenience) form of this method: + # + # match['a-z'] # synonymous to match('[a-z]') + # match['\n'] # synonymous to match('[\n]') + # + # @overload match(str) + # @param str [String] character class to match (regexp syntax) + # @return [Parslet::Atoms::Re] a parslet atom + # + def match(str=nil) + return DelayedMatchConstructor.new unless str + + return Atoms::Re.new(str) + end + module_function :match + + # Returns an atom matching the +str+ given: + # + # str('class') # will match 'class' + # + # @param str [String] string to match verbatim + # @return [Parslet::Atoms::Str] a parslet atom + # + def str(str) + Atoms::Str.new(str) + end + module_function :str + + # Returns an atom matching any character. It acts like the '.' (dot) + # character in regular expressions. + # + # any.parse('a') # => 'a' + # + # @return [Parslet::Atoms::Re] a parslet atom + # + def any + Atoms::Re.new('.') + end + module_function :any + + # Introduces a new capture scope. This means that all old captures stay + # accessible, but new values stored will only be available during the block + # given and the old values will be restored after the block. + # + # Example: + # # :a will be available until the end of the block. Afterwards, + # # :a from the outer scope will be available again, if such a thing + # # exists. 
#   scope { str('a').capture(:a) }
#
# @return [Parslet::Atoms::Scope] atom constructed from the given block
#
def scope(&body)
  # The block is handed over as an object; it is not called here.
  Parslet::Atoms::Scope.new(body)
end
module_function :scope

# Marks a piece of the parser as dynamic. A dynamic piece can either return
# a parser at runtime, which is then applied to the input, or return the
# result of a parse directly.
#
# Dynamic pieces are never cached and can introduce performance
# abnormalities - use them sparingly, where other constructs fail.
#
# Example:
#   # Parses either 'a' or 'b', depending on the weather
#   dynamic { rand() < 0.5 ? str('a') : str('b') }
#
# @return [Parslet::Atoms::Dynamic] atom constructed from the given block
#
def dynamic(&body)
  # As with #scope, the block is only stored in the new atom.
  Parslet::Atoms::Dynamic.new(body)
end
module_function :dynamic

# Returns a parslet atom that parses infix expressions. Operations are
# specified as a list of (atom, precedence, associativity) tuples, where
# atom is simply the parslet atom that matches an operator, precedence is
# a number and associativity is either :left or :right.
#
# Higher precedence indicates that the operation should bind tighter than
# other operations with lower precedence. In common algebra, '+' has
# lower precedence than '*'. So you would have a precedence of 1 for '+' and
# a precedence of 2 for '*'. Only the order relation between these two
# counts, so any number would work.
#
# Associativity is what decides what interpretation to take for strings that
# are ambiguous like '1 + 2 + 3'. If '+' is specified as left associative,
# the expression would be interpreted as '(1 + 2) + 3'. If right
# associativity is chosen, it would be interpreted as '1 + (2 + 3)'. Note
# that the hash trees output reflect that choice as well.
#
# Example:
#   infix_expression(integer, [add_op, 1, :left])
#   # would parse things like '1 + 2'
#
# @param element [Parslet::Atoms::Base] elements that take the NUMBER position
#    in the expression
# @param operations [Array<(Parslet::Atoms::Base, Integer, {:left, :right})>]
#
# @see Parslet::Atoms::Infix
#
def infix_expression(element, *operations)
  # The splatted operations are forwarded as one array argument.
  infix_atom = Parslet::Atoms::Infix.new(element, operations)
  infix_atom
end
module_function :infix_expression

# Embeds a whole treetop expression into a parslet construction.
#
#   # the same as str('a') >> str('b').maybe
#   exp(%Q("a" "b"?))
#
# @param str [String] a treetop expression
# @return [Parslet::Atoms::Base] the corresponding parslet parser
#
def exp(str)
  expression = Parslet::Expression.new(str)
  expression.to_parslet
end
module_function :exp

# Placeholder for a tree transformation pattern that only matches a sequence
# of elements. The +symbol+ given becomes the key under which the matched
# sequence appears in the returned dictionary.
#
#   # This would match a body element that contains several declarations.
#   { :body => sequence(:declarations) }
#
# The above example would match :body => ['a', 'b'], but not
# :body => 'a'.
#
# see {Parslet::Transform}
#
def sequence(symbol)
  binding_pattern = Pattern::SequenceBind.new(symbol)
  binding_pattern
end
module_function :sequence

# Placeholder for a tree transformation pattern that only matches simple
# elements, i.e. everything that #sequence doesn't match.
#
#   # Matches a single header.
#   { :header => simple(:header) }
#
# see {Parslet::Transform}
#
def simple(symbol)
  binding_pattern = Pattern::SimpleBind.new(symbol)
  binding_pattern
end
module_function :simple

# Returns a placeholder for tree transformation patterns that will match
# any kind of subtree.
+ # + # { :expression => subtree(:exp) } + # + def subtree(symbol) + Pattern::SubtreeBind.new(symbol) + end + module_function :subtree + + autoload :Expression, 'parslet/expression' +end + +require 'parslet/slice' +require 'parslet/cause' +require 'parslet/source' +require 'parslet/atoms' +require 'parslet/pattern' +require 'parslet/pattern/binding' +require 'parslet/transform' +require 'parslet/parser' +require 'parslet/error_reporter' +require 'parslet/scope' \ No newline at end of file diff --git a/lib/parslet/accelerator.rb b/lib/parslet/accelerator.rb new file mode 100644 index 00000000..79987d33 --- /dev/null +++ b/lib/parslet/accelerator.rb @@ -0,0 +1,161 @@ + + +# Optimizes the parsers by pattern matching on the parser atoms and replacing +# matches with better versions. See the file qed/accelerators.md for a more +# in-depth description. +# +# Example: +# quote = str('"') +# parser = quote >> (quote.absent? >> any).repeat >> quote +# +# A = Accelerator # for making what follows a bit shorter +# optimized_parser = A.apply(parser, +# A.rule( (A.str(:x).absent? >> A.any).repeat ) { GobbleUp.new(x) }) +# +# optimized_parser.parse('"Parsing is now fully optimized! (tm)"') +# +module Parslet::Accelerator + + # An expression to match against a tree of parser atoms. Normally, an + # expression is produced by Parslet::Accelerator.any, + # Parslet::Accelerator.str or Parslet::Accelerator.re. + # + # Expressions can be chained much like parslet atoms can be: + # + # expr.repeat(1) # matching repetition + # expr.absent? # matching absent? + # expr.present? # matching present? 
#   expr1 >> expr2    # matching a sequence
#   expr1 | expr2     # matching an alternation
#
# @see Parslet::Accelerator.str
# @see Parslet::Accelerator.re
# @see Parslet::Accelerator.any
#
# @see Parslet::Accelerator
#
class Expression
  # @return [Object] tag identifying the node kind; the methods of this
  #   class produce :seq, :alt, :absent, :present, :rep and :as
  attr_reader :type

  # @return [Array] arguments stored alongside the tag, in the order given
  attr_reader :args

  # @param type [Object] tag for this node
  # @param args [Array] arguments that belong to the tag
  def initialize(type, *args)
    @type = type
    @args = args
  end

  # Sequence of this expression followed by +other_expr+.
  #
  # @return [Expression]
  def >>(other_expr)
    join_or_new(:seq, other_expr)
  end

  # Alternation between this expression and +other_expr+.
  #
  # @return [Expression]
  def |(other_expr)
    join_or_new(:alt, other_expr)
  end

  # @return [Expression] an :absent node wrapping this expression
  def absent?
    Expression.new(:absent, self)
  end

  # @return [Expression] a :present node wrapping this expression
  def present?
    Expression.new(:present, self)
  end

  # @param min [Integer] lower repetition bound stored in the node
  # @param max [Integer, nil] upper repetition bound stored in the node
  # @return [Expression] a :rep node holding min, max and this expression
  def repeat(min = 0, max = nil)
    Expression.new(:rep, min, max, self)
  end

  # Note that only +name+ is stored in the resulting node; the receiver is
  # not embedded in it.
  #
  # @return [Expression] an :as node holding +name+
  def as(name)
    Expression.new(:as, name)
  end

  # Combines this expression with +other_expr+ under +tag+. If this
  # expression already carries +tag+, the result is one flat node holding
  # all previous arguments plus +other_expr+; otherwise a new two-argument
  # node is created.
  #
  # @api private
  # @return [Expression]
  def join_or_new(tag, other_expr)
    if type == tag
      # Build a fresh node rather than doing `@args << other_expr`: Array#<<
      # evaluates to the array, so the method would hand back an Array
      # instead of an Expression, and the left operand would be mutated.
      Expression.new(tag, *args, other_expr)
    else
      Expression.new(tag, self, other_expr)
    end
  end
end

module_function
# Returns a match expression that will match `str` parslet atoms.
#
# @param variable [Object] first argument stored in the :str node
# @param constraints [Array] further arguments stored in the node
# @return [Parslet::Accelerator::Expression]
#
def str(variable, *constraints)
  Expression.new(:str, variable, *constraints)
end

# Returns a match expression that will match `match` parslet atoms.
#
# @param variable [Object] first argument stored in the :re node
# @param constraints [Array] further arguments stored in the node
# @return [Parslet::Accelerator::Expression]
#
def re(variable, *constraints)
  Expression.new(:re, variable, *constraints)
end

# Returns a match expression that will match `any` parslet atoms. This is
# the same node that `re(".")` builds.
#
# @return [Parslet::Accelerator::Expression]
#
def any
  Expression.new(:re, ".")
end

# Given a parslet atom and an expression, will determine if the expression
# matches the atom. If successful, returns the bindings into the pattern
# that were made. If no bindings had to be made to make the match successful,
# the empty hash is returned.
#
# @param atom [Parslet::Atoms::Base] parslet atom to match against
# @param expr [Parslet::Accelerator::Expression] expression to match
# @return [nil, Hash] bindings for the match, nil on failure
#
def match(atom, expr)
  matcher = Engine.new

  # A falsy answer from the engine means "no match", reported as nil.
  matcher.match(atom, expr) ? matcher.bindings : nil
end

# Builds an accelerator rule: the pairing of a matching expression with the
# code to run once that expression could be bound to a parser.
#
# Example:
#   Accelerator.rule(Accelerator.any) { Parslet.match('.') }
#
# @return [Array] two-element array of the expression and the action block
#
def rule(expression, &action)
  [expression, action]
end

# Given a parslet atom and a set of rules, tries to match the rules
# recursively through the parslet atom. Once a rule could be matched,
# its action block will be called.
#
# Example:
#   quote = str('"')
#   parser = quote >> (quote.absent? >> any).repeat >> quote
#
#   A = Accelerator # for making what follows a bit shorter
#   optimized_parser = A.apply(parser,
#     A.rule( (A.str(:x).absent? >> A.any).repeat ) { GobbleUp.new(x) })
#
#   optimized_parser.parse('"Parsing is now fully optimized!
(tm)"') + # + # @param atom [Parslet::Atoms::Base] a parser to optimize + # @param *rules [Parslet::Accelerator::Rule] rules produced by .rule + # @return [Parslet::Atoms::Base] optimized parser + # + def apply atom, *rules + Application.new(atom, rules).call + end +end + +require 'parslet/accelerator/engine' +require 'parslet/accelerator/application' \ No newline at end of file diff --git a/lib/parslet/accelerator/application.rb b/lib/parslet/accelerator/application.rb new file mode 100644 index 00000000..8015cc89 --- /dev/null +++ b/lib/parslet/accelerator/application.rb @@ -0,0 +1,62 @@ + +# @api private +module Parslet::Accelerator + class Application + def initialize atom, rules + @atom = atom + @rules = rules + end + + def call + @atom.accept(self) + end + + def visit_parser(root) + transform root.accept(self) + end + def visit_entity(name, block) + transform Parslet::Atoms::Entity.new(name) { block.call.accept(self) } + end + def visit_named(name, atom) + transform Parslet::Atoms::Named.new(atom.accept(self), name) + end + def visit_repetition(tag, min, max, atom) + transform Parslet::Atoms::Repetition.new(atom.accept(self), min, max, tag) + end + def visit_alternative(alternatives) + transform Parslet::Atoms::Alternative.new( + *alternatives.map { |atom| atom.accept(self) }) + end + def visit_sequence(sequence) + transform Parslet::Atoms::Sequence.new( + *sequence.map { |atom| atom.accept(self) }) + end + def visit_lookahead(positive, atom) + transform Parslet::Atoms::Lookahead.new(atom, positive) + end + def visit_re(regexp) + transform Parslet::Atoms::Re.new(regexp) + end + def visit_str(str) + transform Parslet::Atoms::Str.new(str) + end + + def transform atom + @rules.each do |expr, action| + # Try and match each rule in turn + binding = Parslet::Accelerator.match(atom, expr) + if binding + # On a successful match, allow the rule action to transform the + # parslet into something new. 
+ ctx = Parslet::Context.new(binding) + return ctx.instance_eval(&action) + end + end # rules.each + + # If no rule matches, this is the fallback - a clean new parslet atom. + return atom + end + end +end + +require 'parslet/context' \ No newline at end of file diff --git a/lib/parslet/accelerator/engine.rb b/lib/parslet/accelerator/engine.rb new file mode 100644 index 00000000..1f081812 --- /dev/null +++ b/lib/parslet/accelerator/engine.rb @@ -0,0 +1,112 @@ + +require 'parslet/atoms/visitor' + +module Parslet::Accelerator + # @api private + class Apply + def initialize(engine, expr) + @engine = engine + @expr = expr + end + + def visit_parser(root) + false + end + def visit_entity(name, block) + false + end + def visit_named(name, atom) + match(:as) do |key| + @engine.try_bind(key, name) + end + end + def visit_repetition(tag, min, max, atom) + match(:rep) do |e_min, e_max, expr| + e_min == min && e_max == max && @engine.match(atom, expr) + end + end + def visit_alternative(alternatives) + match(:alt) do |*expressions| + return false if alternatives.size != expressions.size + + alternatives.zip(expressions).all? do |atom, expr| + @engine.match(atom, expr) + end + end + end + def visit_sequence(sequence) + match(:seq) do |*expressions| + return false if sequence.size != expressions.size + + sequence.zip(expressions).all? do |atom, expr| + @engine.match(atom, expr) + end + end + end + def visit_lookahead(positive, atom) + match(:absent) do |expr| + return positive == false && @engine.match(atom, expr) + end + match(:present) do |expr| + return positive == true && @engine.match(atom, expr) + end + end + def visit_re(regexp) + match(:re) do |*bind_conditions| + bind_conditions.all? { |bind_cond| + @engine.try_bind(bind_cond, regexp) } + end + end + def visit_str(str) + match(:str) do |*bind_conditions| + bind_conditions.all? 
{ |bind_cond| + @engine.try_bind(bind_cond, str) } + end + end + + def match(type_tag) + expr_tag = @expr.type + if expr_tag == type_tag + yield *@expr.args + end + end + end + + # @api private + class Engine + attr_reader :bindings + + def initialize + @bindings = {} + end + + def match(atom, expr) + atom.accept( + Apply.new(self, expr)) + end + + def try_bind(variable, value) + if bound? variable + return value == lookup(variable) + else + case variable + when Symbol + bind(variable, value) + else + # This does not look like a variable - let's try matching it against + # the value: + variable === value + end + end + end + def bound? var + @bindings.has_key? var + end + def lookup var + @bindings[var] + end + def bind var, val + @bindings[var] = val + end + end +end \ No newline at end of file diff --git a/lib/parslet/atoms.rb b/lib/parslet/atoms.rb new file mode 100644 index 00000000..5a4222c6 --- /dev/null +++ b/lib/parslet/atoms.rb @@ -0,0 +1,35 @@ + +# This is where parslets name comes from: Small parser atoms. +# +module Parslet::Atoms + # The precedence module controls parenthesis during the #inspect printing + # of parslets. It is not relevant to other aspects of the parsing. + # + module Precedence + prec = 0 + BASE = (prec+=1) # everything else + LOOKAHEAD = (prec+=1) # &SOMETHING + REPETITION = (prec+=1) # 'a'+, 'a'? + SEQUENCE = (prec+=1) # 'a' 'b' + ALTERNATE = (prec+=1) # 'a' | 'b' + OUTER = (prec+=1) # printing is done here. 
+ end + + require 'parslet/atoms/can_flatten' + require 'parslet/atoms/context' + require 'parslet/atoms/dsl' + require 'parslet/atoms/base' + require 'parslet/atoms/named' + require 'parslet/atoms/lookahead' + require 'parslet/atoms/alternative' + require 'parslet/atoms/sequence' + require 'parslet/atoms/repetition' + require 'parslet/atoms/re' + require 'parslet/atoms/str' + require 'parslet/atoms/entity' + require 'parslet/atoms/capture' + require 'parslet/atoms/dynamic' + require 'parslet/atoms/scope' + require 'parslet/atoms/infix' +end + diff --git a/lib/parslet/atoms/alternative.rb b/lib/parslet/atoms/alternative.rb new file mode 100644 index 00000000..c5f2e39f --- /dev/null +++ b/lib/parslet/atoms/alternative.rb @@ -0,0 +1,50 @@ + +# Alternative during matching. Contains a list of parslets that is tried each +# one in turn. Only fails if all alternatives fail. +# +# Example: +# +# str('a') | str('b') # matches either 'a' or 'b' +# +class Parslet::Atoms::Alternative < Parslet::Atoms::Base + attr_reader :alternatives + + # Constructs an Alternative instance using all given parslets in the order + # given. This is what happens if you call '|' on existing parslets, like + # this: + # + # str('a') | str('b') + # + def initialize(*alternatives) + super() + + @alternatives = alternatives + @error_msg = "Expected one of #{alternatives.inspect}" + end + + #--- + # Don't construct a hanging tree of Alternative parslets, instead store them + # all here. This reduces the number of objects created. + #+++ + def |(parslet) + self.class.new(*@alternatives + [parslet]) + end + + def try(source, context, consume_all) + errors = alternatives.map { |a| + success, value = result = a.apply(source, context, consume_all) + return result if success + + # Aggregate all errors + value + } + + # If we reach this point, all alternatives have failed. 
+ context.err(self, source, @error_msg, errors) + end + + precedence ALTERNATE + def to_s_inner(prec) + alternatives.map { |a| a.to_s(prec) }.join(' / ') + end +end diff --git a/lib/parslet/atoms/base.rb b/lib/parslet/atoms/base.rb new file mode 100644 index 00000000..46879054 --- /dev/null +++ b/lib/parslet/atoms/base.rb @@ -0,0 +1,151 @@ +# Base class for all parslets, handles orchestration of calls and implements +# a lot of the operator and chaining methods. +# +# Also see Parslet::Atoms::DSL chaining parslet atoms together. +# +class Parslet::Atoms::Base + include Parslet::Atoms::Precedence + include Parslet::Atoms::DSL + include Parslet::Atoms::CanFlatten + + # Given a string or an IO object, this will attempt a parse of its contents + # and return a result. If the parse fails, a Parslet::ParseFailed exception + # will be thrown. + # + # @param io [String, Source] input for the parse process + # @option options [Parslet::ErrorReporter] :reporter error reporter to use, + # defaults to Parslet::ErrorReporter::Tree + # @option options [Boolean] :prefix Should a prefix match be accepted? + # (default: false) + # @return [Hash, Array, Parslet::Slice] PORO (Plain old Ruby object) result + # tree + # + def parse(io, options={}) + source = io.respond_to?(:line_and_column) ? + io : + Parslet::Source.new(io) + + # Try to cheat. Assuming that we'll be able to parse the input, don't + # run error reporting code. + success, value = setup_and_apply(source, nil, !options[:prefix]) + + # If we didn't succeed the parse, raise an exception for the user. + # Stack trace will be off, but the error tree should explain the reason + # it failed. + unless success + # Cheating has not paid off. Now pay the cost: Rerun the parse, + # gathering error information in the process. 
+ reporter = options[:reporter] || Parslet::ErrorReporter::Tree.new + source.pos = 0 + success, value = setup_and_apply(source, reporter, !options[:prefix]) + + fail "Assertion failed: success was true when parsing with reporter" \ + if success + + # Value is a Parslet::Cause, which can be turned into an exception: + value.raise + + fail "NEVER REACHED" + end + + # assert: success is true + + # Extra input is now handled inline with the rest of the parsing. If + # really we have success == true, prefix: false and still some input + # is left dangling, that is a BUG. + if !options[:prefix] && source.chars_left > 0 + fail "BUG: New error strategy should not reach this point." + end + + return flatten(value) + end + + # Creates a context for parsing and applies the current atom to the input. + # Returns the parse result. + # + # @return [] Result of the parse. If the first member is + # true, the parse has succeeded. + def setup_and_apply(source, error_reporter, consume_all) + context = Parslet::Atoms::Context.new(error_reporter) + apply(source, context, consume_all) + end + + # Calls the #try method of this parslet. Success consumes input, error will + # rewind the input. + # + # @param source [Parslet::Source] source to read input from + # @param context [Parslet::Atoms::Context] context to use for the parsing + # @param consume_all [Boolean] true if the current parse must consume + # all input by itself. + def apply(source, context, consume_all=false) + old_pos = source.pos + + success, value = result = context.try_with_cache(self, source, consume_all) + + if success + # If a consume_all parse was made and doesn't result in the consumption + # of all the input, that is considered an error. + if consume_all && source.chars_left>0 + # Read 10 characters ahead. Why ten? I don't know. 
+ offending_pos = source.pos + offending_input = source.consume(10) + + # Rewind input (as happens always in error case) + source.pos = old_pos + + return context.err_at( + self, + source, + "Don't know what to do with #{offending_input.to_s.inspect}", + offending_pos + ) + end + + # Looks like the parse was successful after all. Don't rewind the input. + return result + end + + # We only reach this point if the parse has failed. Rewind the input. + source.pos = old_pos + return result + end + + # Override this in your Atoms::Base subclasses to implement parsing + # behaviour. + # + def try(source, context, consume_all) + raise NotImplementedError, \ + "Atoms::Base doesn't have behaviour, please implement #try(source, context)." + end + + # Returns true if this atom can be cached in the packrat cache. Most parslet + # atoms are cached, so this always returns true, unless overridden. + # + def cached? + true + end + + # Debug printing - in Treetop syntax. + # + def self.precedence(prec) + define_method(:precedence) { prec } + end + precedence BASE + def to_s(outer_prec=OUTER) + if outer_prec < precedence + "("+to_s_inner(precedence)+")" + else + to_s_inner(precedence) + end + end + def inspect + to_s(OUTER) + end +private + + # Produces an instance of Success and returns it. + # + def succ(result) + [true, result] + end +end diff --git a/lib/parslet/atoms/can_flatten.rb b/lib/parslet/atoms/can_flatten.rb new file mode 100644 index 00000000..8f5badc9 --- /dev/null +++ b/lib/parslet/atoms/can_flatten.rb @@ -0,0 +1,137 @@ + +module Parslet::Atoms + # A series of helper functions that have the common topic of flattening + # result values into the intermediary tree that consists of Ruby Hashes and + # Arrays. + # + # This module has one main function, #flatten, that takes an annotated + # structure as input and returns the reduced form that users expect from + # Atom#parse. 
+ # + # NOTE: Since all of these functions are just that, functions without + # side effects, they are in a module and not in a class. Its hard to draw + # the line sometimes, but this is beyond. + # + module CanFlatten + # Takes a mixed value coming out of a parslet and converts it to a return + # value for the user by dropping things and merging hashes. + # + # Named is set to true if this result will be embedded in a Hash result from + # naming something using .as(...). It changes the folding + # semantics of repetition. + # + def flatten(value, named=false) + # Passes through everything that isn't an array of things + return value unless value.instance_of? Array + + # Extracts the s-expression tag + tag, *tail = value + + # Merges arrays: + result = tail. + map { |e| flatten(e) } # first flatten each element + + case tag + when :sequence + return flatten_sequence(result) + when :maybe + return named ? result.first : result.first || '' + when :repetition + return flatten_repetition(result, named) + end + + fail "BUG: Unknown tag #{tag.inspect}." + end + + # Lisp style fold left where the first element builds the basis for + # an inject. + # + def foldl(list, &block) + return '' if list.empty? + list[1..-1].inject(list.first, &block) + end + + # Flatten results from a sequence of parslets. + # + # @api private + # + def flatten_sequence(list) + foldl(list.compact) { |r, e| # and then merge flat elements + merge_fold(r, e) + } + end + # @api private + def merge_fold(l, r) + # equal pairs: merge. ---------------------------------------------------- + if l.class == r.class + if l.is_a?(Hash) + warn_about_duplicate_keys(l, r) + return l.merge(r) + else + return l + r + end + end + + # unequal pairs: hoist to same level. ------------------------------------ + + # Maybe classes are not equal, but both are stringlike? + if l.respond_to?(:to_str) && r.respond_to?(:to_str) + # if we're merging a String with a Slice, the slice wins. + return r if r.respond_to? 
:to_slice + return l if l.respond_to? :to_slice + + fail "NOTREACHED: What other stringlike classes are there?" + end + + # special case: If one of them is a string/slice, the other is more important + return l if r.respond_to? :to_str + return r if l.respond_to? :to_str + + # otherwise just create an array for one of them to live in + return l + [r] if r.class == Hash + return [l] + r if l.class == Hash + + fail "Unhandled case when foldr'ing sequence." + end + + # Flatten results from a repetition of a single parslet. named indicates + # whether the user has named the result or not. If the user has named + # the results, we want to leave an empty list alone - otherwise it is + # turned into an empty string. + # + # @api private + # + def flatten_repetition(list, named) + if list.any? { |e| e.instance_of?(Hash) } + # If keyed subtrees are in the array, we'll want to discard all + # strings inbetween. To keep them, name them. + return list.select { |e| e.instance_of?(Hash) } + end + + if list.any? { |e| e.instance_of?(Array) } + # If any arrays are nested in this array, flatten all arrays to this + # level. + return list. + select { |e| e.instance_of?(Array) }. + flatten(1) + end + + # Consistent handling of empty lists, when we act on a named result + return [] if named && list.empty? + + # If there are only strings, concatenate them and return that. + foldl(list) { |s,e| s+e } + end + + # That annoying warning 'Duplicate subtrees while merging result' comes + # from here. You should add more '.as(...)' names to your intermediary tree. + # + def warn_about_duplicate_keys(h1, h2) + d = h1.keys & h2.keys + unless d.empty? + warn "Duplicate subtrees while merging result of \n #{self.inspect}\nonly the values"+ + " of the latter will be kept. 
(keys: #{d.inspect})" + end + end + end +end \ No newline at end of file diff --git a/lib/parslet/atoms/capture.rb b/lib/parslet/atoms/capture.rb new file mode 100644 index 00000000..58acd83d --- /dev/null +++ b/lib/parslet/atoms/capture.rb @@ -0,0 +1,38 @@ + +# Stores the result of matching an atom against input in the #captures in +# parse context. Doing so will allow you to pull parts of the ongoing parse +# out later and use them to match other pieces of input. +# +# Example: +# # After this, context.captures[:an_a] returns 'a' +# str('a').capture(:an_a) +# +# # Capture and use of the capture: (matches either 'aa' or 'bb') +# match['ab'].capture(:first) >> +# dynamic { |src, ctx| str(ctx.captures[:first]) } +# +class Parslet::Atoms::Capture < Parslet::Atoms::Base + attr_reader :parslet, :name + + def initialize(parslet, name) + super() + + @parslet, @name = parslet, name + end + + def apply(source, context, consume_all) + success, value = result = parslet.apply(source, context, consume_all) + + if success + context.captures[name.to_sym] = + flatten(value) + end + + return result + end + + def to_s_inner(prec) + "(#{name.inspect} = #{parslet.to_s(prec)})" + end +end + diff --git a/lib/parslet/atoms/context.rb b/lib/parslet/atoms/context.rb new file mode 100644 index 00000000..231f5e01 --- /dev/null +++ b/lib/parslet/atoms/context.rb @@ -0,0 +1,91 @@ +module Parslet::Atoms + # Helper class that implements a transient cache that maps position and + # parslet object to results. This is used for memoization in the packrat + # style. + # + # Also, error reporter is stored here and error reporting happens through + # this class. This makes the reporting pluggable. 
+ # + class Context + # @param reporter [#err, #err_at] Error reporter (leave empty for default + # reporter) + def initialize(reporter=Parslet::ErrorReporter::Tree.new) + @cache = Hash.new { |h, k| h[k] = {} } + @reporter = reporter + @captures = Parslet::Scope.new + end + + # Caches a parse answer for obj at source.pos. Applying the same parslet + # at one position of input always yields the same result, unless the input + # has changed. + # + # We need the entire source here so we can ask for how many characters + # were consumed by a successful parse. Imitation of such a parse must + # advance the input pos by the same amount of bytes. + # + def try_with_cache(obj, source, consume_all) + beg = source.pos + + # Not in cache yet? Return early. + unless entry = lookup(obj, beg) + result = obj.try(source, self, consume_all) + + if obj.cached? + set obj, beg, [result, source.pos-beg] + end + + return result + end + + # the condition in unless has returned true, so entry is not nil. + result, advance = entry + + # The data we're skipping here has been read before. (since it is in + # the cache) PLUS the actual contents are not interesting anymore since + # we know obj matches at beg. So skip reading. + source.pos = beg + advance + return result + end + + # Report an error at a given position. + # @see ErrorReporter + # + def err_at(*args) + return [false, @reporter.err_at(*args)] if @reporter + return [false, nil] + end + + # Report an error. + # @see ErrorReporter + # + def err(*args) + return [false, @reporter.err(*args)] if @reporter + return [false, nil] + end + + # Returns the current captures made on the input (see + # Parslet::Atoms::Base#capture). Use as follows: + # + # context.captures[:foobar] # => returns capture :foobar + # + attr_reader :captures + + # Starts a new scope. Use the #scope method of Parslet::Atoms::DSL + # to call this. 
+ # + def scope + captures.push + yield + ensure + captures.pop + end + + private + def lookup(obj, pos) + @cache[pos][obj] + end + def set(obj, pos, val) + @cache[pos][obj] = val + end + end +end \ No newline at end of file diff --git a/lib/parslet/atoms/dsl.rb b/lib/parslet/atoms/dsl.rb new file mode 100644 index 00000000..5e403aef --- /dev/null +++ b/lib/parslet/atoms/dsl.rb @@ -0,0 +1,109 @@ + +# A mixin module that defines operations that can be called on any subclass +# of Parslet::Atoms::Base. These operations make parslets atoms chainable and +# allow combination of parslet atoms to form bigger parsers. +# +# Example: +# +# str('foo') >> str('bar') +# str('f').repeat +# any.absent? # also called The Epsilon +# +module Parslet::Atoms::DSL + # Construct a new atom that repeats the current atom min times at least and + # at most max times. max can be nil to indicate that no maximum is present. + # + # Example: + # # match any number of 'a's + # str('a').repeat + # + # # match between 1 and 3 'a's + # str('a').repeat(1,3) + # + def repeat(min=0, max=nil) + Parslet::Atoms::Repetition.new(self, min, max) + end + + # Returns a new parslet atom that is only maybe present in the input. This + # is synonymous to calling #repeat(0,1). Generated tree value will be + # either nil (if atom is not present in the input) or the matched subtree. + # + # Example: + # str('foo').maybe + # + def maybe + Parslet::Atoms::Repetition.new(self, 0, 1, :maybe) + end + + # Chains two parslet atoms together as a sequence. + # + # Example: + # str('a') >> str('b') + # + def >>(parslet) + Parslet::Atoms::Sequence.new(self, parslet) + end + + # Chains two parslet atoms together to express alternation. A match will + # always be attempted with the parslet on the left side first. If it doesn't + # match, the right side will be tried. 
+ # + # Example: + # # matches either 'a' OR 'b' + # str('a') | str('b') + # + def |(parslet) + Parslet::Atoms::Alternative.new(self, parslet) + end + + # Tests for absence of a parslet atom in the input stream without consuming + # it. + # + # Example: + # # Only proceed the parse if 'a' is absent. + # str('a').absent? + # + def absent? + Parslet::Atoms::Lookahead.new(self, false) + end + + # Tests for presence of a parslet atom in the input stream without consuming + # it. + # + # Example: + # # Only proceed the parse if 'a' is present. + # str('a').present? + # + def present? + Parslet::Atoms::Lookahead.new(self, true) + end + + # Alias for present? that will disappear in 2.0 (deprecated) + # + alias prsnt? present? + + # Alias for absent? that will disappear in 2.0 (deprecated) + # + alias absnt? absent? + + # Marks a parslet atom as important for the tree output. This must be used + # to achieve meaningful output from the #parse method. + # + # Example: + # str('a').as(:b) # will produce {:b => 'a'} + # + def as(name) + Parslet::Atoms::Named.new(self, name) + end + + # Captures a part of the input and stores it under the name given. This + # is very useful to create self-referential parses. A capture stores + # the result of its parse (may be complex) on a successful parse action. + # + # Example: + # str('a').capture(:b) # will store captures[:b] == 'a' + # + def capture(name) + Parslet::Atoms::Capture.new(self, name) + end +end \ No newline at end of file diff --git a/lib/parslet/atoms/dynamic.rb b/lib/parslet/atoms/dynamic.rb new file mode 100644 index 00000000..2dc4a844 --- /dev/null +++ b/lib/parslet/atoms/dynamic.rb @@ -0,0 +1,32 @@ +# Evaluates a block at parse time. The result from the block must be a parser +# (something which implements #apply). In the first case, the parser will then +# be applied to the input, creating the result. +# +# Dynamic parses are never cached. +# +# Example: +# dynamic { rand < 0.5 ? 
str('a') : str('b') } +# +class Parslet::Atoms::Dynamic < Parslet::Atoms::Base + attr_reader :block + + def initialize(block) + @block = block + end + + def cached? + false + end + + def try(source, context, consume_all) + result = block.call(source, context) + + # Result is a parslet atom. + return result.apply(source, context, consume_all) + end + + def to_s_inner(prec) + "dynamic { ... }" + end +end + diff --git a/lib/parslet/atoms/entity.rb b/lib/parslet/atoms/entity.rb new file mode 100644 index 00000000..4df60507 --- /dev/null +++ b/lib/parslet/atoms/entity.rb @@ -0,0 +1,41 @@ +# This wraps pieces of parslet definition and gives them a name. The wrapped +# piece is lazily evaluated and cached. This has two purposes: +# +# * Avoid infinite recursion during evaluation of the definition +# * Be able to print things by their name, not by their sometimes +# complicated content. +# +# You don't normally use this directly, instead you should generated it by +# using the structuring method Parslet.rule. +# +class Parslet::Atoms::Entity < Parslet::Atoms::Base + attr_reader :name, :block + def initialize(name, &block) + super() + + @name = name + @block = block + end + + def try(source, context, consume_all) + parslet.apply(source, context, consume_all) + end + + def parslet + @parslet ||= @block.call.tap { |p| + raise_not_implemented unless p + } + end + + def to_s_inner(prec) + name.to_s.upcase + end +private + def raise_not_implemented + trace = caller.reject {|l| l =~ %r{#{Regexp.escape(__FILE__)}}} # blatantly stolen from dependencies.rb in activesupport + exception = NotImplementedError.new("rule(#{name.inspect}) { ... } returns nil. 
Still not implemented, but already used?") + exception.set_backtrace(trace) + + raise exception + end +end diff --git a/lib/parslet/atoms/infix.rb b/lib/parslet/atoms/infix.rb new file mode 100644 index 00000000..a78345c0 --- /dev/null +++ b/lib/parslet/atoms/infix.rb @@ -0,0 +1,121 @@ +class Parslet::Atoms::Infix < Parslet::Atoms::Base + attr_reader :element, :operations + + def initialize(element, operations) + super() + + @element = element + @operations = operations + end + + def try(source, context, consume_all) + return catch_error { + return succ( + produce_tree( + precedence_climb(source, context, consume_all))) + } + end + + # Turns an array of the form ['1', '+', ['2', '*', '3']] into a hash that + # reflects the same structure. + # + def produce_tree(ary) + return ary unless ary.kind_of? Array + + left = ary.shift + + until ary.empty? + op, right = ary.shift(2) + + # p [left, op, right] + + if right.kind_of? Array + # Subexpression -> Subhash + left = {l: left, o: op, r: produce_tree(right)} + else + left = {l: left, o: op, r: right} + end + end + + left + end + + # A precedence climbing algorithm married to parslet, as described here + # http://eli.thegreenplace.net/2012/08/02/parsing-expressions-by-precedence-climbing/ + # + # @note Error handling in this routine is done by throwing :error and + # as a value the error to return to parslet. This avoids cluttering + # the recursion logic here with parslet error handling. + # + def precedence_climb(source, context, consume_all, current_prec=1, needs_element=false) + result = [] + + # To even begin parsing an arithmetic expression, there needs to be + # at least one @element. + success, value = @element.apply(source, context, false) + + unless success + abort context.err(self, source, "#{@element.inspect} was expected", [value]) + end + + result << flatten(value, true) + + # Loop until we fail on operator matching or until input runs out. 
+ loop do + op_pos = source.pos + op_match, prec, assoc = match_operation(source, context, false) + + # If no operator could be matched here, one of several cases + # applies: + # + # - end of file + # - end of expression + # - syntax error + # + # We abort matching the expression here. + break unless op_match + + if prec >= current_prec + next_prec = (assoc == :left) ? prec+1 : prec + + result << op_match + result << precedence_climb( + source, context, consume_all, next_prec, true) + else + source.pos = op_pos + return unwrap(result) + end + end + + return unwrap(result) + end + + def unwrap expr + expr.size == 1 ? expr.first : expr + end + + def match_operation(source, context, consume_all) + errors = [] + @operations.each do |op_atom, prec, assoc| + success, value = op_atom.apply(source, context, consume_all) + return flatten(value, true), prec, assoc if success + + # assert: this was in fact an error, accumulate + errors << value + end + + return nil + end + + def abort(error) + throw :error, error + end + def catch_error + catch(:error) { yield } + end + + def to_s_inner(prec) + ops = @operations.map { |o, _, _| o.inspect }.join(', ') + "infix_expression(#{@element.inspect}, [#{ops}])" + end +end \ No newline at end of file diff --git a/lib/parslet/atoms/lookahead.rb b/lib/parslet/atoms/lookahead.rb new file mode 100644 index 00000000..d50a1b94 --- /dev/null +++ b/lib/parslet/atoms/lookahead.rb @@ -0,0 +1,49 @@ +# Either positive or negative lookahead, doesn't consume its input. +# +# Example: +# +# str('foo').present? # matches when the input contains 'foo', but leaves it +# +class Parslet::Atoms::Lookahead < Parslet::Atoms::Base + attr_reader :positive + attr_reader :bound_parslet + + def initialize(bound_parslet, positive=true) + super() + + # Model positive and negative lookahead by testing this flag. 
+ @positive = positive + @bound_parslet = bound_parslet + + @error_msgs = { + :positive => ["Input should start with ", bound_parslet], + :negative => ["Input should not start with ", bound_parslet] + } + end + + def try(source, context, consume_all) + pos = source.pos + + success, value = bound_parslet.apply(source, context, consume_all) + + if positive + return succ(nil) if success + return context.err_at(self, source, @error_msgs[:positive], pos) + else + return succ(nil) unless success + return context.err_at(self, source, @error_msgs[:negative], pos) + end + + # This is probably the only parslet that rewinds its input in #try. + # Lookaheads NEVER consume their input, even on success, that's why. + ensure + source.pos = pos + end + + precedence LOOKAHEAD + def to_s_inner(prec) + char = positive ? '&' : '!' + + "#{char}#{bound_parslet.to_s(prec)}" + end +end diff --git a/lib/parslet/atoms/named.rb b/lib/parslet/atoms/named.rb new file mode 100644 index 00000000..b5412ba4 --- /dev/null +++ b/lib/parslet/atoms/named.rb @@ -0,0 +1,32 @@ +# Names a match to influence tree construction. +# +# Example: +# +# str('foo') # will return 'foo', +# str('foo').as(:foo) # will return :foo => 'foo' +# +class Parslet::Atoms::Named < Parslet::Atoms::Base + attr_reader :parslet, :name + def initialize(parslet, name) + super() + + @parslet, @name = parslet, name + end + + def apply(source, context, consume_all) + success, value = result = parslet.apply(source, context, consume_all) + + return result unless success + succ( + produce_return_value( + value)) + end + + def to_s_inner(prec) + "#{name}:#{parslet.to_s(prec)}" + end +private + def produce_return_value(val) + { name => flatten(val, true) } + end +end diff --git a/lib/parslet/atoms/re.rb b/lib/parslet/atoms/re.rb new file mode 100644 index 00000000..75b0ac11 --- /dev/null +++ b/lib/parslet/atoms/re.rb @@ -0,0 +1,38 @@ +# Matches a special kind of regular expression that only ever matches one +# character at a time. 
Useful members of this family are: character +# ranges, \\w, \\d, \\r, \\n, ... +# +# Example: +# +# match('[a-z]') # matches a-z +# match('\s') # like regexps: matches space characters +# +class Parslet::Atoms::Re < Parslet::Atoms::Base + attr_reader :match, :re + def initialize(match) + super() + + @match = match.to_s + @re = Regexp.new(self.match, Regexp::MULTILINE) + @error_msgs = { + :premature => "Premature end of input", + :failed => "Failed to match #{match.inspect[1..-2]}" + } + end + + def try(source, context, consume_all) + return succ(source.consume(1)) if source.matches?(@re) + + # No string could be read + return context.err(self, source, @error_msgs[:premature]) \ + if source.chars_left < 1 + + # No match + return context.err(self, source, @error_msgs[:failed]) + end + + def to_s_inner(prec) + match.inspect[1..-2] + end +end + diff --git a/lib/parslet/atoms/repetition.rb b/lib/parslet/atoms/repetition.rb new file mode 100644 index 00000000..dde129f8 --- /dev/null +++ b/lib/parslet/atoms/repetition.rb @@ -0,0 +1,83 @@ + +# Matches a parslet repeatedly. +# +# Example: +# +# str('a').repeat(1,3) # matches 'a' at least once, but at most three times +# str('a').maybe # matches 'a' if it is present in the input (repeat(0,1)) +# +class Parslet::Atoms::Repetition < Parslet::Atoms::Base + attr_reader :min, :max, :parslet + def initialize(parslet, min, max, tag=:repetition) + super() + + raise ArgumentError, + "Asking for zero repetitions of a parslet. 
(#{parslet.inspect} repeating #{min},#{max})" \ + if max == 0 + + + @parslet = parslet + @min, @max = min, max + @tag = tag + @error_msgs = { + :minrep => "Expected at least #{min} of #{parslet.inspect}", + :unconsumed => "Extra input after last repetition" + } + end + + def try(source, context, consume_all) + occ = 0 + accum = [@tag] # initialize the result array with the tag (for flattening) + start_pos = source.pos + + break_on = nil + loop do + success, value = parslet.apply(source, context, false) + + break_on = value + break unless success + + occ += 1 + accum << value + + # If we're not greedy (max is defined), check if that has been reached. + return succ(accum) if max && occ>=max + end + + # Last attempt to match parslet was a failure, failure reason in break_on. + + # Greedy matcher has produced a failure. Check if occ (which will + # contain the number of successes) is >= min. + return context.err_at( + self, + source, + @error_msgs[:minrep], + start_pos, + [break_on]) if occ < min + + # consume_all is true, that means that we're inside the part of the parser + # that should consume the input completely. Repetition failing here means + # probably that we didn't. + # + # We have a special clause to create an error here because otherwise + # break_on would get thrown away. It turns out, that contains very + # interesting information in a lot of cases. + # + return context.err( + self, + source, + @error_msgs[:unconsumed], + [break_on]) if consume_all && source.chars_left>0 + + return succ(accum) + end + + precedence REPETITION + def to_s_inner(prec) + minmax = "{#{min}, #{max}}" + minmax = '?' if min == 0 && max == 1 + + parslet.to_s(prec) + minmax + end +end + diff --git a/lib/parslet/atoms/scope.rb b/lib/parslet/atoms/scope.rb new file mode 100644 index 00000000..0642601e --- /dev/null +++ b/lib/parslet/atoms/scope.rb @@ -0,0 +1,26 @@ +# Starts a new scope in the parsing process. Please also see the #captures +# method. 
+# +class Parslet::Atoms::Scope < Parslet::Atoms::Base + attr_reader :block + def initialize(block) + super() + + @block = block + end + + def cached? + false + end + + def apply(source, context, consume_all) + context.scope do + parslet = block.call + return parslet.apply(source, context, consume_all) + end + end + + def to_s_inner(prec) + "scope { #{block.call.to_s(prec)} }" + end +end diff --git a/lib/parslet/atoms/sequence.rb b/lib/parslet/atoms/sequence.rb new file mode 100644 index 00000000..b556cc9d --- /dev/null +++ b/lib/parslet/atoms/sequence.rb @@ -0,0 +1,45 @@ +# A sequence of parslets, matched from left to right. Denoted by '>>' +# +# Example: +# +# str('a') >> str('b') # matches 'a', then 'b' +# +class Parslet::Atoms::Sequence < Parslet::Atoms::Base + attr_reader :parslets + def initialize(*parslets) + super() + + @parslets = parslets + @error_msgs = { + :failed => "Failed to match sequence (#{self.inspect})" + } + end + + def >>(parslet) + self.class.new(* @parslets+[parslet]) + end + + def try(source, context, consume_all) + # Presize an array + result = Array.new(parslets.size + 1) + result[0] = :sequence + + parslets.each_with_index do |p, idx| + child_consume_all = consume_all && (idx == parslets.size-1) + success, value = p.apply(source, context, child_consume_all) + + unless success + return context.err(self, source, @error_msgs[:failed], [value]) + end + + result[idx+1] = value + end + + return succ(result) + end + + precedence SEQUENCE + def to_s_inner(prec) + parslets.map { |p| p.to_s(prec) }.join(' ') + end +end diff --git a/lib/parslet/atoms/str.rb b/lib/parslet/atoms/str.rb new file mode 100644 index 00000000..a55061fa --- /dev/null +++ b/lib/parslet/atoms/str.rb @@ -0,0 +1,39 @@ +# Matches a string of characters. 
+# +# Example: +# +# str('foo') # matches 'foo' +# +class Parslet::Atoms::Str < Parslet::Atoms::Base + attr_reader :str + def initialize(str) + super() + + @str = str.to_s + @pat = Regexp.new(Regexp.escape(str)) + @len = str.size + @error_msgs = { + :premature => "Premature end of input", + :failed => "Expected #{str.inspect}, but got " + } + end + + def try(source, context, consume_all) + return succ(source.consume(@len)) if source.matches?(@pat) + + # Input ending early: + return context.err(self, source, @error_msgs[:premature]) \ + if source.chars_left<@len + + # Expected something, but got something else instead: + error_pos = source.pos + return context.err_at( + self, source, + [@error_msgs[:failed], source.consume(@len)], error_pos) + end + + def to_s_inner(prec) + "'#{str}'" + end +end + diff --git a/lib/parslet/atoms/visitor.rb b/lib/parslet/atoms/visitor.rb new file mode 100644 index 00000000..8cb7375a --- /dev/null +++ b/lib/parslet/atoms/visitor.rb @@ -0,0 +1,89 @@ +# Augments all parslet atoms with an accept method that will call back +# to the visitor given. + +# +module Parslet::Atoms + class Base + def accept(visitor) + raise NotImplementedError, "No #accept method on #{self.class.name}." + end + end + + class Str + # Call back visitors #visit_str method. See parslet/export for an example. + # + def accept(visitor) + visitor.visit_str(str) + end + end + + class Entity + # Call back visitors #visit_entity method. See parslet/export for an + # example. + # + def accept(visitor) + visitor.visit_entity(name, block) + end + end + + class Named + # Call back visitors #visit_named method. See parslet/export for an + # example. + # + def accept(visitor) + visitor.visit_named(name, parslet) + end + end + + class Sequence + # Call back visitors #visit_sequence method. See parslet/export for an + # example. + # + def accept(visitor) + visitor.visit_sequence(parslets) + end + end + + class Repetition + # Call back visitors #visit_repetition method. 
See parslet/export for an + # example. + # + def accept(visitor) + visitor.visit_repetition(@tag, min, max, parslet) + end + end + + class Alternative + # Call back visitors #visit_alternative method. See parslet/export for an + # example. + # + def accept(visitor) + visitor.visit_alternative(alternatives) + end + end + + class Lookahead + # Call back visitors #visit_lookahead method. See parslet/export for an + # example. + # + def accept(visitor) + visitor.visit_lookahead(positive, bound_parslet) + end + end + + class Re + # Call back visitors #visit_re method. See parslet/export for an example. + # + def accept(visitor) + visitor.visit_re(match) + end + end +end + +class Parslet::Parser + # Call back visitors #visit_parser method. + # + def accept(visitor) + visitor.visit_parser(root) + end +end diff --git a/lib/parslet/cause.rb b/lib/parslet/cause.rb new file mode 100644 index 00000000..ce5672c1 --- /dev/null +++ b/lib/parslet/cause.rb @@ -0,0 +1,94 @@ +module Parslet + # Represents a cause why a parse did fail. A lot of these objects are + # constructed - not all of the causes turn out to be failures for the whole + # parse. + # + class Cause + def initialize(message, source, pos, children) + @message, @source, @pos, @children = + message, source, pos, children + end + + # @return [String, Array] A string or an array of message pieces that + # provide failure information. Use #to_s to get a formatted string. + attr_reader :message + + # @return [Parslet::Source] Source that was parsed when this error + # happend. Mainly used for line number information. + attr_reader :source + + # Location of the error. + # + # @return [Fixnum] Position where the error happened. (character offset) + attr_reader :pos + + # When this cause is part of a tree of error causes: child nodes for this + # node. Very often carries the reasons for this cause. + # + # @return [Array] A list of reasons for this cause. 
+ def children + @children ||= [] + end + + # Appends 'at line LINE char CHAR' to the string given. Use +pos+ to + # override the position of the +source+. This method returns an object + # that can be turned into a string using #to_s. + # + # @param source [Parslet::Source] source that was parsed when this error + # happened + # @param pos [Fixnum] position of error + # @param str [String, Array] message parts + # @param children [Array] child nodes for this error tree + # @return [Parslet::Cause] a new instance of {Parslet::Cause} + # + def self.format(source, pos, str, children=[]) + self.new(str, source, pos, children) + end + + def to_s + line, column = source.line_and_column(pos) + # Allow message to be a list of objects. Join them here, since we now + # really need it. + Array(message).map { |o| + o.respond_to?(:to_slice) ? + o.str.inspect : + o.to_s }.join + " at line #{line} char #{column}." + end + + # Signals to the outside that the parse has failed. Use this in + # conjunction with .format for nice error messages. + # + def raise(exception_klass=Parslet::ParseFailed) + exception = exception_klass.new(self.to_s, self) + Kernel.raise exception + end + + # Returns an ascii tree representation of the causes of this node and its + # children. + # + def ascii_tree + StringIO.new.tap { |io| + recursive_ascii_tree(self, io, [true]) }. + string + end + + private + def recursive_ascii_tree(node, stream, curved) + append_prefix(stream, curved) + stream.puts node.to_s + + node.children.each do |child| + last_child = (node.children.last == child) + + recursive_ascii_tree(child, stream, curved + [last_child]) + end + end + def append_prefix(stream, curved) + return if curved.size < 2 + curved[1..-2].each do |c| + stream.print c ? " " : "| " + end + stream.print curved.last ? 
"`- " : "|- " + end + end +end \ No newline at end of file diff --git a/lib/parslet/context.rb b/lib/parslet/context.rb new file mode 100644 index 00000000..f61e0bc3 --- /dev/null +++ b/lib/parslet/context.rb @@ -0,0 +1,33 @@ +require 'blankslate' + +# Provides a context for tree transformations to run in. The context allows +# accessing each of the bindings in the bindings hash as local method. +# +# Example: +# +# ctx = Context.new(:a => :b) +# ctx.instance_eval do +# a # => :b +# end +# +# @api private +class Parslet::Context < BlankSlate + reveal :methods + reveal :respond_to? + reveal :inspect + reveal :to_s + reveal :instance_variable_set + + def meta_def(name, &body) + metaclass = class < error + # puts parser.cause.ascii_tree + # end + # + # into a convenient method. + # + # Usage: + # + # require 'parslet' + # require 'parslet/convenience' + # + # class FooParser < Parslet::Parser + # rule(:foo) { str('foo') } + # root(:foo) + # end + # + # FooParser.new.parse_with_debug('bar') + # + # @see Parslet::Atoms::Base#parse + # + def parse_with_debug str, opts={} + parse str, opts + rescue Parslet::ParseFailed => error + puts error.cause.ascii_tree + end + +end \ No newline at end of file diff --git a/lib/parslet/error_reporter.rb b/lib/parslet/error_reporter.rb new file mode 100644 index 00000000..567c63a2 --- /dev/null +++ b/lib/parslet/error_reporter.rb @@ -0,0 +1,7 @@ +# A namespace for all error reporters. +# +module Parslet::ErrorReporter +end + +require 'parslet/error_reporter/tree' +require 'parslet/error_reporter/deepest' \ No newline at end of file diff --git a/lib/parslet/error_reporter/deepest.rb b/lib/parslet/error_reporter/deepest.rb new file mode 100644 index 00000000..102b4f87 --- /dev/null +++ b/lib/parslet/error_reporter/deepest.rb @@ -0,0 +1,95 @@ +module Parslet + module ErrorReporter + # Instead of reporting the latest error that happens like {Tree} does, + # this class reports the deepest error. 
Depth is defined here as how + # advanced into the input an error happens. The errors close to the + # greatest depth tend to be more relevant to the end user, since they + # specify what could be done to make them go away. + # + # More specifically, errors produced by this reporter won't be related to + # the structure of the grammar at all. The positions of the errors will + # be advanced and convey at every grammar level what the deepest rule + # was to fail. + # + class Deepest + def initialize + @deepest_cause = nil + end + + # Produces an error cause that combines the message at the current level + # with the errors that happened at a level below (children). + # + # @param atom [Parslet::Atoms::Base] parslet that failed + # @param source [Source] Source that we're using for this parse. (line + # number information...) + # @param message [String, Array] Error message at this level. + # @param children [Array] A list of errors from a deeper level (or nil). + # @return [Cause] An error tree combining children with message. + # + def err(atom, source, message, children=nil) + position = source.pos + cause = Cause.format(source, position, message, children) + return deepest(cause) + end + + # Produces an error cause that combines the message at the current level + # with the errors that happened at a level below (children). + # + # @param atom [Parslet::Atoms::Base] parslet that failed + # @param source [Source] Source that we're using for this parse. (line + # number information...) + # @param message [String, Array] Error message at this level. + # @param pos [Fixnum] The real position of the error. + # @param children [Array] A list of errors from a deeper level (or nil). + # @return [Cause] An error tree combining children with message. + # + def err_at(atom, source, message, pos, children=nil) + position = pos + cause = Cause.format(source, position, message, children) + return deepest(cause) + end + + # Returns the cause that is currently deepest. 
Mainly for specs. + # + attr_reader :deepest_cause + + # Checks to see if the lineage of the cause given includes a cause with + # an error position deeper than the current deepest cause stored. If + # yes, it passes the cause through to the caller. If no, it returns the + # current deepest error that was saved as a reference. + # + def deepest(cause) + rank, leaf = deepest_child(cause) + + if !deepest_cause || leaf.pos >= deepest_cause.pos + # This error reaches deeper into the input, save it as reference. + @deepest_cause = leaf + return cause + end + + return deepest_cause + end + + private + # Returns the leaf from a given error tree with the biggest rank. + # + def deepest_child(cause, rank=0) + max_child = cause + max_rank = rank + + if cause.children && !cause.children.empty? + cause.children.each do |child| + c_rank, c_cause = deepest_child(child, rank+1) + + if c_rank > max_rank + max_rank = c_rank + max_child = c_cause + end + end + end + + return max_rank, max_child + end + end + end +end \ No newline at end of file diff --git a/lib/parslet/error_reporter/tree.rb b/lib/parslet/error_reporter/tree.rb new file mode 100644 index 00000000..2fb27504 --- /dev/null +++ b/lib/parslet/error_reporter/tree.rb @@ -0,0 +1,57 @@ +module Parslet + module ErrorReporter + # An error reporter has two central methods, one for reporting errors at + # the current parse position (#err) and one for reporting errors at a + # given parse position (#err_at). The reporter can return an object (a + # 'cause') that will be returned to the caller along with the information + # that the parse failed. + # + # When reporting errors on the outer levels of your parser, these methods + # get passed a list of error objects ('causes') from the inner levels. In + # this default implementation, the inner levels are considered error + # subtrees and are appended to the generated tree node at each level, + # thereby constructing an error tree. 
+ # + # This error tree will report in parallel with the grammar structure that + # failed. A one-to-one correspondence exists between each error in the + # tree and the parslet atom that produced that error. + # + # The implementor is really free to use these return values as he sees + # fit. One example would be to return an error state object from these + # methods that is then updated as errors cascade up the parse derivation + # tree. + # + class Tree + # Produces an error cause that combines the message at the current level + # with the errors that happened at a level below (children). + # + # @param atom [Parslet::Atoms::Base] parslet that failed + # @param source [Source] Source that we're using for this parse. (line + # number information...) + # @param message [String, Array] Error message at this level. + # @param children [Array] A list of errors from a deeper level (or nil). + # @return [Cause] An error tree combining children with message. + # + def err(atom, source, message, children=nil) + position = source.pos + Cause.format(source, position, message, children) + end + + # Produces an error cause that combines the message at the current level + # with the errors that happened at a level below (children). + # + # @param atom [Parslet::Atoms::Base] parslet that failed + # @param source [Source] Source that we're using for this parse. (line + # number information...) + # @param message [String, Array] Error message at this level. + # @param pos [Fixnum] The real position of the error. + # @param children [Array] A list of errors from a deeper level (or nil). + # @return [Cause] An error tree combining children with message. 
+ # + def err_at(atom, source, message, pos, children=nil) + position = pos + Cause.format(source, position, message, children) + end + end + end +end \ No newline at end of file diff --git a/lib/parslet/export.rb b/lib/parslet/export.rb new file mode 100644 index 00000000..37ab20d0 --- /dev/null +++ b/lib/parslet/export.rb @@ -0,0 +1,162 @@ +# Allows exporting parslet grammars to other lingos. + +require 'set' +require 'parslet/atoms/visitor' + +class Parslet::Parser + module Visitors + class Citrus + attr_reader :context, :output + def initialize(context) + @context = context + end + + def visit_str(str) + "\"#{str.inspect[1..-2]}\"" + end + def visit_re(match) + match.to_s + end + + def visit_entity(name, block) + context.deferred(name, block) + + "(#{context.mangle_name(name)})" + end + def visit_named(name, parslet) + parslet.accept(self) + end + + def visit_sequence(parslets) + '(' << + parslets. + map { |el| el.accept(self) }. + join(' ') << + ')' + end + def visit_repetition(tag, min, max, parslet) + parslet.accept(self) << "#{min}*#{max}" + end + def visit_alternative(alternatives) + '(' << + alternatives. + map { |el| el.accept(self) }. + join(' | ') << + ')' + end + + def visit_lookahead(positive, bound_parslet) + (positive ? '&' : '!') << + bound_parslet.accept(self) + end + end + + class Treetop < Citrus + def visit_repetition(tag, min, max, parslet) + parslet.accept(self) << "#{min}..#{max}" + end + + def visit_alternative(alternatives) + '(' << + alternatives. + map { |el| el.accept(self) }. + join(' / ') << + ')' + end + end + end + + # A helper class that formats Citrus and Treetop grammars as a string. + # + class PrettyPrinter + attr_reader :visitor + def initialize(visitor_klass) + @visitor = visitor_klass.new(self) + end + + # Pretty prints the given parslet using the visitor that has been + # configured in initialize. Returns the string representation of the + # Citrus or Treetop grammar. 
+ # + def pretty_print(name, parslet) + output = "grammar #{name}\n" + + output << rule('root', parslet) + + seen = Set.new + loop do + # @todo is constantly filled by the visitor (see #deferred). We + # keep going until it is empty. + break if @todo.empty? + name, block = @todo.shift + + # Track what rules we've already seen. This breaks loops. + next if seen.include?(name) + seen << name + + output << rule(name, block.call) + end + + output << "end\n" + end + + # Formats a rule in either dialect. + # + def rule(name, parslet) + " rule #{mangle_name name}\n" << + " " << parslet.accept(visitor) << "\n" << + " end\n" + end + + # Whenever the visitor encounters an rule in a parslet, it defers the + # pretty printing of the rule by calling this method. + # + def deferred(name, content) + @todo ||= [] + @todo << [name, content] + end + + # Mangles names so that Citrus and Treetop can live with it. This mostly + # transforms some of the things that Ruby allows into other patterns. If + # there is collision, we will not detect it for now. + # + def mangle_name(str) + str.to_s.sub(/\?$/, '_p') + end + end + + # Exports the current parser instance as a string in the Citrus dialect. + # + # Example: + # + # require 'parslet/export' + # class MyParser < Parslet::Parser + # root(:expression) + # rule(:expression) { str('foo') } + # end + # + # MyParser.new.to_citrus # => a citrus grammar as a string + # + def to_citrus + PrettyPrinter.new(Visitors::Citrus). + pretty_print(self.class.name, root) + end + + # Exports the current parser instance as a string in the Treetop dialect. + # + # Example: + # + # require 'parslet/export' + # class MyParser < Parslet::Parser + # root(:expression) + # rule(:expression) { str('foo') } + # end + # + # MyParser.new.to_treetop # => a treetop grammar as a string + # + def to_treetop + PrettyPrinter.new(Visitors::Treetop). 
+ pretty_print(self.class.name, root) + end +end + diff --git a/lib/parslet/expression.rb b/lib/parslet/expression.rb new file mode 100644 index 00000000..1cd13e4c --- /dev/null +++ b/lib/parslet/expression.rb @@ -0,0 +1,51 @@ + +# Allows specifying rules as strings using the exact same grammar that treetop +# does, minus the actions. This is on one hand a good example of a fully +# fledged parser and on the other hand might even turn out really useful. +# +# This can be viewed as an extension to parslet and might even be hosted in +# its own gem one fine day. +# +class Parslet::Expression + include Parslet + + autoload :Treetop, 'parslet/expression/treetop' + + # Creates a parslet from a foreign language expression. + # + # Example: + # + # Parslet::Expression.new("'a' 'b'") + # + def initialize(str, opts={}, context=self) + @type = opts[:type] || :treetop + @exp = str + @parslet = transform( + parse(str)) + end + + # Transforms the parse tree into a parslet expression. + # + def transform(tree) + transform = Treetop::Transform.new + + # pp tree + transform.apply(tree) + rescue + warn "Could not transform: " + tree.inspect + raise + end + + # Parses the string and returns a parse tree. + # + def parse(str) + parser = Treetop::Parser.new + parser.parse(str) + end + + # Turns this expression into a parslet. 
+ # + def to_parslet + @parslet + end +end \ No newline at end of file diff --git a/lib/parslet/expression/treetop.rb b/lib/parslet/expression/treetop.rb new file mode 100644 index 00000000..35da40b5 --- /dev/null +++ b/lib/parslet/expression/treetop.rb @@ -0,0 +1,92 @@ +class Parslet::Expression::Treetop + class Parser < Parslet::Parser + root(:expression) + + rule(:expression) { alternatives } + + # alternative 'a' / 'b' + rule(:alternatives) { + (simple >> (spaced('/') >> simple).repeat).as(:alt) + } + + # sequence by simple concatenation 'a' 'b' + rule(:simple) { occurrence.repeat(1).as(:seq) } + + # occurrence modifiers + rule(:occurrence) { + atom.as(:repetition) >> spaced('*').as(:sign) | + atom.as(:repetition) >> spaced('+').as(:sign) | + atom.as(:repetition) >> repetition_spec | + + atom.as(:maybe) >> spaced('?') | + atom + } + + rule(:atom) { + spaced('(') >> expression.as(:unwrap) >> spaced(')') | + dot | + string | + char_class + } + + # a character class + rule(:char_class) { + (str('[') >> + (str('\\') >> any | + str(']').absent? >> any).repeat(1) >> + str(']')).as(:match) >> space? + } + + # anything at all + rule(:dot) { spaced('.').as(:any) } + + # recognizing strings + rule(:string) { + str('\'') >> + ( + (str('\\') >> any) | + (str("'").absent? >> any) + ).repeat.as(:string) >> + str('\'') >> space? + } + + # repetition specification like {1, 2} + rule(:repetition_spec) { + spaced('{') >> + integer.maybe.as(:min) >> spaced(',') >> + integer.maybe.as(:max) >> spaced('}') + } + rule(:integer) { + match['0-9'].repeat(1) + } + + # whitespace handling + rule(:space) { match("\s").repeat(1) } + rule(:space?) { space.maybe } + + def spaced(str) + str(str) >> space? + end + end + + class Transform < Parslet::Transform + + rule(:repetition => simple(:rep), :sign => simple(:sign)) { + min = sign=='+' ? 
1 : 0 + Parslet::Atoms::Repetition.new(rep, min, nil) } + rule(:repetition => simple(:rep), :min => simple(:min), :max => simple(:max)) { + Parslet::Atoms::Repetition.new(rep, + Integer(min || 0), + max && Integer(max) || nil) } + + rule(:alt => subtree(:alt)) { Parslet::Atoms::Alternative.new(*alt) } + rule(:seq => sequence(:s)) { Parslet::Atoms::Sequence.new(*s) } + rule(:unwrap => simple(:u)) { u } + rule(:maybe => simple(:m)) { |d| d[:m].maybe } + rule(:string => simple(:s)) { Parslet::Atoms::Str.new(s) } + rule(:match => simple(:m)) { Parslet::Atoms::Re.new(m) } + rule(:any => simple(:a)) { Parslet::Atoms::Re.new('.') } + end + +end + diff --git a/lib/parslet/graphviz.rb b/lib/parslet/graphviz.rb new file mode 100644 index 00000000..6d6cd75e --- /dev/null +++ b/lib/parslet/graphviz.rb @@ -0,0 +1,97 @@ + +# Paints a graphviz graph of your parser. + +begin + require 'ruby-graphviz' +rescue LoadError + puts "Please install the 'ruby-graphviz' gem first." + fail +end + +require 'set' +require 'parslet/atoms/visitor' + +module Parslet + class GraphvizVisitor + def initialize g + @graph = g + @known_links = Set.new + @visited = Set.new + end + + attr_reader :parent + + def visit_parser(root) + recurse root, node('parser') + end + def visit_entity(name, block) + s = node(name) + + downwards s + + return if @visited.include?(name) + @visited << name + + recurse block.call, s + end + def visit_named(name, atom) + recurse atom, parent + end + def visit_repetition(tag, min, max, atom) + recurse atom, parent + end + def visit_alternative(alternatives) + p = parent + alternatives.each do |atom| + recurse atom, p + end + end + def visit_sequence(sequence) + p = parent + sequence.each do |atom| + recurse atom, p + end + end + def visit_lookahead(positive, atom) + recurse atom, parent + end + def visit_re(regexp) + # downwards node(regexp.object_id, label: escape("re(#{regexp.inspect})")) + end + def visit_str(str) + # downwards node(str.object_id, label: 
escape("#{str.inspect}")) + end + + def escape str + str.gsub('"', "'") + end + def node name, opts={} + @graph.add_nodes name.to_s, opts + end + def downwards child + if @parent && !@known_links.include?([@parent, child]) + @graph.add_edges(@parent, child) + @known_links << [@parent, child] + end + end + def recurse node, current + @parent = current + node.accept(self) + end + end + + module Graphable + def graph opts + g = GraphViz.new(:G, type: :digraph) + visitor = GraphvizVisitor.new(g) + + new.accept(visitor) + + g.output opts + end + end + + class Parser # reopen for introducing the .graph method + extend Graphable + end +end \ No newline at end of file diff --git a/lib/parslet/parser.rb b/lib/parslet/parser.rb new file mode 100644 index 00000000..a78e443c --- /dev/null +++ b/lib/parslet/parser.rb @@ -0,0 +1,67 @@ + +# The base class for all your parsers. Use as follows: +# +# require 'parslet' +# +# class MyParser < Parslet::Parser +# rule(:a) { str('a').repeat } +# root(:a) +# end +# +# pp MyParser.new.parse('aaaa') # => 'aaaa' +# pp MyParser.new.parse('bbbb') # => Parslet::Atoms::ParseFailed: +# # Don't know what to do with bbbb at line 1 char 1. +# +# Parslet::Parser is also a grammar atom. This means that you can mix full +# fledged parsers freely with small parts of a different parser. +# +# Example: +# class ParserA < Parslet::Parser +# root :aaa +# rule(:aaa) { str('a').repeat(3,3) } +# end +# class ParserB < Parslet::Parser +# root :expression +# rule(:expression) { str('b') >> ParserA.new >> str('b') } +# end +# +# In the above example, ParserB would parse something like 'baaab'. 
+# +class Parslet::Parser < Parslet::Atoms::Base + include Parslet + + class < { +# :name => 'foobar', +# :args => [1, 2, 3] +# } +# } +# +# A pattern that would match against this tree would be: +# +# { :function_call => { :name => simple(:name), :args => sequence(:args) }} +# +# Note that Parslet::Pattern only matches at a given subtree; it wont try +# to match recursively. To do that, please use Parslet::Transform. +# +class Parslet::Pattern + def initialize(pattern) + @pattern = pattern + end + + # Decides if the given subtree matches this pattern. Returns the bindings + # made on a successful match or nil if the match fails. If you specify + # bindings to be a hash, the mappings in it will be treated like bindings + # made during an attempted match. + # + # Pattern.new('a').match('a', :foo => 'bar') # => { :foo => 'bar' } + # + # @param subtree [String, Hash, Array] poro subtree returned by a parse + # @param bindings [Hash] variable bindings to be verified + # @return [Hash, nil] On success: variable bindings that allow a match. On + # failure: nil + # + def match(subtree, bindings=nil) + bindings = bindings && bindings.dup || Hash.new + return bindings if element_match(subtree, @pattern, bindings) + end + + # Returns true if the tree element given by +tree+ matches the expression + # given by +exp+. This match must respect bindings already made in + # +bindings+. Note that bindings is carried along and modified. + # + # @api private + # + def element_match(tree, exp, bindings) + # p [:elm, tree, exp] + case [tree, exp].map { |e| e.class } + when [Hash,Hash] + return element_match_hash(tree, exp, bindings) + when [Array,Array] + return element_match_ary_single(tree, exp, bindings) + else + # If elements match exactly, then that is good enough in all cases + return true if exp === tree + + # If exp is a bind variable: Check if the binding matches + if exp.respond_to?(:can_bind?) 
&& exp.can_bind?(tree) + return element_match_binding(tree, exp, bindings) + end + + # Otherwise: No match (we don't know anything about the element + # combination) + return false + end + end + + # @api private + # + def element_match_binding(tree, exp, bindings) + var_name = exp.variable_name + + # TODO test for the hidden :_ feature. + if var_name && bound_value = bindings[var_name] + return bound_value == tree + end + + # New binding: + bindings.store var_name, tree + + return true + end + + # @api private + # + def element_match_ary_single(sequence, exp, bindings) + return false if sequence.size != exp.size + + return sequence.zip(exp).all? { |elt, subexp| + element_match(elt, subexp, bindings) } + end + + # @api private + # + def element_match_hash(tree, exp, bindings) + # Early failure when one hash is bigger than the other + return false unless exp.size == tree.size + + # We iterate over expected pattern, since we demand that the keys that + # are there should be in tree as well. + exp.each do |expected_key, expected_value| + return false unless tree.has_key? expected_key + + # Recurse into the value and stop early on failure + value = tree[expected_key] + return false unless element_match(value, expected_value, bindings) + end + + return true + end +end \ No newline at end of file diff --git a/lib/parslet/pattern/binding.rb b/lib/parslet/pattern/binding.rb new file mode 100644 index 00000000..2197db7f --- /dev/null +++ b/lib/parslet/pattern/binding.rb @@ -0,0 +1,49 @@ + +# Used internally for representing a bind placeholder in a Parslet::Transform +# pattern. This is the superclass for all bindings. +# +# It defines the most permissive kind of bind, the one that matches any subtree +# whatever it looks like. 
+# +class Parslet::Pattern::SubtreeBind < Struct.new(:symbol) + def variable_name + symbol + end + + def inspect + "#{bind_type_name}(#{symbol.inspect})" + end + + def can_bind?(subtree) + true + end + +private + def bind_type_name + if md=self.class.name.match(/(\w+)Bind/) + md.captures.first.downcase + else + # This path should never be used, but since this is for inspection only, + # let's not raise. + 'unknown_bind' + end + end +end + +# Binds a symbol to a simple subtree, one that is not either a sequence of +# elements or a collection of attributes. +# +class Parslet::Pattern::SimpleBind < Parslet::Pattern::SubtreeBind + def can_bind?(subtree) + not [Hash, Array].include?(subtree.class) + end +end + +# Binds a symbol to a sequence of simple leafs ([element1, element2, ...]) +# +class Parslet::Pattern::SequenceBind < Parslet::Pattern::SubtreeBind + def can_bind?(subtree) + subtree.kind_of?(Array) && + (not subtree.any? { |el| [Hash, Array].include?(el.class) }) + end +end \ No newline at end of file diff --git a/lib/parslet/rig/rspec.rb b/lib/parslet/rig/rspec.rb new file mode 100644 index 00000000..71f1aa77 --- /dev/null +++ b/lib/parslet/rig/rspec.rb @@ -0,0 +1,59 @@ +RSpec::Matchers.define(:parse) do |input, opts| + as = block = nil + result = trace = nil + + unless self.respond_to? :failure_message # if RSpec 2.x + class << self + alias_method :failure_message, :failure_message_for_should + alias_method :failure_message_when_negated, :failure_message_for_should_not + end + end + + match do |parser| + begin + result = parser.parse(input) + block ? + block.call(result) : + (as == result || as.nil?) + rescue Parslet::ParseFailed => ex + trace = ex.cause.ascii_tree if opts && opts[:trace] + false + end + end + + failure_message do |is| + if block + "expected output of parsing #{input.inspect}" << + " with #{is.inspect} to meet block conditions, but it didn't" + else + "expected " << + (as ? 
+ "output of parsing #{input.inspect}"<< + " with #{is.inspect} to equal #{as.inspect}, but was #{result.inspect}" : + "#{is.inspect} to be able to parse #{input.inspect}") << + (trace ? + "\n"+trace : + '') + end + end + + failure_message_when_negated do |is| + if block + "expected output of parsing #{input.inspect} with #{is.inspect} not to meet block conditions, but it did" + else + "expected " << + (as ? + "output of parsing #{input.inspect}"<< + " with #{is.inspect} not to equal #{as.inspect}" : + + "#{is.inspect} to not parse #{input.inspect}, but it did") + end + end + + # NOTE: This has a nodoc tag since the rdoc parser puts this into + # Object, a thing I would never allow. + chain :as do |expected_output, &block| + as = expected_output + block = block + end +end diff --git a/lib/parslet/scope.rb b/lib/parslet/scope.rb new file mode 100644 index 00000000..3c41e22a --- /dev/null +++ b/lib/parslet/scope.rb @@ -0,0 +1,42 @@ +class Parslet::Scope + # Raised when the accessed slot has never been assigned a value. + # + class NotFound < StandardError + end + + class Binding + attr_reader :parent + + def initialize(parent=nil) + @parent = parent + @hash = Hash.new + end + + def [](k) + @hash.has_key?(k) && @hash[k] || + parent && parent[k] or + raise NotFound + end + def []=(k,v) + @hash.store(k,v) + end + end + + def [](k) + @current[k] + end + def []=(k,v) + @current[k] = v + end + + def initialize + @current = Binding.new + end + + def push + @current = Binding.new(@current) + end + def pop + @current = @current.parent + end +end \ No newline at end of file diff --git a/lib/parslet/slice.rb b/lib/parslet/slice.rb new file mode 100644 index 00000000..3e2dd465 --- /dev/null +++ b/lib/parslet/slice.rb @@ -0,0 +1,101 @@ + +# A slice is a small part from the parse input. A slice mainly behaves like +# any other string, except that it remembers where it came from (offset in +# original input). 
+# +# == Extracting line and column +# +# Using the #line_and_column method, you can extract the line and column in +# the original input where this slice starts. +# +# Example: +# slice.line_and_column # => [1, 13] +# slice.offset # => 12 +# +# == Likeness to strings +# +# Parslet::Slice behaves in many ways like a Ruby String. This likeness +# however is not complete - many of the myriad of operations String supports +# are not yet in Slice. You can always extract the internal string instance by +# calling #to_s. +# +# These omissions are somewhat intentional. Rather than maintaining a full +# delegation, we opt for a partial emulation that gets the job done. +# +class Parslet::Slice + attr_reader :str, :offset + attr_reader :line_cache + + # Construct a slice using a string, an offset and an optional line cache. + # The line cache should be able to answer to the #line_and_column message. + # + def initialize(string, offset, line_cache=nil) + @str, @offset = string, offset + @line_cache = line_cache + end + + # Compares slices to other slices or strings. + # + def == other + str == other + end + + # Match regular expressions. + # + def match(regexp) + str.match(regexp) + end + + # Returns the slices size in characters. + # + def size + str.size + end + + # Concatenate two slices; it is assumed that the second slice begins + # where the first one ends. The offset of the resulting slice is the same + # as the one of this slice. + # + def +(other) + self.class.new(str + other.to_s, offset, line_cache) + end + + # Returns a tuple referring to the original input. + # + def line_and_column + raise ArgumentError, "No line cache was given, cannot infer line and column." 
\ + unless line_cache + + line_cache.line_and_column(self.offset) + end + + + # Conversion operators ----------------------------------------------------- + def to_str + str + end + alias to_s to_str + + def to_slice + self + end + def to_sym + str.to_sym + end + def to_int + Integer(str) + end + def to_i + str.to_i + end + def to_f + str.to_f + end + + # Inspection & Debugging --------------------------------------------------- + + # Prints the slice as "string"@offset. + def inspect + str.inspect << "@#{offset}" + end +end \ No newline at end of file diff --git a/lib/parslet/source.rb b/lib/parslet/source.rb new file mode 100644 index 00000000..a1b71e17 --- /dev/null +++ b/lib/parslet/source.rb @@ -0,0 +1,87 @@ + +require 'stringio' +require 'strscan' + +require 'parslet/source/line_cache' + +module Parslet + # Wraps the input string for parslet. + # + class Source + def initialize(str) + raise( + ArgumentError, + "Must construct Source with a string like object." + ) unless str.respond_to?(:to_str) + + @str = StringScanner.new(str) + + # maps 1 => /./m, 2 => /../m, etc... + @re_cache = Hash.new { |h,k| + h[k] = /(.|$){#{k}}/m } + + @line_cache = LineCache.new + @line_cache.scan_for_line_endings(0, str) + end + + # Checks if the given pattern matches at the current input position. + # + # @param pattern [Regexp] pattern to check for + # @return [Boolean] true if the pattern matches at #pos + # + def matches?(pattern) + @str.match?(pattern) + end + alias match matches? + + # Consumes n characters from the input, returning them as a slice of the + # input. + # + def consume(n) + original_pos = @str.pos + slice_str = @str.scan(@re_cache[n]) + slice = Parslet::Slice.new( + slice_str, + original_pos, + @line_cache) + + return slice + end + + # Returns how many chars remain in the input. + # + def chars_left + @str.rest_size + end + + # Returns how many chars there are between current position and the + # string given. 
If the string given doesn't occur in the source, then + # the remaining chars (#chars_left) are returned. + # + # @return [Fixnum] count of chars until str or #chars_left + # + def chars_until str + slice_str = @str.check_until(Regexp.new(Regexp.escape(str))) + return chars_left unless slice_str + return slice_str.size - str.size + end + + # Position of the parse as a character offset into the original string. + # @note: Encodings... + def pos + @str.pos + end + def pos=(n) + @str.pos = n + rescue RangeError + end + + # Returns a tuple for the given position. If no position is + # given, line/column information is returned for the current position + # given by #pos. + # + def line_and_column(position=nil) + @line_cache.line_and_column(position || self.pos) + end + end +end diff --git a/lib/parslet/source/line_cache.rb b/lib/parslet/source/line_cache.rb new file mode 100644 index 00000000..315193a4 --- /dev/null +++ b/lib/parslet/source/line_cache.rb @@ -0,0 +1,96 @@ + + +class Parslet::Source + # A cache for line start positions. + # + class LineCache + def initialize + # Stores line endings as a simple position number. The first line always + # starts at 0; numbers beyond the biggest entry are on any line > size, + # but probably make a scan to that position neccessary. + @line_ends = [] + @line_ends.extend RangeSearch + end + + # Returns a tuple for the given input position. + # + def line_and_column(pos) + eol_idx = @line_ends.lbound(pos) + + if eol_idx + # eol_idx points to the offset that ends the current line. + # Let's try to find the offset that starts it: + offset = eol_idx>0 && @line_ends[eol_idx-1] || 0 + return [eol_idx+1, pos-offset+1] + else + # eol_idx is nil, that means that we're beyond the last line end that + # we know about. Pretend for now that we're just on the last line. 
+ offset = @line_ends.last || 0 + return [@line_ends.size+1, pos-offset+1] + end + end + + def scan_for_line_endings(start_pos, buf) + return unless buf + + buf = StringScanner.new(buf) + return unless buf.exist?(/\n/) + + ## If we have already read part or all of buf, we already know about + ## line ends in that portion. remove it and correct cur (search index) + if @last_line_end && start_pos < @last_line_end + # Let's not search the range from start_pos to last_line_end again. + buf.pos = @last_line_end - start_pos + end + + ## Scan the string for line endings; store the positions of all endings + ## in @line_ends. + while buf.skip_until(/\n/) + @last_line_end = start_pos + buf.pos + @line_ends << @last_line_end + end + end + end + + # Mixin for arrays that implicitly give a number of ranges, where one range + # begins where the other one ends. + # + # Example: + # + # [10, 20, 30] + # # would describe [0, 10], (10, 20], (20, 30] + # + module RangeSearch + def find_mid(left, right) + # NOTE: Jonathan Hinkle reported that when mathn is required, just + # dividing and relying on the integer truncation is not enough. + left + ((right - left) / 2).floor + end + + # Scans the array for the first number that is > than bound. Returns the + # index of that number. + # + def lbound(bound) + return nil if empty? + return nil unless last > bound + + left = 0 + right = size - 1 + + loop do + mid = find_mid(left, right) + + if self[mid] > bound + right = mid + else + # assert: self[mid] <= bound + left = mid+1 + end + + if right <= left + return right + end + end + end + end +end diff --git a/lib/parslet/transform.rb b/lib/parslet/transform.rb new file mode 100644 index 00000000..b04fa29d --- /dev/null +++ b/lib/parslet/transform.rb @@ -0,0 +1,236 @@ + +require 'parslet/pattern' + +# Transforms an expression tree into something else. The transformation +# performs a depth-first, post-order traversal of the expression tree. 
During +# that traversal, each time a rule matches a node, the node is replaced by the +# result of the block associated to the rule. Otherwise the node is accepted +# as is into the result tree. +# +# This is almost what you would generally do with a tree visitor, except that +# you can match several levels of the tree at once. +# +# As a consequence of this, the resulting tree will contain pieces of the +# original tree and new pieces. Most likely, you will want to transform the +# original tree wholly, so this isn't a problem. +# +# You will not be able to create a loop, given that each node will be replaced +# only once and then left alone. This means that the results of a replacement +# will not be acted upon. +# +# Example: +# +# class Example < Parslet::Transform +# rule(:string => simple(:x)) { # (1) +# StringLiteral.new(x) +# } +# end +# +# A tree transform (Parslet::Transform) is defined by a set of rules. Each +# rule can be defined by calling #rule with the pattern as argument. The block +# given will be called every time the rule matches somewhere in the tree given +# to #apply. It is passed a Hash containing all the variable bindings of this +# pattern match. +# +# In the above example, (1) illustrates a simple matching rule. +# +# Let's say you want to parse matching parentheses and distill a maximum nest +# depth. You would probably write a parser like the one in example/parens.rb; +# here's the relevant part: +# +# rule(:balanced) { +# str('(').as(:l) >> balanced.maybe.as(:m) >> str(')').as(:r) +# } +# +# If you now apply this to a string like '(())', you get a intermediate parse +# tree that looks like this: +# +# { +# l: '(', +# m: { +# l: '(', +# m: nil, +# r: ')' +# }, +# r: ')' +# } +# +# This parse tree is good for debugging, but what we would really like to have +# is just the nesting depth. This transformation rule will produce that: +# +# rule(:l => '(', :m => simple(:x), :r => ')') { +# # innermost :m will contain nil +# x.nil? ? 
1 : x+1 +# } +# +# = Usage patterns +# +# There are four ways of using this class. The first one is very much +# recommended, followed by the second one for generality. The other ones are +# omitted here. +# +# Recommended usage is as follows: +# +# class MyTransformator < Parslet::Transform +# rule(...) { ... } +# rule(...) { ... } +# # ... +# end +# MyTransformator.new.apply(tree) +# +# Alternatively, you can use the Transform class as follows: +# +# transform = Parslet::Transform.new do +# rule(...) { ... } +# end +# transform.apply(tree) +# +# = Execution context +# +# The execution context of action blocks differs depending on the arity of +# said blocks. This can be confusing. It is however somewhat intentional. You +# should not create fat Transform descendants containing a lot of helper methods, +# instead keep your AST class construction in global scope or make it available +# through a factory. The following piece of code illustrates usage of global +# scope: +# +# transform = Parslet::Transform.new do +# rule(...) { AstNode.new(a_variable) } +# rule(...) { Ast.node(a_variable) } # modules are nice +# end +# transform.apply(tree) +# +# And here's how you would use a class builder (a factory): +# +# transform = Parslet::Transform.new do +# rule(...) { builder.add_node(a_variable) } +# rule(...) { |d| d[:builder].add_node(d[:a_variable]) } +# end +# transform.apply(tree, :builder => Builder.new) +# +# As you can see, Transform allows you to inject local context for your rule +# action blocks to use. +# +class Parslet::Transform + # FIXME: Maybe only part of it? Or maybe only include into constructor + # context? + include Parslet + + class << self + # FIXME: Only do this for subclasses? + include Parslet + + # Define a rule for the transform subclass. 
+ # + def rule(expression, &block) + @__transform_rules ||= [] + @__transform_rules << [Parslet::Pattern.new(expression), block] + end + + # Allows accessing the class' rules + # + def rules + @__transform_rules || [] + end + end + + def initialize(&block) + @rules = [] + + if block + instance_eval(&block) + end + end + + # Defines a rule to be applied whenever apply is called on a tree. A rule + # is composed of two parts: + # + # * an *expression pattern* + # * a *transformation block* + # + def rule(expression, &block) + @rules << [ + Parslet::Pattern.new(expression), + block + ] + end + + # Applies the transformation to a tree that is generated by Parslet::Parser + # or a simple parslet. Transformation will proceed down the tree, replacing + # parts/all of it with new objects. The resulting object will be returned. + # + def apply(obj, context=nil) + transform_elt( + case obj + when Hash + recurse_hash(obj, context) + when Array + recurse_array(obj, context) + else + obj + end, + context + ) + end + + # Executes the block on the bindings obtained by Pattern#match, if such a match + # can be made. Depending on the arity of the given block, it is called in + # one of two environments: the current one or a clean toplevel environment. + # + # If you would like the current environment preserved, please use the + # arity 1 variant of the block. Alternatively, you can inject a context object + # and call methods on it (think :ctx => self). + # + # # the local variable a is simulated + # t.call_on_match(:a => :b) { a } + # # no change of environment here + # t.call_on_match(:a => :b) { |d| d[:a] } + # + def call_on_match(bindings, block) + if block + if block.arity == 1 + return block.call(bindings) + else + context = Context.new(bindings) + return context.instance_eval(&block) + end + end + end + + # Allow easy access to all rules, the ones defined in the instance and the + # ones predefined in a subclass definition. 
+ # + def rules + self.class.rules + @rules + end + + # @api private + # + def transform_elt(elt, context) + rules.each do |pattern, block| + if bindings=pattern.match(elt, context) + # Produces transformed value + return call_on_match(bindings, block) + end + end + + # No rule matched - element is not transformed + return elt + end + + # @api private + # + def recurse_hash(hsh, ctx) + hsh.inject({}) do |new_hsh, (k,v)| + new_hsh[k] = apply(v, ctx) + new_hsh + end + end + # @api private + # + def recurse_array(ary, ctx) + ary.map { |elt| apply(elt, ctx) } + end +end + +require 'parslet/context' \ No newline at end of file