From ea19554b44d65538446ce7258db2fbd3c9f953c5 Mon Sep 17 00:00:00 2001
From: Torsten Ruger
Date: Wed, 4 Jun 2014 19:55:04 +0300
Subject: [PATCH] adds crystals parser code

---
 lib/parser/README.markdown        | 56 ++++++++++++++++++++
 lib/parser/basic_types.rb         | 48 +++++++++++++++++
 lib/parser/call_site.rb           | 17 ++++++
 lib/parser/compound_types.rb      | 19 +++++++
 lib/parser/control.rb             | 20 +++++++
 lib/parser/crystal.rb             | 37 +++++++++++++
 lib/parser/expression.rb          | 18 +++++++
 lib/parser/function_definition.rb | 17 ++++++
 lib/parser/keywords.rb            | 28 ++++++++++
 lib/parser/module_definition.rb   | 15 ++++++
 lib/parser/operators.rb           | 52 ++++++++++++++++++
 lib/parser/tokens.rb              | 22 ++++++++
 lib/parser/transform.rb           | 87 +++++++++++++++++++++++++++++++
 13 files changed, 436 insertions(+)
 create mode 100644 lib/parser/README.markdown
 create mode 100644 lib/parser/basic_types.rb
 create mode 100644 lib/parser/call_site.rb
 create mode 100644 lib/parser/compound_types.rb
 create mode 100644 lib/parser/control.rb
 create mode 100644 lib/parser/crystal.rb
 create mode 100644 lib/parser/expression.rb
 create mode 100644 lib/parser/function_definition.rb
 create mode 100644 lib/parser/keywords.rb
 create mode 100644 lib/parser/module_definition.rb
 create mode 100644 lib/parser/operators.rb
 create mode 100644 lib/parser/tokens.rb
 create mode 100644 lib/parser/transform.rb

diff --git a/lib/parser/README.markdown b/lib/parser/README.markdown
new file mode 100644
index 0000000..4c1c279
--- /dev/null
+++ b/lib/parser/README.markdown
@@ -0,0 +1,56 @@
+Parser
+================
+
+This includes the parser and generated ast.
+
+Parslet is really great in that it:
+- does not generate code but instead gives a clean dsl to define a grammar
+- uses ruby modules so one can split the grammars up
+- has support for binary operators with precedence and binding
+- has a separate transform stage to generate an ast layer
+
+Especially the last point is great. Since it is separate it does not clutter up the actual grammar.
+And it can generate a layer that has no links to the actual parser anymore, thus saving/automating
+a complete transformation process.
+
+
+Operator list from http://stackoverflow.com/questions/21060234/ruby-operator-precedence-table
+
+N  A  M  Operator(s)            Description
+-  -  -  -----------            -----------
+1  R  Y  ! ~ +                  boolean NOT, bitwise complement, unary plus
+                                (unary plus may be redefined from Ruby 1.9 with +@)
+
+2  R  Y  **                     exponentiation
+1  R  Y  -                      unary minus (redefine with -@)
+
+2  L  Y  * / %                  multiplication, division, modulo (remainder)
+2  L  Y  + -                    addition (or concatenation), subtraction
+
+2  L  Y  << >>                  bitwise shift-left (or append), bitwise shift-right
+2  L  Y  &                      bitwise AND
+
+2  L  Y  | ^                    bitwise OR, bitwise XOR (exclusive OR)
+2  L  Y  < <= >= >              ordering
+
+2  N  Y  == === != =~ !~ <=>    equality, pattern matching, comparison
+                                (!= and !~ may not be redefined prior to Ruby 1.9)
+
+2  L  N  &&                     boolean AND
+2  L  N  ||                     boolean OR
+
+2  N  N  .. ...                 range creation (inclusive and exclusive)
+                                and boolean flip-flops
+
+3  R  N  ? :                    ternary if-then-else (conditional)
+2  L  N  rescue                 exception-handling modifier
+
+2  R  N  =                      assignment
+2  R  N  **= *= /= %= += -=     assignment
+2  R  N  <<= >>=                assignment
+2  R  N  &&= &= ||= |= ^=       assignment
+
+1  N  N  defined?               test variable definition and type
+1  R  N  not                    boolean NOT (low precedence)
+2  L  N  and or                 boolean AND, boolean OR (low precedence)
+2  N  N  if unless while until  conditional and loop modifiers
\ No newline at end of file
diff --git a/lib/parser/basic_types.rb b/lib/parser/basic_types.rb
new file mode 100644
index 0000000..1b83dfb
--- /dev/null
+++ b/lib/parser/basic_types.rb
@@ -0,0 +1,48 @@
+module Parser
+  # Basic types are numbers and strings
+  module BasicTypes
+    include Parslet
+    # space really is just space. ruby is newline sensitive, so there is more whitespace footwork
+    # rule of thumb is that anything eats space behind it, but only space, no newlines
+    rule(:space)  { (str("\t") | str(' ')).repeat(1) }  # "\t" must be double quoted to be a tab
+    rule(:space?) { space.maybe }
+    rule(:linebreak){ str("\n") >> space? >> linebreak.repeat }
+
+    rule(:quote)    { str('"') }
+    rule(:nonquote) { str('"').absent? >> any }
+
+    rule(:comment){ match('#') >> (linebreak.absent? >> any).repeat >> linebreak }
+    rule(:newline) { linebreak | comment }
+    rule(:eol)     { newline | any.absent? }
+
+    rule(:double_quote){ str('"') }
+    rule(:minus)       { str('-') }
+    rule(:plus_sign)   { str('+') }  # not :plus, Operators defines that name and would shadow this rule
+
+    rule(:sign)     { plus_sign | minus }
+    rule(:dot)      { str('.') }
+    rule(:digit)    { match('[0-9]') }
+    rule(:exponent_mark) { (str('e')| str('E')) }  # not :exponent, Operators uses that name for **
+
+    # identifier must start with lower case
+    # (a name may begin with a keyword, like if_true; only a whole keyword is refused, see Keywords)
+    rule(:name)   { keyword.absent? >> (match['a-z_'] >> match['a-zA-Z0-9_'].repeat).as(:name) >> space? }
+    # instance variables must have the @
+    rule(:instance_variable) { (str('@') >> name).as(:instance_variable) }
+    # and class/module names must start with capital
+    # (admittedly the rule matches constants too, but one step at a time)
+    rule(:module_name) { keyword.absent? >> (match['A-Z'] >> match['a-zA-Z0-9_'].repeat).as(:module_name) >> space? }
+
+    rule(:escape)     { str('\\') >> any.as(:esc) }
+    rule(:string)     { quote >> (
+      escape |
+      nonquote.as(:char)
+    ).repeat(1).as(:string) >> quote }
+
+    rule(:integer)    { (sign.maybe >> digit.repeat(1)).as(:integer) >> space? }  # sign is part of the value
+
+    rule(:float) { integer >> dot >> integer >>
+                  (exponent_mark >> sign.maybe >> digit.repeat(1,3)).maybe >> space?}
+    rule(:basic_type){ integer | name | string | float | instance_variable | module_name } # NOTE: integer wins, float never matches yet
+  end
+end
\ No newline at end of file
diff --git a/lib/parser/call_site.rb b/lib/parser/call_site.rb
new file mode 100644
index 0000000..5d0edac
--- /dev/null
+++ b/lib/parser/call_site.rb
@@ -0,0 +1,17 @@
+module Parser
+  module CallSite
+    include Parslet
+
+    # parenthesised, comma separated (and possibly empty) list of arguments
+    rule(:argument_list) {
+      left_parenthesis >>
+      (  ((operator_expression|value_expression).as(:argument) >> space? >>
+        (comma >> space? >> (operator_expression|value_expression).as(:argument)).repeat(0)).repeat(0,1)).as(:argument_list) >>
+      space? >> right_parenthesis
+    }
+
+    # a call is a name with an argument list, optionally qualified by a receiver and a dot
+    rule(:call_site) { ((module_name|instance_variable|name).as(:receiver) >> str(".")).maybe >> #possibly qualified
+                        name.as(:call_site) >> argument_list >> comment.maybe}
+  end
+end
diff --git a/lib/parser/compound_types.rb b/lib/parser/compound_types.rb
new file mode 100644
index 0000000..f5a6900
--- /dev/null
+++ b/lib/parser/compound_types.rb
@@ -0,0 +1,19 @@
+module Parser
+  # Compound types are Arrays and Hashes
+  module CompoundTypes
+    include Parslet
+
+    rule(:array_constant) do
+      left_bracket >>
+      (  ((operator_expression|value_expression).as(:array_element) >> space? >>
+        (comma >> space? >> (operator_expression|value_expression).as(:array_element)).repeat(0)).repeat(0,1)).as(:array_constant) >>
+      space? >> right_bracket
+    end
+
+    rule(:hash_pair) { basic_type.as(:hash_key) >> association >> (operator_expression|value_expression).as(:hash_value) }
+    rule(:hash_constant) { left_brace >> ((hash_pair.as(:hash_pair) >>
+                          (comma >> space? >> hash_pair.as(:hash_pair)).repeat(0)).repeat(0,1)).as(:hash_constant)>>
+                          space? >> right_brace }
+
+  end
+end
\ No newline at end of file
diff --git a/lib/parser/control.rb b/lib/parser/control.rb
new file mode 100644
index 0000000..2542438
--- /dev/null
+++ b/lib/parser/control.rb
@@ -0,0 +1,20 @@
+module Parser
+  module Control
+    include Parslet
+    rule(:conditional) do
+      keyword_if >>
+      (( (operator_expression|value_expression).as(:conditional) ) |  # operators first, or "a == b" stops after a
+        left_parenthesis >> (operator_expression|value_expression).as(:conditional) >> right_parenthesis) >>
+      newline >> expressions_else.as(:if_true) >> newline >> expressions_end.as(:if_false)
+    end
+
+    rule(:while_do) do
+      keyword_while >> left_parenthesis >> (operator_expression|value_expression).as(:while_cond) >>
+      right_parenthesis >> keyword_do >> newline >>
+      expressions_end.as(:body)
+    end
+    rule(:simple_return) do
+      keyword_return >> (operator_expression|value_expression).as(:return_expression)
+    end
+  end
+end
diff --git a/lib/parser/crystal.rb b/lib/parser/crystal.rb
new file mode 100644
index 0000000..bb52818
--- /dev/null
+++ b/lib/parser/crystal.rb
@@ -0,0 +1,37 @@
+require "parslet"
+require_relative "basic_types"
+require_relative "compound_types"
+require_relative "tokens"
+require_relative "keywords"
+require_relative "control"
+require_relative "expression"
+require_relative "call_site"
+require_relative "function_definition"
+require_relative "module_definition"
+require_relative "operators"
+
+module Parser
+  # obviously a work in progress !!
+  # We "compose" the parser from bits, divide and hopefully conquer
+
+  # a note about .maybe : .maybe is in almost every respect the same as .repeat(0,1)
+  # so either 0, or 1, in other words maybe. Nice feature, but there are strings attached:
+  # a maybe reduces the sequence (array) to a single (hash). Thus 2 transformations are needed
+  # More work than the prettiness is worth, so only use .maybe on something that does not need capturing
+
+  class Crystal < Parslet::Parser
+    include BasicTypes
+    include CompoundTypes
+    include Tokens
+    include Keywords
+    include Control
+    include Expression
+    include CallSite
+    include FunctionDefinition
+    include Operators
+    include ModuleDef
+
+    rule(:root_body) {(module_definition | class_definition | function_definition | expression | call_site )}
+    rule(:root) { root_body.repeat() }
+  end
+end
diff --git a/lib/parser/expression.rb b/lib/parser/expression.rb
new file mode 100644
index 0000000..77e315f
--- /dev/null
+++ b/lib/parser/expression.rb
@@ -0,0 +1,18 @@
+module Parser
+  module Expression
+    include Parslet
+
+    rule(:value_expression) { call_site | basic_type }
+
+    rule(:expression) { (simple_return | while_do | conditional | operator_expression | call_site ) >> newline }
+
+    # one or more expressions, up to and including the given delimiting keyword
+    def delimited_expressions(delimit)
+      ( (delimit.absent? >> expression).repeat(1)).as(:expressions) >> delimit
+    end
+
+    rule(:expressions_do)   { delimited_expressions(keyword_do) }
+    rule(:expressions_else) { delimited_expressions(keyword_else) }
+    rule(:expressions_end)  { delimited_expressions(keyword_end) }
+  end
+end
diff --git a/lib/parser/function_definition.rb b/lib/parser/function_definition.rb
new file mode 100644
index 0000000..1aa1815
--- /dev/null
+++ b/lib/parser/function_definition.rb
@@ -0,0 +1,17 @@
+module Parser
+  module FunctionDefinition
+    include Parslet
+
+    rule(:function_definition) {
+      keyword_def >> ((module_name|instance_variable|name).as(:receiver) >> str(".")).maybe >> #possibly qualified
+        name.as(:function_name) >> parmeter_list.maybe >> newline >> expressions_end >> newline
+    }
+
+    rule(:parmeter_list) {
+      left_parenthesis >>
+        ((name.as(:parmeter) >> (comma >> name.as(:parmeter)).repeat(0)).repeat(0,1)).as(:parmeter_list) >>
+      right_parenthesis
+    }
+
+  end
+end
diff --git a/lib/parser/keywords.rb b/lib/parser/keywords.rb
new file mode 100644
index 0000000..478e3f2
--- /dev/null
+++ b/lib/parser/keywords.rb
@@ -0,0 +1,28 @@
+module Parser
+  module Keywords
+    include Parslet
+
+    rule(:keyword_begin)  { str('begin').as(:begin) >> space?}
+    rule(:keyword_class)  { str('class') >> space? }
+    rule(:keyword_def)    { str('def') >> space? }
+    rule(:keyword_do)     { str('do').as(:do) >> space?}
+    rule(:keyword_else)   { str('else').as(:else) >> space? }
+    rule(:keyword_end)    { str('end').as(:end) >> space? }
+    rule(:keyword_false)  { str('false').as(:false) >> space?}
+    rule(:keyword_if)     { str('if').as(:if) >> space? }
+    rule(:keyword_rescue) { str('rescue').as(:rescue) >> space?}
+    rule(:keyword_return) { str('return').as(:return) >> space?}
+    rule(:keyword_true)   { str('true').as(:true) >> space?}
+    rule(:keyword_module) { str('module') >> space? }
+    rule(:keyword_nil)    { str('nil').as(:nil) >> space?}
+    rule(:keyword_unless) { str('unless').as(:unless) >> space?}
+    rule(:keyword_until)  { str('until').as(:until) >> space?}
+    rule(:keyword_while)  { str('while').as(:while) >> space?}
+
+    # this rule is just to make sure identifiers can't be keywords. Kind of duplication here, but we need the
+    # space in above rules, so add any new keyword here too. Whole words only, so if_true is still a name.
+    rule(:keyword){ (str('begin') | str('class') | str('def') | str('do') | str('else') | str('end') |
+                     str('false') | str('if') | str('module') | str('nil') | str('rescue') | str('return') |
+                     str('true') | str('unless') | str('until') | str('while')) >> match['a-zA-Z0-9_'].absent? }
+  end
+end
\ No newline at end of file
diff --git a/lib/parser/module_definition.rb b/lib/parser/module_definition.rb
new file mode 100644
index 0000000..023685f
--- /dev/null
+++ b/lib/parser/module_definition.rb
@@ -0,0 +1,15 @@
+module Parser
+  module ModuleDef
+    include Parslet
+    rule(:module_definition) do
+      keyword_module >> module_name >> eol >>
+      ( (keyword_end.absent? >> root_body).repeat()).as(:module_expressions) >> keyword_end >> newline
+    end
+
+    rule(:class_definition) do
+      keyword_class >> module_name >> eol >>
+      ( (keyword_end.absent? >> root_body).repeat()).as(:class_expressions) >> keyword_end >> newline
+    end
+
+  end
+end
diff --git a/lib/parser/operators.rb b/lib/parser/operators.rb
new file mode 100644
index 0000000..dac03e6
--- /dev/null
+++ b/lib/parser/operators.rb
@@ -0,0 +1,52 @@
+module Parser
+  module Operators
+    include Parslet
+    rule(:exponent)      { str('**') >> space?}
+    rule(:multiply)      { match['*/%'] >> space? }
+    rule(:plus)          { match['+-'] >> space? }
+    rule(:shift)         { (str(">>") | str("<<")) >> space?}  # parenthesis needed, >> binds tighter than |
+    rule(:bit_and)       { str('&') >> space?}
+    rule(:bit_or)        { str('|') >> space?}
+    rule(:greater_equal) { str('>=') >> space?}
+    rule(:less_or_equal) { str('<=') >> space?}
+    rule(:larger)        { str('>') >> space?}
+    rule(:smaller)       { str('<') >> space?}
+    rule(:identity)      { str('===') >> space?}
+    rule(:equal)         { str('==') >> space?}
+    rule(:not_equal)     { str('!=') >> space?}
+    rule(:boolean_and)   { (str('&&') | str("and")) >> space?}
+    rule(:boolean_or)    { (str('||') | str("or")) >> space?}
+    rule(:assign)        { str('=') >> space?}
+    rule(:op_assign)     { (str('+=')|str('-=')|str('*=')|str('/=')|str('%=')) >> space?}
+    rule(:eclipse)       { (str('...') | str('..')) >> space?}  # longest first, or ... never matches
+
+    # infix doing the heavy lifting here,
+    # is defined as an expression and an array of [atom, priority, binding] triples.
+    # Operators are tried in the order listed, so longer tokens precede their prefixes (+= before +, && before &)
+    rule(:operator_expression) do infix_expression(value_expression,
+       [op_assign, 20, :right],
+       [exponent, 130, :right] ,
+       [multiply, 120, :left] ,
+       [plus, 110, :left],
+       [shift, 100, :left],
+       [boolean_and, 60, :left],
+       [boolean_or, 50, :right],
+       [bit_and, 90, :left],
+       [bit_or, 90, :right],
+       [greater_equal, 80, :left],
+       [less_or_equal, 80, :left],
+       [larger, 80, :left],
+       [smaller, 80, :left],
+       [identity, 70, :right],
+       [equal, 70, :right],
+       [not_equal, 70, :right],
+       [eclipse, 40, :right],
+       [keyword_rescue, 30, :right],
+       [assign, 20, :right],
+       [keyword_until, 10, :right],
+       [keyword_while, 10, :right],
+       [keyword_unless, 10, :right],
+       [keyword_if, 10, :right])
+    end
+  end
+end
diff --git a/lib/parser/tokens.rb b/lib/parser/tokens.rb
new file mode 100644
index 0000000..da9b5e0
--- /dev/null
+++ b/lib/parser/tokens.rb
@@ -0,0 +1,22 @@
+module Parser
+  # Tokens are single or double character combinations with "meaning"
+  # braces, comma, point, questionmark , quotes, that kind of thing
+  # operator symbols are separate in Operators
+  module Tokens
+    include Parslet
+    rule(:left_parenthesis)  { str('(') >> space? }
+    rule(:right_parenthesis) { str(')') >> space? }
+    rule(:left_brace)        { str('{') >> space? }
+    rule(:right_brace)       { str('}') >> space? }
+    rule(:left_bracket)      { str('[') >> space? }
+    rule(:right_bracket)     { str(']') >> space? }
+
+    rule(:association)       { str("=>") >> space? }
+    rule(:comma)             { str(',') >> space? }
+    rule(:colon)             { str(':') >> space? }
+    rule(:semicolon)         { str(';') >> space? }
+    rule(:question_mark)     { str('?') >> space? }
+    rule(:excamation_mark)   { str('!') >> space? }
+
+  end
+end
\ No newline at end of file
diff --git a/lib/parser/transform.rb b/lib/parser/transform.rb
new file mode 100644
index 0000000..cddbcef
--- /dev/null
+++ b/lib/parser/transform.rb
@@ -0,0 +1,87 @@
+require 'parslet'
+require 'ast/expression'
+require_relative 'crystal'
+
+module Parser
+  class Transform < Parslet::Transform
+    rule(:string => sequence(:chars)) { Ast::StringExpression.new chars.join }
+    rule(:esc => simple(:esc)) { '\\' + esc }
+    rule(char: simple(:char)) { char }
+
+    rule(:integer => simple(:value)) { Ast::IntegerExpression.new(value.to_i) }
+    rule(:name => simple(:name)) { Ast::NameExpression.new(name.to_s) }
+    rule(:instance_variable => simple(:instance_variable)) { Ast::VariableExpression.new(instance_variable.name) }
+    rule(:module_name => simple(:module_name)) { Ast::ModuleName.new(module_name.to_s) }
+
+    rule(:array_constant => sequence(:array_constant) ) { Ast::ArrayExpression.new(array_constant) }
+    rule(:array_element => simple(:array_element)) { array_element }
+    rule(:hash_constant => sequence(:hash_constant) ) { Ast::HashExpression.new(hash_constant) }
+    rule(:hash_key => simple(:hash_key) , :hash_value => simple(:hash_value)) { Ast::AssociationExpression.new(hash_key,hash_value) }
+    rule(:hash_pair => simple(:hash_pair) ) { hash_pair }
+
+    rule(:argument => simple(:argument)) { argument }
+    rule(:argument_list => sequence(:argument_list)) { argument_list }
+
+    #Two rules for calls, simple and qualified. Keeps the rules simpler
+    rule( :call_site => simple(:call_site),
+          :argument_list => sequence(:argument_list)) do
+      Ast::CallSiteExpression.new(call_site.name, argument_list )
+    end
+    rule( :receiver => simple(:receiver) , :call_site => simple(:call_site),
+          :argument_list => sequence(:argument_list)) do
+      Ast::CallSiteExpression.new(call_site.name, argument_list , receiver)
+    end
+
+    rule(:if => simple(:if), :conditional => simple(:conditional),
+         :if_true => {:expressions => sequence(:if_true) , :else => simple(:else) },
+         :if_false => {:expressions => sequence(:if_false) , :end => simple(:e) }) do
+      Ast::IfExpression.new(conditional, if_true, if_false)
+    end
+
+    rule(:while => simple(:while),
+         :while_cond => simple(:while_cond) , :do => simple(:do) ,
+         :body => {:expressions => sequence(:body) , :end => simple(:e) }) do
+      Ast::WhileExpression.new(while_cond, body)
+    end
+
+    rule(:return => simple(:return) , :return_expression => simple(:return_expression))do
+      Ast::ReturnExpression.new(return_expression)
+    end
+
+    rule(:parmeter => simple(:parmeter)) { parmeter }
+    rule(:parmeter_list => sequence(:parmeter_list)) { parmeter_list }
+
+    # Also two rules for function definitions, unqualified and qualified
+    rule(:function_name => simple(:function_name),
+         :parmeter_list => sequence(:parmeter_list),
+         :expressions => sequence(:expressions) , :end => simple(:e)) do
+      Ast::FunctionExpression.new(function_name.name, parmeter_list, expressions)
+    end
+
+    rule(:receiver=> simple(:receiver),
+         :function_name => simple(:function_name),
+         :parmeter_list => sequence(:parmeter_list),
+         :expressions => sequence(:expressions) , :end => simple(:e)) do
+      Ast::FunctionExpression.new(function_name.name, parmeter_list, expressions , receiver)
+    end
+
+    rule(l: simple(:l), o: simple(:o) , r: simple(:r)) do
+      Ast::OperatorExpression.new( o.to_s.strip , l ,r)
+    end
+
+    #modules and classes are understandably quite similar Class < Module
+    rule( :module_name => simple(:module_name) , :module_expressions => sequence(:module_expressions) , :end=>"end") do
+      Ast::ModuleExpression.new(module_name , module_expressions)
+    end
+    rule( :module_name => simple(:module_name) , :class_expressions => sequence(:class_expressions) , :end=>"end") do
+      Ast::ClassExpression.new(module_name , class_expressions)
+    end
+
+    # shortcut to get the ast tree for a given string (Parser is a module, the parser class is Crystal)
+    # optional second argument specifies a rule that will be parsed (mainly for testing)
+    def self.ast(string, rule = :root)
+      syntax = Crystal.new.public_send(rule).parse(string)
+      Transform.new.apply(syntax)
+    end
+  end
+end