From 9c7ec14baf6458acba0830cf0fa879ed9d991909 Mon Sep 17 00:00:00 2001 From: 0x4261756D <38735823+0x4261756D@users.noreply.github.com> Date: Sun, 9 Jul 2023 06:28:43 +0200 Subject: [PATCH] Multiple improvements to compaction + ability to codegen for rust --- .gitignore | 3 + src/main/scala/Main.scala | 130 ++++++++++++++++++++++++++++++++++---- 2 files changed, 122 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 9e79245..99ce725 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,6 @@ metals.sbt .idea .idea_modules /.worksheet/ + +# project specific +*.rs \ No newline at end of file diff --git a/src/main/scala/Main.scala b/src/main/scala/Main.scala index 9e32438..3334fe5 100644 --- a/src/main/scala/Main.scala +++ b/src/main/scala/Main.scala @@ -111,6 +111,7 @@ def grammarToChomsky(grammar: Vector[GrammarRule]): Vector[GrammarRule] = val additional = unit.filter(_.lhs == rule.rhs(0).name).map((a: GrammarRule) => GrammarRule(rule.lhs, a.rhs)) unit.addAll(additional) rest = rest.diff(resolvable) + if nullable.contains("S_0") then unit.addOne(GrammarRule("S_0", Vector(GrammarRhs("@", true)))) return unit.toVector val blockRhs = EbnfRhs("block") @@ -227,19 +228,36 @@ val baseGrammar = ebnfToGrammar(baseEbnf) def printGrammar(grammar: Vector[EbnfRule | GrammarRule]) = grammar.fold("")(_.toString() + "\n" + _.toString()) +def removeDuplicates(grammar: Vector[GrammarRule]): Option[Vector[GrammarRule]] = + val nonterms = grammar.map(_.lhs).toSet + var rightSides: ArrayBuffer[Tuple2[Vector[Vector[GrammarRhs]], String]] = ArrayBuffer() + var cleanGrammar = grammar + for nonterm <- nonterms do + val rightSide = grammar.filter(_.lhs == nonterm).map(_.rhs) + val same = rightSides.find(a => a._2 != nonterm && rightSide.forall(b => a._1.exists(_.sameElements(b))) && a._1.forall(b => rightSide.exists(_.sameElements(b)))) + if same.isEmpty then rightSides.addOne(Tuple2(rightSide, nonterm)) + else + println(s"'$nonterm' already exists as '${same.get._2}' with $rightSide") + cleanGrammar = cleanGrammar.filter(_.lhs != nonterm).map(x => GrammarRule(x.lhs, x.rhs.map(y => if !y.terminal && y.name == nonterm then GrammarRhs(same.get._2) else y))) + if cleanGrammar.size == grammar.size then return None else return Some(cleanGrammar.toVector) + def removeDeadRules(grammar: Vector[GrammarRule], start: String): Vector[GrammarRule] = - var cleanGrammar: ArrayBuffer[GrammarRule] = ArrayBuffer() - for rule <- grammar do - if !cleanGrammar.exists(x => x.lhs == rule.lhs && x.rhs.sameElements(rule.rhs)) then cleanGrammar.addOne(rule) - println(s"Dedup: ${grammar.diff(cleanGrammar)}") + println(grammar.size) + var maybeCleanGrammar = removeDuplicates(grammar) + var cleanGrammar = grammar.toVector + while maybeCleanGrammar.isDefined do + cleanGrammar = maybeCleanGrammar.get + println(cleanGrammar.size) + maybeCleanGrammar = removeDuplicates(cleanGrammar) var result: Vector[GrammarRule] = Vector() - var current = grammar.filter(_.lhs == start) + var current = cleanGrammar.filter(_.lhs == start) assert(current.size > 0) while current.size > 0 do println(s"${current.size} ${result.size}") result = result.concat(current) - current = grammar.filter((rule: GrammarRule) => !result.exists(_.lhs == rule.lhs) && current.exists((a: GrammarRule) => a.rhs.exists(b => !b.terminal && b.name == rule.lhs))) - println(s"Reachability: ${grammar.diff(result)}") + current = cleanGrammar.filter((rule: GrammarRule) => !result.exists(_.lhs == rule.lhs) && current.exists((a: GrammarRule) => a.rhs.exists(b => !b.terminal && b.name == rule.lhs))) + println(s"Reachability: ${cleanGrammar.diff(result)}") + println(result.size) return result def CYK(grammar: Vector[GrammarRule], input: Vector[String], start: String): Option[AmbiguousNode] = @@ -341,13 +359,103 @@ class AmbiguousNode(val content: String, var precedence: Int, var left: Array[Am return ret end AmbiguousNode +def printGrammarAsRustLUT(grammar: Vector[GrammarRule]) = + var builder = StringBuilder() + builder.append("use crate::tokenizer::Token;\n\n") + builder.append("enum Rule\n{\n\tTerminal(usize, Token),\n\tNonTerminal(usize, usize, usize)\n}\n\n") + val nonterms = grammar.map(_.lhs).toSet.toArray + builder.append(s"const NONTERMS: [&str; ${nonterms.length}] =\n[\n") + for nonterm <- nonterms do + builder.append(s"\t\"$nonterm\",\n") + builder.append("];\n\n") + builder.append(s"const GRAMMAR: [Rule; ${grammar.size - 1}] =\n[\n") + for rule <- grammar do + if rule.rhs(0).name != "@" then + builder.append("\tRule::") + if rule.rhs.size == 1 then + assert(rule.rhs(0).terminal) + builder.append(s"Terminal(${nonterms.indexOf(rule.lhs)}, Token::") + builder.append( + rule.rhs(0).name match + case "Name" => "Name(String::new())" + case "and" => "And" + case "break" => "Break" + case "do" => "Do" + case "else" => "Else" + case "elseif" => "Elseif" + case "end" => "End" + case "false" => "False" + case "for" => "For" + case "function" => "Function" + case "goto" => "Goto" + case "if" => "If" + case "in" => "In" + case "local" => "Local" + case "nil" => "Nil" + case "not" => "Not" + case "or" => "Or" + case "repeat" => "Repeat" + case "return" => "Return" + case "then" => "Then" + case "true" => "True" + case "until" => "Until" + case "while" => "While" + case "+" => "Plus" + case "-" => "Minus" + case "*" => "Star" + case "/" => "Slash" + case "%" => "Percent" + case "^" => "Caret" + case "#" => "Hash" + case "&" => "Ampersand" + case "~" => "Tilde" + case "|" => "Pipe" + case "<<" => "LtLt" + case ">>" => "GtGt" + case "//" => "SlashSlash" + case "==" => "EqualsEquals" + case "~=" => "TildeEquals" + case "<=" => "LtEquals" + case ">=" => "GtEquals" + case "<" => "Lt" + case ">" => "Gt" + case "=" => "Equals" + case "(" => "RoundOpen" + case ")" => "RoundClosed" + case "{" => "CurlyOpen" + case "}" => "CurlyClosed" + case "[" => "SquareOpen" + case "]" => "SquareClosed" + case "::" => "ColonColon" + case ";" => "Semicolon" + case ":" => "Colon" + case "," => "Comma" + case "." => "Dot" + case ".." => "DotDot" + case "..." => "DotDotDot" + case "Numeral" => "Numeral(String::new())" + case "LiteralString" => "StringLiteral(String::new())" + case _ => throw Exception(s"unknown terminal ${rule.rhs(0).name}") + ) + builder.append("),\n") + else + assert(rule.rhs.size == 2 && !rule.rhs(0).terminal && !rule.rhs(1).terminal) + builder.append(s"NonTerminal(${nonterms.indexOf(rule.lhs)}, ${nonterms.indexOf(rule.rhs(0).name)}, ${nonterms.indexOf(rule.rhs(1).name)}),\n") + builder.append("];\n") + + println(builder.result()) + val fw = FileWriter(new File("grammar.rs")) + fw.write(builder.result()) + fw.close() + val chomskyGrammar = grammarToChomsky(baseGrammar).sortBy(x => x.lhs) val cleanChomskyGrammar = removeDeadRules(chomskyGrammar, "S_0").sortBy(x => x.lhs) -@main def main = println(printGrammar(cleanChomskyGrammar)) - /* println(printGrammar(chomskyGrammar)) - println("---") +@main def main =// println(printGrammar(cleanChomskyGrammar)) + //println(printGrammar(chomskyGrammar)) + //println("---") println(printGrammar(cleanChomskyGrammar)) val root = CYK(cleanChomskyGrammar, Vector("local", "Name", "=", "Numeral", "+", "Numeral", "+", "(", "Numeral", "*", "Numeral", ")"), "S_0").get println(root) println("===============\n\n\n") - println(disambiguate(root)) */ \ No newline at end of file + println(disambiguate(root)) + printGrammarAsRustLUT(cleanChomskyGrammar) \ No newline at end of file