Multiple improvements to grammar compaction + ability to generate code for Rust

This commit is contained in:
0x4261756D 2023-07-09 06:28:43 +02:00
parent 3e7599be7e
commit 9c7ec14baf
2 changed files with 122 additions and 11 deletions

3
.gitignore vendored
View File

@ -30,3 +30,6 @@ metals.sbt
.idea
.idea_modules
/.worksheet/
# project specific
*.rs

View File

@ -111,6 +111,7 @@ def grammarToChomsky(grammar: Vector[GrammarRule]): Vector[GrammarRule] =
val additional = unit.filter(_.lhs == rule.rhs(0).name).map((a: GrammarRule) => GrammarRule(rule.lhs, a.rhs))
unit.addAll(additional)
rest = rest.diff(resolvable)
if nullable.contains("S_0") then unit.addOne(GrammarRule("S_0", Vector(GrammarRhs("@", true))))
return unit.toVector
val blockRhs = EbnfRhs("block")
@ -227,19 +228,36 @@ val baseGrammar = ebnfToGrammar(baseEbnf)
/** Renders a grammar as text, one rule per line.
  *
  * Each rule's `toString` is preceded by a newline, so a non-empty result starts with "\n";
  * an empty grammar yields the empty string.
  *
  * @param grammar the rules to render, in the order they should appear
  * @return the rendered text
  */
def printGrammar(grammar: Vector[EbnfRule | GrammarRule]): String =
  // mkString builds the text in one pass; the "\n" prefix keeps the leading newline.
  if grammar.isEmpty then "" else grammar.mkString("\n", "\n", "")
/** Performs one pass of merging nonterminals whose sets of right-hand sides are identical.
  *
  * For every left-hand side, its alternatives are compared against those of the nonterminals
  * registered so far. When a match is found, all rules of the later nonterminal are dropped and
  * every non-terminal reference to it is redirected to the registered one.
  *
  * @param grammar the rules to compact
  * @return `Some(compacted)` when the pass removed rules, `None` when the rule count is unchanged
  */
def removeDuplicates(grammar: Vector[GrammarRule]): Option[Vector[GrammarRule]] =
  // True when every alternative in `wanted` also occurs (element-wise) in `pool`.
  def covers(pool: Vector[Vector[GrammarRhs]], wanted: Vector[Vector[GrammarRhs]]): Boolean =
    wanted.forall(alternative => pool.exists(_.sameElements(alternative)))

  val symbols = grammar.map(_.lhs).toSet
  // Alternatives of each nonterminal kept so far, paired with that nonterminal's name.
  val registered: ArrayBuffer[Tuple2[Vector[Vector[GrammarRhs]], String]] = ArrayBuffer()
  var pruned = grammar
  for symbol <- symbols do
    // Alternatives are always read from the unmodified input grammar.
    val alternatives = grammar.filter(_.lhs == symbol).map(_.rhs)
    val twin = registered.find(entry =>
      entry._2 != symbol && covers(entry._1, alternatives) && covers(alternatives, entry._1)
    )
    twin match
      case None =>
        registered.addOne(Tuple2(alternatives, symbol))
      case Some(entry) =>
        val keeper = entry._2
        println(s"'$symbol' already exists as '$keeper' with $alternatives")
        pruned = pruned
          .filter(_.lhs != symbol)
          .map { rule =>
            val redirected = rule.rhs.map { part =>
              if !part.terminal && part.name == symbol then GrammarRhs(keeper) else part
            }
            GrammarRule(rule.lhs, redirected)
          }
  // Dropping a duplicate always removes at least one rule, so an equal size means "no change".
  if pruned.size == grammar.size then None else Some(pruned.toVector)
/** Compacts a grammar and then discards every rule that cannot be reached from `start`.
  *
  * First, `removeDuplicates` is applied repeatedly until it reports no further change.
  * Then the rules are collected in waves: the rules of `start`, then the rules of every
  * nonterminal referenced by the previous wave that has not been collected yet, and so on.
  * Progress information is printed along the way.
  *
  * @param grammar the rules to clean up
  * @param start   name of the start nonterminal
  * @return the reachable rules of the compacted grammar, in the order they were discovered
  */
def removeDeadRules(grammar: Vector[GrammarRule], start: String): Vector[GrammarRule] =
  println(grammar.size)

  // Keep merging equivalent nonterminals until a pass finds nothing left to merge.
  def compact(rules: Vector[GrammarRule]): Vector[GrammarRule] =
    removeDuplicates(rules) match
      case Some(smaller) =>
        println(smaller.size)
        compact(smaller)
      case None => rules

  val cleanGrammar = compact(grammar)

  var result: Vector[GrammarRule] = Vector()
  var current = cleanGrammar.filter(_.lhs == start)
  // NOTE(review): compaction knows nothing about `start`; if it merged the start symbol into
  // an equivalent nonterminal, this assertion is what reports it — confirm whether that can happen.
  assert(current.nonEmpty, s"no rules found for start symbol '$start'")
  while current.nonEmpty do
    println(s"${current.size} ${result.size}")
    result = result.concat(current)
    // Next wave: rules whose left-hand side is referenced by the wave just added
    // and has not been collected before.
    current = cleanGrammar.filter { rule =>
      !result.exists(_.lhs == rule.lhs) &&
        current.exists(_.rhs.exists(part => !part.terminal && part.name == rule.lhs))
    }
  println(s"Reachability: ${cleanGrammar.diff(result)}")
  println(result.size)
  result
def CYK(grammar: Vector[GrammarRule], input: Vector[String], start: String): Option[AmbiguousNode] =
@ -341,13 +359,103 @@ class AmbiguousNode(val content: String, var precedence: Int, var left: Array[Am
return ret
end AmbiguousNode
/** Maps a grammar terminal to the Rust expression that follows `Token::` in the generated
  * source.
  *
  * @param terminal the terminal's name as it appears in the grammar
  * @return the variant name, with a `String::new()` payload for the payload-carrying variants
  * @throws Exception when the terminal has no entry in the table
  */
private def rustTokenFor(terminal: String): String =
  terminal match
    case "Name" => "Name(String::new())"
    case "and" => "And"
    case "break" => "Break"
    case "do" => "Do"
    case "else" => "Else"
    case "elseif" => "Elseif"
    case "end" => "End"
    case "false" => "False"
    case "for" => "For"
    case "function" => "Function"
    case "goto" => "Goto"
    case "if" => "If"
    case "in" => "In"
    case "local" => "Local"
    case "nil" => "Nil"
    case "not" => "Not"
    case "or" => "Or"
    case "repeat" => "Repeat"
    case "return" => "Return"
    case "then" => "Then"
    case "true" => "True"
    case "until" => "Until"
    case "while" => "While"
    case "+" => "Plus"
    case "-" => "Minus"
    case "*" => "Star"
    case "/" => "Slash"
    case "%" => "Percent"
    case "^" => "Caret"
    case "#" => "Hash"
    case "&" => "Ampersand"
    case "~" => "Tilde"
    case "|" => "Pipe"
    case "<<" => "LtLt"
    case ">>" => "GtGt"
    case "//" => "SlashSlash"
    case "==" => "EqualsEquals"
    case "~=" => "TildeEquals"
    case "<=" => "LtEquals"
    case ">=" => "GtEquals"
    case "<" => "Lt"
    case ">" => "Gt"
    case "=" => "Equals"
    case "(" => "RoundOpen"
    case ")" => "RoundClosed"
    case "{" => "CurlyOpen"
    case "}" => "CurlyClosed"
    case "[" => "SquareOpen"
    case "]" => "SquareClosed"
    case "::" => "ColonColon"
    case ";" => "Semicolon"
    case ":" => "Colon"
    case "," => "Comma"
    case "." => "Dot"
    case ".." => "DotDot"
    case "..." => "DotDotDot"
    case "Numeral" => "Numeral(String::new())"
    case "LiteralString" => "StringLiteral(String::new())"
    case _ => throw Exception(s"unknown terminal $terminal")

/** Emits the grammar as Rust source: a `Rule` enum, a `NONTERMS` name table and a `GRAMMAR`
  * rule table whose entries refer to nonterminals by their index in `NONTERMS`.
  *
  * Rules whose first right-hand-side symbol is named "@" are left out of `GRAMMAR`. Every other
  * rule must be either a single terminal or a pair of nonterminals. The generated text is
  * printed to standard output and written to `grammar.rs` in the working directory.
  *
  * @param grammar the rules to emit
  * @throws Exception when a terminal or a referenced nonterminal is unknown
  */
def printGrammarAsRustLUT(grammar: Vector[GrammarRule]) =
  val builder = StringBuilder()
  builder.append("use crate::tokenizer::Token;\n\n")
  builder.append("enum Rule\n{\n\tTerminal(usize, Token),\n\tNonTerminal(usize, usize, usize)\n}\n\n")

  val nonterms = grammar.map(_.lhs).toSet.toVector
  val positions = nonterms.zipWithIndex.toMap
  // Fail here rather than emit -1 into a Rust `usize` slot.
  def indexOfNonterm(name: String): Int =
    positions.getOrElse(name, throw Exception(s"unknown nonterminal $name"))

  builder.append(s"const NONTERMS: [&str; ${nonterms.length}] =\n[\n")
  for nonterm <- nonterms do
    builder.append(s"\t\"$nonterm\",\n")
  builder.append("];\n\n")

  // The declared array length must equal the number of entries actually written,
  // however many "@" rules the grammar contains (including none).
  val emitted = grammar.filter(_.rhs(0).name != "@")
  builder.append(s"const GRAMMAR: [Rule; ${emitted.size}] =\n[\n")
  for rule <- emitted do
    builder.append("\tRule::")
    if rule.rhs.size == 1 then
      assert(rule.rhs(0).terminal)
      builder.append(s"Terminal(${indexOfNonterm(rule.lhs)}, Token::")
      builder.append(rustTokenFor(rule.rhs(0).name))
      builder.append("),\n")
    else
      assert(rule.rhs.size == 2 && !rule.rhs(0).terminal && !rule.rhs(1).terminal)
      builder.append(s"NonTerminal(${indexOfNonterm(rule.lhs)}, ${indexOfNonterm(rule.rhs(0).name)}, ${indexOfNonterm(rule.rhs(1).name)}),\n")
  builder.append("];\n")

  val source = builder.result()
  println(source)
  val fw = FileWriter(new File("grammar.rs"))
  // Close the writer even when the write fails.
  try fw.write(source)
  finally fw.close()
// Result of grammarToChomsky on the base grammar, ordered by left-hand side.
val chomskyGrammar = grammarToChomsky(baseGrammar).sortBy(_.lhs)
// The same grammar after removeDeadRules with start symbol "S_0", ordered by left-hand side again.
val cleanChomskyGrammar = removeDeadRules(chomskyGrammar, "S_0").sortBy(_.lhs)
/** Entry point: prints the cleaned grammar, runs CYK on a fixed sample token sequence and
  * prints the resulting tree before and after `disambiguate`, then generates the Rust tables.
  *
  * When the sample input cannot be parsed, a message is printed instead of the trees and the
  * Rust tables are still generated.
  */
@main def main =
  // For comparison with the unpruned grammar, additionally print printGrammar(chomskyGrammar).
  println(printGrammar(cleanChomskyGrammar))
  val sample = Vector("local", "Name", "=", "Numeral", "+", "Numeral", "+", "(", "Numeral", "*", "Numeral", ")")
  CYK(cleanChomskyGrammar, sample, "S_0") match
    case Some(root) =>
      println(root)
      println("===============\n\n\n")
      println(disambiguate(root))
    case None =>
      println(s"CYK found no parse for: ${sample.mkString(" ")}")
  printGrammarAsRustLUT(cleanChomskyGrammar)