Initial commit, barebones tokenizer working

2022-11-29 02:04:01 +01:00 · 2022-11-29 02:04:01 +01:00 · 9976ef9fe9
commit 9976ef9fe9
5 changed files with 144 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
+/target
--- a/Cargo.lock
+++ b/Cargo.lock
@ -0,0 +1,7 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "kurz"
+version = "0.1.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -0,0 +1,8 @@
+[package]
+name = "kurz"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
--- a/src/main.rs
+++ b/src/main.rs
@ -0,0 +1,127 @@
+use std::env;
+use std::fs;
+
+/// A lexical token produced by `tokenize`.
+///
+/// Every variant carries, in order: the token text, and the tokenizer's line and
+/// column counters at the moment the token was completed (both counters start at 1).
+#[derive(Debug)]
+enum Token
+{
+	/// A double-quoted string literal; the text is the content between the quotes.
+	StringLit(String, i32, i32),
+	/// A built-in word of the language (currently only `print` is recognized).
+	Intrinsic(String, i32, i32),
+}
+
+/// Entry point: parses the command line and runs the requested action.
+///
+/// Usage: `kurz (-c | --compile) <source file>`. With `-c`/`--compile` the source
+/// file is read, tokenized, and the resulting tokens are printed in debug form.
+///
+/// # Panics
+///
+/// Panics with a descriptive message when the option or the source file path is
+/// missing, when the option is unknown, or when the source file cannot be read.
+fn main()
+{
+	let args: Vec<String> = env::args().collect();
+	// Checked access instead of indexing: a missing argument should produce a
+	// usage hint rather than an opaque out-of-bounds panic.
+	let option = match args.get(1)
+	{
+		Some(option) => option.as_str(),
+		None => panic!("Missing option, usage: kurz (-c | --compile) <source file>")
+	};
+	match option
+	{
+		"-c" | "--compile" =>
+		{
+			let path = match args.get(2)
+			{
+				Some(path) => path,
+				None => panic!("Missing source file, usage: kurz (-c | --compile) <source file>")
+			};
+			let file_content = fs::read_to_string(path).expect("Could not read the source file");
+			let tokens: Vec<Token> = tokenize(&file_content);
+			println!("{:?}", tokens);
+		}
+		_ => panic!("Unknown option")
+	}
+}
+
+/// Splits `text` into a flat list of tokens.
+///
+/// The tokenizer is a small state machine over the characters of `text`:
+/// whitespace separates tokens, a `"` opens a string literal that runs until the
+/// next `"`, and any other run of non-whitespace characters forms a word that is
+/// classified by `word_to_token`. Characters inside a string literal are copied
+/// verbatim; escape sequences are not interpreted.
+///
+/// The position stored in each token is the value of the line/column counters
+/// when the token was completed: the closing quote for a string literal, and the
+/// character following the word (or end of input) for a word.
+///
+/// # Panics
+///
+/// Panics if the input ends inside a string literal, if a `"` appears in the
+/// middle of a word, or (via `todo!`) if a word is not a known intrinsic.
+fn tokenize(text: &str) -> Vec<Token>
+{
+	let mut tokens: Vec<Token> = Vec::new();
+	let mut line = 1;
+	let mut col = 1;
+	let mut state = TokenizerState::Whitespace;
+	let mut word = String::new();
+	for ch in text.chars()
+	{
+		match state
+		{
+			TokenizerState::Whitespace =>
+			{
+				if ch == '"'
+				{
+					state = TokenizerState::Quote;
+				}
+				else if !ch.is_whitespace()
+				{
+					state = TokenizerState::Rest;
+					word.push(ch);
+				}
+				// Plain whitespace between tokens is skipped.
+			}
+			TokenizerState::Quote =>
+			{
+				if ch == '"'
+				{
+					state = TokenizerState::Whitespace;
+					// `take` moves the text into the token and leaves `word` empty for the next one.
+					tokens.push(Token::StringLit(std::mem::take(&mut word), line, col));
+				}
+				else
+				{
+					word.push(ch);
+				}
+			}
+			TokenizerState::Rest =>
+			{
+				if ch.is_whitespace()
+				{
+					state = TokenizerState::Whitespace;
+					tokens.push(word_to_token(&word, line, col));
+					// Reset the buffer, otherwise the next word would be appended to this one.
+					word.clear();
+				}
+				else if ch == '"'
+				{
+					panic!("Having '\"' in the middle of a word is not allowed");
+				}
+				else
+				{
+					word.push(ch);
+				}
+			}
+		}
+		col += 1;
+		if ch == '\n'
+		{
+			col = 1;
+			line += 1;
+		}
+	}
+	// Flush whatever was still being read when the input ended.
+	match state
+	{
+		TokenizerState::Quote =>
+		{
+			panic!("Encountered EOF before closing string");
+		}
+		TokenizerState::Whitespace => {},
+		TokenizerState::Rest =>
+		{
+			tokens.push(word_to_token(&word, line, col));
+		}
+	}
+	tokens
+}
+
+/// Classifies a completed non-string word and builds the matching token.
+///
+/// Only `print` is known so far; any other word hits `todo!` and panics.
+fn word_to_token(word: &str, line: i32, col: i32) -> Token
+{
+	match word
+	{
+		"print" => Token::Intrinsic(word.to_owned(), line, col),
+		_ => todo!("Unknown word {}", word)
+	}
+}
+
+/// The state of the character-level state machine in `tokenize`.
+enum TokenizerState
+{
+	/// Between tokens: whitespace is skipped until a token starts.
+	Whitespace,
+	/// Inside a string literal, after the opening `"` and before the closing one.
+	Quote,
+	/// Inside a word, i.e. any non-string token, until the next whitespace.
+	Rest,
+}
--- a/test.qbl
+++ b/test.qbl
@ -0,0 +1 @@
+"Hello, World!\n" print