From a149609f03f4987b4d48998a96a9f3ddc634de13 Mon Sep 17 00:00:00 2001
From: 0x4261756D <38735823+0x4261756D@users.noreply.github.com>
Date: Tue, 6 Jun 2023 20:38:40 +0200
Subject: [PATCH] Implement more states for the tokenizer

---
 src/main.rs | 504 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 493 insertions(+), 11 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 69154fd..fe9b7f2 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,4 +1,4 @@
-use std::{env, fs, fmt::Error};
+use std::{env, fs};
 
 fn main()
 {
@@ -51,13 +51,14 @@ enum TokenizerState
 	An, Br, Do, El, En, Fa, Fo, Fu, Go, If, In, Lo, Ni, No, Or, Re, Th, Tr, Un, Wh,
 	LtLt, GtGt, SlashSlash, EqualsEquals, TildeEquals, LtEquals, GtEquals, ColonColon, DotDot,
-	SmallComment, QuoteBackslash, SingleQuoteBackslash, String, HexNumberX, ExpNumber,
+	SmallCommentStart, QuoteBackslash, SingleQuoteBackslash, String, HexNumberX, ExpNumber,
 	And, Bre, Els, End, Fal, For, Fun, Got, Loc, Nil, Not, Rep, Ret, The, Tru, Unt, Whi,
 	DotDotDot, HexNumber, QuoteBackslashZ, SingleQuoteBackslashZ,
-	BigComment,
+	BigCommentLongBracketStart, SmallComment,
 	Brea, Else, Fals, Func, Goto, Loca, Repe, Retu, Then, True, Unti, Whil, HexExpNumber,
+	BigComment, BigCommentLongBracketEnd,
 	Break, Elsei, False, Funct, Local, Repea, Retur, Until, While,
@@ -78,6 +79,7 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
 	let mut index = 0;
 	let mut token: Option<Token> = None;
 	let mut token_str: String = String::new();
+	let mut long_bracket_level = 0;
 	while index < char_vec.len()
 	{
@@ -282,6 +284,48 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
 					token = Some(Token::Hash);
 					state = TokenizerState::Hash;
 				}
+				'|' =>
+				{
+					last_index = index as i32;
+					token = Some(Token::Pipe);
+					state = TokenizerState::Pipe;
+				}
+				'&' =>
+				{
+					last_index = index as i32;
+					token = Some(Token::Ampersand);
+					state = TokenizerState::Ampersand;
+				}
+				'%' =>
+				{
+					last_index = index as i32;
+					token = Some(Token::Percent);
+					state = TokenizerState::Percent;
+				}
+				'*' =>
+				{
+					last_index = index as i32;
+					token = Some(Token::Star);
+					state = TokenizerState::Star;
+				}
+				'/' =>
+				{
+					last_index = index as i32;
+					token = Some(Token::Slash);
+					state = TokenizerState::Slash;
+				}
+				';' =>
+				{
+					last_index = index as i32;
+					token = Some(Token::Semicolon);
+					state = TokenizerState::Semicolon;
+				}
+				'^' =>
+				{
+					last_index = index as i32;
+					token = Some(Token::Caret);
+					state = TokenizerState::Caret;
+				}
 				'0' =>
 				{
 					last_index = index as i32;
@@ -318,7 +362,7 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
 				}
 				else
 				{
-					todo!("State {:?}, Char {}, {:?}", state, ch, tokens);
+					todo!("State {:?}, Char {}", state, ch);
 				}
 			}
 		}
@@ -710,7 +754,10 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
 			TokenizerState::Comma | TokenizerState::RoundOpen | TokenizerState::RoundClosed |
 			TokenizerState::CurlyOpen | TokenizerState::CurlyClosed | TokenizerState::Plus |
 			TokenizerState::TildeEquals | TokenizerState::EqualsEquals | TokenizerState::Hash |
-			TokenizerState::GtEquals | TokenizerState::LtEquals | TokenizerState::SquareOpen | TokenizerState::SquareClosed =>
+			TokenizerState::GtEquals | TokenizerState::LtEquals | TokenizerState::SquareOpen |
+			TokenizerState::SquareClosed | TokenizerState::Pipe | TokenizerState::Ampersand |
+			TokenizerState::Percent | TokenizerState::Star | TokenizerState::Semicolon |
+			TokenizerState::Caret =>
 			{
 				if last_index == -1 || token.is_none()
 				{
@@ -814,6 +861,74 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
 					}
 				}
 			}
+			TokenizerState::GtGt =>
+			{
+				if last_index == -1 || token.is_none()
+				{
+					println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+					return Err("Lexerr");
+				}
+				index = last_index as usize;
+				last_index = -1;
+				token = None;
+				token_str.clear();
+				state = TokenizerState::Start;
+				tokens.push(Token::GtGt);
+			}
+			TokenizerState::LtLt =>
+			{
+				if last_index == -1 || token.is_none()
+				{
+					println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+					return Err("Lexerr");
+				}
+				index = last_index as usize;
+				last_index = -1;
+				token = None;
+				token_str.clear();
+				state = TokenizerState::Start;
+				tokens.push(Token::LtLt);
+			}
+			TokenizerState::Slash =>
+			{
+				match ch
+				{
+					'/' =>
+					{
+						last_index = index as i32;
+						token = Some(Token::SlashSlash);
+						state = TokenizerState::SlashSlash;
+					}
+					_ =>
+					{
+						if last_index == -1 || token.is_none()
+						{
+							println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+							return Err("Lexerr");
+						}
+						index = last_index as usize;
+						last_index = -1;
+						tokens.push(token.clone().unwrap());
+						token = None;
+						token_str.clear();
+						state = TokenizerState::Start;
+					}
+				}
+			}
+			TokenizerState::SlashSlash =>
+			{
+				if last_index == -1 || token.is_none()
+				{
+					println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+					return Err("Lexerr");
+				}
+				index = last_index as usize;
+				last_index = -1;
+				token = None;
+				token_str.clear();
+				state = TokenizerState::Start;
+				tokens.push(Token::SlashSlash);
+			}
 			TokenizerState::Dot =>
 			{
 				match ch
@@ -940,7 +1055,7 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
 					{
 						last_index = index as i32;
 						token = None;
-						state = TokenizerState::SmallComment;
+						state = TokenizerState::SmallCommentStart;
 					}
 					_ =>
 					{
@@ -958,15 +1073,14 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
 					}
 				}
 			}
-			TokenizerState::SmallComment =>
+			TokenizerState::SmallCommentStart =>
 			{
 				match ch
 				{
 					'[' =>
 					{
-						last_index = index as i32;
 						token = None;
-						state = TokenizerState::BigComment;
+						state = TokenizerState::BigCommentLongBracketStart;
 					}
 					'\n' =>
 					{
@@ -975,10 +1089,71 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
 					}
 					_ =>
 					{
-						last_index = index as i32;
+						state = TokenizerState::SmallComment;
 					}
 				}
 			}
+			TokenizerState::SmallComment =>
+			{
+				match ch
+				{
+					'\n' =>
+					{
+						state = TokenizerState::Start;
+						last_index = -1;
+					}
+					_ => { }
+				}
+			}
+			TokenizerState::BigCommentLongBracketStart =>
+			{
+				match ch
+				{
+					'=' =>
+					{
+						long_bracket_level += 1;
+					}
+					'[' =>
+					{
+						state = TokenizerState::BigComment;
+					}
+					_ => return Err("Malformed long bracket at the beginning of a big comment"),
+				}
+			}
+			TokenizerState::BigComment =>
+			{
+				match ch
+				{
+					']' =>
+					{
+						state = TokenizerState::BigCommentLongBracketEnd;
+					}
+					_ => { }
+				}
+			}
+			TokenizerState::BigCommentLongBracketEnd =>
+			{
+				match ch
+				{
+					'=' =>
+					{
+						if long_bracket_level == 0
+						{
+							return Err("Long bracket level too big when ending big comment");
+						}
+						long_bracket_level -= 1;
+					}
+					']' =>
+					{
+						if long_bracket_level != 0
+						{
+							return Err("Long bracket level too small when ending big comment");
+						}
+						state = TokenizerState::Start;
+					}
+					_ => return Err("Malformed long bracket when ending big comment"),
+				}
+			}
 			TokenizerState::A =>
 			{
 				match ch
@@ -1772,6 +1947,141 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
 					tokens.push(Token::Return);
 				}
 			}
+			TokenizerState::Rep =>
+			{
+				match ch
+				{
+					'e' =>
+					{
+						last_index = index as i32;
+						token = Some(Token::Name("repe".to_string()));
+						token_str.push(ch);
+						state = TokenizerState::Repe;
+					}
+					_ =>
+					{
+						if ch.is_ascii_alphanumeric() || ch == '_'
+						{
+							last_index = index as i32;
+							token_str.push(ch);
+							token = Some(Token::Name(token_str.clone()));
+							state = TokenizerState::Name;
+						}
+						else
+						{
+							if last_index == -1 || token.is_none()
+							{
+								println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+								return Err("Lexerr");
+							}
+							index = last_index as usize;
+							last_index = -1;
+							tokens.push(token.unwrap().clone());
+							token = None;
+							token_str.clear();
+							state = TokenizerState::Start;
+						}
+					}
+				}
+			}
+			TokenizerState::Repe =>
+			{
+				match ch
+				{
+					'a' =>
+					{
+						last_index = index as i32;
+						token = Some(Token::Name("repea".to_string()));
+						token_str.push(ch);
+						state = TokenizerState::Repea;
+					}
+					_ =>
+					{
+						if ch.is_ascii_alphanumeric() || ch == '_'
+						{
+							last_index = index as i32;
+							token_str.push(ch);
+							token = Some(Token::Name(token_str.clone()));
+							state = TokenizerState::Name;
+						}
+						else
+						{
+							if last_index == -1 || token.is_none()
+							{
+								println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+								return Err("Lexerr");
+							}
+							index = last_index as usize;
+							last_index = -1;
+							tokens.push(token.unwrap().clone());
+							token = None;
+							token_str.clear();
+							state = TokenizerState::Start;
+						}
+					}
+				}
+			}
+			TokenizerState::Repea =>
+			{
+				match ch
+				{
+					't' =>
+					{
+						last_index = index as i32;
+						token = Some(Token::Name("repeat".to_string()));
+						token_str.push(ch);
+						state = TokenizerState::Repeat;
+					}
+					_ =>
+					{
+						if ch.is_ascii_alphanumeric() || ch == '_'
+						{
+							last_index = index as i32;
+							token_str.push(ch);
+							token = Some(Token::Name(token_str.clone()));
+							state = TokenizerState::Name;
+						}
+						else
+						{
+							if last_index == -1 || token.is_none()
+							{
+								println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+								return Err("Lexerr");
+							}
+							index = last_index as usize;
+							last_index = -1;
+							tokens.push(token.unwrap().clone());
+							token = None;
+							token_str.clear();
+							state = TokenizerState::Start;
+						}
+					}
+				}
+			}
+			TokenizerState::Repeat =>
+			{
+				if ch.is_ascii_alphanumeric() || ch == '_'
+				{
+					last_index = index as i32;
+					token_str.push(ch);
+					token = Some(Token::Name(token_str.clone()));
+					state = TokenizerState::Name;
+				}
+				else
+				{
+					if last_index == -1 || token.is_none()
+					{
+						println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+						return Err("Lexerr");
+					}
+					index = last_index as usize;
+					last_index = -1;
+					token = None;
+					token_str.clear();
+					state = TokenizerState::Start;
+					tokens.push(Token::Repeat);
+				}
+			}
 			TokenizerState::N =>
 			{
 				match ch
@@ -3334,7 +3644,179 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
 					tokens.push(Token::Local);
 				}
 			}
-			_ => todo!("State {:?} | {:?}", state, tokens)
+			TokenizerState::U =>
+			{
+				match ch
+				{
+					'n' =>
+					{
+						last_index = index as i32;
+						token = Some(Token::Name("un".to_string()));
+						token_str.push(ch);
+						state = TokenizerState::Un;
+					}
+					_ =>
+					{
+						if ch.is_ascii_alphanumeric() || ch == '_'
+						{
+							last_index = index as i32;
+							token_str.push(ch);
+							token = Some(Token::Name(token_str.clone()));
+							state = TokenizerState::Name;
+						}
+						else
+						{
+							if last_index == -1 || token.is_none()
+							{
+								println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+								return Err("Lexerr");
+							}
+							index = last_index as usize;
+							last_index = -1;
+							tokens.push(token.clone().unwrap());
+							token = None;
+							token_str.clear();
+							state = TokenizerState::Start;
+						}
+					}
+				}
+			}
+			TokenizerState::Un =>
+			{
+				match ch
+				{
+					't' =>
+					{
+						last_index = index as i32;
+						token = Some(Token::Name("unt".to_string()));
+						token_str.push(ch);
+						state = TokenizerState::Unt;
+					}
+					_ =>
+					{
+						if ch.is_ascii_alphanumeric() || ch == '_'
+						{
+							last_index = index as i32;
+							token_str.push(ch);
+							token = Some(Token::Name(token_str.clone()));
+							state = TokenizerState::Name;
+						}
+						else
+						{
+							if last_index == -1 || token.is_none()
+							{
+								println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+								return Err("Lexerr");
+							}
+							index = last_index as usize;
+							last_index = -1;
+							tokens.push(token.unwrap().clone());
+							token = None;
+							token_str.clear();
+							state = TokenizerState::Start;
+						}
+					}
+				}
+			}
+			TokenizerState::Unt =>
+			{
+				match ch
+				{
+					'i' =>
+					{
+						last_index = index as i32;
+						token = Some(Token::Name("unti".to_string()));
+						token_str.push(ch);
+						state = TokenizerState::Unti;
+					}
+					_ =>
+					{
+						if ch.is_ascii_alphanumeric() || ch == '_'
+						{
+							last_index = index as i32;
+							token_str.push(ch);
+							token = Some(Token::Name(token_str.clone()));
+							state = TokenizerState::Name;
+						}
+						else
+						{
+							if last_index == -1 || token.is_none()
+							{
+								println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+								return Err("Lexerr");
+							}
+							index = last_index as usize;
+							last_index = -1;
+							tokens.push(token.unwrap().clone());
+							token = None;
+							token_str.clear();
+							state = TokenizerState::Start;
+						}
+					}
+				}
+			}
+			TokenizerState::Unti =>
+			{
+				match ch
+				{
+					'l' =>
+					{
+						last_index = index as i32;
+						token = Some(Token::Name("until".to_string()));
+						token_str.push(ch);
+						state = TokenizerState::Until;
+					}
+					_ =>
+					{
+						if ch.is_ascii_alphanumeric() || ch == '_'
+						{
+							last_index = index as i32;
+							token_str.push(ch);
+							token = Some(Token::Name(token_str.clone()));
+							state = TokenizerState::Name;
+						}
+						else
+						{
+							if last_index == -1 || token.is_none()
+							{
+								println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+								return Err("Lexerr");
+							}
+							index = last_index as usize;
+							last_index = -1;
+							tokens.push(token.unwrap().clone());
+							token = None;
+							token_str.clear();
+							state = TokenizerState::Start;
+						}
+					}
+				}
+			}
+			TokenizerState::Until =>
+			{
+				if ch.is_ascii_alphanumeric() || ch == '_'
+				{
+					last_index = index as i32;
+					token_str.push(ch);
+					token = Some(Token::Name(token_str.clone()));
+					state = TokenizerState::Name;
+				}
+				else
+				{
+					if last_index == -1 || token.is_none()
+					{
+						println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+						return Err("Lexerr");
+					}
+					index = last_index as usize;
+					last_index = -1;
+					token = None;
+					token_str.clear();
+					state = TokenizerState::Start;
+					tokens.push(Token::Until);
+				}
+			}
+			_ => todo!("State {:?}", state)
 		}
 		index += 1;
 	}
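
Note on the new big-comment states (not part of the patch above): long_bracket_level implements Lua's long brackets, where a comment opened with --[==[ can only be closed by ]==] carrying the same number of '=' signs. The opener counts each '=' up into the level, the closer counts them back down, and the final ']' must see the level at exactly zero. The following is a minimal, self-contained sketch of the same counting scheme; the helper name skip_big_comment and its slice-in/index-out interface are invented for illustration, not taken from this codebase. Like the patch, the sketch bails out with an error on a stray ']' inside the comment body, which full Lua would instead treat as ordinary comment text.

	// Recognizes the tail of a Lua big comment. `input` starts right after
	// the leading "--[", e.g. the "==[ hello ]==]" part of "--[==[ hello ]==]".
	// Returns the index just past the closing bracket, or an error.
	fn skip_big_comment(input: &[char]) -> Result<usize, &'static str>
	{
		enum CommentState { LongBracketStart, Body, LongBracketEnd }
		let mut state = CommentState::LongBracketStart;
		let mut level = 0;
		for (i, &ch) in input.iter().enumerate()
		{
			match state
			{
				// Count '=' signs until the second '[' opens the body.
				CommentState::LongBracketStart => match ch
				{
					'=' => level += 1,
					'[' => state = CommentState::Body,
					_ => return Err("Malformed long bracket at the beginning of a big comment"),
				},
				// Skip everything until a ']' might start the closer.
				CommentState::Body => if ch == ']'
				{
					state = CommentState::LongBracketEnd;
				},
				// Drain the level back down; the final ']' must see level == 0.
				CommentState::LongBracketEnd => match ch
				{
					'=' if level > 0 => level -= 1,
					'=' => return Err("Long bracket level too big when ending big comment"),
					']' if level == 0 => return Ok(i + 1),
					']' => return Err("Long bracket level too small when ending big comment"),
					_ => return Err("Malformed long bracket when ending big comment"),
				},
			}
		}
		Err("Unterminated big comment")
	}

	fn main()
	{
		// The opener counted two '='s, so only "]==]" may close the comment.
		let chars: Vec<char> = "==[ hello ]==]".chars().collect();
		assert_eq!(skip_big_comment(&chars), Ok(chars.len()));
	}

Keeping the level in a single shared counter is why the patch must reject mismatched closers: once the '=' signs of a candidate closer have been counted down, the original opening level is gone, so the scan cannot fall back into the comment body and try again later.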