Implement more states for the tokenizer

0x4261756D 2023-06-06 20:38:40 +02:00
parent 21c5098cf7
commit a149609f03
1 changed file with 493 additions and 11 deletions


@@ -1,4 +1,4 @@
use std::{env, fs, fmt::Error};
use std::{env, fs};
fn main()
{
@@ -51,13 +51,14 @@ enum TokenizerState
An, Br, Do, El, En, Fa, Fo, Fu, Go, If, In, Lo, Ni, No, Or, Re, Th, Tr, Un, Wh,
LtLt, GtGt, SlashSlash, EqualsEquals, TildeEquals, LtEquals, GtEquals, ColonColon, DotDot,
SmallComment, QuoteBackslash, SingleQuoteBackslash, String, HexNumberX, ExpNumber,
SmallCommentStart, QuoteBackslash, SingleQuoteBackslash, String, HexNumberX, ExpNumber,
And, Bre, Els, End, Fal, For, Fun, Got, Loc, Nil, Not, Rep, Ret, The, Tru, Unt, Whi,
DotDotDot, HexNumber, QuoteBackslashZ, SingleQuoteBackslashZ,
BigComment,
BigCommentLongBracketStart, SmallComment,
Brea, Else, Fals, Func, Goto, Loca, Repe, Retu, Then, True, Unti, Whil, HexExpNumber,
BigComment, BigCommentLongBracketEnd,
Break, Elsei, False, Funct, Local, Repea, Retur, Until, While,
@@ -78,6 +79,7 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
let mut index = 0;
let mut token: Option<Token> = None;
let mut token_str: String = String::new();
let mut long_bracket_level = 0;
while index < char_vec.len()
{
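The loop above drives a character-at-a-time state machine: `last_index` remembers where the last complete token ended, so when a longer match fails the lexer rewinds `index` and emits what it already had (maximal munch). A minimal, self-contained sketch of that rewind idea, using hypothetical `Tok`/`State` names rather than this file's types:

// Sketch of the rewind-on-failure (maximal munch) pattern used above,
// reduced to a two-operator language: `<` and `<<`.
#[derive(Debug, PartialEq)]
enum Tok { Lt, LtLt }

enum State { Start, Lt }

fn lex(input: &str) -> Vec<Tok> {
    let chars: Vec<char> = input.chars().collect();
    let mut tokens = Vec::new();
    let mut state = State::Start;
    let mut token: Option<Tok> = None;
    let mut last_index: i32 = -1;
    let mut index = 0;
    while index < chars.len() {
        let ch = chars[index];
        match state {
            State::Start => {
                if ch == '<' {
                    // `<` is already a complete token; remember where.
                    last_index = index as i32;
                    token = Some(Tok::Lt);
                    state = State::Lt;
                }
            }
            State::Lt => {
                if ch == '<' {
                    // `<<` is complete and cannot extend further.
                    tokens.push(Tok::LtLt);
                    token = None;
                    last_index = -1;
                    state = State::Start;
                } else {
                    // Longer match failed: rewind to the last complete
                    // token, emit it, and rescan the current character.
                    index = last_index as usize;
                    last_index = -1;
                    tokens.push(token.take().unwrap());
                    state = State::Start;
                }
            }
        }
        index += 1;
    }
    if let Some(t) = token {
        tokens.push(t); // flush a token still pending at end of input
    }
    tokens
}

fn main() {
    assert_eq!(lex("<<<"), vec![Tok::LtLt, Tok::Lt]);
    assert_eq!(lex("<a<"), vec![Tok::Lt, Tok::Lt]);
}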
@@ -282,6 +284,48 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
token = Some(Token::Hash);
state = TokenizerState::Hash;
}
'|' =>
{
last_index = index as i32;
token = Some(Token::Pipe);
state = TokenizerState::Pipe;
}
'&' =>
{
last_index = index as i32;
token = Some(Token::Ampersand);
state = TokenizerState::Ampersand;
}
'%' =>
{
last_index = index as i32;
token = Some(Token::Percent);
state = TokenizerState::Percent;
}
'*' =>
{
last_index = index as i32;
token = Some(Token::Star);
state = TokenizerState::Star;
}
'/' =>
{
last_index = index as i32;
token = Some(Token::Slash);
state = TokenizerState::Slash;
}
';' =>
{
last_index = index as i32;
token = Some(Token::Semicolon);
state = TokenizerState::Semicolon;
}
'^' =>
{
last_index = index as i32;
token = Some(Token::Caret);
state = TokenizerState::Caret;
}
'0' =>
{
last_index = index as i32;
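Each arm added above binds one character to one token plus a same-named terminal state, so the seven new cases differ only in the names they mention. As a hedged aside, the character-to-token mapping can also be expressed as a single lookup function; `SingleCharToken` below is a hypothetical stand-in for this file's `Token` enum, not code from the commit:

// Self-contained sketch: one lookup instead of seven identical arms.
#[derive(Debug, Clone, Copy, PartialEq)]
enum SingleCharToken { Pipe, Ampersand, Percent, Star, Slash, Semicolon, Caret }

fn single_char_token(ch: char) -> Option<SingleCharToken> {
    use SingleCharToken::*;
    match ch {
        '|' => Some(Pipe),
        '&' => Some(Ampersand),
        '%' => Some(Percent),
        '*' => Some(Star),
        '/' => Some(Slash),
        ';' => Some(Semicolon),
        '^' => Some(Caret),
        _ => None,
    }
}

fn main() {
    assert_eq!(single_char_token('%'), Some(SingleCharToken::Percent));
    assert_eq!(single_char_token('x'), None);
}

Note that `/` is only provisionally a `Slash` here: the `Slash` state further down still has to check for a following `/` before `SlashSlash` can win.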
@@ -318,7 +362,7 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
}
else
{
todo!("State {:?}, Char {}, {:?}", state, ch, tokens);
todo!("State {:?}, Char {}", state, ch);
}
}
}
@@ -710,7 +754,10 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
TokenizerState::Comma | TokenizerState::RoundOpen | TokenizerState::RoundClosed |
TokenizerState::CurlyOpen | TokenizerState::CurlyClosed | TokenizerState::Plus |
TokenizerState::TildeEquals | TokenizerState::EqualsEquals | TokenizerState::Hash |
TokenizerState::GtEquals | TokenizerState::LtEquals | TokenizerState::SquareOpen | TokenizerState::SquareClosed =>
TokenizerState::GtEquals | TokenizerState::LtEquals | TokenizerState::SquareOpen |
TokenizerState::SquareClosed | TokenizerState::Pipe | TokenizerState::Ampersand |
TokenizerState::Percent | TokenizerState::Star | TokenizerState::Semicolon |
TokenizerState::Caret =>
{
if last_index == -1 || token.is_none()
{
@@ -814,6 +861,74 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
}
}
}
TokenizerState::GtGt =>
{
if last_index == -1 || token.is_none()
{
println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
return Err("Lexerr");
}
index = last_index as usize;
last_index = -1;
token = None;
token_str.clear();
state = TokenizerState::Start;
tokens.push(Token::GtGt);
}
TokenizerState::LtLt =>
{
if last_index == -1 || token.is_none()
{
println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
return Err("Lexerr");
}
index = last_index as usize;
last_index = -1;
token = None;
token_str.clear();
state = TokenizerState::Start;
tokens.push(Token::LtLt);
}
TokenizerState::Slash =>
{
match ch
{
'/' =>
{
last_index = index as i32;
token = Some(Token::SlashSlash);
state = TokenizerState::SlashSlash;
}
_ =>
{
if last_index == -1 || token.is_none()
{
println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
return Err("Lexerr");
}
index = last_index as usize;
last_index = -1;
tokens.push(token.clone().unwrap());
token = None;
token_str.clear();
state = TokenizerState::Start;
}
}
}
TokenizerState::SlashSlash =>
{
if last_index == -1 || token.is_none()
{
println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
return Err("Lexerr");
}
index = last_index as usize;
last_index = -1;
token = None;
token_str.clear();
state = TokenizerState::Start;
tokens.push(Token::SlashSlash);
}
TokenizerState::Dot =>
{
match ch
@@ -940,7 +1055,7 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
{
last_index = index as i32;
token = None;
state = TokenizerState::SmallComment;
state = TokenizerState::SmallCommentStart;
}
_ =>
{
@@ -958,15 +1073,14 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
}
}
}
TokenizerState::SmallComment =>
TokenizerState::SmallCommentStart =>
{
match ch
{
'[' =>
{
last_index = index as i32;
token = None;
state = TokenizerState::BigComment;
state = TokenizerState::BigCommentLongBracketStart;
}
'\n' =>
{
@@ -975,10 +1089,71 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
}
_ =>
{
last_index = index as i32;
state = TokenizerState::SmallComment;
}
}
}
TokenizerState::SmallComment =>
{
match ch
{
'\n' =>
{
state = TokenizerState::Start;
last_index = -1;
}
_ => { }
}
}
TokenizerState::BigCommentLongBracketStart =>
{
match ch
{
'=' =>
{
long_bracket_level += 1;
}
'[' =>
{
state = TokenizerState::BigComment;
}
_ => return Err("Malformed long bracket at the beginning of a big comment"),
}
}
TokenizerState::BigComment =>
{
match ch
{
']' =>
{
state = TokenizerState::BigCommentLongBracketEnd;
}
_ => { }
}
}
TokenizerState::BigCommentLongBracketEnd =>
{
match ch
{
'=' =>
{
if long_bracket_level == 0
{
return Err("Long bracket level too big when ending big comment");
}
long_bracket_level -= 1;
}
']' =>
{
if long_bracket_level != 0
{
return Err("Long bracket level too small when ending big comment");
}
state = TokenizerState::Start;
}
_ => return Err("Malformed long bracket when ending big comment"),
}
}
TokenizerState::A =>
{
match ch
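The states above implement both comment forms: `SmallComment` eats a `--` line comment up to the newline, while `BigCommentLongBracketStart`, `BigComment`, and `BigCommentLongBracketEnd` implement Lua's long brackets, where `--[==[` must be closed by `]==]` with exactly the same number of `=` signs; that count is what `long_bracket_level` tracks. A self-contained sketch of the matching rule, assuming the caller has already consumed `--[` (the `skip_long_comment` name is hypothetical):

// Sketch of Lua long-bracket matching: count the `=` signs in the
// opener, then look for `]`, the same number of `=` signs, and `]`.
fn skip_long_comment(chars: &[char], mut index: usize) -> Option<usize> {
    let mut level = 0;
    while index < chars.len() && chars[index] == '=' {
        level += 1; // one `=` per bracket level
        index += 1;
    }
    if index >= chars.len() || chars[index] != '[' {
        return None; // opener must end with a second `[`
    }
    index += 1;
    while index < chars.len() {
        if chars[index] == ']' {
            let mut j = index + 1;
            let mut eq = 0;
            while j < chars.len() && chars[j] == '=' {
                eq += 1;
                j += 1;
            }
            if eq == level && j < chars.len() && chars[j] == ']' {
                return Some(j + 1); // position just past the closer
            }
        }
        index += 1;
    }
    None // unterminated comment
}

fn main() {
    let src: Vec<char> = "==[ comment ]=] ]==] rest".chars().collect();
    assert_eq!(skip_long_comment(&src, 0), Some(20));
}

One behavioral difference worth flagging: the sketch treats a `]` followed by a non-matching sequence as ordinary comment text and keeps scanning, while `BigCommentLongBracketEnd` above returns an error for any character other than `=` or `]`.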
@@ -1772,6 +1947,141 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
tokens.push(Token::Return);
}
}
TokenizerState::Rep =>
{
match ch
{
'e' =>
{
last_index = index as i32;
token = Some(Token::Name("repe".to_string()));
token_str.push(ch);
state = TokenizerState::Repe;
}
_ =>
{
if ch.is_ascii_alphanumeric() || ch == '_'
{
last_index = index as i32;
token_str.push(ch);
token = Some(Token::Name(token_str.clone()));
state = TokenizerState::Name;
}
else
{
if last_index == -1 || token.is_none()
{
println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
return Err("Lexerr");
}
index = last_index as usize;
last_index = -1;
tokens.push(token.unwrap().clone());
token = None;
token_str.clear();
state = TokenizerState::Start;
}
}
}
}
TokenizerState::Repe =>
{
match ch
{
'a' =>
{
last_index = index as i32;
token = Some(Token::Name("repea".to_string()));
token_str.push(ch);
state = TokenizerState::Repea;
}
_ =>
{
if ch.is_ascii_alphanumeric() || ch == '_'
{
last_index = index as i32;
token_str.push(ch);
token = Some(Token::Name(token_str.clone()));
state = TokenizerState::Name;
}
else
{
if last_index == -1 || token.is_none()
{
println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
return Err("Lexerr");
}
index = last_index as usize;
last_index = -1;
tokens.push(token.unwrap().clone());
token = None;
token_str.clear();
state = TokenizerState::Start;
}
}
}
}
TokenizerState::Repea =>
{
match ch
{
't' =>
{
last_index = index as i32;
token = Some(Token::Name("repeat".to_string()));
token_str.push(ch);
state = TokenizerState::Repeat;
}
_ =>
{
if ch.is_ascii_alphanumeric() || ch == '_'
{
last_index = index as i32;
token_str.push(ch);
token = Some(Token::Name(token_str.clone()));
state = TokenizerState::Name;
}
else
{
if last_index == -1 || token.is_none()
{
println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
return Err("Lexerr");
}
index = last_index as usize;
last_index = -1;
tokens.push(token.unwrap().clone());
token = None;
token_str.clear();
state = TokenizerState::Start;
}
}
}
}
TokenizerState::Repeat =>
{
if ch.is_ascii_alphanumeric() || ch == '_'
{
last_index = index as i32;
token_str.push(ch);
token = Some(Token::Name(token_str.clone()));
state = TokenizerState::Name;
}
else
{
if last_index == -1 || token.is_none()
{
println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
return Err("Lexerr");
}
index = last_index as usize;
last_index = -1;
token = None;
token_str.clear();
state = TokenizerState::Start;
tokens.push(Token::Repeat);
}
}
TokenizerState::N =>
{
match ch
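The `Rep` → `Repe` → `Repea` → `Repeat` chain above (like the `U` → `Un` → `Unt` → `Unti` → `Until` chain below) spells the keyword out one state per letter, falling back to the generic `Name` state as soon as a character diverges. A hedged, self-contained sketch of the common alternative, which consumes the whole identifier first and classifies it once (`Tok`, `classify_word`, and `lex_word` are hypothetical names, not this file's API):

// Sketch: lex the maximal identifier, then one keyword lookup.
#[derive(Debug, PartialEq)]
enum Tok { Repeat, Until, Name(String) }

fn classify_word(word: &str) -> Tok {
    match word {
        "repeat" => Tok::Repeat,
        "until" => Tok::Until,
        _ => Tok::Name(word.to_string()),
    }
}

fn lex_word(chars: &[char], start: usize) -> (Tok, usize) {
    let mut end = start;
    while end < chars.len() && (chars[end].is_ascii_alphanumeric() || chars[end] == '_') {
        end += 1; // same identifier test the states above use
    }
    let word: String = chars[start..end].iter().collect();
    (classify_word(&word), end)
}

fn main() {
    // `repeated` must stay one name, not `repeat` plus `ed`.
    let src: Vec<char> = "repeated".chars().collect();
    assert_eq!(lex_word(&src, 0), (Tok::Name("repeated".into()), 8));
    let src2: Vec<char> = "until x".chars().collect();
    assert_eq!(lex_word(&src2, 0), (Tok::Until, 5));
}

Both approaches enforce the same rule via `is_ascii_alphanumeric() || ch == '_'`: a keyword followed by an identifier character is re-read as a plain name.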
@@ -3334,7 +3644,179 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
tokens.push(Token::Local);
}
}
_ => todo!("State {:?} | {:?}", state, tokens)
TokenizerState::U =>
{
match ch
{
'n' =>
{
last_index = index as i32;
token = Some(Token::Name("un".to_string()));
token_str.push(ch);
state = TokenizerState::Un;
}
_ =>
{
if ch.is_ascii_alphanumeric() || ch == '_'
{
last_index = index as i32;
token_str.push(ch);
token = Some(Token::Name(token_str.clone()));
state = TokenizerState::Name;
}
else
{
if last_index == -1 || token.is_none()
{
println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
return Err("Lexerr");
}
index = last_index as usize;
last_index = -1;
tokens.push(token.clone().unwrap());
token = None;
token_str.clear();
state = TokenizerState::Start;
}
}
}
}
TokenizerState::Un =>
{
match ch
{
't' =>
{
last_index = index as i32;
token = Some(Token::Name("unt".to_string()));
token_str.push(ch);
state = TokenizerState::Unt;
}
_ =>
{
if ch.is_ascii_alphanumeric() || ch == '_'
{
last_index = index as i32;
token_str.push(ch);
token = Some(Token::Name(token_str.clone()));
state = TokenizerState::Name;
}
else
{
if last_index == -1 || token.is_none()
{
println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
return Err("Lexerr");
}
index = last_index as usize;
last_index = -1;
tokens.push(token.unwrap().clone());
token = None;
token_str.clear();
state = TokenizerState::Start;
}
}
}
}
TokenizerState::Unt =>
{
match ch
{
'i' =>
{
last_index = index as i32;
token = Some(Token::Name("unti".to_string()));
token_str.push(ch);
state = TokenizerState::Unti;
}
_ =>
{
if ch.is_ascii_alphanumeric() || ch == '_'
{
last_index = index as i32;
token_str.push(ch);
token = Some(Token::Name(token_str.clone()));
state = TokenizerState::Name;
}
else
{
if last_index == -1 || token.is_none()
{
println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
return Err("Lexerr");
}
index = last_index as usize;
last_index = -1;
tokens.push(token.unwrap().clone());
token = None;
token_str.clear();
state = TokenizerState::Start;
}
}
}
}
TokenizerState::Unti =>
{
match ch
{
'l' =>
{
last_index = index as i32;
token = Some(Token::Name("until".to_string()));
token_str.push(ch);
state = TokenizerState::Until;
}
_ =>
{
if ch.is_ascii_alphanumeric() || ch == '_'
{
last_index = index as i32;
token_str.push(ch);
token = Some(Token::Name(token_str.clone()));
state = TokenizerState::Name;
}
else
{
if last_index == -1 || token.is_none()
{
println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
return Err("Lexerr");
}
index = last_index as usize;
last_index = -1;
tokens.push(token.unwrap().clone());
token = None;
token_str.clear();
state = TokenizerState::Start;
}
}
}
}
TokenizerState::Until =>
{
if ch.is_ascii_alphanumeric() || ch == '_'
{
last_index = index as i32;
token_str.push(ch);
token = Some(Token::Name(token_str.clone()));
state = TokenizerState::Name;
}
else
{
if last_index == -1 || token.is_none()
{
println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
return Err("Lexerr");
}
index = last_index as usize;
last_index = -1;
token = None;
token_str.clear();
state = TokenizerState::Start;
tokens.push(Token::Until);
}
}
_ => todo!("State {:?}", state)
}
index += 1;
}
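Finally, a hypothetical smoke test for the new comment states, assuming the `tokenize` signature shown above and that the lexer returns `Ok` when the input ends in the `Start` state; this is a sketch, not part of the commit:

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn long_comment_is_accepted() {
        // The closer repeats the opener's two `=` signs, so the machine
        // passes BigCommentLongBracketEnd and returns to Start.
        let src = String::from("--[==[ a big\ncomment ]==]");
        assert!(tokenize(&src).is_ok());
    }
}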