Implement more tokenizer states + some last_index fixes

0x4261756D 2023-06-06 19:18:27 +02:00
parent 294ecb7712
commit 21c5098cf7

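The tokenizer is a character-at-a-time state machine: each state inspects the next character, either pushing a finished token or rewinding to last_index when a longer match fails. This commit adds single-quoted strings (with the usual escapes plus Lua's whitespace-skipping \z escape), the [ and ] tokens, the .. and ... operators, the "for" keyword, and removes a few premature last_index assignments in the number states.

A minimal usage sketch (assuming the Token type and the tokenize function from this file are in scope, and that the states this input exercises are implemented):

    fn main()
    {
        // Exercises the new states: a single-quoted string with a \z escape,
        // the ".." operator, and a double-quoted string.
        let source = String::from("local s = 'a\\z\n    b' .. \"c\"");
        match tokenize(&source)
        {
            Ok(tokens) => println!("{:?}", tokens),
            Err(err) => println!("lex error: {}", err),
        }
    }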

@@ -43,7 +43,7 @@ enum Token
 enum TokenizerState
 {
     Start,
-    Quote, Name, Number, Zero,
+    Quote, SingleQuote, Name, Number, Zero,
     A, B, D, E, F, G, I, L, N, O, R, T, U, W,
     Plus, Minus, Star, Slash, Percent, Caret, Hash,
     Ampersand, Tilde, Pipe, Lt, Gt, Equals, RoundOpen, RoundClosed, CurlyOpen, CurlyClosed, SquareOpen, SquareClosed,
@@ -51,10 +51,10 @@ enum TokenizerState
     An, Br, Do, El, En, Fa, Fo, Fu, Go, If, In, Lo, Ni, No, Or, Re, Th, Tr, Un, Wh,
     LtLt, GtGt, SlashSlash, EqualsEquals, TildeEquals, LtEquals, GtEquals, ColonColon, DotDot,
-    SmallComment, QuoteBackslash, String, HexNumberX, ExpNumber,
+    SmallComment, QuoteBackslash, SingleQuoteBackslash, String, HexNumberX, ExpNumber,
     And, Bre, Els, End, Fal, For, Fun, Got, Loc, Nil, Not, Rep, Ret, The, Tru, Unt, Whi,
-    DotDotDot, HexNumber,
+    DotDotDot, HexNumber, QuoteBackslashZ, SingleQuoteBackslashZ,
     BigComment,
     Brea, Else, Fals, Func, Goto, Loca, Repe, Retu, Then, True, Unti, Whil, HexExpNumber,
@@ -240,6 +240,18 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
                         token = Some(Token::CurlyClosed);
                         state = TokenizerState::CurlyClosed;
                     }
+                    '[' =>
+                    {
+                        last_index = index as i32;
+                        token = Some(Token::SquareOpen);
+                        state = TokenizerState::SquareOpen;
+                    }
+                    ']' =>
+                    {
+                        last_index = index as i32;
+                        token = Some(Token::SquareClosed);
+                        state = TokenizerState::SquareClosed;
+                    }
                     '+' =>
                     {
                         last_index = index as i32;
@@ -277,10 +289,20 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
                         token_str.push(ch);
                         state = TokenizerState::Zero;
                     }
+                    '"' =>
+                    {
+                        token = None;
+                        state = TokenizerState::Quote;
+                    }
+                    '\'' =>
+                    {
+                        token = None;
+                        state = TokenizerState::SingleQuote;
+                    }
                     _ =>
                     {
                         if ch.is_whitespace() { }
-                        else if ch.is_ascii_alphabetic()
+                        else if ch.is_ascii_alphabetic() || ch == '_'
                         {
                             last_index = index as i32;
                             token_str.push(ch);
@@ -301,6 +323,230 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
                     }
                 }
             }
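+            // Double-quoted string literal: accumulate characters until the closing '"'; a backslash starts an escape.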
+            TokenizerState::Quote =>
+            {
+                match ch
+                {
+                    '\\' =>
+                    {
+                        state = TokenizerState::QuoteBackslash;
+                    }
+                    '"' =>
+                    {
+                        last_index = index as i32;
+                        token = Some(Token::StringLiteral(token_str.clone()));
+                        state = TokenizerState::String;
+                    }
+                    _ =>
+                    {
+                        token_str.push(ch);
+                    }
+                }
+            }
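+            // A backslash was seen inside a double-quoted string: translate the escape character.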
+            TokenizerState::QuoteBackslash =>
+            {
+                match ch
+                {
+                    'a' =>
+                    {
+                        token_str.push('\u{0007}');
+                        state = TokenizerState::Quote;
+                    }
+                    'b' =>
+                    {
+                        token_str.push('\u{0008}');
+                        state = TokenizerState::Quote;
+                    }
+                    't' =>
+                    {
+                        token_str.push('\t');
+                        state = TokenizerState::Quote;
+                    }
+                    'n' | '\n' =>
+                    {
+                        token_str.push('\n');
+                        state = TokenizerState::Quote;
+                    }
+                    'v' =>
+                    {
+                        token_str.push('\u{000b}');
+                        state = TokenizerState::Quote;
+                    }
+                    'f' =>
+                    {
+                        token_str.push('\u{000c}');
+                        state = TokenizerState::Quote;
+                    }
+                    'r' =>
+                    {
+                        token_str.push('\r');
+                        state = TokenizerState::Quote;
+                    }
+                    '\\' =>
+                    {
+                        token_str.push('\\');
+                        state = TokenizerState::Quote;
+                    }
+                    '"' =>
+                    {
+                        token_str.push('\"');
+                        state = TokenizerState::Quote;
+                    }
+                    '\'' =>
+                    {
+                        token_str.push('\'');
+                        state = TokenizerState::Quote;
+                    }
+                    'z' =>
+                    {
+                        state = TokenizerState::QuoteBackslashZ;
+                    }
+                    _ => return Err("Unknown escape sequence"),
+                }
+            }
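+            // Lua's \z escape: skip any run of whitespace (including newlines), then resume the string.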
+            TokenizerState::QuoteBackslashZ =>
+            {
+                match ch
+                {
+                    '\\' =>
+                    {
+                        state = TokenizerState::QuoteBackslash;
+                    }
+                    '"' =>
+                    {
+                        last_index = index as i32;
+                        token = Some(Token::StringLiteral(token_str.clone()));
+                        state = TokenizerState::String;
+                    }
+                    _ =>
+                    {
+                        if !ch.is_whitespace()
+                        {
+                            token_str.push(ch);
+                            state = TokenizerState::Quote;
+                        }
+                    }
+                }
+            }
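+            // The single-quote states below mirror the double-quote states above, with '\'' as the terminator.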
+            TokenizerState::SingleQuote =>
+            {
+                match ch
+                {
+                    '\\' =>
+                    {
+                        state = TokenizerState::SingleQuoteBackslash;
+                    }
+                    '\'' =>
+                    {
+                        last_index = index as i32;
+                        token = Some(Token::StringLiteral(token_str.clone()));
+                        state = TokenizerState::String;
+                    }
+                    _ =>
+                    {
+                        token_str.push(ch);
+                    }
+                }
+            }
+            TokenizerState::SingleQuoteBackslash =>
+            {
+                match ch
+                {
+                    'a' =>
+                    {
+                        token_str.push('\u{0007}');
+                        state = TokenizerState::SingleQuote;
+                    }
+                    'b' =>
+                    {
+                        token_str.push('\u{0008}');
+                        state = TokenizerState::SingleQuote;
+                    }
+                    't' =>
+                    {
+                        token_str.push('\t');
+                        state = TokenizerState::SingleQuote;
+                    }
+                    'n' | '\n' =>
+                    {
+                        token_str.push('\n');
+                        state = TokenizerState::SingleQuote;
+                    }
+                    'v' =>
+                    {
+                        token_str.push('\u{000b}');
+                        state = TokenizerState::SingleQuote;
+                    }
+                    'f' =>
+                    {
+                        token_str.push('\u{000c}');
+                        state = TokenizerState::SingleQuote;
+                    }
+                    'r' =>
+                    {
+                        token_str.push('\r');
+                        state = TokenizerState::SingleQuote;
+                    }
+                    '\\' =>
+                    {
+                        token_str.push('\\');
+                        state = TokenizerState::SingleQuote;
+                    }
+                    '"' =>
+                    {
+                        token_str.push('\"');
+                        state = TokenizerState::SingleQuote;
+                    }
+                    '\'' =>
+                    {
+                        token_str.push('\'');
+                        state = TokenizerState::SingleQuote;
+                    }
+                    'z' =>
+                    {
+                        state = TokenizerState::SingleQuoteBackslashZ;
+                    }
+                    _ => return Err("Unknown escape sequence"),
+                }
+            }
+            TokenizerState::SingleQuoteBackslashZ =>
+            {
+                match ch
+                {
+                    '\\' =>
+                    {
+                        state = TokenizerState::SingleQuoteBackslash;
+                    }
+                    '\'' =>
+                    {
+                        last_index = index as i32;
+                        token = Some(Token::StringLiteral(token_str.clone()));
+                        state = TokenizerState::String;
+                    }
+                    _ =>
+                    {
+                        if !ch.is_whitespace()
+                        {
+                            token_str.push(ch);
+                            state = TokenizerState::SingleQuote;
+                        }
+                    }
+                }
+            }
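+            // A complete string literal was recognized: emit the StringLiteral token and continue scanning after the closing quote.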
+            TokenizerState::String =>
+            {
+                if last_index == -1 || token.is_none()
+                {
+                    println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+                    return Err("Lexerr");
+                }
+                index = last_index as usize;
+                last_index = -1;
+                token = None;
+                tokens.push(Token::StringLiteral(token_str.clone()));
+                token_str.clear();
+                state = TokenizerState::Start;
+            }
             TokenizerState::Name =>
             {
                 if ch.is_ascii_alphanumeric() || ch == '_'
@@ -330,7 +576,6 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
                 {
                     'x' =>
                     {
-                        last_index = index as i32;
                         token_str.push(ch);
                         token = None;
                         state = TokenizerState::HexNumberX;
@@ -394,7 +639,6 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
                 {
                     'p' =>
                     {
-                        last_index = index as i32;
                         token_str.push(ch);
                         token = None;
                         state = TokenizerState::HexExpNumber;
@@ -434,7 +678,6 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
                 {
                     'e' =>
                     {
-                        last_index = index as i32;
                         token_str.push(ch);
                         token = None;
                         state = TokenizerState::ExpNumber;
@@ -467,7 +710,7 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
             TokenizerState::Comma | TokenizerState::RoundOpen | TokenizerState::RoundClosed |
             TokenizerState::CurlyOpen | TokenizerState::CurlyClosed | TokenizerState::Plus |
             TokenizerState::TildeEquals | TokenizerState::EqualsEquals | TokenizerState::Hash |
-            TokenizerState::GtEquals | TokenizerState::LtEquals =>
+            TokenizerState::GtEquals | TokenizerState::LtEquals | TokenizerState::SquareOpen | TokenizerState::SquareClosed =>
             {
                 if last_index == -1 || token.is_none()
                 {
@@ -597,6 +840,46 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
                     }
                 }
             }
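+            // ".." has been read: a third '.' forms the "..." (varargs) token; anything else emits ".." and rescans.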
+            TokenizerState::DotDot =>
+            {
+                match ch
+                {
+                    '.' =>
+                    {
+                        last_index = index as i32;
+                        token = Some(Token::DotDotDot);
+                        state = TokenizerState::DotDotDot;
+                    }
+                    _ =>
+                    {
+                        if last_index == -1 || token.is_none()
+                        {
+                            println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+                            return Err("Lexerr");
+                        }
+                        index = last_index as usize;
+                        last_index = -1;
+                        tokens.push(token.clone().unwrap());
+                        token = None;
+                        token_str.clear();
+                        state = TokenizerState::Start;
+                    }
+                }
+            }
+            TokenizerState::DotDotDot =>
+            {
+                if last_index == -1 || token.is_none()
+                {
+                    println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+                    return Err("Lexerr");
+                }
+                index = last_index as usize;
+                last_index = -1;
+                token = None;
+                token_str.clear();
+                state = TokenizerState::Start;
+                tokens.push(Token::DotDotDot);
+            }
             TokenizerState::Colon =>
             {
                 match ch
@@ -2818,6 +3101,67 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
                    tokens.push(Token::False);
                 }
             }
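+            // Keyword prefix "fo": 'r' extends it to "for"; any other name character demotes it to a plain Name.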
+            TokenizerState::Fo =>
+            {
+                match ch
+                {
+                    'r' =>
+                    {
+                        last_index = index as i32;
+                        token = Some(Token::Name("for".to_string()));
+                        token_str.push(ch);
+                        state = TokenizerState::For;
+                    }
+                    _ =>
+                    {
+                        if ch.is_ascii_alphanumeric() || ch == '_'
+                        {
+                            last_index = index as i32;
+                            token_str.push(ch);
+                            token = Some(Token::Name(token_str.clone()));
+                            state = TokenizerState::Name;
+                        }
+                        else
+                        {
+                            if last_index == -1 || token.is_none()
+                            {
+                                println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+                                return Err("Lexerr");
+                            }
+                            index = last_index as usize;
+                            last_index = -1;
+                            tokens.push(token.unwrap().clone());
+                            token = None;
+                            token_str.clear();
+                            state = TokenizerState::Start;
+                        }
+                    }
+                }
+            }
+            TokenizerState::For =>
+            {
+                if ch.is_ascii_alphanumeric() || ch == '_'
+                {
+                    last_index = index as i32;
+                    token_str.push(ch);
+                    token = Some(Token::Name(token_str.clone()));
+                    state = TokenizerState::Name;
+                }
+                else
+                {
+                    if last_index == -1 || token.is_none()
+                    {
+                        println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+                        return Err("Lexerr");
+                    }
+                    index = last_index as usize;
+                    last_index = -1;
+                    token = None;
+                    token_str.clear();
+                    state = TokenizerState::Start;
+                    tokens.push(Token::For);
+                }
+            }
             TokenizerState::L =>
             {
                 match ch
@@ -2866,7 +3210,30 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
                         token_str.push(ch);
                         state = TokenizerState::Loc;
                     }
-                    _ => todo!("State {:?}, Char {}", state, ch)
+                    _ =>
+                    {
+                        if ch.is_ascii_alphanumeric() || ch == '_'
+                        {
+                            last_index = index as i32;
+                            token_str.push(ch);
+                            token = Some(Token::Name(token_str.clone()));
+                            state = TokenizerState::Name;
+                        }
+                        else
+                        {
+                            if last_index == -1 || token.is_none()
+                            {
+                                println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+                                return Err("Lexerr");
+                            }
+                            index = last_index as usize;
+                            last_index = -1;
+                            tokens.push(token.unwrap().clone());
+                            token = None;
+                            token_str.clear();
+                            state = TokenizerState::Start;
+                        }
+                    }
                 }
             }
             TokenizerState::Loc =>
@@ -2917,7 +3284,30 @@ fn tokenize(file_content: &String) -> Result<Vec<Token>, &'static str>
                         token_str.push(ch);
                         state = TokenizerState::Local;
                     }
-                    _ => todo!("State {:?}, Char {}", state, ch)
+                    _ =>
+                    {
+                        if ch.is_ascii_alphanumeric() || ch == '_'
+                        {
+                            last_index = index as i32;
+                            token_str.push(ch);
+                            token = Some(Token::Name(token_str.clone()));
+                            state = TokenizerState::Name;
+                        }
+                        else
+                        {
+                            if last_index == -1 || token.is_none()
+                            {
+                                println!("{}|{}|{:?} | {:?}", last_index, index, token, tokens);
+                                return Err("Lexerr");
+                            }
+                            index = last_index as usize;
+                            last_index = -1;
+                            tokens.push(token.unwrap().clone());
+                            token = None;
+                            token_str.clear();
+                            state = TokenizerState::Start;
+                        }
+                    }
                 }
             }
             TokenizerState::Local =>