diff --git a/Tokenizer.cs b/Tokenizer.cs
index db26407..ecee68c 100644
--- a/Tokenizer.cs
+++ b/Tokenizer.cs
@@ -14,7 +14,7 @@ class Tokenizer
     int closingLongBracketLevel;
     Token? currentToken;
     CodeLocation currentLocation = new(line: 0, col: 0);
-    int escapeSequenceNumber;
+    long escapeSequenceNumber;

     public Token[] Tokenize(string content)
     {
@@ -495,10 +495,76 @@ class Tokenizer
                         state = State.QuoteBackslashX;
                     }
                     break;
+                    case 'u':
+                    {
+                        // \u{XXX}: Unicode code point escape, consumed by the QuoteBackslashU* states
+                        state = State.QuoteBackslashU;
+                    }
+                    break;
                     default: throw new Exception($"Unknown escape sequence: \\{ch} at {currentLocation}");
                 }
             }
             break;
+            case State.QuoteBackslashU:
+            {
+                // After `\u` the only valid continuation is the opening brace
+                if(ch == '{')
+                {
+                    state = State.QuoteBackslashUBracket;
+                }
+                else
+                {
+                    throw new Exception($"Expected `{{` to continue \\u escape sequence at {currentLocation}, got {ch}");
+                }
+            }
+            break;
+            case State.QuoteBackslashUBracket:
+            {
+                // At least one hex digit is required; it seeds the accumulated code point
+                if(char.IsAsciiHexDigit(ch))
+                {
+                    state = State.QuoteBackslashUBracketHex;
+                    escapeSequenceNumber = char.IsAsciiDigit(ch) ? ch - '0' : 10 + char.ToLower(ch) - 'a';
+                }
+                else
+                {
+                    throw new Exception($"Expected hex digit to continue \\u escape sequence at {currentLocation}, got {ch}");
+                }
+            }
+            break;
+            case State.QuoteBackslashUBracketHex:
+            {
+                if(char.IsAsciiHexDigit(ch))
+                {
+                    escapeSequenceNumber = (escapeSequenceNumber * 16) + (char.IsAsciiDigit(ch) ? ch - '0' : 10 + char.ToLower(ch) - 'a');
+                    // Code points must stay below 2^31; checking after every digit also keeps the long accumulator from overflowing
+                    if(escapeSequenceNumber > int.MaxValue)
+                    {
+                        throw new Exception($"{currentLocation}: \\u escape sequence has a value >= 2^31 which is not permitted");
+                    }
+                }
+                else if(ch == '}')
+                {
+                    // char.ConvertFromUtf32 only accepts Unicode scalar values, so reject surrogates and anything above U+10FFFF explicitly
+                    if(escapeSequenceNumber > 0x10FFFF || (escapeSequenceNumber >= 0xD800 && escapeSequenceNumber <= 0xDFFF))
+                    {
+                        throw new Exception($"{currentLocation}: \\u escape sequence value 0x{escapeSequenceNumber:X} is not a Unicode scalar value and cannot be represented");
+                    }
+                    state = State.Quote;
+                    // Yields one UTF-16 code unit for BMP characters, or a surrogate pair above U+FFFF
+                    foreach(char c in char.ConvertFromUtf32((int)escapeSequenceNumber))
+                    {
+                        AppendDataChar(c);
+                    }
+
+                    escapeSequenceNumber = 0;
+                }
+                else
+                {
+                    throw new Exception($"Expected hex digit or `}}` to continue \\u escape sequence at {currentLocation}, got {ch}");
+                }
+            }
+            break;
             case State.QuoteBackslashZ:
             {
                 if(ch == '\\')
@@ -634,10 +700,75 @@ class Tokenizer
                         state = State.SingleQuoteBackslashX;
                     }
                     break;
+                    case 'u':
+                    {
+                        // \u{XXX}: Unicode code point escape, consumed by the SingleQuoteBackslashU* states
+                        state = State.SingleQuoteBackslashU;
+                    }
+                    break;
                     default: throw new Exception($"Unknown escape sequence: \\{ch}");
                 }
             }
             break;
+            case State.SingleQuoteBackslashU:
+            {
+                // After `\u` the only valid continuation is the opening brace
+                if(ch == '{')
+                {
+                    state = State.SingleQuoteBackslashUBracket;
+                }
+                else
+                {
+                    throw new Exception($"Expected `{{` to continue \\u escape sequence at {currentLocation}, got {ch}");
+                }
+            }
+            break;
+            case State.SingleQuoteBackslashUBracket:
+            {
+                // At least one hex digit is required; it seeds the accumulated code point
+                if(char.IsAsciiHexDigit(ch))
+                {
+                    state = State.SingleQuoteBackslashUBracketHex;
+                    escapeSequenceNumber = char.IsAsciiDigit(ch) ? ch - '0' : 10 + char.ToLower(ch) - 'a';
+                }
+                else
+                {
+                    throw new Exception($"Expected hex digit to continue \\u escape sequence at {currentLocation}, got {ch}");
+                }
+            }
+            break;
+            case State.SingleQuoteBackslashUBracketHex:
+            {
+                if(char.IsAsciiHexDigit(ch))
+                {
+                    escapeSequenceNumber = (escapeSequenceNumber * 16) + (char.IsAsciiDigit(ch) ? ch - '0' : 10 + char.ToLower(ch) - 'a');
+                    // Code points must stay below 2^31; checking after every digit also keeps the long accumulator from overflowing
+                    if(escapeSequenceNumber > int.MaxValue)
+                    {
+                        throw new Exception($"{currentLocation}: \\u escape sequence has a value >= 2^31 which is not permitted");
+                    }
+                }
+                else if(ch == '}')
+                {
+                    // char.ConvertFromUtf32 only accepts Unicode scalar values, so reject surrogates and anything above U+10FFFF explicitly
+                    if(escapeSequenceNumber > 0x10FFFF || (escapeSequenceNumber >= 0xD800 && escapeSequenceNumber <= 0xDFFF))
+                    {
+                        throw new Exception($"{currentLocation}: \\u escape sequence value 0x{escapeSequenceNumber:X} is not a Unicode scalar value and cannot be represented");
+                    }
+                    state = State.SingleQuote;
+                    // Yields one UTF-16 code unit for BMP characters, or a surrogate pair above U+FFFF
+                    foreach(char c in char.ConvertFromUtf32((int)escapeSequenceNumber))
+                    {
+                        AppendDataChar(c);
+                    }
+                    escapeSequenceNumber = 0;
+                }
+                else
+                {
+                    throw new Exception($"Expected hex digit or `}}` to continue \\u escape sequence at {currentLocation}, got {ch}");
+                }
+            }
+            break;
             case State.SingleQuoteBackslashZ:
             {
                 if(ch == '\\')
@@ -689,6 +820,7 @@ class Tokenizer
                 {
                     state = State.SingleQuote;
                     escapeSequenceNumber = (escapeSequenceNumber * 16) + (char.IsAsciiDigit(ch) ? ch - '0' : 10 + char.ToLower(ch) - 'a');
-                    foreach(char c in Encoding.UTF8.GetChars(BitConverter.GetBytes(escapeSequenceNumber)))
+                    // Append exactly one char for the \x value; BitConverter.GetBytes on the (now long) field yields 8 bytes, i.e. 7 trailing NULs
+                    foreach(char c in ((char)escapeSequenceNumber).ToString())
                     {
                         AppendDataChar(c);
@@ -720,6 +852,7 @@ class Tokenizer
                 {
                     state = State.Quote;
                     escapeSequenceNumber = (escapeSequenceNumber * 16) + (char.IsAsciiDigit(ch) ? ch - '0' : 10 + char.ToLower(ch) - 'a');
-                    foreach(char c in Encoding.UTF8.GetChars(BitConverter.GetBytes(escapeSequenceNumber)))
+                    // Append exactly one char for the \x value; BitConverter.GetBytes on the (now long) field yields 8 bytes, i.e. 7 trailing NULs
+                    foreach(char c in ((char)escapeSequenceNumber).ToString())
                     {
                         AppendDataChar(c);
@@ -3735,6 +3868,8 @@ class Tokenizer
         And, Bre, Els, End, Fal, For, Fun, Got, Loc, Nil, Not, Rep, Ret, The, Tru, Unt, Whi,
         DotDotDot, HexNumber,
         QuoteBackslashZ, SingleQuoteBackslashZ, QuoteBackslashX, SingleQuoteBackslashX, QuoteBackslashXHex, SingleQuoteBackslashXHex,
+        SingleQuoteBackslashU, SingleQuoteBackslashUBracket, SingleQuoteBackslashUBracketHex,
+        QuoteBackslashU, QuoteBackslashUBracket, QuoteBackslashUBracketHex,
         SmallComment, BigComment, BigCommentStartLongBracket, BigCommentEndLongBracket,
         Brea, Else, Fals, Func, Goto, Loca, Repe, Retu, Then, True, Unti, Whil,
         HexExpNumber,