Add unfinished implementation for \u escape sequences

Also add a note for \x escape sequences since they are also broken
This commit is contained in:
0x4261756D 2024-02-21 17:38:18 +01:00
parent ab4be05bf4
commit 8cbfb8b941

View File

@ -14,7 +14,7 @@ class Tokenizer
int closingLongBracketLevel;
Token? currentToken;
CodeLocation currentLocation = new(line: 0, col: 0);
int escapeSequenceNumber;
long escapeSequenceNumber;
public Token[] Tokenize(string content)
{
@ -495,10 +495,68 @@ class Tokenizer
state = State.QuoteBackslashX;
}
break;
case 'u':
{
state = State.QuoteBackslashU;
}
break;
default: throw new Exception($"Unknown escape sequence: \\{ch} at {currentLocation}");
}
}
break;
case State.QuoteBackslashU:
    {
        // Reached from the `case 'u'` branch of the escape dispatch: the only
        // character accepted here is the opening brace of `\u{...}`.
        if(ch != '{')
        {
            throw new Exception($"Expected `{{` to continue \\u escape sequence at {currentLocation}, got {ch}");
        }
        state = State.QuoteBackslashUBracket;
    }
    break;
case State.QuoteBackslashUBracket:
    {
        // First character after `\u{`: it must be a hex digit, and its value
        // becomes the initial content of the escape-sequence accumulator.
        if(!char.IsAsciiHexDigit(ch))
        {
            throw new Exception($"Expected hex digit to continue \\u escape sequence at {currentLocation}, got {ch}");
        }
        if(char.IsAsciiDigit(ch))
        {
            // '0'..'9' map to 0..9.
            escapeSequenceNumber = ch - '0';
        }
        else
        {
            // 'a'..'f' / 'A'..'F' map to 10..15.
            escapeSequenceNumber = 10 + char.ToLower(ch) - 'a';
        }
        state = State.QuoteBackslashUBracketHex;
    }
    break;
case State.QuoteBackslashUBracketHex:
    {
        // Inside `\u{XXX}` after at least one hex digit: keep accumulating digits
        // into escapeSequenceNumber, or emit the code point once `}` closes the escape.
        if(char.IsAsciiHexDigit(ch))
        {
            escapeSequenceNumber = (escapeSequenceNumber * 16) + (char.IsAsciiDigit(ch) ? ch - '0' : 10 + char.ToLowerInvariant(ch) - 'a');
            // Lua only permits code points below 2^31. Checking after every digit also
            // keeps the accumulator bounded regardless of how many digits follow.
            if(escapeSequenceNumber > int.MaxValue)
            {
                throw new Exception($"{currentLocation}: \\u escape sequence has a value >= 2^31 which is not permitted");
            }
        }
        else if(ch == '}')
        {
            state = State.Quote;
            // The accumulated number is a code point, not a byte sequence: convert it
            // to UTF-16 code units instead of decoding its raw integer bytes as UTF-8
            // (which produced padding NULs and replacement characters).
            if(escapeSequenceNumber <= char.MaxValue)
            {
                // Fits in one UTF-16 code unit; a lone surrogate value is passed through as-is.
                AppendDataChar((char)escapeSequenceNumber);
            }
            else if(escapeSequenceNumber <= 0x10FFFF)
            {
                // Supplementary-plane code point: emit its surrogate pair.
                foreach(char c in char.ConvertFromUtf32((int)escapeSequenceNumber))
                {
                    AppendDataChar(c);
                }
            }
            else
            {
                // Lua accepts values up to 2^31 - 1, but nothing above U+10FFFF can be
                // expressed in UTF-16 chars, so fail loudly rather than emit garbage.
                throw new Exception($"{currentLocation}: \\u escape sequence value 0x{escapeSequenceNumber:X} is above U+10FFFF and cannot be represented as UTF-16");
            }
            escapeSequenceNumber = 0;
        }
        else
        {
            throw new Exception($"Expected hex digit or `}}` to continue \\u escape sequence at {currentLocation}, got {ch}");
        }
    }
    break;
case State.QuoteBackslashZ:
{
if(ch == '\\')
@ -634,10 +692,67 @@ class Tokenizer
state = State.SingleQuoteBackslashX;
}
break;
case 'u':
{
state = State.SingleQuoteBackslashU;
}
break;
default: throw new Exception($"Unknown escape sequence: \\{ch}");
}
}
break;
case State.SingleQuoteBackslashU:
    {
        // Reached from the `case 'u'` branch of the escape dispatch: the only
        // character accepted here is the opening brace of `\u{...}`.
        if(ch != '{')
        {
            throw new Exception($"Expected `{{` to continue \\u escape sequence at {currentLocation}, got {ch}");
        }
        state = State.SingleQuoteBackslashUBracket;
    }
    break;
case State.SingleQuoteBackslashUBracket:
    {
        // First character after `\u{`: it must be a hex digit, and its value
        // becomes the initial content of the escape-sequence accumulator.
        if(!char.IsAsciiHexDigit(ch))
        {
            throw new Exception($"Expected hex digit to continue \\u escape sequence at {currentLocation}, got {ch}");
        }
        if(char.IsAsciiDigit(ch))
        {
            // '0'..'9' map to 0..9.
            escapeSequenceNumber = ch - '0';
        }
        else
        {
            // 'a'..'f' / 'A'..'F' map to 10..15.
            escapeSequenceNumber = 10 + char.ToLower(ch) - 'a';
        }
        state = State.SingleQuoteBackslashUBracketHex;
    }
    break;
case State.SingleQuoteBackslashUBracketHex:
    {
        // Inside `\u{XXX}` after at least one hex digit: keep accumulating digits
        // into escapeSequenceNumber, or emit the code point once `}` closes the escape.
        if(char.IsAsciiHexDigit(ch))
        {
            escapeSequenceNumber = (escapeSequenceNumber * 16) + (char.IsAsciiDigit(ch) ? ch - '0' : 10 + char.ToLowerInvariant(ch) - 'a');
            // Lua only permits code points below 2^31. Checking after every digit also
            // keeps the accumulator bounded regardless of how many digits follow.
            if(escapeSequenceNumber > int.MaxValue)
            {
                throw new Exception($"{currentLocation}: \\u escape sequence has a value >= 2^31 which is not permitted");
            }
        }
        else if(ch == '}')
        {
            state = State.SingleQuote;
            // The accumulated number is a code point, not a byte sequence: convert it
            // to UTF-16 code units instead of decoding its raw integer bytes as UTF-8
            // (which produced padding NULs and replacement characters).
            if(escapeSequenceNumber <= char.MaxValue)
            {
                // Fits in one UTF-16 code unit; a lone surrogate value is passed through as-is.
                AppendDataChar((char)escapeSequenceNumber);
            }
            else if(escapeSequenceNumber <= 0x10FFFF)
            {
                // Supplementary-plane code point: emit its surrogate pair.
                foreach(char c in char.ConvertFromUtf32((int)escapeSequenceNumber))
                {
                    AppendDataChar(c);
                }
            }
            else
            {
                // Lua accepts values up to 2^31 - 1, but nothing above U+10FFFF can be
                // expressed in UTF-16 chars, so fail loudly rather than emit garbage.
                throw new Exception($"{currentLocation}: \\u escape sequence value 0x{escapeSequenceNumber:X} is above U+10FFFF and cannot be represented as UTF-16");
            }
            escapeSequenceNumber = 0;
        }
        else
        {
            throw new Exception($"Expected hex digit or `}}` to continue \\u escape sequence at {currentLocation}, got {ch}");
        }
    }
    break;
case State.SingleQuoteBackslashZ:
{
if(ch == '\\')
@ -689,6 +804,7 @@ class Tokenizer
{
state = State.SingleQuote;
escapeSequenceNumber = (escapeSequenceNumber * 16) + (char.IsAsciiDigit(ch) ? ch - '0' : 10 + char.ToLower(ch) - 'a');
// TODO: THIS IS WRONG, there is zero padding due to the fixed size array
foreach(char c in Encoding.UTF8.GetChars(BitConverter.GetBytes(escapeSequenceNumber)))
{
AppendDataChar(c);
@ -720,6 +836,7 @@ class Tokenizer
{
state = State.Quote;
escapeSequenceNumber = (escapeSequenceNumber * 16) + (char.IsAsciiDigit(ch) ? ch - '0' : 10 + char.ToLower(ch) - 'a');
// TODO: THIS IS WRONG, there is zero padding due to the fixed size array
foreach(char c in Encoding.UTF8.GetChars(BitConverter.GetBytes(escapeSequenceNumber)))
{
AppendDataChar(c);
@ -3735,6 +3852,8 @@ class Tokenizer
And, Bre, Els, End, Fal, For, Fun, Got, Loc, Nil, Not, Rep, Ret, The, Tru, Unt, Whi,
DotDotDot, HexNumber, QuoteBackslashZ, SingleQuoteBackslashZ, QuoteBackslashX, SingleQuoteBackslashX, QuoteBackslashXHex, SingleQuoteBackslashXHex,
SingleQuoteBackslashU, SingleQuoteBackslashUBracket, SingleQuoteBackslashUBracketHex,
QuoteBackslashU, QuoteBackslashUBracket, QuoteBackslashUBracketHex,
SmallComment, BigComment, BigCommentStartLongBracket, BigCommentEndLongBracket,
Brea, Else, Fals, Func, Goto, Loca, Repe, Retu, Then, True, Unti, Whil, HexExpNumber,