Add unfinished implementation for \u escape sequences
Also add a note for \x escape sequences since they are also broken
This commit is contained in:
parent
ab4be05bf4
commit
8cbfb8b941
121
Tokenizer.cs
121
Tokenizer.cs
@ -14,7 +14,7 @@ class Tokenizer
|
|||||||
int closingLongBracketLevel;
|
int closingLongBracketLevel;
|
||||||
Token? currentToken;
|
Token? currentToken;
|
||||||
CodeLocation currentLocation = new(line: 0, col: 0);
|
CodeLocation currentLocation = new(line: 0, col: 0);
|
||||||
int escapeSequenceNumber;
|
long escapeSequenceNumber;
|
||||||
|
|
||||||
public Token[] Tokenize(string content)
|
public Token[] Tokenize(string content)
|
||||||
{
|
{
|
||||||
@ -495,10 +495,68 @@ class Tokenizer
|
|||||||
state = State.QuoteBackslashX;
|
state = State.QuoteBackslashX;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case 'u':
|
||||||
|
{
|
||||||
|
state = State.QuoteBackslashU;
|
||||||
|
}
|
||||||
|
break;
|
||||||
default: throw new Exception($"Unknown escape sequence: \\{ch} at {currentLocation}");
|
default: throw new Exception($"Unknown escape sequence: \\{ch} at {currentLocation}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case State.QuoteBackslashU:
|
||||||
|
{
|
||||||
|
if(ch == '{')
|
||||||
|
{
|
||||||
|
state = State.QuoteBackslashUBracket;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
throw new Exception($"Expected `{{` to continue \\u escape sequence at {currentLocation}, got {ch}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case State.QuoteBackslashUBracket:
|
||||||
|
{
|
||||||
|
if(char.IsAsciiHexDigit(ch))
|
||||||
|
{
|
||||||
|
state = State.QuoteBackslashUBracketHex;
|
||||||
|
escapeSequenceNumber = char.IsAsciiDigit(ch) ? ch - '0' : 10 + char.ToLower(ch) - 'a';
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
throw new Exception($"Expected hex digit to continue \\u escape sequence at {currentLocation}, got {ch}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case State.QuoteBackslashUBracketHex:
|
||||||
|
{
|
||||||
|
if(char.IsAsciiHexDigit(ch))
|
||||||
|
{
|
||||||
|
escapeSequenceNumber = (escapeSequenceNumber * 16) + (char.IsAsciiDigit(ch) ? ch - '0' : 10 + char.ToLower(ch) - 'a');
|
||||||
|
if(escapeSequenceNumber > uint.MaxValue)
|
||||||
|
{
|
||||||
|
throw new Exception($"{currentLocation}: \\u escape sequence has a value > 2^31 which is not permitted");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if(ch == '}')
|
||||||
|
{
|
||||||
|
state = State.Quote;
|
||||||
|
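// TODO: THIS IS WRONG: BitConverter.GetBytes((uint)...) always returns 4 bytes holding the raw integer value (not the UTF-8 encoding of the code point), so the unused zero bytes are decoded as extra '\0' chars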
|
||||||
|
char[] chars = Encoding.UTF8.GetChars(BitConverter.GetBytes((uint)escapeSequenceNumber));
|
||||||
|
for(int i = 0; i < chars.Length; i++)
|
||||||
|
{
|
||||||
|
AppendDataChar(chars[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
escapeSequenceNumber = 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
throw new Exception($"Expected second hex digit to continue \\u escape sequence at {currentLocation}, got {ch}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
case State.QuoteBackslashZ:
|
case State.QuoteBackslashZ:
|
||||||
{
|
{
|
||||||
if(ch == '\\')
|
if(ch == '\\')
|
||||||
@ -634,10 +692,67 @@ class Tokenizer
|
|||||||
state = State.SingleQuoteBackslashX;
|
state = State.SingleQuoteBackslashX;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case 'u':
|
||||||
|
{
|
||||||
|
state = State.SingleQuoteBackslashU;
|
||||||
|
}
|
||||||
|
break;
|
||||||
default: throw new Exception($"Unknown escape sequence: \\{ch}");
|
default: throw new Exception($"Unknown escape sequence: \\{ch}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case State.SingleQuoteBackslashU:
|
||||||
|
{
|
||||||
|
if(ch == '{')
|
||||||
|
{
|
||||||
|
state = State.SingleQuoteBackslashUBracket;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
throw new Exception($"Expected `{{` to continue \\u escape sequence at {currentLocation}, got {ch}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case State.SingleQuoteBackslashUBracket:
|
||||||
|
{
|
||||||
|
if(char.IsAsciiHexDigit(ch))
|
||||||
|
{
|
||||||
|
state = State.SingleQuoteBackslashUBracketHex;
|
||||||
|
escapeSequenceNumber = char.IsAsciiDigit(ch) ? ch - '0' : 10 + char.ToLower(ch) - 'a';
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
throw new Exception($"Expected hex digit to continue \\u escape sequence at {currentLocation}, got {ch}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case State.SingleQuoteBackslashUBracketHex:
|
||||||
|
{
|
||||||
|
if(char.IsAsciiHexDigit(ch))
|
||||||
|
{
|
||||||
|
escapeSequenceNumber = (escapeSequenceNumber * 16) + (char.IsAsciiDigit(ch) ? ch - '0' : 10 + char.ToLower(ch) - 'a');
|
||||||
|
if(escapeSequenceNumber > uint.MaxValue)
|
||||||
|
{
|
||||||
|
throw new Exception($"{currentLocation}: \\u escape sequence has a value > 2^31 which is not permitted");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if(ch == '}')
|
||||||
|
{
|
||||||
|
state = State.SingleQuote;
|
||||||
|
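// TODO: THIS IS WRONG: BitConverter.GetBytes((uint)...) always returns 4 bytes holding the raw integer value (not the UTF-8 encoding of the code point), so the unused zero bytes are decoded as extra '\0' chars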
|
||||||
|
char[] chars = Encoding.UTF8.GetChars(BitConverter.GetBytes((uint)escapeSequenceNumber));
|
||||||
|
for(int i = 0; i < chars.Length; i++)
|
||||||
|
{
|
||||||
|
AppendDataChar(chars[i]);
|
||||||
|
}
|
||||||
|
escapeSequenceNumber = 0;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
throw new Exception($"Expected second hex digit to continue \\u escape sequence at {currentLocation}, got {ch}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
case State.SingleQuoteBackslashZ:
|
case State.SingleQuoteBackslashZ:
|
||||||
{
|
{
|
||||||
if(ch == '\\')
|
if(ch == '\\')
|
||||||
@ -689,6 +804,7 @@ class Tokenizer
|
|||||||
{
|
{
|
||||||
state = State.SingleQuote;
|
state = State.SingleQuote;
|
||||||
escapeSequenceNumber = (escapeSequenceNumber * 16) + (char.IsAsciiDigit(ch) ? ch - '0' : 10 + char.ToLower(ch) - 'a');
|
escapeSequenceNumber = (escapeSequenceNumber * 16) + (char.IsAsciiDigit(ch) ? ch - '0' : 10 + char.ToLower(ch) - 'a');
|
||||||
|
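// TODO: THIS IS WRONG: escapeSequenceNumber is a long, so BitConverter.GetBytes returns a fixed 8-byte array; the zero padding bytes are decoded as extra '\0' chars after the escaped byte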
|
||||||
foreach(char c in Encoding.UTF8.GetChars(BitConverter.GetBytes(escapeSequenceNumber)))
|
foreach(char c in Encoding.UTF8.GetChars(BitConverter.GetBytes(escapeSequenceNumber)))
|
||||||
{
|
{
|
||||||
AppendDataChar(c);
|
AppendDataChar(c);
|
||||||
@ -720,6 +836,7 @@ class Tokenizer
|
|||||||
{
|
{
|
||||||
state = State.Quote;
|
state = State.Quote;
|
||||||
escapeSequenceNumber = (escapeSequenceNumber * 16) + (char.IsAsciiDigit(ch) ? ch - '0' : 10 + char.ToLower(ch) - 'a');
|
escapeSequenceNumber = (escapeSequenceNumber * 16) + (char.IsAsciiDigit(ch) ? ch - '0' : 10 + char.ToLower(ch) - 'a');
|
||||||
|
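// TODO: THIS IS WRONG: escapeSequenceNumber is a long, so BitConverter.GetBytes returns a fixed 8-byte array; the zero padding bytes are decoded as extra '\0' chars after the escaped byte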
|
||||||
foreach(char c in Encoding.UTF8.GetChars(BitConverter.GetBytes(escapeSequenceNumber)))
|
foreach(char c in Encoding.UTF8.GetChars(BitConverter.GetBytes(escapeSequenceNumber)))
|
||||||
{
|
{
|
||||||
AppendDataChar(c);
|
AppendDataChar(c);
|
||||||
@ -3735,6 +3852,8 @@ class Tokenizer
|
|||||||
|
|
||||||
And, Bre, Els, End, Fal, For, Fun, Got, Loc, Nil, Not, Rep, Ret, The, Tru, Unt, Whi,
|
And, Bre, Els, End, Fal, For, Fun, Got, Loc, Nil, Not, Rep, Ret, The, Tru, Unt, Whi,
|
||||||
DotDotDot, HexNumber, QuoteBackslashZ, SingleQuoteBackslashZ, QuoteBackslashX, SingleQuoteBackslashX, QuoteBackslashXHex, SingleQuoteBackslashXHex,
|
DotDotDot, HexNumber, QuoteBackslashZ, SingleQuoteBackslashZ, QuoteBackslashX, SingleQuoteBackslashX, QuoteBackslashXHex, SingleQuoteBackslashXHex,
|
||||||
|
SingleQuoteBackslashU, SingleQuoteBackslashUBracket, SingleQuoteBackslashUBracketHex,
|
||||||
|
QuoteBackslashU, QuoteBackslashUBracket, QuoteBackslashUBracketHex,
|
||||||
SmallComment, BigComment, BigCommentStartLongBracket, BigCommentEndLongBracket,
|
SmallComment, BigComment, BigCommentStartLongBracket, BigCommentEndLongBracket,
|
||||||
|
|
||||||
Brea, Else, Fals, Func, Goto, Loca, Repe, Retu, Then, True, Unti, Whil, HexExpNumber,
|
Brea, Else, Fals, Func, Goto, Loca, Repe, Retu, Then, True, Unti, Whil, HexExpNumber,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user