314 lines
9.4 KiB
C++
314 lines
9.4 KiB
C++
////////////////////////////////
|
|
//~ sixten: Token Type Functions
|
|
|
|
// Returns the slice of Text covered by Token.Range.
// The result is whatever Substring() produces for that range; no copy is made here.
static string T_StringFromToken(string Text, token Token)
{
    return(Substring(Text, Token.Range));
}
|
|
|
|
// Appends Token to the chunked token list.
//
// Arena:                allocator used when a new chunk node has to be created.
// List:                 list to append to; its Count is bumped by one.
// Token:                token value copied into the list.
// MaxTokenCountPerNode: capacity given to a newly created chunk node.
//
// A new chunk is allocated when the list is empty or its last chunk is full.
static void T_TokenChunkListPush(arena *Arena, token_chunk_list *List, token Token, s64 MaxTokenCountPerNode)
{
    token_chunk_node *Tail = List->Last;

    // No chunk yet, or the current tail has no free slot left: start a fresh chunk.
    if(Tail == 0 || Tail->MaxCount <= Tail->Count)
    {
        Tail = PushStruct(Arena, token_chunk_node);
        Tail->Count = 0;
        Tail->MaxCount = MaxTokenCountPerNode;
        // The token storage is not cleared; slots are written before Count covers them.
        Tail->Tokens = PushArrayNoClear(Arena, token, Tail->MaxCount);
        QueuePush(List->First, List->Last, Tail);
    }

    // Store into the next free slot of the tail chunk and update both counters.
    Tail->Tokens[Tail->Count] = Token;
    Tail->Count += 1;
    List->Count += 1;
}
|
|
|
|
// Flattens a chunked token list into one contiguous token array allocated on Arena.
//
// Arena: allocator for the resulting array (List->Count tokens, not cleared).
// List:  chunk list to read; it is not modified.
//
// Returns a token_array whose Count equals List->Count, with the tokens in list order.
static token_array T_TokenArrayFromList(arena *Arena, token_chunk_list *List)
{
    token_array Result = {};
    Result.Count = List->Count;
    Result.Tokens = PushArrayNoClear(Arena, token, List->Count);

    // Walk the chunks front to back, copying each chunk's used portion
    // behind everything written so far.
    s64 WriteOffset = 0;
    token_chunk_node *Chunk = List->First;
    while(Chunk != 0)
    {
        Copy(Result.Tokens + WriteOffset, Chunk->Tokens, sizeof(token)*Chunk->Count);
        WriteOffset += Chunk->Count;
        Chunk = Chunk->Next;
    }

    return(Result);
}
|
|
|
|
////////////////////////////////
|
|
//~ sixten: Tokenizer Message Functions
|
|
|
|
// Allocates a tokenizer message on Arena, fills it in and appends it to List.
//
// Arena:  allocator for the message node (allocated without clearing).
// List:   message list to append to.
// Kind:   message kind stored on the node.
// Offset: offset stored on the node (the tokenizer passes the token's byte offset into the text).
// String: message text stored on the node.
static void T_MessageListPush(arena *Arena, tokenizer_message_list *List, tokenizer_message_kind Kind, s64 Offset, string String)
{
    tokenizer_message *Entry = PushStructNoClear(Arena, tokenizer_message);

    // The node memory is uncleared, so only the three fields below hold defined values here.
    // NOTE(review): any link field is presumably set up by QueuePush - confirm.
    Entry->String = String;
    Entry->Offset = Offset;
    Entry->Kind = Kind;

    QueuePush(List->First, List->Last, Entry);
}
|
|
|
|
////////////////////////////////
|
|
//~ sixten: Text -> Token Functions
|
|
|
|
// Scans Text from start to end and splits it into tokens.
//
// Arena:         receives the final token array and every tokenizer message.
// Text:          bytes to tokenize; token ranges are byte offsets relative to Text.Data.
// ExcludeFilter: optional. When non-null, a token is dropped if ExcludeFilter(kind) is non-zero.
//
// Returns the flattened token array plus the list of error messages. A message is
// produced for every unterminated multi-line comment and every unterminated string
// literal, even when the corresponding token itself is filtered out.
//
// Each loop iteration classifies exactly one token: the recognisers below are tried
// in order and each one only runs while TokenKind is still TokenKind_None.
static tokenize_result T_TokenizeFromText(arena *Arena, string Text, tokenizer_filter_function *ExcludeFilter)
{
    // Tokens are collected in chunks on the scratch arena and copied to Arena at the end.
    temp Scratch = GetScratch(&Arena, 1);
    token_chunk_list Tokens = {};
    tokenizer_message_list Messages = {};
    u8 *TextStart = Text.Data;
    u8 *TextEnd = TextStart + Text.Count;
    u8 *Byte = TextStart;

    // Line is advanced only by newline tokens.
    // NOTE(review): newlines inside a multi-line comment are not counted - confirm this is intended.
    s64 Line = 0;

    //- sixten: scan string & produce tokens
    for(;Byte < TextEnd;)
    {
        token_kind TokenKind = TokenKind_None;
        u8 *TokenStart = 0;
        u8 *TokenEnd = 0;

        //- sixten: whitespace
        if(TokenKind == TokenKind_None && (*Byte == ' ' || *Byte == '\t' || *Byte == '\v' || *Byte == '\r'))
        {
            TokenKind = TokenKind_Whitespace;
            TokenStart = Byte;
            TokenEnd = Byte;
            Byte += 1;
            for(;Byte <= TextEnd; Byte += 1)
            {
                TokenEnd += 1;
                if(Byte == TextEnd || !(*Byte == ' ' || *Byte == '\t' || *Byte == '\v' || *Byte == '\r'))
                {
                    break;
                }
            }
        }

        //- sixten: newlines
        if(TokenKind == TokenKind_None && *Byte == '\n')
        {
            TokenKind = TokenKind_Newline;
            TokenStart = Byte;
            TokenEnd = Byte + 1;
            Line += 1;
            Byte += 1;
        }

        //- sixten: single-line comments
        // The Byte+1 bound keeps the second-character test inside the text when the
        // last byte of the input is a lone '/'.
        if(TokenKind == TokenKind_None && (Byte+1 < TextEnd && Byte[0] == '/' && Byte[1] == '/'))
        {
            TokenKind = TokenKind_Comment;
            TokenStart = Byte;
            TokenEnd = Byte + 2;
            Byte += 2;
            // The comment runs up to, but does not include, the line terminator.
            for(;Byte <= TextEnd; Byte += 1)
            {
                if(Byte == TextEnd || *Byte == '\n' || *Byte == '\r')
                {
                    break;
                }
                TokenEnd += 1;
            }
        }

        //- sixten: multi-line comments
        if(TokenKind == TokenKind_None && (Byte+1 < TextEnd && Byte[0] == '/' && Byte[1] == '*'))
        {
            TokenKind = TokenKind_Comment;
            TokenStart = Byte;
            TokenEnd = Byte + 2;
            Byte += 2;
            for(;Byte <= TextEnd; Byte += 1)
            {
                // Reaching the end of the text before "*/" makes this a broken comment.
                // TokenEnd is only advanced for bytes that exist, so the token ends at TextEnd.
                if(Byte == TextEnd)
                {
                    TokenKind = TokenKind_BrokenComment;
                    break;
                }
                TokenEnd += 1;
                if(Byte+1 < TextEnd && Byte[0] == '*' && Byte[1] == '/')
                {
                    // Include the closing '/' and step past the terminator.
                    TokenEnd += 1;
                    Byte += 2;
                    break;
                }
            }
        }

        //- sixten: identifiers
        if(TokenKind == TokenKind_None && (('A' <= *Byte && *Byte <= 'Z') ||
                                           ('a' <= *Byte && *Byte <= 'z') ||
                                           (UTF8Lengths[*Byte>>3] > 1) ||
                                           *Byte == '_'))
        {
            TokenKind = TokenKind_Identifier;
            TokenStart = Byte;
            TokenEnd = Byte;
            for(;;)
            {
                // Step over the current character. The step is clamped to the bytes that
                // remain so a multi-byte sequence cut off by the end of the text cannot
                // move Byte past TextEnd (which previously dropped the whole identifier).
                s64 Advance = UTF8Lengths[*Byte>>3];
                s64 Remaining = TextEnd - Byte;
                if(Advance > Remaining)
                {
                    Advance = Remaining;
                }
                Byte += Advance;

                if(Byte == TextEnd || !(('A' <= *Byte && *Byte <= 'Z') ||
                                        ('a' <= *Byte && *Byte <= 'z') ||
                                        ('0' <= *Byte && *Byte <= '9') ||
                                        (UTF8Lengths[*Byte>>3] > 1) ||
                                        *Byte == '_'))
                {
                    TokenEnd = Byte;
                    break;
                }
            }

            // Promote the identifier to a keyword kind when its text matches one exactly.
            string String = MakeString(TokenStart, TokenEnd-TokenStart);
            if(0) {}
            else if(AreEqual(String, StrLit("branch"))) { TokenKind = TokenKind_Branch; }
            else if(AreEqual(String, StrLit("else")))   { TokenKind = TokenKind_Else; }
            else if(AreEqual(String, StrLit("false")))  { TokenKind = TokenKind_False; }
            else if(AreEqual(String, StrLit("if")))     { TokenKind = TokenKind_If; }
            else if(AreEqual(String, StrLit("then")))   { TokenKind = TokenKind_Then; }
            else if(AreEqual(String, StrLit("jump")))   { TokenKind = TokenKind_Jump; }
            else if(AreEqual(String, StrLit("proc")))   { TokenKind = TokenKind_Proc; }
            else if(AreEqual(String, StrLit("true")))   { TokenKind = TokenKind_True; }
            else if(AreEqual(String, StrLit("let")))    { TokenKind = TokenKind_Let; }
            else if(AreEqual(String, StrLit("while")))  { TokenKind = TokenKind_While; }
            else if(AreEqual(String, StrLit("call")))   { TokenKind = TokenKind_Call; }
            else if(AreEqual(String, StrLit("wait")))   { TokenKind = TokenKind_Wait; }
            else if(AreEqual(String, StrLit("end")))    { TokenKind = TokenKind_End; }
        }

        //- sixten: numerics
        // A leading '-' is part of the number only when a digit follows it directly.
        if(TokenKind == TokenKind_None && (('0' <= *Byte && *Byte <= '9') ||
                                           (*Byte == '-' && Byte + 1 < TextEnd && '0' <= Byte[1] && Byte[1] <= '9')))
        {
            TokenKind = TokenKind_Numeric;
            TokenStart = Byte;
            TokenEnd = Byte;
            Byte += 1;
            for(;Byte <= TextEnd; Byte += 1)
            {
                TokenEnd += 1;
                if(Byte == TextEnd ||
                   !(('0' <= *Byte && *Byte <= '9') ||
                     *Byte == '_' || *Byte == '.'))
                {
                    break;
                }
            }
        }

        //- sixten: string literals
        if(TokenKind == TokenKind_None && *Byte == '"')
        {
            TokenKind = TokenKind_StringLiteral;
            TokenStart = Byte;
            TokenEnd = Byte;
            Byte += 1;
            for(;Byte <= TextEnd; Byte += 1)
            {
                TokenEnd += 1;
                // A literal must close on the same line; otherwise it is reported as broken
                // and the token stops before the newline / end of text.
                if(Byte == TextEnd || *Byte == '\n')
                {
                    TokenKind = TokenKind_BrokenStringLiteral;
                    break;
                }
                if(*Byte == '"')
                {
                    // Include the closing quote in the token.
                    Byte += 1;
                    TokenEnd += 1;
                    break;
                }
            }
        }

        //- sixten: symbols
        if(TokenKind == TokenKind_None && (*Byte == '{' || *Byte == '}' || *Byte == '(' || *Byte == ')' ||
                                           *Byte == ',' || *Byte == '.' || *Byte == '$' || *Byte == ';' ||
                                           *Byte == '+' || *Byte == '-' || *Byte == '*' || *Byte == '/'))
        {
            TokenStart = Byte;
            TokenEnd = Byte+1;

            switch(*Byte)
            {
                case '{': { TokenKind = TokenKind_CurlyOpen; } break;
                case '}': { TokenKind = TokenKind_CurlyClose; } break;
                case '(': { TokenKind = TokenKind_ParenthesisOpen; } break;
                case ')': { TokenKind = TokenKind_ParenthesisClose; } break;
                case ',': { TokenKind = TokenKind_Comma; } break;
                case '.': { TokenKind = TokenKind_Dot; } break;
                case '$': { TokenKind = TokenKind_DollarSign; } break;
                case ';': { TokenKind = TokenKind_Semicolon; } break;
                case '+': { TokenKind = TokenKind_Plus; } break;
                case '-': { TokenKind = TokenKind_Minus; } break;
                case '*': { TokenKind = TokenKind_Star; } break;
                case '/': { TokenKind = TokenKind_Slash; } break;
                InvalidDefaultCase;
            }

            Byte += 1;
        }
        if(TokenKind == TokenKind_None && (*Byte == '!' || *Byte == '=' || *Byte == '>' || *Byte == '<'))
        {
            TokenStart = Byte;
            TokenEnd = Byte+1;

            switch(*Byte)
            {
                case '!': { TokenKind = TokenKind_Bang; } break;
                case '=': { TokenKind = TokenKind_Equal; } break;
                case '>': { TokenKind = TokenKind_Greater; } break;
                case '<': { TokenKind = TokenKind_Less; } break;
                InvalidDefaultCase;
            }

            Byte += 1;

            // A following '=' turns the operator into its two-character form, selected as
            // the next token_kind value.
            // NOTE(review): relies on each "...Equal" kind directly following its base kind
            // in the token_kind enum - confirm against the enum declaration.
            if(Byte < TextEnd && (*Byte == '='))
            {
                TokenKind = (token_kind)(TokenKind + 1);
                TokenEnd += 1;
                Byte += 1;
            }
        }

        //- sixten: bad character
        // Nothing matched: consume one byte so the scan always makes progress.
        if(TokenKind == TokenKind_None)
        {
            TokenKind = TokenKind_BadCharacter;
            TokenStart = Byte;
            TokenEnd = Byte + 1;
            Byte += 1;
        }

        //- sixten: push token
        // Only non-empty tokens that are not rejected by the filter are recorded.
        if(TokenKind != 0 && (!ExcludeFilter || !ExcludeFilter(TokenKind)) && TokenStart != 0 && TokenEnd > TokenStart)
        {
            token Token = {TokenKind, {TokenStart - TextStart, TokenEnd - TextStart}, Line};
            T_TokenChunkListPush(Scratch.Arena, &Tokens, Token, 4096);
        }

        // Errors are reported on Arena (not the scratch arena) so they outlive this call.
        if(TokenKind == TokenKind_BrokenComment)
        {
            string Message = StrLit("broken comment");
            T_MessageListPush(Arena, &Messages, T_MessageKind_Error, TokenStart - TextStart, Message);
        }

        if(TokenKind == TokenKind_BrokenStringLiteral)
        {
            string Message = StrLit("broken string literal");
            T_MessageListPush(Arena, &Messages, T_MessageKind_Error, TokenStart - TextStart, Message);
        }
    }

    // Flatten the scratch-allocated chunks into Arena before the scratch memory is released.
    tokenize_result Result =
    {
        T_TokenArrayFromList(Arena, &Tokens),
        Messages
    };
    ReleaseScratch(Scratch);
    return(Result);
}
|
|
|