////////////////////////////////
//~ sixten: Token Type Functions

static string T_StringFromToken(string Text, token Token)
{
    string Result = Substring(Text, Token.Range);
    return(Result);
}

static void T_TokenChunkListPush(arena *Arena, token_chunk_list *List, token Token, s64 MaxTokenCountPerNode)
{
    token_chunk_node *Node = List->Last;
    if(!Node || Node->Count >= Node->MaxCount)
    {
        Node = PushStruct(Arena, token_chunk_node);
        Node->Count = 0;
        Node->MaxCount = MaxTokenCountPerNode;
        Node->Tokens = PushArrayNoClear(Arena, token, Node->MaxCount);
        QueuePush(List->First, List->Last, Node);
    }
    Node->Tokens[Node->Count] = Token;
    Node->Count += 1;
    List->Count += 1;
}

static token_array T_TokenArrayFromList(arena *Arena, token_chunk_list *List)
{
    token_array Result = {};
    Result.Tokens = PushArrayNoClear(Arena, token, List->Count);
    Result.Count = List->Count;
    s64 Index = 0;
    for(token_chunk_node *Node = List->First; Node != 0; Node = Node->Next)
    {
        Copy(Result.Tokens + Index, Node->Tokens, sizeof(token)*Node->Count);
        Index += Node->Count;
    }
    return(Result);
}

////////////////////////////////
//~ sixten: Tokenizer Message Functions

static void T_MessageListPush(arena *Arena, tokenizer_message_list *List, tokenizer_message_kind Kind, s64 Offset, string String)
{
    tokenizer_message *Message = PushStructNoClear(Arena, tokenizer_message);
    Message->Kind = Kind;
    Message->Offset = Offset;
    Message->String = String;
    QueuePush(List->First, List->Last, Message);
}

////////////////////////////////
//~ sixten: Text -> Token Functions

static tokenize_result T_TokenizeFromText(arena *Arena, string Text, tokenizer_filter_function *ExcludeFilter)
{
    temp Scratch = GetScratch(&Arena, 1);
    token_chunk_list Tokens = {};
    tokenizer_message_list Messages = {};

    u8 *TextStart = Text.Data;
    u8 *TextEnd = TextStart + Text.Count;
    u8 *Byte = TextStart;
    s64 Line = 0;

    //- sixten: scan string & produce tokens
    for(;Byte < TextEnd;)
    {
        token_kind TokenKind = TokenKind_None;
        u8 *TokenStart = 0;
        u8 *TokenEnd = 0;

        //- sixten: whitespace
        if(TokenKind == TokenKind_None && (*Byte == ' ' || *Byte == '\t' || *Byte == '\v' || *Byte == '\r'))
        {
            TokenKind = TokenKind_Whitespace;
            TokenStart = Byte;
            TokenEnd = Byte;
            Byte += 1;
            for(;Byte <= TextEnd; Byte += 1)
            {
                TokenEnd += 1;
                if(Byte == TextEnd || !(*Byte == ' ' || *Byte == '\t' || *Byte == '\v' || *Byte == '\r'))
                {
                    break;
                }
            }
        }

        //- sixten: newlines
        if(TokenKind == TokenKind_None && *Byte == '\n')
        {
            TokenKind = TokenKind_Newline;
            TokenStart = Byte;
            TokenEnd = Byte + 1;
            Line += 1;
            Byte += 1;
        }

        //- sixten: single-line comments
        if(TokenKind == TokenKind_None && (Byte+1 < TextEnd && Byte[0] == '/' && Byte[1] == '/'))
        {
            TokenKind = TokenKind_Comment;
            TokenStart = Byte;
            TokenEnd = Byte + 2;
            Byte += 2;
            for(;Byte <= TextEnd; Byte += 1)
            {
                if(Byte == TextEnd || *Byte == '\n' || *Byte == '\r')
                {
                    break;
                }
                TokenEnd += 1;
            }
        }

        //- sixten: multi-line comments
        if(TokenKind == TokenKind_None && (Byte+1 < TextEnd && Byte[0] == '/' && Byte[1] == '*'))
        {
            TokenKind = TokenKind_Comment;
            TokenStart = Byte;
            TokenEnd = Byte + 2;
            Byte += 2;
            for(;Byte <= TextEnd; Byte += 1)
            {
                // sixten(NOTE): This could potentially be wrong. The TokenEnd += 1
                // statement could currently make the token include the EOF.
                if(Byte == TextEnd)
                {
                    TokenKind = TokenKind_BrokenComment;
                    break;
                }
                TokenEnd += 1;
                if(Byte+1 < TextEnd && Byte[0] == '*' && Byte[1] == '/')
                {
                    TokenEnd += 1;
                    Byte += 2;
                    break;
                }
            }
        }

        //- sixten: identifiers
        if(TokenKind == TokenKind_None && (('A' <= *Byte && *Byte <= 'Z') ||
                                           ('a' <= *Byte && *Byte <= 'z') ||
                                           (UTF8Lengths[*Byte>>3] > 1) ||
                                           *Byte == '_'))
        {
            TokenKind = TokenKind_Identifier;
            TokenStart = Byte;
            TokenEnd = Byte;
            Byte += UTF8Lengths[*Byte>>3];
            for(;Byte <= TextEnd; Byte += UTF8Lengths[*Byte>>3])
            {
                if(Byte == TextEnd || !(('A' <= *Byte && *Byte <= 'Z') ||
                                        ('a' <= *Byte && *Byte <= 'z') ||
                                        ('0' <= *Byte && *Byte <= '9') ||
                                        (UTF8Lengths[*Byte>>3] > 1) ||
                                        *Byte == '_'))
                {
                    TokenEnd = Byte;
                    break;
                }
            }

            string String = MakeString(TokenStart, TokenEnd-TokenStart);
            if(0) {}
            else if(AreEqual(String, StrLit("branch"))) { TokenKind = TokenKind_Branch; }
            else if(AreEqual(String, StrLit("else")))   { TokenKind = TokenKind_Else; }
            else if(AreEqual(String, StrLit("false")))  { TokenKind = TokenKind_False; }
            else if(AreEqual(String, StrLit("if")))     { TokenKind = TokenKind_If; }
            else if(AreEqual(String, StrLit("then")))   { TokenKind = TokenKind_Then; }
            else if(AreEqual(String, StrLit("jump")))   { TokenKind = TokenKind_Jump; }
            else if(AreEqual(String, StrLit("proc")))   { TokenKind = TokenKind_Proc; }
            else if(AreEqual(String, StrLit("true")))   { TokenKind = TokenKind_True; }
            else if(AreEqual(String, StrLit("let")))    { TokenKind = TokenKind_Let; }
            else if(AreEqual(String, StrLit("while")))  { TokenKind = TokenKind_While; }
            else if(AreEqual(String, StrLit("call")))   { TokenKind = TokenKind_Call; }
            else if(AreEqual(String, StrLit("wait")))   { TokenKind = TokenKind_Wait; }
            else if(AreEqual(String, StrLit("end")))    { TokenKind = TokenKind_End; }
        }

        //- sixten: numerics
        if(TokenKind == TokenKind_None && (('0' <= *Byte && *Byte <= '9') ||
                                           (*Byte == '-' && Byte + 1 < TextEnd && '0' <= Byte[1] && Byte[1] <= '9')))
        {
            TokenKind = TokenKind_Numeric;
            TokenStart = Byte;
            TokenEnd = Byte;
            Byte += 1;
            for(;Byte <= TextEnd; Byte += 1)
            {
                TokenEnd += 1;
                if(Byte == TextEnd || !(('0' <= *Byte && *Byte <= '9') || *Byte == '_' || *Byte == '.'))
                {
                    break;
                }
            }
        }

        //- sixten: string literals
        if(TokenKind == TokenKind_None && *Byte == '"')
        {
            TokenKind = TokenKind_StringLiteral;
            TokenStart = Byte;
            TokenEnd = Byte;
            Byte += 1;
            for(;Byte <= TextEnd; Byte += 1)
            {
                TokenEnd += 1;
                if(Byte == TextEnd || *Byte == '\n')
                {
                    TokenKind = TokenKind_BrokenStringLiteral;
                    break;
                }
                if(*Byte == '"')
                {
                    Byte += 1;
                    TokenEnd += 1;
                    break;
                }
            }
        }

        //- sixten: symbols
        if(TokenKind == TokenKind_None && (*Byte == '{' || *Byte == '}' || *Byte == '(' || *Byte == ')' ||
                                           *Byte == ',' || *Byte == '.' || *Byte == '$' || *Byte == ';' ||
                                           *Byte == '+' || *Byte == '-' || *Byte == '*' || *Byte == '/'))
        {
            TokenStart = Byte;
            TokenEnd = Byte+1;
            switch(*Byte)
            {
                case '{': { TokenKind = TokenKind_CurlyOpen; } break;
                case '}': { TokenKind = TokenKind_CurlyClose; } break;
                case '(': { TokenKind = TokenKind_ParenthesisOpen; } break;
                case ')': { TokenKind = TokenKind_ParenthesisClose; } break;
                case ',': { TokenKind = TokenKind_Comma; } break;
                case '.': { TokenKind = TokenKind_Dot; } break;
                case '$': { TokenKind = TokenKind_DollarSign; } break;
                case ';': { TokenKind = TokenKind_Semicolon; } break;
                case '+': { TokenKind = TokenKind_Plus; } break;
                case '-': { TokenKind = TokenKind_Minus; } break;
                case '*': { TokenKind = TokenKind_Star; } break;
                case '/': { TokenKind = TokenKind_Slash; } break;
                InvalidDefaultCase;
            }
            Byte += 1;
        }
        //- sixten: operators
        if(TokenKind == TokenKind_None && (*Byte == '!' || *Byte == '=' || *Byte == '>' || *Byte == '<'))
        {
            TokenStart = Byte;
            TokenEnd = Byte+1;
            switch(*Byte)
            {
                case '!': { TokenKind = TokenKind_Bang; } break;
                case '=': { TokenKind = TokenKind_Equal; } break;
                case '>': { TokenKind = TokenKind_Greater; } break;
                case '<': { TokenKind = TokenKind_Less; } break;
                InvalidDefaultCase;
            }
            Byte += 1;

            // sixten(NOTE): A trailing '=' promotes the token to its compound form
            // (e.g. '!' -> '!='). This relies on the enum placing each compound kind
            // directly after its single-character kind.
            if(Byte < TextEnd && (*Byte == '='))
            {
                TokenKind = (token_kind)(TokenKind + 1);
                TokenEnd += 1;
                Byte += 1;
            }
        }

        //- sixten: bad character
        if(TokenKind == TokenKind_None)
        {
            TokenKind = TokenKind_BadCharacter;
            TokenStart = Byte;
            TokenEnd = Byte + 1;
            Byte += 1;
        }

        //- sixten: push token
        if(TokenKind != TokenKind_None && (!ExcludeFilter || !ExcludeFilter(TokenKind)) && TokenStart != 0 && TokenEnd > TokenStart)
        {
            token Token = {TokenKind, {TokenStart - TextStart, TokenEnd - TextStart}, Line};
            T_TokenChunkListPush(Scratch.Arena, &Tokens, Token, 4096);
        }

        if(TokenKind == TokenKind_BrokenComment)
        {
            string Message = StrLit("broken comment");
            T_MessageListPush(Arena, &Messages, T_MessageKind_Error, TokenStart - TextStart, Message);
        }

        if(TokenKind == TokenKind_BrokenStringLiteral)
        {
            string Message = StrLit("broken string literal");
            T_MessageListPush(Arena, &Messages, T_MessageKind_Error, TokenStart - TextStart, Message);
        }
    }

    tokenize_result Result = { T_TokenArrayFromList(Arena, &Tokens), Messages };
    ReleaseScratch(Scratch);
    return(Result);
}
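
////////////////////////////////
//~ sixten: Usage Sketch

// sixten(NOTE): A minimal usage sketch of the tokenizer above, kept in a comment
// so it does not affect the build. The filter `T_FilterTrivia`, the `b32` return
// type for `tokenizer_filter_function`, and the `tokenize_result` field names
// (`Tokens`, `Messages`) are assumptions for illustration; only the functions
// defined in this file are real.
//
//  static b32 T_FilterTrivia(token_kind Kind)  // hypothetical exclude-filter
//  {
//      // nonzero return excludes the token from the output array
//      b32 Result = (Kind == TokenKind_Whitespace ||
//                    Kind == TokenKind_Newline ||
//                    Kind == TokenKind_Comment);
//      return(Result);
//  }
//
//  // tokenize a source string, skipping trivia tokens
//  tokenize_result Tokenized = T_TokenizeFromText(Arena, Text, T_FilterTrivia);
//  for(s64 Index = 0; Index < Tokenized.Tokens.Count; Index += 1)
//  {
//      token Token = Tokenized.Tokens.Tokens[Index];
//      string TokenText = T_StringFromToken(Text, Token);
//      // ... feed the token and its text to the parser
//  }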