//////////////////////////////// //~ sixten: Token Type Functions static string T_StringFromToken(string Text, token Token) { string Result = Substring(Text, Token.Range); return(Result); } static b32 T_TokenMatches(token Token, token_flags Flags, string Text, string String) { b32 Result = (Token.Flags & Flags && AreEqual(T_StringFromToken(Text, Token), String)); return(Result); } static void T_TokenChunkListPush(memory_arena *Arena, token_chunk_list *List, token Token, s64 MaxTokenCountPerNode) { token_chunk_node *Node = List->Last; if(!Node || Node->Count >= Node->MaxCount) { Node = PushStruct(Arena, token_chunk_node); Node->Count = 0; Node->MaxCount = MaxTokenCountPerNode; Node->Tokens = PushArrayNoClear(Arena, token, Node->MaxCount); QueuePush(List->First, List->Last, Node); } Node->Tokens[Node->Count] = Token; Node->Count += 1; List->Count += 1; } static token_array T_TokenArrayFromList(memory_arena *Arena, token_chunk_list *List) { token_array Result = {}; Result.Tokens = PushArrayNoClear(Arena, token, List->Count); Result.Count = List->Count; s64 Index = 0; for(token_chunk_node *Node = List->First; Node != 0; Node = Node->Next) { Copy(Result.Tokens + Index, Node->Tokens, sizeof(token)*Node->Count); Index += Node->Count; } return(Result); } //////////////////////////////// //~ sixten: Tokenizer Message Functions static void T_MessageListPush(memory_arena *Arena, tokenizer_message_list *List, tokenizer_message_kind Kind, s64 Offset, string String) { tokenizer_message *Message = PushStructNoClear(Arena, tokenizer_message); Message->Kind = Kind; Message->Offset = Offset; Message->String = String; QueuePush(List->First, List->Last, Message); } //////////////////////////////// //~ sixten: Text -> Token Functions static tokenize_result T_TokenizeFromText(memory_arena *Arena, string Filename, string Text, token_flags ExcludeFilter) { temporary_memory Scratch = GetScratch(&Arena, 1); token_chunk_list Tokens = {}; tokenizer_message_list Messages = {}; u8 *TextStart = Text.Data; u8 *TextEnd = TextStart + Text.Count; u8 *Byte = TextStart; //- sixten: scan string & produce tokens for(;Byte < TextEnd;) { token_flags TokenFlags = 0; u8 *TokenStart = 0; u8 *TokenEnd = 0; //- sixten: whitespace if(TokenFlags == 0 && (*Byte == ' ' || *Byte == '\t' || *Byte == '\v' || *Byte == '\r')) { TokenFlags = TokenFlag_Whitespace; TokenStart = Byte; TokenEnd = Byte; Byte += 1; for(;Byte <= TextEnd; Byte += 1) { TokenEnd += 1; if(Byte == TextEnd || !(*Byte == ' ' || *Byte == '\t' || *Byte == '\v' || *Byte == '\r')) { break; } } } //- sixten: newlines if(TokenFlags == 0 && *Byte == '\n') { TokenFlags = TokenFlag_Newline; TokenStart = Byte; TokenEnd = Byte + 1; Byte += 1; } //- sixten: single-line comments if(TokenFlags == 0 && (Byte[0] == '/' && Byte[1] == '/')) { TokenFlags = TokenFlag_Comment; TokenStart = Byte; TokenEnd = Byte + 2; Byte += 2; for(;Byte <= TextEnd; Byte += 1) { if(Byte == TextEnd || *Byte == '\n' || *Byte == '\r') { break; } TokenEnd += 1; } } //- sixten: multi-line comments if(TokenFlags == 0 && (Byte+1 < TextEnd && Byte[0] == '/' && Byte[1] == '*')) { TokenFlags = TokenFlag_Comment; TokenStart = Byte; TokenEnd = Byte + 2; Byte += 2; for(;Byte <= TextEnd; Byte += 1) { // sixten(NOTE): This could potentially be wrong. The TokenEnd += 1 statement could currently make the token include the EOF. if(Byte == TextEnd) { TokenFlags |= TokenFlag_BrokenComment; break; } TokenEnd += 1; if(Byte+1 < TextEnd && Byte[0] == '*' && Byte[1] == '/') { TokenEnd += 1; Byte += 2; break; } } } //- sixten: identifiers if(TokenFlags == 0 && (('A' <= *Byte && *Byte <= 'Z') || ('a' <= *Byte && *Byte <= 'z') || (UTF8Lengths[*Byte>>3] > 1) || *Byte == '_')) { TokenFlags = TokenFlag_Identifier; TokenStart = Byte; TokenEnd = Byte; Byte += 1; for(;Byte <= TextEnd; Byte += 1) { TokenEnd += 1; if(Byte == TextEnd || !(('A' <= *Byte && *Byte <= 'Z') || ('a' <= *Byte && *Byte <= 'z') || ('0' <= *Byte && *Byte <= '9') || (UTF8Lengths[*Byte>>3] > 1) || *Byte == '_')) { break; } } } //- sixten: numerics if(TokenFlags == 0 && (('0' <= *Byte && *Byte <= '9') || *Byte == '.' || ((*Byte == '+' || *Byte == '-') && Byte + 1 < TextEnd && '0' <= Byte[1] && Byte[1] <= '9'))) { TokenFlags = TokenFlag_Numeric; TokenStart = Byte; TokenEnd = Byte; Byte += 1; for(;Byte <= TextEnd; Byte += 1) { TokenEnd += 1; if(Byte == TextEnd || !(('A' <= *Byte && *Byte <= 'Z') || ('a' <= *Byte && *Byte <= 'z') || ('0' <= *Byte && *Byte <= '9') || *Byte == '_' || *Byte == '.')) { break; } } } //- sixten: string literals if(TokenFlags == 0 && *Byte == '"') { TokenFlags = TokenFlag_StringLiteral; TokenStart = Byte; TokenEnd = Byte; Byte += 1; for(;Byte <= TextEnd; Byte += 1) { TokenEnd += 1; if(Byte == TextEnd || *Byte == '\n') { TokenFlags |= TokenFlag_BrokenStringLiteral; break; } if(*Byte == '"') { Byte += 1; TokenEnd += 1; break; } } } //- sixten: multi-char symbols if(TokenFlags == 0 && (*Byte == '!' || *Byte == '%' || *Byte == '&' || *Byte == '|' || *Byte == '/' || *Byte == '=' || *Byte == '?' || *Byte == '^' || *Byte == '*' || *Byte == '+' || *Byte == '-' || *Byte == '$' || *Byte == '<' || *Byte == '>' || *Byte == '~' || *Byte == '\'')) { TokenFlags = TokenFlag_Symbol; TokenStart = Byte; TokenEnd = Byte; Byte += 1; for(;Byte <= TextEnd; Byte += 1) { TokenEnd += 1; if(Byte == TextEnd || !(*Byte == '!' || *Byte == '%' || *Byte == '&' || *Byte == '|' || *Byte == '/' || *Byte == '=' || *Byte == '?' || *Byte == '^' || *Byte == '*' || *Byte == '+' || *Byte == '-' || *Byte == '$' || *Byte == '<' || *Byte == '>' || *Byte == '~' || *Byte == '\'')) { break; } } } //- sixten: single-char symbols if(TokenFlags == 0 && (*Byte == '{' || *Byte == '}' || *Byte == '(' || *Byte == ')' || *Byte == '[' || *Byte == ']' || *Byte == ',' || *Byte == ';' || *Byte == ':' || *Byte == '@' || *Byte == '#')) { TokenFlags = TokenFlag_Reserved; TokenStart = Byte; TokenEnd = Byte + 1; Byte += 1; } //- sixten: bad character if(TokenFlags == 0) { TokenFlags = TokenFlag_BadCharacter; TokenStart = Byte; TokenStart = Byte + 1; Byte += 1; } //- sixten: push token if(TokenFlags != 0 && !(TokenFlags & ExcludeFilter) && TokenStart != 0 && TokenEnd > TokenStart) { token Token = {TokenFlags, {TokenStart - TextStart, TokenEnd - TextStart}}; T_TokenChunkListPush(Scratch.Arena, &Tokens, Token, 4096); } if(TokenFlags & TokenFlag_BrokenComment) { string Message = StrLit("broken comment"); T_MessageListPush(Arena, &Messages, T_MessageKind_Error, TokenStart - TextStart, Message); } if(TokenFlags & TokenFlag_BrokenStringLiteral) { string Message = StrLit("broken string literal"); T_MessageListPush(Arena, &Messages, T_MessageKind_Error, TokenStart - TextStart, Message); } } tokenize_result Result = { T_TokenArrayFromList(Arena, &Tokens), Messages }; ReleaseScratch(Scratch); return(Result); }