/* date = July 9th 2023 8:04 pm */
#ifndef VN_TOKENIZER_H
#define VN_TOKENIZER_H
////////////////////////////////
//~ sixten: Token Types
// Kinds of lexical tokens produced by the tokenizer. The *Begin/*End
// enumerators are exclusive range markers: a kind K is a symbol iff
// TokenKind_SymbolsBegin < K < TokenKind_SymbolsEnd, and likewise for
// keywords. Do not reorder enumerators without updating any code that
// relies on these ranges.
enum token_kind
{
    TokenKind_None,
    
    // sixten: labels
    TokenKind_Identifier,
    TokenKind_Numeric,
    TokenKind_StringLiteral,
    
    // sixten: symbols
    TokenKind_SymbolsBegin,
    // sixten: (single char)
    TokenKind_CurlyOpen,
    TokenKind_CurlyClose,
    TokenKind_ParenthesisOpen,
    TokenKind_ParenthesisClose,
    TokenKind_Comma,
    TokenKind_Dot,
    TokenKind_At,
    TokenKind_PoundSign,
    TokenKind_Semicolon,
    TokenKind_Plus,
    TokenKind_Minus,
    TokenKind_Star,
    TokenKind_Slash,
    // sixten: (one or two chars)
    TokenKind_Bang,
    TokenKind_BangEqual,
    TokenKind_Equal,
    TokenKind_EqualEqual,
    TokenKind_Greater,
    TokenKind_GreaterEqual,
    TokenKind_Less,
    TokenKind_LessEqual,
    TokenKind_SymbolsEnd,
    
    // sixten: keywords
    TokenKind_KeywordsBegin,
    TokenKind_And,
    TokenKind_Branch,
    TokenKind_Else,
    TokenKind_False,
    TokenKind_For,
    TokenKind_If,
    TokenKind_Jump,
    TokenKind_Or,
    TokenKind_Proc,
    TokenKind_True,
    TokenKind_Var,
    TokenKind_While,
    TokenKind_KeywordsEnd,
    
    // sixten: whitespace
    TokenKind_Whitespace,
    TokenKind_Newline,
    TokenKind_Comment,
    
    // sixten: invalid kinds (malformed or unrecognized input)
    TokenKind_BrokenComment,
    TokenKind_BrokenStringLiteral,
    TokenKind_BadCharacter,
};
2023-08-06 10:35:09 +00:00
|
|
|
typedef b32 tokenizer_filter_function(token_kind Kind);
|
|
|
|
|
|
|
|
inline b32 T_IsComment(token_kind Kind) { return(Kind == TokenKind_Comment); }
|
|
|
|
inline b32 T_IsWhitespace(token_kind Kind) { return(Kind == TokenKind_Whitespace ||
|
|
|
|
Kind == TokenKind_Newline); }
|
|
|
|
inline b32 T_IsIrregular(token_kind Kind) { return(T_IsComment(Kind) ||
|
|
|
|
T_IsWhitespace(Kind)); }
|
|
|
|
inline b32 T_IsRegular(token_kind Kind) { return(!T_IsIrregular(Kind)); }
|
|
|
|
inline b32 T_IsInvalid(token_kind Kind) { return(Kind == TokenKind_None ||
|
|
|
|
Kind == TokenKind_BrokenComment ||
|
|
|
|
Kind == TokenKind_BrokenStringLiteral ||
|
|
|
|
Kind == TokenKind_BadCharacter); }
|
2023-06-19 17:12:26 +00:00
|
|
|
|
|
|
|
struct token
|
|
|
|
{
|
2023-08-06 10:35:09 +00:00
|
|
|
token_kind Kind;
|
2023-07-19 15:09:41 +00:00
|
|
|
range1_s64 Range;
|
2023-06-19 17:12:26 +00:00
|
|
|
};
|
|
|
|
|
2023-07-19 15:09:41 +00:00
|
|
|
struct token_chunk_node
|
2023-06-19 17:12:26 +00:00
|
|
|
{
|
2023-07-19 15:09:41 +00:00
|
|
|
token *Tokens;
|
|
|
|
s64 MaxCount;
|
|
|
|
s64 Count;
|
|
|
|
token_chunk_node *Next;
|
|
|
|
};
|
2023-06-19 17:12:26 +00:00
|
|
|
|
2023-07-19 15:09:41 +00:00
|
|
|
struct token_chunk_list
|
2023-06-19 17:12:26 +00:00
|
|
|
{
|
2023-07-19 15:09:41 +00:00
|
|
|
token_chunk_node *First;
|
|
|
|
token_chunk_node *Last;
|
|
|
|
s64 Count;
|
|
|
|
};
|
2023-06-19 17:12:26 +00:00
|
|
|
|
2023-07-19 15:09:41 +00:00
|
|
|
struct token_array
|
2023-06-19 17:12:26 +00:00
|
|
|
{
|
2023-07-19 15:09:41 +00:00
|
|
|
token *Tokens;
|
|
|
|
s64 Count;
|
|
|
|
};
|
2023-06-19 17:12:26 +00:00
|
|
|
|
////////////////////////////////
//~ sixten: Tokenizer Message Types

// Severity of a diagnostic emitted while tokenizing.
enum tokenizer_message_kind
{
    T_MessageKind_Invalid,
    T_MessageKind_Note,
    T_MessageKind_Warning,
    T_MessageKind_Error,
};
struct tokenizer_message
|
|
|
|
{
|
|
|
|
tokenizer_message *Next;
|
|
|
|
tokenizer_message_kind Kind;
|
|
|
|
s64 Offset;
|
|
|
|
string String;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct tokenizer_message_list
|
|
|
|
{
|
|
|
|
tokenizer_message *First;
|
|
|
|
tokenizer_message *Last;
|
|
|
|
s64 Count;
|
|
|
|
};
|
|
|
|
|
|
|
|
////////////////////////////////
|
|
|
|
//~ sixten: Text -> Token Types
|
|
|
|
struct tokenize_result
|
|
|
|
{
|
|
|
|
token_array Tokens;
|
|
|
|
tokenizer_message_list Messages;
|
|
|
|
};
|
|
|
|
|
|
|
|
////////////////////////////////
|
|
|
|
//~ sixten: Token Type Functions
|
|
|
|
static string T_StringFromToken(string Text, token Token);
|
|
|
|
static void T_TokenChunkListPush(memory_arena *Arena, token_chunk_list *List, token Token, s64 MaxTokenCountPerNode);
|
|
|
|
static token_array T_TokenArrayFromChunkList(memory_arena *Arena, token_chunk_list *List);
|
|
|
|
|
|
|
|
////////////////////////////////
|
|
|
|
//~ sixten: Tokenizer Message Functions
|
|
|
|
static void T_MessageListPush(memory_arena *Arena, tokenizer_message_list *List, tokenizer_message_kind Kind, s64 Offset, string String);
|
|
|
|
|
|
|
|
////////////////////////////////
|
|
|
|
//~ sixten: Text -> Token Functions
|
2023-08-06 10:35:09 +00:00
|
|
|
static tokenize_result T_TokenizeFromText(memory_arena *Arena, string Text, tokenizer_filter_function *ExcludeFilter = 0);
|
2023-06-19 17:12:26 +00:00
|
|
|
|
|
|
|
#endif //VN_TOKENIZER_H
|