website

Website contents
git clone git://git.reagancfischer.dev/website.git
Log | Files | Refs

commit 361ae256638c968e45658d52e4049ea12b05ca92
parent d3789f291f6b8e22a481a8b443a77d0c53966926
Author: Reagan <rfische2@uccs.edu>
Date:   Sat, 24 Aug 2024 11:46:04 -0600

lex refactor

Diffstat:
Mprojects/cminus/code/hash_table.c | 248+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Mprojects/cminus/code/hash_table.h | 14++++++++------
Mprojects/cminus/code/lexer.lit | 1320+++++++++++++++++++++++++++++++++++++++----------------------------------------
Mprojects/cminus/code/makefile | 5+++--
Rprojects/style.css -> projects/cminus/code/style.css | 0
Mprojects/cminus/code/token.c | 474++++++++++++++++++++++++++++++++++++++-----------------------------------------
Mprojects/cminus/code/token.h | 194+++++++++++++++++++++++++++++++++++++------------------------------------------
Mprojects/cminus/code/tokenizer.c | 357++++++++++++++++++++++++++++++++++++++++---------------------------------------
Mprojects/cminus/lexer.html | 1762++++++++++++++++++++++++++++++++++++++++---------------------------------------
Dprojects/cminus/lexer_new.html | 288-------------------------------------------------------------------------------
Dprojects/cminus/lexer_new.lit | 615-------------------------------------------------------------------------------
Dprojects/style_old.css | 108-------------------------------------------------------------------------------
12 files changed, 2184 insertions(+), 3201 deletions(-)

diff --git a/projects/cminus/code/hash_table.c b/projects/cminus/code/hash_table.c @@ -6,158 +6,170 @@ /* Hash Table Data Structure */ struct hash_table { - struct hash_table_entry **entries; - int size; - hash_table_cmp_fn cmp; - hash_table_hash_fn hash; - hash_table_dtor dtor; + struct hash_table_entry **entries; + int size; + hash_table_cmp_fn cmp; + hash_table_hash_fn hash; + hash_table_dtor dtor; }; /* Hash Table Entry Data Structure */ struct hash_table_entry { - void *key; - void *value; - struct hash_table_entry *next; + void *key; + void *value; + struct hash_table_entry *next; }; hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash, hash_table_dtor dtor) { - /* Allocate and Initialize Hash Table */ - hash_table_t *table = malloc(sizeof(struct hash_table)); - if (table == NULL) { - fputs("Out of memory, could not allocate hash table\n", stderr); - exit(1); - } - table->entries = calloc(size, sizeof(struct hash_table_entry *)); - if (table->entries == NULL) { - fputs("Out of memory, could not allocate hash table entries\n", stderr); - exit(1); - } - table->size = size; - table->cmp = cmp; - table->hash = hash; - table->dtor = dtor; - - return table; + /* Allocate and Initialize Hash Table */ + hash_table_t *table = malloc(sizeof(struct hash_table)); + if (!table) { + fprintf(stderr, "Error: Out of memory, could not allocate hash table\n"); + exit(EXIT_FAILURE); + } + table->entries = calloc(size, sizeof(struct hash_table_entry *)); + if (!table->entries) { + fprintf(stderr, "Error: Out of memory, could not allocate hash table entries\n"); + free(table); + exit(EXIT_FAILURE); + } + table->size = size; + table->cmp = cmp; + table->hash = hash; + table->dtor = dtor; + + return table; } void hash_table_destroy(hash_table_t *table) { - /* Destroy Entries */ - for (int i = 0; i < table->size; i++) { - struct hash_table_entry *entry = table->entries[i]; - while (entry != NULL) { - struct hash_table_entry *next = entry->next; - 
if (table->dtor != NULL) { - table->dtor(entry->key, 1); - table->dtor(entry->value, 0); - } - free(entry); - entry = next; + if (!table) return; + /* Destroy Entries */ + for (int i = 0; i < table->size; i++) { + struct hash_table_entry *entry = table->entries[i]; + while (entry) { + struct hash_table_entry *next = entry->next; + if (table->dtor) { + table->dtor(entry->key, 1); + table->dtor(entry->value, 0); + } + free(entry); + entry = next; + } } - } - free(table->entries); - free(table); + free(table->entries); + free(table); } -void *hash_table_get(hash_table_t *table, void *key) { - /* Get Entry By Hash */ - unsigned int hash = table->hash(key) % table->size; - struct hash_table_entry *entry = table->entries[hash]; - - /* Loop Through Entries and Return Value if Match */ - while (entry != NULL) { - if (table->cmp(entry->key, key) == 0) { - return entry->value; +void *hash_table_get(const hash_table_t *table, const void *key) { + if (!table || !key) return NULL; + /* Get Entry By Hash */ + unsigned int hash = table->hash(key) % table->size; + struct hash_table_entry *entry = table->entries[hash]; + + /* Loop Through Entries and Return Value if Match */ + while (entry) { + if (table->cmp(entry->key, key) == 0) { + return entry->value; + } + entry = entry->next; } - entry = entry->next; - } - return NULL; + return NULL; } void hash_table_put(hash_table_t *table, void *key, void *value) { - /* Get Entry By Hash */ - unsigned int hash = table->hash(key) % table->size; - struct hash_table_entry *entry = table->entries[hash]; - - /* Loop Through Entries and Replace Value if Key Matches */ - while (entry != NULL) { - if (table->cmp(entry->key, key) == 0) { - entry->value = value; - return; + if (!table || !key) return; + /* Get Entry By Hash */ + unsigned int hash = table->hash(key) % table->size; + struct hash_table_entry *entry = table->entries[hash]; + + /* Loop Through Entries and Replace Value if Key Matches */ + while (entry) { + if (table->cmp(entry->key, 
key) == 0) { + if (table->dtor) table->dtor(entry->value, 0); + entry->value = value; + return; + } + entry = entry->next; } - entry = entry->next; - } - - /* Allocate New Entry if No Match */ - struct hash_table_entry *new_entry = malloc(sizeof(struct hash_table_entry)); - if (new_entry == NULL) { - fputs("Out of memory, could not allocate hash table entry\n", stderr); - exit(1); - } - new_entry->key = key; - new_entry->value = value; - new_entry->next = table->entries[hash]; - table->entries[hash] = new_entry; + + /* Allocate New Entry if No Match */ + struct hash_table_entry *new_entry = malloc(sizeof(struct hash_table_entry)); + if (!new_entry) { + fprintf(stderr, "Error: Out of memory, could not allocate hash table entry\n"); + exit(EXIT_FAILURE); + } + new_entry->key = key; + new_entry->value = value; + new_entry->next = table->entries[hash]; + table->entries[hash] = new_entry; } -void hash_table_remove(hash_table_t *table, void *key) { - /* Get Entry By Hash */ - unsigned int hash = table->hash(key) % table->size; - struct hash_table_entry *entry = table->entries[hash]; - - /* Loop Through Entries and Remove Entry if Key Matches */ - struct hash_table_entry *prev = NULL; - while (entry != NULL) { - if (table->cmp(entry->key, key) == 0) { - if (prev == NULL) { - table->entries[hash] = entry->next; - } else { - prev->next = entry->next; - } - if (table->dtor != NULL) { - table->dtor(entry->key, 1); - table->dtor(entry->value, 0); - } - free(entry); - return; - } - prev = entry; - entry = entry->next; - } +void hash_table_remove(hash_table_t *table, const void *key) { + if (!table || !key) return; + /* Get Entry By Hash */ + unsigned int hash = table->hash(key) % table->size; + struct hash_table_entry *entry = table->entries[hash]; + + /* Loop Through Entries and Remove Entry if Key Matches */ + struct hash_table_entry *prev = NULL; + while (entry) { + if (table->cmp(entry->key, key) == 0) { + if (prev) + prev->next = entry->next; + else + table->entries[hash] 
= entry->next; + + if (table->dtor) { + table->dtor(entry->key, 1); + table->dtor(entry->value, 0); + } + free(entry); + return; + } + prev = entry; + entry = entry->next; + } } #ifdef TEST_HASH_TABLE #include <assert.h> -#include <stdio.h> -#include <string.h> -int string_cmp(void *key1, void *key2) { - return strcmp((char *)key1, (char *)key2); +static int string_cmp(const void *key1, const void *key2) { + return strcmp((const char *)key1, (const char *)key2); } -unsigned long string_hash(void *key) { - unsigned long hash = 5381; - char *str = (char *)key; - while (*str != '\0') { - hash = ((hash << 5) + hash) + *str; - str++; - } - return hash; +static unsigned int string_hash(const void *key) { + unsigned int hash = 5381; + const unsigned char *str = key; + int c; + + while ((c = *str++)) + { + hash = ((hash << 5) + hash) + c; + } + + return hash; } -int main() { - hash_table_t *table = hash_table_create(16, string_cmp, string_hash, NULL); - hash_table_put(table, "foo", "bar"); - hash_table_put(table, "foo", "baz"); - assert(strcmp((char *)hash_table_get(table, "foo"), "baz") == 0); - hash_table_remove(table, "foo"); - assert(hash_table_get(table, "foo") == NULL); - hash_table_destroy(table); - return 0; +int main(void) { + hash_table_t *table = hash_table_create(16, string_cmp, string_hash, NULL); + assert(table != NULL); + + hash_table_put(table, "foo", "bar"); + hash_table_put(table, "foo", "baz"); + assert(strcmp((const char *)hash_table_get(table, "foo"), "baz") == 0); + + hash_table_remove(table, "foo"); + assert(hash_table_get(table, "foo") == NULL); + + hash_table_destroy(table); + printf("All tests passed!\n"); + return 0; } #endif diff --git a/projects/cminus/code/hash_table.h b/projects/cminus/code/hash_table.h @@ -1,20 +1,22 @@ /* hash_table.h */ #ifndef HASH_TABLE_H #define HASH_TABLE_H + /* Hash Table Opaque Types */ typedef struct hash_table hash_table_t; -typedef int (*hash_table_cmp_fn)(void *key1, void *key2); -typedef unsigned int 
(*hash_table_hash_fn)(void *key); -typedef void (*hash_table_dtor)(void *value, int is_key); +typedef int (*hash_table_cmp_fn)(const void *key1, const void *key2); +typedef unsigned int (*hash_table_hash_fn)(const void *key); +typedef void (*hash_table_dtor)(void *data, int is_key); /* Hash Table Creation and Destruction */ hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash, hash_table_dtor dtor); void hash_table_destroy(hash_table_t *table); /* Hash Table Access */ -void *hash_table_get(hash_table_t *table, void *key); +void *hash_table_get(const hash_table_t *table, const void *key); void hash_table_put(hash_table_t *table, void *key, void *value); -void hash_table_remove(hash_table_t *table, void *key); +void hash_table_remove(hash_table_t *table, const void *key); + -#endif +#endif /* HASH_TABLE_H */ diff --git a/projects/cminus/code/lexer.lit b/projects/cminus/code/lexer.lit @@ -1,6 +1,6 @@ @code_type c .c @comment_type /* %s */ -@compiler lit -t lexer.lit && gcc -Wall -Wextra -Wstrict-aliasing=3 -Wwrite-strings -Wvla -Wcast-align=strict -Wstrict-prototypes -Wstringop-overflow=4 -Wshadow -fanalyzer tokenizer.c input.c hash_table.c token.c -D TEST_TOKENIZER -g -O0 && rm a.out +@compiler lit -t lexer.lit && gcc -Wall -Wextra -Wstrict-aliasing=3 -Wwrite-strings -Wvla -Wcast-align=strict -Wstrict-prototypes -Wstringop-overflow=4 -Wshadow -fanalyzer tokenizer.c input.c hash_table.c token.c -D TEST_TOKENIZER -g -O0 -o lextest && valgrind --leak-check=full ./lextest tokenizer.c && rm lextest @title Lexer @add_css ./style.css @@ -66,21 +66,27 @@ THIS IS A LITERATE PROGRAM! Go to [this link](https://reagancfischer.dev/project @s The Lexer -A lexical analyzer reads source code and produces tokens, which are the smallest unit of meaning in a language. 
For example, in the C programming language, the tokens are things like keywords (if, else, while, etc.), identifiers (variable names), numbers, and punctuation (braces, semicolons, etc.). +A lexical analyzer reads source code and produces tokens, which are the smallest units of meaning in a language. These tokens are then used by the parser to build an abstract syntax tree (AST), which represents the structure of the program. + +For example, in the C programming language, tokens include keywords (if, else, while, etc.), identifiers (variable names), numbers, and punctuation (braces, semicolons, etc.). Given a string like `int main() { return 0; }`, the lexer would produce a series of tokens like `INT`, `IDENTIFIER(main)`, `LPAREN`, `RPAREN`, `LBRACE`, `RETURN`, `INTCONSTANT(0)`, `SEMICOLON`, `RBRACE`. @s Design -I'll break the lexer up into a couple of modules. `token.c` will contain the token data structure and functions to create and destroy tokens. `input.c` will contain the input data structure and functions to read from the input file. `tokenizer.c` will contain the main lexer logic. +I'll break the lexer into several modules: +- `token.c` will contain the token data structure and functions to create and destroy tokens. +- `input.c` will contain the input data structure and functions to read from the input file. +- `tokenizer.c` will contain the main lexer logic. @s Token Interface -Tokens are the smallest unit of meaning in a language. They're used by the parser to build an abstract syntax tree (AST). We'll need a couple of things to represent a token: -* The type of token. This will be an enum, with values like `TOK_CTK_IF` or `TOK_CONST_INTEGER_U32`. -* The value of the token. Some tokens, like keywords, don't have a value. Others, like identifiers or constants, do. -* The line and column of the token. This is used for error messages. -As I mentioned earlier, we're trying to implement a sort of class system in C. 
For that, we'll need to hide the token implementation details behind an opaque pointer. We could just have a `void` pointer, but that stops us from being able to use compile-time type checking. Instead, we'll use a forward declaration of the token type in the header file, and then define the token type in the implementation file. +We'll need several components to represent a token: +- The type of token, which will be an enum with values like `TOK_IF` or `TOK_INTEGER_U32`. +- The value of the token. Some tokens, like keywords, don't have a value. Others, like identifiers or constants, do. +- The line and column of the token, used for error messages. + +To implement a class system in C, we'll hide the token implementation details behind an opaque pointer. We'll use a forward declaration of the token type in the header file and define the token type in the implementation file. @s --- Opaque Token Type @@ -88,155 +94,142 @@ typedef struct token token_t; --- @s -We'll need a couple of functions to create and destroy tokens. +We'll need functions to create and destroy tokens. --- Token Creation and Destruction Interface -token_t *token_data_create(c_token_types kind, int lin, int col, int len); - token_t *token_create(c_token_types kind, int lin, int col, int len); - token_t *token_create_int(c_token_types kind, int lin, int col, int64_t i, int len); - token_t *token_create_float(c_token_types kind, int lin, int col, double f, int len); - token_t *token_create_char(c_token_types kind, int lin, int col, char c, int len); - token_t *token_create_string(c_token_types kind, int lin, int col, const char *s, int len); - void token_destroy(token_t *token); --- + @s -We'll also need some functions to access the token data. +We'll also need functions to access the token data. 
--- Token Interface c_token_types token_type(token_t *token); - int64_t token_int(token_t *token); - double token_float(token_t *token); - const char *token_string(token_t *token); - char token_char(token_t *token); - int token_line(token_t *token); - int token_column(token_t *token); - void print_token(token_t *tok); --- @s -We'll need some types to represent the different kinds of tokens. +We'll need types to represent the different kinds of tokens. --- Token Types typedef enum { // Control Keywords - TOK_CTK_IF, - TOK_CTK_ELSE, - TOK_CTK_SWITCH, - TOK_CTK_CASE, - TOK_CTK_DEFAULT, - TOK_CTK_WHILE, - TOK_CTK_DO, - TOK_CTK_FOR, - TOK_CTK_CONTINUE, - TOK_CTK_BREAK, - TOK_CTK_RETURN, - TOK_CTK_GOTO, + TOK_IF, + TOK_ELSE, + TOK_SWITCH, + TOK_CASE, + TOK_DEFAULT, + TOK_WHILE, + TOK_DO, + TOK_FOR, + TOK_CONTINUE, + TOK_BREAK, + TOK_RETURN, + TOK_GOTO, // Type Keywords - TOK_TK_VOID, - TOK_TK_CHAR, - TOK_TK_SHORT, - TOK_TK_INT, - TOK_TK_LONG, - TOK_TK_FLOAT, - TOK_TK_DOUBLE, - TOK_TK_SIGNED, - TOK_TK_UNSIGNED, - TOK_TK_STRUCT, - TOK_TK_UNION, - TOK_TK_ENUM, - TOK_TK_TYPEDEF, + TOK_VOID, + TOK_CHAR, + TOK_SHORT, + TOK_INT, + TOK_LONG, + TOK_FLOAT, + TOK_DOUBLE, + TOK_SIGNED, + TOK_UNSIGNED, + TOK_STRUCT, + TOK_UNION, + TOK_ENUM, + TOK_TYPEDEF, // Storage Class/Specifier Keywords - TOK_SCSK_AUTO, - TOK_SCSK_REGISTER, - TOK_SCSK_STATIC, - TOK_SCSK_EXTERN, - TOK_SCSK_CONST, - TOK_SCSK_VOLATILE, + TOK_AUTO, + TOK_REGISTER, + TOK_STATIC, + TOK_EXTERN, + TOK_CONST, + TOK_VOLATILE, // Misc Keywords - TOK_MK_SIZEOF, + TOK_SIZEOF, // Operators - TOK_OP_ADD, // + - TOK_OP_SUB, // - - TOK_OP_MUL, // * - TOK_OP_DIV, // / - TOK_OP_MOD, // % - TOK_OP_BIT_AND, // & - TOK_OP_BIT_OR, // | - TOK_OP_BIT_XOR, // ^ - TOK_OP_BIT_NOT, // ~ - TOK_OP_LSHIFT, // << - TOK_OP_RSHIFT, // >> - TOK_OP_NOT, // ! 
- TOK_OP_ASSIGN, // = - TOK_OP_LT, // < - TOK_OP_GT, // > - TOK_OP_INC, // ++ - TOK_OP_DEC, // -- - TOK_OP_EQ, // == - TOK_OP_NE, // != - TOK_OP_LE, // <= - TOK_OP_GE, // >= - TOK_OP_AND, // && - TOK_OP_OR, // || - TOK_OP_MEMBER_POINTER, // -> - TOK_OP_MEMBER, // . - TOK_OP_COND_DECISION, // : - TOK_OP_COND, // ? - TOK_OP_ASSIGN_ADD, // += - TOK_OP_ASSIGN_SUB, // -= - TOK_OP_ASSIGN_MUL, // *= - TOK_OP_ASSIGN_DIV, // /= - TOK_OP_ASSIGN_MOD, // %= - TOK_OP_ASSIGN_BITAND, // &= - TOK_OP_ASSIGN_BITOR, // |= - TOK_OP_ASSIGN_BITXOR, // ^= - TOK_OP_ASSIGN_LSHIFT, // <<= - TOK_OP_ASSIGN_RSHIFT, // >>= + TOK_ADD, // + + TOK_SUB, // - + TOK_MUL, // * + TOK_DIV, // / + TOK_MOD, // % + TOK_BIT_AND, // & + TOK_BIT_OR, // | + TOK_BIT_XOR, // ^ + TOK_BIT_NOT, // ~ + TOK_LSHIFT, // << + TOK_RSHIFT, // >> + TOK_NOT, // ! + TOK_ASSIGN, // = + TOK_LT, // < + TOK_GT, // > + TOK_INC, // ++ + TOK_DEC, // -- + TOK_EQ, // == + TOK_NE, // != + TOK_LE, // <= + TOK_GE, // >= + TOK_AND, // && + TOK_OR, // || + TOK_MEMBER_POINTER, // -> + TOK_MEMBER, // . + TOK_COND_DECISION, // : + TOK_COND, // ? + TOK_ASSIGN_ADD, // += + TOK_ASSIGN_SUB, // -= + TOK_ASSIGN_MUL, // *= + TOK_ASSIGN_DIV, // /= + TOK_ASSIGN_MOD, // %= + TOK_ASSIGN_BITAND, // &= + TOK_ASSIGN_BITOR, // |= + TOK_ASSIGN_BITXOR, // ^= + TOK_ASSIGN_LSHIFT, // <<= + TOK_ASSIGN_RSHIFT, // >>= // Separators - TOK_SEP_LEFT_PAREN, // ( - TOK_SEP_RIGHT_PAREN, // ) - TOK_SEP_LEFT_BRACKET, // [ - TOK_SEP_RIGHT_BRACKET, // ] - TOK_SEP_LEFT_BRACE, // { - TOK_SEP_RIGHT_BRACE, // } - TOK_SEP_COMMA, // , - TOK_SEP_SEMICOLON, // ; - TOK_SEP_DOT, // . - TOK_SEP_ELLIPSIS, // ... - TOK_SEP_HASH, // # + TOK_LEFT_PAREN, // ( + TOK_RIGHT_PAREN, // ) + TOK_LEFT_BRACKET, // [ + TOK_RIGHT_BRACKET, // ] + TOK_LEFT_BRACE, // { + TOK_RIGHT_BRACE, // } + TOK_COMMA, // , + TOK_SEMICOLON, // ; + TOK_DOT, // . + TOK_ELLIPSIS, // ... 
+ TOK_HASH, // # // Identifiers TOK_ID, // Constants - TOK_CONST_INTEGER_U32, // u - TOK_CONST_INTEGER_U64, // ul - TOK_CONST_INTEGER_S32, // (no suffix) - TOK_CONST_INTEGER_S64, // l - TOK_CONST_FLOAT_32, // f - TOK_CONST_FLOAT_64, // (no suffix) - TOK_CONST_CHAR, // 'c' - TOK_CONST_STRING_ASCII, // "string" (width of 8 bits) + TOK_INTEGER_U32, // u + TOK_INTEGER_U64, // ul + TOK_INTEGER_S32, // (no suffix) + TOK_INTEGER_S64, // l + TOK_FLOAT_32, // f + TOK_FLOAT_64, // (no suffix) + TOK_CHAR_CONST, // 'c' + TOK_STRING_ASCII, // "string" (width of 8 bits) // Special - TOK_SPECIAL_EOF, - TOK_SPECIAL_ERROR, + TOK_EOF, + TOK_ERROR, } c_token_types; --- @@ -245,7 +238,7 @@ We bring this all together in `token.h`. Line and column are exposed as global v --- token.h #ifndef TOKEN_H #define TOKEN_H -#include <stdint.h> // We use this for int64_t +#include <stdint.h> // For int64_t @{Token Types} @{Opaque Token Type} @{Token Creation and Destruction} @@ -256,15 +249,15 @@ extern int line; --- @s Token Implementation -Now that we have the interface, we can implement the token data structure. We'll need a couple of things: -* The token type. -* A way to store extra data. -* Implementations of the functions we defined in the interface. -@s -One problem is we haven't defined a way to verify that the token we're getting isn't corrupt. We'll use a tag for that. +Now that we have the interface, we can implement the token data structure. We'll need: +- The token type. +- A way to store extra data. +- Implementations of the functions defined in the interface. + +To verify the token isn't corrupt, we'll use a tag. `TOK_MAGIC_1` represents a token with optional data, and `TOK_MAGIC_2` represents a token without optional data. -You might notice that a zero-length array is used in the token data structure. This is a GCC extension that allows us to allocate memory for the token data structure and the token data in one allocation. 
This is a bit of a hack, but it's a common pattern in C code. +A zero-length array is used in the token data structure. This GCC extension allows us to allocate memory for the token data structure and the token data in one allocation. --- Token Data Structure #define TOK_MAGIC_1 0x544F4B454E544F4Bul // "TOKENTOK" #define TOK_MAGIC_2 0x544F4B544F4B454Eul // "TOKTOKEN" @@ -295,7 +288,7 @@ int line = 1; @s -We'll need to implement an interface for accessing the token data and a macro for accessing optional data. +We'll implement an interface for accessing the token data and a macro for accessing optional data. --- Token Data Access #define token_data(token) ((struct token_data *)((token)->opt_data)) @@ -305,29 +298,25 @@ c_token_types token_type(token_t *token) { } int64_t token_int(token_t *token) { - assert(token->kind == TOK_CONST_INTEGER_U32 || - token->kind == TOK_CONST_INTEGER_U64 || - token->kind == TOK_CONST_INTEGER_S32 || - token->kind == TOK_CONST_INTEGER_S64); + assert(token->kind == TOK_INTEGER_U32 || token->kind == TOK_INTEGER_U64 || token->kind == TOK_INTEGER_S32 || token->kind == TOK_INTEGER_S64); assert(token->magic == TOK_MAGIC_1); return token_data(token)->data.i; } double token_float(token_t *token) { - assert(token->kind == TOK_CONST_FLOAT_32 || - token->kind == TOK_CONST_FLOAT_64); + assert(token->kind == TOK_FLOAT_32 || token->kind == TOK_FLOAT_64); assert(token->magic == TOK_MAGIC_1); return token_data(token)->data.f; } const char *token_string(token_t *token) { - assert(token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID); + assert(token->kind == TOK_STRING_ASCII || token->kind == TOK_ID); assert(token->magic == TOK_MAGIC_1); return token_data(token)->data.s; } char token_char(token_t *token) { - assert(token->kind == TOK_CONST_CHAR); + assert(token->kind == TOK_CHAR_CONST); assert(token->magic == TOK_MAGIC_1); return token_data(token)->data.c; } @@ -344,280 +333,275 @@ int token_column(token_t *token) { --- @s -For debugging, I'll 
add a function to print the token type. +For debugging, we'll add a function to print the token type. --- Token Debugging +@{Token Type Enum to String} +@{Unescape String} +@{Print Token} +--- + +@s +This function returns a string with the token type name. +--- Token Type Enum to String const char *token_name_from_type(c_token_types type) { switch (type) { - case TOK_CTK_IF: - return "TOK_CTK_IF"; - case TOK_CTK_ELSE: - return "TOK_CTK_ELSE"; - case TOK_CTK_SWITCH: - return "TOK_CTK_SWITCH"; - case TOK_CTK_CASE: - return "TOK_CTK_CASE"; - case TOK_CTK_DEFAULT: - return "TOK_CTK_DEFAULT"; - case TOK_CTK_WHILE: - return "TOK_CTK_WHILE"; - case TOK_CTK_DO: - return "TOK_CTK_DO"; - case TOK_CTK_FOR: - return "TOK_CTK_FOR"; - case TOK_CTK_CONTINUE: - return "TOK_CTK_CONTINUE"; - case TOK_CTK_BREAK: - return "TOK_CTK_BREAK"; - case TOK_CTK_RETURN: - return "TOK_CTK_RETURN"; - case TOK_CTK_GOTO: - return "TOK_CTK_GOTO"; - case TOK_TK_VOID: - return "TOK_TK_VOID"; - case TOK_TK_CHAR: - return "TOK_TK_CHAR"; - case TOK_TK_SHORT: - return "TOK_TK_SHORT"; - case TOK_TK_INT: - return "TOK_TK_INT"; - case TOK_TK_LONG: - return "TOK_TK_LONG"; - case TOK_TK_FLOAT: - return "TOK_TK_FLOAT"; - case TOK_TK_DOUBLE: - return "TOK_TK_DOUBLE"; - case TOK_TK_SIGNED: - return "TOK_TK_SIGNED"; - case TOK_TK_UNSIGNED: - return "TOK_TK_UNSIGNED"; - case TOK_TK_STRUCT: - return "TOK_TK_STRUCT"; - case TOK_TK_UNION: - return "TOK_TK_UNION"; - case TOK_TK_ENUM: - return "TOK_TK_ENUM"; - case TOK_TK_TYPEDEF: - return "TOK_TK_TYPEDEF"; - case TOK_SCSK_AUTO: - return "TOK_SCSK_AUTO"; - case TOK_SCSK_REGISTER: - return "TOK_SCSK_REGISTER"; - case TOK_SCSK_STATIC: - return "TOK_SCSK_STATIC"; - case TOK_SCSK_EXTERN: - return "TOK_SCSK_EXTERN"; - case TOK_SCSK_CONST: - return "TOK_SCSK_CONST"; - case TOK_SCSK_VOLATILE: - return "TOK_SCSK_VOLATILE"; - case TOK_MK_SIZEOF: - return "TOK_MK_SIZEOF"; - case TOK_OP_ADD: - return "TOK_OP_ADD"; - case TOK_OP_SUB: - return "TOK_OP_SUB"; - case TOK_OP_MUL: - 
return "TOK_OP_MUL"; - case TOK_OP_DIV: - return "TOK_OP_DIV"; - case TOK_OP_MOD: - return "TOK_OP_MOD"; - case TOK_OP_BIT_AND: - return "TOK_OP_BIT_AND"; - case TOK_OP_BIT_OR: - return "TOK_OP_BIT_OR"; - case TOK_OP_BIT_XOR: - return "TOK_OP_BIT_XOR"; - case TOK_OP_BIT_NOT: - return "TOK_OP_BIT_NOT"; - case TOK_OP_LSHIFT: - return "TOK_OP_LSHIFT"; - case TOK_OP_RSHIFT: - return "TOK_OP_RSHIFT"; - case TOK_OP_NOT: - return "TOK_OP_NOT"; - case TOK_OP_ASSIGN: - return "TOK_OP_ASSIGN"; - case TOK_OP_LT: - return "TOK_OP_LT"; - case TOK_OP_GT: - return "TOK_OP_GT"; - case TOK_OP_INC: - return "TOK_OP_INC"; - case TOK_OP_DEC: - return "TOK_OP_DEC"; - case TOK_OP_EQ: - return "TOK_OP_EQ"; - case TOK_OP_NE: - return "TOK_OP_NE"; - case TOK_OP_LE: - return "TOK_OP_LE"; - case TOK_OP_GE: - return "TOK_OP_GE"; - case TOK_OP_AND: - return "TOK_OP_AND"; - case TOK_OP_OR: - return "TOK_OP_OR"; - case TOK_OP_MEMBER_POINTER: - return "TOK_OP_MEMBER_POINTER"; - case TOK_OP_MEMBER: - return "TOK_OP_MEMBER"; - case TOK_OP_COND_DECISION: - return "TOK_OP_COND_DECISION"; - case TOK_OP_COND: - return "TOK_OP_COND"; - case TOK_OP_ASSIGN_ADD: - return "TOK_OP_ASSIGN_ADD"; - case TOK_OP_ASSIGN_SUB: - return "TOK_OP_ASSIGN_SUB"; - case TOK_OP_ASSIGN_MUL: - return "TOK_OP_ASSIGN_MUL"; - case TOK_OP_ASSIGN_DIV: - return "TOK_OP_ASSIGN_DIV"; - case TOK_OP_ASSIGN_MOD: - return "TOK_OP_ASSIGN_MOD"; - case TOK_OP_ASSIGN_BITAND: - return "TOK_OP_ASSIGN_BITAND"; - case TOK_OP_ASSIGN_BITOR: - return "TOK_OP_ASSIGN_BITOR"; - case TOK_OP_ASSIGN_BITXOR: - return "TOK_OP_ASSIGN_BITXOR"; - case TOK_OP_ASSIGN_LSHIFT: - return "TOK_OP_ASSIGN_LSHIFT"; - case TOK_OP_ASSIGN_RSHIFT: - return "TOK_OP_ASSIGN_RSHIFT"; - case TOK_SEP_HASH: - return "TOK_SEP_HASH"; + case TOK_IF: + return "TOK_IF"; + case TOK_ELSE: + return "TOK_ELSE"; + case TOK_SWITCH: + return "TOK_SWITCH"; + case TOK_CASE: + return "TOK_CASE"; + case TOK_DEFAULT: + return "TOK_DEFAULT"; + case TOK_WHILE: + return "TOK_WHILE"; + case TOK_DO: + 
return "TOK_DO"; + case TOK_FOR: + return "TOK_FOR"; + case TOK_CONTINUE: + return "TOK_CONTINUE"; + case TOK_BREAK: + return "TOK_BREAK"; + case TOK_RETURN: + return "TOK_RETURN"; + case TOK_GOTO: + return "TOK_GOTO"; + case TOK_VOID: + return "TOK_VOID"; + case TOK_CHAR: + return "TOK_CHAR"; + case TOK_SHORT: + return "TOK_SHORT"; + case TOK_INT: + return "TOK_INT"; + case TOK_LONG: + return "TOK_LONG"; + case TOK_FLOAT: + return "TOK_FLOAT"; + case TOK_DOUBLE: + return "TOK_DOUBLE"; + case TOK_SIGNED: + return "TOK_SIGNED"; + case TOK_UNSIGNED: + return "TOK_UNSIGNED"; + case TOK_STRUCT: + return "TOK_STRUCT"; + case TOK_UNION: + return "TOK_UNION"; + case TOK_ENUM: + return "TOK_ENUM"; + case TOK_TYPEDEF: + return "TOK_TYPEDEF"; + case TOK_AUTO: + return "TOK_AUTO"; + case TOK_REGISTER: + return "TOK_REGISTER"; + case TOK_STATIC: + return "TOK_STATIC"; + case TOK_EXTERN: + return "TOK_EXTERN"; + case TOK_CONST: + return "TOK_CONST"; + case TOK_VOLATILE: + return "TOK_VOLATILE"; + case TOK_SIZEOF: + return "TOK_SIZEOF"; + case TOK_ADD: + return "TOK_ADD"; + case TOK_SUB: + return "TOK_SUB"; + case TOK_MUL: + return "TOK_MUL"; + case TOK_DIV: + return "TOK_DIV"; + case TOK_MOD: + return "TOK_MOD"; + case TOK_BIT_AND: + return "TOK_BIT_AND"; + case TOK_BIT_OR: + return "TOK_BIT_OR"; + case TOK_BIT_XOR: + return "TOK_BIT_XOR"; + case TOK_BIT_NOT: + return "TOK_BIT_NOT"; + case TOK_LSHIFT: + return "TOK_LSHIFT"; + case TOK_RSHIFT: + return "TOK_RSHIFT"; + case TOK_NOT: + return "TOK_NOT"; + case TOK_ASSIGN: + return "TOK_ASSIGN"; + case TOK_LT: + return "TOK_LT"; + case TOK_GT: + return "TOK_GT"; + case TOK_INC: + return "TOK_INC"; + case TOK_DEC: + return "TOK_DEC"; + case TOK_EQ: + return "TOK_EQ"; + case TOK_NE: + return "TOK_NE"; + case TOK_LE: + return "TOK_LE"; + case TOK_GE: + return "TOK_GE"; + case TOK_AND: + return "TOK_AND"; + case TOK_OR: + return "TOK_OR"; + case TOK_MEMBER_POINTER: + return "TOK_MEMBER_POINTER"; + case TOK_MEMBER: + return 
"TOK_MEMBER"; + case TOK_COND_DECISION: + return "TOK_COND_DECISION"; + case TOK_COND: + return "TOK_COND"; + case TOK_ASSIGN_ADD: + return "TOK_ASSIGN_ADD"; + case TOK_ASSIGN_SUB: + return "TOK_ASSIGN_SUB"; + case TOK_ASSIGN_MUL: + return "TOK_ASSIGN_MUL"; + case TOK_ASSIGN_DIV: + return "TOK_ASSIGN_DIV"; + case TOK_ASSIGN_MOD: + return "TOK_ASSIGN_MOD"; + case TOK_ASSIGN_BITAND: + return "TOK_ASSIGN_BITAND"; + case TOK_ASSIGN_BITOR: + return "TOK_ASSIGN_BITOR"; + case TOK_ASSIGN_BITXOR: + return "TOK_ASSIGN_BITXOR"; + case TOK_ASSIGN_LSHIFT: + return "TOK_ASSIGN_LSHIFT"; + case TOK_ASSIGN_RSHIFT: + return "TOK_ASSIGN_RSHIFT"; + case TOK_HASH: + return "TOK_HASH"; case TOK_ID: return "TOK_ID"; - case TOK_CONST_INTEGER_U32: - return "TOK_CONST_INTEGER_U32"; - case TOK_CONST_INTEGER_U64: - return "TOK_CONST_INTEGER_U64"; - case TOK_CONST_INTEGER_S32: - return "TOK_CONST_INTEGER_S32"; - case TOK_CONST_INTEGER_S64: - return "TOK_CONST_INTEGER_S64"; - case TOK_CONST_FLOAT_32: - return "TOK_CONST_FLOAT_32"; - case TOK_CONST_FLOAT_64: - return "TOK_CONST_FLOAT_64"; - case TOK_CONST_CHAR: - return "TOK_CONST_CHAR"; - case TOK_CONST_STRING_ASCII: - return "TOK_CONST_STRING_ASCII"; - case TOK_SPECIAL_EOF: - return "TOK_SPECIAL_EOF"; - case TOK_SPECIAL_ERROR: - return "TOK_SPECIAL_ERROR"; - case TOK_SEP_LEFT_PAREN: - return "TOK_SEP_LEFT_PAREN"; - case TOK_SEP_RIGHT_PAREN: - return "TOK_SEP_RIGHT_PAREN"; - case TOK_SEP_LEFT_BRACKET: - return "TOK_SEP_LEFT_BRACKET"; - case TOK_SEP_RIGHT_BRACKET: - return "TOK_SEP_RIGHT_BRACKET"; - case TOK_SEP_LEFT_BRACE: - return "TOK_SEP_LEFT_BRACE"; - case TOK_SEP_RIGHT_BRACE: - return "TOK_SEP_RIGHT_BRACE"; - case TOK_SEP_COMMA: - return "TOK_SEP_COMMA"; - case TOK_SEP_SEMICOLON: - return "TOK_SEP_SEMICOLON"; - case TOK_SEP_DOT: - return "TOK_SEP_DOT"; - case TOK_SEP_ELLIPSIS: - return "TOK_SEP_ELLIPSIS"; + case TOK_INTEGER_U32: + return "TOK_INTEGER_U32"; + case TOK_INTEGER_U64: + return "TOK_INTEGER_U64"; + case TOK_INTEGER_S32: + 
return "TOK_INTEGER_S32"; + case TOK_INTEGER_S64: + return "TOK_INTEGER_S64"; + case TOK_FLOAT_32: + return "TOK_FLOAT_32"; + case TOK_FLOAT_64: + return "TOK_FLOAT_64"; + case TOK_CHAR_CONST: + return "TOK_CHAR_CONST"; + case TOK_STRING_ASCII: + return "TOK_STRING_ASCII"; + case TOK_EOF: + return "TOK_EOF"; + case TOK_ERROR: + return "TOK_ERROR"; + case TOK_LEFT_PAREN: + return "TOK_LEFT_PAREN"; + case TOK_RIGHT_PAREN: + return "TOK_RIGHT_PAREN"; + case TOK_LEFT_BRACKET: + return "TOK_LEFT_BRACKET"; + case TOK_RIGHT_BRACKET: + return "TOK_RIGHT_BRACKET"; + case TOK_LEFT_BRACE: + return "TOK_LEFT_BRACE"; + case TOK_RIGHT_BRACE: + return "TOK_RIGHT_BRACE"; + case TOK_COMMA: + return "TOK_COMMA"; + case TOK_SEMICOLON: + return "TOK_SEMICOLON"; + case TOK_DOT: + return "TOK_DOT"; + case TOK_ELLIPSIS: + return "TOK_ELLIPSIS"; } return "UNKNOWN"; } +--- +@s +This function adds escape characters to a string for printing. +--- Unescape String +#define clamp(x, min, max) ((x) < (min) ? (min) : (x) > (max) ? (max) : (x)) char *re_escape_string(const char *str) { int len = strlen(str); char *buf = malloc(len * 2 + 1); - if (buf == NULL) { + if (!buf) { fprintf(stderr, "Out of memory. 
Cannot escape string\n"); exit(1); } int i = 0; for (int j = 0; j < len; j++) { switch (str[j]) { - case '\a': - buf[i++] = '\\'; - buf[i++] = 'a'; - break; - case '\b': - buf[i++] = '\\'; - buf[i++] = 'b'; - break; - case '\f': - buf[i++] = '\\'; - buf[i++] = 'f'; - break; - case '\n': - buf[i++] = '\\'; - buf[i++] = 'n'; - break; - case '\r': - buf[i++] = '\\'; - buf[i++] = 'r'; - break; - case '\t': - buf[i++] = '\\'; - buf[i++] = 't'; - break; - case '\v': - buf[i++] = '\\'; - buf[i++] = 'v'; - break; - case '\\': - buf[i++] = '\\'; - buf[i++] = '\\'; - break; - case '\'': - buf[i++] = '\\'; - buf[i++] = '\''; - break; - case '"': - buf[i++] = '\\'; - buf[i++] = '"'; - break; - default: - buf[i++] = str[j]; - break; + case '\a': buf[i++] = '\\'; buf[i++] = 'a'; break; + case '\b': buf[i++] = '\\'; buf[i++] = 'b'; break; + case '\f': buf[i++] = '\\'; buf[i++] = 'f'; break; + case '\n': buf[i++] = '\\'; buf[i++] = 'n'; break; + case '\r': buf[i++] = '\\'; buf[i++] = 'r'; break; + case '\t': buf[i++] = '\\'; buf[i++] = 't'; break; + case '\v': buf[i++] = '\\'; buf[i++] = 'v'; break; + case '\\': buf[i++] = '\\'; buf[i++] = '\\'; break; + case '\'': buf[i++] = '\\'; buf[i++] = '\''; break; + case '"': buf[i++] = '\\'; buf[i++] = '"'; break; + default: { + if (isprint(str[j])) { + buf[i++] = str[j]; + } else { + buf[i++] = '\\'; + buf[i++] = 'x'; + buf[i++] = "0123456789abcdef"[clamp(str[j] >> 4, 0, 0xf)]; + buf[i++] = "0123456789abcdef"[clamp(str[j] & 0xf, 0, 0xf)]; + } + } } } buf[i] = '\0'; return buf; } +--- + +@s +This function prints the token type and value. 
+--- Print Token void print_token(token_t *tok) { - if (tok == NULL) { + if (!tok) { printf("NULL\n"); return; } const char *name = token_name_from_type(tok->kind); switch (tok->kind) { case TOK_ID: - case TOK_CONST_STRING_ASCII: { + case TOK_STRING_ASCII: { char *escaped = re_escape_string(token_string(tok)); printf("%s: \"%s\"@%d:%d\n", name, escaped, tok->line, tok->column); free(escaped); break; } - case TOK_CONST_CHAR: + case TOK_CHAR_CONST: printf("%s: '%c'@%d:%d\n", name, token_char(tok), tok->line, tok->column); break; - case TOK_CONST_INTEGER_S32: - case TOK_CONST_INTEGER_U32: - case TOK_CONST_INTEGER_S64: - case TOK_CONST_INTEGER_U64: + case TOK_INTEGER_S32: + case TOK_INTEGER_U32: + case TOK_INTEGER_S64: + case TOK_INTEGER_U64: printf("%s: %ld@%d:%d\n", name, token_int(tok), tok->line, tok->column); break; - case TOK_CONST_FLOAT_32: - case TOK_CONST_FLOAT_64: + case TOK_FLOAT_32: + case TOK_FLOAT_64: printf("%s: %f@%d:%d\n", name, token_float(tok), tok->line, tok->column); break; default: @@ -628,11 +612,11 @@ void print_token(token_t *tok) { --- @s -Now we can implement the functions we defined in the interface. +Now we can implement functions to create and destroy tokens. We'll start with the easy ones. 
--- Token Creation and Destruction token_t *token_data_create(c_token_types kind, int lin, int col, int len) { token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data)); - if (token == NULL) { + if (!token) { fputs("Out of memory\n", stderr); exit(1); } @@ -646,7 +630,7 @@ token_t *token_data_create(c_token_types kind, int lin, int col, int len) { token_t *token_create(c_token_types kind, int lin, int col, int len) { token_t *token = malloc(sizeof(token_t)); - if (token == NULL) { + if (!token) { fputs("Out of memory\n", stderr); exit(1); } @@ -678,9 +662,6 @@ token_t *token_create_char(c_token_types kind, int lin, int col, char c, int len void token_destroy(token_t *token) { if (token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2) { - if (token->kind == TOK_CONST_STRING_ASCII) { - free((char *)token_data(token)->data.s); - } free(token); } else { fputs("Corrupt token\n", stderr); @@ -705,20 +686,21 @@ There's an issue with this approach. `token_create_string` will be called for ev To fix this, we use a hash table to store the strings. We'll define a hash table in `hash_table.h` and `hash_table.c`. @s Hash Table -A hash table is a data structure that maps keys to values. It's commonly used to store information, such as variables and functions in a symbol table. To implement a generic hash table, we'll need several things: -* A function to hash the keys. -* A function to compare keys. -* An opaque type for the hash table. -* A function to destroy deleted keys and values. +A hash table is a data structure that maps keys to values. It's commonly used for implementing symbol tables to store variables and functions. To create a generic hash table, we'll need: -Let's start with the interface. +1. A hash function to convert keys into array indices +2. A comparison function to check if two keys are equal +3. An opaque type to represent the hash table +4. 
A destructor function to clean up keys and values when removing entries + +Let's start with the interface: @s --- Hash Table Opaque Types typedef struct hash_table hash_table_t; -typedef int (*hash_table_cmp_fn)(void *key1, void *key2); -typedef unsigned int (*hash_table_hash_fn)(void *key); -typedef void (*hash_table_dtor)(void *value, int is_key); +typedef int (*hash_table_cmp_fn)(const void *key1, const void *key2); +typedef unsigned int (*hash_table_hash_fn)(const void *key); +typedef void (*hash_table_dtor)(void *data, int is_key); --- @s @@ -729,23 +711,25 @@ void hash_table_destroy(hash_table_t *table); @s --- Hash Table Access -void *hash_table_get(hash_table_t *table, void *key); +void *hash_table_get(const hash_table_t *table, const void *key); void hash_table_put(hash_table_t *table, void *key, void *value); -void hash_table_remove(hash_table_t *table, void *key); +void hash_table_remove(hash_table_t *table, const void *key); --- @s --- hash_table.h #ifndef HASH_TABLE_H #define HASH_TABLE_H + @{Hash Table Opaque Types} @{Hash Table Creation and Destruction} @{Hash Table Access} -#endif + +#endif /* HASH_TABLE_H */ --- @s -Let's implement the hash table now. +Now let's implement the hash table: --- hash_table.c #include <stdlib.h> @@ -757,102 +741,114 @@ Let's implement the hash table now. 
@{Hash Table Entry Data Structure} hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash, hash_table_dtor dtor) { - @{Allocate and Initialize Hash Table} - return table; + @{Allocate and Initialize Hash Table} + return table; } void hash_table_destroy(hash_table_t *table) { - @{Destroy Entries} - free(table->entries); - free(table); + if (!table) return; + @{Destroy Entries} + free(table->entries); + free(table); } -void *hash_table_get(hash_table_t *table, void *key) { - @{Get Entry By Hash} - @{Loop Through Entries and Return Value if Match} - return NULL; +void *hash_table_get(const hash_table_t *table, const void *key) { + if (!table || !key) return NULL; + @{Get Entry By Hash} + @{Loop Through Entries and Return Value if Match} + return NULL; } void hash_table_put(hash_table_t *table, void *key, void *value) { - @{Get Entry By Hash} - @{Loop Through Entries and Replace Value if Key Matches} - @{Allocate New Entry if No Match} + if (!table || !key) return; + @{Get Entry By Hash} + @{Loop Through Entries and Replace Value if Key Matches} + @{Allocate New Entry if No Match} } -void hash_table_remove(hash_table_t *table, void *key) { - @{Get Entry By Hash} - @{Loop Through Entries and Remove Entry if Key Matches} +void hash_table_remove(hash_table_t *table, const void *key) { + if (!table || !key) return; + @{Get Entry By Hash} + @{Loop Through Entries and Remove Entry if Key Matches} } #ifdef TEST_HASH_TABLE #include <assert.h> -#include <stdio.h> -#include <string.h> -int string_cmp(void *key1, void *key2) { - return strcmp((char *)key1, (char *)key2); +static int string_cmp(const void *key1, const void *key2) { + return strcmp((const char *)key1, (const char *)key2); } -unsigned long string_hash(void *key) { - unsigned long hash = 5381; - char *str = (char *)key; - while (*str != '\0') { - hash = ((hash << 5) + hash) + *str; - str++; - } - return hash; +static unsigned int string_hash(const void *key) { + unsigned int hash = 
5381; + const unsigned char *str = key; + int c; + + while ((c = *str++)) + { + hash = ((hash << 5) + hash) + c; + } + + return hash; } -int main() { - hash_table_t *table = hash_table_create(16, string_cmp, string_hash, NULL); - hash_table_put(table, "foo", "bar"); - hash_table_put(table, "foo", "baz"); - assert(strcmp((char *)hash_table_get(table, "foo"), "baz") == 0); - hash_table_remove(table, "foo"); - assert(hash_table_get(table, "foo") == NULL); - hash_table_destroy(table); - return 0; +int main(void) { + hash_table_t *table = hash_table_create(16, string_cmp, string_hash, NULL); + assert(table != NULL); + + hash_table_put(table, "foo", "bar"); + hash_table_put(table, "foo", "baz"); + assert(strcmp((const char *)hash_table_get(table, "foo"), "baz") == 0); + + hash_table_remove(table, "foo"); + assert(hash_table_get(table, "foo") == NULL); + + hash_table_destroy(table); + printf("All tests passed!\n"); + return 0; } #endif --- @s -For the hash table data structure, we'll define a pointer to an array of entries, the size of the array, and the hash/comparison functions. +The hash table data structure contains an array of entry pointers, the size of the array, and function pointers for comparison, hashing, and destruction. --- Hash Table Data Structure struct hash_table { - struct hash_table_entry **entries; - int size; - hash_table_cmp_fn cmp; - hash_table_hash_fn hash; - hash_table_dtor dtor; + struct hash_table_entry **entries; + int size; + hash_table_cmp_fn cmp; + hash_table_hash_fn hash; + hash_table_dtor dtor; }; --- @s -Entries in the hash table will have a key, a value, and a link to the next entry in the chain. +Each entry in the hash table contains a key, a value, and a pointer to the next entry in the chain for collision resolution via chaining. 
--- Hash Table Entry Data Structure struct hash_table_entry { - void *key; - void *value; - struct hash_table_entry *next; + void *key; + void *value; + struct hash_table_entry *next; }; --- + @s -Allocating a hash table involves allocating memory for the hash table itself and the entries, zeroing out the entries, and setting the hash and comparison functions. +To allocate a hash table, we allocate memory for the table structure and its entries, initialize the entries to NULL, and set the function pointers. --- Allocate and Initialize Hash Table hash_table_t *table = malloc(sizeof(struct hash_table)); -if (table == NULL) { - fputs("Out of memory, could not allocate hash table\n", stderr); - exit(1); +if (!table) { + fprintf(stderr, "Error: Out of memory, could not allocate hash table\n"); + exit(EXIT_FAILURE); } table->entries = calloc(size, sizeof(struct hash_table_entry *)); -if (table->entries == NULL) { - fputs("Out of memory, could not allocate hash table entries\n", stderr); - exit(1); +if (!table->entries) { + fprintf(stderr, "Error: Out of memory, could not allocate hash table entries\n"); + free(table); + exit(EXIT_FAILURE); } table->size = size; table->cmp = cmp; @@ -861,25 +857,24 @@ table->dtor = dtor; --- @s -To destroy a hash table, we loop through the entries, freeing the keys and values, and then free the entries and the table itself. - +To destroy the entries in a hash table, we iterate through all entries, free the keys and values using the destructor if provided, and free the entry itself. 
--- Destroy Entries for (int i = 0; i < table->size; i++) { - struct hash_table_entry *entry = table->entries[i]; - while (entry != NULL) { - struct hash_table_entry *next = entry->next; - if (table->dtor != NULL) { - table->dtor(entry->key, 1); - table->dtor(entry->value, 0); + struct hash_table_entry *entry = table->entries[i]; + while (entry) { + struct hash_table_entry *next = entry->next; + if (table->dtor) { + table->dtor(entry->key, 1); + table->dtor(entry->value, 0); + } + free(entry); + entry = next; } - free(entry); - entry = next; - } } --- @s -To get an entry from the hash table, we hash the key, loop through the entries, and return the value if we find a match. +To retrieve an entry's hash bucket, we apply the hash function to the key and take the modulus of the result with the table size. --- Get Entry By Hash unsigned int hash = table->hash(key) % table->size; @@ -887,28 +882,29 @@ struct hash_table_entry *entry = table->entries[hash]; --- @s -To put an entry in the hash table, we hash the key, loop through the entries, and replace the value if we find a match. +When putting a new entry in the table, we first check if the key already exists. If it does, we update the value; otherwise, we create a new entry. --- Loop Through Entries and Replace Value if Key Matches -while (entry != NULL) { - if (table->cmp(entry->key, key) == 0) { - entry->value = value; - return; - } - entry = entry->next; +while (entry) { + if (table->cmp(entry->key, key) == 0) { + if (table->dtor) table->dtor(entry->value, 0); + entry->value = value; + return; + } + entry = entry->next; } --- @s -If we don't find a match, we allocate a new entry, set the key and value, and insert it at the head of the linked list. +If no matching key is found, we create a new entry and insert it at the beginning of the hash bucket. -This exploits a property in computer science called locality of reference. 
The gist of that is that when you write to a piece of memory, you're likely to read from it again soon. By putting the new entry at the head of the linked list, we increase the chances that we'll find it quickly next time. +This can improve performance thanks to a property called temporal locality: an entry we have just touched is likely to be accessed again soon. Because a newly inserted entry sits at the beginning of its bucket's list, a lookup that follows shortly finds it without walking the rest of the chain. --- Allocate New Entry if No Match struct hash_table_entry *new_entry = malloc(sizeof(struct hash_table_entry)); -if (new_entry == NULL) { - fputs("Out of memory, could not allocate hash table entry\n", stderr); - exit(1); +if (!new_entry) { + fprintf(stderr, "Error: Out of memory, could not allocate hash table entry\n"); + exit(EXIT_FAILURE); } new_entry->key = key; new_entry->value = value; @@ -917,18 +913,18 @@ table->entries[hash] = new_entry; --- @s -To remove an entry from the hash table, we hash the key, loop through the entries, and remove the entry if we find a match. +To remove an entry, we find its bucket, update the linked list to bypass it, then free the entry and its contents. --- Loop Through Entries and Remove Entry if Key Matches struct hash_table_entry *prev = NULL; -while (entry != NULL) { +while (entry) { if (table->cmp(entry->key, key) == 0) { - if (prev == NULL) { - table->entries[hash] = entry->next; - } else { + if (prev) prev->next = entry->next; - } - if (table->dtor != NULL) { + else + table->entries[hash] = entry->next; + + if (table->dtor) { table->dtor(entry->key, 1); table->dtor(entry->value, 0); } @@ -941,14 +937,14 @@ while (entry != NULL) { --- @s -To find a value associated with a given key in the hash table, we hash the string, loop through the entries, and return the value if a match is found. +To retrieve a value from a given bucket, we just walk the list and return the value if a matching key is found.
--- Loop Through Entries and Return Value if Match -while (entry != NULL) { - if (table->cmp(entry->key, key) == 0) { - return entry->value; - } - entry = entry->next; +while (entry) { + if (table->cmp(entry->key, key) == 0) { + return entry->value; + } + entry = entry->next; } --- @@ -959,7 +955,7 @@ Hash functions are a very interesting topic and there's a lot of good research o We can't just sum the characters in a string, because that would mean that "stop" and "pots" would have the same hash. Multiplying has the same problem. If we take each to the power of its position in the string, we get a better distribution, but it's still awful. -Using a simple python program, I brute-forced all possible 4-character strings and ran our power-hash function on them, the result showed that for 456976 possible strings, only 376 were unique. That's a collision rate of 99.999999%! +Using a simple python program, I brute-forced all possible 4-character strings and ran our power-hash function on them, the result showed that for 456976 possible strings, only 3760 were unique, which is terrible. Instead of trying to come up with a new hash function, we can use one that's been well-tested and is known to work well. @@ -982,9 +978,9 @@ This is a bit slow on modern processors because it's not very cache-friendly. We As you can see in the code below, this function avoids extra operations and should be much faster. --- Hash Function := -static unsigned int hash_string(void *key) { +static unsigned int hash_string(const void *key) { unsigned long hash = 0, hi = 0; - char *p = key; + const char *p = key; hash = *p; if (hash != 0 && p[1] != 0) { hash = (hash << 4) + p[1]; @@ -1012,7 +1008,7 @@ static unsigned int hash_string(void *key) { @s We also need a comparison function for strings. 
--- String Comparison -static int cmp_string(void *key1, void *key2) { +static int cmp_string(const void *key1, const void *key2) { return strcmp((char *)key1, (char *)key2); } --- @@ -1052,7 +1048,6 @@ token_t *token_create_string(c_token_types kind, int lin, int col, } --- - @s We'll add an external declaration for `string_table` in `token.h` so other programs can take advantage of it. --- token.h := @@ -1077,6 +1072,7 @@ Finally, we implement the token data structure in `token.c`. #include <string.h> #include <stdio.h> #include <assert.h> +#include <ctype.h> #include "token.h" #include "hash_table.h" @{Token Data Structure} @@ -1313,183 +1309,183 @@ We'll need a helper function to convert token types to strings. It's pretty simp --- Stringify Type const char *stringify_type(c_token_types type) { switch (type) { - case TOK_CTK_IF: + case TOK_IF: return "if"; - case TOK_CTK_ELSE: + case TOK_ELSE: return "else"; - case TOK_CTK_SWITCH: + case TOK_SWITCH: return "switch"; - case TOK_CTK_CASE: + case TOK_CASE: return "case"; - case TOK_CTK_DEFAULT: + case TOK_DEFAULT: return "default"; - case TOK_CTK_WHILE: + case TOK_WHILE: return "while"; - case TOK_CTK_DO: + case TOK_DO: return "do"; - case TOK_CTK_FOR: + case TOK_FOR: return "for"; - case TOK_CTK_CONTINUE: + case TOK_CONTINUE: return "continue"; - case TOK_CTK_BREAK: + case TOK_BREAK: return "break"; - case TOK_CTK_RETURN: + case TOK_RETURN: return "return"; - case TOK_CTK_GOTO: + case TOK_GOTO: return "goto"; - case TOK_TK_VOID: + case TOK_VOID: return "void"; - case TOK_TK_CHAR: + case TOK_CHAR: return "char"; - case TOK_TK_SHORT: + case TOK_SHORT: return "short"; - case TOK_TK_INT: + case TOK_INT: return "int"; - case TOK_TK_LONG: + case TOK_LONG: return "long"; - case TOK_TK_FLOAT: + case TOK_FLOAT: return "float"; - case TOK_TK_DOUBLE: + case TOK_DOUBLE: return "double"; - case TOK_TK_SIGNED: + case TOK_SIGNED: return "signed"; - case TOK_TK_UNSIGNED: + case TOK_UNSIGNED: return "unsigned"; - case 
TOK_TK_STRUCT: + case TOK_STRUCT: return "struct"; - case TOK_TK_UNION: + case TOK_UNION: return "union"; - case TOK_TK_ENUM: + case TOK_ENUM: return "enum"; - case TOK_TK_TYPEDEF: + case TOK_TYPEDEF: return "typedef"; - case TOK_SCSK_AUTO: + case TOK_AUTO: return "auto"; - case TOK_SCSK_REGISTER: + case TOK_REGISTER: return "register"; - case TOK_SCSK_STATIC: + case TOK_STATIC: return "static"; - case TOK_SCSK_EXTERN: + case TOK_EXTERN: return "extern"; - case TOK_SCSK_CONST: + case TOK_CONST: return "const"; - case TOK_SCSK_VOLATILE: + case TOK_VOLATILE: return "volatile"; - case TOK_MK_SIZEOF: + case TOK_SIZEOF: return "sizeof"; - case TOK_OP_ADD: + case TOK_ADD: return "+"; - case TOK_OP_SUB: + case TOK_SUB: return "-"; - case TOK_OP_MUL: + case TOK_MUL: return "*"; - case TOK_OP_DIV: + case TOK_DIV: return "/"; - case TOK_OP_MOD: + case TOK_MOD: return "%"; - case TOK_OP_BIT_AND: + case TOK_BIT_AND: return "&"; - case TOK_OP_BIT_OR: + case TOK_BIT_OR: return "|"; - case TOK_OP_BIT_XOR: + case TOK_BIT_XOR: return "^"; - case TOK_OP_BIT_NOT: + case TOK_BIT_NOT: return "~"; - case TOK_OP_LSHIFT: + case TOK_LSHIFT: return "<<"; - case TOK_OP_RSHIFT: + case TOK_RSHIFT: return ">>"; - case TOK_OP_NOT: + case TOK_NOT: return "!"; - case TOK_OP_ASSIGN: + case TOK_ASSIGN: return "="; - case TOK_OP_LT: + case TOK_LT: return "<"; - case TOK_OP_GT: + case TOK_GT: return ">"; - case TOK_OP_INC: + case TOK_INC: return "++"; - case TOK_OP_DEC: + case TOK_DEC: return "--"; - case TOK_OP_EQ: + case TOK_EQ: return "=="; - case TOK_OP_NE: + case TOK_NE: return "!="; - case TOK_OP_LE: + case TOK_LE: return "<="; - case TOK_OP_GE: + case TOK_GE: return ">="; - case TOK_OP_AND: + case TOK_AND: return "&&"; - case TOK_OP_OR: + case TOK_OR: return "||"; - case TOK_OP_MEMBER_POINTER: + case TOK_MEMBER_POINTER: return "->"; - case TOK_OP_MEMBER: + case TOK_MEMBER: return "."; - case TOK_OP_COND_DECISION: + case TOK_COND_DECISION: return ":"; - case TOK_OP_COND: + case TOK_COND: return 
"?"; - case TOK_OP_ASSIGN_ADD: + case TOK_ASSIGN_ADD: return "+="; - case TOK_OP_ASSIGN_SUB: + case TOK_ASSIGN_SUB: return "-="; - case TOK_OP_ASSIGN_MUL: + case TOK_ASSIGN_MUL: return "*="; - case TOK_OP_ASSIGN_DIV: + case TOK_ASSIGN_DIV: return "/="; - case TOK_OP_ASSIGN_MOD: + case TOK_ASSIGN_MOD: return "%="; - case TOK_OP_ASSIGN_BITAND: + case TOK_ASSIGN_BITAND: return "&="; - case TOK_OP_ASSIGN_BITOR: + case TOK_ASSIGN_BITOR: return "|="; - case TOK_OP_ASSIGN_BITXOR: + case TOK_ASSIGN_BITXOR: return "^="; - case TOK_OP_ASSIGN_LSHIFT: + case TOK_ASSIGN_LSHIFT: return "<<="; - case TOK_OP_ASSIGN_RSHIFT: + case TOK_ASSIGN_RSHIFT: return ">>="; - case TOK_SEP_HASH: + case TOK_HASH: return "#"; case TOK_ID: return "identifier"; - case TOK_CONST_INTEGER_U32: - case TOK_CONST_INTEGER_U64: - case TOK_CONST_INTEGER_S32: - case TOK_CONST_INTEGER_S64: + case TOK_INTEGER_U32: + case TOK_INTEGER_U64: + case TOK_INTEGER_S32: + case TOK_INTEGER_S64: return "integer constant"; - case TOK_CONST_FLOAT_32: - case TOK_CONST_FLOAT_64: + case TOK_FLOAT_32: + case TOK_FLOAT_64: return "floating constant"; - case TOK_CONST_CHAR: + case TOK_CHAR_CONST: return "character constant"; - case TOK_CONST_STRING_ASCII: + case TOK_STRING_ASCII: return "string constant"; - case TOK_SPECIAL_EOF: + case TOK_EOF: return "EOF"; - case TOK_SPECIAL_ERROR: + case TOK_ERROR: return "error"; - case TOK_SEP_LEFT_PAREN: + case TOK_LEFT_PAREN: return "("; - case TOK_SEP_RIGHT_PAREN: + case TOK_RIGHT_PAREN: return ")"; - case TOK_SEP_LEFT_BRACKET: + case TOK_LEFT_BRACKET: return "["; - case TOK_SEP_RIGHT_BRACKET: + case TOK_RIGHT_BRACKET: return "]"; - case TOK_SEP_LEFT_BRACE: + case TOK_LEFT_BRACE: return "{"; - case TOK_SEP_RIGHT_BRACE: + case TOK_RIGHT_BRACE: return "}"; - case TOK_SEP_COMMA: + case TOK_COMMA: return ","; - case TOK_SEP_SEMICOLON: + case TOK_SEMICOLON: return ";"; - case TOK_SEP_DOT: + case TOK_DOT: return "."; - case TOK_SEP_ELLIPSIS: + case TOK_ELLIPSIS: return "..."; } return 
"UNKNOWN"; @@ -1633,9 +1629,9 @@ static token_t *skip_whitespace(void) { } } else { // Handled here to simplify the code. if (c == '=') - return token_create(TOK_OP_ASSIGN_DIV, line, column, 2); + return token_create(TOK_ASSIGN_DIV, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_DIV, line, column, 1); + return token_create(TOK_DIV, line, column, 1); } } else { input_ungetc(c); @@ -1689,31 +1685,31 @@ c_token_types get_keyword(const char *buf, int len) { switch (buf[0]) { case 'a': if (len == 4 && buf[1] == 'u' && buf[2] == 't' && buf[3] == 'o') - return TOK_SCSK_AUTO; + return TOK_AUTO; break; case 'b': if (len == 5 && buf[1] == 'r' && buf[2] == 'e' && buf[3] == 'a' && buf[4] == 'k') - return TOK_CTK_BREAK; + return TOK_BREAK; break; case 'c': switch (buf[1]) { case 'a': if (len == 4 && buf[2] == 's' && buf[3] == 'e') - return TOK_CTK_CASE; + return TOK_CASE; break; case 'h': if (len == 4 && buf[2] == 'a' && buf[3] == 'r') - return TOK_TK_CHAR; + return TOK_CHAR; break; case 'o': if (len == 5 && buf[2] == 'n' && buf[3] == 's' && buf[4] == 't') - return TOK_SCSK_CONST; + return TOK_CONST; if (len == 8 && buf[2] == 'n' && buf[3] == 't' && buf[4] == 'i' && buf[5] == 'n' && buf[6] == 'u' && buf[7] == 'e') - return TOK_CTK_CONTINUE; + return TOK_CONTINUE; break; } break; @@ -1723,14 +1719,14 @@ c_token_types get_keyword(const char *buf, int len) { case 'e': if (len == 7 && buf[2] == 'f' && buf[3] == 'a' && buf[4] == 'u' && buf[5] == 'l' && buf[6] == 't') - return TOK_CTK_DEFAULT; + return TOK_DEFAULT; break; case 'o': if (len == 2 && buf[2] == '\0') - return TOK_CTK_DO; + return TOK_DO; if (len == 6 && buf[2] == 'u' && buf[3] == 'b' && buf[4] == 'l' && buf[5] == 'e') - return TOK_TK_DOUBLE; + return TOK_DOUBLE; break; } break; @@ -1739,16 +1735,16 @@ c_token_types get_keyword(const char *buf, int len) { switch (buf[1]) { case 'l': if (len == 4 && buf[2] == 's' && buf[3] == 'e') - return TOK_CTK_ELSE; + return TOK_ELSE; break; case 'n': if (len == 4 && 
buf[2] == 'u' && buf[3] == 'm') - return TOK_TK_ENUM; + return TOK_ENUM; break; case 'x': if (len == 6 && buf[2] == 't' && buf[3] == 'e' && buf[4] == 'r' && buf[5] == 'n') - return TOK_SCSK_EXTERN; + return TOK_EXTERN; break; } break; @@ -1757,36 +1753,36 @@ c_token_types get_keyword(const char *buf, int len) { switch (buf[1]) { case 'l': if (len == 5 && buf[2] == 'o' && buf[3] == 'a' && buf[4] == 't') - return TOK_TK_FLOAT; + return TOK_FLOAT; break; case 'o': if (len == 3 && buf[2] == 'r') - return TOK_CTK_FOR; + return TOK_FOR; break; } break; case 'g': if (len == 4 && buf[1] == 'o' && buf[2] == 't' && buf[3] == 'o') - return TOK_CTK_GOTO; + return TOK_GOTO; break; case 'i': switch (buf[1]) { case 'f': if (len == 2 && buf[2] == '\0') - return TOK_CTK_IF; + return TOK_IF; break; case 'n': if (len == 3 && buf[2] == 't') - return TOK_TK_INT; + return TOK_INT; break; } break; case 'l': if (len == 4 && buf[1] == 'o' && buf[2] == 'n' && buf[3] == 'g') - return TOK_TK_LONG; + return TOK_LONG; break; case 'r': @@ -1794,10 +1790,10 @@ c_token_types get_keyword(const char *buf, int len) { case 'e': if (len == 8 && buf[2] == 'g' && buf[3] == 'i' && buf[4] == 's' && buf[5] == 't' && buf[6] == 'e' && buf[7] == 'r') - return TOK_SCSK_REGISTER; + return TOK_REGISTER; if (len == 6 && buf[2] == 't' && buf[3] == 'u' && buf[4] == 'r' && buf[5] == 'n') - return TOK_CTK_RETURN; + return TOK_RETURN; break; } break; @@ -1806,29 +1802,29 @@ c_token_types get_keyword(const char *buf, int len) { switch (buf[1]) { case 'h': if (len == 5 && buf[2] == 'o' && buf[3] == 'r' && buf[4] == 't') - return TOK_TK_SHORT; + return TOK_SHORT; break; case 't': if (len == 6 && buf[2] == 'a' && buf[3] == 't' && buf[4] == 'i' && buf[5] == 'c') - return TOK_SCSK_STATIC; + return TOK_STATIC; break; case 'i': if (len == 6 && buf[2] == 'g' && buf[3] == 'n' && buf[4] == 'e' && buf[5] == 'd') - return TOK_TK_SIGNED; + return TOK_SIGNED; if (len == 6 && buf[2] == 'z' && buf[3] == 'e' && buf[4] == 'o' && buf[5] 
== 'f') - return TOK_MK_SIZEOF; + return TOK_SIZEOF; break; case 'r': if (len == 6 && buf[2] == 'u' && buf[3] == 'c' && buf[4] == 't') - return TOK_TK_STRUCT; + return TOK_STRUCT; break; case 'w': if (len == 6 && buf[2] == 'i' && buf[3] == 't' && buf[4] == 'c' && buf[5] == 'h') - return TOK_CTK_SWITCH; + return TOK_SWITCH; break; } break; @@ -1836,17 +1832,17 @@ c_token_types get_keyword(const char *buf, int len) { case 't': if (len == 7 && buf[1] == 'y' && buf[2] == 'p' && buf[3] == 'e' && buf[4] == 'd' && buf[5] == 'e' && buf[6] == 'f') - return TOK_TK_TYPEDEF; + return TOK_TYPEDEF; break; case 'u': switch (buf[1]) { case 'n': if (len == 5 && buf[2] == 'i' && buf[3] == 'o' && buf[4] == 'n') - return TOK_TK_UNION; + return TOK_UNION; if (len == 8 && buf[2] == 's' && buf[3] == 'i' && buf[4] == 'g' && buf[5] == 'n' && buf[6] == 'e' && buf[7] == 'd') - return TOK_TK_UNSIGNED; + return TOK_UNSIGNED; break; } break; @@ -1855,10 +1851,10 @@ c_token_types get_keyword(const char *buf, int len) { switch (buf[1]) { case 'o': if (len == 4 && buf[2] == 'i' && buf[3] == 'd') - return TOK_TK_VOID; + return TOK_VOID; if (len == 8 && buf[2] == 'l' && buf[3] == 'a' && buf[4] == 't' && buf[5] == 'i' && buf[6] == 'l' && buf[7] == 'e') - return TOK_SCSK_VOLATILE; + return TOK_VOLATILE; break; } break; @@ -1866,7 +1862,7 @@ c_token_types get_keyword(const char *buf, int len) { case 'w': if (len == 5 && buf[1] == 'h' && buf[2] == 'i' && buf[3] == 'l' && buf[4] == 'e') - return TOK_CTK_WHILE; + return TOK_WHILE; break; default: @@ -1887,65 +1883,65 @@ token_t *read_operator(void) { case '!': { c = input_getc(); if (c == '=') - return token_create(TOK_OP_NE, line, column, 2); + return token_create(TOK_NE, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_NOT, line, column, 1); + return token_create(TOK_NOT, line, column, 1); } case '%': { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN_MOD, line, column, 2); + return token_create(TOK_ASSIGN_MOD, line, 
column, 2); input_ungetc(c); - return token_create(TOK_OP_MOD, line, column, 1); + return token_create(TOK_MOD, line, column, 1); } case '&': { c = input_getc(); if (c == '&') - return token_create(TOK_OP_AND, line, column, 2); + return token_create(TOK_AND, line, column, 2); if (c == '=') - return token_create(TOK_OP_ASSIGN_BITAND, line, column, 2); + return token_create(TOK_ASSIGN_BITAND, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_BIT_AND, line, column, 1); + return token_create(TOK_BIT_AND, line, column, 1); } case '(': - return token_create(TOK_SEP_LEFT_PAREN, line, column, 1); + return token_create(TOK_LEFT_PAREN, line, column, 1); case ')': - return token_create(TOK_SEP_RIGHT_PAREN, line, column, 1); + return token_create(TOK_RIGHT_PAREN, line, column, 1); case '*': { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN_MUL, line, column, 2); + return token_create(TOK_ASSIGN_MUL, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_MUL, line, column, 1); + return token_create(TOK_MUL, line, column, 1); } case '+': { c = input_getc(); if (c == '+') - return token_create(TOK_OP_INC, line, column, 2); + return token_create(TOK_INC, line, column, 2); if (c == '=') - return token_create(TOK_OP_ASSIGN_ADD, line, column, 2); + return token_create(TOK_ASSIGN_ADD, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_ADD, line, column, 2); + return token_create(TOK_ADD, line, column, 2); } case ',': - return token_create(TOK_SEP_COMMA, line, column, 1); + return token_create(TOK_COMMA, line, column, 1); case '-': { c = input_getc(); if (c == '-') - return token_create(TOK_OP_DEC, line, column, 2); + return token_create(TOK_DEC, line, column, 2); if (c == '=') - return token_create(TOK_OP_ASSIGN_SUB, line, column, 2); + return token_create(TOK_ASSIGN_SUB, line, column, 2); if (c == '>') - return token_create(TOK_OP_MEMBER_POINTER, line, column, 2); + return token_create(TOK_MEMBER_POINTER, line, column, 2); 
input_ungetc(c); - return token_create(TOK_OP_SUB, line, column, 1); + return token_create(TOK_SUB, line, column, 1); } case '.': { c = input_getc(); if (c == '.') { c = input_getc(); if (c == '.') { - return token_create(TOK_SEP_ELLIPSIS, line, column, 3); + return token_create(TOK_ELLIPSIS, line, column, 3); } else { // Bail out, can't store more than one unget tok_error("Unexpected character '.' at line %d, column %d\n", line, @@ -1958,77 +1954,77 @@ token_t *read_operator(void) { case '/': { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN_DIV, line, column, 2); + return token_create(TOK_ASSIGN_DIV, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_DIV, line, column, 1); + return token_create(TOK_DIV, line, column, 1); } case ':': - return token_create(TOK_OP_COND_DECISION, line, column, 1); + return token_create(TOK_COND_DECISION, line, column, 1); case ';': - return token_create(TOK_SEP_SEMICOLON, line, column, 1); + return token_create(TOK_SEMICOLON, line, column, 1); case '<': { c = input_getc(); if (c == '<') { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN_LSHIFT, line, column, 3); + return token_create(TOK_ASSIGN_LSHIFT, line, column, 3); input_ungetc(c); - return token_create(TOK_OP_LSHIFT, line, column, 2); + return token_create(TOK_LSHIFT, line, column, 2); } if (c == '=') - return token_create(TOK_OP_LE, line, column, 2); + return token_create(TOK_LE, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_LT, line, column, 1); + return token_create(TOK_LT, line, column, 1); } case '=': { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN, line, column, 2); + return token_create(TOK_ASSIGN, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_ASSIGN, line, column, 1); + return token_create(TOK_ASSIGN, line, column, 1); } case '>': { c = input_getc(); if (c == '>') { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN_RSHIFT, line, column, 3); + 
return token_create(TOK_ASSIGN_RSHIFT, line, column, 3); input_ungetc(c); - return token_create(TOK_OP_RSHIFT, line, column, 2); + return token_create(TOK_RSHIFT, line, column, 2); } if (c == '=') - return token_create(TOK_OP_GE, line, column, 2); + return token_create(TOK_GE, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_GT, line, column, 1); + return token_create(TOK_GT, line, column, 1); } case '?': - return token_create(TOK_OP_COND, line, column, 1); + return token_create(TOK_COND, line, column, 1); case '[': - return token_create(TOK_SEP_LEFT_BRACKET, line, column, 1); + return token_create(TOK_LEFT_BRACKET, line, column, 1); case ']': - return token_create(TOK_SEP_RIGHT_BRACKET, line, column, 1); + return token_create(TOK_RIGHT_BRACKET, line, column, 1); case '^': { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN_BITXOR, line, column, 2); + return token_create(TOK_ASSIGN_BITXOR, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_BIT_XOR, line, column, 1); + return token_create(TOK_BIT_XOR, line, column, 1); } case '{': - return token_create(TOK_SEP_LEFT_BRACE, line, column, 1); + return token_create(TOK_LEFT_BRACE, line, column, 1); case '|': { c = input_getc(); if (c == '|') - return token_create(TOK_OP_OR, line, column, 2); + return token_create(TOK_OR, line, column, 2); if (c == '=') - return token_create(TOK_OP_ASSIGN_BITOR, line, column, 2); + return token_create(TOK_ASSIGN_BITOR, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_BIT_OR, line, column, 1); + return token_create(TOK_BIT_OR, line, column, 1); } case '}': - return token_create(TOK_SEP_RIGHT_BRACE, line, column, 1); + return token_create(TOK_RIGHT_BRACE, line, column, 1); case '~': - return token_create(TOK_OP_BIT_NOT, line, column, 1); + return token_create(TOK_BIT_NOT, line, column, 1); default: input_ungetc(c); return NULL; @@ -2079,7 +2075,7 @@ To determine if a character is a valid prefix for a number, we need to check if 
char cnext = input_getc(); if (!isdigit(cnext)) { input_ungetc(cnext); - return token_create(TOK_OP_MEMBER, line, column, 1); + return token_create(TOK_MEMBER, line, column, 1); } input_ungetc(cnext); } @@ -2210,7 +2206,7 @@ If we find conflicting suffixes, we print a warning and ignore the suffixes. --- @s -If the constant is a floating-point number, we convert it to a float. We need to make sure that the number is in range for the given type and check for errors from strtod +If the string contains a float, we pass it to strtod. We need to make sure that the number is in range for the given type and check for errors from strtod --- Convert to float errno = 0; @@ -2235,13 +2231,13 @@ If the constant is a floating-point number, we convert it to a float. We need to "precision\n", f); } - return token_create_float(is_single ? TOK_CONST_FLOAT_32 - : TOK_CONST_FLOAT_64, + return token_create_float(is_single ? TOK_FLOAT_32 + : TOK_FLOAT_64, line, column, f, i); --- @s -If the constant is an integer, we convert it to an integer. We need to make sure that the number is in range for the given type and check for errors from strtoll +If the string contains a number, we pass it to stroll. We need to make sure that the number is in range for the given type and check for errors from strtoll --- Convert to integer errno = 0; @@ -2253,7 +2249,7 @@ If the constant is an integer, we convert it to an integer. We need to make sure } if (is_unsigned) { if (is_long) { - return token_create_int(TOK_CONST_INTEGER_U64, line, column, int_, i); + return token_create_int(TOK_INTEGER_U64, line, column, int_, i); } else { if (int_ > UINT32_MAX) { tok_warn( @@ -2261,7 +2257,7 @@ If the constant is an integer, we convert it to an integer. 
We need to make sure "int\n", int_); } - return token_create_int(TOK_CONST_INTEGER_U32, line, column, int_, i); + return token_create_int(TOK_INTEGER_U32, line, column, int_, i); } } else { if (is_long) { @@ -2272,13 +2268,13 @@ If the constant is an integer, we convert it to an integer. We need to make sure "Warning: Integer constant %lld is out of range for long long\n", i); } - return token_create_int(TOK_CONST_INTEGER_S64, line, column, int_, i); + return token_create_int(TOK_INTEGER_S64, line, column, int_, i); } else { if (int_ & (1UL << 31)) { tok_warn("Warning: Integer constant %lld is out of range for int\n", int_); } - return token_create_int(TOK_CONST_INTEGER_S32, line, column, int_, i); + return token_create_int(TOK_INTEGER_S32, line, column, int_, i); } } --- @@ -2310,7 +2306,7 @@ static token_t *read_char_constant(void) { return NULL; } len++; - return token_create_char(TOK_CONST_CHAR, line, column, val, len); + return token_create_char(TOK_CHAR_CONST, line, column, val, len); } --- @@ -2369,7 +2365,7 @@ static token_t *read_string_literal(void) { return NULL; } - token_t *tok = token_create_string(TOK_CONST_STRING_ASCII, line, column, buf, + token_t *tok = token_create_string(TOK_STRING_ASCII, line, column, buf, i + esc_pad + 2); if (buf != s_buf) { free(buf); @@ -2380,7 +2376,6 @@ static token_t *read_string_literal(void) { @s Escape sequences in C can either be single characters or octal/hexadecimal values. We need to handle both cases. - --- Read Escape Sequence static char read_escape_sequence(int *len) { int c = input_getc(); @@ -2472,6 +2467,7 @@ int main(int argc, char **argv) { destroy_tokenizer(); remove(preprocessed); free(preprocessed); + hash_table_destroy(string_table); return 0; } --- @@ -2481,7 +2477,7 @@ int main(int argc, char **argv) { I wrote this code in a single sitting, so there are bound to be bugs. I'll list them here as I find them. The code you see here is the final version, with all bugs fixed. 
* had `buffer_pos == buffer_size - 1`, left in from trying to plug some code for lookahead in, didn't work out, but I forgot to remove it, causes fallthrough to `buffer_size == 0` check which if true returns EOF, preventing input initialization. Fixed by changing to `buffer_pos == buffer_size`. -* assertion `token->kind == TOK_CONST_STRING_ASCII` failed in token\_string. Forgot to expand check for identifiers which also use token\_string. Fixed by changing to `token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID || token->kind == TOK_TID`. +* assertion `token->kind == TOK_STRING_ASCII` failed in token\_string. Forgot to expand check for identifiers which also use token\_string. Fixed by changing to `token->kind == TOK_STRING_ASCII || token->kind == TOK_ID || token->kind == TOK_TID`. * token\_create\_string - call to `hash_table_get` with freed key. Fixed by moving the call to free after the call to `hash_table_get`. * ibid - Design of hash table and call to `hash_table_get` in token\_create\_string created double free. Fixed by rewriting part of function. * Tokenizer missing code to handle GCC preprocessor line directives. Fixed by adding code to handle them. 
diff --git a/projects/cminus/code/makefile b/projects/cminus/code/makefile @@ -7,4 +7,5 @@ clean: rm -f *.h rm -f *.c rm -f *.html - rm -f ../lexer.html- \ No newline at end of file + rm -f ../lexer.html + rm -f vgcore.*+ \ No newline at end of file diff --git a/projects/style.css b/projects/cminus/code/style.css diff --git a/projects/cminus/code/token.c b/projects/cminus/code/token.c @@ -3,6 +3,7 @@ #include <string.h> #include <stdio.h> #include <assert.h> +#include <ctype.h> #include "token.h" #include "hash_table.h" /* Token Data Structure */ @@ -41,29 +42,25 @@ c_token_types token_type(token_t *token) { } int64_t token_int(token_t *token) { - assert(token->kind == TOK_CONST_INTEGER_U32 || - token->kind == TOK_CONST_INTEGER_U64 || - token->kind == TOK_CONST_INTEGER_S32 || - token->kind == TOK_CONST_INTEGER_S64); + assert(token->kind == TOK_INTEGER_U32 || token->kind == TOK_INTEGER_U64 || token->kind == TOK_INTEGER_S32 || token->kind == TOK_INTEGER_S64); assert(token->magic == TOK_MAGIC_1); return token_data(token)->data.i; } double token_float(token_t *token) { - assert(token->kind == TOK_CONST_FLOAT_32 || - token->kind == TOK_CONST_FLOAT_64); + assert(token->kind == TOK_FLOAT_32 || token->kind == TOK_FLOAT_64); assert(token->magic == TOK_MAGIC_1); return token_data(token)->data.f; } const char *token_string(token_t *token) { - assert(token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID); + assert(token->kind == TOK_STRING_ASCII || token->kind == TOK_ID); assert(token->magic == TOK_MAGIC_1); return token_data(token)->data.s; } char token_char(token_t *token) { - assert(token->kind == TOK_CONST_CHAR); + assert(token->kind == TOK_CHAR_CONST); assert(token->magic == TOK_MAGIC_1); return token_data(token)->data.c; } @@ -81,7 +78,7 @@ int token_column(token_t *token) { /* Token Creation and Destruction */ token_t *token_data_create(c_token_types kind, int lin, int col, int len) { token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data)); - if 
(token == NULL) { + if (!token) { fputs("Out of memory\n", stderr); exit(1); } @@ -95,7 +92,7 @@ token_t *token_data_create(c_token_types kind, int lin, int col, int len) { token_t *token_create(c_token_types kind, int lin, int col, int len) { token_t *token = malloc(sizeof(token_t)); - if (token == NULL) { + if (!token) { fputs("Out of memory\n", stderr); exit(1); } @@ -127,9 +124,6 @@ token_t *token_create_char(c_token_types kind, int lin, int col, char c, int len void token_destroy(token_t *token) { if (token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2) { - if (token->kind == TOK_CONST_STRING_ASCII) { - free((char *)token_data(token)->data.s); - } free(token); } else { fputs("Corrupt token\n", stderr); @@ -139,14 +133,14 @@ void token_destroy(token_t *token) { /* Token Create String */ /* String Comparison */ -static int cmp_string(void *key1, void *key2) { +static int cmp_string(const void *key1, const void *key2) { return strcmp((char *)key1, (char *)key2); } /* Hash Function */ -static unsigned int hash_string(void *key) { +static unsigned int hash_string(const void *key) { unsigned long hash = 0, hi = 0; - char *p = key; + const char *p = key; hash = *p; if (hash != 0 && p[1] != 0) { hash = (hash << 4) + p[1]; @@ -194,278 +188,259 @@ token_t *token_create_string(c_token_types kind, int lin, int col, } /* Token Debugging */ +/* Token Type Enum to String */ const char *token_name_from_type(c_token_types type) { switch (type) { - case TOK_CTK_IF: - return "TOK_CTK_IF"; - case TOK_CTK_ELSE: - return "TOK_CTK_ELSE"; - case TOK_CTK_SWITCH: - return "TOK_CTK_SWITCH"; - case TOK_CTK_CASE: - return "TOK_CTK_CASE"; - case TOK_CTK_DEFAULT: - return "TOK_CTK_DEFAULT"; - case TOK_CTK_WHILE: - return "TOK_CTK_WHILE"; - case TOK_CTK_DO: - return "TOK_CTK_DO"; - case TOK_CTK_FOR: - return "TOK_CTK_FOR"; - case TOK_CTK_CONTINUE: - return "TOK_CTK_CONTINUE"; - case TOK_CTK_BREAK: - return "TOK_CTK_BREAK"; - case TOK_CTK_RETURN: - return "TOK_CTK_RETURN"; - case 
TOK_CTK_GOTO: - return "TOK_CTK_GOTO"; - case TOK_TK_VOID: - return "TOK_TK_VOID"; - case TOK_TK_CHAR: - return "TOK_TK_CHAR"; - case TOK_TK_SHORT: - return "TOK_TK_SHORT"; - case TOK_TK_INT: - return "TOK_TK_INT"; - case TOK_TK_LONG: - return "TOK_TK_LONG"; - case TOK_TK_FLOAT: - return "TOK_TK_FLOAT"; - case TOK_TK_DOUBLE: - return "TOK_TK_DOUBLE"; - case TOK_TK_SIGNED: - return "TOK_TK_SIGNED"; - case TOK_TK_UNSIGNED: - return "TOK_TK_UNSIGNED"; - case TOK_TK_STRUCT: - return "TOK_TK_STRUCT"; - case TOK_TK_UNION: - return "TOK_TK_UNION"; - case TOK_TK_ENUM: - return "TOK_TK_ENUM"; - case TOK_TK_TYPEDEF: - return "TOK_TK_TYPEDEF"; - case TOK_SCSK_AUTO: - return "TOK_SCSK_AUTO"; - case TOK_SCSK_REGISTER: - return "TOK_SCSK_REGISTER"; - case TOK_SCSK_STATIC: - return "TOK_SCSK_STATIC"; - case TOK_SCSK_EXTERN: - return "TOK_SCSK_EXTERN"; - case TOK_SCSK_CONST: - return "TOK_SCSK_CONST"; - case TOK_SCSK_VOLATILE: - return "TOK_SCSK_VOLATILE"; - case TOK_MK_SIZEOF: - return "TOK_MK_SIZEOF"; - case TOK_OP_ADD: - return "TOK_OP_ADD"; - case TOK_OP_SUB: - return "TOK_OP_SUB"; - case TOK_OP_MUL: - return "TOK_OP_MUL"; - case TOK_OP_DIV: - return "TOK_OP_DIV"; - case TOK_OP_MOD: - return "TOK_OP_MOD"; - case TOK_OP_BIT_AND: - return "TOK_OP_BIT_AND"; - case TOK_OP_BIT_OR: - return "TOK_OP_BIT_OR"; - case TOK_OP_BIT_XOR: - return "TOK_OP_BIT_XOR"; - case TOK_OP_BIT_NOT: - return "TOK_OP_BIT_NOT"; - case TOK_OP_LSHIFT: - return "TOK_OP_LSHIFT"; - case TOK_OP_RSHIFT: - return "TOK_OP_RSHIFT"; - case TOK_OP_NOT: - return "TOK_OP_NOT"; - case TOK_OP_ASSIGN: - return "TOK_OP_ASSIGN"; - case TOK_OP_LT: - return "TOK_OP_LT"; - case TOK_OP_GT: - return "TOK_OP_GT"; - case TOK_OP_INC: - return "TOK_OP_INC"; - case TOK_OP_DEC: - return "TOK_OP_DEC"; - case TOK_OP_EQ: - return "TOK_OP_EQ"; - case TOK_OP_NE: - return "TOK_OP_NE"; - case TOK_OP_LE: - return "TOK_OP_LE"; - case TOK_OP_GE: - return "TOK_OP_GE"; - case TOK_OP_AND: - return "TOK_OP_AND"; - case TOK_OP_OR: - return 
"TOK_OP_OR"; - case TOK_OP_MEMBER_POINTER: - return "TOK_OP_MEMBER_POINTER"; - case TOK_OP_MEMBER: - return "TOK_OP_MEMBER"; - case TOK_OP_COND_DECISION: - return "TOK_OP_COND_DECISION"; - case TOK_OP_COND: - return "TOK_OP_COND"; - case TOK_OP_ASSIGN_ADD: - return "TOK_OP_ASSIGN_ADD"; - case TOK_OP_ASSIGN_SUB: - return "TOK_OP_ASSIGN_SUB"; - case TOK_OP_ASSIGN_MUL: - return "TOK_OP_ASSIGN_MUL"; - case TOK_OP_ASSIGN_DIV: - return "TOK_OP_ASSIGN_DIV"; - case TOK_OP_ASSIGN_MOD: - return "TOK_OP_ASSIGN_MOD"; - case TOK_OP_ASSIGN_BITAND: - return "TOK_OP_ASSIGN_BITAND"; - case TOK_OP_ASSIGN_BITOR: - return "TOK_OP_ASSIGN_BITOR"; - case TOK_OP_ASSIGN_BITXOR: - return "TOK_OP_ASSIGN_BITXOR"; - case TOK_OP_ASSIGN_LSHIFT: - return "TOK_OP_ASSIGN_LSHIFT"; - case TOK_OP_ASSIGN_RSHIFT: - return "TOK_OP_ASSIGN_RSHIFT"; - case TOK_SEP_HASH: - return "TOK_SEP_HASH"; + case TOK_IF: + return "TOK_IF"; + case TOK_ELSE: + return "TOK_ELSE"; + case TOK_SWITCH: + return "TOK_SWITCH"; + case TOK_CASE: + return "TOK_CASE"; + case TOK_DEFAULT: + return "TOK_DEFAULT"; + case TOK_WHILE: + return "TOK_WHILE"; + case TOK_DO: + return "TOK_DO"; + case TOK_FOR: + return "TOK_FOR"; + case TOK_CONTINUE: + return "TOK_CONTINUE"; + case TOK_BREAK: + return "TOK_BREAK"; + case TOK_RETURN: + return "TOK_RETURN"; + case TOK_GOTO: + return "TOK_GOTO"; + case TOK_VOID: + return "TOK_VOID"; + case TOK_CHAR: + return "TOK_CHAR"; + case TOK_SHORT: + return "TOK_SHORT"; + case TOK_INT: + return "TOK_INT"; + case TOK_LONG: + return "TOK_LONG"; + case TOK_FLOAT: + return "TOK_FLOAT"; + case TOK_DOUBLE: + return "TOK_DOUBLE"; + case TOK_SIGNED: + return "TOK_SIGNED"; + case TOK_UNSIGNED: + return "TOK_UNSIGNED"; + case TOK_STRUCT: + return "TOK_STRUCT"; + case TOK_UNION: + return "TOK_UNION"; + case TOK_ENUM: + return "TOK_ENUM"; + case TOK_TYPEDEF: + return "TOK_TYPEDEF"; + case TOK_AUTO: + return "TOK_AUTO"; + case TOK_REGISTER: + return "TOK_REGISTER"; + case TOK_STATIC: + return "TOK_STATIC"; + case 
TOK_EXTERN: + return "TOK_EXTERN"; + case TOK_CONST: + return "TOK_CONST"; + case TOK_VOLATILE: + return "TOK_VOLATILE"; + case TOK_SIZEOF: + return "TOK_SIZEOF"; + case TOK_ADD: + return "TOK_ADD"; + case TOK_SUB: + return "TOK_SUB"; + case TOK_MUL: + return "TOK_MUL"; + case TOK_DIV: + return "TOK_DIV"; + case TOK_MOD: + return "TOK_MOD"; + case TOK_BIT_AND: + return "TOK_BIT_AND"; + case TOK_BIT_OR: + return "TOK_BIT_OR"; + case TOK_BIT_XOR: + return "TOK_BIT_XOR"; + case TOK_BIT_NOT: + return "TOK_BIT_NOT"; + case TOK_LSHIFT: + return "TOK_LSHIFT"; + case TOK_RSHIFT: + return "TOK_RSHIFT"; + case TOK_NOT: + return "TOK_NOT"; + case TOK_ASSIGN: + return "TOK_ASSIGN"; + case TOK_LT: + return "TOK_LT"; + case TOK_GT: + return "TOK_GT"; + case TOK_INC: + return "TOK_INC"; + case TOK_DEC: + return "TOK_DEC"; + case TOK_EQ: + return "TOK_EQ"; + case TOK_NE: + return "TOK_NE"; + case TOK_LE: + return "TOK_LE"; + case TOK_GE: + return "TOK_GE"; + case TOK_AND: + return "TOK_AND"; + case TOK_OR: + return "TOK_OR"; + case TOK_MEMBER_POINTER: + return "TOK_MEMBER_POINTER"; + case TOK_MEMBER: + return "TOK_MEMBER"; + case TOK_COND_DECISION: + return "TOK_COND_DECISION"; + case TOK_COND: + return "TOK_COND"; + case TOK_ASSIGN_ADD: + return "TOK_ASSIGN_ADD"; + case TOK_ASSIGN_SUB: + return "TOK_ASSIGN_SUB"; + case TOK_ASSIGN_MUL: + return "TOK_ASSIGN_MUL"; + case TOK_ASSIGN_DIV: + return "TOK_ASSIGN_DIV"; + case TOK_ASSIGN_MOD: + return "TOK_ASSIGN_MOD"; + case TOK_ASSIGN_BITAND: + return "TOK_ASSIGN_BITAND"; + case TOK_ASSIGN_BITOR: + return "TOK_ASSIGN_BITOR"; + case TOK_ASSIGN_BITXOR: + return "TOK_ASSIGN_BITXOR"; + case TOK_ASSIGN_LSHIFT: + return "TOK_ASSIGN_LSHIFT"; + case TOK_ASSIGN_RSHIFT: + return "TOK_ASSIGN_RSHIFT"; + case TOK_HASH: + return "TOK_HASH"; case TOK_ID: return "TOK_ID"; - case TOK_CONST_INTEGER_U32: - return "TOK_CONST_INTEGER_U32"; - case TOK_CONST_INTEGER_U64: - return "TOK_CONST_INTEGER_U64"; - case TOK_CONST_INTEGER_S32: - return 
"TOK_CONST_INTEGER_S32"; - case TOK_CONST_INTEGER_S64: - return "TOK_CONST_INTEGER_S64"; - case TOK_CONST_FLOAT_32: - return "TOK_CONST_FLOAT_32"; - case TOK_CONST_FLOAT_64: - return "TOK_CONST_FLOAT_64"; - case TOK_CONST_CHAR: - return "TOK_CONST_CHAR"; - case TOK_CONST_STRING_ASCII: - return "TOK_CONST_STRING_ASCII"; - case TOK_SPECIAL_EOF: - return "TOK_SPECIAL_EOF"; - case TOK_SPECIAL_ERROR: - return "TOK_SPECIAL_ERROR"; - case TOK_SEP_LEFT_PAREN: - return "TOK_SEP_LEFT_PAREN"; - case TOK_SEP_RIGHT_PAREN: - return "TOK_SEP_RIGHT_PAREN"; - case TOK_SEP_LEFT_BRACKET: - return "TOK_SEP_LEFT_BRACKET"; - case TOK_SEP_RIGHT_BRACKET: - return "TOK_SEP_RIGHT_BRACKET"; - case TOK_SEP_LEFT_BRACE: - return "TOK_SEP_LEFT_BRACE"; - case TOK_SEP_RIGHT_BRACE: - return "TOK_SEP_RIGHT_BRACE"; - case TOK_SEP_COMMA: - return "TOK_SEP_COMMA"; - case TOK_SEP_SEMICOLON: - return "TOK_SEP_SEMICOLON"; - case TOK_SEP_DOT: - return "TOK_SEP_DOT"; - case TOK_SEP_ELLIPSIS: - return "TOK_SEP_ELLIPSIS"; + case TOK_INTEGER_U32: + return "TOK_INTEGER_U32"; + case TOK_INTEGER_U64: + return "TOK_INTEGER_U64"; + case TOK_INTEGER_S32: + return "TOK_INTEGER_S32"; + case TOK_INTEGER_S64: + return "TOK_INTEGER_S64"; + case TOK_FLOAT_32: + return "TOK_FLOAT_32"; + case TOK_FLOAT_64: + return "TOK_FLOAT_64"; + case TOK_CHAR_CONST: + return "TOK_CHAR_CONST"; + case TOK_STRING_ASCII: + return "TOK_STRING_ASCII"; + case TOK_EOF: + return "TOK_EOF"; + case TOK_ERROR: + return "TOK_ERROR"; + case TOK_LEFT_PAREN: + return "TOK_LEFT_PAREN"; + case TOK_RIGHT_PAREN: + return "TOK_RIGHT_PAREN"; + case TOK_LEFT_BRACKET: + return "TOK_LEFT_BRACKET"; + case TOK_RIGHT_BRACKET: + return "TOK_RIGHT_BRACKET"; + case TOK_LEFT_BRACE: + return "TOK_LEFT_BRACE"; + case TOK_RIGHT_BRACE: + return "TOK_RIGHT_BRACE"; + case TOK_COMMA: + return "TOK_COMMA"; + case TOK_SEMICOLON: + return "TOK_SEMICOLON"; + case TOK_DOT: + return "TOK_DOT"; + case TOK_ELLIPSIS: + return "TOK_ELLIPSIS"; } return "UNKNOWN"; } +/* Unescape String 
*/ +#define clamp(x, min, max) ((x) < (min) ? (min) : (x) > (max) ? (max) : (x)) char *re_escape_string(const char *str) { int len = strlen(str); char *buf = malloc(len * 2 + 1); - if (buf == NULL) { + if (!buf) { fprintf(stderr, "Out of memory. Cannot escape string\n"); exit(1); } int i = 0; for (int j = 0; j < len; j++) { switch (str[j]) { - case '\a': - buf[i++] = '\\'; - buf[i++] = 'a'; - break; - case '\b': - buf[i++] = '\\'; - buf[i++] = 'b'; - break; - case '\f': - buf[i++] = '\\'; - buf[i++] = 'f'; - break; - case '\n': - buf[i++] = '\\'; - buf[i++] = 'n'; - break; - case '\r': - buf[i++] = '\\'; - buf[i++] = 'r'; - break; - case '\t': - buf[i++] = '\\'; - buf[i++] = 't'; - break; - case '\v': - buf[i++] = '\\'; - buf[i++] = 'v'; - break; - case '\\': - buf[i++] = '\\'; - buf[i++] = '\\'; - break; - case '\'': - buf[i++] = '\\'; - buf[i++] = '\''; - break; - case '"': - buf[i++] = '\\'; - buf[i++] = '"'; - break; - default: - buf[i++] = str[j]; - break; + case '\a': buf[i++] = '\\'; buf[i++] = 'a'; break; + case '\b': buf[i++] = '\\'; buf[i++] = 'b'; break; + case '\f': buf[i++] = '\\'; buf[i++] = 'f'; break; + case '\n': buf[i++] = '\\'; buf[i++] = 'n'; break; + case '\r': buf[i++] = '\\'; buf[i++] = 'r'; break; + case '\t': buf[i++] = '\\'; buf[i++] = 't'; break; + case '\v': buf[i++] = '\\'; buf[i++] = 'v'; break; + case '\\': buf[i++] = '\\'; buf[i++] = '\\'; break; + case '\'': buf[i++] = '\\'; buf[i++] = '\''; break; + case '"': buf[i++] = '\\'; buf[i++] = '"'; break; + default: { + if (isprint(str[j])) { + buf[i++] = str[j]; + } else { + buf[i++] = '\\'; + buf[i++] = 'x'; + buf[i++] = "0123456789abcdef"[clamp(str[j] >> 4, 0, 0xf)]; + buf[i++] = "0123456789abcdef"[clamp(str[j] & 0xf, 0, 0xf)]; + } + } } } buf[i] = '\0'; return buf; } +/* Print Token */ void print_token(token_t *tok) { - if (tok == NULL) { + if (!tok) { printf("NULL\n"); return; } const char *name = token_name_from_type(tok->kind); switch (tok->kind) { case TOK_ID: - case 
TOK_CONST_STRING_ASCII: { + case TOK_STRING_ASCII: { char *escaped = re_escape_string(token_string(tok)); printf("%s: \"%s\"@%d:%d\n", name, escaped, tok->line, tok->column); free(escaped); break; } - case TOK_CONST_CHAR: + case TOK_CHAR_CONST: printf("%s: '%c'@%d:%d\n", name, token_char(tok), tok->line, tok->column); break; - case TOK_CONST_INTEGER_S32: - case TOK_CONST_INTEGER_U32: - case TOK_CONST_INTEGER_S64: - case TOK_CONST_INTEGER_U64: + case TOK_INTEGER_S32: + case TOK_INTEGER_U32: + case TOK_INTEGER_S64: + case TOK_INTEGER_U64: printf("%s: %ld@%d:%d\n", name, token_int(tok), tok->line, tok->column); break; - case TOK_CONST_FLOAT_32: - case TOK_CONST_FLOAT_64: + case TOK_FLOAT_32: + case TOK_FLOAT_64: printf("%s: %f@%d:%d\n", name, token_float(tok), tok->line, tok->column); break; default: @@ -475,3 +450,4 @@ void print_token(token_t *tok) { } + diff --git a/projects/cminus/code/token.h b/projects/cminus/code/token.h @@ -6,148 +6,134 @@ /* Token Types */ typedef enum { // Control Keywords - TOK_CTK_IF, - TOK_CTK_ELSE, - TOK_CTK_SWITCH, - TOK_CTK_CASE, - TOK_CTK_DEFAULT, - TOK_CTK_WHILE, - TOK_CTK_DO, - TOK_CTK_FOR, - TOK_CTK_CONTINUE, - TOK_CTK_BREAK, - TOK_CTK_RETURN, - TOK_CTK_GOTO, + TOK_IF, + TOK_ELSE, + TOK_SWITCH, + TOK_CASE, + TOK_DEFAULT, + TOK_WHILE, + TOK_DO, + TOK_FOR, + TOK_CONTINUE, + TOK_BREAK, + TOK_RETURN, + TOK_GOTO, // Type Keywords - TOK_TK_VOID, - TOK_TK_CHAR, - TOK_TK_SHORT, - TOK_TK_INT, - TOK_TK_LONG, - TOK_TK_FLOAT, - TOK_TK_DOUBLE, - TOK_TK_SIGNED, - TOK_TK_UNSIGNED, - TOK_TK_STRUCT, - TOK_TK_UNION, - TOK_TK_ENUM, - TOK_TK_TYPEDEF, + TOK_VOID, + TOK_CHAR, + TOK_SHORT, + TOK_INT, + TOK_LONG, + TOK_FLOAT, + TOK_DOUBLE, + TOK_SIGNED, + TOK_UNSIGNED, + TOK_STRUCT, + TOK_UNION, + TOK_ENUM, + TOK_TYPEDEF, // Storage Class/Specifier Keywords - TOK_SCSK_AUTO, - TOK_SCSK_REGISTER, - TOK_SCSK_STATIC, - TOK_SCSK_EXTERN, - TOK_SCSK_CONST, - TOK_SCSK_VOLATILE, + TOK_AUTO, + TOK_REGISTER, + TOK_STATIC, + TOK_EXTERN, + TOK_CONST, + TOK_VOLATILE, 
// Misc Keywords - TOK_MK_SIZEOF, + TOK_SIZEOF, // Operators - TOK_OP_ADD, // + - TOK_OP_SUB, // - - TOK_OP_MUL, // * - TOK_OP_DIV, // / - TOK_OP_MOD, // % - TOK_OP_BIT_AND, // & - TOK_OP_BIT_OR, // | - TOK_OP_BIT_XOR, // ^ - TOK_OP_BIT_NOT, // ~ - TOK_OP_LSHIFT, // << - TOK_OP_RSHIFT, // >> - TOK_OP_NOT, // ! - TOK_OP_ASSIGN, // = - TOK_OP_LT, // < - TOK_OP_GT, // > - TOK_OP_INC, // ++ - TOK_OP_DEC, // -- - TOK_OP_EQ, // == - TOK_OP_NE, // != - TOK_OP_LE, // <= - TOK_OP_GE, // >= - TOK_OP_AND, // && - TOK_OP_OR, // || - TOK_OP_MEMBER_POINTER, // -> - TOK_OP_MEMBER, // . - TOK_OP_COND_DECISION, // : - TOK_OP_COND, // ? - TOK_OP_ASSIGN_ADD, // += - TOK_OP_ASSIGN_SUB, // -= - TOK_OP_ASSIGN_MUL, // *= - TOK_OP_ASSIGN_DIV, // /= - TOK_OP_ASSIGN_MOD, // %= - TOK_OP_ASSIGN_BITAND, // &= - TOK_OP_ASSIGN_BITOR, // |= - TOK_OP_ASSIGN_BITXOR, // ^= - TOK_OP_ASSIGN_LSHIFT, // <<= - TOK_OP_ASSIGN_RSHIFT, // >>= + TOK_ADD, // + + TOK_SUB, // - + TOK_MUL, // * + TOK_DIV, // / + TOK_MOD, // % + TOK_BIT_AND, // & + TOK_BIT_OR, // | + TOK_BIT_XOR, // ^ + TOK_BIT_NOT, // ~ + TOK_LSHIFT, // << + TOK_RSHIFT, // >> + TOK_NOT, // ! + TOK_ASSIGN, // = + TOK_LT, // < + TOK_GT, // > + TOK_INC, // ++ + TOK_DEC, // -- + TOK_EQ, // == + TOK_NE, // != + TOK_LE, // <= + TOK_GE, // >= + TOK_AND, // && + TOK_OR, // || + TOK_MEMBER_POINTER, // -> + TOK_MEMBER, // . + TOK_COND_DECISION, // : + TOK_COND, // ? + TOK_ASSIGN_ADD, // += + TOK_ASSIGN_SUB, // -= + TOK_ASSIGN_MUL, // *= + TOK_ASSIGN_DIV, // /= + TOK_ASSIGN_MOD, // %= + TOK_ASSIGN_BITAND, // &= + TOK_ASSIGN_BITOR, // |= + TOK_ASSIGN_BITXOR, // ^= + TOK_ASSIGN_LSHIFT, // <<= + TOK_ASSIGN_RSHIFT, // >>= // Separators - TOK_SEP_LEFT_PAREN, // ( - TOK_SEP_RIGHT_PAREN, // ) - TOK_SEP_LEFT_BRACKET, // [ - TOK_SEP_RIGHT_BRACKET, // ] - TOK_SEP_LEFT_BRACE, // { - TOK_SEP_RIGHT_BRACE, // } - TOK_SEP_COMMA, // , - TOK_SEP_SEMICOLON, // ; - TOK_SEP_DOT, // . - TOK_SEP_ELLIPSIS, // ... 
- TOK_SEP_HASH, // # + TOK_LEFT_PAREN, // ( + TOK_RIGHT_PAREN, // ) + TOK_LEFT_BRACKET, // [ + TOK_RIGHT_BRACKET, // ] + TOK_LEFT_BRACE, // { + TOK_RIGHT_BRACE, // } + TOK_COMMA, // , + TOK_SEMICOLON, // ; + TOK_DOT, // . + TOK_ELLIPSIS, // ... + TOK_HASH, // # // Identifiers TOK_ID, // Constants - TOK_CONST_INTEGER_U32, // u - TOK_CONST_INTEGER_U64, // ul - TOK_CONST_INTEGER_S32, // (no suffix) - TOK_CONST_INTEGER_S64, // l - TOK_CONST_FLOAT_32, // f - TOK_CONST_FLOAT_64, // (no suffix) - TOK_CONST_CHAR, // 'c' - TOK_CONST_STRING_ASCII, // "string" (width of 8 bits) + TOK_INTEGER_U32, // u + TOK_INTEGER_U64, // ul + TOK_INTEGER_S32, // (no suffix) + TOK_INTEGER_S64, // l + TOK_FLOAT_32, // f + TOK_FLOAT_64, // (no suffix) + TOK_CHAR_CONST, // 'c' + TOK_STRING_ASCII, // "string" (width of 8 bits) // Special - TOK_SPECIAL_EOF, - TOK_SPECIAL_ERROR, + TOK_EOF, + TOK_ERROR, } c_token_types; /* Opaque Token Type */ typedef struct token token_t; /* Token Creation and Destruction Interface */ -token_t *token_data_create(c_token_types kind, int lin, int col, int len); - token_t *token_create(c_token_types kind, int lin, int col, int len); - token_t *token_create_int(c_token_types kind, int lin, int col, int64_t i, int len); - token_t *token_create_float(c_token_types kind, int lin, int col, double f, int len); - token_t *token_create_char(c_token_types kind, int lin, int col, char c, int len); - token_t *token_create_string(c_token_types kind, int lin, int col, const char *s, int len); - void token_destroy(token_t *token); /* Token Interface */ c_token_types token_type(token_t *token); - int64_t token_int(token_t *token); - double token_float(token_t *token); - const char *token_string(token_t *token); - char token_char(token_t *token); - int token_line(token_t *token); - int token_column(token_t *token); - void print_token(token_t *tok); extern hash_table_t *string_table; diff --git a/projects/cminus/code/tokenizer.c b/projects/cminus/code/tokenizer.c @@ -39,183 +39,183 
@@ token_t *peek_token(void) { /* Stringify Type */ const char *stringify_type(c_token_types type) { switch (type) { - case TOK_CTK_IF: + case TOK_IF: return "if"; - case TOK_CTK_ELSE: + case TOK_ELSE: return "else"; - case TOK_CTK_SWITCH: + case TOK_SWITCH: return "switch"; - case TOK_CTK_CASE: + case TOK_CASE: return "case"; - case TOK_CTK_DEFAULT: + case TOK_DEFAULT: return "default"; - case TOK_CTK_WHILE: + case TOK_WHILE: return "while"; - case TOK_CTK_DO: + case TOK_DO: return "do"; - case TOK_CTK_FOR: + case TOK_FOR: return "for"; - case TOK_CTK_CONTINUE: + case TOK_CONTINUE: return "continue"; - case TOK_CTK_BREAK: + case TOK_BREAK: return "break"; - case TOK_CTK_RETURN: + case TOK_RETURN: return "return"; - case TOK_CTK_GOTO: + case TOK_GOTO: return "goto"; - case TOK_TK_VOID: + case TOK_VOID: return "void"; - case TOK_TK_CHAR: + case TOK_CHAR: return "char"; - case TOK_TK_SHORT: + case TOK_SHORT: return "short"; - case TOK_TK_INT: + case TOK_INT: return "int"; - case TOK_TK_LONG: + case TOK_LONG: return "long"; - case TOK_TK_FLOAT: + case TOK_FLOAT: return "float"; - case TOK_TK_DOUBLE: + case TOK_DOUBLE: return "double"; - case TOK_TK_SIGNED: + case TOK_SIGNED: return "signed"; - case TOK_TK_UNSIGNED: + case TOK_UNSIGNED: return "unsigned"; - case TOK_TK_STRUCT: + case TOK_STRUCT: return "struct"; - case TOK_TK_UNION: + case TOK_UNION: return "union"; - case TOK_TK_ENUM: + case TOK_ENUM: return "enum"; - case TOK_TK_TYPEDEF: + case TOK_TYPEDEF: return "typedef"; - case TOK_SCSK_AUTO: + case TOK_AUTO: return "auto"; - case TOK_SCSK_REGISTER: + case TOK_REGISTER: return "register"; - case TOK_SCSK_STATIC: + case TOK_STATIC: return "static"; - case TOK_SCSK_EXTERN: + case TOK_EXTERN: return "extern"; - case TOK_SCSK_CONST: + case TOK_CONST: return "const"; - case TOK_SCSK_VOLATILE: + case TOK_VOLATILE: return "volatile"; - case TOK_MK_SIZEOF: + case TOK_SIZEOF: return "sizeof"; - case TOK_OP_ADD: + case TOK_ADD: return "+"; - case TOK_OP_SUB: + case 
TOK_SUB: return "-"; - case TOK_OP_MUL: + case TOK_MUL: return "*"; - case TOK_OP_DIV: + case TOK_DIV: return "/"; - case TOK_OP_MOD: + case TOK_MOD: return "%"; - case TOK_OP_BIT_AND: + case TOK_BIT_AND: return "&"; - case TOK_OP_BIT_OR: + case TOK_BIT_OR: return "|"; - case TOK_OP_BIT_XOR: + case TOK_BIT_XOR: return "^"; - case TOK_OP_BIT_NOT: + case TOK_BIT_NOT: return "~"; - case TOK_OP_LSHIFT: + case TOK_LSHIFT: return "<<"; - case TOK_OP_RSHIFT: + case TOK_RSHIFT: return ">>"; - case TOK_OP_NOT: + case TOK_NOT: return "!"; - case TOK_OP_ASSIGN: + case TOK_ASSIGN: return "="; - case TOK_OP_LT: + case TOK_LT: return "<"; - case TOK_OP_GT: + case TOK_GT: return ">"; - case TOK_OP_INC: + case TOK_INC: return "++"; - case TOK_OP_DEC: + case TOK_DEC: return "--"; - case TOK_OP_EQ: + case TOK_EQ: return "=="; - case TOK_OP_NE: + case TOK_NE: return "!="; - case TOK_OP_LE: + case TOK_LE: return "<="; - case TOK_OP_GE: + case TOK_GE: return ">="; - case TOK_OP_AND: + case TOK_AND: return "&&"; - case TOK_OP_OR: + case TOK_OR: return "||"; - case TOK_OP_MEMBER_POINTER: + case TOK_MEMBER_POINTER: return "->"; - case TOK_OP_MEMBER: + case TOK_MEMBER: return "."; - case TOK_OP_COND_DECISION: + case TOK_COND_DECISION: return ":"; - case TOK_OP_COND: + case TOK_COND: return "?"; - case TOK_OP_ASSIGN_ADD: + case TOK_ASSIGN_ADD: return "+="; - case TOK_OP_ASSIGN_SUB: + case TOK_ASSIGN_SUB: return "-="; - case TOK_OP_ASSIGN_MUL: + case TOK_ASSIGN_MUL: return "*="; - case TOK_OP_ASSIGN_DIV: + case TOK_ASSIGN_DIV: return "/="; - case TOK_OP_ASSIGN_MOD: + case TOK_ASSIGN_MOD: return "%="; - case TOK_OP_ASSIGN_BITAND: + case TOK_ASSIGN_BITAND: return "&="; - case TOK_OP_ASSIGN_BITOR: + case TOK_ASSIGN_BITOR: return "|="; - case TOK_OP_ASSIGN_BITXOR: + case TOK_ASSIGN_BITXOR: return "^="; - case TOK_OP_ASSIGN_LSHIFT: + case TOK_ASSIGN_LSHIFT: return "<<="; - case TOK_OP_ASSIGN_RSHIFT: + case TOK_ASSIGN_RSHIFT: return ">>="; - case TOK_SEP_HASH: + case TOK_HASH: return "#"; case 
TOK_ID: return "identifier"; - case TOK_CONST_INTEGER_U32: - case TOK_CONST_INTEGER_U64: - case TOK_CONST_INTEGER_S32: - case TOK_CONST_INTEGER_S64: + case TOK_INTEGER_U32: + case TOK_INTEGER_U64: + case TOK_INTEGER_S32: + case TOK_INTEGER_S64: return "integer constant"; - case TOK_CONST_FLOAT_32: - case TOK_CONST_FLOAT_64: + case TOK_FLOAT_32: + case TOK_FLOAT_64: return "floating constant"; - case TOK_CONST_CHAR: + case TOK_CHAR_CONST: return "character constant"; - case TOK_CONST_STRING_ASCII: + case TOK_STRING_ASCII: return "string constant"; - case TOK_SPECIAL_EOF: + case TOK_EOF: return "EOF"; - case TOK_SPECIAL_ERROR: + case TOK_ERROR: return "error"; - case TOK_SEP_LEFT_PAREN: + case TOK_LEFT_PAREN: return "("; - case TOK_SEP_RIGHT_PAREN: + case TOK_RIGHT_PAREN: return ")"; - case TOK_SEP_LEFT_BRACKET: + case TOK_LEFT_BRACKET: return "["; - case TOK_SEP_RIGHT_BRACKET: + case TOK_RIGHT_BRACKET: return "]"; - case TOK_SEP_LEFT_BRACE: + case TOK_LEFT_BRACE: return "{"; - case TOK_SEP_RIGHT_BRACE: + case TOK_RIGHT_BRACE: return "}"; - case TOK_SEP_COMMA: + case TOK_COMMA: return ","; - case TOK_SEP_SEMICOLON: + case TOK_SEMICOLON: return ";"; - case TOK_SEP_DOT: + case TOK_DOT: return "."; - case TOK_SEP_ELLIPSIS: + case TOK_ELLIPSIS: return "..."; } return "UNKNOWN"; @@ -326,9 +326,9 @@ static token_t *skip_whitespace(void) { } } else { // Handled here to simplify the code. 
if (c == '=') - return token_create(TOK_OP_ASSIGN_DIV, line, column, 2); + return token_create(TOK_ASSIGN_DIV, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_DIV, line, column, 1); + return token_create(TOK_DIV, line, column, 1); } } else { input_ungetc(c); @@ -344,31 +344,31 @@ c_token_types get_keyword(const char *buf, int len) { switch (buf[0]) { case 'a': if (len == 4 && buf[1] == 'u' && buf[2] == 't' && buf[3] == 'o') - return TOK_SCSK_AUTO; + return TOK_AUTO; break; case 'b': if (len == 5 && buf[1] == 'r' && buf[2] == 'e' && buf[3] == 'a' && buf[4] == 'k') - return TOK_CTK_BREAK; + return TOK_BREAK; break; case 'c': switch (buf[1]) { case 'a': if (len == 4 && buf[2] == 's' && buf[3] == 'e') - return TOK_CTK_CASE; + return TOK_CASE; break; case 'h': if (len == 4 && buf[2] == 'a' && buf[3] == 'r') - return TOK_TK_CHAR; + return TOK_CHAR; break; case 'o': if (len == 5 && buf[2] == 'n' && buf[3] == 's' && buf[4] == 't') - return TOK_SCSK_CONST; + return TOK_CONST; if (len == 8 && buf[2] == 'n' && buf[3] == 't' && buf[4] == 'i' && buf[5] == 'n' && buf[6] == 'u' && buf[7] == 'e') - return TOK_CTK_CONTINUE; + return TOK_CONTINUE; break; } break; @@ -378,14 +378,14 @@ c_token_types get_keyword(const char *buf, int len) { case 'e': if (len == 7 && buf[2] == 'f' && buf[3] == 'a' && buf[4] == 'u' && buf[5] == 'l' && buf[6] == 't') - return TOK_CTK_DEFAULT; + return TOK_DEFAULT; break; case 'o': if (len == 2 && buf[2] == '\0') - return TOK_CTK_DO; + return TOK_DO; if (len == 6 && buf[2] == 'u' && buf[3] == 'b' && buf[4] == 'l' && buf[5] == 'e') - return TOK_TK_DOUBLE; + return TOK_DOUBLE; break; } break; @@ -394,16 +394,16 @@ c_token_types get_keyword(const char *buf, int len) { switch (buf[1]) { case 'l': if (len == 4 && buf[2] == 's' && buf[3] == 'e') - return TOK_CTK_ELSE; + return TOK_ELSE; break; case 'n': if (len == 4 && buf[2] == 'u' && buf[3] == 'm') - return TOK_TK_ENUM; + return TOK_ENUM; break; case 'x': if (len == 6 && buf[2] == 't' && buf[3] 
== 'e' && buf[4] == 'r' && buf[5] == 'n') - return TOK_SCSK_EXTERN; + return TOK_EXTERN; break; } break; @@ -412,36 +412,36 @@ c_token_types get_keyword(const char *buf, int len) { switch (buf[1]) { case 'l': if (len == 5 && buf[2] == 'o' && buf[3] == 'a' && buf[4] == 't') - return TOK_TK_FLOAT; + return TOK_FLOAT; break; case 'o': if (len == 3 && buf[2] == 'r') - return TOK_CTK_FOR; + return TOK_FOR; break; } break; case 'g': if (len == 4 && buf[1] == 'o' && buf[2] == 't' && buf[3] == 'o') - return TOK_CTK_GOTO; + return TOK_GOTO; break; case 'i': switch (buf[1]) { case 'f': if (len == 2 && buf[2] == '\0') - return TOK_CTK_IF; + return TOK_IF; break; case 'n': if (len == 3 && buf[2] == 't') - return TOK_TK_INT; + return TOK_INT; break; } break; case 'l': if (len == 4 && buf[1] == 'o' && buf[2] == 'n' && buf[3] == 'g') - return TOK_TK_LONG; + return TOK_LONG; break; case 'r': @@ -449,10 +449,10 @@ c_token_types get_keyword(const char *buf, int len) { case 'e': if (len == 8 && buf[2] == 'g' && buf[3] == 'i' && buf[4] == 's' && buf[5] == 't' && buf[6] == 'e' && buf[7] == 'r') - return TOK_SCSK_REGISTER; + return TOK_REGISTER; if (len == 6 && buf[2] == 't' && buf[3] == 'u' && buf[4] == 'r' && buf[5] == 'n') - return TOK_CTK_RETURN; + return TOK_RETURN; break; } break; @@ -461,29 +461,29 @@ c_token_types get_keyword(const char *buf, int len) { switch (buf[1]) { case 'h': if (len == 5 && buf[2] == 'o' && buf[3] == 'r' && buf[4] == 't') - return TOK_TK_SHORT; + return TOK_SHORT; break; case 't': if (len == 6 && buf[2] == 'a' && buf[3] == 't' && buf[4] == 'i' && buf[5] == 'c') - return TOK_SCSK_STATIC; + return TOK_STATIC; break; case 'i': if (len == 6 && buf[2] == 'g' && buf[3] == 'n' && buf[4] == 'e' && buf[5] == 'd') - return TOK_TK_SIGNED; + return TOK_SIGNED; if (len == 6 && buf[2] == 'z' && buf[3] == 'e' && buf[4] == 'o' && buf[5] == 'f') - return TOK_MK_SIZEOF; + return TOK_SIZEOF; break; case 'r': if (len == 6 && buf[2] == 'u' && buf[3] == 'c' && buf[4] == 't') - 
return TOK_TK_STRUCT; + return TOK_STRUCT; break; case 'w': if (len == 6 && buf[2] == 'i' && buf[3] == 't' && buf[4] == 'c' && buf[5] == 'h') - return TOK_CTK_SWITCH; + return TOK_SWITCH; break; } break; @@ -491,17 +491,17 @@ c_token_types get_keyword(const char *buf, int len) { case 't': if (len == 7 && buf[1] == 'y' && buf[2] == 'p' && buf[3] == 'e' && buf[4] == 'd' && buf[5] == 'e' && buf[6] == 'f') - return TOK_TK_TYPEDEF; + return TOK_TYPEDEF; break; case 'u': switch (buf[1]) { case 'n': if (len == 5 && buf[2] == 'i' && buf[3] == 'o' && buf[4] == 'n') - return TOK_TK_UNION; + return TOK_UNION; if (len == 8 && buf[2] == 's' && buf[3] == 'i' && buf[4] == 'g' && buf[5] == 'n' && buf[6] == 'e' && buf[7] == 'd') - return TOK_TK_UNSIGNED; + return TOK_UNSIGNED; break; } break; @@ -510,10 +510,10 @@ c_token_types get_keyword(const char *buf, int len) { switch (buf[1]) { case 'o': if (len == 4 && buf[2] == 'i' && buf[3] == 'd') - return TOK_TK_VOID; + return TOK_VOID; if (len == 8 && buf[2] == 'l' && buf[3] == 'a' && buf[4] == 't' && buf[5] == 'i' && buf[6] == 'l' && buf[7] == 'e') - return TOK_SCSK_VOLATILE; + return TOK_VOLATILE; break; } break; @@ -521,7 +521,7 @@ c_token_types get_keyword(const char *buf, int len) { case 'w': if (len == 5 && buf[1] == 'h' && buf[2] == 'i' && buf[3] == 'l' && buf[4] == 'e') - return TOK_CTK_WHILE; + return TOK_WHILE; break; default: @@ -578,7 +578,7 @@ static token_t *read_number(void) { char cnext = input_getc(); if (!isdigit(cnext)) { input_ungetc(cnext); - return token_create(TOK_OP_MEMBER, line, column, 1); + return token_create(TOK_MEMBER, line, column, 1); } input_ungetc(cnext); } @@ -721,8 +721,8 @@ static token_t *read_number(void) { "precision\n", f); } - return token_create_float(is_single ? TOK_CONST_FLOAT_32 - : TOK_CONST_FLOAT_64, + return token_create_float(is_single ? 
TOK_FLOAT_32 + : TOK_FLOAT_64, line, column, f, i); } else { @@ -736,7 +736,7 @@ static token_t *read_number(void) { } if (is_unsigned) { if (is_long) { - return token_create_int(TOK_CONST_INTEGER_U64, line, column, int_, i); + return token_create_int(TOK_INTEGER_U64, line, column, int_, i); } else { if (int_ > UINT32_MAX) { tok_warn( @@ -744,7 +744,7 @@ static token_t *read_number(void) { "int\n", int_); } - return token_create_int(TOK_CONST_INTEGER_U32, line, column, int_, i); + return token_create_int(TOK_INTEGER_U32, line, column, int_, i); } } else { if (is_long) { @@ -755,13 +755,13 @@ static token_t *read_number(void) { "Warning: Integer constant %lld is out of range for long long\n", i); } - return token_create_int(TOK_CONST_INTEGER_S64, line, column, int_, i); + return token_create_int(TOK_INTEGER_S64, line, column, int_, i); } else { if (int_ & (1UL << 31)) { tok_warn("Warning: Integer constant %lld is out of range for int\n", int_); } - return token_create_int(TOK_CONST_INTEGER_S32, line, column, int_, i); + return token_create_int(TOK_INTEGER_S32, line, column, int_, i); } } @@ -878,7 +878,7 @@ static token_t *read_string_literal(void) { return NULL; } - token_t *tok = token_create_string(TOK_CONST_STRING_ASCII, line, column, buf, + token_t *tok = token_create_string(TOK_STRING_ASCII, line, column, buf, i + esc_pad + 2); if (buf != s_buf) { free(buf); @@ -911,7 +911,7 @@ static token_t *read_char_constant(void) { return NULL; } len++; - return token_create_char(TOK_CONST_CHAR, line, column, val, len); + return token_create_char(TOK_CHAR_CONST, line, column, val, len); } /* Tokenize Operator */ @@ -923,65 +923,65 @@ token_t *read_operator(void) { case '!': { c = input_getc(); if (c == '=') - return token_create(TOK_OP_NE, line, column, 2); + return token_create(TOK_NE, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_NOT, line, column, 1); + return token_create(TOK_NOT, line, column, 1); } case '%': { c = input_getc(); if (c == '=') - 
return token_create(TOK_OP_ASSIGN_MOD, line, column, 2); + return token_create(TOK_ASSIGN_MOD, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_MOD, line, column, 1); + return token_create(TOK_MOD, line, column, 1); } case '&': { c = input_getc(); if (c == '&') - return token_create(TOK_OP_AND, line, column, 2); + return token_create(TOK_AND, line, column, 2); if (c == '=') - return token_create(TOK_OP_ASSIGN_BITAND, line, column, 2); + return token_create(TOK_ASSIGN_BITAND, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_BIT_AND, line, column, 1); + return token_create(TOK_BIT_AND, line, column, 1); } case '(': - return token_create(TOK_SEP_LEFT_PAREN, line, column, 1); + return token_create(TOK_LEFT_PAREN, line, column, 1); case ')': - return token_create(TOK_SEP_RIGHT_PAREN, line, column, 1); + return token_create(TOK_RIGHT_PAREN, line, column, 1); case '*': { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN_MUL, line, column, 2); + return token_create(TOK_ASSIGN_MUL, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_MUL, line, column, 1); + return token_create(TOK_MUL, line, column, 1); } case '+': { c = input_getc(); if (c == '+') - return token_create(TOK_OP_INC, line, column, 2); + return token_create(TOK_INC, line, column, 2); if (c == '=') - return token_create(TOK_OP_ASSIGN_ADD, line, column, 2); + return token_create(TOK_ASSIGN_ADD, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_ADD, line, column, 2); + return token_create(TOK_ADD, line, column, 2); } case ',': - return token_create(TOK_SEP_COMMA, line, column, 1); + return token_create(TOK_COMMA, line, column, 1); case '-': { c = input_getc(); if (c == '-') - return token_create(TOK_OP_DEC, line, column, 2); + return token_create(TOK_DEC, line, column, 2); if (c == '=') - return token_create(TOK_OP_ASSIGN_SUB, line, column, 2); + return token_create(TOK_ASSIGN_SUB, line, column, 2); if (c == '>') - return 
token_create(TOK_OP_MEMBER_POINTER, line, column, 2); + return token_create(TOK_MEMBER_POINTER, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_SUB, line, column, 1); + return token_create(TOK_SUB, line, column, 1); } case '.': { c = input_getc(); if (c == '.') { c = input_getc(); if (c == '.') { - return token_create(TOK_SEP_ELLIPSIS, line, column, 3); + return token_create(TOK_ELLIPSIS, line, column, 3); } else { // Bail out, can't store more than one unget tok_error("Unexpected character '.' at line %d, column %d\n", line, @@ -994,77 +994,77 @@ token_t *read_operator(void) { case '/': { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN_DIV, line, column, 2); + return token_create(TOK_ASSIGN_DIV, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_DIV, line, column, 1); + return token_create(TOK_DIV, line, column, 1); } case ':': - return token_create(TOK_OP_COND_DECISION, line, column, 1); + return token_create(TOK_COND_DECISION, line, column, 1); case ';': - return token_create(TOK_SEP_SEMICOLON, line, column, 1); + return token_create(TOK_SEMICOLON, line, column, 1); case '<': { c = input_getc(); if (c == '<') { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN_LSHIFT, line, column, 3); + return token_create(TOK_ASSIGN_LSHIFT, line, column, 3); input_ungetc(c); - return token_create(TOK_OP_LSHIFT, line, column, 2); + return token_create(TOK_LSHIFT, line, column, 2); } if (c == '=') - return token_create(TOK_OP_LE, line, column, 2); + return token_create(TOK_LE, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_LT, line, column, 1); + return token_create(TOK_LT, line, column, 1); } case '=': { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN, line, column, 2); + return token_create(TOK_ASSIGN, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_ASSIGN, line, column, 1); + return token_create(TOK_ASSIGN, line, column, 1); } case '>': { c = input_getc(); 
if (c == '>') { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN_RSHIFT, line, column, 3); + return token_create(TOK_ASSIGN_RSHIFT, line, column, 3); input_ungetc(c); - return token_create(TOK_OP_RSHIFT, line, column, 2); + return token_create(TOK_RSHIFT, line, column, 2); } if (c == '=') - return token_create(TOK_OP_GE, line, column, 2); + return token_create(TOK_GE, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_GT, line, column, 1); + return token_create(TOK_GT, line, column, 1); } case '?': - return token_create(TOK_OP_COND, line, column, 1); + return token_create(TOK_COND, line, column, 1); case '[': - return token_create(TOK_SEP_LEFT_BRACKET, line, column, 1); + return token_create(TOK_LEFT_BRACKET, line, column, 1); case ']': - return token_create(TOK_SEP_RIGHT_BRACKET, line, column, 1); + return token_create(TOK_RIGHT_BRACKET, line, column, 1); case '^': { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN_BITXOR, line, column, 2); + return token_create(TOK_ASSIGN_BITXOR, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_BIT_XOR, line, column, 1); + return token_create(TOK_BIT_XOR, line, column, 1); } case '{': - return token_create(TOK_SEP_LEFT_BRACE, line, column, 1); + return token_create(TOK_LEFT_BRACE, line, column, 1); case '|': { c = input_getc(); if (c == '|') - return token_create(TOK_OP_OR, line, column, 2); + return token_create(TOK_OR, line, column, 2); if (c == '=') - return token_create(TOK_OP_ASSIGN_BITOR, line, column, 2); + return token_create(TOK_ASSIGN_BITOR, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_BIT_OR, line, column, 1); + return token_create(TOK_BIT_OR, line, column, 1); } case '}': - return token_create(TOK_SEP_RIGHT_BRACE, line, column, 1); + return token_create(TOK_RIGHT_BRACE, line, column, 1); case '~': - return token_create(TOK_OP_BIT_NOT, line, column, 1); + return token_create(TOK_BIT_NOT, line, column, 1); default: input_ungetc(c); 
return NULL; @@ -1140,6 +1140,7 @@ int main(int argc, char **argv) { destroy_tokenizer(); remove(preprocessed); free(preprocessed); + hash_table_destroy(string_table); return 0; } diff --git a/projects/cminus/lexer.html b/projects/cminus/lexer.html @@ -287,29 +287,39 @@ li.L9 { </div> <a name="1:4"><div class="section"><h4>4. The Lexer</h4></a> -<p>A lexical analyzer reads source code and produces tokens, which are the smallest unit of meaning in a language. For example, in the C programming language, the tokens are things like keywords (if, else, while, etc.), identifiers (variable names), numbers, and punctuation (braces, semicolons, etc.). +<p>A lexical analyzer reads source code and produces tokens, which are the smallest units of meaning in a language. These tokens are then used by the parser to build an abstract syntax tree (AST), which represents the structure of the program. +</p> +<p>For example, in the C programming language, tokens include keywords (if, else, while, etc.), identifiers (variable names), numbers, and punctuation (braces, semicolons, etc.). </p> <p>Given a string like <code>int main() { return 0; }</code>, the lexer would produce a series of tokens like <code>INT</code>, <code>IDENTIFIER(main)</code>, <code>LPAREN</code>, <code>RPAREN</code>, <code>LBRACE</code>, <code>RETURN</code>, <code>INTCONSTANT(0)</code>, <code>SEMICOLON</code>, <code>RBRACE</code>. </p> </div> <a name="1:5"><div class="section"><h4>5. Design</h4></a> -<p>I'll break the lexer up into a couple of modules. <code>token.c</code> will contain the token data structure and functions to create and destroy tokens. <code>input.c</code> will contain the input data structure and functions to read from the input file. <code>tokenizer.c</code> will contain the main lexer logic. +<p>I'll break the lexer into several modules: </p> +<ul> +<li><code>token.c</code> will contain the token data structure and functions to create and destroy tokens. 
+</li> +<li><code>input.c</code> will contain the input data structure and functions to read from the input file. +</li> +<li><code>tokenizer.c</code> will contain the main lexer logic. +</li> +</ul> </div> <a name="1:6"><div class="section"><h4>6. Token Interface</h4></a> -<p>Tokens are the smallest unit of meaning in a language. They're used by the parser to build an abstract syntax tree (AST). We'll need a couple of things to represent a token: +<p>We'll need several components to represent a token: </p> <ul> -<li>The type of token. This will be an enum, with values like <code>TOK_CTK_IF</code> or <code>TOK_CONST_INTEGER_U32</code>. +<li>The type of token, which will be an enum with values like <code>TOK_IF</code> or <code>TOK_INTEGER_U32</code>. </li> <li>The value of the token. Some tokens, like keywords, don't have a value. Others, like identifiers or constants, do. </li> -<li>The line and column of the token. This is used for error messages. +<li>The line and column of the token, used for error messages. </li> </ul> -<p>As I mentioned earlier, we're trying to implement a sort of class system in C. For that, we'll need to hide the token implementation details behind an opaque pointer. We could just have a <code>void</code> pointer, but that stops us from being able to use compile-time type checking. Instead, we'll use a forward declaration of the token type in the header file, and then define the token type in the implementation file. +<p>To implement a class system in C, we'll hide the token implementation details behind an opaque pointer. We'll use a forward declaration of the token type in the header file and define the token type in the implementation file. 
</p> </div> @@ -322,65 +332,51 @@ typedef struct token token_t; </pre> -<p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:38">38</a></p> +<p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:40">40</a></p> </div> </div> <a name="1:8"><div class="section"><h4 class="noheading">8. </h4></a> -<p>We'll need a couple of functions to create and destroy tokens. +<p>We'll need functions to create and destroy tokens. </p> <div class="codeblock"> <span class="codeblock_name">{Token Creation and Destruction Interface <a href="lexer.html#1:8">8</a>}</span> <pre class="prettyprint lang-c"> -token_t *token_data_create(c_token_types kind, int lin, int col, int len); - token_t *token_create(c_token_types kind, int lin, int col, int len); - token_t *token_create_int(c_token_types kind, int lin, int col, int64_t i, int len); - token_t *token_create_float(c_token_types kind, int lin, int col, double f, int len); - token_t *token_create_char(c_token_types kind, int lin, int col, char c, int len); - token_t *token_create_string(c_token_types kind, int lin, int col, const char *s, int len); - void token_destroy(token_t *token); </pre> -<p class="seealso">Used in section <a href="lexer.html#1:38">38</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:40">40</a></p> </div> </div> <a name="1:9"><div class="section"><h4 class="noheading">9. </h4></a> -<p>We'll also need some functions to access the token data. +<p>We'll also need functions to access the token data. 
</p> <div class="codeblock"> <span class="codeblock_name">{Token Interface <a href="lexer.html#1:9">9</a>}</span> <pre class="prettyprint lang-c"> c_token_types token_type(token_t *token); - int64_t token_int(token_t *token); - double token_float(token_t *token); - const char *token_string(token_t *token); - char token_char(token_t *token); - int token_line(token_t *token); - int token_column(token_t *token); - void print_token(token_t *tok); </pre> -<p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:38">38</a></p> +<p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:40">40</a></p> </div> </div> <a name="1:10"><div class="section"><h4 class="noheading">10. </h4></a> -<p>We'll need some types to represent the different kinds of tokens. +<p>We'll need types to represent the different kinds of tokens. </p> <div class="codeblock"> @@ -388,118 +384,118 @@ void print_token(token_t *tok); <pre class="prettyprint lang-c"> typedef enum { // Control Keywords - TOK_CTK_IF, - TOK_CTK_ELSE, - TOK_CTK_SWITCH, - TOK_CTK_CASE, - TOK_CTK_DEFAULT, - TOK_CTK_WHILE, - TOK_CTK_DO, - TOK_CTK_FOR, - TOK_CTK_CONTINUE, - TOK_CTK_BREAK, - TOK_CTK_RETURN, - TOK_CTK_GOTO, + TOK_IF, + TOK_ELSE, + TOK_SWITCH, + TOK_CASE, + TOK_DEFAULT, + TOK_WHILE, + TOK_DO, + TOK_FOR, + TOK_CONTINUE, + TOK_BREAK, + TOK_RETURN, + TOK_GOTO, // Type Keywords - TOK_TK_VOID, - TOK_TK_CHAR, - TOK_TK_SHORT, - TOK_TK_INT, - TOK_TK_LONG, - TOK_TK_FLOAT, - TOK_TK_DOUBLE, - TOK_TK_SIGNED, - TOK_TK_UNSIGNED, - TOK_TK_STRUCT, - TOK_TK_UNION, - TOK_TK_ENUM, - TOK_TK_TYPEDEF, + TOK_VOID, + TOK_CHAR, + TOK_SHORT, + TOK_INT, + TOK_LONG, + TOK_FLOAT, + TOK_DOUBLE, + TOK_SIGNED, + TOK_UNSIGNED, + TOK_STRUCT, + TOK_UNION, + TOK_ENUM, + TOK_TYPEDEF, // Storage Class/Specifier Keywords - TOK_SCSK_AUTO, - TOK_SCSK_REGISTER, - TOK_SCSK_STATIC, - TOK_SCSK_EXTERN, - TOK_SCSK_CONST, - TOK_SCSK_VOLATILE, + TOK_AUTO, + TOK_REGISTER, + TOK_STATIC, + 
TOK_EXTERN, + TOK_CONST, + TOK_VOLATILE, // Misc Keywords - TOK_MK_SIZEOF, + TOK_SIZEOF, // Operators - TOK_OP_ADD, // + - TOK_OP_SUB, // - - TOK_OP_MUL, // * - TOK_OP_DIV, // / - TOK_OP_MOD, // % - TOK_OP_BIT_AND, // &amp; - TOK_OP_BIT_OR, // | - TOK_OP_BIT_XOR, // ^ - TOK_OP_BIT_NOT, // ~ - TOK_OP_LSHIFT, // &lt;&lt; - TOK_OP_RSHIFT, // &gt;&gt; - TOK_OP_NOT, // ! - TOK_OP_ASSIGN, // = - TOK_OP_LT, // &lt; - TOK_OP_GT, // &gt; - TOK_OP_INC, // ++ - TOK_OP_DEC, // -- - TOK_OP_EQ, // == - TOK_OP_NE, // != - TOK_OP_LE, // &lt;= - TOK_OP_GE, // &gt;= - TOK_OP_AND, // &amp;&amp; - TOK_OP_OR, // || - TOK_OP_MEMBER_POINTER, // -&gt; - TOK_OP_MEMBER, // . - TOK_OP_COND_DECISION, // : - TOK_OP_COND, // ? - TOK_OP_ASSIGN_ADD, // += - TOK_OP_ASSIGN_SUB, // -= - TOK_OP_ASSIGN_MUL, // *= - TOK_OP_ASSIGN_DIV, // /= - TOK_OP_ASSIGN_MOD, // %= - TOK_OP_ASSIGN_BITAND, // &amp;= - TOK_OP_ASSIGN_BITOR, // |= - TOK_OP_ASSIGN_BITXOR, // ^= - TOK_OP_ASSIGN_LSHIFT, // &lt;&lt;= - TOK_OP_ASSIGN_RSHIFT, // &gt;&gt;= + TOK_ADD, // + + TOK_SUB, // - + TOK_MUL, // * + TOK_DIV, // / + TOK_MOD, // % + TOK_BIT_AND, // &amp; + TOK_BIT_OR, // | + TOK_BIT_XOR, // ^ + TOK_BIT_NOT, // ~ + TOK_LSHIFT, // &lt;&lt; + TOK_RSHIFT, // &gt;&gt; + TOK_NOT, // ! + TOK_ASSIGN, // = + TOK_LT, // &lt; + TOK_GT, // &gt; + TOK_INC, // ++ + TOK_DEC, // -- + TOK_EQ, // == + TOK_NE, // != + TOK_LE, // &lt;= + TOK_GE, // &gt;= + TOK_AND, // &amp;&amp; + TOK_OR, // || + TOK_MEMBER_POINTER, // -&gt; + TOK_MEMBER, // . + TOK_COND_DECISION, // : + TOK_COND, // ? 
+ TOK_ASSIGN_ADD, // += + TOK_ASSIGN_SUB, // -= + TOK_ASSIGN_MUL, // *= + TOK_ASSIGN_DIV, // /= + TOK_ASSIGN_MOD, // %= + TOK_ASSIGN_BITAND, // &amp;= + TOK_ASSIGN_BITOR, // |= + TOK_ASSIGN_BITXOR, // ^= + TOK_ASSIGN_LSHIFT, // &lt;&lt;= + TOK_ASSIGN_RSHIFT, // &gt;&gt;= // Separators - TOK_SEP_LEFT_PAREN, // ( - TOK_SEP_RIGHT_PAREN, // ) - TOK_SEP_LEFT_BRACKET, // [ - TOK_SEP_RIGHT_BRACKET, // ] - TOK_SEP_LEFT_BRACE, // { - TOK_SEP_RIGHT_BRACE, // } - TOK_SEP_COMMA, // , - TOK_SEP_SEMICOLON, // ; - TOK_SEP_DOT, // . - TOK_SEP_ELLIPSIS, // ... - TOK_SEP_HASH, // # + TOK_LEFT_PAREN, // ( + TOK_RIGHT_PAREN, // ) + TOK_LEFT_BRACKET, // [ + TOK_RIGHT_BRACKET, // ] + TOK_LEFT_BRACE, // { + TOK_RIGHT_BRACE, // } + TOK_COMMA, // , + TOK_SEMICOLON, // ; + TOK_DOT, // . + TOK_ELLIPSIS, // ... + TOK_HASH, // # // Identifiers TOK_ID, // Constants - TOK_CONST_INTEGER_U32, // u - TOK_CONST_INTEGER_U64, // ul - TOK_CONST_INTEGER_S32, // (no suffix) - TOK_CONST_INTEGER_S64, // l - TOK_CONST_FLOAT_32, // f - TOK_CONST_FLOAT_64, // (no suffix) - TOK_CONST_CHAR, // 'c' - TOK_CONST_STRING_ASCII, // "string" (width of 8 bits) + TOK_INTEGER_U32, // u + TOK_INTEGER_U64, // ul + TOK_INTEGER_S32, // (no suffix) + TOK_INTEGER_S64, // l + TOK_FLOAT_32, // f + TOK_FLOAT_64, // (no suffix) + TOK_CHAR_CONST, // 'c' + TOK_STRING_ASCII, // "string" (width of 8 bits) // Special - TOK_SPECIAL_EOF, - TOK_SPECIAL_ERROR, + TOK_EOF, + TOK_ERROR, } c_token_types; </pre> -<p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:38">38</a></p> +<p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:40">40</a></p> </div> </div> <a name="1:11"><div class="section"><h4 class="noheading">11. 
</h4></a> @@ -511,41 +507,38 @@ typedef enum { <pre class="prettyprint lang-c"> #ifndef TOKEN_H #define TOKEN_H -#include &lt;stdint.h&gt; // We use this for int64_t +#include &lt;stdint.h&gt; // For int64_t <span class="nocode pln">{Token Types, <a href="lexer.html#1:10">10</a>}</span> <span class="nocode pln">{Opaque Token Type, <a href="lexer.html#1:7">7</a>}</span> -<span class="nocode pln">{Token Creation and Destruction, <a href="lexer.html#1:16">16</a>}</span> +<span class="nocode pln">{Token Creation and Destruction, <a href="lexer.html#1:18">18</a>}</span> <span class="nocode pln">{Token Interface, <a href="lexer.html#1:9">9</a>}</span> extern int column; extern int line; #endif </pre> -<p class="seealso">Redefined in section <a href="lexer.html#1:38">38</a></p> +<p class="seealso">Redefined in section <a href="lexer.html#1:40">40</a></p> </div> </div> <a name="1:12"><div class="section"><h4>12. Token Implementation</h4></a> -<p>Now that we have the interface, we can implement the token data structure. We'll need a couple of things: +<p>Now that we have the interface, we can implement the token data structure. We'll need: </p> <ul> <li>The token type. </li> -<li>A way to store extra data. +<li>A way to store extra data. </li> -<li>Implementations of the functions we defined in the interface. +<li>Implementations of the functions defined in the interface. </li> </ul> - -</div> -<a name="1:13"><div class="section"><h4 class="noheading">13. </h4></a> -<p>One problem is we haven't defined a way to verify that the token we're getting isn't corrupt. We'll use a tag for that. +<p>To verify the token isn't corrupt, we'll use a tag. <code>TOK_MAGIC_1</code> represents a token with optional data, and <code>TOK_MAGIC_2</code> represents a token without optional data. </p> -<p>You might notice that a zero-length array is used in the token data structure. 
This is a GCC extension that allows us to allocate memory for the token data structure and the token data in one allocation. This is a bit of a hack, but it's a common pattern in C code. +<p>A zero-length array is used in the token data structure. This GCC extension allows us to allocate memory for the token data structure and the token data in one allocation. </p> <div class="codeblock"> -<span class="codeblock_name">{Token Data Structure <a href="lexer.html#1:13">13</a>}</span> +<span class="codeblock_name">{Token Data Structure <a href="lexer.html#1:12">12</a>}</span> <pre class="prettyprint lang-c"> #define TOK_MAGIC_1 0x544F4B454E544F4Bul // "TOKENTOK" #define TOK_MAGIC_2 0x544F4B544F4B454Eul // "TOKTOKEN" @@ -575,15 +568,15 @@ int line = 1; </pre> -<p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:41">41</a></p> </div> </div> -<a name="1:14"><div class="section"><h4 class="noheading">14. </h4></a> -<p>We'll need to implement an interface for accessing the token data and a macro for accessing optional data. +<a name="1:13"><div class="section"><h4 class="noheading">13. </h4></a> +<p>We'll implement an interface for accessing the token data and a macro for accessing optional data. 
</p> <div class="codeblock"> -<span class="codeblock_name">{Token Data Access <a href="lexer.html#1:14">14</a>}</span> +<span class="codeblock_name">{Token Data Access <a href="lexer.html#1:13">13</a>}</span> <pre class="prettyprint lang-c"> #define token_data(token) ((struct token_data *)((token)-&gt;opt_data)) @@ -593,29 +586,25 @@ c_token_types token_type(token_t *token) { } int64_t token_int(token_t *token) { - assert(token-&gt;kind == TOK_CONST_INTEGER_U32 || - token-&gt;kind == TOK_CONST_INTEGER_U64 || - token-&gt;kind == TOK_CONST_INTEGER_S32 || - token-&gt;kind == TOK_CONST_INTEGER_S64); + assert(token-&gt;kind == TOK_INTEGER_U32 || token-&gt;kind == TOK_INTEGER_U64 || token-&gt;kind == TOK_INTEGER_S32 || token-&gt;kind == TOK_INTEGER_S64); assert(token-&gt;magic == TOK_MAGIC_1); return token_data(token)-&gt;data.i; } double token_float(token_t *token) { - assert(token-&gt;kind == TOK_CONST_FLOAT_32 || - token-&gt;kind == TOK_CONST_FLOAT_64); + assert(token-&gt;kind == TOK_FLOAT_32 || token-&gt;kind == TOK_FLOAT_64); assert(token-&gt;magic == TOK_MAGIC_1); return token_data(token)-&gt;data.f; } const char *token_string(token_t *token) { - assert(token-&gt;kind == TOK_CONST_STRING_ASCII || token-&gt;kind == TOK_ID); + assert(token-&gt;kind == TOK_STRING_ASCII || token-&gt;kind == TOK_ID); assert(token-&gt;magic == TOK_MAGIC_1); return token_data(token)-&gt;data.s; } char token_char(token_t *token) { - assert(token-&gt;kind == TOK_CONST_CHAR); + assert(token-&gt;kind == TOK_CHAR_CONST); assert(token-&gt;magic == TOK_MAGIC_1); return token_data(token)-&gt;data.c; } @@ -632,288 +621,306 @@ int token_column(token_t *token) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:41">41</a></p> +</div> +</div> +<a name="1:14"><div class="section"><h4 class="noheading">14. </h4></a> +<p>For debugging, we'll add a function to print the token type. 
+</p> + +<div class="codeblock"> +<span class="codeblock_name">{Token Debugging <a href="lexer.html#1:14">14</a>}</span> +<pre class="prettyprint lang-c"> +<span class="nocode pln">{Token Type Enum to String, <a href="lexer.html#1:15">15</a>}</span> +<span class="nocode pln">{Unescape String, <a href="lexer.html#1:16">16</a>}</span> +<span class="nocode pln">{Print Token, <a href="lexer.html#1:17">17</a>}</span> +</pre> + + +<p class="seealso">Used in section <a href="lexer.html#1:41">41</a></p> </div> </div> <a name="1:15"><div class="section"><h4 class="noheading">15. </h4></a> -<p>For debugging, I'll add a function to print the token type. +<p>This function returns a string with the token type name. </p> <div class="codeblock"> -<span class="codeblock_name">{Token Debugging <a href="lexer.html#1:15">15</a>}</span> +<span class="codeblock_name">{Token Type Enum to String <a href="lexer.html#1:15">15</a>}</span> <pre class="prettyprint lang-c"> const char *token_name_from_type(c_token_types type) { switch (type) { - case TOK_CTK_IF: - return "TOK_CTK_IF"; - case TOK_CTK_ELSE: - return "TOK_CTK_ELSE"; - case TOK_CTK_SWITCH: - return "TOK_CTK_SWITCH"; - case TOK_CTK_CASE: - return "TOK_CTK_CASE"; - case TOK_CTK_DEFAULT: - return "TOK_CTK_DEFAULT"; - case TOK_CTK_WHILE: - return "TOK_CTK_WHILE"; - case TOK_CTK_DO: - return "TOK_CTK_DO"; - case TOK_CTK_FOR: - return "TOK_CTK_FOR"; - case TOK_CTK_CONTINUE: - return "TOK_CTK_CONTINUE"; - case TOK_CTK_BREAK: - return "TOK_CTK_BREAK"; - case TOK_CTK_RETURN: - return "TOK_CTK_RETURN"; - case TOK_CTK_GOTO: - return "TOK_CTK_GOTO"; - case TOK_TK_VOID: - return "TOK_TK_VOID"; - case TOK_TK_CHAR: - return "TOK_TK_CHAR"; - case TOK_TK_SHORT: - return "TOK_TK_SHORT"; - case TOK_TK_INT: - return "TOK_TK_INT"; - case TOK_TK_LONG: - return "TOK_TK_LONG"; - case TOK_TK_FLOAT: - return "TOK_TK_FLOAT"; - case TOK_TK_DOUBLE: - return "TOK_TK_DOUBLE"; - case TOK_TK_SIGNED: - return "TOK_TK_SIGNED"; - case TOK_TK_UNSIGNED: - return 
"TOK_TK_UNSIGNED"; - case TOK_TK_STRUCT: - return "TOK_TK_STRUCT"; - case TOK_TK_UNION: - return "TOK_TK_UNION"; - case TOK_TK_ENUM: - return "TOK_TK_ENUM"; - case TOK_TK_TYPEDEF: - return "TOK_TK_TYPEDEF"; - case TOK_SCSK_AUTO: - return "TOK_SCSK_AUTO"; - case TOK_SCSK_REGISTER: - return "TOK_SCSK_REGISTER"; - case TOK_SCSK_STATIC: - return "TOK_SCSK_STATIC"; - case TOK_SCSK_EXTERN: - return "TOK_SCSK_EXTERN"; - case TOK_SCSK_CONST: - return "TOK_SCSK_CONST"; - case TOK_SCSK_VOLATILE: - return "TOK_SCSK_VOLATILE"; - case TOK_MK_SIZEOF: - return "TOK_MK_SIZEOF"; - case TOK_OP_ADD: - return "TOK_OP_ADD"; - case TOK_OP_SUB: - return "TOK_OP_SUB"; - case TOK_OP_MUL: - return "TOK_OP_MUL"; - case TOK_OP_DIV: - return "TOK_OP_DIV"; - case TOK_OP_MOD: - return "TOK_OP_MOD"; - case TOK_OP_BIT_AND: - return "TOK_OP_BIT_AND"; - case TOK_OP_BIT_OR: - return "TOK_OP_BIT_OR"; - case TOK_OP_BIT_XOR: - return "TOK_OP_BIT_XOR"; - case TOK_OP_BIT_NOT: - return "TOK_OP_BIT_NOT"; - case TOK_OP_LSHIFT: - return "TOK_OP_LSHIFT"; - case TOK_OP_RSHIFT: - return "TOK_OP_RSHIFT"; - case TOK_OP_NOT: - return "TOK_OP_NOT"; - case TOK_OP_ASSIGN: - return "TOK_OP_ASSIGN"; - case TOK_OP_LT: - return "TOK_OP_LT"; - case TOK_OP_GT: - return "TOK_OP_GT"; - case TOK_OP_INC: - return "TOK_OP_INC"; - case TOK_OP_DEC: - return "TOK_OP_DEC"; - case TOK_OP_EQ: - return "TOK_OP_EQ"; - case TOK_OP_NE: - return "TOK_OP_NE"; - case TOK_OP_LE: - return "TOK_OP_LE"; - case TOK_OP_GE: - return "TOK_OP_GE"; - case TOK_OP_AND: - return "TOK_OP_AND"; - case TOK_OP_OR: - return "TOK_OP_OR"; - case TOK_OP_MEMBER_POINTER: - return "TOK_OP_MEMBER_POINTER"; - case TOK_OP_MEMBER: - return "TOK_OP_MEMBER"; - case TOK_OP_COND_DECISION: - return "TOK_OP_COND_DECISION"; - case TOK_OP_COND: - return "TOK_OP_COND"; - case TOK_OP_ASSIGN_ADD: - return "TOK_OP_ASSIGN_ADD"; - case TOK_OP_ASSIGN_SUB: - return "TOK_OP_ASSIGN_SUB"; - case TOK_OP_ASSIGN_MUL: - return "TOK_OP_ASSIGN_MUL"; - case TOK_OP_ASSIGN_DIV: - return 
"TOK_OP_ASSIGN_DIV"; - case TOK_OP_ASSIGN_MOD: - return "TOK_OP_ASSIGN_MOD"; - case TOK_OP_ASSIGN_BITAND: - return "TOK_OP_ASSIGN_BITAND"; - case TOK_OP_ASSIGN_BITOR: - return "TOK_OP_ASSIGN_BITOR"; - case TOK_OP_ASSIGN_BITXOR: - return "TOK_OP_ASSIGN_BITXOR"; - case TOK_OP_ASSIGN_LSHIFT: - return "TOK_OP_ASSIGN_LSHIFT"; - case TOK_OP_ASSIGN_RSHIFT: - return "TOK_OP_ASSIGN_RSHIFT"; - case TOK_SEP_HASH: - return "TOK_SEP_HASH"; + case TOK_IF: + return "TOK_IF"; + case TOK_ELSE: + return "TOK_ELSE"; + case TOK_SWITCH: + return "TOK_SWITCH"; + case TOK_CASE: + return "TOK_CASE"; + case TOK_DEFAULT: + return "TOK_DEFAULT"; + case TOK_WHILE: + return "TOK_WHILE"; + case TOK_DO: + return "TOK_DO"; + case TOK_FOR: + return "TOK_FOR"; + case TOK_CONTINUE: + return "TOK_CONTINUE"; + case TOK_BREAK: + return "TOK_BREAK"; + case TOK_RETURN: + return "TOK_RETURN"; + case TOK_GOTO: + return "TOK_GOTO"; + case TOK_VOID: + return "TOK_VOID"; + case TOK_CHAR: + return "TOK_CHAR"; + case TOK_SHORT: + return "TOK_SHORT"; + case TOK_INT: + return "TOK_INT"; + case TOK_LONG: + return "TOK_LONG"; + case TOK_FLOAT: + return "TOK_FLOAT"; + case TOK_DOUBLE: + return "TOK_DOUBLE"; + case TOK_SIGNED: + return "TOK_SIGNED"; + case TOK_UNSIGNED: + return "TOK_UNSIGNED"; + case TOK_STRUCT: + return "TOK_STRUCT"; + case TOK_UNION: + return "TOK_UNION"; + case TOK_ENUM: + return "TOK_ENUM"; + case TOK_TYPEDEF: + return "TOK_TYPEDEF"; + case TOK_AUTO: + return "TOK_AUTO"; + case TOK_REGISTER: + return "TOK_REGISTER"; + case TOK_STATIC: + return "TOK_STATIC"; + case TOK_EXTERN: + return "TOK_EXTERN"; + case TOK_CONST: + return "TOK_CONST"; + case TOK_VOLATILE: + return "TOK_VOLATILE"; + case TOK_SIZEOF: + return "TOK_SIZEOF"; + case TOK_ADD: + return "TOK_ADD"; + case TOK_SUB: + return "TOK_SUB"; + case TOK_MUL: + return "TOK_MUL"; + case TOK_DIV: + return "TOK_DIV"; + case TOK_MOD: + return "TOK_MOD"; + case TOK_BIT_AND: + return "TOK_BIT_AND"; + case TOK_BIT_OR: + return "TOK_BIT_OR"; + case 
TOK_BIT_XOR: + return "TOK_BIT_XOR"; + case TOK_BIT_NOT: + return "TOK_BIT_NOT"; + case TOK_LSHIFT: + return "TOK_LSHIFT"; + case TOK_RSHIFT: + return "TOK_RSHIFT"; + case TOK_NOT: + return "TOK_NOT"; + case TOK_ASSIGN: + return "TOK_ASSIGN"; + case TOK_LT: + return "TOK_LT"; + case TOK_GT: + return "TOK_GT"; + case TOK_INC: + return "TOK_INC"; + case TOK_DEC: + return "TOK_DEC"; + case TOK_EQ: + return "TOK_EQ"; + case TOK_NE: + return "TOK_NE"; + case TOK_LE: + return "TOK_LE"; + case TOK_GE: + return "TOK_GE"; + case TOK_AND: + return "TOK_AND"; + case TOK_OR: + return "TOK_OR"; + case TOK_MEMBER_POINTER: + return "TOK_MEMBER_POINTER"; + case TOK_MEMBER: + return "TOK_MEMBER"; + case TOK_COND_DECISION: + return "TOK_COND_DECISION"; + case TOK_COND: + return "TOK_COND"; + case TOK_ASSIGN_ADD: + return "TOK_ASSIGN_ADD"; + case TOK_ASSIGN_SUB: + return "TOK_ASSIGN_SUB"; + case TOK_ASSIGN_MUL: + return "TOK_ASSIGN_MUL"; + case TOK_ASSIGN_DIV: + return "TOK_ASSIGN_DIV"; + case TOK_ASSIGN_MOD: + return "TOK_ASSIGN_MOD"; + case TOK_ASSIGN_BITAND: + return "TOK_ASSIGN_BITAND"; + case TOK_ASSIGN_BITOR: + return "TOK_ASSIGN_BITOR"; + case TOK_ASSIGN_BITXOR: + return "TOK_ASSIGN_BITXOR"; + case TOK_ASSIGN_LSHIFT: + return "TOK_ASSIGN_LSHIFT"; + case TOK_ASSIGN_RSHIFT: + return "TOK_ASSIGN_RSHIFT"; + case TOK_HASH: + return "TOK_HASH"; case TOK_ID: return "TOK_ID"; - case TOK_CONST_INTEGER_U32: - return "TOK_CONST_INTEGER_U32"; - case TOK_CONST_INTEGER_U64: - return "TOK_CONST_INTEGER_U64"; - case TOK_CONST_INTEGER_S32: - return "TOK_CONST_INTEGER_S32"; - case TOK_CONST_INTEGER_S64: - return "TOK_CONST_INTEGER_S64"; - case TOK_CONST_FLOAT_32: - return "TOK_CONST_FLOAT_32"; - case TOK_CONST_FLOAT_64: - return "TOK_CONST_FLOAT_64"; - case TOK_CONST_CHAR: - return "TOK_CONST_CHAR"; - case TOK_CONST_STRING_ASCII: - return "TOK_CONST_STRING_ASCII"; - case TOK_SPECIAL_EOF: - return "TOK_SPECIAL_EOF"; - case TOK_SPECIAL_ERROR: - return "TOK_SPECIAL_ERROR"; - case 
TOK_SEP_LEFT_PAREN: - return "TOK_SEP_LEFT_PAREN"; - case TOK_SEP_RIGHT_PAREN: - return "TOK_SEP_RIGHT_PAREN"; - case TOK_SEP_LEFT_BRACKET: - return "TOK_SEP_LEFT_BRACKET"; - case TOK_SEP_RIGHT_BRACKET: - return "TOK_SEP_RIGHT_BRACKET"; - case TOK_SEP_LEFT_BRACE: - return "TOK_SEP_LEFT_BRACE"; - case TOK_SEP_RIGHT_BRACE: - return "TOK_SEP_RIGHT_BRACE"; - case TOK_SEP_COMMA: - return "TOK_SEP_COMMA"; - case TOK_SEP_SEMICOLON: - return "TOK_SEP_SEMICOLON"; - case TOK_SEP_DOT: - return "TOK_SEP_DOT"; - case TOK_SEP_ELLIPSIS: - return "TOK_SEP_ELLIPSIS"; + case TOK_INTEGER_U32: + return "TOK_INTEGER_U32"; + case TOK_INTEGER_U64: + return "TOK_INTEGER_U64"; + case TOK_INTEGER_S32: + return "TOK_INTEGER_S32"; + case TOK_INTEGER_S64: + return "TOK_INTEGER_S64"; + case TOK_FLOAT_32: + return "TOK_FLOAT_32"; + case TOK_FLOAT_64: + return "TOK_FLOAT_64"; + case TOK_CHAR_CONST: + return "TOK_CHAR_CONST"; + case TOK_STRING_ASCII: + return "TOK_STRING_ASCII"; + case TOK_EOF: + return "TOK_EOF"; + case TOK_ERROR: + return "TOK_ERROR"; + case TOK_LEFT_PAREN: + return "TOK_LEFT_PAREN"; + case TOK_RIGHT_PAREN: + return "TOK_RIGHT_PAREN"; + case TOK_LEFT_BRACKET: + return "TOK_LEFT_BRACKET"; + case TOK_RIGHT_BRACKET: + return "TOK_RIGHT_BRACKET"; + case TOK_LEFT_BRACE: + return "TOK_LEFT_BRACE"; + case TOK_RIGHT_BRACE: + return "TOK_RIGHT_BRACE"; + case TOK_COMMA: + return "TOK_COMMA"; + case TOK_SEMICOLON: + return "TOK_SEMICOLON"; + case TOK_DOT: + return "TOK_DOT"; + case TOK_ELLIPSIS: + return "TOK_ELLIPSIS"; } return "UNKNOWN"; } +</pre> + +<p class="seealso">Used in section <a href="lexer.html#1:14">14</a></p> +</div> +</div> +<a name="1:16"><div class="section"><h4 class="noheading">16. </h4></a> +<p>This function adds escape characters to a string for printing. +</p> + +<div class="codeblock"> +<span class="codeblock_name">{Unescape String <a href="lexer.html#1:16">16</a>}</span> +<pre class="prettyprint lang-c"> +#define clamp(x, min, max) ((x) &lt; (min) ? 
(min) : (x) &gt; (max) ? (max) : (x)) char *re_escape_string(const char *str) { int len = strlen(str); char *buf = malloc(len * 2 + 1); - if (buf == NULL) { + if (!buf) { fprintf(stderr, "Out of memory. Cannot escape string\n"); exit(1); } int i = 0; for (int j = 0; j &lt; len; j++) { switch (str[j]) { - case '\a': - buf[i++] = '\\'; - buf[i++] = 'a'; - break; - case '\b': - buf[i++] = '\\'; - buf[i++] = 'b'; - break; - case '\f': - buf[i++] = '\\'; - buf[i++] = 'f'; - break; - case '\n': - buf[i++] = '\\'; - buf[i++] = 'n'; - break; - case '\r': - buf[i++] = '\\'; - buf[i++] = 'r'; - break; - case '\t': - buf[i++] = '\\'; - buf[i++] = 't'; - break; - case '\v': - buf[i++] = '\\'; - buf[i++] = 'v'; - break; - case '\\': - buf[i++] = '\\'; - buf[i++] = '\\'; - break; - case '\'': - buf[i++] = '\\'; - buf[i++] = '\''; - break; - case '"': - buf[i++] = '\\'; - buf[i++] = '"'; - break; - default: - buf[i++] = str[j]; - break; + case '\a': buf[i++] = '\\'; buf[i++] = 'a'; break; + case '\b': buf[i++] = '\\'; buf[i++] = 'b'; break; + case '\f': buf[i++] = '\\'; buf[i++] = 'f'; break; + case '\n': buf[i++] = '\\'; buf[i++] = 'n'; break; + case '\r': buf[i++] = '\\'; buf[i++] = 'r'; break; + case '\t': buf[i++] = '\\'; buf[i++] = 't'; break; + case '\v': buf[i++] = '\\'; buf[i++] = 'v'; break; + case '\\': buf[i++] = '\\'; buf[i++] = '\\'; break; + case '\'': buf[i++] = '\\'; buf[i++] = '\''; break; + case '"': buf[i++] = '\\'; buf[i++] = '"'; break; + default: { + if (isprint(str[j])) { + buf[i++] = str[j]; + } else { + buf[i++] = '\\'; + buf[i++] = 'x'; + buf[i++] = "0123456789abcdef"[clamp(str[j] &gt;&gt; 4, 0, 0xf)]; + buf[i++] = "0123456789abcdef"[clamp(str[j] &amp; 0xf, 0, 0xf)]; + } + } } } buf[i] = '\0'; return buf; } +</pre> + + +<p class="seealso">Used in section <a href="lexer.html#1:14">14</a></p> +</div> +</div> +<a name="1:17"><div class="section"><h4 class="noheading">17. </h4></a> +<p>This function prints the token type and value. 
+</p> +<div class="codeblock"> +<span class="codeblock_name">{Print Token <a href="lexer.html#1:17">17</a>}</span> +<pre class="prettyprint lang-c"> void print_token(token_t *tok) { - if (tok == NULL) { + if (!tok) { printf("NULL\n"); return; } const char *name = token_name_from_type(tok-&gt;kind); switch (tok-&gt;kind) { case TOK_ID: - case TOK_CONST_STRING_ASCII: { + case TOK_STRING_ASCII: { char *escaped = re_escape_string(token_string(tok)); printf("%s: \"%s\"@%d:%d\n", name, escaped, tok-&gt;line, tok-&gt;column); free(escaped); break; } - case TOK_CONST_CHAR: + case TOK_CHAR_CONST: printf("%s: '%c'@%d:%d\n", name, token_char(tok), tok-&gt;line, tok-&gt;column); break; - case TOK_CONST_INTEGER_S32: - case TOK_CONST_INTEGER_U32: - case TOK_CONST_INTEGER_S64: - case TOK_CONST_INTEGER_U64: + case TOK_INTEGER_S32: + case TOK_INTEGER_U32: + case TOK_INTEGER_S64: + case TOK_INTEGER_U64: printf("%s: %ld@%d:%d\n", name, token_int(tok), tok-&gt;line, tok-&gt;column); break; - case TOK_CONST_FLOAT_32: - case TOK_CONST_FLOAT_64: + case TOK_FLOAT_32: + case TOK_FLOAT_64: printf("%s: %f@%d:%d\n", name, token_float(tok), tok-&gt;line, tok-&gt;column); break; default: @@ -924,19 +931,19 @@ void print_token(token_t *tok) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:14">14</a></p> </div> </div> -<a name="1:16"><div class="section"><h4 class="noheading">16. </h4></a> -<p>Now we can implement the functions we defined in the interface. +<a name="1:18"><div class="section"><h4 class="noheading">18. </h4></a> +<p>Now we can implement functions to create and destroy tokens. We'll start with the easy ones. 
</p> <div class="codeblock"> -<span class="codeblock_name">{Token Creation and Destruction <a href="lexer.html#1:16">16</a>}</span> +<span class="codeblock_name">{Token Creation and Destruction <a href="lexer.html#1:18">18</a>}</span> <pre class="prettyprint lang-c"> token_t *token_data_create(c_token_types kind, int lin, int col, int len) { token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data)); - if (token == NULL) { + if (!token) { fputs("Out of memory\n", stderr); exit(1); } @@ -950,7 +957,7 @@ token_t *token_data_create(c_token_types kind, int lin, int col, int len) { token_t *token_create(c_token_types kind, int lin, int col, int len) { token_t *token = malloc(sizeof(token_t)); - if (token == NULL) { + if (!token) { fputs("Out of memory\n", stderr); exit(1); } @@ -982,9 +989,6 @@ token_t *token_create_char(c_token_types kind, int lin, int col, char c, int len void token_destroy(token_t *token) { if (token-&gt;magic == TOK_MAGIC_1 || token-&gt;magic == TOK_MAGIC_2) { - if (token-&gt;kind == TOK_CONST_STRING_ASCII) { - free((char *)token_data(token)-&gt;data.s); - } free(token); } else { fputs("Corrupt token\n", stderr); @@ -994,15 +998,15 @@ void token_destroy(token_t *token) { </pre> -<p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:39">39</a></p> +<p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:41">41</a></p> </div> </div> -<a name="1:17"><div class="section"><h4 class="noheading">17. </h4></a> +<a name="1:19"><div class="section"><h4 class="noheading">19. </h4></a> <p><code>token_create_string</code> can be implemented either the easy way or the right way. Let's try the easy way. 
</p> <div class="codeblock"> -<span class="codeblock_name">{Token Create String <a href="lexer.html#1:17">17</a>}</span> +<span class="codeblock_name">{Token Create String <a href="lexer.html#1:19">19</a>}</span> <pre class="prettyprint lang-c"> token_t *token_create_string(c_token_types kind, int lin, int col, const char *s, int len) { token_t *token = token_create(kind, lin, col, len); @@ -1011,164 +1015,176 @@ token_t *token_create_string(c_token_types kind, int lin, int col, const char *s } </pre> -<p class="seealso">Redefined in section <a href="lexer.html#1:37">37</a></p> -<p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p> +<p class="seealso">Redefined in section <a href="lexer.html#1:39">39</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:41">41</a></p> </div> </div> -<a name="1:18"><div class="section"><h4 class="noheading">18. </h4></a> +<a name="1:20"><div class="section"><h4 class="noheading">20. </h4></a> <p>There's an issue with this approach. <code>token_create_string</code> will be called for every identifier and every string in a program. Imagine a large program, say a shell, with a bunch of user input and output. That program will likely have 20-40 calls to <code>fprintf</code>, <code>fscanf</code>, <code>strchr</code>, <code>strtok</code>, each. We create a new string for each of those calls. That's a lot of duplicates, and can quickly add up to a lot of memory usage. </p> <p>To fix this, we use a hash table to store the strings. We'll define a hash table in <code>hash_table.h</code> and <code>hash_table.c</code>. </p> </div> -<a name="1:19"><div class="section"><h4>19. Hash Table</h4></a> -<p>A hash table is a data structure that maps keys to values. It's commonly used to store information, such as variables and functions in a symbol table. To implement a generic hash table, we'll need several things: +<a name="1:21"><div class="section"><h4>21. 
Hash Table</h4></a> +<p>A hash table is a data structure that maps keys to values. It's commonly used for implementing symbol tables to store variables and functions. To create a generic hash table, we'll need: </p> -<ul> -<li>A function to hash the keys. +<ol> +<li>A hash function to convert keys into array indices </li> -<li>A function to compare keys. +<li>A comparison function to check if two keys are equal </li> -<li>An opaque type for the hash table. +<li>An opaque type to represent the hash table </li> -<li>A function to destroy deleted keys and values. +<li>A destructor function to clean up keys and values when removing entries </li> -</ul> -<p>Let's start with the interface. +</ol> +<p>Let's start with the interface: </p> </div> -<a name="1:20"><div class="section"><h4 class="noheading">20. </h4></a> +<a name="1:22"><div class="section"><h4 class="noheading">22. </h4></a> <div class="codeblock"> -<span class="codeblock_name">{Hash Table Opaque Types <a href="lexer.html#1:20">20</a>}</span> +<span class="codeblock_name">{Hash Table Opaque Types <a href="lexer.html#1:22">22</a>}</span> <pre class="prettyprint lang-c"> typedef struct hash_table hash_table_t; -typedef int (*hash_table_cmp_fn)(void *key1, void *key2); -typedef unsigned int (*hash_table_hash_fn)(void *key); -typedef void (*hash_table_dtor)(void *value, int is_key); +typedef int (*hash_table_cmp_fn)(const void *key1, const void *key2); +typedef unsigned int (*hash_table_hash_fn)(const void *key); +typedef void (*hash_table_dtor)(void *data, int is_key); </pre> -<p class="seealso">Used in section <a href="lexer.html#1:23">23</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:25">25</a></p> </div> </div> -<a name="1:21"><div class="section"><h4 class="noheading">21. </h4></a> +<a name="1:23"><div class="section"><h4 class="noheading">23. 
</h4></a> <div class="codeblock"> -<span class="codeblock_name">{Hash Table Creation and Destruction <a href="lexer.html#1:21">21</a>}</span> +<span class="codeblock_name">{Hash Table Creation and Destruction <a href="lexer.html#1:23">23</a>}</span> <pre class="prettyprint lang-c"> hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash, hash_table_dtor dtor); void hash_table_destroy(hash_table_t *table); </pre> -<p class="seealso">Used in section <a href="lexer.html#1:23">23</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:25">25</a></p> </div> </div> -<a name="1:22"><div class="section"><h4 class="noheading">22. </h4></a> +<a name="1:24"><div class="section"><h4 class="noheading">24. </h4></a> <div class="codeblock"> -<span class="codeblock_name">{Hash Table Access <a href="lexer.html#1:22">22</a>}</span> +<span class="codeblock_name">{Hash Table Access <a href="lexer.html#1:24">24</a>}</span> <pre class="prettyprint lang-c"> -void *hash_table_get(hash_table_t *table, void *key); +void *hash_table_get(const hash_table_t *table, const void *key); void hash_table_put(hash_table_t *table, void *key, void *value); -void hash_table_remove(hash_table_t *table, void *key); +void hash_table_remove(hash_table_t *table, const void *key); </pre> -<p class="seealso">Used in section <a href="lexer.html#1:23">23</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:25">25</a></p> </div> </div> -<a name="1:23"><div class="section"><h4 class="noheading">23. </h4></a> +<a name="1:25"><div class="section"><h4 class="noheading">25. 
</h4></a> <div class="codeblock"> -<span class="codeblock_name">{<strong>hash_table.h</strong> <a href="lexer.html#1:23">23</a>}</span> +<span class="codeblock_name">{<strong>hash_table.h</strong> <a href="lexer.html#1:25">25</a>}</span> <pre class="prettyprint lang-c"> #ifndef HASH_TABLE_H #define HASH_TABLE_H -<span class="nocode pln">{Hash Table Opaque Types, <a href="lexer.html#1:20">20</a>}</span> -<span class="nocode pln">{Hash Table Creation and Destruction, <a href="lexer.html#1:21">21</a>}</span> -<span class="nocode pln">{Hash Table Access, <a href="lexer.html#1:22">22</a>}</span> -#endif + +<span class="nocode pln">{Hash Table Opaque Types, <a href="lexer.html#1:22">22</a>}</span> +<span class="nocode pln">{Hash Table Creation and Destruction, <a href="lexer.html#1:23">23</a>}</span> +<span class="nocode pln">{Hash Table Access, <a href="lexer.html#1:24">24</a>}</span> + +#endif /* HASH_TABLE_H */ </pre> </div> </div> -<a name="1:24"><div class="section"><h4 class="noheading">24. </h4></a> -<p>Let's implement the hash table now. +<a name="1:26"><div class="section"><h4 class="noheading">26. 
</h4></a> +<p>Now let's implement the hash table: </p> <div class="codeblock"> -<span class="codeblock_name">{<strong>hash_table.c</strong> <a href="lexer.html#1:24">24</a>}</span> +<span class="codeblock_name">{<strong>hash_table.c</strong> <a href="lexer.html#1:26">26</a>}</span> <pre class="prettyprint lang-c"> #include &lt;stdlib.h&gt; #include &lt;string.h&gt; #include &lt;stdio.h&gt; #include "hash_table.h" -<span class="nocode pln">{Hash Table Data Structure, <a href="lexer.html#1:25">25</a>}</span> -<span class="nocode pln">{Hash Table Entry Data Structure, <a href="lexer.html#1:26">26</a>}</span> +<span class="nocode pln">{Hash Table Data Structure, <a href="lexer.html#1:27">27</a>}</span> +<span class="nocode pln">{Hash Table Entry Data Structure, <a href="lexer.html#1:28">28</a>}</span> hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash, hash_table_dtor dtor) { -<span class="nocode pln"> {Allocate and Initialize Hash Table, <a href="lexer.html#1:27">27</a>}</span> - return table; +<span class="nocode pln"> {Allocate and Initialize Hash Table, <a href="lexer.html#1:29">29</a>}</span> + return table; } void hash_table_destroy(hash_table_t *table) { -<span class="nocode pln"> {Destroy Entries, <a href="lexer.html#1:28">28</a>}</span> - free(table-&gt;entries); - free(table); + if (!table) return; +<span class="nocode pln"> {Destroy Entries, <a href="lexer.html#1:30">30</a>}</span> + free(table-&gt;entries); + free(table); } -void *hash_table_get(hash_table_t *table, void *key) { -<span class="nocode pln"> {Get Entry By Hash, <a href="lexer.html#1:29">29</a>}</span> -<span class="nocode pln"> {Loop Through Entries and Return Value if Match, <a href="lexer.html#1:33">33</a>}</span> - return NULL; +void *hash_table_get(const hash_table_t *table, const void *key) { + if (!table || !key) return NULL; +<span class="nocode pln"> {Get Entry By Hash, <a href="lexer.html#1:31">31</a>}</span> +<span class="nocode pln"> {Loop 
Through Entries and Return Value if Match, <a href="lexer.html#1:35">35</a>}</span> + return NULL; } void hash_table_put(hash_table_t *table, void *key, void *value) { -<span class="nocode pln"> {Get Entry By Hash, <a href="lexer.html#1:29">29</a>}</span> -<span class="nocode pln"> {Loop Through Entries and Replace Value if Key Matches, <a href="lexer.html#1:30">30</a>}</span> -<span class="nocode pln"> {Allocate New Entry if No Match, <a href="lexer.html#1:31">31</a>}</span> + if (!table || !key) return; +<span class="nocode pln"> {Get Entry By Hash, <a href="lexer.html#1:31">31</a>}</span> +<span class="nocode pln"> {Loop Through Entries and Replace Value if Key Matches, <a href="lexer.html#1:32">32</a>}</span> +<span class="nocode pln"> {Allocate New Entry if No Match, <a href="lexer.html#1:33">33</a>}</span> } -void hash_table_remove(hash_table_t *table, void *key) { -<span class="nocode pln"> {Get Entry By Hash, <a href="lexer.html#1:29">29</a>}</span> -<span class="nocode pln"> {Loop Through Entries and Remove Entry if Key Matches, <a href="lexer.html#1:32">32</a>}</span> +void hash_table_remove(hash_table_t *table, const void *key) { + if (!table || !key) return; +<span class="nocode pln"> {Get Entry By Hash, <a href="lexer.html#1:31">31</a>}</span> +<span class="nocode pln"> {Loop Through Entries and Remove Entry if Key Matches, <a href="lexer.html#1:34">34</a>}</span> } #ifdef TEST_HASH_TABLE #include &lt;assert.h&gt; -#include &lt;stdio.h&gt; -#include &lt;string.h&gt; -int string_cmp(void *key1, void *key2) { - return strcmp((char *)key1, (char *)key2); +static int string_cmp(const void *key1, const void *key2) { + return strcmp((const char *)key1, (const char *)key2); } -unsigned long string_hash(void *key) { - unsigned long hash = 5381; - char *str = (char *)key; - while (*str != '\0') { - hash = ((hash &lt;&lt; 5) + hash) + *str; - str++; - } - return hash; +static unsigned int string_hash(const void *key) { + unsigned int hash = 5381; + const 
unsigned char *str = key; + int c; + + while ((c = *str++)) + { + hash = ((hash &lt;&lt; 5) + hash) + c; + } + + return hash; } -int main() { - hash_table_t *table = hash_table_create(16, string_cmp, string_hash, NULL); - hash_table_put(table, "foo", "bar"); - hash_table_put(table, "foo", "baz"); - assert(strcmp((char *)hash_table_get(table, "foo"), "baz") == 0); - hash_table_remove(table, "foo"); - assert(hash_table_get(table, "foo") == NULL); - hash_table_destroy(table); - return 0; +int main(void) { + hash_table_t *table = hash_table_create(16, string_cmp, string_hash, NULL); + assert(table != NULL); + + hash_table_put(table, "foo", "bar"); + hash_table_put(table, "foo", "baz"); + assert(strcmp((const char *)hash_table_get(table, "foo"), "baz") == 0); + + hash_table_remove(table, "foo"); + assert(hash_table_get(table, "foo") == NULL); + + hash_table_destroy(table); + printf("All tests passed!\n"); + return 0; } #endif </pre> @@ -1177,60 +1193,61 @@ int main() { </div> </div> -<a name="1:25"><div class="section"><h4 class="noheading">25. </h4></a> -<p>For the hash table data structure, we'll define a pointer to an array of entries, the size of the array, and the hash/comparison functions. +<a name="1:27"><div class="section"><h4 class="noheading">27. </h4></a> +<p>The hash table data structure contains an array of entry pointers, the size of the array, and function pointers for comparison, hashing, and destruction. 
</p> <div class="codeblock"> -<span class="codeblock_name">{Hash Table Data Structure <a href="lexer.html#1:25">25</a>}</span> +<span class="codeblock_name">{Hash Table Data Structure <a href="lexer.html#1:27">27</a>}</span> <pre class="prettyprint lang-c"> struct hash_table { - struct hash_table_entry **entries; - int size; - hash_table_cmp_fn cmp; - hash_table_hash_fn hash; - hash_table_dtor dtor; + struct hash_table_entry **entries; + int size; + hash_table_cmp_fn cmp; + hash_table_hash_fn hash; + hash_table_dtor dtor; }; </pre> -<p class="seealso">Used in section <a href="lexer.html#1:24">24</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p> </div> </div> -<a name="1:26"><div class="section"><h4 class="noheading">26. </h4></a> -<p>Entries in the hash table will have a key, a value, and a link to the next entry in the chain. +<a name="1:28"><div class="section"><h4 class="noheading">28. </h4></a> +<p>Each entry in the hash table contains a key, a value, and a pointer to the next entry in the chain for collision resolution via chaining. </p> <div class="codeblock"> -<span class="codeblock_name">{Hash Table Entry Data Structure <a href="lexer.html#1:26">26</a>}</span> +<span class="codeblock_name">{Hash Table Entry Data Structure <a href="lexer.html#1:28">28</a>}</span> <pre class="prettyprint lang-c"> struct hash_table_entry { - void *key; - void *value; - struct hash_table_entry *next; + void *key; + void *value; + struct hash_table_entry *next; }; </pre> -<p class="seealso">Used in section <a href="lexer.html#1:24">24</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p> </div> </div> -<a name="1:27"><div class="section"><h4 class="noheading">27. </h4></a> -<p>Allocating a hash table involves allocating memory for the hash table itself and the entries, zeroing out the entries, and setting the hash and comparison functions. +<a name="1:29"><div class="section"><h4 class="noheading">29. 
</h4></a> +<p>To allocate a hash table, we allocate memory for the table structure and its entries, initialize the entries to NULL, and set the function pointers. </p> <div class="codeblock"> -<span class="codeblock_name">{Allocate and Initialize Hash Table <a href="lexer.html#1:27">27</a>}</span> +<span class="codeblock_name">{Allocate and Initialize Hash Table <a href="lexer.html#1:29">29</a>}</span> <pre class="prettyprint lang-c"> hash_table_t *table = malloc(sizeof(struct hash_table)); -if (table == NULL) { - fputs("Out of memory, could not allocate hash table\n", stderr); - exit(1); +if (!table) { + fprintf(stderr, "Error: Out of memory, could not allocate hash table\n"); + exit(EXIT_FAILURE); } table-&gt;entries = calloc(size, sizeof(struct hash_table_entry *)); -if (table-&gt;entries == NULL) { - fputs("Out of memory, could not allocate hash table entries\n", stderr); - exit(1); +if (!table-&gt;entries) { + fprintf(stderr, "Error: Out of memory, could not allocate hash table entries\n"); + free(table); + exit(EXIT_FAILURE); } table-&gt;size = size; table-&gt;cmp = cmp; @@ -1239,82 +1256,83 @@ table-&gt;dtor = dtor; </pre> -<p class="seealso">Used in section <a href="lexer.html#1:24">24</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p> </div> </div> -<a name="1:28"><div class="section"><h4 class="noheading">28. </h4></a> -<p>To destroy a hash table, we loop through the entries, freeing the keys and values, and then free the entries and the table itself. +<a name="1:30"><div class="section"><h4 class="noheading">30. </h4></a> +<p>To destroy the entries in a hash table, we iterate through all entries, free the keys and values using the destructor if provided, and free the entry itself. 
</p> <div class="codeblock"> -<span class="codeblock_name">{Destroy Entries <a href="lexer.html#1:28">28</a>}</span> +<span class="codeblock_name">{Destroy Entries <a href="lexer.html#1:30">30</a>}</span> <pre class="prettyprint lang-c"> for (int i = 0; i &lt; table-&gt;size; i++) { - struct hash_table_entry *entry = table-&gt;entries[i]; - while (entry != NULL) { - struct hash_table_entry *next = entry-&gt;next; - if (table-&gt;dtor != NULL) { - table-&gt;dtor(entry-&gt;key, 1); - table-&gt;dtor(entry-&gt;value, 0); + struct hash_table_entry *entry = table-&gt;entries[i]; + while (entry) { + struct hash_table_entry *next = entry-&gt;next; + if (table-&gt;dtor) { + table-&gt;dtor(entry-&gt;key, 1); + table-&gt;dtor(entry-&gt;value, 0); + } + free(entry); + entry = next; } - free(entry); - entry = next; - } } </pre> -<p class="seealso">Used in section <a href="lexer.html#1:24">24</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p> </div> </div> -<a name="1:29"><div class="section"><h4 class="noheading">29. </h4></a> -<p>To get an entry from the hash table, we hash the key, loop through the entries, and return the value if we find a match. +<a name="1:31"><div class="section"><h4 class="noheading">31. </h4></a> +<p>To retrieve an entry's hash bucket, we apply the hash function to the key and take the modulus of the result with the table size. </p> <div class="codeblock"> -<span class="codeblock_name">{Get Entry By Hash <a href="lexer.html#1:29">29</a>}</span> +<span class="codeblock_name">{Get Entry By Hash <a href="lexer.html#1:31">31</a>}</span> <pre class="prettyprint lang-c"> unsigned int hash = table-&gt;hash(key) % table-&gt;size; struct hash_table_entry *entry = table-&gt;entries[hash]; </pre> -<p class="seealso">Used in section <a href="lexer.html#1:24">24</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p> </div> </div> -<a name="1:30"><div class="section"><h4 class="noheading">30. 
</h4></a> -<p>To put an entry in the hash table, we hash the key, loop through the entries, and replace the value if we find a match. +<a name="1:32"><div class="section"><h4 class="noheading">32. </h4></a> +<p>When putting a new entry in the table, we first check if the key already exists. If it does, we update the value; otherwise, we create a new entry. </p> <div class="codeblock"> -<span class="codeblock_name">{Loop Through Entries and Replace Value if Key Matches <a href="lexer.html#1:30">30</a>}</span> +<span class="codeblock_name">{Loop Through Entries and Replace Value if Key Matches <a href="lexer.html#1:32">32</a>}</span> <pre class="prettyprint lang-c"> -while (entry != NULL) { - if (table-&gt;cmp(entry-&gt;key, key) == 0) { - entry-&gt;value = value; - return; - } - entry = entry-&gt;next; +while (entry) { + if (table-&gt;cmp(entry-&gt;key, key) == 0) { + if (table-&gt;dtor) table-&gt;dtor(entry-&gt;value, 0); + entry-&gt;value = value; + return; + } + entry = entry-&gt;next; } </pre> -<p class="seealso">Used in section <a href="lexer.html#1:24">24</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p> </div> </div> -<a name="1:31"><div class="section"><h4 class="noheading">31. </h4></a> -<p>If we don't find a match, we allocate a new entry, set the key and value, and insert it at the head of the linked list. +<a name="1:33"><div class="section"><h4 class="noheading">33. </h4></a> +<p>If no matching key is found, we create a new entry and insert it at the beginning of the hash bucket. </p> -<p>This exploits a property in computer science called locality of reference. The gist of that is that when you write to a piece of memory, you're likely to read from it again soon. By putting the new entry at the head of the linked list, we increase the chances that we'll find it quickly next time. +<p>This can possibly improve performance due to a property called temporal locality. 
When we access an entry, we're likely to access it again soon. Since the entry is at the beginning of the list, it's likely to be accessed again soon. </p> <div class="codeblock"> -<span class="codeblock_name">{Allocate New Entry if No Match <a href="lexer.html#1:31">31</a>}</span> +<span class="codeblock_name">{Allocate New Entry if No Match <a href="lexer.html#1:33">33</a>}</span> <pre class="prettyprint lang-c"> struct hash_table_entry *new_entry = malloc(sizeof(struct hash_table_entry)); -if (new_entry == NULL) { - fputs("Out of memory, could not allocate hash table entry\n", stderr); - exit(1); +if (!new_entry) { + fprintf(stderr, "Error: Out of memory, could not allocate hash table entry\n"); + exit(EXIT_FAILURE); } new_entry-&gt;key = key; new_entry-&gt;value = value; @@ -1323,25 +1341,25 @@ table-&gt;entries[hash] = new_entry; </pre> -<p class="seealso">Used in section <a href="lexer.html#1:24">24</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p> </div> </div> -<a name="1:32"><div class="section"><h4 class="noheading">32. </h4></a> -<p>To remove an entry from the hash table, we hash the key, loop through the entries, and remove the entry if we find a match. +<a name="1:34"><div class="section"><h4 class="noheading">34. </h4></a> +<p>To remove an entry, we find its bucket, update the linked list to bypass it, then free the entry and its contents. 
</p> <div class="codeblock"> -<span class="codeblock_name">{Loop Through Entries and Remove Entry if Key Matches <a href="lexer.html#1:32">32</a>}</span> +<span class="codeblock_name">{Loop Through Entries and Remove Entry if Key Matches <a href="lexer.html#1:34">34</a>}</span> <pre class="prettyprint lang-c"> struct hash_table_entry *prev = NULL; -while (entry != NULL) { +while (entry) { if (table-&gt;cmp(entry-&gt;key, key) == 0) { - if (prev == NULL) { - table-&gt;entries[hash] = entry-&gt;next; - } else { + if (prev) prev-&gt;next = entry-&gt;next; - } - if (table-&gt;dtor != NULL) { + else + table-&gt;entries[hash] = entry-&gt;next; + + if (table-&gt;dtor) { table-&gt;dtor(entry-&gt;key, 1); table-&gt;dtor(entry-&gt;value, 0); } @@ -1354,36 +1372,36 @@ while (entry != NULL) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:24">24</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p> </div> </div> -<a name="1:33"><div class="section"><h4 class="noheading">33. </h4></a> -<p>To find a value associated with a given key in the hash table, we hash the string, loop through the entries, and return the value if a match is found. +<a name="1:35"><div class="section"><h4 class="noheading">35. </h4></a> +<p>To retrieve a value from a given bucket, we just walk the list and return the value if a matching key is found. 
</p> <div class="codeblock"> -<span class="codeblock_name">{Loop Through Entries and Return Value if Match <a href="lexer.html#1:33">33</a>}</span> +<span class="codeblock_name">{Loop Through Entries and Return Value if Match <a href="lexer.html#1:35">35</a>}</span> <pre class="prettyprint lang-c"> -while (entry != NULL) { - if (table-&gt;cmp(entry-&gt;key, key) == 0) { - return entry-&gt;value; - } - entry = entry-&gt;next; +while (entry) { + if (table-&gt;cmp(entry-&gt;key, key) == 0) { + return entry-&gt;value; + } + entry = entry-&gt;next; } </pre> -<p class="seealso">Used in section <a href="lexer.html#1:24">24</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p> </div> </div> -<a name="1:34"><div class="section"><h4 class="noheading">34. </h4></a> +<a name="1:36"><div class="section"><h4 class="noheading">36. </h4></a> <p>We're now almost ready to implement <code>token_create_string</code> the right way. First, we'll need a good hash function. </p> <p>Hash functions are a very interesting topic and there's a lot of good research on them. The hash function we use should be fast, have a low collision rate, and be able to handle strings of any length. </p> <p>We can't just sum the characters in a string, because that would mean that "stop" and "pots" would have the same hash. Multiplying has the same problem. If we take each to the power of its position in the string, we get a better distribution, but it's still awful. </p> -<p>Using a simple python program, I brute-forced all possible 4-character strings and ran our power-hash function on them, the result showed that for 456976 possible strings, only 376 were unique. That's a collision rate of 99.999999%! +<p>Using a simple python program, I brute-forced all possible 4-character strings and ran our power-hash function on them, the result showed that for 456976 possible strings, only 3760 were unique, which is terrible. 
</p> <p>Instead of trying to come up with a new hash function, we can use one that's been well-tested and is known to work well. </p> @@ -1391,7 +1409,7 @@ while (entry != NULL) { </p> <div class="codeblock"> -<span class="codeblock_name">{Hash Function <a href="lexer.html#1:34">34</a>}</span> +<span class="codeblock_name">{Hash Function <a href="lexer.html#1:36">36</a>}</span> <pre class="prettyprint lang-c"> static unsigned long hash_string(void *key) { unsigned long hash = 0, g; @@ -1408,7 +1426,7 @@ static unsigned long hash_string(void *key) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:37">37</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p> </div> <p>This is a bit slow on modern processors because it's not very cache-friendly. We can do better. Let's use its child, ELFHash, from libc. </p> @@ -1416,11 +1434,11 @@ static unsigned long hash_string(void *key) { </p> <div class="codeblock"> -<span class="codeblock_name">{Hash Function <a href="lexer.html#1:34">34</a>} :=</span> +<span class="codeblock_name">{Hash Function <a href="lexer.html#1:36">36</a>} :=</span> <pre class="prettyprint lang-c"> -static unsigned int hash_string(void *key) { +static unsigned int hash_string(const void *key) { unsigned long hash = 0, hi = 0; - char *p = key; + const char *p = key; hash = *p; if (hash != 0 &amp;&amp; p[1] != 0) { hash = (hash &lt;&lt; 4) + p[1]; @@ -1446,31 +1464,31 @@ static unsigned int hash_string(void *key) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:37">37</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p> </div> </div> -<a name="1:35"><div class="section"><h4 class="noheading">35. </h4></a> +<a name="1:37"><div class="section"><h4 class="noheading">37. </h4></a> <p>We also need a comparison function for strings. 
</p> <div class="codeblock"> -<span class="codeblock_name">{String Comparison <a href="lexer.html#1:35">35</a>}</span> +<span class="codeblock_name">{String Comparison <a href="lexer.html#1:37">37</a>}</span> <pre class="prettyprint lang-c"> -static int cmp_string(void *key1, void *key2) { +static int cmp_string(const void *key1, const void *key2) { return strcmp((char *)key1, (char *)key2); } </pre> -<p class="seealso">Used in section <a href="lexer.html#1:37">37</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p> </div> </div> -<a name="1:36"><div class="section"><h4 class="noheading">36. </h4></a> +<a name="1:38"><div class="section"><h4 class="noheading">38. </h4></a> <p>Finally, we'll need a destructor for entries. </p> <div class="codeblock"> -<span class="codeblock_name">{String Destructor <a href="lexer.html#1:36">36</a>}</span> +<span class="codeblock_name">{String Destructor <a href="lexer.html#1:38">38</a>}</span> <pre class="prettyprint lang-c"> static void dtor_string(void *value, int is_key) { if (is_key) { @@ -1480,21 +1498,21 @@ static void dtor_string(void *value, int is_key) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:37">37</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p> </div> </div> -<a name="1:37"><div class="section"><h4 class="noheading">37. </h4></a> +<a name="1:39"><div class="section"><h4 class="noheading">39. </h4></a> <p>Now we can implement <code>token_create_string</code> the right way. </p> <p>You might notice that we're using the same key and value. This way of using a hash table is normally called a set. We're using it to store strings, but we could use it to store anything we want to deduplicate. 
</p> <div class="codeblock"> -<span class="codeblock_name">{Token Create String <a href="lexer.html#1:17">17</a>} :=</span> +<span class="codeblock_name">{Token Create String <a href="lexer.html#1:19">19</a>} :=</span> <pre class="prettyprint lang-c"> -<span class="nocode pln">{String Comparison, <a href="lexer.html#1:35">35</a>}</span> -<span class="nocode pln">{Hash Function, <a href="lexer.html#1:34">34</a>}</span> -<span class="nocode pln">{String Destructor, <a href="lexer.html#1:36">36</a>}</span> +<span class="nocode pln">{String Comparison, <a href="lexer.html#1:37">37</a>}</span> +<span class="nocode pln">{Hash Function, <a href="lexer.html#1:36">36</a>}</span> +<span class="nocode pln">{String Destructor, <a href="lexer.html#1:38">38</a>}</span> hash_table_t *string_table; token_t *token_create_string(c_token_types kind, int lin, int col, const char *s, int len) { @@ -1513,10 +1531,10 @@ token_t *token_create_string(c_token_types kind, int lin, int col, </pre> -<p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:41">41</a></p> </div> </div> -<a name="1:38"><div class="section"><h4 class="noheading">38. </h4></a> +<a name="1:40"><div class="section"><h4 class="noheading">40. </h4></a> <p>We'll add an external declaration for <code>string_table</code> in <code>token.h</code> so other programs can take advantage of it. </p> @@ -1541,39 +1559,40 @@ extern int line; </div> </div> -<a name="1:39"><div class="section"><h4 class="noheading">39. </h4></a> +<a name="1:41"><div class="section"><h4 class="noheading">41. </h4></a> <p>Finally, we implement the token data structure in <code>token.c</code>. 
</p> <div class="codeblock"> -<span class="codeblock_name">{<strong>token.c</strong> <a href="lexer.html#1:39">39</a>}</span> +<span class="codeblock_name">{<strong>token.c</strong> <a href="lexer.html#1:41">41</a>}</span> <pre class="prettyprint lang-c"> #include &lt;stdlib.h&gt; #include &lt;string.h&gt; #include &lt;stdio.h&gt; #include &lt;assert.h&gt; +#include &lt;ctype.h&gt; #include "token.h" #include "hash_table.h" -<span class="nocode pln">{Token Data Structure, <a href="lexer.html#1:13">13</a>}</span> -<span class="nocode pln">{Token Data Access, <a href="lexer.html#1:14">14</a>}</span> -<span class="nocode pln">{Token Creation and Destruction, <a href="lexer.html#1:16">16</a>}</span> -<span class="nocode pln">{Token Create String, <a href="lexer.html#1:17">17</a>}</span> -<span class="nocode pln">{Token Debugging, <a href="lexer.html#1:15">15</a>}</span> +<span class="nocode pln">{Token Data Structure, <a href="lexer.html#1:12">12</a>}</span> +<span class="nocode pln">{Token Data Access, <a href="lexer.html#1:13">13</a>}</span> +<span class="nocode pln">{Token Creation and Destruction, <a href="lexer.html#1:18">18</a>}</span> +<span class="nocode pln">{Token Create String, <a href="lexer.html#1:19">19</a>}</span> +<span class="nocode pln">{Token Debugging, <a href="lexer.html#1:14">14</a>}</span> </pre> </div> </div> -<a name="1:40"><div class="section"><h4>40. Input</h4></a> +<a name="1:42"><div class="section"><h4>42. Input</h4></a> <p>Input will provide a simple interface for reading characters from a file. The stream itself is deliberately hidden from the tokenizer, so that the tokenizer doesn't have to worry about buffering or anything like that. </p> </div> -<a name="1:41"><div class="section"><h4 class="noheading">41. </h4></a> +<a name="1:43"><div class="section"><h4 class="noheading">43. 
</h4></a> <div class="codeblock"> -<span class="codeblock_name">{Input Interface <a href="lexer.html#1:41">41</a>}</span> +<span class="codeblock_name">{Input Interface <a href="lexer.html#1:43">43</a>}</span> <pre class="prettyprint lang-c"> void input_init(const char *filename); int input_getc(void); @@ -1582,7 +1601,7 @@ void input_destroy(void); </pre> -<p class="seealso">Used in section <a href="lexer.html#1:50">50</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:52">52</a></p> </div> <p>When the program wants to start reading a file, it calls <code>input_init</code> with the filename. It can then call <code>input_getc</code> to get the next character in the file. If there's no more input, <code>input_getc</code> will return <code>EOF</code>. </p> @@ -1592,7 +1611,7 @@ void input_destroy(void); </p> </div> -<a name="1:42"><div class="section"><h4>42. Input Design Decisions</h4></a> +<a name="1:44"><div class="section"><h4>44. Input Design Decisions</h4></a> <p>Per rule 1, we're trying to keep memory usage low. That means that instead of reading the entire file into memory, we'll need to read it in chunks. There are a couple of choices for how to do this: </p> <ol> @@ -1609,15 +1628,15 @@ void input_destroy(void); </p> </div> -<a name="1:43"><div class="section"><h4>43. Input Implementation</h4></a> +<a name="1:45"><div class="section"><h4>45. Input Implementation</h4></a> <p>The implementation of the input module is pretty straightforward. We have the following data structures and defines as globals: </p> </div> -<a name="1:44"><div class="section"><h4 class="noheading">44. </h4></a> +<a name="1:46"><div class="section"><h4 class="noheading">46. 
</h4></a> <div class="codeblock"> -<span class="codeblock_name">{Input Data <a href="lexer.html#1:44">44</a>}</span> +<span class="codeblock_name">{Input Data <a href="lexer.html#1:46">46</a>}</span> <pre class="prettyprint lang-c"> #define CHUNK_SIZE 128 static char buffer[CHUNK_SIZE]; @@ -1630,16 +1649,16 @@ static FILE *file = NULL; </pre> -<p class="seealso">Used in section <a href="lexer.html#1:49">49</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:51">51</a></p> </div> <p>When the program calls <code>input_init</code>, we open the file. </p> </div> -<a name="1:45"><div class="section"><h4 class="noheading">45. </h4></a> +<a name="1:47"><div class="section"><h4 class="noheading">47. </h4></a> <div class="codeblock"> -<span class="codeblock_name">{Input Initialization <a href="lexer.html#1:45">45</a>}</span> +<span class="codeblock_name">{Input Initialization <a href="lexer.html#1:47">47</a>}</span> <pre class="prettyprint lang-c"> void input_init(const char *filename) { file = fopen(filename, "r"); @@ -1651,16 +1670,16 @@ void input_init(const char *filename) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:49">49</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:51">51</a></p> </div> <p>When the program calls <code>input_getc</code>, we return the next character in the buffer. If the buffer is exhausted, we call <code>nextline</code>. We also track the line and column. </p> </div> -<a name="1:46"><div class="section"><h4 class="noheading">46. </h4></a> +<a name="1:48"><div class="section"><h4 class="noheading">48. 
</h4></a> <div class="codeblock"> -<span class="codeblock_name">{Input Get Character <a href="lexer.html#1:46">46</a>}</span> +<span class="codeblock_name">{Input Get Character <a href="lexer.html#1:48">48</a>}</span> <pre class="prettyprint lang-c"> int input_getc(void) { if (unget_buffer_stack_pos &gt; 0) { @@ -1679,16 +1698,16 @@ int input_getc(void) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:49">49</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:51">51</a></p> </div> <p>When the program calls <code>input_ungetc</code>, we save the character in the <code>unget_buffer</code>. </p> </div> -<a name="1:47"><div class="section"><h4 class="noheading">47. </h4></a> +<a name="1:49"><div class="section"><h4 class="noheading">49. </h4></a> <div class="codeblock"> -<span class="codeblock_name">{Input Unget Character <a href="lexer.html#1:47">47</a>}</span> +<span class="codeblock_name">{Input Unget Character <a href="lexer.html#1:49">49</a>}</span> <pre class="prettyprint lang-c"> void input_ungetc(int c) { unget_buffer_stack[unget_buffer_stack_pos++] = c; @@ -1696,16 +1715,16 @@ void input_ungetc(int c) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:49">49</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:51">51</a></p> </div> <p>Since we're not using dynamic memory allocation, cleanup is pretty simple. </p> </div> -<a name="1:48"><div class="section"><h4 class="noheading">48. </h4></a> +<a name="1:50"><div class="section"><h4 class="noheading">50. 
</h4></a> <div class="codeblock"> -<span class="codeblock_name">{Input Destroy <a href="lexer.html#1:48">48</a>}</span> +<span class="codeblock_name">{Input Destroy <a href="lexer.html#1:50">50</a>}</span> <pre class="prettyprint lang-c"> void input_destroy(void) { fclose(file); @@ -1713,40 +1732,40 @@ void input_destroy(void) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:49">49</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:51">51</a></p> </div> </div> -<a name="1:49"><div class="section"><h4 class="noheading">49. </h4></a> +<a name="1:51"><div class="section"><h4 class="noheading">51. </h4></a> <p>We put the whole thing together in <code>input.c</code>. </p> <div class="codeblock"> -<span class="codeblock_name">{<strong>input.c</strong> <a href="lexer.html#1:49">49</a>}</span> +<span class="codeblock_name">{<strong>input.c</strong> <a href="lexer.html#1:51">51</a>}</span> <pre class="prettyprint lang-c"> #include &lt;stdio.h&gt; #include &lt;stdlib.h&gt; #include "input.h" -<span class="nocode pln">{Input Data, <a href="lexer.html#1:44">44</a>}</span> -<span class="nocode pln">{Input Initialization, <a href="lexer.html#1:45">45</a>}</span> -<span class="nocode pln">{Input Get Character, <a href="lexer.html#1:46">46</a>}</span> -<span class="nocode pln">{Input Unget Character, <a href="lexer.html#1:47">47</a>}</span> -<span class="nocode pln">{Input Destroy, <a href="lexer.html#1:48">48</a>}</span> +<span class="nocode pln">{Input Data, <a href="lexer.html#1:46">46</a>}</span> +<span class="nocode pln">{Input Initialization, <a href="lexer.html#1:47">47</a>}</span> +<span class="nocode pln">{Input Get Character, <a href="lexer.html#1:48">48</a>}</span> +<span class="nocode pln">{Input Unget Character, <a href="lexer.html#1:49">49</a>}</span> +<span class="nocode pln">{Input Destroy, <a href="lexer.html#1:50">50</a>}</span> </pre> </div> </div> -<a name="1:50"><div class="section"><h4 class="noheading">50. 
</h4></a> +<a name="1:52"><div class="section"><h4 class="noheading">52. </h4></a> <p>We'll need an external declaration for <code>file</code> in <code>input.h</code> so other programs can take advantage of it. </p> <div class="codeblock"> -<span class="codeblock_name">{<strong>input.h</strong> <a href="lexer.html#1:50">50</a>}</span> +<span class="codeblock_name">{<strong>input.h</strong> <a href="lexer.html#1:52">52</a>}</span> <pre class="prettyprint lang-c"> #ifndef INPUT_H #define INPUT_H -<span class="nocode pln">{Input Interface, <a href="lexer.html#1:41">41</a>}</span> +<span class="nocode pln">{Input Interface, <a href="lexer.html#1:43">43</a>}</span> #endif </pre> @@ -1754,18 +1773,18 @@ void input_destroy(void) { </div> </div> -<a name="1:51"><div class="section"><h4 class="noheading">51. </h4></a> +<a name="1:53"><div class="section"><h4 class="noheading">53. </h4></a> <p>We'll implement the lexer interface in <code>tokenizer.h</code> </p> <div class="codeblock"> -<span class="codeblock_name">{<strong>tokenizer.h</strong> <a href="lexer.html#1:51">51</a>}</span> +<span class="codeblock_name">{<strong>tokenizer.h</strong> <a href="lexer.html#1:53">53</a>}</span> <pre class="prettyprint lang-c"> #ifndef TOKENIZER_H #define TOKENIZER_H #include "token.h" #include "input.h" -<span class="nocode pln">{Tokenization Interface, <a href="lexer.html#1:52">52</a>}</span> +<span class="nocode pln">{Tokenization Interface, <a href="lexer.html#1:54">54</a>}</span> #endif </pre> @@ -1773,7 +1792,7 @@ void input_destroy(void) { </div> </div> -<a name="1:52"><div class="section"><h4 class="noheading">52. </h4></a> +<a name="1:54"><div class="section"><h4 class="noheading">54. </h4></a> <p>The tokenization interface will have a couple of functions. <code>next_token</code> will return the next token in the input stream, <code>init_tokenizer</code> will initialize the tokenizer, and <code>destroy_tokenizer</code> will clean up. 
</p> <p>We'll also have some helper functions for lookahead and matching. @@ -1784,7 +1803,7 @@ void input_destroy(void) { </p> <div class="codeblock"> -<span class="codeblock_name">{Tokenization Interface <a href="lexer.html#1:52">52</a>}</span> +<span class="codeblock_name">{Tokenization Interface <a href="lexer.html#1:54">54</a>}</span> <pre class="prettyprint lang-c"> void init_tokenizer(const char *filename); void destroy_tokenizer(void); @@ -1796,15 +1815,15 @@ void consume_alt(c_token_types *kinds, int n); </pre> -<p class="seealso">Used in section <a href="lexer.html#1:51">51</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:53">53</a></p> </div> </div> -<a name="1:53"><div class="section"><h4 class="noheading">53. </h4></a> +<a name="1:55"><div class="section"><h4 class="noheading">55. </h4></a> <p>Now we can finally implement the tokenizer. </p> <div class="codeblock"> -<span class="codeblock_name">{<strong>tokenizer.c</strong> <a href="lexer.html#1:53">53</a>}</span> +<span class="codeblock_name">{<strong>tokenizer.c</strong> <a href="lexer.html#1:55">55</a>}</span> <pre class="prettyprint lang-c"> #include &lt;assert.h&gt; #include &lt;ctype.h&gt; @@ -1821,20 +1840,20 @@ void consume_alt(c_token_types *kinds, int n); #include "input.h" token_t *left_stack[8]; int left_stack_pos = 0; -<span class="nocode pln">{Utility Functions, <a href="lexer.html#1:54">54</a>}</span> -<span class="nocode pln">{Tokenization Function, <a href="lexer.html#1:56">56</a>}</span> +<span class="nocode pln">{Utility Functions, <a href="lexer.html#1:56">56</a>}</span> +<span class="nocode pln">{Tokenization Function, <a href="lexer.html#1:58">58</a>}</span> </pre> </div> </div> -<a name="1:54"><div class="section"><h4 class="noheading">54. </h4></a> +<a name="1:56"><div class="section"><h4 class="noheading">56. </h4></a> <p>Utility functions are everything that doesn't directly tokenize the input. 
</p> <div class="codeblock"> -<span class="codeblock_name">{Utility Functions <a href="lexer.html#1:54">54</a>}</span> +<span class="codeblock_name">{Utility Functions <a href="lexer.html#1:56">56</a>}</span> <pre class="prettyprint lang-c"> void init_tokenizer(const char *filename) { input_init(filename); @@ -1857,7 +1876,7 @@ token_t *peek_token(void) { return token; } -<span class="nocode pln">{Stringify Type, <a href="lexer.html#1:55">55</a>}</span> +<span class="nocode pln">{Stringify Type, <a href="lexer.html#1:57">57</a>}</span> void consume(c_token_types kind) { token_t *token = next_token(); @@ -1886,195 +1905,195 @@ void consume_alt(c_token_types *kinds, int n) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:53">53</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:55">55</a></p> </div> </div> -<a name="1:55"><div class="section"><h4 class="noheading">55. </h4></a> +<a name="1:57"><div class="section"><h4 class="noheading">57. </h4></a> <p>We'll need a helper function to convert token types to strings. It's pretty simple, just tedious. 
</p> <div class="codeblock"> -<span class="codeblock_name">{Stringify Type <a href="lexer.html#1:55">55</a>}</span> +<span class="codeblock_name">{Stringify Type <a href="lexer.html#1:57">57</a>}</span> <pre class="prettyprint lang-c"> const char *stringify_type(c_token_types type) { switch (type) { - case TOK_CTK_IF: + case TOK_IF: return "if"; - case TOK_CTK_ELSE: + case TOK_ELSE: return "else"; - case TOK_CTK_SWITCH: + case TOK_SWITCH: return "switch"; - case TOK_CTK_CASE: + case TOK_CASE: return "case"; - case TOK_CTK_DEFAULT: + case TOK_DEFAULT: return "default"; - case TOK_CTK_WHILE: + case TOK_WHILE: return "while"; - case TOK_CTK_DO: + case TOK_DO: return "do"; - case TOK_CTK_FOR: + case TOK_FOR: return "for"; - case TOK_CTK_CONTINUE: + case TOK_CONTINUE: return "continue"; - case TOK_CTK_BREAK: + case TOK_BREAK: return "break"; - case TOK_CTK_RETURN: + case TOK_RETURN: return "return"; - case TOK_CTK_GOTO: + case TOK_GOTO: return "goto"; - case TOK_TK_VOID: + case TOK_VOID: return "void"; - case TOK_TK_CHAR: + case TOK_CHAR: return "char"; - case TOK_TK_SHORT: + case TOK_SHORT: return "short"; - case TOK_TK_INT: + case TOK_INT: return "int"; - case TOK_TK_LONG: + case TOK_LONG: return "long"; - case TOK_TK_FLOAT: + case TOK_FLOAT: return "float"; - case TOK_TK_DOUBLE: + case TOK_DOUBLE: return "double"; - case TOK_TK_SIGNED: + case TOK_SIGNED: return "signed"; - case TOK_TK_UNSIGNED: + case TOK_UNSIGNED: return "unsigned"; - case TOK_TK_STRUCT: + case TOK_STRUCT: return "struct"; - case TOK_TK_UNION: + case TOK_UNION: return "union"; - case TOK_TK_ENUM: + case TOK_ENUM: return "enum"; - case TOK_TK_TYPEDEF: + case TOK_TYPEDEF: return "typedef"; - case TOK_SCSK_AUTO: + case TOK_AUTO: return "auto"; - case TOK_SCSK_REGISTER: + case TOK_REGISTER: return "register"; - case TOK_SCSK_STATIC: + case TOK_STATIC: return "static"; - case TOK_SCSK_EXTERN: + case TOK_EXTERN: return "extern"; - case TOK_SCSK_CONST: + case TOK_CONST: return "const"; - case 
TOK_SCSK_VOLATILE: + case TOK_VOLATILE: return "volatile"; - case TOK_MK_SIZEOF: + case TOK_SIZEOF: return "sizeof"; - case TOK_OP_ADD: + case TOK_ADD: return "+"; - case TOK_OP_SUB: + case TOK_SUB: return "-"; - case TOK_OP_MUL: + case TOK_MUL: return "*"; - case TOK_OP_DIV: + case TOK_DIV: return "/"; - case TOK_OP_MOD: + case TOK_MOD: return "%"; - case TOK_OP_BIT_AND: + case TOK_BIT_AND: return "&amp;"; - case TOK_OP_BIT_OR: + case TOK_BIT_OR: return "|"; - case TOK_OP_BIT_XOR: + case TOK_BIT_XOR: return "^"; - case TOK_OP_BIT_NOT: + case TOK_BIT_NOT: return "~"; - case TOK_OP_LSHIFT: + case TOK_LSHIFT: return "&lt;&lt;"; - case TOK_OP_RSHIFT: + case TOK_RSHIFT: return "&gt;&gt;"; - case TOK_OP_NOT: + case TOK_NOT: return "!"; - case TOK_OP_ASSIGN: + case TOK_ASSIGN: return "="; - case TOK_OP_LT: + case TOK_LT: return "&lt;"; - case TOK_OP_GT: + case TOK_GT: return "&gt;"; - case TOK_OP_INC: + case TOK_INC: return "++"; - case TOK_OP_DEC: + case TOK_DEC: return "--"; - case TOK_OP_EQ: + case TOK_EQ: return "=="; - case TOK_OP_NE: + case TOK_NE: return "!="; - case TOK_OP_LE: + case TOK_LE: return "&lt;="; - case TOK_OP_GE: + case TOK_GE: return "&gt;="; - case TOK_OP_AND: + case TOK_AND: return "&amp;&amp;"; - case TOK_OP_OR: + case TOK_OR: return "||"; - case TOK_OP_MEMBER_POINTER: + case TOK_MEMBER_POINTER: return "-&gt;"; - case TOK_OP_MEMBER: + case TOK_MEMBER: return "."; - case TOK_OP_COND_DECISION: + case TOK_COND_DECISION: return ":"; - case TOK_OP_COND: + case TOK_COND: return "?"; - case TOK_OP_ASSIGN_ADD: + case TOK_ASSIGN_ADD: return "+="; - case TOK_OP_ASSIGN_SUB: + case TOK_ASSIGN_SUB: return "-="; - case TOK_OP_ASSIGN_MUL: + case TOK_ASSIGN_MUL: return "*="; - case TOK_OP_ASSIGN_DIV: + case TOK_ASSIGN_DIV: return "/="; - case TOK_OP_ASSIGN_MOD: + case TOK_ASSIGN_MOD: return "%="; - case TOK_OP_ASSIGN_BITAND: + case TOK_ASSIGN_BITAND: return "&amp;="; - case TOK_OP_ASSIGN_BITOR: + case TOK_ASSIGN_BITOR: return "|="; - case TOK_OP_ASSIGN_BITXOR: + 
case TOK_ASSIGN_BITXOR: return "^="; - case TOK_OP_ASSIGN_LSHIFT: + case TOK_ASSIGN_LSHIFT: return "&lt;&lt;="; - case TOK_OP_ASSIGN_RSHIFT: + case TOK_ASSIGN_RSHIFT: return "&gt;&gt;="; - case TOK_SEP_HASH: + case TOK_HASH: return "#"; case TOK_ID: return "identifier"; - case TOK_CONST_INTEGER_U32: - case TOK_CONST_INTEGER_U64: - case TOK_CONST_INTEGER_S32: - case TOK_CONST_INTEGER_S64: + case TOK_INTEGER_U32: + case TOK_INTEGER_U64: + case TOK_INTEGER_S32: + case TOK_INTEGER_S64: return "integer constant"; - case TOK_CONST_FLOAT_32: - case TOK_CONST_FLOAT_64: + case TOK_FLOAT_32: + case TOK_FLOAT_64: return "floating constant"; - case TOK_CONST_CHAR: + case TOK_CHAR_CONST: return "character constant"; - case TOK_CONST_STRING_ASCII: + case TOK_STRING_ASCII: return "string constant"; - case TOK_SPECIAL_EOF: + case TOK_EOF: return "EOF"; - case TOK_SPECIAL_ERROR: + case TOK_ERROR: return "error"; - case TOK_SEP_LEFT_PAREN: + case TOK_LEFT_PAREN: return "("; - case TOK_SEP_RIGHT_PAREN: + case TOK_RIGHT_PAREN: return ")"; - case TOK_SEP_LEFT_BRACKET: + case TOK_LEFT_BRACKET: return "["; - case TOK_SEP_RIGHT_BRACKET: + case TOK_RIGHT_BRACKET: return "]"; - case TOK_SEP_LEFT_BRACE: + case TOK_LEFT_BRACE: return "{"; - case TOK_SEP_RIGHT_BRACE: + case TOK_RIGHT_BRACE: return "}"; - case TOK_SEP_COMMA: + case TOK_COMMA: return ","; - case TOK_SEP_SEMICOLON: + case TOK_SEMICOLON: return ";"; - case TOK_SEP_DOT: + case TOK_DOT: return "."; - case TOK_SEP_ELLIPSIS: + case TOK_ELLIPSIS: return "..."; } return "UNKNOWN"; @@ -2082,25 +2101,25 @@ const char *stringify_type(c_token_types type) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:54">54</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p> </div> </div> -<a name="1:56"><div class="section"><h4 class="noheading">56. </h4></a> +<a name="1:58"><div class="section"><h4 class="noheading">58. </h4></a> <p>Now we can implement the tokenization function. 
The pattern is pretty simple: we call each of the tokenization functions in turn until we find a match. If we don't find a match, we print an error message and exit. You might wonder why skip_whitespace can return a token. This makes handling the divide operator easier as comments also start with a slash. </p> <div class="codeblock"> -<span class="codeblock_name">{Tokenization Function <a href="lexer.html#1:56">56</a>}</span> +<span class="codeblock_name">{Tokenization Function <a href="lexer.html#1:58">58</a>}</span> <pre class="prettyprint lang-c"> char file_name[1024]; -<span class="nocode pln">{Warning/Error Functions, <a href="lexer.html#1:57">57</a>}</span> -<span class="nocode pln">{Skip Whitespace, <a href="lexer.html#1:58">58</a>}</span> -<span class="nocode pln">{Tokenize Identifier, <a href="lexer.html#1:59">59</a>}</span> -<span class="nocode pln">{Tokenize Number, <a href="lexer.html#1:62">62</a>}</span> -<span class="nocode pln">{Tokenize String, <a href="lexer.html#1:71">71</a>}</span> -<span class="nocode pln">{Tokenize Character, <a href="lexer.html#1:70">70</a>}</span> -<span class="nocode pln">{Tokenize Operator, <a href="lexer.html#1:61">61</a>}</span> +<span class="nocode pln">{Warning/Error Functions, <a href="lexer.html#1:59">59</a>}</span> +<span class="nocode pln">{Skip Whitespace, <a href="lexer.html#1:60">60</a>}</span> +<span class="nocode pln">{Tokenize Identifier, <a href="lexer.html#1:61">61</a>}</span> +<span class="nocode pln">{Tokenize Number, <a href="lexer.html#1:64">64</a>}</span> +<span class="nocode pln">{Tokenize String, <a href="lexer.html#1:73">73</a>}</span> +<span class="nocode pln">{Tokenize Character, <a href="lexer.html#1:72">72</a>}</span> +<span class="nocode pln">{Tokenize Operator, <a href="lexer.html#1:63">63</a>}</span> token_t *next_token(void) { if (left_stack_pos &gt; 0) { return left_stack[--left_stack_pos]; @@ -2140,20 +2159,20 @@ token_t *next_token(void) { } #ifdef TEST_TOKENIZER -<span class="nocode 
pln">{Run Test, <a href="lexer.html#1:73">73</a>}</span> +<span class="nocode pln">{Run Test, <a href="lexer.html#1:75">75</a>}</span> #endif </pre> -<p class="seealso">Used in section <a href="lexer.html#1:53">53</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:55">55</a></p> </div> </div> -<a name="1:57"><div class="section"><h4 class="noheading">57. </h4></a> +<a name="1:59"><div class="section"><h4 class="noheading">59. </h4></a> <p>We'll need a couple of helper functions to skip whitespace and print warnings/errors. </p> <div class="codeblock"> -<span class="codeblock_name">{Warning/Error Functions <a href="lexer.html#1:57">57</a>}</span> +<span class="codeblock_name">{Warning/Error Functions <a href="lexer.html#1:59">59</a>}</span> <pre class="prettyprint lang-c"> void tok_error(const char *fmt, ...) { va_list args; @@ -2175,15 +2194,15 @@ void tok_warn(const char *fmt, ...) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:58">58</a></p> </div> </div> -<a name="1:58"><div class="section"><h4 class="noheading">58. </h4></a> +<a name="1:60"><div class="section"><h4 class="noheading">60. </h4></a> <p>The <code>skip_whitespace</code> function is pretty simple. It just skips over any comments, whitespace, and line directives. </p> <div class="codeblock"> -<span class="codeblock_name">{Skip Whitespace <a href="lexer.html#1:58">58</a>}</span> +<span class="codeblock_name">{Skip Whitespace <a href="lexer.html#1:60">60</a>}</span> <pre class="prettyprint lang-c"> static token_t *skip_whitespace(void) { int c; @@ -2242,9 +2261,9 @@ static token_t *skip_whitespace(void) { } } else { // Handled here to simplify the code. 
if (c == '=') - return token_create(TOK_OP_ASSIGN_DIV, line, column, 2); + return token_create(TOK_ASSIGN_DIV, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_DIV, line, column, 1); + return token_create(TOK_DIV, line, column, 1); } } else { input_ungetc(c); @@ -2256,17 +2275,17 @@ static token_t *skip_whitespace(void) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:58">58</a></p> </div> </div> -<a name="1:59"><div class="section"><h4 class="noheading">59. </h4></a> +<a name="1:61"><div class="section"><h4 class="noheading">61. </h4></a> <p>The <code>read_identifier</code> function reads an identifier from the input stream. C identifiers can contain letters, digits, and underscores, but they can't start with a digit. </p> <div class="codeblock"> -<span class="codeblock_name">{Tokenize Identifier <a href="lexer.html#1:59">59</a>}</span> +<span class="codeblock_name">{Tokenize Identifier <a href="lexer.html#1:61">61</a>}</span> <pre class="prettyprint lang-c"> -<span class="nocode pln">{Get Keyword, <a href="lexer.html#1:60">60</a>}</span> +<span class="nocode pln">{Get Keyword, <a href="lexer.html#1:62">62</a>}</span> static token_t *read_identifier(void) { int c; char buf[1024]; @@ -2300,45 +2319,45 @@ static token_t *read_identifier(void) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:58">58</a></p> </div> </div> -<a name="1:60"><div class="section"><h4 class="noheading">60. </h4></a> +<a name="1:62"><div class="section"><h4 class="noheading">62. </h4></a> <p>The <code>get_keyword</code> function is a simple decision tree for identifying keywords. The code is pretty tedious, but it works. 
</p> <div class="codeblock"> -<span class="codeblock_name">{Get Keyword <a href="lexer.html#1:60">60</a>}</span> +<span class="codeblock_name">{Get Keyword <a href="lexer.html#1:62">62</a>}</span> <pre class="prettyprint lang-c"> c_token_types get_keyword(const char *buf, int len) { switch (buf[0]) { case 'a': if (len == 4 &amp;&amp; buf[1] == 'u' &amp;&amp; buf[2] == 't' &amp;&amp; buf[3] == 'o') - return TOK_SCSK_AUTO; + return TOK_AUTO; break; case 'b': if (len == 5 &amp;&amp; buf[1] == 'r' &amp;&amp; buf[2] == 'e' &amp;&amp; buf[3] == 'a' &amp;&amp; buf[4] == 'k') - return TOK_CTK_BREAK; + return TOK_BREAK; break; case 'c': switch (buf[1]) { case 'a': if (len == 4 &amp;&amp; buf[2] == 's' &amp;&amp; buf[3] == 'e') - return TOK_CTK_CASE; + return TOK_CASE; break; case 'h': if (len == 4 &amp;&amp; buf[2] == 'a' &amp;&amp; buf[3] == 'r') - return TOK_TK_CHAR; + return TOK_CHAR; break; case 'o': if (len == 5 &amp;&amp; buf[2] == 'n' &amp;&amp; buf[3] == 's' &amp;&amp; buf[4] == 't') - return TOK_SCSK_CONST; + return TOK_CONST; if (len == 8 &amp;&amp; buf[2] == 'n' &amp;&amp; buf[3] == 't' &amp;&amp; buf[4] == 'i' &amp;&amp; buf[5] == 'n' &amp;&amp; buf[6] == 'u' &amp;&amp; buf[7] == 'e') - return TOK_CTK_CONTINUE; + return TOK_CONTINUE; break; } break; @@ -2348,14 +2367,14 @@ c_token_types get_keyword(const char *buf, int len) { case 'e': if (len == 7 &amp;&amp; buf[2] == 'f' &amp;&amp; buf[3] == 'a' &amp;&amp; buf[4] == 'u' &amp;&amp; buf[5] == 'l' &amp;&amp; buf[6] == 't') - return TOK_CTK_DEFAULT; + return TOK_DEFAULT; break; case 'o': if (len == 2 &amp;&amp; buf[2] == '\0') - return TOK_CTK_DO; + return TOK_DO; if (len == 6 &amp;&amp; buf[2] == 'u' &amp;&amp; buf[3] == 'b' &amp;&amp; buf[4] == 'l' &amp;&amp; buf[5] == 'e') - return TOK_TK_DOUBLE; + return TOK_DOUBLE; break; } break; @@ -2364,16 +2383,16 @@ c_token_types get_keyword(const char *buf, int len) { switch (buf[1]) { case 'l': if (len == 4 &amp;&amp; buf[2] == 's' &amp;&amp; buf[3] == 'e') - return 
TOK_CTK_ELSE; + return TOK_ELSE; break; case 'n': if (len == 4 &amp;&amp; buf[2] == 'u' &amp;&amp; buf[3] == 'm') - return TOK_TK_ENUM; + return TOK_ENUM; break; case 'x': if (len == 6 &amp;&amp; buf[2] == 't' &amp;&amp; buf[3] == 'e' &amp;&amp; buf[4] == 'r' &amp;&amp; buf[5] == 'n') - return TOK_SCSK_EXTERN; + return TOK_EXTERN; break; } break; @@ -2382,36 +2401,36 @@ c_token_types get_keyword(const char *buf, int len) { switch (buf[1]) { case 'l': if (len == 5 &amp;&amp; buf[2] == 'o' &amp;&amp; buf[3] == 'a' &amp;&amp; buf[4] == 't') - return TOK_TK_FLOAT; + return TOK_FLOAT; break; case 'o': if (len == 3 &amp;&amp; buf[2] == 'r') - return TOK_CTK_FOR; + return TOK_FOR; break; } break; case 'g': if (len == 4 &amp;&amp; buf[1] == 'o' &amp;&amp; buf[2] == 't' &amp;&amp; buf[3] == 'o') - return TOK_CTK_GOTO; + return TOK_GOTO; break; case 'i': switch (buf[1]) { case 'f': if (len == 2 &amp;&amp; buf[2] == '\0') - return TOK_CTK_IF; + return TOK_IF; break; case 'n': if (len == 3 &amp;&amp; buf[2] == 't') - return TOK_TK_INT; + return TOK_INT; break; } break; case 'l': if (len == 4 &amp;&amp; buf[1] == 'o' &amp;&amp; buf[2] == 'n' &amp;&amp; buf[3] == 'g') - return TOK_TK_LONG; + return TOK_LONG; break; case 'r': @@ -2419,10 +2438,10 @@ c_token_types get_keyword(const char *buf, int len) { case 'e': if (len == 8 &amp;&amp; buf[2] == 'g' &amp;&amp; buf[3] == 'i' &amp;&amp; buf[4] == 's' &amp;&amp; buf[5] == 't' &amp;&amp; buf[6] == 'e' &amp;&amp; buf[7] == 'r') - return TOK_SCSK_REGISTER; + return TOK_REGISTER; if (len == 6 &amp;&amp; buf[2] == 't' &amp;&amp; buf[3] == 'u' &amp;&amp; buf[4] == 'r' &amp;&amp; buf[5] == 'n') - return TOK_CTK_RETURN; + return TOK_RETURN; break; } break; @@ -2431,29 +2450,29 @@ c_token_types get_keyword(const char *buf, int len) { switch (buf[1]) { case 'h': if (len == 5 &amp;&amp; buf[2] == 'o' &amp;&amp; buf[3] == 'r' &amp;&amp; buf[4] == 't') - return TOK_TK_SHORT; + return TOK_SHORT; break; case 't': if (len == 6 &amp;&amp; buf[2] == 
'a' &amp;&amp; buf[3] == 't' &amp;&amp; buf[4] == 'i' &amp;&amp; buf[5] == 'c') - return TOK_SCSK_STATIC; + return TOK_STATIC; break; case 'i': if (len == 6 &amp;&amp; buf[2] == 'g' &amp;&amp; buf[3] == 'n' &amp;&amp; buf[4] == 'e' &amp;&amp; buf[5] == 'd') - return TOK_TK_SIGNED; + return TOK_SIGNED; if (len == 6 &amp;&amp; buf[2] == 'z' &amp;&amp; buf[3] == 'e' &amp;&amp; buf[4] == 'o' &amp;&amp; buf[5] == 'f') - return TOK_MK_SIZEOF; + return TOK_SIZEOF; break; case 'r': if (len == 6 &amp;&amp; buf[2] == 'u' &amp;&amp; buf[3] == 'c' &amp;&amp; buf[4] == 't') - return TOK_TK_STRUCT; + return TOK_STRUCT; break; case 'w': if (len == 6 &amp;&amp; buf[2] == 'i' &amp;&amp; buf[3] == 't' &amp;&amp; buf[4] == 'c' &amp;&amp; buf[5] == 'h') - return TOK_CTK_SWITCH; + return TOK_SWITCH; break; } break; @@ -2461,17 +2480,17 @@ c_token_types get_keyword(const char *buf, int len) { case 't': if (len == 7 &amp;&amp; buf[1] == 'y' &amp;&amp; buf[2] == 'p' &amp;&amp; buf[3] == 'e' &amp;&amp; buf[4] == 'd' &amp;&amp; buf[5] == 'e' &amp;&amp; buf[6] == 'f') - return TOK_TK_TYPEDEF; + return TOK_TYPEDEF; break; case 'u': switch (buf[1]) { case 'n': if (len == 5 &amp;&amp; buf[2] == 'i' &amp;&amp; buf[3] == 'o' &amp;&amp; buf[4] == 'n') - return TOK_TK_UNION; + return TOK_UNION; if (len == 8 &amp;&amp; buf[2] == 's' &amp;&amp; buf[3] == 'i' &amp;&amp; buf[4] == 'g' &amp;&amp; buf[5] == 'n' &amp;&amp; buf[6] == 'e' &amp;&amp; buf[7] == 'd') - return TOK_TK_UNSIGNED; + return TOK_UNSIGNED; break; } break; @@ -2480,10 +2499,10 @@ c_token_types get_keyword(const char *buf, int len) { switch (buf[1]) { case 'o': if (len == 4 &amp;&amp; buf[2] == 'i' &amp;&amp; buf[3] == 'd') - return TOK_TK_VOID; + return TOK_VOID; if (len == 8 &amp;&amp; buf[2] == 'l' &amp;&amp; buf[3] == 'a' &amp;&amp; buf[4] == 't' &amp;&amp; buf[5] == 'i' &amp;&amp; buf[6] == 'l' &amp;&amp; buf[7] == 'e') - return TOK_SCSK_VOLATILE; + return TOK_VOLATILE; break; } break; @@ -2491,7 +2510,7 @@ c_token_types 
get_keyword(const char *buf, int len) { case 'w': if (len == 5 &amp;&amp; buf[1] == 'h' &amp;&amp; buf[2] == 'i' &amp;&amp; buf[3] == 'l' &amp;&amp; buf[4] == 'e') - return TOK_CTK_WHILE; + return TOK_WHILE; break; default: @@ -2502,15 +2521,15 @@ c_token_types get_keyword(const char *buf, int len) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:59">59</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:61">61</a></p> </div> </div> -<a name="1:61"><div class="section"><h4 class="noheading">61. </h4></a> +<a name="1:63"><div class="section"><h4 class="noheading">63. </h4></a> <p>The <code>read_operator</code> function works similarly to the <code>read_identifier</code> function. It uses a decision tree to identify operators. </p> <div class="codeblock"> -<span class="codeblock_name">{Tokenize Operator <a href="lexer.html#1:61">61</a>}</span> +<span class="codeblock_name">{Tokenize Operator <a href="lexer.html#1:63">63</a>}</span> <pre class="prettyprint lang-c"> token_t *read_operator(void) { @@ -2520,65 +2539,65 @@ token_t *read_operator(void) { case '!': { c = input_getc(); if (c == '=') - return token_create(TOK_OP_NE, line, column, 2); + return token_create(TOK_NE, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_NOT, line, column, 1); + return token_create(TOK_NOT, line, column, 1); } case '%': { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN_MOD, line, column, 2); + return token_create(TOK_ASSIGN_MOD, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_MOD, line, column, 1); + return token_create(TOK_MOD, line, column, 1); } case '&amp;': { c = input_getc(); if (c == '&amp;') - return token_create(TOK_OP_AND, line, column, 2); + return token_create(TOK_AND, line, column, 2); if (c == '=') - return token_create(TOK_OP_ASSIGN_BITAND, line, column, 2); + return token_create(TOK_ASSIGN_BITAND, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_BIT_AND, line, column, 1); + 
return token_create(TOK_BIT_AND, line, column, 1); } case '(': - return token_create(TOK_SEP_LEFT_PAREN, line, column, 1); + return token_create(TOK_LEFT_PAREN, line, column, 1); case ')': - return token_create(TOK_SEP_RIGHT_PAREN, line, column, 1); + return token_create(TOK_RIGHT_PAREN, line, column, 1); case '*': { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN_MUL, line, column, 2); + return token_create(TOK_ASSIGN_MUL, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_MUL, line, column, 1); + return token_create(TOK_MUL, line, column, 1); } case '+': { c = input_getc(); if (c == '+') - return token_create(TOK_OP_INC, line, column, 2); + return token_create(TOK_INC, line, column, 2); if (c == '=') - return token_create(TOK_OP_ASSIGN_ADD, line, column, 2); + return token_create(TOK_ASSIGN_ADD, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_ADD, line, column, 2); + return token_create(TOK_ADD, line, column, 2); } case ',': - return token_create(TOK_SEP_COMMA, line, column, 1); + return token_create(TOK_COMMA, line, column, 1); case '-': { c = input_getc(); if (c == '-') - return token_create(TOK_OP_DEC, line, column, 2); + return token_create(TOK_DEC, line, column, 2); if (c == '=') - return token_create(TOK_OP_ASSIGN_SUB, line, column, 2); + return token_create(TOK_ASSIGN_SUB, line, column, 2); if (c == '&gt;') - return token_create(TOK_OP_MEMBER_POINTER, line, column, 2); + return token_create(TOK_MEMBER_POINTER, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_SUB, line, column, 1); + return token_create(TOK_SUB, line, column, 1); } case '.': { c = input_getc(); if (c == '.') { c = input_getc(); if (c == '.') { - return token_create(TOK_SEP_ELLIPSIS, line, column, 3); + return token_create(TOK_ELLIPSIS, line, column, 3); } else { // Bail out, can't store more than one unget tok_error("Unexpected character '.' 
at line %d, column %d\n", line, @@ -2591,77 +2610,77 @@ token_t *read_operator(void) { case '/': { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN_DIV, line, column, 2); + return token_create(TOK_ASSIGN_DIV, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_DIV, line, column, 1); + return token_create(TOK_DIV, line, column, 1); } case ':': - return token_create(TOK_OP_COND_DECISION, line, column, 1); + return token_create(TOK_COND_DECISION, line, column, 1); case ';': - return token_create(TOK_SEP_SEMICOLON, line, column, 1); + return token_create(TOK_SEMICOLON, line, column, 1); case '&lt;': { c = input_getc(); if (c == '&lt;') { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN_LSHIFT, line, column, 3); + return token_create(TOK_ASSIGN_LSHIFT, line, column, 3); input_ungetc(c); - return token_create(TOK_OP_LSHIFT, line, column, 2); + return token_create(TOK_LSHIFT, line, column, 2); } if (c == '=') - return token_create(TOK_OP_LE, line, column, 2); + return token_create(TOK_LE, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_LT, line, column, 1); + return token_create(TOK_LT, line, column, 1); } case '=': { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN, line, column, 2); + return token_create(TOK_ASSIGN, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_ASSIGN, line, column, 1); + return token_create(TOK_ASSIGN, line, column, 1); } case '&gt;': { c = input_getc(); if (c == '&gt;') { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN_RSHIFT, line, column, 3); + return token_create(TOK_ASSIGN_RSHIFT, line, column, 3); input_ungetc(c); - return token_create(TOK_OP_RSHIFT, line, column, 2); + return token_create(TOK_RSHIFT, line, column, 2); } if (c == '=') - return token_create(TOK_OP_GE, line, column, 2); + return token_create(TOK_GE, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_GT, line, column, 1); + return 
token_create(TOK_GT, line, column, 1); } case '?': - return token_create(TOK_OP_COND, line, column, 1); + return token_create(TOK_COND, line, column, 1); case '[': - return token_create(TOK_SEP_LEFT_BRACKET, line, column, 1); + return token_create(TOK_LEFT_BRACKET, line, column, 1); case ']': - return token_create(TOK_SEP_RIGHT_BRACKET, line, column, 1); + return token_create(TOK_RIGHT_BRACKET, line, column, 1); case '^': { c = input_getc(); if (c == '=') - return token_create(TOK_OP_ASSIGN_BITXOR, line, column, 2); + return token_create(TOK_ASSIGN_BITXOR, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_BIT_XOR, line, column, 1); + return token_create(TOK_BIT_XOR, line, column, 1); } case '{': - return token_create(TOK_SEP_LEFT_BRACE, line, column, 1); + return token_create(TOK_LEFT_BRACE, line, column, 1); case '|': { c = input_getc(); if (c == '|') - return token_create(TOK_OP_OR, line, column, 2); + return token_create(TOK_OR, line, column, 2); if (c == '=') - return token_create(TOK_OP_ASSIGN_BITOR, line, column, 2); + return token_create(TOK_ASSIGN_BITOR, line, column, 2); input_ungetc(c); - return token_create(TOK_OP_BIT_OR, line, column, 1); + return token_create(TOK_BIT_OR, line, column, 1); } case '}': - return token_create(TOK_SEP_RIGHT_BRACE, line, column, 1); + return token_create(TOK_RIGHT_BRACE, line, column, 1); case '~': - return token_create(TOK_OP_BIT_NOT, line, column, 1); + return token_create(TOK_BIT_NOT, line, column, 1); default: input_ungetc(c); return NULL; @@ -2672,50 +2691,50 @@ token_t *read_operator(void) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:58">58</a></p> </div> </div> -<a name="1:62"><div class="section"><h4 class="noheading">62. </h4></a> +<a name="1:64"><div class="section"><h4 class="noheading">64. </h4></a> <p>The <code>read_number</code> function reads a number from the input stream. 
It can be an integer or a floating-point number. </p> <p>I've broken it up a bit to make it easier to read. </p> <div class="codeblock"> -<span class="codeblock_name">{Tokenize Number <a href="lexer.html#1:62">62</a>}</span> +<span class="codeblock_name">{Tokenize Number <a href="lexer.html#1:64">64</a>}</span> <pre class="prettyprint lang-c"> static token_t *read_number(void) { int c; char buf[1024]; int i = 0; c = input_getc(); -<span class="nocode pln"> {Check for valid prefix, <a href="lexer.html#1:63">63</a>}</span> +<span class="nocode pln"> {Check for valid prefix, <a href="lexer.html#1:65">65</a>}</span> int radix = 10; -<span class="nocode pln"> {Process Radix, <a href="lexer.html#1:64">64</a>}</span> +<span class="nocode pln"> {Process Radix, <a href="lexer.html#1:66">66</a>}</span> int is_float = 0; -<span class="nocode pln"> {Read Number Loop, <a href="lexer.html#1:65">65</a>}</span> +<span class="nocode pln"> {Read Number Loop, <a href="lexer.html#1:67">67</a>}</span> buf[i] = '\0'; -<span class="nocode pln"> {Process Suffixes, <a href="lexer.html#1:66">66</a>}</span> -<span class="nocode pln"> {Check for conflicting suffixes, <a href="lexer.html#1:67">67</a>}</span> +<span class="nocode pln"> {Process Suffixes, <a href="lexer.html#1:68">68</a>}</span> +<span class="nocode pln"> {Check for conflicting suffixes, <a href="lexer.html#1:69">69</a>}</span> if (is_float) { -<span class="nocode pln"> {Convert to float, <a href="lexer.html#1:68">68</a>}</span> +<span class="nocode pln"> {Convert to float, <a href="lexer.html#1:70">70</a>}</span> } else { -<span class="nocode pln"> {Convert to integer, <a href="lexer.html#1:69">69</a>}</span> +<span class="nocode pln"> {Convert to integer, <a href="lexer.html#1:71">71</a>}</span> } return NULL; } </pre> -<p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:58">58</a></p> </div> </div> -<a name="1:63"><div class="section"><h4 
class="noheading">63. </h4></a> +<a name="1:65"><div class="section"><h4 class="noheading">65. </h4></a> <p>To determine if a character is a valid prefix for a number, we need to check if it's a digit or a period followed by a digit </p> <div class="codeblock"> -<span class="codeblock_name">{Check for valid prefix <a href="lexer.html#1:63">63</a>}</span> +<span class="codeblock_name">{Check for valid prefix <a href="lexer.html#1:65">65</a>}</span> <pre class="prettyprint lang-c"> // If we don't have a digit or decimal point, it's not a number if (!isdigit(c) &amp;&amp; c != '.') { @@ -2727,22 +2746,22 @@ static token_t *read_number(void) { char cnext = input_getc(); if (!isdigit(cnext)) { input_ungetc(cnext); - return token_create(TOK_OP_MEMBER, line, column, 1); + return token_create(TOK_MEMBER, line, column, 1); } input_ungetc(cnext); } </pre> -<p class="seealso">Used in section <a href="lexer.html#1:62">62</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:64">64</a></p> </div> </div> -<a name="1:64"><div class="section"><h4 class="noheading">64. </h4></a> +<a name="1:66"><div class="section"><h4 class="noheading">66. </h4></a> <p>A C constant starting with a zero is either an octal or hexadecimal constant. We need to check the next character to determine which one it is. </p> <div class="codeblock"> -<span class="codeblock_name">{Process Radix <a href="lexer.html#1:64">64</a>}</span> +<span class="codeblock_name">{Process Radix <a href="lexer.html#1:66">66</a>}</span> <pre class="prettyprint lang-c"> // Check for hex and octal. if (c == '0') { @@ -2762,13 +2781,13 @@ static token_t *read_number(void) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:62">62</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:64">64</a></p> </div> </div> -<a name="1:65"><div class="section"><h4 class="noheading">65. </h4></a> +<a name="1:67"><div class="section"><h4 class="noheading">67. 
</h4></a> <div class="codeblock"> -<span class="codeblock_name">{Read Number Loop <a href="lexer.html#1:65">65</a>}</span> +<span class="codeblock_name">{Read Number Loop <a href="lexer.html#1:67">67</a>}</span> <pre class="prettyprint lang-c"> while ((c = input_getc()) != EOF) { // Since there can be multiple writes to the buffer, we want to make sure we @@ -2818,15 +2837,15 @@ static token_t *read_number(void) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:62">62</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:64">64</a></p> </div> </div> -<a name="1:66"><div class="section"><h4 class="noheading">66. </h4></a> +<a name="1:68"><div class="section"><h4 class="noheading">68. </h4></a> <p>C constants can have suffixes to indicate their type. We need to check for these suffixes and set the appropriate flags. </p> <div class="codeblock"> -<span class="codeblock_name">{Process Suffixes <a href="lexer.html#1:66">66</a>}</span> +<span class="codeblock_name">{Process Suffixes <a href="lexer.html#1:68">68</a>}</span> <pre class="prettyprint lang-c"> int is_unsigned = 0; int is_long = 0; @@ -2859,15 +2878,15 @@ static token_t *read_number(void) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:62">62</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:64">64</a></p> </div> </div> -<a name="1:67"><div class="section"><h4 class="noheading">67. </h4></a> +<a name="1:69"><div class="section"><h4 class="noheading">69. </h4></a> <p>If we find conflicting suffixes, we print a warning and ignore the suffixes. 
</p> <div class="codeblock"> -<span class="codeblock_name">{Check for conflicting suffixes <a href="lexer.html#1:67">67</a>}</span> +<span class="codeblock_name">{Check for conflicting suffixes <a href="lexer.html#1:69">69</a>}</span> <pre class="prettyprint lang-c"> if (is_single &amp;&amp; is_long) { tok_warn("Warning: Invalid suffixes 'l' and 'f' for floating point " @@ -2887,15 +2906,15 @@ static token_t *read_number(void) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:62">62</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:64">64</a></p> </div> </div> -<a name="1:68"><div class="section"><h4 class="noheading">68. </h4></a> -<p>If the constant is a floating-point number, we convert it to a float. We need to make sure that the number is in range for the given type and check for errors from strtod +<a name="1:70"><div class="section"><h4 class="noheading">70. </h4></a> +<p>If the string contains a float, we pass it to strtod. We need to make sure that the number is in range for the given type and check for errors from strtod </p> <div class="codeblock"> -<span class="codeblock_name">{Convert to float <a href="lexer.html#1:68">68</a>}</span> +<span class="codeblock_name">{Convert to float <a href="lexer.html#1:70">70</a>}</span> <pre class="prettyprint lang-c"> errno = 0; // Strtod generates a unix-style error when it's given something out of @@ -2919,21 +2938,21 @@ static token_t *read_number(void) { "precision\n", f); } - return token_create_float(is_single ? TOK_CONST_FLOAT_32 - : TOK_CONST_FLOAT_64, + return token_create_float(is_single ? TOK_FLOAT_32 + : TOK_FLOAT_64, line, column, f, i); </pre> -<p class="seealso">Used in section <a href="lexer.html#1:62">62</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:64">64</a></p> </div> </div> -<a name="1:69"><div class="section"><h4 class="noheading">69. </h4></a> -<p>If the constant is an integer, we convert it to an integer. 
We need to make sure that the number is in range for the given type and check for errors from strtoll +<a name="1:71"><div class="section"><h4 class="noheading">71. </h4></a> +<p>If the string contains a number, we pass it to stroll. We need to make sure that the number is in range for the given type and check for errors from strtoll </p> <div class="codeblock"> -<span class="codeblock_name">{Convert to integer <a href="lexer.html#1:69">69</a>}</span> +<span class="codeblock_name">{Convert to integer <a href="lexer.html#1:71">71</a>}</span> <pre class="prettyprint lang-c"> errno = 0; uint64_t int_ = strtoull(buf, NULL, radix); @@ -2944,7 +2963,7 @@ static token_t *read_number(void) { } if (is_unsigned) { if (is_long) { - return token_create_int(TOK_CONST_INTEGER_U64, line, column, int_, i); + return token_create_int(TOK_INTEGER_U64, line, column, int_, i); } else { if (int_ &gt; UINT32_MAX) { tok_warn( @@ -2952,7 +2971,7 @@ static token_t *read_number(void) { "int\n", int_); } - return token_create_int(TOK_CONST_INTEGER_U32, line, column, int_, i); + return token_create_int(TOK_INTEGER_U32, line, column, int_, i); } } else { if (is_long) { @@ -2963,27 +2982,27 @@ static token_t *read_number(void) { "Warning: Integer constant %lld is out of range for long long\n", i); } - return token_create_int(TOK_CONST_INTEGER_S64, line, column, int_, i); + return token_create_int(TOK_INTEGER_S64, line, column, int_, i); } else { if (int_ &amp; (1UL &lt;&lt; 31)) { tok_warn("Warning: Integer constant %lld is out of range for int\n", int_); } - return token_create_int(TOK_CONST_INTEGER_S32, line, column, int_, i); + return token_create_int(TOK_INTEGER_S32, line, column, int_, i); } } </pre> -<p class="seealso">Used in section <a href="lexer.html#1:62">62</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:64">64</a></p> </div> </div> -<a name="1:70"><div class="section"><h4 class="noheading">70. </h4></a> +<a name="1:72"><div class="section"><h4 class="noheading">72. 
</h4></a> <p>The <code>read_char_constant</code> function reads a character constant from the input stream. It can be a single character or a multi-character escape sequence. </p> <div class="codeblock"> -<span class="codeblock_name">{Tokenize Character <a href="lexer.html#1:70">70</a>}</span> +<span class="codeblock_name">{Tokenize Character <a href="lexer.html#1:72">72</a>}</span> <pre class="prettyprint lang-c"> static token_t *read_char_constant(void) { int c; @@ -3009,15 +3028,15 @@ static token_t *read_char_constant(void) { return NULL; } len++; - return token_create_char(TOK_CONST_CHAR, line, column, val, len); + return token_create_char(TOK_CHAR_CONST, line, column, val, len); } </pre> -<p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:58">58</a></p> </div> </div> -<a name="1:71"><div class="section"><h4 class="noheading">71. </h4></a> +<a name="1:73"><div class="section"><h4 class="noheading">73. </h4></a> <p>The <code>read_string_literal</code> function reads a string literal from the input stream. </p> <p>For this function, an automatic-lifetime buffer is used to store the string it becomes too large. At that point, a heap-allocated buffer is used. @@ -3025,9 +3044,9 @@ This way we can avoid unnecessary heap allocations for small strings. 
</p> <div class="codeblock"> -<span class="codeblock_name">{Tokenize String <a href="lexer.html#1:71">71</a>}</span> +<span class="codeblock_name">{Tokenize String <a href="lexer.html#1:73">73</a>}</span> <pre class="prettyprint lang-c"> -<span class="nocode pln">{Read Escape Sequence, <a href="lexer.html#1:72">72</a>}</span> +<span class="nocode pln">{Read Escape Sequence, <a href="lexer.html#1:74">74</a>}</span> static token_t *read_string_literal(void) { int c; c = input_getc(); @@ -3076,7 +3095,7 @@ static token_t *read_string_literal(void) { return NULL; } - token_t *tok = token_create_string(TOK_CONST_STRING_ASCII, line, column, buf, + token_t *tok = token_create_string(TOK_STRING_ASCII, line, column, buf, i + esc_pad + 2); if (buf != s_buf) { free(buf); @@ -3086,15 +3105,15 @@ static token_t *read_string_literal(void) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:58">58</a></p> </div> </div> -<a name="1:72"><div class="section"><h4 class="noheading">72. </h4></a> +<a name="1:74"><div class="section"><h4 class="noheading">74. </h4></a> <p>Escape sequences in C can either be single characters or octal/hexadecimal values. We need to handle both cases. </p> <div class="codeblock"> -<span class="codeblock_name">{Read Escape Sequence <a href="lexer.html#1:72">72</a>}</span> +<span class="codeblock_name">{Read Escape Sequence <a href="lexer.html#1:74">74</a>}</span> <pre class="prettyprint lang-c"> static char read_escape_sequence(int *len) { int c = input_getc(); @@ -3157,15 +3176,15 @@ static char read_escape_sequence(int *len) { </pre> -<p class="seealso">Used in section <a href="lexer.html#1:71">71</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:73">73</a></p> </div> </div> -<a name="1:73"><div class="section"><h4 class="noheading">73. </h4></a> +<a name="1:75"><div class="section"><h4 class="noheading">75. 
</h4></a> <p>Finally, I'll add some code for running the tokenizer as its own program. This way we can test it out. </p> <div class="codeblock"> -<span class="codeblock_name">{Run Test <a href="lexer.html#1:73">73</a>}</span> +<span class="codeblock_name">{Run Test <a href="lexer.html#1:75">75</a>}</span> <pre class="prettyprint lang-c"> char *preprocess(char *in) { char *output_name = malloc(1024); @@ -3194,12 +3213,13 @@ int main(int argc, char **argv) { destroy_tokenizer(); remove(preprocessed); free(preprocessed); + hash_table_destroy(string_table); return 0; } </pre> -<p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p> +<p class="seealso">Used in section <a href="lexer.html#1:58">58</a></p> </div> <h3> Bugs/Errata</h3> <p>I wrote this code in a single sitting, so there are bound to be bugs. I'll list them here as I find them. The code you see here is the final version, with all bugs fixed. @@ -3207,7 +3227,7 @@ int main(int argc, char **argv) { <ul> <li>had <code>buffer_pos == buffer_size - 1</code>, left in from trying to plug some code for lookahead in, didn't work out, but I forgot to remove it, causes fallthrough to <code>buffer_size == 0</code> check which if true returns EOF, preventing input initialization. Fixed by changing to <code>buffer_pos == buffer_size</code>. </li> -<li>assertion <code>token-&gt;kind == TOK_CONST_STRING_ASCII</code> failed in token_string. Forgot to expand check for identifiers which also use token_string. Fixed by changing to <code>token-&gt;kind == TOK_CONST_STRING_ASCII || token-&gt;kind == TOK_ID || token-&gt;kind == TOK_TID</code>. +<li>assertion <code>token-&gt;kind == TOK_STRING_ASCII</code> failed in token_string. Forgot to expand check for identifiers which also use token_string. Fixed by changing to <code>token-&gt;kind == TOK_STRING_ASCII || token-&gt;kind == TOK_ID || token-&gt;kind == TOK_TID</code>. </li> <li>token_create_string - call to <code>hash_table_get</code> with freed key. 
Fixed by moving the call to free after the call to <code>hash_table_get</code>. </li> diff --git a/projects/cminus/lexer_new.html b/projects/cminus/lexer_new.html @@ -1,288 +0,0 @@ -<!DOCTYPE html> -<html> -<head> -<meta charset="utf-8"> -<title>Lexer</title> -<script> -!function(){var q=null;window.PR_SHOULD_USE_CONTINUATION=!0; -(function(){function R(a){function d(e){var b=e.charCodeAt(0);if(b!==92)return b;var a=e.charAt(1);return(b=r[a])?b:"0"<=a&&a<="7"?parseInt(e.substring(1),8):a==="u"||a==="x"?parseInt(e.substring(2),16):e.charCodeAt(1)}function g(e){if(e<32)return(e<16?"\\x0":"\\x")+e.toString(16);e=String.fromCharCode(e);return e==="\\"||e==="-"||e==="]"||e==="^"?"\\"+e:e}function b(e){var b=e.substring(1,e.length-1).match(/\\u[\dA-Fa-f]{4}|\\x[\dA-Fa-f]{2}|\\[0-3][0-7]{0,2}|\\[0-7]{1,2}|\\[\S\s]|[^\\]/g),e=[],a= -b[0]==="^",c=["["];a&&c.push("^");for(var a=a?1:0,f=b.length;a<f;++a){var h=b[a];if(/\\[bdsw]/i.test(h))c.push(h);else{var h=d(h),l;a+2<f&&"-"===b[a+1]?(l=d(b[a+2]),a+=2):l=h;e.push([h,l]);l<65||h>122||(l<65||h>90||e.push([Math.max(65,h)|32,Math.min(l,90)|32]),l<97||h>122||e.push([Math.max(97,h)&-33,Math.min(l,122)&-33]))}}e.sort(function(e,a){return e[0]-a[0]||a[1]-e[1]});b=[];f=[];for(a=0;a<e.length;++a)h=e[a],h[0]<=f[1]+1?f[1]=Math.max(f[1],h[1]):b.push(f=h);for(a=0;a<b.length;++a)h=b[a],c.push(g(h[0])), -h[1]>h[0]&&(h[1]+1>h[0]&&c.push("-"),c.push(g(h[1])));c.push("]");return c.join("")}function s(e){for(var a=e.source.match(/\[(?:[^\\\] ]|\\[\S\s])*]|\\u[\dA-Fa-f]{4}|\\x[\dA-Fa-f]{2}|\\\d+|\\[^\dux]|\(\?[!:=]|[()^]|[^()[\\^]+/g),c=a.length,d=[],f=0,h=0;f<c;++f){var l=a[f];l==="("?++h:"\\"===l.charAt(0)&&(l=+l.substring(1))&&(l<=h?d[l]=-1:a[f]=g(l))}for(f=1;f<d.length;++f)-1===d[f]&&(d[f]=++x);for(h=f=0;f<c;++f)l=a[f],l==="("?(++h,d[h]||(a[f]="(?:")):"\\"===l.charAt(0)&&(l=+l.substring(1))&&l<=h&& 
-(a[f]="\\"+d[l]);for(f=0;f<c;++f)"^"===a[f]&&"^"!==a[f+1]&&(a[f]="");if(e.ignoreCase&&m)for(f=0;f<c;++f)l=a[f],e=l.charAt(0),l.length>=2&&e==="["?a[f]=b(l):e!=="\\"&&(a[f]=l.replace(/[A-Za-z]/g,function(a){a=a.charCodeAt(0);return"["+String.fromCharCode(a&-33,a|32)+"]"}));return a.join("")}for(var x=0,m=!1,j=!1,k=0,c=a.length;k<c;++k){var i=a[k];if(i.ignoreCase)j=!0;else if(/[a-z]/i.test(i.source.replace(/\\u[\da-f]{4}|\\x[\da-f]{2}|\\[^UXux]/gi,""))){m=!0;j=!1;break}}for(var r={b:8,t:9,n:10,v:11, -f:12,r:13},n=[],k=0,c=a.length;k<c;++k){i=a[k];if(i.global||i.multiline)throw Error(""+i);n.push("(?:"+s(i)+")")}return RegExp(n.join("|"),j?"gi":"g")}function S(a,d){function g(a){var c=a.nodeType;if(c==1){if(!b.test(a.className)){for(c=a.firstChild;c;c=c.nextSibling)g(c);c=a.nodeName.toLowerCase();if("br"===c||"li"===c)s[j]="\n",m[j<<1]=x++,m[j++<<1|1]=a}}else if(c==3||c==4)c=a.nodeValue,c.length&&(c=d?c.replace(/\r\n?/g,"\n"):c.replace(/[\t\n\r ]+/g," "),s[j]=c,m[j<<1]=x,x+=c.length,m[j++<<1|1]= -a)}var b=/(?:^|\s)nocode(?:\s|$)/,s=[],x=0,m=[],j=0;g(a);return{a:s.join("").replace(/\n$/,""),d:m}}function H(a,d,g,b){d&&(a={a:d,e:a},g(a),b.push.apply(b,a.g))}function T(a){for(var d=void 0,g=a.firstChild;g;g=g.nextSibling)var b=g.nodeType,d=b===1?d?a:g:b===3?U.test(g.nodeValue)?a:d:d;return d===a?void 0:d}function D(a,d){function g(a){for(var j=a.e,k=[j,"pln"],c=0,i=a.a.match(s)||[],r={},n=0,e=i.length;n<e;++n){var z=i[n],w=r[z],t=void 0,f;if(typeof w==="string")f=!1;else{var h=b[z.charAt(0)]; -if(h)t=z.match(h[1]),w=h[0];else{for(f=0;f<x;++f)if(h=d[f],t=z.match(h[1])){w=h[0];break}t||(w="pln")}if((f=w.length>=5&&"lang-"===w.substring(0,5))&&!(t&&typeof t[1]==="string"))f=!1,w="src";f||(r[z]=w)}h=c;c+=z.length;if(f){f=t[1];var l=z.indexOf(f),B=l+f.length;t[2]&&(B=z.length-t[2].length,l=B-f.length);w=w.substring(5);H(j+h,z.substring(0,l),g,k);H(j+h+l,f,I(w,f),k);H(j+h+B,z.substring(B),g,k)}else k.push(j+h,w)}a.g=k}var b={},s;(function(){for(var 
g=a.concat(d),j=[],k={},c=0,i=g.length;c<i;++c){var r= -g[c],n=r[3];if(n)for(var e=n.length;--e>=0;)b[n.charAt(e)]=r;r=r[1];n=""+r;k.hasOwnProperty(n)||(j.push(r),k[n]=q)}j.push(/[\S\s]/);s=R(j)})();var x=d.length;return g}function v(a){var d=[],g=[];a.tripleQuotedStrings?d.push(["str",/^(?:'''(?:[^'\\]|\\[\S\s]|''?(?=[^']))*(?:'''|$)|"""(?:[^"\\]|\\[\S\s]|""?(?=[^"]))*(?:"""|$)|'(?:[^'\\]|\\[\S\s])*(?:'|$)|"(?:[^"\\]|\\[\S\s])*(?:"|$))/,q,"'\""]):a.multiLineStrings?d.push(["str",/^(?:'(?:[^'\\]|\\[\S\s])*(?:'|$)|"(?:[^"\\]|\\[\S\s])*(?:"|$)|`(?:[^\\`]|\\[\S\s])*(?:`|$))/, -q,"'\"`"]):d.push(["str",/^(?:'(?:[^\n\r'\\]|\\.)*(?:'|$)|"(?:[^\n\r"\\]|\\.)*(?:"|$))/,q,"\"'"]);a.verbatimStrings&&g.push(["str",/^@"(?:[^"]|"")*(?:"|$)/,q]);var b=a.hashComments;b&&(a.cStyleComments?(b>1?d.push(["com",/^#(?:##(?:[^#]|#(?!##))*(?:###|$)|.*)/,q,"#"]):d.push(["com",/^#(?:(?:define|e(?:l|nd)if|else|error|ifn?def|include|line|pragma|undef|warning)\b|[^\n\r]*)/,q,"#"]),g.push(["str",/^<(?:(?:(?:\.\.\/)*|\/?)(?:[\w-]+(?:\/[\w-]+)+)?[\w-]+\.h(?:h|pp|\+\+)?|[a-z]\w*)>/,q])):d.push(["com", -/^#[^\n\r]*/,q,"#"]));a.cStyleComments&&(g.push(["com",/^\/\/[^\n\r]*/,q]),g.push(["com",/^\/\*[\S\s]*?(?:\*\/|$)/,q]));if(b=a.regexLiterals){var s=(b=b>1?"":"\n\r")?".":"[\\S\\s]";g.push(["lang-regex",RegExp("^(?:^^\\.?|[+-]|[!=]=?=?|\\#|%=?|&&?=?|\\(|\\*=?|[+\\-]=|->|\\/=?|::?|<<?=?|>>?>?=?|,|;|\\?|@|\\[|~|{|\\^\\^?=?|\\|\\|?=?|break|case|continue|delete|do|else|finally|instanceof|return|throw|try|typeof)\\s*("+("/(?=[^/*"+b+"])(?:[^/\\x5B\\x5C"+b+"]|\\x5C"+s+"|\\x5B(?:[^\\x5C\\x5D"+b+"]|\\x5C"+ -s+")*(?:\\x5D|$))+/")+")")])}(b=a.types)&&g.push(["typ",b]);b=(""+a.keywords).replace(/^ | $/g,"");b.length&&g.push(["kwd",RegExp("^(?:"+b.replace(/[\s,]+/g,"|")+")\\b"),q]);d.push(["pln",/^\s+/,q," 
\r\n\t\u00a0"]);b="^.[^\\s\\w.$@'\"`/\\\\]*";a.regexLiterals&&(b+="(?!s*/)");g.push(["lit",/^@[$_a-z][\w$@]*/i,q],["typ",/^(?:[@_]?[A-Z]+[a-z][\w$@]*|\w+_t\b)/,q],["pln",/^[$_a-z][\w$@]*/i,q],["lit",/^(?:0x[\da-f]+|(?:\d(?:_\d+)*\d*(?:\.\d*)?|\.\d\+)(?:e[+-]?\d+)?)[a-z]*/i,q,"0123456789"],["pln",/^\\[\S\s]?/, -q],["pun",RegExp(b),q]);return D(d,g)}function J(a,d,g){function b(a){var c=a.nodeType;if(c==1&&!x.test(a.className))if("br"===a.nodeName)s(a),a.parentNode&&a.parentNode.removeChild(a);else for(a=a.firstChild;a;a=a.nextSibling)b(a);else if((c==3||c==4)&&g){var d=a.nodeValue,i=d.match(m);if(i)c=d.substring(0,i.index),a.nodeValue=c,(d=d.substring(i.index+i[0].length))&&a.parentNode.insertBefore(j.createTextNode(d),a.nextSibling),s(a),c||a.parentNode.removeChild(a)}}function s(a){function b(a,c){var d= -c?a.cloneNode(!1):a,e=a.parentNode;if(e){var e=b(e,1),g=a.nextSibling;e.appendChild(d);for(var i=g;i;i=g)g=i.nextSibling,e.appendChild(i)}return d}for(;!a.nextSibling;)if(a=a.parentNode,!a)return;for(var a=b(a.nextSibling,0),d;(d=a.parentNode)&&d.nodeType===1;)a=d;c.push(a)}for(var x=/(?:^|\s)nocode(?:\s|$)/,m=/\r\n?|\n/,j=a.ownerDocument,k=j.createElement("li");a.firstChild;)k.appendChild(a.firstChild);for(var c=[k],i=0;i<c.length;++i)b(c[i]);d===(d|0)&&c[0].setAttribute("value",d);var r=j.createElement("ol"); -r.className="linenums";for(var d=Math.max(0,d-1|0)||0,i=0,n=c.length;i<n;++i)k=c[i],k.className="L"+(i+d)%10,k.firstChild||k.appendChild(j.createTextNode("\u00a0")),r.appendChild(k);a.appendChild(r)}function p(a,d){for(var g=d.length;--g>=0;){var b=d[g];F.hasOwnProperty(b)?E.console&&console.warn("cannot override language handler %s",b):F[b]=a}}function I(a,d){if(!a||!F.hasOwnProperty(a))a=/^\s*</.test(d)?"default-markup":"default-code";return F[a]}function K(a){var d=a.h;try{var g=S(a.c,a.i),b=g.a; -a.a=b;a.d=g.d;a.e=0;I(d,b)(a);var 
s=/\bMSIE\s(\d+)/.exec(navigator.userAgent),s=s&&+s[1]<=8,d=/\n/g,x=a.a,m=x.length,g=0,j=a.d,k=j.length,b=0,c=a.g,i=c.length,r=0;c[i]=m;var n,e;for(e=n=0;e<i;)c[e]!==c[e+2]?(c[n++]=c[e++],c[n++]=c[e++]):e+=2;i=n;for(e=n=0;e<i;){for(var p=c[e],w=c[e+1],t=e+2;t+2<=i&&c[t+1]===w;)t+=2;c[n++]=p;c[n++]=w;e=t}c.length=n;var f=a.c,h;if(f)h=f.style.display,f.style.display="none";try{for(;b<k;){var l=j[b+2]||m,B=c[r+2]||m,t=Math.min(l,B),A=j[b+1],G;if(A.nodeType!==1&&(G=x.substring(g, -t))){s&&(G=G.replace(d,"\r"));A.nodeValue=G;var L=A.ownerDocument,o=L.createElement("span");o.className=c[r+1];var v=A.parentNode;v.replaceChild(o,A);o.appendChild(A);g<l&&(j[b+1]=A=L.createTextNode(x.substring(t,l)),v.insertBefore(A,o.nextSibling))}g=t;g>=l&&(b+=2);g>=B&&(r+=2)}}finally{if(f)f.style.display=h}}catch(u){E.console&&console.log(u&&u.stack||u)}}var E=window,y=["break,continue,do,else,for,if,return,while"],C=[[y,"auto,case,char,const,default,double,enum,extern,float,goto,inline,int,long,register,short,signed,sizeof,static,struct,switch,typedef,union,unsigned,void,volatile"], -"catch,class,delete,false,import,new,operator,private,protected,public,this,throw,true,try,typeof"],M=[C,"alignof,align_union,asm,axiom,bool,concept,concept_map,const_cast,constexpr,decltype,delegate,dynamic_cast,explicit,export,friend,generic,late_check,mutable,namespace,nullptr,property,reinterpret_cast,static_assert,static_cast,template,typeid,typename,using,virtual,where"],V=[C,"abstract,assert,boolean,byte,extends,final,finally,implements,import,instanceof,interface,null,native,package,strictfp,super,synchronized,throws,transient"], 
-N=[C,"abstract,as,base,bool,by,byte,checked,decimal,delegate,descending,dynamic,event,finally,fixed,foreach,from,group,implicit,in,interface,internal,into,is,let,lock,null,object,out,override,orderby,params,partial,readonly,ref,sbyte,sealed,stackalloc,string,select,uint,ulong,unchecked,unsafe,ushort,var,virtual,where"],C=[C,"debugger,eval,export,function,get,null,set,undefined,var,with,Infinity,NaN"],O=[y,"and,as,assert,class,def,del,elif,except,exec,finally,from,global,import,in,is,lambda,nonlocal,not,or,pass,print,raise,try,with,yield,False,True,None"], -P=[y,"alias,and,begin,case,class,def,defined,elsif,end,ensure,false,in,module,next,nil,not,or,redo,rescue,retry,self,super,then,true,undef,unless,until,when,yield,BEGIN,END"],W=[y,"as,assert,const,copy,drop,enum,extern,fail,false,fn,impl,let,log,loop,match,mod,move,mut,priv,pub,pure,ref,self,static,struct,true,trait,type,unsafe,use"],y=[y,"case,done,elif,esac,eval,fi,function,in,local,set,then,until"],Q=/^(DIR|FILE|vector|(de|priority_)?queue|list|stack|(const_)?iterator|(multi)?(set|map)|bitset|u?(int|float)\d*)\b/, -U=/\S/,X=v({keywords:[M,N,C,"caller,delete,die,do,dump,elsif,eval,exit,foreach,for,goto,if,import,last,local,my,next,no,our,print,package,redo,require,sub,undef,unless,until,use,wantarray,while,BEGIN,END",O,P,y],hashComments:!0,cStyleComments:!0,multiLineStrings:!0,regexLiterals:!0}),F={};p(X,["default-code"]);p(D([],[["pln",/^[^<?]+/],["dec",/^<!\w[^>]*(?:>|$)/],["com",/^<\!--[\S\s]*?(?:--\>|$)/],["lang-",/^<\?([\S\s]+?)(?:\?>|$)/],["lang-",/^<%([\S\s]+?)(?:%>|$)/],["pun",/^(?:<[%?]|[%?]>)/],["lang-", -/^<xmp\b[^>]*>([\S\s]+?)<\/xmp\b[^>]*>/i],["lang-js",/^<script\b[^>]*>([\S\s]*?)(<\/script\b[^>]*>)/i],["lang-css",/^<style\b[^>]*>([\S\s]*?)(<\/style\b[^>]*>)/i],["lang-in.tag",/^(<\/?[a-z][^<>]*>)/i] ]),["default-markup","htm","html","mxml","xhtml","xml","xsl"]);p(D([["pln",/^\s+/,q," \t\r\n"],["atv",/^(?:"[^"]*"?|'[^']*'?)/,q,"\"'"] 
],[["tag",/^^<\/?[a-z](?:[\w-.:]*\w)?|\/?>$/i],["atn",/^(?!style[\s=]|on)[a-z](?:[\w:-]*\w)?/i],["lang-uq.val",/^=\s*([^\s"'>]*(?:[^\s"'/>]|\/(?=\s)))/],["pun",/^[/<->]+/], -["lang-js",/^on\w+\s*=\s*"([^"]+)"/i],["lang-js",/^on\w+\s*=\s*'([^']+)'/i],["lang-js",/^on\w+\s*=\s*([^\s"'>]+)/i],["lang-css",/^style\s*=\s*"([^"]+)"/i],["lang-css",/^style\s*=\s*'([^']+)'/i],["lang-css",/^style\s*=\s*([^\s"'>]+)/i] ]),["in.tag"]);p(D([],[["atv",/^[\S\s]+/] ]),["uq.val"]);p(v({keywords:M,hashComments:!0,cStyleComments:!0,types:Q}),["c","cc","cpp","cxx","cyc","m"]);p(v({keywords:"null,true,false"}),["json"]);p(v({keywords:N,hashComments:!0,cStyleComments:!0,verbatimStrings:!0,types:Q}), -["cs"]);p(v({keywords:V,cStyleComments:!0}),["java"]);p(v({keywords:y,hashComments:!0,multiLineStrings:!0}),["bash","bsh","csh","sh"]);p(v({keywords:O,hashComments:!0,multiLineStrings:!0,tripleQuotedStrings:!0}),["cv","py","python"]);p(v({keywords:"caller,delete,die,do,dump,elsif,eval,exit,foreach,for,goto,if,import,last,local,my,next,no,our,print,package,redo,require,sub,undef,unless,until,use,wantarray,while,BEGIN,END",hashComments:!0,multiLineStrings:!0,regexLiterals:2}),["perl","pl","pm"]);p(v({keywords:P, -hashComments:!0,multiLineStrings:!0,regexLiterals:!0}),["rb","ruby"]);p(v({keywords:C,cStyleComments:!0,regexLiterals:!0}),["javascript","js"]);p(v({keywords:"all,and,by,catch,class,else,extends,false,finally,for,if,in,is,isnt,loop,new,no,not,null,of,off,on,or,return,super,then,throw,true,try,unless,until,when,while,yes",hashComments:3,cStyleComments:!0,multilineStrings:!0,tripleQuotedStrings:!0,regexLiterals:!0}),["coffee"]);p(v({keywords:W,cStyleComments:!0,multilineStrings:!0}),["rc","rs","rust"]); -p(D([],[["str",/^[\S\s]+/] ]),["regex"]);var 
Y=E.PR={createSimpleLexer:D,registerLangHandler:p,sourceDecorator:v,PR_ATTRIB_NAME:"atn",PR_ATTRIB_VALUE:"atv",PR_COMMENT:"com",PR_DECLARATION:"dec",PR_KEYWORD:"kwd",PR_LITERAL:"lit",PR_NOCODE:"nocode",PR_PLAIN:"pln",PR_PUNCTUATION:"pun",PR_SOURCE:"src",PR_STRING:"str",PR_TAG:"tag",PR_TYPE:"typ",prettyPrintOne:E.prettyPrintOne=function(a,d,g){var b=document.createElement("div");b.innerHTML="<pre>"+a+"</pre>";b=b.firstChild;g&&J(b,g,!0);K({h:d,j:g,c:b,i:1}); -return b.innerHTML},prettyPrint:E.prettyPrint=function(a,d){function g(){for(var b=E.PR_SHOULD_USE_CONTINUATION?c.now()+250:Infinity;i<p.length&&c.now()<b;i++){for(var d=p[i],j=h,k=d;k=k.previousSibling;){var m=k.nodeType,o=(m===7||m===8)&&k.nodeValue;if(o?!/^\??prettify\b/.test(o):m!==3||/\S/.test(k.nodeValue))break;if(o){j={};o.replace(/\b(\w+)=([\w%+\-.:]+)/g,function(a,b,c){j[b]=c});break}}k=d.className;if((j!==h||e.test(k))&&!v.test(k)){m=!1;for(o=d.parentNode;o;o=o.parentNode)if(f.test(o.tagName)&& -o.className&&e.test(o.className)){m=!0;break}if(!m){d.className+=" prettyprinted";m=j.lang;if(!m){var m=k.match(n),y;if(!m&&(y=T(d))&&t.test(y.tagName))m=y.className.match(n);m&&(m=m[1])}if(w.test(d.tagName))o=1;else var o=d.currentStyle,u=s.defaultView,o=(o=o?o.whiteSpace:u&&u.getComputedStyle?u.getComputedStyle(d,q).getPropertyValue("white-space"):0)&&"pre"===o.substring(0,3);u=j.linenums;if(!(u=u==="true"||+u))u=(u=k.match(/\blinenums\b(?::(\d+))?/))?u[1]&&u[1].length?+u[1]:!0:!1;u&&J(d,u,o);r= -{h:m,c:d,j:u,i:o};K(r)}}}i<p.length?setTimeout(g,250):"function"===typeof a&&a()}for(var b=d||document.body,s=b.ownerDocument||document,b=[b.getElementsByTagName("pre"),b.getElementsByTagName("code"),b.getElementsByTagName("xmp")],p=[],m=0;m<b.length;++m)for(var j=0,k=b[m].length;j<k;++j)p.push(b[m][j]);var b=q,c=Date;c.now||(c={now:function(){return+new Date}});var i=0,r,n=/\blang(?:uage)?-([\w.]+)(?!\S)/,e=/\bprettyprint\b/,v=/\bprettyprinted\b/,w=/pre|xmp/i,t=/^code$/i,f=/^(?:pre|code|xmp)$/i, -h={};g()}};typeof 
define==="function"&&define.amd&&define("google-code-prettify",[],function(){return Y})})();}() -</script> -<style> -.pln{color:#1b181b}.str{color:#918b3b}.kwd{color:#7b59c0}.com{color:#9e8f9e}.typ{color:#516aec}.lit{color:#a65926}.clo,.opn,.pun{color:#1b181b}.tag{color:#ca402b}.atn{color:#a65926}.atv{color:#159393}.dec{color:#a65926}.var{color:#ca402b}.fun{color:#516aec}pre.prettyprint{background:#f7f3f7;color:#ab9bab;font-family:Menlo,Consolas,"Bitstream Vera Sans Mono","DejaVu Sans Mono",Monaco,monospace;font-size:12px;line-height:1.5;border:1px solid #d8cad8;padding:10px}ol.linenums{margin-top:0;margin-bottom:0} -body{min-width:200px;max-width:850px;margin:0 auto;padding:30px;}.chapter-nav{font-size: 10pt;}a:link,a:visited{color:#00f}.codeblock_name,code,pre.prettyprint{font-family:Monaco,"Lucida Console",monospace}body{font-size:14pt}.codeblock_name,.math,.seealso,code{font-size:10pt}.codeblock{page-break-inside:avoid;padding-bottom:15px}.math{text-indent:0}pre.prettyprint{font-size:10pt;padding:10px;border-radius:10px;border:none;white-space:pre-wrap}.codeblock_name{margin-top:1.25em;display:block}a:link{text-decoration:none}a:link:not(.lit):hover{color:#00f;text-decoration:underline}a:link:active{color:red}h4{padding-right:1.25em}h4.noheading{margin-bottom:0}h1{text-align:center}code{padding:2px}pre{-moz-tab-size:4;-o-tab-size:4;tab-size:4}p:not(.notp){margin:0;text-indent:2em}.two-col{list-style-type:none}.two-col li:before{content:'-';padding:5px;margin-right:5px;color:orange;background-color:#fff;display:inline-block}@media print{body{font-size:10pt}pre.prettyprint{font-size:8pt}.seealso{font-size:9pt}.codeblock_name,.math,code{font-size:8pt}.math{text-indent:0}} -/* code blocks (Style from jmeiners.com/lc3-vm, CC BY-NC-SA 4.0, used with attribution) */ -code, -.block-header, -.file-name - { - font-size: 11pt; - font-family: 'Fira Mono', Menlo, Monaco, Consolas, Liberation Mono, Courier New, monospace; -} - -.file-name-hr -{ - font-size: 13pt; - 
font-family: 'Fira Mono', Menlo, Monaco, Consolas, Liberation Mono, Courier New, monospace; -} - -/* Quotes and Block Quotes */ -blockquote { - margin: 1.5em 10px; - padding: 0.5em 10px; - border-left: 5px solid #ccc; - color: #666; - background-color: #f9f9f9; - font-style: italic; -} - -blockquote p { - margin: 0; - font-size: 1.2em; -} - -q { - quotes: "“" "”" "‘" "’"; - font-style: italic; -} - -q::before { - content: open-quote; -} - -q::after { - content: close-quote; -} - -/*! Color themes for Google Code Prettify | MIT License | github.com/jmblog/color-themes-for-google-code-prettify */ -.prettyprint { - background: #f5f7ff; - font-family: Menlo, "Bitstream Vera Sans Mono", "DejaVu Sans Mono", Monaco, Consolas, monospace; - border: 0 !important; -} - -.pln { - color: #202746; -} - -/* Specify class=linenums on a pre to get line numbering */ -ol.linenums { - margin-top: 0; - margin-bottom: 0; - color: #202746; -} - -li.L0, -li.L1, -li.L2, -li.L3, -li.L4, -li.L5, -li.L6, -li.L7, -li.L8, -li.L9 { - padding-left: 1em; - background-color: #f5f7ff; - list-style-type: decimal; -} - -@media screen { - - /* string content */ - - .str { - color: #ac9739; - } - - /* keyword */ - - .kwd { - color: #6679cc; - } - - /* comment */ - - .com { - color: #202746; - } - - /* type name */ - - .typ { - color: #3d8fd1; - } - - /* literal value */ - - .lit { - color: #c76b29; - } - - /* punctuation */ - - .pun { - color: #202746; - } - - /* lisp open bracket */ - - .opn { - color: #202746; - } - - /* lisp close bracket */ - - .clo { - color: #202746; - } - - /* markup tag name */ - - .tag { - color: #c94922; - } - - /* markup attribute name */ - - .atn { - color: #c76b29; - } - - /* markup attribute value */ - - .atv { - color: #22a2c9; - } - - /* declaration */ - - .dec { - color: #c76b29; - } - - /* variable name */ - - .var { - color: #c94922; - } - - /* function name */ - - .fun { - color: #3d8fd1; - } -}</style> -</head> -<body onload="prettyPrint()"> -<section> 
-<h1>Lexer</h1> -<a name="1:1"><div class="section"><h4>1. General Project Structure</h4></a> -<p>Since this is the first article, I'll outline the project structure for the C- compiler. -</p> -<p>The project has a series of pretty typical stages: -</p> -<ol> -<li>The lexer. This takes a file as input and emits a series of tokens (Its input is already preprocessed, I outsource that to "gcc -E"). -</li> -<li>The parser. This takes the tokens and builds an abstract syntax tree (AST). -</li> -<li>The symbol table. This exists in a sort of in-between space next to the lexer and parser. It's used to store information about variables and functions. -</li> -<li>The type checker. This is used to ensure that the types of variables and functions are correct. -</li> -<li>The code generator. This takes the AST and generates an intermediate representation (IR). -</li> -<li>The optimizer. This takes the IR and optimizes it. This'll be broken up into a few stages. -</li> -<li>The lowerer. This takes the IR and lowers it to a simpler IR. -</li> -<li>The register allocator. This takes the IR, which has instructions in an infinite number of registers, and assigns them to a finite number of registers. -</li> -<li>The code emitter. This takes the IR and emits RISC-V assembly. -</li> -</ol> -<p>As far as possible, I'd like to keep each of these stages separate. One benefit of this is that it simplifies memory management greatly. I plan to use an arena allocator for each stage, and by making sure the only thing on the actual heap is the output of the stage, and all temporary data is stored in the arena, I can free all the memory used by a stage by simply freeing the arena. -</p> -<h2> Some Rules</h2> -<p>Here are some rules (more like guidelines) that I plan to follow for this project; they're mostly just to keep things simple and consistent. -</p> -<h4> 1. PROGRAM LIKE IT'S 1999</h4> -<blockquote><p> 640 KB ought to be enough for anybody. 
- Bill Gates -</p> -</blockquote> -<p>Maybe not that little, But I'm going to try to keep the project as simple as possible, 640 KB probably won't be enough, but I'll still aim for less than 10 MB of memory usage. -</p> -<p>This places a lot of constraints on the project, but I think it's a good exercise in minimalism. -</p> -<p>Some consequences of this are that I'll have to use memory-wise algorithms, be very careful about program structure, and avoid some of the bigger libraries (which will help with making this project self-hosting in the future). -</p> -<h4> 2. PROGRAM IN C++--</h4> -<p>I'm not a big fan of C++, but its class system helps prevent a lot of ugly bugs. To that end, I'm going to try and keep data structures out of header files, and only expose functions that operate on those data structures, to create a sort of approximation of a class. This has a few benefits: -</p> -<ul> -<li>Quicker compilation. A change to a data structure will only require one file to be recompiled, rather than every file that includes the header. -</li> -<li>Less chance of bugs. If a function is the only way to interact with a data structure, then it's much harder to misuse that data structure. -</li> -<li>Run time type checking. I can include some sort of tag in the first field of every data structure to ensure that the correct functions are being called. -</li> -</ul> -<h4> 3. DON'T GET FANCY</h4> -<p>My goal here isn't to write the fastest interpreter in the world, or the most complete. I just want to make something that works and can be understood by someone else. -</p> -<p>That means I'm going to avoid a lot of the tricks that are used in production interpreters, and focus more on simplicity and readability. -</p> -<h4> 4. DESIGN FOR DEBUGGING</h4> -<p>This code is going to be peppered with asserts and contain mechanisms to print out the state of the program at any point. 
-</p> -<p>This might be painful, but it'll make debugging a lot simpler and let users look under the hood. -</p> -<h4> 5. SMART DATA, STUPID CODE</h4> -<p>A lot of times, the right data structure can replace 50-100 lines of procedural code. I'm going to try and design data structures which make the algorithms as simple as possible. -</p> -<p>For example, instead of writing 50-100 lines of code to hold every keyword in the language, I can just use a simple hash table. -</p> -<h4> Misc</h4> -<p>THIS IS A LITERATE PROGRAM! Go to <a href="https://reagancfischer.dev/lexer.lit">this link</a> to see the file that generated this HTML. -</p> -<h2> The Lexer</h2> -<p>A lexical analyzer reads source code and produces tokens, which are the smallest unit of meaning in a language. For example, in the C programming language, the tokens are things like keywords (if, else, while, etc.), identifiers (variable names), numbers, and punctuation (braces, semicolons, etc.). -</p> -<p>Given a string like <code>int main() { return 0; }</code>, the lexer would produce a series of tokens like <code>INT</code>, <code>IDENTIFIER(main)</code>, <code>LPAREN</code>, <code>RPAREN</code>, <code>LBRACE</code>, <code>RETURN</code>, <code>INTCONSTANT(0)</code>, <code>SEMICOLON</code>, <code>RBRACE</code>. -</p> -<h3> Design</h3> -<p>I'll break the lexer up into two modules, <code>tokenizer.c</code> and <code>input.c</code>. The input module will be responsible for reading the file and providing characters to the tokenizer, while the tokenizer module will be responsible for producing tokens. 
-</p> -<h3> Input</h3> -<h4> Input Interface</h4> - -</div> -</body> diff --git a/projects/cminus/lexer_new.lit b/projects/cminus/lexer_new.lit @@ -1,614 +0,0 @@ -@code_type c .c -@comment_type /* %s */ -@compiler lit -t lexer.lit && gcc -Wall -Wextra -Wpedantic -Wstrict-aliasing=3 -Wwrite-strings -Wvla -Wcast-align=strict -Wstrict-prototypes -Wstringop-overflow=4 -Wshadow -fanalyzer tokenizer.c input.c hash_table.c -D TOK_TEST -g -O0 && rm a.out - -@title Lexer -@add_css ../style.css -@s General Project Structure -Since this is the first article, I'll outline the project structure for the C- compiler. - -The project has a series of pretty typical stages: - -1. The lexer. This takes a file as input and emits a series of tokens (Its input is already preprocessed, I outsource that to "gcc -E"). -2. The parser. This takes the tokens and builds an abstract syntax tree (AST). -3. The symbol table. This exists in a sort of in-between space next to the lexer and parser. It's used to store information about variables and functions. -4. The type checker. This is used to ensure that the types of variables and functions are correct. -5. The code generator. This takes the AST and generates an intermediate representation (IR). -6. The optimizer. This takes the IR and optimizes it. This'll be broken up into a few stages. -7. The lowerer. This takes the IR and lowers it to a simpler IR. -8. The register allocator. This takes the IR, which has instructions in an infinite number of registers, and assigns them to a finite number of registers. -9. The code emitter. This takes the IR and emits RISC-V assembly. - -As far as possible, I'd like to keep each of these stages separate. One benefit of this is that it simplifies memory management greatly. 
I plan to use an arena allocator for each stage, and by making sure the only thing on the actual heap is the output of the stage, and all temporary data is stored in the arena, I can free all the memory used by a stage by simply freeing the arena. - -@s Some Rules - -Here are some rules (more like guidelines) that I plan to follow for this project; they're mostly just to keep things simple and consistent. - -1. PROGRAM LIKE IT'S 1999 - -> 640 KB ought to be enough for anybody. - Bill Gates - -Maybe not that little, But I'm going to try to keep the project as simple as possible, 640 KB probably won't be enough, but I'll still aim for less than 10 MB of memory usage. - -This places a lot of constraints on the project, but I think it's a good exercise in minimalism. - -Some consequences of this are that I'll have to use memory-wise algorithms, be very careful about program structure, and avoid some of the bigger libraries (which will help with making this project self-hosting in the future). - -2. PROGRAM IN C++-- - -I'm not a big fan of C++, but its class system helps prevent a lot of ugly bugs. To that end, I'm going to try and keep data structures out of header files, and only expose functions that operate on those data structures, to create a sort of approximation of a class. This has a few benefits: - -* Quicker compilation. A change to a data structure will only require one file to be recompiled, rather than every file that includes the header. -* Less chance of bugs. If a function is the only way to interact with a data structure, then it's much harder to misuse that data structure. -* Run time type checking. I can include some sort of tag in the first field of every data structure to ensure that the correct functions are being called. - -3. DON'T GET FANCY - -My goal here isn't to write the fastest interpreter in the world, or the most complete. I just want to make something that works and can be understood by someone else. 
- -That means I'm going to avoid a lot of the tricks that are used in production interpreters, and focus more on simplicity and readability. - -4. DESIGN FOR DEBUGGING - -This code is going to be peppered with asserts and contain mechanisms to print out the state of the program at any point. - -This might be painful, but it'll make debugging a lot simpler and let users look under the hood. - -5. SMART DATA, STUPID CODE - -A lot of times, the right data structure can replace 50-100 lines of procedural code. I'm going to try and design data structures which make the algorithms as simple as possible. - -For example, instead of writing 50-100 lines of code to hold every keyword in the language, I can just use a simple hash table. - -@s Misc -THIS IS A LITERATE PROGRAM! Go to [this link](https://reagancfischer.dev/lexer.lit) to see the file that generated this HTML. - -@s The Lexer - -A lexical analyzer reads source code and produces tokens, which are the smallest unit of meaning in a language. For example, in the C programming language, the tokens are things like keywords (if, else, while, etc.), identifiers (variable names), numbers, and punctuation (braces, semicolons, etc.). - -Given a string like `int main() { return 0; }`, the lexer would produce a series of tokens like `INT`, `IDENTIFIER(main)`, `LPAREN`, `RPAREN`, `LBRACE`, `RETURN`, `INTCONSTANT(0)`, `SEMICOLON`, `RBRACE`. - -@s Design - -I'll break the lexer up into a couple of modules. `token.c` will contain the token data structure and functions to create and destroy tokens. `input.c` will contain the input data structure and functions to read from the input file. `tokenizer.c` will contain the main lexer logic. - -@s Token Interface -Tokens are the smallest unit of meaning in a language. They're used by the parser to build an abstract syntax tree (AST). We'll need a couple of things to represent a token: -* The type of token. This will be an enum, with values like `TOK_CTK_IF` or `TOK_CONST_INTEGER_U32`. 
-* The value of the token. Some tokens, like keywords, don't have a value. Others, like identifiers or constants, do. -* The line and column of the token. This is used for error messages. - -As I mentioned earlier, we're trying to implement a sort of class system in C. For that, we'll need to hide the token implementation details behind an opaque pointer. We could just have a `void` pointer, but that stops us from being able to use compile-time type checking. -Instead, we'll use a forward declaration of the token type in the header file, and then define the token type in the implementation file. -@s ---- Opaque Token Type -typedef struct token token_t; ---- -@s -We'll need a couple of functions to create and destroy tokens. ---- Token Creation and Destruction -token_t *token_data_create(c_token_types kind, int lin, int col, - int len); - -token_t *token_create(c_token_types kind, int lin, int col, int len); - -token_t *token_create_int(c_token_types kind, int lin, int col, - int64_t i, int len); - -token_t *token_create_float(c_token_types kind, int lin, int col, - double f, int len); - -token_t *token_create_char(c_token_types kind, int lin, int col, char c, - int len); - -token_t *token_create_string(c_token_types kind, int lin, int col, - const char *s, int len); - -void token_destroy(token_t *token); ---- -@s -We'll also need some functions to access the token data. ---- Token Interface -c_token_types token_type(token_t *token); - -int64_t token_int(token_t *token); - -double token_float(token_t *token); - -const char *token_string(token_t *token); - -char token_char(token_t *token); - -int token_line(token_t *token); - -int token_column(token_t *token); ---- -@s -We'll need some types to represent the different kinds of tokens. 
---- Token Types -typedef enum { - // Control Keywords - TOK_CTK_IF, - TOK_CTK_ELSE, - TOK_CTK_SWITCH, - TOK_CTK_CASE, - TOK_CTK_DEFAULT, - TOK_CTK_WHILE, - TOK_CTK_DO, - TOK_CTK_FOR, - TOK_CTK_CONTINUE, - TOK_CTK_BREAK, - TOK_CTK_RETURN, - TOK_CTK_GOTO, - - // Type Keywords - TOK_TK_VOID, - TOK_TK_CHAR, - TOK_TK_SHORT, - TOK_TK_INT, - TOK_TK_LONG, - TOK_TK_FLOAT, - TOK_TK_DOUBLE, - TOK_TK_SIGNED, - TOK_TK_UNSIGNED, - TOK_TK_STRUCT, - TOK_TK_UNION, - TOK_TK_ENUM, - TOK_TK_TYPEDEF, - - // Storage Class/Specifier Keywords - TOK_SCSK_AUTO, - TOK_SCSK_REGISTER, - TOK_SCSK_STATIC, - TOK_SCSK_EXTERN, - TOK_SCSK_CONST, - TOK_SCSK_VOLATILE, - - // Misc Keywords - TOK_MK_SIZEOF, - - // Operators - TOK_OP_ADD, // + - TOK_OP_SUB, // - - TOK_OP_MUL, // * - TOK_OP_DIV, // / - TOK_OP_MOD, // % - TOK_OP_BIT_AND, // & - TOK_OP_BIT_OR, // | - TOK_OP_BIT_XOR, // ^ - TOK_OP_BIT_NOT, // ~ - TOK_OP_LSHIFT, // << - TOK_OP_RSHIFT, // >> - TOK_OP_NOT, // ! - TOK_OP_ASSIGN, // = - TOK_OP_LT, // < - TOK_OP_GT, // > - TOK_OP_INC, // ++ - TOK_OP_DEC, // -- - TOK_OP_EQ, // == - TOK_OP_NE, // != - TOK_OP_LE, // <= - TOK_OP_GE, // >= - TOK_OP_AND, // && - TOK_OP_OR, // || - TOK_OP_MEMBER_POINTER, // -> - TOK_OP_MEMBER, // . - TOK_OP_COND_DECISION, // : - TOK_OP_COND, // ? - TOK_OP_ASSIGN_ADD, // += - TOK_OP_ASSIGN_SUB, // -= - TOK_OP_ASSIGN_MUL, // *= - TOK_OP_ASSIGN_DIV, // /= - TOK_OP_ASSIGN_MOD, // %= - TOK_OP_ASSIGN_BITAND, // &= - TOK_OP_ASSIGN_BITOR, // |= - TOK_OP_ASSIGN_BITXOR, // ^= - TOK_OP_ASSIGN_LSHIFT, // <<= - TOK_OP_ASSIGN_RSHIFT, // >>= - - // Separators - TOK_SEP_LEFT_PAREN, // ( - TOK_SEP_RIGHT_PAREN, // ) - TOK_SEP_LEFT_BRACKET, // [ - TOK_SEP_RIGHT_BRACKET, // ] - TOK_SEP_LEFT_BRACE, // { - TOK_SEP_RIGHT_BRACE, // } - TOK_SEP_COMMA, // , - TOK_SEP_SEMICOLON, // ; - TOK_SEP_DOT, // . - TOK_SEP_ELLIPSIS, // ... 
- TOK_SEP_HASH, // # - - // Identifiers - TOK_ID, - - // Constants - TOK_CONST_INTEGER_U32, // u - TOK_CONST_INTEGER_U64, // ul - TOK_CONST_INTEGER_S32, // (no suffix) - TOK_CONST_INTEGER_S64, // l - TOK_CONST_FLOAT_32, // f - TOK_CONST_FLOAT_64, // (no suffix) - TOK_CONST_CHAR, // 'c' - TOK_CONST_STRING_ASCII, // "string" (width of 8 bits) - - // Special - TOK_SPECIAL_EOF, - TOK_SPECIAL_ERROR, -} c_token_types; ---- -@s -We bring this all together in `token.h`. ---- token.h -#ifndef TOKEN_H -#define TOKEN_H -#include <stdint.h> // We use this for int64_t -@{Token Types} -@{Opaque Token Type} -@{Token Creation and Destruction} -@{Token Interface} -#endif ---- - -@s Token Implementation -Now that we have the interface, we can implement the token data structure. We'll need a couple of things: -* The token type. -* A way to store extra data. -* Implementations of the functions we defined in the interface. - -@s -One problem is we haven't defined a way to verify that the token we're getting isn't corrupt. We'll use a tag for that. ---- Token Data Structure -#define TOK_MAGIC_1 0x544F4B454E544F4Bul // "TOKENTOK" -#define TOK_MAGIC_2 0x544F4B544F4B454Eul // "TOKTOKEN" -struct token { - long magic; - int line; - int column; - short kind; - long opt_data[0]; -}; - -typedef struct token token_t; - -struct token_data { - union { - long long i; - double f; - const char *s; - char c; - } data; -}; - -typedef struct token_data token_data_t; ---- -You might notice that a zero-length array is used in the token data structure. This is a GCC extension that allows us to allocate memory for the token data structure and the token data in one allocation. This is a bit of a hack, but it's a common pattern in C code. -@s -To access this extra data, we define a macro ---- Token Data Access -#define token_data(token) ((struct token_data *)((token)->opt_data)) ---- -@s -Now we can implement the functions we defined in the interface. 
---- Token Creation and Destruction (Except `token_create_string`) -static token_t *token_data_create(c_token_types kind, int lin, int col, int len) { - token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data)); - if (token == NULL) { - fputs("Out of memory\n", stderr); - exit(1); - } - token->magic = TOK_MAGIC_1; - token->line = lin; - token->column = col; - column += len; - token->kind = kind; - return token; -} - -static token_t *token_create(c_token_types kind, int lin, int col, int len) { - token_t *token = malloc(sizeof(token_t)); - if (token == NULL) { - fputs("Out of memory\n", stderr); - exit(1); - } - token->magic = TOK_MAGIC_2; - token->line = lin; - token->column = col; - column += len; - token->kind = kind; - return token; -} - -static token_t *token_create_int(c_token_types kind, int lin, int col, int64_t i, int len) { - token_t *token = token_data_create(kind, lin, col, len); - token_data(token)->data.i = i; - return token; -} - -static token_t *token_create_float(c_token_types kind, int lin, int col, double f, int len) { - token_t *token = token_data_create(kind, lin, col, len); - token_data(token)->data.f = f; - return token; -} - -static token_t *token_create_char(c_token_types kind, int lin, int col, char c, int len) { - token_t *token = token_data_create(kind, lin, col, len); - token_data(token)->data.c = c; - return token; -} - -void token_destroy(token_t *token) { - if (token->magic == TOK_MAGIC_1) { - free(token); - } else if (token->magic == TOK_MAGIC_2) { - free(token); - if (kind == TOK_CONST_STRING_ASCII) { - free(token_data(token)->data.s); - } - } else { - fputs("Corrupt token\n", stderr); - exit(1); - } -} ---- -@s -`token_create_string` can be implemented either the easy way or the right way. Let's try the easy way. 
---- token_create_string -token_t *token_create_string(c_token_types kind, int lin, int col, - const char *s, int len) { - token_t *token = token_create(kind, lin, col, len); - token_data(token)->data.s = strdup(s); - return token; -} ---- -@s -There's an issue with this approach. `token_create_string` will be called for every identifier and every string in a program. Imagine a large program, say a shell, with a bunch of user input and output. That program will likely have 20-40 calls to `fprintf`, `fscanf`, `strchr`, `strtok`, each. We create a new string for each of those calls. That's a lot of duplicates, and can quickly add up to a lot of memory usage. -To fix this, we use a hash table to store the strings. We'll define a hash table in `hash_table.h` and `hash_table.c`. -@s Hash Table -A hash table is a data structure that maps keys to values. It's commonly used to store information, such as variables and functions in a symbol table. To implement a generic hash table, we'll need several things: -* A function to hash the keys. -* A function to compare keys. -* An opaque type for the hash table. -* A function to destroy deleted keys and values. - -Let's start with the interface. 
- -@s ---- Hash Table Opaque Types -typedef struct hash_table hash_table_t; -typedef int (*hash_table_cmp_fn)(void *key1, void *key2); -typedef unsigned int (*hash_table_hash_fn)(void *key); -typedef void (*hash_table_dtor)(void *value, int is_key); ---- - -@s ---- Hash Table Creation and Destruction -hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash, hash_table_dtor dtor); -void hash_table_destroy(hash_table_t *table); ---- - -@s ---- Hash Table Access -void *hash_table_get(hash_table_t *table, void *key); -void hash_table_put(hash_table_t *table, void *key, void *value); -void hash_table_remove(hash_table_t *table, void *key); ---- - -@s ---- hash_table.h -#ifndef HASH_TABLE_H -#define HASH_TABLE_H -@{Hash Table Opaque Types} -@{Hash Table Creation and Destruction} -@{Hash Table Access} -#endif ---- - -@s -Let's implement the hash table now. - ---- hash_table.c -#include <stdlib.h> -#include <string.h> -#include <stdio.h> -#include "hash_table.h" - -@{Hash Table Data Structure} -@{Hash Table Entry Data Structure} - -hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash, hash_table_dtor dtor) { - @{Allocate and Initialize Hash Table} - return table; -} - -void hash_table_destroy(hash_table_t *table) { - @{Destroy Entries} - free(table->entries); - free(table); -} - -void *hash_table_get(hash_table_t *table, void *key) { - @{Get Entry By Hash} - @{Loop Through Entries and Return Value if Match} - return NULL; -} - -void hash_table_put(hash_table_t *table, void *key, void *value) { - @{Get Entry By Hash} - @{Loop Through Entries and Replace Value if Key Matches} - @{Allocate New Entry if No Match} -} - -void hash_table_remove(hash_table_t *table, void *key) { - @{Get Entry By Hash} - @{Loop Through Entries and Remove Entry if Key Matches} -} - -#ifdef TEST_HASH_TABLE -#include <assert.h> -#include <stdio.h> -#include <string.h> - -int string_cmp(void *key1, void *key2) { - return strcmp((char 
*)key1, (char *)key2); -} - -unsigned long string_hash(void *key) { - unsigned long hash = 5381; - char *str = (char *)key; - while (*str != '\0') { - hash = ((hash << 5) + hash) + *str; - str++; - } - return hash; -} - -int main() { - hash_table_t *table = hash_table_create(16, string_cmp, string_hash, NULL); - hash_table_put(table, "foo", "bar"); - hash_table_put(table, "foo", "baz"); - assert(strcmp((char *)hash_table_get(table, "foo"), "baz") == 0); - hash_table_remove(table, "foo"); - assert(hash_table_get(table, "foo") == NULL); - hash_table_destroy(table); - return 0; -} -#endif ---- - -@s -For the hash table data structure, we'll define a pointer to an array of entries, the size of the array, and the hash/comparison functions. - ---- Hash Table Data Structure -struct hash_table { - struct hash_table_entry **entries; - int size; - hash_table_cmp_fn cmp; - hash_table_hash_fn hash; - hash_table_dtor dtor; -}; ---- - -@s -Entries in the hash table will have a key, a value, and a link to the next entry in the chain. - ---- Hash Table Entry Data Structure -struct hash_table_entry { - void *key; - void *value; - struct hash_table_entry *next; -}; ---- - -@s -Allocating a hash table involves allocating memory for the hash table itself and the entries, zeroing out the entries, and setting the hash and comparison functions. - ---- Allocate and Initialize Hash Table -hash_table_t *table = malloc(sizeof(struct hash_table)); -if (table == NULL) { - fputs("Out of memory, could not allocate hash table\n", stderr); - exit(1); -} -table->entries = calloc(size, sizeof(struct hash_table_entry *)); -if (table->entries == NULL) { - fputs("Out of memory, could not allocate hash table entries\n", stderr); - exit(1); -} -table->size = size; -table->cmp = cmp; -table->hash = hash; -table->dtor = dtor; ---- - -@s -To destroy a hash table, we loop through the entries, freeing the keys and values, and then free the entries and the table itself. 
- ---- Destroy Entries -for (int i = 0; i < table->size; i++) { - struct hash_table_entry *entry = table->entries[i]; - while (entry != NULL) { - struct hash_table_entry *next = entry->next; - if (table->dtor != NULL) { - table->dtor(entry->key, 1); - table->dtor(entry->value, 0); - } - free(entry); - entry = next; - } -} ---- - -@s -To get an entry from the hash table, we hash the key, loop through the entries, and return the value if we find a match. - ---- Get Entry By Hash -unsigned int hash = table->hash(key) % table->size; -struct hash_table_entry *entry = table->entries[hash]; ---- - -@s -To put an entry in the hash table, we hash the key, loop through the entries, and replace the value if we find a match. - ---- Loop Through Entries and Replace Value if Key Matches -while (entry != NULL) { - if (table->cmp(entry->key, key) == 0) { - entry->value = value; - return; - } - entry = entry->next; -} ---- - -@s -If we don't find a match, we allocate a new entry, set the key and value, and insert it at the head of the linked list. -This exploits a property in computer science called locality of reference. The gist of that is that when you write to a piece of memory, you're likely to read from it again soon. By putting the new entry at the head of the linked list, we increase the chances that we'll find it quickly next time. - ---- Allocate New Entry if No Match -struct hash_table_entry *new_entry = malloc(sizeof(struct hash_table_entry)); -if (new_entry == NULL) { - fputs("Out of memory, could not allocate hash table entry\n", stderr); - exit(1); -} -new_entry->key = key; -new_entry->value = value; -new_entry->next = table->entries[hash]; -table->entries[hash] = new_entry; ---- - -@s -To remove an entry from the hash table, we hash the key, loop through the entries, and remove the entry if we find a match. 
- ---- Loop Through Entries and Remove Entry if Key Matches -struct hash_table_entry *prev = NULL; -while (entry != NULL) { - if (table->cmp(entry->key, key) == 0) { - if (prev == NULL) { - table->entries[hash] = entry->next; - } else { - prev->next = entry->next; - } - if (table->dtor != NULL) { - table->dtor(entry->key, 1); - table->dtor(entry->value, 0); - } - free(entry); - return; - } - prev = entry; - entry = entry->next; -} ---- - -@s -To find a value associated with a given key in the hash table, we hash the string, loop through the entries, and return the value if a match is found. - ---- Loop Through Entries and Return Value if Match -while (entry != NULL) { - if (table->cmp(entry->key, key) == 0) { - return entry->value; - } - entry = entry->next; -} ----- \ No newline at end of file diff --git a/projects/style_old.css b/projects/style_old.css @@ -1,107 +0,0 @@ -/* code blocks (Style from jmeiners.com/lc3-vm, CC BY-NC-SA 4.0, used with attribution) */ -code, -.block-header { - font-size: 11pt; - font-family: 'Fira Mono', Menlo, Monaco, Consolas, Liberation Mono, Courier New, monospace; -} - -.block-header, -.block-header+pre.prettyprint { - background-color: #cecece; - border: 2px solid #fffdef; - box-sizing: border-box; -} - -.block-header { - display: inline-block; - - /* allows the header to overlap the code block to hide part of the top border */ - position: relative; - z-index: 6; - - border-top-left-radius: 0.6rem; - border-top-right-radius: 0.6rem; - border-bottom-width: 0; - - padding: 0.4rem 0.6rem; -} - -.block-title { - font-weight: normal; -} - -.block-title a { - /* this causes navigating to a block link to scroll you just a few pixels above the block header */ - margin-top: -1rem; - padding-top: 1rem; -} - -.block-title, -.block-header a:link, -.block-header a:visited { - color: #262521; -} - -.block-header a:hover, -.block-header a:active { - color: #112d75; -} - -.code-block pre.prettyprint { - padding: 0.6rem; - white-space: pre-wrap; - 
border-radius: 0.6rem; -} - -.code-block .block-header+pre.prettyprint { - /* overlap to the top 1px of the code block with the header so that the top border is partially obscured */ - position: relative; - z-index: 5; - margin-top: -1px; - - border-top-left-radius: 0; -} - - -.block-usages { - margin-top: -1rem; -} - -.block-usages small { - display: inline-block; - margin: 0.4rem 0.6rem; - font-size: 11pt; - color: #363535; -} - -.block-usages a, -.block-usages span { - padding: 0 0.5rem; - margin-left: 0.1rem; -} - -.block-usages a { - background-color: #f9f8f4; - border: 1px solid #c7c6bf; - box-sizing: border-box; - - color: #57554a; - border-radius: 0.3rem; -} - -.block-usages a+*, -.block-usages span+* { - margin-left: 0.2rem; -} - -.block-usages a:hover, -.block-usages a:active { - text-decoration: none; - background-color: #f9f9f7; - color: #a6a28d; -} - -h1+p, -h1+p+p { - text-align: center; -}- \ No newline at end of file