commit d3789f291f6b8e22a481a8b443a77d0c53966926
parent 844dc55b3121e79e47834c38ceffcd221d39bb18
Author: Reagan <rfische2@uccs.edu>
Date: Fri, 23 Aug 2024 09:12:10 -0600
lexer final
Diffstat:
12 files changed, 6429 insertions(+), 2386 deletions(-)
diff --git a/projects.html b/projects.html
@@ -90,7 +90,7 @@
</p>
<ol>
<li>
- <a href="projects/cminus/lexer_new.html">Step 1: Lexing</a>
+ <a href="projects/cminus/lexer.html">Step 1: Lexing</a>
</li>
</ol>
<h2 id="net" name="net">Mock Networks</h2>
diff --git a/projects/cminus/code/hash_table.c b/projects/cminus/code/hash_table.c
@@ -1,117 +1,147 @@
-#include "hash_table.h"
-#include <stdio.h>
+/* hash_table.c */
#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "hash_table.h"
+
+/* Hash Table Data Structure */
struct hash_table {
- hash_table_entry_t **entries;
+ struct hash_table_entry **entries;
int size;
hash_table_cmp_fn cmp;
hash_table_hash_fn hash;
+ hash_table_dtor dtor;
};
+
+/* Hash Table Entry Data Structure */
struct hash_table_entry {
void *key;
void *value;
- hash_table_entry_t *next;
+ struct hash_table_entry *next;
};
-hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp,
- hash_table_hash_fn hash) {
- hash_table_t *table = malloc(sizeof(hash_table_t));
+
+hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash, hash_table_dtor dtor) {
+ /* Allocate and Initialize Hash Table */
+ hash_table_t *table = malloc(sizeof(struct hash_table));
if (table == NULL) {
- return NULL;
+ fputs("Out of memory, could not allocate hash table\n", stderr);
+ exit(1);
}
- table->entries = calloc(size, sizeof(hash_table_entry_t *));
+ table->entries = calloc(size, sizeof(struct hash_table_entry *));
if (table->entries == NULL) {
- free(table);
- return NULL;
+ fputs("Out of memory, could not allocate hash table entries\n", stderr);
+ exit(1);
}
table->size = size;
table->cmp = cmp;
table->hash = hash;
+ table->dtor = dtor;
+
return table;
}
-void hash_table_destroy(hash_table_t *table, hash_table_dtor dtor) {
+void hash_table_destroy(hash_table_t *table) {
+ /* Destroy Entries */
for (int i = 0; i < table->size; i++) {
- hash_table_entry_t *entry = table->entries[i];
+ struct hash_table_entry *entry = table->entries[i];
while (entry != NULL) {
- hash_table_entry_t *next = entry->next;
- if (dtor != NULL) {
- dtor(entry->key, 1);
- dtor(entry->value, 0);
+ struct hash_table_entry *next = entry->next;
+ if (table->dtor != NULL) {
+ table->dtor(entry->key, 1);
+ table->dtor(entry->value, 0);
}
free(entry);
entry = next;
}
}
+
free(table->entries);
free(table);
}
void *hash_table_get(hash_table_t *table, void *key) {
- unsigned long hash = table->hash(key) % table->size;
- hash_table_entry_t *entry = table->entries[hash];
+ /* Get Entry By Hash */
+ unsigned int hash = table->hash(key) % table->size;
+ struct hash_table_entry *entry = table->entries[hash];
+
+ /* Loop Through Entries and Return Value if Match */
while (entry != NULL) {
if (table->cmp(entry->key, key) == 0) {
return entry->value;
}
entry = entry->next;
}
+
return NULL;
}
-int hash_table_put(hash_table_t *table, void *key, void *value, int replace) {
- unsigned long hash = table->hash(key) % table->size;
- hash_table_entry_t *entry = table->entries[hash];
+void hash_table_put(hash_table_t *table, void *key, void *value) {
+ /* Get Entry By Hash */
+ unsigned int hash = table->hash(key) % table->size;
+ struct hash_table_entry *entry = table->entries[hash];
+
+ /* Loop Through Entries and Replace Value if Key Matches */
while (entry != NULL) {
if (table->cmp(entry->key, key) == 0) {
- if (replace) {
- entry->value = value;
- return 0;
- } else {
- return 1;
- }
+ entry->value = value;
+ return;
}
entry = entry->next;
}
- entry = malloc(sizeof(hash_table_entry_t));
- if (entry == NULL) {
- fprintf(stderr, "Error: Out of memory. Could not allocate hash table entry\n");
+
+ /* Allocate New Entry if No Match */
+ struct hash_table_entry *new_entry = malloc(sizeof(struct hash_table_entry));
+ if (new_entry == NULL) {
+ fputs("Out of memory, could not allocate hash table entry\n", stderr);
exit(1);
}
- entry->key = key;
- entry->value = value;
- entry->next = table->entries[hash];
- table->entries[hash] = entry;
- return 0;
+ new_entry->key = key;
+ new_entry->value = value;
+ new_entry->next = table->entries[hash];
+ table->entries[hash] = new_entry;
+
}
void hash_table_remove(hash_table_t *table, void *key) {
- unsigned long hash = table->hash(key) % table->size;
- hash_table_entry_t *entry = table->entries[hash];
- hash_table_entry_t *prev = NULL;
+ /* Get Entry By Hash */
+ unsigned int hash = table->hash(key) % table->size;
+ struct hash_table_entry *entry = table->entries[hash];
+
+ /* Loop Through Entries and Remove Entry if Key Matches */
+ struct hash_table_entry *prev = NULL;
while (entry != NULL) {
- if (table->cmp(entry->key, key) == 0) {
- if (prev == NULL) {
- table->entries[hash] = entry->next;
- } else {
- prev->next = entry->next;
+ if (table->cmp(entry->key, key) == 0) {
+ if (prev == NULL) {
+ table->entries[hash] = entry->next;
+ } else {
+ prev->next = entry->next;
+ }
+ if (table->dtor != NULL) {
+ table->dtor(entry->key, 1);
+ table->dtor(entry->value, 0);
+ }
+ free(entry);
+ return;
}
- free(entry);
- return;
- }
- prev = entry;
- entry = entry->next;
+ prev = entry;
+ entry = entry->next;
}
+
}
#ifdef TEST_HASH_TABLE
#include <assert.h>
#include <stdio.h>
#include <string.h>
-int string_cmp(void *key1, void *key2) { return strcmp(key1, key2); }
+
+int string_cmp(void *key1, void *key2) {
+ return strcmp((char *)key1, (char *)key2);
+}
+
unsigned long string_hash(void *key) {
unsigned long hash = 5381;
- char *str = key;
+ char *str = (char *)key;
while (*str != '\0') {
hash = ((hash << 5) + hash) + *str;
str++;
@@ -120,13 +150,14 @@ unsigned long string_hash(void *key) {
}
int main() {
- hash_table_t *table = hash_table_create(16, string_cmp, string_hash);
- assert(hash_table_put(table, "foo", "bar", 0) == 0);
- assert(hash_table_put(table, "foo", "baz", 1) == 0);
- assert(strcmp(hash_table_get(table, "foo"), "baz") == 0);
+ hash_table_t *table = hash_table_create(16, string_cmp, string_hash, NULL);
+ hash_table_put(table, "foo", "bar");
+ hash_table_put(table, "foo", "baz");
+ assert(strcmp((char *)hash_table_get(table, "foo"), "baz") == 0);
hash_table_remove(table, "foo");
assert(hash_table_get(table, "foo") == NULL);
hash_table_destroy(table);
return 0;
}
#endif
+
diff --git a/projects/cminus/code/hash_table.h b/projects/cminus/code/hash_table.h
@@ -1,15 +1,20 @@
+/* hash_table.h */
#ifndef HASH_TABLE_H
#define HASH_TABLE_H
+/* Hash Table Opaque Types */
typedef struct hash_table hash_table_t;
-typedef struct hash_table_entry hash_table_entry_t;
typedef int (*hash_table_cmp_fn)(void *key1, void *key2);
typedef unsigned int (*hash_table_hash_fn)(void *key);
-hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp,
- hash_table_hash_fn hash);
-
typedef void (*hash_table_dtor)(void *value, int is_key);
-void hash_table_destroy(hash_table_t *table, hash_table_dtor dtor);
+
+/* Hash Table Creation and Destruction */
+hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash, hash_table_dtor dtor);
+void hash_table_destroy(hash_table_t *table);
+
+/* Hash Table Access */
void *hash_table_get(hash_table_t *table, void *key);
-int hash_table_put(hash_table_t *table, void *key, void *value, int replace);
+void hash_table_put(hash_table_t *table, void *key, void *value);
void hash_table_remove(hash_table_t *table, void *key);
+
#endif
+
diff --git a/projects/cminus/code/input.c b/projects/cminus/code/input.c
@@ -1,6 +1,8 @@
-#include "input.h"
+/* input.c */
#include <stdio.h>
#include <stdlib.h>
+#include "input.h"
+/* Input Data */
#define CHUNK_SIZE 128
static char buffer[CHUNK_SIZE];
static int buffer_pos = 0;
@@ -10,6 +12,7 @@ static int unget_buffer_stack_pos = 0;
static FILE *file = NULL;
+/* Input Initialization */
void input_init(const char *filename) {
file = fopen(filename, "r");
if (file == NULL) {
@@ -18,6 +21,7 @@ void input_init(const char *filename) {
}
}
+/* Input Get Character */
int input_getc(void) {
if (unget_buffer_stack_pos > 0) {
return unget_buffer_stack[--unget_buffer_stack_pos];
@@ -33,23 +37,14 @@ int input_getc(void) {
return c;
}
-void input_ungetc(int c) { unget_buffer_stack[unget_buffer_stack_pos++] = c; }
-
-void input_destroy(void) { fclose(file); }
+/* Input Unget Character */
+void input_ungetc(int c) {
+ unget_buffer_stack[unget_buffer_stack_pos++] = c;
+}
-#ifdef TEST_INPUT
-#include <assert.h>
-int main() {
- FILE *file = fopen("input.c", "rb");
- input_init("input.c");
- int c1, c2;
- while ((c1 = fgetc(file)) != EOF) {
- c2 = input_getc();
- assert(c1 == c2);
- }
- assert(input_getc() == EOF);
+/* Input Destroy */
+void input_destroy(void) {
fclose(file);
- input_destroy();
- return 0;
}
-#endif
+
+
diff --git a/projects/cminus/code/input.h b/projects/cminus/code/input.h
@@ -1,7 +1,11 @@
+/* input.h */
#ifndef INPUT_H
#define INPUT_H
+/* Input Interface */
void input_init(const char *filename);
int input_getc(void);
void input_ungetc(int c);
void input_destroy(void);
+
#endif
+
diff --git a/projects/cminus/code/lexer.lit b/projects/cminus/code/lexer.lit
@@ -0,0 +1,2529 @@
+@code_type c .c
+@comment_type /* %s */
+@compiler lit -t lexer.lit && gcc -Wall -Wextra -Wstrict-aliasing=3 -Wwrite-strings -Wvla -Wcast-align=strict -Wstrict-prototypes -Wstringop-overflow=4 -Wshadow -fanalyzer tokenizer.c input.c hash_table.c token.c -D TEST_TOKENIZER -g -O0 && rm a.out
+
+@title Lexer
+@add_css ./style.css
+@s General Project Structure
+Since this is the first article, I'll outline the project structure for the C- compiler.
+
+The project has a series of pretty typical stages:
+
+1. The lexer. This takes a file as input and emits a series of tokens (its input is already preprocessed; I outsource that to "gcc -E").
+2. The parser. This takes the tokens and builds an abstract syntax tree (AST).
+3. The symbol table. This exists in a sort of in-between space next to the lexer and parser. It's used to store information about variables and functions.
+4. The type checker. This is used to ensure that the types of variables and functions are correct.
+5. The code generator. This takes the AST and generates an intermediate representation (IR).
+6. The optimizer. This takes the IR and optimizes it. This'll be broken up into a few stages.
+7. The lowerer. This takes the IR and lowers it to a simpler IR.
+8. The register allocator. This takes the IR, which has instructions in an infinite number of registers, and assigns them to a finite number of registers.
+9. The code emitter. This takes the IR and emits RISC-V assembly.
+
+As far as possible, I'd like to keep each of these stages separate. One benefit of this is that it simplifies memory management greatly. I plan to use an arena allocator for each stage, and by making sure the only thing on the actual heap is the output of the stage, and all temporary data is stored in the arena, I can free all the memory used by a stage by simply freeing the arena.
+
+@s Some Rules
+
+Here are some rules (more like guidelines) that I plan to follow for this project; they're mostly just to keep things simple and consistent.
+
+1. PROGRAM LIKE IT'S 1999
+
+> 640 KB ought to be enough for anybody. - attributed to Bill Gates (almost certainly apocryphal)
+
+Maybe not that little, but I'm going to try to keep the project as simple as possible. 640 KB probably won't be enough, but I'll still aim for less than 10 MB of memory usage.
+
+This places a lot of constraints on the project, but I think it's a good exercise in minimalism.
+
+Some consequences of this are that I'll have to use memory-efficient algorithms, be very careful about program structure, and avoid some of the bigger libraries (which will help with making this project self-hosting in the future).
+
+2. PROGRAM IN C++--
+
+I'm not a big fan of C++, but its class system helps prevent a lot of ugly bugs. To that end, I'm going to try and keep data structures out of header files, and only expose functions that operate on those data structures, to create a sort of approximation of a class. This has a few benefits:
+
+* Quicker compilation. A change to a data structure will only require one file to be recompiled, rather than every file that includes the header.
+* Less chance of bugs. If a function is the only way to interact with a data structure, then it's much harder to misuse that data structure.
+* Run time type checking. I can include some sort of tag in the first field of every data structure to ensure that the correct functions are being called.
+
+3. DON'T GET FANCY
+
+My goal here isn't to write the fastest compiler in the world, or the most complete. I just want to make something that works and can be understood by someone else.
+
+That means I'm going to avoid a lot of the tricks that are used in production compilers, and focus more on simplicity and readability.
+
+4. DESIGN FOR DEBUGGING
+
+This code is going to be peppered with asserts and contain mechanisms to print out the state of the program at any point.
+
+This might be painful, but it'll make debugging a lot simpler and let users look under the hood.
+
+5. SMART DATA, STUPID CODE
+
+A lot of times, the right data structure can replace 50-100 lines of procedural code. I'm going to try and design data structures which make the algorithms as simple as possible.
+
+For example, instead of writing 50-100 lines of code to hold every keyword in the language, I can just use a simple hash table.
+
+@s Misc
+THIS IS A LITERATE PROGRAM! Go to [this link](https://reagancfischer.dev/projects/cminus/code/lexer.lit) to see the file that generated this HTML.
+
+@s The Lexer
+
+A lexical analyzer reads source code and produces tokens, which are the smallest unit of meaning in a language. For example, in the C programming language, the tokens are things like keywords (if, else, while, etc.), identifiers (variable names), numbers, and punctuation (braces, semicolons, etc.).
+
+Given a string like `int main() { return 0; }`, the lexer would produce a series of tokens like `INT`, `IDENTIFIER(main)`, `LPAREN`, `RPAREN`, `LBRACE`, `RETURN`, `INTCONSTANT(0)`, `SEMICOLON`, `RBRACE`.
+
+@s Design
+
+I'll break the lexer up into a couple of modules. `token.c` will contain the token data structure and functions to create and destroy tokens. `input.c` will contain the input data structure and functions to read from the input file. `tokenizer.c` will contain the main lexer logic.
+
+@s Token Interface
+Tokens are the smallest unit of meaning in a language. They're used by the parser to build an abstract syntax tree (AST). We'll need a couple of things to represent a token:
+* The type of token. This will be an enum, with values like `TOK_CTK_IF` or `TOK_CONST_INTEGER_U32`.
+* The value of the token. Some tokens, like keywords, don't have a value. Others, like identifiers or constants, do.
+* The line and column of the token. This is used for error messages.
+
+As I mentioned earlier, we're trying to implement a sort of class system in C. For that, we'll need to hide the token implementation details behind an opaque pointer. We could just have a `void` pointer, but that stops us from being able to use compile-time type checking. Instead, we'll use a forward declaration of the token type in the header file, and then define the token type in the implementation file.
+
+@s
+--- Opaque Token Type
+typedef struct token token_t;
+---
+
+@s
+We'll need a couple of functions to create and destroy tokens.
+--- Token Creation and Destruction Interface
+token_t *token_data_create(c_token_types kind, int lin, int col, int len);
+
+token_t *token_create(c_token_types kind, int lin, int col, int len);
+
+token_t *token_create_int(c_token_types kind, int lin, int col, int64_t i, int len);
+
+token_t *token_create_float(c_token_types kind, int lin, int col, double f, int len);
+
+token_t *token_create_char(c_token_types kind, int lin, int col, char c, int len);
+
+token_t *token_create_string(c_token_types kind, int lin, int col, const char *s, int len);
+
+void token_destroy(token_t *token);
+---
+
+@s
+We'll also need some functions to access the token data.
+--- Token Interface
+c_token_types token_type(token_t *token);
+
+int64_t token_int(token_t *token);
+
+double token_float(token_t *token);
+
+const char *token_string(token_t *token);
+
+char token_char(token_t *token);
+
+int token_line(token_t *token);
+
+int token_column(token_t *token);
+
+void print_token(token_t *tok);
+---
+
+@s
+We'll need some types to represent the different kinds of tokens.
+--- Token Types
+typedef enum {
+ // Control Keywords
+ TOK_CTK_IF,
+ TOK_CTK_ELSE,
+ TOK_CTK_SWITCH,
+ TOK_CTK_CASE,
+ TOK_CTK_DEFAULT,
+ TOK_CTK_WHILE,
+ TOK_CTK_DO,
+ TOK_CTK_FOR,
+ TOK_CTK_CONTINUE,
+ TOK_CTK_BREAK,
+ TOK_CTK_RETURN,
+ TOK_CTK_GOTO,
+
+ // Type Keywords
+ TOK_TK_VOID,
+ TOK_TK_CHAR,
+ TOK_TK_SHORT,
+ TOK_TK_INT,
+ TOK_TK_LONG,
+ TOK_TK_FLOAT,
+ TOK_TK_DOUBLE,
+ TOK_TK_SIGNED,
+ TOK_TK_UNSIGNED,
+ TOK_TK_STRUCT,
+ TOK_TK_UNION,
+ TOK_TK_ENUM,
+ TOK_TK_TYPEDEF,
+
+ // Storage Class/Specifier Keywords
+ TOK_SCSK_AUTO,
+ TOK_SCSK_REGISTER,
+ TOK_SCSK_STATIC,
+ TOK_SCSK_EXTERN,
+ TOK_SCSK_CONST,
+ TOK_SCSK_VOLATILE,
+
+ // Misc Keywords
+ TOK_MK_SIZEOF,
+
+ // Operators
+ TOK_OP_ADD, // +
+ TOK_OP_SUB, // -
+ TOK_OP_MUL, // *
+ TOK_OP_DIV, // /
+ TOK_OP_MOD, // %
+ TOK_OP_BIT_AND, // &
+ TOK_OP_BIT_OR, // |
+ TOK_OP_BIT_XOR, // ^
+ TOK_OP_BIT_NOT, // ~
+ TOK_OP_LSHIFT, // <<
+ TOK_OP_RSHIFT, // >>
+ TOK_OP_NOT, // !
+ TOK_OP_ASSIGN, // =
+ TOK_OP_LT, // <
+ TOK_OP_GT, // >
+ TOK_OP_INC, // ++
+ TOK_OP_DEC, // --
+ TOK_OP_EQ, // ==
+ TOK_OP_NE, // !=
+ TOK_OP_LE, // <=
+ TOK_OP_GE, // >=
+ TOK_OP_AND, // &&
+ TOK_OP_OR, // ||
+ TOK_OP_MEMBER_POINTER, // ->
+ TOK_OP_MEMBER, // .
+ TOK_OP_COND_DECISION, // :
+ TOK_OP_COND, // ?
+ TOK_OP_ASSIGN_ADD, // +=
+ TOK_OP_ASSIGN_SUB, // -=
+ TOK_OP_ASSIGN_MUL, // *=
+ TOK_OP_ASSIGN_DIV, // /=
+ TOK_OP_ASSIGN_MOD, // %=
+ TOK_OP_ASSIGN_BITAND, // &=
+ TOK_OP_ASSIGN_BITOR, // |=
+ TOK_OP_ASSIGN_BITXOR, // ^=
+ TOK_OP_ASSIGN_LSHIFT, // <<=
+ TOK_OP_ASSIGN_RSHIFT, // >>=
+
+ // Separators
+ TOK_SEP_LEFT_PAREN, // (
+ TOK_SEP_RIGHT_PAREN, // )
+ TOK_SEP_LEFT_BRACKET, // [
+ TOK_SEP_RIGHT_BRACKET, // ]
+ TOK_SEP_LEFT_BRACE, // {
+ TOK_SEP_RIGHT_BRACE, // }
+ TOK_SEP_COMMA, // ,
+ TOK_SEP_SEMICOLON, // ;
+ TOK_SEP_DOT, // .
+ TOK_SEP_ELLIPSIS, // ...
+ TOK_SEP_HASH, // #
+
+ // Identifiers
+ TOK_ID,
+
+ // Constants
+ TOK_CONST_INTEGER_U32, // u
+ TOK_CONST_INTEGER_U64, // ul
+ TOK_CONST_INTEGER_S32, // (no suffix)
+ TOK_CONST_INTEGER_S64, // l
+ TOK_CONST_FLOAT_32, // f
+ TOK_CONST_FLOAT_64, // (no suffix)
+ TOK_CONST_CHAR, // 'c'
+ TOK_CONST_STRING_ASCII, // "string" (width of 8 bits)
+
+ // Special
+ TOK_SPECIAL_EOF,
+ TOK_SPECIAL_ERROR,
+} c_token_types;
+---
+
+@s
+We bring this all together in `token.h`. Line and column are exposed as global variables because `skip_whitespace` will need to update them.
+--- token.h
+#ifndef TOKEN_H
+#define TOKEN_H
+#include <stdint.h> // We use this for int64_t
+@{Token Types}
+@{Opaque Token Type}
+@{Token Creation and Destruction Interface}
+@{Token Interface}
+extern int column;
+extern int line;
+#endif
+---
+
+@s Token Implementation
+Now that we have the interface, we can implement the token data structure. We'll need a couple of things:
+* The token type.
+* A way to store extra data.
+* Implementations of the functions we defined in the interface.
+
+@s
+One problem is we haven't defined a way to verify that the token we're getting isn't corrupt. We'll use a tag for that.
+
+You might notice that a zero-length array is used in the token data structure. This is a GCC extension that allows us to allocate memory for the token data structure and the token data in one allocation. This is a bit of a hack, but it's a common pattern in C code.
+--- Token Data Structure
+#define TOK_MAGIC_1 0x544F4B454E544F4Bul // "TOKENTOK"
+#define TOK_MAGIC_2 0x544F4B544F4B454Eul // "TOKTOKEN"
+
+struct token {
+ long magic;
+ int line;
+ int column;
+ short kind;
+ long opt_data[0];
+};
+
+typedef struct token token_t;
+
+struct token_data {
+ union {
+ int64_t i;
+ double f;
+ const char *s;
+ char c;
+ } data;
+};
+
+typedef struct token_data token_data_t;
+int column = 1;
+int line = 1;
+---
+
+
+@s
+We'll need to implement an interface for accessing the token data and a macro for accessing optional data.
+--- Token Data Access
+#define token_data(token) ((struct token_data *)((token)->opt_data))
+
+c_token_types token_type(token_t *token) {
+ assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
+ return token->kind;
+}
+
+int64_t token_int(token_t *token) {
+ assert(token->kind == TOK_CONST_INTEGER_U32 ||
+ token->kind == TOK_CONST_INTEGER_U64 ||
+ token->kind == TOK_CONST_INTEGER_S32 ||
+ token->kind == TOK_CONST_INTEGER_S64);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.i;
+}
+
+double token_float(token_t *token) {
+ assert(token->kind == TOK_CONST_FLOAT_32 ||
+ token->kind == TOK_CONST_FLOAT_64);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.f;
+}
+
+const char *token_string(token_t *token) {
+ assert(token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.s;
+}
+
+char token_char(token_t *token) {
+ assert(token->kind == TOK_CONST_CHAR);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.c;
+}
+
+int token_line(token_t *token) {
+ assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
+ return token->line;
+}
+
+int token_column(token_t *token) {
+ assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
+ return token->column;
+}
+---
+
+@s
+For debugging, I'll add a function to print the token type.
+--- Token Debugging
+const char *token_name_from_type(c_token_types type) {
+ switch (type) {
+ case TOK_CTK_IF:
+ return "TOK_CTK_IF";
+ case TOK_CTK_ELSE:
+ return "TOK_CTK_ELSE";
+ case TOK_CTK_SWITCH:
+ return "TOK_CTK_SWITCH";
+ case TOK_CTK_CASE:
+ return "TOK_CTK_CASE";
+ case TOK_CTK_DEFAULT:
+ return "TOK_CTK_DEFAULT";
+ case TOK_CTK_WHILE:
+ return "TOK_CTK_WHILE";
+ case TOK_CTK_DO:
+ return "TOK_CTK_DO";
+ case TOK_CTK_FOR:
+ return "TOK_CTK_FOR";
+ case TOK_CTK_CONTINUE:
+ return "TOK_CTK_CONTINUE";
+ case TOK_CTK_BREAK:
+ return "TOK_CTK_BREAK";
+ case TOK_CTK_RETURN:
+ return "TOK_CTK_RETURN";
+ case TOK_CTK_GOTO:
+ return "TOK_CTK_GOTO";
+ case TOK_TK_VOID:
+ return "TOK_TK_VOID";
+ case TOK_TK_CHAR:
+ return "TOK_TK_CHAR";
+ case TOK_TK_SHORT:
+ return "TOK_TK_SHORT";
+ case TOK_TK_INT:
+ return "TOK_TK_INT";
+ case TOK_TK_LONG:
+ return "TOK_TK_LONG";
+ case TOK_TK_FLOAT:
+ return "TOK_TK_FLOAT";
+ case TOK_TK_DOUBLE:
+ return "TOK_TK_DOUBLE";
+ case TOK_TK_SIGNED:
+ return "TOK_TK_SIGNED";
+ case TOK_TK_UNSIGNED:
+ return "TOK_TK_UNSIGNED";
+ case TOK_TK_STRUCT:
+ return "TOK_TK_STRUCT";
+ case TOK_TK_UNION:
+ return "TOK_TK_UNION";
+ case TOK_TK_ENUM:
+ return "TOK_TK_ENUM";
+ case TOK_TK_TYPEDEF:
+ return "TOK_TK_TYPEDEF";
+ case TOK_SCSK_AUTO:
+ return "TOK_SCSK_AUTO";
+ case TOK_SCSK_REGISTER:
+ return "TOK_SCSK_REGISTER";
+ case TOK_SCSK_STATIC:
+ return "TOK_SCSK_STATIC";
+ case TOK_SCSK_EXTERN:
+ return "TOK_SCSK_EXTERN";
+ case TOK_SCSK_CONST:
+ return "TOK_SCSK_CONST";
+ case TOK_SCSK_VOLATILE:
+ return "TOK_SCSK_VOLATILE";
+ case TOK_MK_SIZEOF:
+ return "TOK_MK_SIZEOF";
+ case TOK_OP_ADD:
+ return "TOK_OP_ADD";
+ case TOK_OP_SUB:
+ return "TOK_OP_SUB";
+ case TOK_OP_MUL:
+ return "TOK_OP_MUL";
+ case TOK_OP_DIV:
+ return "TOK_OP_DIV";
+ case TOK_OP_MOD:
+ return "TOK_OP_MOD";
+ case TOK_OP_BIT_AND:
+ return "TOK_OP_BIT_AND";
+ case TOK_OP_BIT_OR:
+ return "TOK_OP_BIT_OR";
+ case TOK_OP_BIT_XOR:
+ return "TOK_OP_BIT_XOR";
+ case TOK_OP_BIT_NOT:
+ return "TOK_OP_BIT_NOT";
+ case TOK_OP_LSHIFT:
+ return "TOK_OP_LSHIFT";
+ case TOK_OP_RSHIFT:
+ return "TOK_OP_RSHIFT";
+ case TOK_OP_NOT:
+ return "TOK_OP_NOT";
+ case TOK_OP_ASSIGN:
+ return "TOK_OP_ASSIGN";
+ case TOK_OP_LT:
+ return "TOK_OP_LT";
+ case TOK_OP_GT:
+ return "TOK_OP_GT";
+ case TOK_OP_INC:
+ return "TOK_OP_INC";
+ case TOK_OP_DEC:
+ return "TOK_OP_DEC";
+ case TOK_OP_EQ:
+ return "TOK_OP_EQ";
+ case TOK_OP_NE:
+ return "TOK_OP_NE";
+ case TOK_OP_LE:
+ return "TOK_OP_LE";
+ case TOK_OP_GE:
+ return "TOK_OP_GE";
+ case TOK_OP_AND:
+ return "TOK_OP_AND";
+ case TOK_OP_OR:
+ return "TOK_OP_OR";
+ case TOK_OP_MEMBER_POINTER:
+ return "TOK_OP_MEMBER_POINTER";
+ case TOK_OP_MEMBER:
+ return "TOK_OP_MEMBER";
+ case TOK_OP_COND_DECISION:
+ return "TOK_OP_COND_DECISION";
+ case TOK_OP_COND:
+ return "TOK_OP_COND";
+ case TOK_OP_ASSIGN_ADD:
+ return "TOK_OP_ASSIGN_ADD";
+ case TOK_OP_ASSIGN_SUB:
+ return "TOK_OP_ASSIGN_SUB";
+ case TOK_OP_ASSIGN_MUL:
+ return "TOK_OP_ASSIGN_MUL";
+ case TOK_OP_ASSIGN_DIV:
+ return "TOK_OP_ASSIGN_DIV";
+ case TOK_OP_ASSIGN_MOD:
+ return "TOK_OP_ASSIGN_MOD";
+ case TOK_OP_ASSIGN_BITAND:
+ return "TOK_OP_ASSIGN_BITAND";
+ case TOK_OP_ASSIGN_BITOR:
+ return "TOK_OP_ASSIGN_BITOR";
+ case TOK_OP_ASSIGN_BITXOR:
+ return "TOK_OP_ASSIGN_BITXOR";
+ case TOK_OP_ASSIGN_LSHIFT:
+ return "TOK_OP_ASSIGN_LSHIFT";
+ case TOK_OP_ASSIGN_RSHIFT:
+ return "TOK_OP_ASSIGN_RSHIFT";
+ case TOK_SEP_HASH:
+ return "TOK_SEP_HASH";
+ case TOK_ID:
+ return "TOK_ID";
+ case TOK_CONST_INTEGER_U32:
+ return "TOK_CONST_INTEGER_U32";
+ case TOK_CONST_INTEGER_U64:
+ return "TOK_CONST_INTEGER_U64";
+ case TOK_CONST_INTEGER_S32:
+ return "TOK_CONST_INTEGER_S32";
+ case TOK_CONST_INTEGER_S64:
+ return "TOK_CONST_INTEGER_S64";
+ case TOK_CONST_FLOAT_32:
+ return "TOK_CONST_FLOAT_32";
+ case TOK_CONST_FLOAT_64:
+ return "TOK_CONST_FLOAT_64";
+ case TOK_CONST_CHAR:
+ return "TOK_CONST_CHAR";
+ case TOK_CONST_STRING_ASCII:
+ return "TOK_CONST_STRING_ASCII";
+ case TOK_SPECIAL_EOF:
+ return "TOK_SPECIAL_EOF";
+ case TOK_SPECIAL_ERROR:
+ return "TOK_SPECIAL_ERROR";
+ case TOK_SEP_LEFT_PAREN:
+ return "TOK_SEP_LEFT_PAREN";
+ case TOK_SEP_RIGHT_PAREN:
+ return "TOK_SEP_RIGHT_PAREN";
+ case TOK_SEP_LEFT_BRACKET:
+ return "TOK_SEP_LEFT_BRACKET";
+ case TOK_SEP_RIGHT_BRACKET:
+ return "TOK_SEP_RIGHT_BRACKET";
+ case TOK_SEP_LEFT_BRACE:
+ return "TOK_SEP_LEFT_BRACE";
+ case TOK_SEP_RIGHT_BRACE:
+ return "TOK_SEP_RIGHT_BRACE";
+ case TOK_SEP_COMMA:
+ return "TOK_SEP_COMMA";
+ case TOK_SEP_SEMICOLON:
+ return "TOK_SEP_SEMICOLON";
+ case TOK_SEP_DOT:
+ return "TOK_SEP_DOT";
+ case TOK_SEP_ELLIPSIS:
+ return "TOK_SEP_ELLIPSIS";
+ }
+ return "UNKNOWN";
+}
+
+char *re_escape_string(const char *str) {
+ int len = strlen(str);
+ char *buf = malloc(len * 2 + 1);
+ if (buf == NULL) {
+ fprintf(stderr, "Out of memory. Cannot escape string\n");
+ exit(1);
+ }
+ int i = 0;
+ for (int j = 0; j < len; j++) {
+ switch (str[j]) {
+ case '\a':
+ buf[i++] = '\\';
+ buf[i++] = 'a';
+ break;
+ case '\b':
+ buf[i++] = '\\';
+ buf[i++] = 'b';
+ break;
+ case '\f':
+ buf[i++] = '\\';
+ buf[i++] = 'f';
+ break;
+ case '\n':
+ buf[i++] = '\\';
+ buf[i++] = 'n';
+ break;
+ case '\r':
+ buf[i++] = '\\';
+ buf[i++] = 'r';
+ break;
+ case '\t':
+ buf[i++] = '\\';
+ buf[i++] = 't';
+ break;
+ case '\v':
+ buf[i++] = '\\';
+ buf[i++] = 'v';
+ break;
+ case '\\':
+ buf[i++] = '\\';
+ buf[i++] = '\\';
+ break;
+ case '\'':
+ buf[i++] = '\\';
+ buf[i++] = '\'';
+ break;
+ case '"':
+ buf[i++] = '\\';
+ buf[i++] = '"';
+ break;
+ default:
+ buf[i++] = str[j];
+ break;
+ }
+ }
+ buf[i] = '\0';
+ return buf;
+}
+
+void print_token(token_t *tok) {
+ if (tok == NULL) {
+ printf("NULL\n");
+ return;
+ }
+ const char *name = token_name_from_type(tok->kind);
+ switch (tok->kind) {
+ case TOK_ID:
+ case TOK_CONST_STRING_ASCII: {
+ char *escaped = re_escape_string(token_string(tok));
+ printf("%s: \"%s\"@%d:%d\n", name, escaped, tok->line, tok->column);
+ free(escaped);
+ break;
+ }
+ case TOK_CONST_CHAR:
+ printf("%s: '%c'@%d:%d\n", name, token_char(tok), tok->line, tok->column);
+ break;
+ case TOK_CONST_INTEGER_S32:
+ case TOK_CONST_INTEGER_U32:
+ case TOK_CONST_INTEGER_S64:
+ case TOK_CONST_INTEGER_U64:
+ printf("%s: %ld@%d:%d\n", name, token_int(tok), tok->line, tok->column);
+ break;
+ case TOK_CONST_FLOAT_32:
+ case TOK_CONST_FLOAT_64:
+ printf("%s: %f@%d:%d\n", name, token_float(tok), tok->line, tok->column);
+ break;
+ default:
+ printf("%s@%d:%d\n", name, tok->line, tok->column);
+ break;
+ }
+}
+---
+
+@s
+Now we can implement the functions we defined in the interface.
+--- Token Creation and Destruction
+token_t *token_data_create(c_token_types kind, int lin, int col, int len) {
+ token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data));
+ if (token == NULL) {
+ fputs("Out of memory\n", stderr);
+ exit(1);
+ }
+ token->magic = TOK_MAGIC_1;
+ token->line = lin;
+ token->column = col;
+ column += len;
+ token->kind = kind;
+ return token;
+}
+
+token_t *token_create(c_token_types kind, int lin, int col, int len) {
+ token_t *token = malloc(sizeof(token_t));
+ if (token == NULL) {
+ fputs("Out of memory\n", stderr);
+ exit(1);
+ }
+ token->magic = TOK_MAGIC_2;
+ token->line = lin;
+ token->column = col;
+ column += len;
+ token->kind = kind;
+ return token;
+}
+
+token_t *token_create_int(c_token_types kind, int lin, int col, int64_t i, int len) {
+ token_t *token = token_data_create(kind, lin, col, len);
+ token_data(token)->data.i = i;
+ return token;
+}
+
+token_t *token_create_float(c_token_types kind, int lin, int col, double f, int len) {
+ token_t *token = token_data_create(kind, lin, col, len);
+ token_data(token)->data.f = f;
+ return token;
+}
+
+token_t *token_create_char(c_token_types kind, int lin, int col, char c, int len) {
+ token_t *token = token_data_create(kind, lin, col, len);
+ token_data(token)->data.c = c;
+ return token;
+}
+
+void token_destroy(token_t *token) {
+ if (token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2) {
+ if (token->kind == TOK_CONST_STRING_ASCII) {
+ free((char *)token_data(token)->data.s);
+ }
+ free(token);
+ } else {
+ fputs("Corrupt token\n", stderr);
+ exit(1);
+ }
+}
+---
+
+@s
+`token_create_string` can be implemented either the easy way or the right way. Let's try the easy way.
+--- Token Create String
+token_t *token_create_string(c_token_types kind, int lin, int col, const char *s, int len) {
+ token_t *token = token_create(kind, lin, col, len);
+ token_data(token)->data.s = strdup(s);
+ return token;
+}
+---
+
+@s
+There's an issue with this approach. `token_create_string` will be called for every identifier and every string in a program. Imagine a large program, say a shell, with a bunch of user input and output. That program will likely have 20-40 calls each to `fprintf`, `fscanf`, `strchr`, and `strtok`. We create a new string for each of those calls. That's a lot of duplicates, and can quickly add up to a lot of memory usage.
+
+To fix this, we use a hash table to store the strings. We'll define a hash table in `hash_table.h` and `hash_table.c`.
+
+@s Hash Table
+A hash table is a data structure that maps keys to values. It's commonly used to store information, such as variables and functions in a symbol table. To implement a generic hash table, we'll need several things:
+* A function to hash the keys.
+* A function to compare keys.
+* An opaque type for the hash table.
+* A function to destroy deleted keys and values.
+
+Let's start with the interface.
+
+@s
+--- Hash Table Opaque Types
+typedef struct hash_table hash_table_t;
+typedef int (*hash_table_cmp_fn)(void *key1, void *key2);
+typedef unsigned int (*hash_table_hash_fn)(void *key);
+typedef void (*hash_table_dtor)(void *value, int is_key);
+---
+
+@s
+--- Hash Table Creation and Destruction
+hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash, hash_table_dtor dtor);
+void hash_table_destroy(hash_table_t *table);
+---
+
+@s
+--- Hash Table Access
+void *hash_table_get(hash_table_t *table, void *key);
+void hash_table_put(hash_table_t *table, void *key, void *value);
+void hash_table_remove(hash_table_t *table, void *key);
+---
+
+@s
+--- hash_table.h
+#ifndef HASH_TABLE_H
+#define HASH_TABLE_H
+@{Hash Table Opaque Types}
+@{Hash Table Creation and Destruction}
+@{Hash Table Access}
+#endif
+---
+
+@s
+Let's implement the hash table now.
+
+--- hash_table.c
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "hash_table.h"
+
+@{Hash Table Data Structure}
+@{Hash Table Entry Data Structure}
+
+hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash, hash_table_dtor dtor) {
+ @{Allocate and Initialize Hash Table}
+ return table;
+}
+
+void hash_table_destroy(hash_table_t *table) {
+ @{Destroy Entries}
+ free(table->entries);
+ free(table);
+}
+
+void *hash_table_get(hash_table_t *table, void *key) {
+ @{Get Entry By Hash}
+ @{Loop Through Entries and Return Value if Match}
+ return NULL;
+}
+
+void hash_table_put(hash_table_t *table, void *key, void *value) {
+ @{Get Entry By Hash}
+ @{Loop Through Entries and Replace Value if Key Matches}
+ @{Allocate New Entry if No Match}
+}
+
+void hash_table_remove(hash_table_t *table, void *key) {
+ @{Get Entry By Hash}
+ @{Loop Through Entries and Remove Entry if Key Matches}
+}
+
+#ifdef TEST_HASH_TABLE
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
/* Key-equality callback for the self-test; 0 means the keys are equal. */
int string_cmp(void *key1, void *key2) {
  const char *a = key1;
  const char *b = key2;
  return strcmp(a, b);
}
+
/* djb2 string hash (Bernstein): hash = hash * 33 + c, seeded with 5381.
 * Fix: the return type must be unsigned int to match the
 * hash_table_hash_fn typedef — returning unsigned long made &string_hash
 * an incompatible function pointer when passed to hash_table_create. */
unsigned int string_hash(void *key) {
  unsigned int hash = 5381;
  for (char *str = key; *str != '\0'; str++) {
    hash = ((hash << 5) + hash) + *str; /* hash * 33 + *str */
  }
  return hash;
}
+
+int main() {
+ hash_table_t *table = hash_table_create(16, string_cmp, string_hash, NULL);
+ hash_table_put(table, "foo", "bar");
+ hash_table_put(table, "foo", "baz");
+ assert(strcmp((char *)hash_table_get(table, "foo"), "baz") == 0);
+ hash_table_remove(table, "foo");
+ assert(hash_table_get(table, "foo") == NULL);
+ hash_table_destroy(table);
+ return 0;
+}
+#endif
+---
+
+@s
+For the hash table data structure, we'll define a pointer to an array of entries, the size of the array, and the hash/comparison functions.
+
+--- Hash Table Data Structure
/* Separate-chaining hash table, generic over key/value types via the
 * caller-supplied hash/compare/destroy callbacks. */
struct hash_table {
  struct hash_table_entry **entries; /* bucket array, length `size` */
  int size;                          /* bucket count, fixed at creation */
  hash_table_cmp_fn cmp;             /* key equality: 0 means match */
  hash_table_hash_fn hash;           /* key -> hash; bucket = hash % size */
  hash_table_dtor dtor;              /* frees keys/values; may be NULL */
};
+---
+
+@s
+Entries in the hash table will have a key, a value, and a link to the next entry in the chain.
+
+--- Hash Table Entry Data Structure
/* One link in a bucket's singly-linked collision chain. */
struct hash_table_entry {
  void *key;   /* owned per the table's dtor contract */
  void *value; /* owned per the table's dtor contract */
  struct hash_table_entry *next; /* next entry in the same bucket, or NULL */
};
+---
+
+@s
+Allocating a hash table involves allocating memory for the hash table itself and the entries, zeroing out the entries, and setting the hash and comparison functions.
+
+--- Allocate and Initialize Hash Table
/* Allocate the table and its bucket array. On OOM we report and exit(1)
 * rather than return NULL — the file-wide allocation policy. */
hash_table_t *table = malloc(sizeof(struct hash_table));
if (table == NULL) {
  fputs("Out of memory, could not allocate hash table\n", stderr);
  exit(1);
}
/* calloc zeroes the bucket pointers, so every chain starts empty. */
table->entries = calloc(size, sizeof(struct hash_table_entry *));
if (table->entries == NULL) {
  fputs("Out of memory, could not allocate hash table entries\n", stderr);
  exit(1);
}
table->size = size;
table->cmp = cmp;
table->hash = hash;
table->dtor = dtor;
+---
+
+@s
+To destroy a hash table, we loop through the entries, freeing the keys and values, and then free the entries and the table itself.
+
+--- Destroy Entries
/* Walk every bucket chain, destroying keys and values through the user's
 * dtor (when one was supplied) and freeing each link. `next` is saved
 * before free() so the walk never reads freed memory. */
for (int i = 0; i < table->size; i++) {
  struct hash_table_entry *entry = table->entries[i];
  while (entry != NULL) {
    struct hash_table_entry *next = entry->next;
    if (table->dtor != NULL) {
      table->dtor(entry->key, 1);   /* is_key = 1 */
      table->dtor(entry->value, 0); /* is_key = 0 */
    }
    free(entry);
    entry = next;
  }
}
+---
+
+@s
+To get an entry from the hash table, we hash the key, loop through the entries, and return the value if we find a match.
+
+--- Get Entry By Hash
/* Map the key to its bucket; `entry` heads that bucket's chain.
 * NOTE(review): assumes size > 0 — a zero-sized table would divide by zero
 * here; confirm callers never create one. */
unsigned int hash = table->hash(key) % table->size;
struct hash_table_entry *entry = table->entries[hash];
+---
+
+@s
+To put an entry in the hash table, we hash the key, loop through the entries, and replace the value if we find a match.
+
+--- Loop Through Entries and Replace Value if Key Matches
/* Linear scan of the bucket chain; on a key match, overwrite the value.
 * NOTE(review): the old value is replaced without invoking the dtor, and
 * the caller's duplicate key is not stored — callers that heap-allocate
 * keys/values must account for this (token_create_string does a get
 * before its put, so it never hits this path with a fresh allocation). */
while (entry != NULL) {
  if (table->cmp(entry->key, key) == 0) {
    entry->value = value;
    return;
  }
  entry = entry->next;
}
+---
+
+@s
+If we don't find a match, we allocate a new entry, set the key and value, and insert it at the head of the linked list.
+
+This exploits a property in computer science called locality of reference. The gist of that is that when you write to a piece of memory, you're likely to read from it again soon. By putting the new entry at the head of the linked list, we increase the chances that we'll find it quickly next time.
+
+--- Allocate New Entry if No Match
/* No existing entry matched: prepend a new link to the bucket chain.
 * Head insertion keeps recently added keys fastest to find. */
struct hash_table_entry *new_entry = malloc(sizeof(struct hash_table_entry));
if (new_entry == NULL) {
  fputs("Out of memory, could not allocate hash table entry\n", stderr);
  exit(1);
}
new_entry->key = key;
new_entry->value = value;
new_entry->next = table->entries[hash]; /* old head becomes second */
table->entries[hash] = new_entry;
+---
+
+@s
+To remove an entry from the hash table, we hash the key, loop through the entries, and remove the entry if we find a match.
+
+--- Loop Through Entries and Remove Entry if Key Matches
/* Unlink and free the first entry whose key matches. `prev` tracks the
 * previous link so the chain can be re-spliced; a NULL prev means the
 * match was the bucket head. Keys/values go through the dtor if set. */
struct hash_table_entry *prev = NULL;
while (entry != NULL) {
  if (table->cmp(entry->key, key) == 0) {
    if (prev == NULL) {
      table->entries[hash] = entry->next; /* removing the head */
    } else {
      prev->next = entry->next;
    }
    if (table->dtor != NULL) {
      table->dtor(entry->key, 1);
      table->dtor(entry->value, 0);
    }
    free(entry);
    return; /* at most one entry per key, so we can stop */
  }
  prev = entry;
  entry = entry->next;
}
+---
+
+@s
+To find a value associated with a given key in the hash table, we hash the string, loop through the entries, and return the value if a match is found.
+
+--- Loop Through Entries and Return Value if Match
/* Scan the bucket chain and return the stored value on a key match;
 * falling off the end means "not found" (the caller returns NULL). */
while (entry != NULL) {
  if (table->cmp(entry->key, key) == 0) {
    return entry->value;
  }
  entry = entry->next;
}
+---
+
+@s
+We're now almost ready to implement `token_create_string` the right way. First, we'll need a good hash function.
+
+Hash functions are a very interesting topic and there's a lot of good research on them. The hash function we use should be fast, have a low collision rate, and be able to handle strings of any length.
+
+We can't just sum the characters in a string, because that would mean that "stop" and "pots" would have the same hash. Multiplying has the same problem. If we take each to the power of its position in the string, we get a better distribution, but it's still awful.
+
+Using a simple Python program, I brute-forced all possible 4-character strings and ran our power-hash function on them. The result showed that for 456,976 possible strings, only 376 distinct hash values were produced. That's a collision rate of about 99.92%!
+
+Instead of trying to come up with a new hash function, we can use one that's been well-tested and is known to work well.
+
+The first time I wrote this, I used the hash function from the 'Red Dragon Book' (Compilers: Principles, Techniques, and Tools).
+--- Hash Function
/* PJW hash from the Red Dragon Book: shift in 4 bits per character and
 * fold the top nibble back into the low bits when it fills up.
 * Fix: the operators were written escaped (\<\<, \&, \>\>), which is not
 * valid C and is inconsistent with the unescaped ELFHash version that
 * replaces this chunk below. */
static unsigned long hash_string(void *key) {
  unsigned long hash = 0, g;
  char *p = key;
  while (*p) {
    hash = (hash << 4) + *p++;
    if ((g = hash & 0xf0000000) != 0) {
      hash ^= g >> 24;
      hash ^= g;
    }
  }
  return hash;
}
+---
+This version does extra work — a mask and two XORs — for every character, which adds up over a whole source file. We can do better. Let's use its descendant, ELFHash, from libc.
+
+As you can see in the code below, this function avoids extra operations and should be much faster.
+--- Hash Function :=
/* ELF hash as unrolled in libc. The first four characters can never set
 * the top nibble (4 bits in per character), so the unrolled prefix skips
 * the fold; only the tail loop folds the high bits back down, and the
 * final mask clears the leftover nibble. Exact statement order matters —
 * keep in sync with the glibc _dl_elf_hash reference. */
static unsigned int hash_string(void *key) {
  unsigned long hash = 0, hi = 0;
  char *p = key;
  hash = *p; /* empty string: hash stays 0 and we fall straight through */
  if (hash != 0 && p[1] != 0) {
    hash = (hash << 4) + p[1];
    if (p[2] != 0) {
      hash = (hash << 4) + p[2];
      if (p[3] != 0) {
        hash = (hash << 4) + p[3];
        if (p[4] != 0) {
          hash = (hash << 4) + p[4];
          p += 5;
          while (*p != 0) {
            hash = (hash << 4) + *p++;
            hi = hash & 0xf0000000l;   /* top nibble about to overflow */
            hash ^= hi >> 24;          /* fold it back into bits 4..7 */
          }
          hash &= 0x0fffffffl;         /* discard the leftover nibble */
        }
      }
    }
  }
  return hash;
}
+---
+
+@s
+We also need a comparison function for strings.
+--- String Comparison
/* Equality callback for the string table; 0 means the strings match. */
static int cmp_string(void *key1, void *key2) {
  const char *lhs = key1;
  const char *rhs = key2;
  return strcmp(lhs, rhs);
}
+---
+
+@s
+Finally, we'll need a destructor for entries.
+--- String Destructor
+static void dtor_string(void *value, int is_key) {
+ if (is_key) {
+ free(value); // Since the key and value are the same, we only need to free once.
+ }
+}
+---
+
+@s
+Now we can implement `token_create_string` the right way.
+
+You might notice that we're using the same key and value. This way of using a hash table is normally called a set. We're using it to store strings, but we could use it to store anything we want to deduplicate.
+--- Token Create String --- :=
+@{String Comparison}
+@{Hash Function}
+@{String Destructor}
hash_table_t *string_table; /* lazily created; lives for the whole process */
/* Intern `s`: identical strings share a single heap copy held by the
 * string table (used as a set: key == value).
 * NOTE(review): this calls token_data_create() while the earlier "easy"
 * version called token_create() — confirm which constructor is the real
 * one in the Token Creation interface. */
token_t *token_create_string(c_token_types kind, int lin, int col,
                             const char *s, int len) {
  if (string_table == NULL) {
    string_table = hash_table_create(2048, cmp_string, hash_string, dtor_string);
  }
  token_t *token = token_data_create(kind, lin, col, len);
  char *key = hash_table_get(string_table, (void *)s);
  if (key == NULL) {
    key = strdup(s); /* first sighting: the table owns this copy */
    hash_table_put(string_table, key, key);
  }
  token_data(token)->data.s = key; /* token borrows; table frees at exit */
  return token;
}
+---
+
+
+@s
+We'll add an external declaration for `string_table` in `token.h` so other programs can take advantage of it.
+--- token.h :=
+#ifndef TOKEN_H
+#define TOKEN_H
+#include <stdint.h> // We use this for int64_t
+#include "hash_table.h" // We need this for the string table
+@{Token Types}
+@{Opaque Token Type}
+@{Token Creation and Destruction Interface}
+@{Token Interface}
+extern hash_table_t *string_table;
+extern int column;
+extern int line;
+#endif
+---
+
+@s
+Finally, we implement the token data structure in `token.c`.
+--- token.c
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include "token.h"
+#include "hash_table.h"
+@{Token Data Structure}
+@{Token Data Access}
+@{Token Creation and Destruction}
+@{Token Create String}
+@{Token Debugging}
+---
+
+@s Input
+
+Input will provide a simple interface for reading characters from a file. The stream itself is deliberately hidden from the tokenizer, so that the tokenizer doesn't have to worry about buffering or anything like that.
+
+@s
+--- Input Interface
+void input_init(const char *filename);
+int input_getc(void);
+void input_ungetc(int c);
+void input_destroy(void);
+---
+
+When the program wants to start reading a file, it calls `input_init` with the filename. It can then call `input_getc` to get the next character in the file. If there's no more input, `input_getc` will return `EOF`.
+
+There's also an `input_ungetc` function, which allows the program to put a character back into the stream. I'll only allow one character to be put back, but that should be enough for the tokenizer.
+
+Finally, when the program is done reading the file, it should call `input_destroy` to clean up.
+
+@s Input Design Decisions
+
+Per rule 1, we're trying to keep memory usage low. That means that instead of reading the entire file into memory, we'll need to read it in chunks. There are a couple of choices for how to do this:
+
+1. **Read a line at a time**: This is a more natural approach, but it has two drawbacks. First, it requires a large buffer to store the line (C normally specifies `BUFSIZ` as 8192 bytes). Second, if the line is longer than `BUFSIZ`, we'll have to read the line in chunks anyway.
+
+2. **Choose some arbitrary buffer size and read that many bytes at a time**: This is the approach I'm going to take. It's a little less natural, but it's more memory efficient.
+
+Input will read chunks of 128 bytes at a time, reusing the same static buffer. This limitation is not visible to the tokenizer, which will only see the `input_getc` interface.
+
+When the buffer is exhausted, `input_getc` will call `nextline`, which will read the next chunk of the file.
+
+@s Input Implementation
+
+The implementation of the input module is pretty straightforward. We have the following data structures and defines as globals:
+
+@s
+--- Input Data
+#define CHUNK_SIZE 128
+static char buffer[CHUNK_SIZE];
+static int buffer_pos = 0;
+static int buffer_size = 0;
+static char unget_buffer_stack[8];
+static int unget_buffer_stack_pos = 0;
+
+static FILE *file = NULL;
+---
+
+When the program calls `input_init`, we open the file.
+
+@s
+--- Input Initialization
+void input_init(const char *filename) {
+ file = fopen(filename, "r");
+ if (file == NULL) {
+ fprintf(stderr, "Error: Cannot open file %s\n", filename);
+ exit(1);
+ }
+}
+---
+
+When the program calls `input_getc`, we return the next character in the buffer. If the buffer is exhausted, we call `nextline`. We also track the line and column.
+
+@s
+--- Input Get Character
+int input_getc(void) {
+ if (unget_buffer_stack_pos > 0) {
+ return unget_buffer_stack[--unget_buffer_stack_pos];
+ }
+ if (buffer_pos == buffer_size) {
+ buffer_size = fread(buffer, 1, CHUNK_SIZE, file);
+ buffer_pos = 0;
+ }
+ if (buffer_size == 0) {
+ return EOF;
+ }
+ char c = buffer[buffer_pos++];
+ return c;
+}
+---
+
+When the program calls `input_ungetc`, we save the character in the `unget_buffer`.
+
+@s
+--- Input Unget Character
+void input_ungetc(int c) {
+ unget_buffer_stack[unget_buffer_stack_pos++] = c;
+}
+---
+
+Since we're not using dynamic memory allocation, cleanup is pretty simple.
+
+@s
+--- Input Destroy
+void input_destroy(void) {
+ fclose(file);
+}
+---
+
+@s
+We put the whole thing together in `input.c`.
+--- input.c
+#include <stdio.h>
+#include <stdlib.h>
+#include "input.h"
+@{Input Data}
+@{Input Initialization}
+@{Input Get Character}
+@{Input Unget Character}
+@{Input Destroy}
+---
+
+@s
+We'll need an external declaration for `file` in `input.h` so other programs can take advantage of it.
+--- input.h
+#ifndef INPUT_H
+#define INPUT_H
+@{Input Interface}
+#endif
+---
+
+@s
+We'll implement the lexer interface in `tokenizer.h`
+--- tokenizer.h
+#ifndef TOKENIZER_H
+#define TOKENIZER_H
+#include "token.h"
+#include "input.h"
+@{Tokenization Interface}
+#endif
+---
+
+@s
+The tokenization interface will have a couple of functions. `next_token` will return the next token in the input stream, `init_tokenizer` will initialize the tokenizer, and `destroy_tokenizer` will clean up.
+
+We'll also have some helper functions for lookahead and matching.
+
+`peek_token` will return the next token without consuming it (it technically does advance the input stream, but it saves the token so it can be reused).
+
+`consume` will consume the next token if it matches a given kind. If it doesn't match, it will print an error message and exit.
+--- Tokenization Interface
+void init_tokenizer(const char *filename);
+void destroy_tokenizer(void);
+token_t *next_token(void);
+void reject_token(token_t *token);
+token_t *peek_token(void);
+void consume(c_token_types kind);
+void consume_alt(c_token_types *kinds, int n);
+---
+
+@s
+Now we can finally implement the tokenizer.
+--- tokenizer.c
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <float.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "tokenizer.h"
+#include "token.h"
+#include "input.h"
+token_t *left_stack[8];
+int left_stack_pos = 0;
+@{Utility Functions}
+@{Tokenization Function}
+---
+
+@s
+Utility functions are everything that doesn't directly tokenize the input.
+--- Utility Functions
/* Point the tokenizer at `filename`; thin wrapper over the input module. */
void init_tokenizer(const char *filename) { input_init(filename); }
+
/* Release the tokenizer's input stream. */
void destroy_tokenizer(void) { input_destroy(); }
+
+void reject_token(token_t *token) {
+ left_stack[left_stack_pos++] = token;
+}
+
+token_t *peek_token(void) {
+ if (left_stack_pos > 0) {
+ return left_stack[left_stack_pos - 1];
+ }
+ token_t *token = next_token();
+ reject_token(token);
+ return token;
+}
+
+@{Stringify Type}
+
+void consume(c_token_types kind) {
+ token_t *token = next_token();
+ if (token_type(token) != kind) {
+ fprintf(stderr, "Error: Expected token of type \"%s\", got \"%s\"\n", stringify_type(kind), stringify_type(token_type(token)));
+ exit(1);
+ }
+ token_destroy(token);
+}
+
+void consume_alt(c_token_types *kinds, int n) {
+ token_t *token = next_token();
+ for (int i = 0; i < n; i++) {
+ if (token_type(token) == kinds[i]) {
+ token_destroy(token);
+ return;
+ }
+ }
+ fprintf(stderr, "Error: Expected one of the following tokens: ");
+ for (int i = 0; i < n; i++) {
+ fprintf(stderr, "\"%s\" ", stringify_type(kinds[i]));
+ }
+ fprintf(stderr, "got \"%s\"\n", stringify_type(token_type(token)));
+ exit(1);
+}
+---
+
+@s
+We'll need a helper function to convert token types to strings. It's pretty simple, just tedious.
+--- Stringify Type
/* Human-readable name for a token type, for diagnostics. All integer
 * constant widths collapse to one label (likewise floats); anything
 * outside the enum falls through to "UNKNOWN". */
const char *stringify_type(c_token_types type) {
  switch (type) {
  /* Control-flow keywords */
  case TOK_CTK_IF:
    return "if";
  case TOK_CTK_ELSE:
    return "else";
  case TOK_CTK_SWITCH:
    return "switch";
  case TOK_CTK_CASE:
    return "case";
  case TOK_CTK_DEFAULT:
    return "default";
  case TOK_CTK_WHILE:
    return "while";
  case TOK_CTK_DO:
    return "do";
  case TOK_CTK_FOR:
    return "for";
  case TOK_CTK_CONTINUE:
    return "continue";
  case TOK_CTK_BREAK:
    return "break";
  case TOK_CTK_RETURN:
    return "return";
  case TOK_CTK_GOTO:
    return "goto";
  /* Type keywords */
  case TOK_TK_VOID:
    return "void";
  case TOK_TK_CHAR:
    return "char";
  case TOK_TK_SHORT:
    return "short";
  case TOK_TK_INT:
    return "int";
  case TOK_TK_LONG:
    return "long";
  case TOK_TK_FLOAT:
    return "float";
  case TOK_TK_DOUBLE:
    return "double";
  case TOK_TK_SIGNED:
    return "signed";
  case TOK_TK_UNSIGNED:
    return "unsigned";
  case TOK_TK_STRUCT:
    return "struct";
  case TOK_TK_UNION:
    return "union";
  case TOK_TK_ENUM:
    return "enum";
  case TOK_TK_TYPEDEF:
    return "typedef";
  /* Storage-class and qualifier keywords */
  case TOK_SCSK_AUTO:
    return "auto";
  case TOK_SCSK_REGISTER:
    return "register";
  case TOK_SCSK_STATIC:
    return "static";
  case TOK_SCSK_EXTERN:
    return "extern";
  case TOK_SCSK_CONST:
    return "const";
  case TOK_SCSK_VOLATILE:
    return "volatile";
  case TOK_MK_SIZEOF:
    return "sizeof";
  /* Operators */
  case TOK_OP_ADD:
    return "+";
  case TOK_OP_SUB:
    return "-";
  case TOK_OP_MUL:
    return "*";
  case TOK_OP_DIV:
    return "/";
  case TOK_OP_MOD:
    return "%";
  case TOK_OP_BIT_AND:
    return "&";
  case TOK_OP_BIT_OR:
    return "|";
  case TOK_OP_BIT_XOR:
    return "^";
  case TOK_OP_BIT_NOT:
    return "~";
  case TOK_OP_LSHIFT:
    return "<<";
  case TOK_OP_RSHIFT:
    return ">>";
  case TOK_OP_NOT:
    return "!";
  case TOK_OP_ASSIGN:
    return "=";
  case TOK_OP_LT:
    return "<";
  case TOK_OP_GT:
    return ">";
  case TOK_OP_INC:
    return "++";
  case TOK_OP_DEC:
    return "--";
  case TOK_OP_EQ:
    return "==";
  case TOK_OP_NE:
    return "!=";
  case TOK_OP_LE:
    return "<=";
  case TOK_OP_GE:
    return ">=";
  case TOK_OP_AND:
    return "&&";
  case TOK_OP_OR:
    return "||";
  case TOK_OP_MEMBER_POINTER:
    return "->";
  case TOK_OP_MEMBER:
    return ".";
  case TOK_OP_COND_DECISION:
    return ":";
  case TOK_OP_COND:
    return "?";
  /* Compound assignment operators */
  case TOK_OP_ASSIGN_ADD:
    return "+=";
  case TOK_OP_ASSIGN_SUB:
    return "-=";
  case TOK_OP_ASSIGN_MUL:
    return "*=";
  case TOK_OP_ASSIGN_DIV:
    return "/=";
  case TOK_OP_ASSIGN_MOD:
    return "%=";
  case TOK_OP_ASSIGN_BITAND:
    return "&=";
  case TOK_OP_ASSIGN_BITOR:
    return "|=";
  case TOK_OP_ASSIGN_BITXOR:
    return "^=";
  case TOK_OP_ASSIGN_LSHIFT:
    return "<<=";
  case TOK_OP_ASSIGN_RSHIFT:
    return ">>=";
  /* Separators, identifiers, constants, specials */
  case TOK_SEP_HASH:
    return "#";
  case TOK_ID:
    return "identifier";
  case TOK_CONST_INTEGER_U32: /* all widths share one diagnostic label */
  case TOK_CONST_INTEGER_U64:
  case TOK_CONST_INTEGER_S32:
  case TOK_CONST_INTEGER_S64:
    return "integer constant";
  case TOK_CONST_FLOAT_32:
  case TOK_CONST_FLOAT_64:
    return "floating constant";
  case TOK_CONST_CHAR:
    return "character constant";
  case TOK_CONST_STRING_ASCII:
    return "string constant";
  case TOK_SPECIAL_EOF:
    return "EOF";
  case TOK_SPECIAL_ERROR:
    return "error";
  case TOK_SEP_LEFT_PAREN:
    return "(";
  case TOK_SEP_RIGHT_PAREN:
    return ")";
  case TOK_SEP_LEFT_BRACKET:
    return "[";
  case TOK_SEP_RIGHT_BRACKET:
    return "]";
  case TOK_SEP_LEFT_BRACE:
    return "{";
  case TOK_SEP_RIGHT_BRACE:
    return "}";
  case TOK_SEP_COMMA:
    return ",";
  case TOK_SEP_SEMICOLON:
    return ";";
  case TOK_SEP_DOT:
    return ".";
  case TOK_SEP_ELLIPSIS:
    return "...";
  }
  return "UNKNOWN";
}
+---
+
+@s
+Now we can implement the tokenization function. The pattern is pretty simple: we call each of the tokenization functions in turn until we find a match. If we don't find a match, we print an error message and exit.
+You might wonder why skip_whitespace can return a token. This makes handling the divide operator easier as comments also start with a slash.
+--- Tokenization Function
+char file_name[1024];
+@{Warning/Error Functions}
+@{Skip Whitespace}
+@{Tokenize Identifier}
+@{Tokenize Number}
+@{Tokenize String}
+@{Tokenize Character}
+@{Tokenize Operator}
/* Produce the next token: serve any pushed-back token first, then try each
 * recognizer in order. skip_whitespace may itself yield a token ('/' and
 * '/=') because comments also begin with a slash. Unrecognized characters
 * are warned about and skipped via tail recursion. */
token_t *next_token(void) {
  if (left_stack_pos > 0) {
    return left_stack[--left_stack_pos]; /* LIFO pushback */
  }
  token_t *tok = skip_whitespace();
  if (tok != NULL) {
    return tok;
  }
  tok = read_identifier();
  if (tok != NULL) {
    return tok;
  }
  tok = read_number();
  if (tok != NULL) {
    return tok;
  }
  tok = read_char_constant();
  if (tok != NULL) {
    return tok;
  }
  tok = read_string_literal();
  if (tok != NULL) {
    return tok;
  }
  tok = read_operator();
  if (tok != NULL) {
    return tok;
  }
  int c = input_getc();
  if (c == EOF) {
    /* NOTE(review): returns NULL at end of input even though
     * TOK_SPECIAL_EOF exists; consume()/peek_token() would then hand NULL
     * to token_type() — confirm callers expect NULL here. */
    return NULL;
  }
  tok_warn(
      "Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c,
      line, column);
  return next_token();
}

#ifdef TEST_TOKENIZER
@{Run Test}
#endif
+
+#ifdef TEST_TOKENIZER
+@{Run Test}
+#endif
+---
+
+@s
+We'll need a couple of helper functions to skip whitespace and print warnings/errors.
+--- Warning/Error Functions
+void tok_error(const char *fmt, ...) {
+ va_list args;
+ va_start(args, fmt);
+ fprintf(stderr, "Error in file %s at line %d, column %d: ", file_name, line,
+ column);
+ vfprintf(stderr, fmt, args);
+ va_end(args);
+}
+
+void tok_warn(const char *fmt, ...) {
+ va_list args;
+ va_start(args, fmt);
+ fprintf(stderr, "Warning in file %s at line %d, column %d: ", file_name, line,
+ column);
+ vfprintf(stderr, fmt, args);
+ va_end(args);
+}
+---
+
+@s
+The `skip_whitespace` function is pretty simple. It just skips over any comments, whitespace, and line directives.
+--- Skip Whitespace
+static token_t *skip_whitespace(void) {
+ int c;
+ while ((c = input_getc()) != EOF) {
+ if (isspace(c)) { // Whitespace
+ if (c == '\n') {
+ line++;
+ column = 1;
+ } else {
+ column++;
+ }
+ } else if (c == '#') // GCC preprocessor line control directive.
+ {
+ char buf[512];
+ int i = 0;
+ while ((c = input_getc()) != EOF && c != '\n') {
+ buf[i++] = c;
+ column++;
+ }
+ buf[i] = '\0';
+ if (sscanf(buf, "%d \"%[^\"]\"", &line, file_name) == 2) {
+ column = 1;
+ } else {
+ tok_error("Invalid #line directive\n");
+ }
+ if (c == EOF) {
+ return NULL;
+ }
+ } else if (c == '/') { // Comment
+ c = input_getc();
+ if (c == '/') {
+ while ((c = input_getc()) != EOF && c != '\n') {
+ column++;
+ }
+ if (c == EOF) {
+ return NULL;
+ }
+ line++;
+ column = 1;
+ } else if (c == '*') { // Multiline comment
+ while ((c = input_getc()) != EOF) {
+ if (c == '*') {
+ c = input_getc();
+ if (c == '/') {
+ break;
+ }
+ } else if (c == '\n') {
+ line++;
+ column = 1;
+ } else {
+ column++;
+ }
+ }
+ if (c == EOF) {
+ return NULL;
+ }
+ } else { // Handled here to simplify the code.
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_DIV, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_DIV, line, column, 1);
+ }
+ } else {
+ input_ungetc(c);
+ return NULL;
+ }
+ }
+ return NULL;
+}
+---
+
+@s
+The `read_identifier` function reads an identifier from the input stream. C identifiers can contain letters, digits, and underscores, but they can't start with a digit.
+--- Tokenize Identifier
+@{Get Keyword}
/* Read an identifier ([A-Za-z_][A-Za-z0-9_]*) and classify it as a keyword
 * or TOK_ID. Returns NULL (first character pushed back) if no identifier
 * starts here. */
static token_t *read_identifier(void) {
  int c;
  char buf[1024];
  int i = 0;
  c = input_getc();
  if (!isalpha(c) && c != '_') {
    input_ungetc(c); /* NOTE(review): may push back EOF — relies on the
                        input layer round-tripping it; confirm */
    return NULL;
  }
  buf[i++] = c;
  while ((c = input_getc()) != EOF) {
    if (!isalnum(c) && c != '_') {
      input_ungetc(c);
      break;
    }
    buf[i++] = c;
    if (i >= 1008) { /* checked after the write; leaves slack in buf[1024] */
      tok_error("Identifier too long\n");
      exit(1);
    }
  }
  buf[i] = '\0';
  column += i; /* reported column is the END of the identifier */
  // Check if it's a keyword
  c_token_types kind = get_keyword(buf, i);
  if (kind != TOK_ID) {
    return token_create(kind, line, column, i); /* keywords carry no string */
  }
  return token_create_string(kind, line, column, buf, i); /* interned name */
}
+---
+
+@s
+The `get_keyword` function is a simple decision tree for identifying keywords. The code is pretty tedious, but it works.
+--- Get Keyword
+c_token_types get_keyword(const char *buf, int len) {
+ switch (buf[0]) {
+ case 'a':
+ if (len == 4 && buf[1] == 'u' && buf[2] == 't' && buf[3] == 'o')
+ return TOK_SCSK_AUTO;
+ break;
+
+ case 'b':
+ if (len == 5 && buf[1] == 'r' && buf[2] == 'e' && buf[3] == 'a' &&
+ buf[4] == 'k')
+ return TOK_CTK_BREAK;
+ break;
+
+ case 'c':
+ switch (buf[1]) {
+ case 'a':
+ if (len == 4 && buf[2] == 's' && buf[3] == 'e')
+ return TOK_CTK_CASE;
+ break;
+ case 'h':
+ if (len == 4 && buf[2] == 'a' && buf[3] == 'r')
+ return TOK_TK_CHAR;
+ break;
+ case 'o':
+ if (len == 5 && buf[2] == 'n' && buf[3] == 's' && buf[4] == 't')
+ return TOK_SCSK_CONST;
+ if (len == 8 && buf[2] == 'n' && buf[3] == 't' && buf[4] == 'i' &&
+ buf[5] == 'n' && buf[6] == 'u' && buf[7] == 'e')
+ return TOK_CTK_CONTINUE;
+ break;
+ }
+ break;
+
+ case 'd':
+ switch (buf[1]) {
+ case 'e':
+ if (len == 7 && buf[2] == 'f' && buf[3] == 'a' && buf[4] == 'u' &&
+ buf[5] == 'l' && buf[6] == 't')
+ return TOK_CTK_DEFAULT;
+ break;
+ case 'o':
+ if (len == 2 && buf[2] == '\0')
+ return TOK_CTK_DO;
+ if (len == 6 && buf[2] == 'u' && buf[3] == 'b' && buf[4] == 'l' &&
+ buf[5] == 'e')
+ return TOK_TK_DOUBLE;
+ break;
+ }
+ break;
+
+ case 'e':
+ switch (buf[1]) {
+ case 'l':
+ if (len == 4 && buf[2] == 's' && buf[3] == 'e')
+ return TOK_CTK_ELSE;
+ break;
+ case 'n':
+ if (len == 4 && buf[2] == 'u' && buf[3] == 'm')
+ return TOK_TK_ENUM;
+ break;
+ case 'x':
+ if (len == 6 && buf[2] == 't' && buf[3] == 'e' && buf[4] == 'r' &&
+ buf[5] == 'n')
+ return TOK_SCSK_EXTERN;
+ break;
+ }
+ break;
+
+ case 'f':
+ switch (buf[1]) {
+ case 'l':
+ if (len == 5 && buf[2] == 'o' && buf[3] == 'a' && buf[4] == 't')
+ return TOK_TK_FLOAT;
+ break;
+ case 'o':
+ if (len == 3 && buf[2] == 'r')
+ return TOK_CTK_FOR;
+ break;
+ }
+ break;
+
+ case 'g':
+ if (len == 4 && buf[1] == 'o' && buf[2] == 't' && buf[3] == 'o')
+ return TOK_CTK_GOTO;
+ break;
+
+ case 'i':
+ switch (buf[1]) {
+ case 'f':
+ if (len == 2 && buf[2] == '\0')
+ return TOK_CTK_IF;
+ break;
+ case 'n':
+ if (len == 3 && buf[2] == 't')
+ return TOK_TK_INT;
+ break;
+ }
+ break;
+
+ case 'l':
+ if (len == 4 && buf[1] == 'o' && buf[2] == 'n' && buf[3] == 'g')
+ return TOK_TK_LONG;
+ break;
+
+ case 'r':
+ switch (buf[1]) {
+ case 'e':
+ if (len == 8 && buf[2] == 'g' && buf[3] == 'i' && buf[4] == 's' &&
+ buf[5] == 't' && buf[6] == 'e' && buf[7] == 'r')
+ return TOK_SCSK_REGISTER;
+ if (len == 6 && buf[2] == 't' && buf[3] == 'u' && buf[4] == 'r' &&
+ buf[5] == 'n')
+ return TOK_CTK_RETURN;
+ break;
+ }
+ break;
+
+ case 's':
+ switch (buf[1]) {
+ case 'h':
+ if (len == 5 && buf[2] == 'o' && buf[3] == 'r' && buf[4] == 't')
+ return TOK_TK_SHORT;
+ break;
+ case 't':
+ if (len == 6 && buf[2] == 'a' && buf[3] == 't' && buf[4] == 'i' &&
+ buf[5] == 'c')
+ return TOK_SCSK_STATIC;
+ break;
+ case 'i':
+ if (len == 6 && buf[2] == 'g' && buf[3] == 'n' && buf[4] == 'e' &&
+ buf[5] == 'd')
+ return TOK_TK_SIGNED;
+ if (len == 6 && buf[2] == 'z' && buf[3] == 'e' && buf[4] == 'o' &&
+ buf[5] == 'f')
+ return TOK_MK_SIZEOF;
+ break;
+ case 'r':
+ if (len == 6 && buf[2] == 'u' && buf[3] == 'c' && buf[4] == 't')
+ return TOK_TK_STRUCT;
+ break;
+ case 'w':
+ if (len == 6 && buf[2] == 'i' && buf[3] == 't' && buf[4] == 'c' &&
+ buf[5] == 'h')
+ return TOK_CTK_SWITCH;
+ break;
+ }
+ break;
+
+ case 't':
+ if (len == 7 && buf[1] == 'y' && buf[2] == 'p' && buf[3] == 'e' &&
+ buf[4] == 'd' && buf[5] == 'e' && buf[6] == 'f')
+ return TOK_TK_TYPEDEF;
+ break;
+
+ case 'u':
+ switch (buf[1]) {
+ case 'n':
+ if (len == 5 && buf[2] == 'i' && buf[3] == 'o' && buf[4] == 'n')
+ return TOK_TK_UNION;
+ if (len == 8 && buf[2] == 's' && buf[3] == 'i' && buf[4] == 'g' &&
+ buf[5] == 'n' && buf[6] == 'e' && buf[7] == 'd')
+ return TOK_TK_UNSIGNED;
+ break;
+ }
+ break;
+
+ case 'v':
+ switch (buf[1]) {
+ case 'o':
+ if (len == 4 && buf[2] == 'i' && buf[3] == 'd')
+ return TOK_TK_VOID;
+ if (len == 8 && buf[2] == 'l' && buf[3] == 'a' && buf[4] == 't' &&
+ buf[5] == 'i' && buf[6] == 'l' && buf[7] == 'e')
+ return TOK_SCSK_VOLATILE;
+ break;
+ }
+ break;
+
+ case 'w':
+ if (len == 5 && buf[1] == 'h' && buf[2] == 'i' && buf[3] == 'l' &&
+ buf[4] == 'e')
+ return TOK_CTK_WHILE;
+ break;
+
+ default:
+ return TOK_ID;
+ }
+ return TOK_ID;
+}
+---
+
+@s
+The `read_operator` function works similarly to the `read_identifier` function. It uses a decision tree to identify operators.
+--- Tokenize Operator
+
+token_t *read_operator(void) {
+ int c;
+ c = input_getc();
+ switch (c) {
+ case '!': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_NE, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_NOT, line, column, 1);
+ }
+ case '%': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_MOD, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_MOD, line, column, 1);
+ }
+ case '&': {
+ c = input_getc();
+ if (c == '&')
+ return token_create(TOK_OP_AND, line, column, 2);
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_BITAND, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_BIT_AND, line, column, 1);
+ }
+ case '(':
+ return token_create(TOK_SEP_LEFT_PAREN, line, column, 1);
+ case ')':
+ return token_create(TOK_SEP_RIGHT_PAREN, line, column, 1);
+ case '*': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_MUL, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_MUL, line, column, 1);
+ }
+ case '+': {
+ c = input_getc();
+ if (c == '+')
+ return token_create(TOK_OP_INC, line, column, 2);
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_ADD, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_ADD, line, column, 2);
+ }
+ case ',':
+ return token_create(TOK_SEP_COMMA, line, column, 1);
+ case '-': {
+ c = input_getc();
+ if (c == '-')
+ return token_create(TOK_OP_DEC, line, column, 2);
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_SUB, line, column, 2);
+ if (c == '>')
+ return token_create(TOK_OP_MEMBER_POINTER, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_SUB, line, column, 1);
+ }
+ case '.': {
+ c = input_getc();
+ if (c == '.') {
+ c = input_getc();
+ if (c == '.') {
+ return token_create(TOK_SEP_ELLIPSIS, line, column, 3);
+ } else {
+ // Bail out, can't store more than one unget
+ tok_error("Unexpected character '.' at line %d, column %d\n", line,
+ column);
+ exit(1);
+ }
+ }
+ return token_create('.', line, column, 1);
+ }
+ case '/': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_DIV, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_DIV, line, column, 1);
+ }
+ case ':':
+ return token_create(TOK_OP_COND_DECISION, line, column, 1);
+ case ';':
+ return token_create(TOK_SEP_SEMICOLON, line, column, 1);
+ case '<': {
+ c = input_getc();
+ if (c == '<') {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_LSHIFT, line, column, 3);
+ input_ungetc(c);
+ return token_create(TOK_OP_LSHIFT, line, column, 2);
+ }
+ if (c == '=')
+ return token_create(TOK_OP_LE, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_LT, line, column, 1);
+ }
+ case '=': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_ASSIGN, line, column, 1);
+ }
+ case '>': {
+ c = input_getc();
+ if (c == '>') {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_RSHIFT, line, column, 3);
+ input_ungetc(c);
+ return token_create(TOK_OP_RSHIFT, line, column, 2);
+ }
+ if (c == '=')
+ return token_create(TOK_OP_GE, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_GT, line, column, 1);
+ }
+ case '?':
+ return token_create(TOK_OP_COND, line, column, 1);
+ case '[':
+ return token_create(TOK_SEP_LEFT_BRACKET, line, column, 1);
+ case ']':
+ return token_create(TOK_SEP_RIGHT_BRACKET, line, column, 1);
+ case '^': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_BITXOR, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_BIT_XOR, line, column, 1);
+ }
+ case '{':
+ return token_create(TOK_SEP_LEFT_BRACE, line, column, 1);
+ case '|': {
+ c = input_getc();
+ if (c == '|')
+ return token_create(TOK_OP_OR, line, column, 2);
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_BITOR, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_BIT_OR, line, column, 1);
+ }
+ case '}':
+ return token_create(TOK_SEP_RIGHT_BRACE, line, column, 1);
+ case '~':
+ return token_create(TOK_OP_BIT_NOT, line, column, 1);
+ default:
+ input_ungetc(c);
+ return NULL;
+ }
+
+ return NULL;
+}
+---
+
+@s
+The `read_number` function reads a number from the input stream. It can be an integer or a floating-point number.
+
+I've broken it up a bit to make it easier to read.
+
+--- Tokenize Number
static token_t *read_number(void) {
  int c;
  char buf[1024]; // accumulated digit characters, NUL-terminated below
  int i = 0;      // write index into buf
  c = input_getc();
  @{Check for valid prefix}
  int radix = 10; // becomes 8 or 16 after the radix prefix is processed
  @{Process Radix}
  int is_float = 0; // set when a '.' or exponent is seen
  @{Read Number Loop}
  buf[i] = '\0';
  @{Process Suffixes}
  @{Check for conflicting suffixes}
  if (is_float) {
    @{Convert to float}
  } else {
    @{Convert to integer}
  }
  return NULL; // unreachable: both conversion chunks return on every path
}
+---
+
+@s
To determine if a character is a valid prefix for a number, we need to check if it's a digit or a period followed by a digit.
+--- Check for valid prefix
+ // If we don't have a digit or decimal point, it's not a number
+ if (!isdigit(c) && c != '.') {
+ input_ungetc(c);
+ return NULL;
+ }
+ // Decimal point followed by non-digit is a struct member
+ if (c == '.') {
+ char cnext = input_getc();
+ if (!isdigit(cnext)) {
+ input_ungetc(cnext);
+ return token_create(TOK_OP_MEMBER, line, column, 1);
+ }
+ input_ungetc(cnext);
+ }
+---
+
+@s
+A C constant starting with a zero is either an octal or hexadecimal constant. We need to check the next character to determine which one it is.
+--- Process Radix
+ // Check for hex and octal.
+ if (c == '0') {
+ char cnext = input_getc();
+ if (cnext == 'x' || cnext == 'X') {
+ // Hex, discard the 0x
+ radix = 16;
+ } else {
+ // Octal, discard the 0
+ input_ungetc(cnext);
+ radix = 8;
+ }
+ } else {
+ // Decimal, append the first digit
+ buf[i++] = c;
+ }
+---
+
+@s
+
+--- Read Number Loop
  // Main accumulation loop: stops at the first character that cannot be part
  // of the constant (which is pushed back for the next token).
  while ((c = input_getc()) != EOF) {
    // Since there can be multiple writes to the buffer, we want to make sure we
    // don't overflow by giving a 4 byte pad
    if (i > 1020) {
      tok_error("Number too long\n");
      return NULL;
    }
    // Valid digits for the radix: 0-9 for decimal, 0-7 for octal, 0-9 and
    // a-f/A-F for hex
    if ((radix == 10 && isdigit(c)) || (radix == 16 && isxdigit(c)) ||
        (radix == 8 && c >= '0' && c <= '7')) {
      buf[i++] = c;
      // Decimal point and not a float yet, must be a float
    } else if (c == '.' && !is_float) {
      is_float = 1;
      if (radix != 10) {
        tok_error("Invalid floating point constant, expected decimal, got %s\n",
                  radix == 16 ? "hexadecimal" : "octal");
        return NULL;
      }
      buf[i++] = c;
    }
    // Exponent on the end of a constant. (By standard this forces it to be a
    // float)
    // NOTE(review): for radix 16, 'e'/'E' is consumed as a hex digit by the
    // branch above, so this exponent branch only triggers for decimal/octal.
    else if (c == 'e' || c == 'E') {
      buf[i++] = c;
      c = input_getc();
      // Sign on the exponent
      if (c == '+' || c == '-') {
        buf[i++] = c;
        c = input_getc();
      }
      // Exponent must be a digit, I.E no 1e1.2
      if (!isdigit(c)) {
        tok_error("Invalid floating point exponent\n");
        return NULL;
      }
      buf[i++] = c;
      is_float = 1;
    } else {
      // Reached the end, unget the character so other functions can read it
      input_ungetc(c);
      break;
    }
  }
+---
+
+@s
+C constants can have suffixes to indicate their type. We need to check for these suffixes and set the appropriate flags.
+
+--- Process Suffixes
  // Suffix flags: u/U (unsigned), l/L (long), f/F (single-precision float).
  int is_unsigned = 0;
  int is_long = 0;
  int is_single = 0;
  // Consume suffix characters until a non-suffix character appears.
  // Duplicates are warned about and otherwise ignored.
  while (1) {
    c = input_getc();
    if (c == 'u' || c == 'U') {
      if (is_unsigned) {
        tok_warn(
            "Warning: Duplicate suffix 'u' for integer constant ignored\n");
      }
      is_unsigned = 1;
    } else if (c == 'l' || c == 'L') {
      if (is_long) {
        tok_warn(
            "Warning: Duplicate suffix 'l' for integer constant ignored\n");
      }
      is_long = 1;
    } else if (c == 'f' || c == 'F') {
      if (is_single) {
        tok_warn("Warning: Duplicate suffix 'f' for floating point constant "
                 "ignored\n");
      }
      is_single = 1;
    } else {
      // Not a suffix: push it back for the next token.
      input_ungetc(c);
      break;
    }
  }
+---
+
+@s
+If we find conflicting suffixes, we print a warning and ignore the suffixes.
+--- Check for conflicting suffixes
  // Resolve contradictory suffix combinations, preferring the float suffix
  // and dropping whichever flag conflicts with the constant's actual kind.
  if (is_single && is_long) {
    tok_warn("Warning: Invalid suffixes 'l' and 'f' for floating point "
             "constant. Ignoring 'l'\n");
    is_long = 0;
  }
  if (is_single && is_unsigned) {
    tok_warn("Warning: Invalid suffixes 'u' and 'f' for floating point "
             "constant. Ignoring 'u'\n");
    is_unsigned = 0;
  }
  if (is_single && !is_float) {
    tok_warn(
        "Warning: Invalid suffix 'f' for integer constant. Ignoring 'f'\n");
    is_single = 0;
  }
+---
+
+@s
+If the constant is a floating-point number, we convert it to a float. We need to make sure that the number is in range for the given type and check for errors from strtod
+
+--- Convert to float
+ errno = 0;
+ // Strtod generates a unix-style error when it's given something out of
+ // range, so we want to get on top of that quickly instead of ignoring it
+ // That way we can avoid some nasty NAN-propagation in the constant folder.
+ double f = strtod(buf, NULL);
+ if (errno == ERANGE) {
+ tok_error("Floating point constant out of range\n");
+ return NULL;
+ }
+ // Warn if the constant is out of range for a float, I.E it's too big or too
+ // small
+ if (is_single && (f < FLT_MIN || f > FLT_MAX)) {
+ tok_warn(
+ "Warning: Floating point constant %f is out of range for float\n", f);
+ }
+ // Warn if the constant is too precise for a float
+ if (is_single && fabs((double)((float)f) - f) >= FLT_EPSILON) {
+ tok_warn("Warning: Converting double precision floating point constant "
+ "%f to float loses "
+ "precision\n",
+ f);
+ }
+ return token_create_float(is_single ? TOK_CONST_FLOAT_32
+ : TOK_CONST_FLOAT_64,
+ line, column, f, i);
+---
+
+@s
+If the constant is an integer, we convert it to an integer. We need to make sure that the number is in range for the given type and check for errors from strtoll
+
+--- Convert to integer
+ errno = 0;
+ uint64_t int_ = strtoull(buf, NULL, radix);
+ // Same as above, but for integers
+ if (errno == ERANGE) {
+ tok_error("Integer constant out of range\n");
+ return NULL;
+ }
+ if (is_unsigned) {
+ if (is_long) {
+ return token_create_int(TOK_CONST_INTEGER_U64, line, column, int_, i);
+ } else {
+ if (int_ > UINT32_MAX) {
+ tok_warn(
+ "Warning: Integer constant %lld is out of range for unsigned "
+ "int\n",
+ int_);
+ }
+ return token_create_int(TOK_CONST_INTEGER_U32, line, column, int_, i);
+ }
+ } else {
+ if (is_long) {
+ // If the highest bit is set, that means this will overflow a signed
+ // long (Due to two's complement)
+ if (int_ & (1UL << 63)) {
+ tok_warn(
+ "Warning: Integer constant %lld is out of range for long long\n",
+ i);
+ }
+ return token_create_int(TOK_CONST_INTEGER_S64, line, column, int_, i);
+ } else {
+ if (int_ & (1UL << 31)) {
+ tok_warn("Warning: Integer constant %lld is out of range for int\n",
+ int_);
+ }
+ return token_create_int(TOK_CONST_INTEGER_S32, line, column, int_, i);
+ }
+ }
+---
+
+@s
+The `read_char_constant` function reads a character constant from the input stream. It can be a single character or a multi-character escape sequence.
+--- Tokenize Character
/* Read a character constant: 'x' or an escape such as '\n'.
   Returns NULL without consuming input if the next character is not an
   opening quote, and NULL after reporting an error for malformed constants.
   `len` counts consumed source characters for column bookkeeping.
   NOTE(review): read_escape_sequence returns 0 both for '\0' and for an
   invalid escape, so an invalid escape followed by a closing quote silently
   yields a NUL character constant — confirm this is intended. */
static token_t *read_char_constant(void) {
  int c;
  int len = 0;
  c = input_getc();
  if (c != '\'') {
    input_ungetc(c);
    return NULL;
  }
  len++; // opening quote
  c = input_getc();
  if (c == '\'') {
    tok_error("Empty character constant\n");
    return NULL;
  }
  if (c == '\\') {
    // read_escape_sequence advances len by the escape's character count.
    c = read_escape_sequence(&len);
  }
  int val = c;
  c = input_getc();
  if (c != '\'') {
    tok_error("Expected closing quote for character constant\n");
    return NULL;
  }
  len++; // closing quote
  return token_create_char(TOK_CONST_CHAR, line, column, val, len);
}
+---
+
+@s
+The `read_string_literal` function reads a string literal from the input stream.
+
For this function, an automatic-lifetime buffer is used to store the string until it becomes too large. At that point, a heap-allocated buffer is used.
+This way we can avoid unnecessary heap allocations for small strings.
+--- Tokenize String
+@{Read Escape Sequence}
+static token_t *read_string_literal(void) {
+ int c;
+ c = input_getc();
+ if (c != '"') {
+ input_ungetc(c);
+ return NULL;
+ }
+ int i = 0;
+ char s_buf[512];
+ char *buf = s_buf;
+ int len = 512;
+ int esc_pad = 0;
+ while ((c = input_getc()) != EOF) {
+ if (c == '"') {
+ // Implicit skip of closing quote
+ break;
+ }
+ if (c == '\\') {
+ c = read_escape_sequence(&esc_pad);
+ if (c == 0) {
+ return NULL;
+ }
+ }
+ if (i >= len) {
+ if (buf == s_buf) {
+ buf = malloc(1024);
+ if (buf == NULL) {
+ fputs("Out of memory. Could not parse string literal.\n", stderr);
+ exit(1);
+ }
+ memcpy(buf, s_buf, 512);
+ len *= 2;
+ } else {
+ len *= 2;
+ buf = realloc(buf, len);
+ }
+ }
+ buf[i++] = c;
+ }
+ buf[i] = '\0';
+ if (c == EOF) {
+ tok_error("Unterminated string literal\n");
+ if (buf != s_buf) {
+ free(buf);
+ }
+ return NULL;
+ }
+
+ token_t *tok = token_create_string(TOK_CONST_STRING_ASCII, line, column, buf,
+ i + esc_pad + 2);
+ if (buf != s_buf) {
+ free(buf);
+ }
+ return tok;
+}
+---
+
+@s
+Escape sequences in C can either be single characters or octal/hexadecimal values. We need to handle both cases.
+
+--- Read Escape Sequence
/* Decode the body of an escape sequence (the backslash is already consumed).
   Returns the decoded character, or 0 after reporting an invalid sequence
   via tok_error. `*len` is advanced by the number of source characters
   consumed, for column tracking.
   Octal escapes — including plain '\0' — are all handled by the digit loop
   in the default case: the old dedicated `case '0'` shadowed it, breaking
   multi-digit forms like '\012' and undercounting *len by one. */
static char read_escape_sequence(int *len) {
  int c = input_getc();
  *len += 1;
  switch (c) {
  case 'a':
    return '\a';
  case 'b':
    return '\b';
  case 'f':
    return '\f';
  case 'n':
    return '\n';
  case 'r':
    return '\r';
  case 't':
    return '\t';
  case 'v':
    return '\v';
  case '\'':
    return '\'';
  case '"':
    return '"';
  case '?':
    return '?';
  case '\\':
    return '\\';
  case 'x': {
    // Hexadecimal escape: \x followed by one or more hex digits.
    c = input_getc();
    if (!isxdigit(c)) {
      tok_error("Invalid hexadecimal escape sequence\n");
      return 0;
    }
    int val = 0;
    while (isxdigit(c)) {
      *len += 1;
      val = val * 16 + (isdigit(c) ? c - '0' : tolower(c) - 'a' + 10);
      c = input_getc();
    }
    input_ungetc(c);
    return (char)val;
  }
  default:
    // Octal escape: one or more digits (also covers '\0').
    // NOTE(review): isdigit also accepts 8 and 9, which are not valid octal
    // digits — confirm whether stricter validation is wanted here.
    if (!isdigit(c)) {
      tok_error("Invalid escape sequence\n");
      return 0;
    }
    int val = 0;
    while (isdigit(c)) {
      *len += 1;
      val = val * 8 + c - '0';
      c = input_getc();
    }
    input_ungetc(c);
    return (char)val;
  }
}
+---
+
+@s
+Finally, I'll add some code for running the tokenizer as its own program. This way we can test it out.
+--- Run Test
/* Run the C preprocessor (gcc -E) over `in` and return the heap-allocated
   name of the preprocessed output file. Caller owns the returned string and
   is responsible for removing the temporary file. Exits on allocation
   failure or if preprocessing fails (previously the system() result was
   ignored, so the tokenizer would read a missing/stale file). */
char *preprocess(char *in) {
  char *output_name = malloc(1024);
  if (output_name == NULL) {
    fputs("Out of memory\n", stderr);
    exit(1);
  }
  snprintf(output_name, 1024, "%s.preprocessed", in);
  char *command = malloc(2048);
  if (command == NULL) {
    fputs("Out of memory\n", stderr);
    exit(1);
  }
  snprintf(command, 2048, "gcc -E -xc %s -o %s", in, output_name);
  if (system(command) != 0) {
    fprintf(stderr, "Preprocessing failed: %s\n", command);
    free(command);
    free(output_name);
    exit(1);
  }
  free(command);
  return output_name;
}
+
+// Tokenize the input file
int main(int argc, char **argv) {
  if (argc != 2) {
    fprintf(stderr, "Usage: %s <input.c>\n", argv[0]);
    return 1;
  }
  char *input_name = argv[1];
  // Run the C preprocessor first; the tokenizer only sees preprocessed code.
  char *preprocessed = preprocess(input_name);
  init_tokenizer(preprocessed);
  token_t *tok;
  // Drain the token stream, printing each token for inspection.
  while ((tok = next_token()) != NULL) {
    print_token(tok);
    token_destroy(tok);
  }
  destroy_tokenizer();
  // Remove and release the temporary preprocessed file.
  remove(preprocessed);
  free(preprocessed);
  return 0;
}
+---
+
+### Bugs/Errata
+
+I wrote this code in a single sitting, so there are bound to be bugs. I'll list them here as I find them. The code you see here is the final version, with all bugs fixed.
+
+* had `buffer_pos == buffer_size - 1`, left in from trying to plug some code for lookahead in, didn't work out, but I forgot to remove it, causes fallthrough to `buffer_size == 0` check which if true returns EOF, preventing input initialization. Fixed by changing to `buffer_pos == buffer_size`.
+* assertion `token->kind == TOK_CONST_STRING_ASCII` failed in token\_string. Forgot to expand check for identifiers which also use token\_string. Fixed by changing to `token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID || token->kind == TOK_TID`.
+* token\_create\_string - call to `hash_table_get` with freed key. Fixed by moving the call to free after the call to `hash_table_get`.
+* ibid - Design of hash table and call to `hash_table_get` in token\_create\_string created double free. Fixed by rewriting part of function.
+* Tokenizer missing code to handle GCC preprocessor line directives. Fixed by adding code to handle them.
+* Destructor for string literals missing in tokenizer teardown, added it in.
+* read\_number - check `int_ > INT32_MAX` does not work due to some weird casting. Added explicit cast.
+* read\_char\_constant - Forgot to handle '\\0'. Added.
+* skip\_whitespace - When a division operator occurs in code, skip\_whitespace assumes it's a comment. Fixed by adding a check for division operator.
+* hash\_string - Optimization, not a bug, Dragon Book hash function not very fast due to misprediction. Replaced with ELF hash function.
+* read\_identifier - strlen gets called 3 times even though we already get the string len by incrementing an array index. Ew. Used i instead of strlen.
+* read\_identifier - stringized version of keywords stored, not needed. Code added to call token\_create instead of token\_create\_string for keywords.
+* Everywhere - Checks added for memory allocation failure.
* Not a bug. Removed the separate token type for TID. Will try to handle in the parser.
+
+### Conclusion
+
+That's it! The C Minus tokenizer is complete. It's hopefully pretty understandable, and given the testing I've put it through, it should be pretty robust.
+
+Next time, we'll start on the parser.
+
# Source code, bibliography
+
+Source code for the tokenizer is available [here](/projects/cminus/code/tokenizer.c), header file is available [here](/projects/cminus/code/tokenizer.h).
+
+Source code for the input module is available [here](/projects/cminus/code/input.c), header file is available [here](/projects/cminus/code/input.h).
+
+Source code for the hash table is available [here](/projects/cminus/code/hash_table.c), header file is available [here](/projects/cminus/code/hash_table.h).
+
+Source code for the token module is available [here](/projects/cminus/code/token.c), header file is available [here](/projects/cminus/code/token.h).
+
+A lot of the logic for this project is from either the Dragon Book, Engineering a Compiler, or LCC: A Retargetable Compiler for ANSI C. Grammars are from The C Reference Manual.
+
+I got the idea for using zero-width arrays for optional content (the struct hack) from hacking on some weird virtio drivers (they seem to love it).
+
+Crafting Interpreters by Bob Nystrom inspired me to do this project, so if you see any similarities, there's probably some unintentional influence there.
+
+The code for the ELF hash function is from the glibc source code.
+
+The idea for decision trees came from LCC.
+
+Literate programming rendered using [literate](https://github.com/zyedidia/Literate).
+
+ <footer style=" text-align: center; padding: 20px;">
+ <p>© 2024 Reagan Fischer. If for some reason you want to use my AMAZING code (lol), it's available under the MIT
+ license <a href="/projects/cminus/code/LICENSE.md">here</a>.</p>
+ </footer>+
\ No newline at end of file
diff --git a/projects/cminus/code/makefile b/projects/cminus/code/makefile
@@ -0,0 +1,10 @@
+all:
+ lit -c lexer.lit
+ lit lexer.lit
+ mv lexer.html ..
+
+clean:
+ rm -f *.h
+ rm -f *.c
+ rm -f *.html
+ rm -f ../lexer.html+
\ No newline at end of file
diff --git a/projects/cminus/code/token.c b/projects/cminus/code/token.c
@@ -0,0 +1,477 @@
+/* token.c */
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include "token.h"
+#include "hash_table.h"
+/* Token Data Structure */
+#define TOK_MAGIC_1 0x544F4B454E544F4Bul // "TOKENTOK"
+#define TOK_MAGIC_2 0x544F4B544F4B454Eul // "TOKTOKEN"
+
+struct token {
+ long magic;
+ int line;
+ int column;
+ short kind;
+ long opt_data[0];
+};
+
+typedef struct token token_t;
+
+struct token_data {
+ union {
+ int64_t i;
+ double f;
+ const char *s;
+ char c;
+ } data;
+};
+
+typedef struct token_data token_data_t;
+int column = 1;
+int line = 1;
+
+/* Token Data Access */
+#define token_data(token) ((struct token_data *)((token)->opt_data))
+
/* Return the token's kind after validating its magic header.
   Accepts both bare tokens (MAGIC_2) and payload-carrying ones (MAGIC_1). */
c_token_types token_type(token_t *token) {
  assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
  return token->kind;
}
+
/* Return the integer payload of an integer-constant token.
   Only valid for tokens built by token_create_int (MAGIC_1 layout). */
int64_t token_int(token_t *token) {
  assert(token->kind == TOK_CONST_INTEGER_U32 ||
         token->kind == TOK_CONST_INTEGER_U64 ||
         token->kind == TOK_CONST_INTEGER_S32 ||
         token->kind == TOK_CONST_INTEGER_S64);
  assert(token->magic == TOK_MAGIC_1);
  return token_data(token)->data.i;
}
+
/* Return the floating-point payload of a float-constant token. */
double token_float(token_t *token) {
  assert(token->kind == TOK_CONST_FLOAT_32 ||
         token->kind == TOK_CONST_FLOAT_64);
  assert(token->magic == TOK_MAGIC_1);
  return token_data(token)->data.f;
}
+
/* Return the string payload of a string-literal or identifier token.
   The pointer aliases the interned copy owned by string_table; callers
   must not free it. */
const char *token_string(token_t *token) {
  assert(token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID);
  assert(token->magic == TOK_MAGIC_1);
  return token_data(token)->data.s;
}
+
/* Return the character payload of a character-constant token. */
char token_char(token_t *token) {
  assert(token->kind == TOK_CONST_CHAR);
  assert(token->magic == TOK_MAGIC_1);
  return token_data(token)->data.c;
}
+
/* Return the 1-based source line the token started on. */
int token_line(token_t *token) {
  assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
  return token->line;
}
+
/* Return the 1-based source column the token started on. */
int token_column(token_t *token) {
  assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
  return token->column;
}
+
+/* Token Creation and Destruction */
/* Allocate a token with room for a trailing token_data payload (MAGIC_1
   layout, via the struct-hack opt_data array). Side effect: advances the
   global `column` by `len`, the token's width in source characters. The
   payload union is left uninitialized; callers fill it via token_data().
   Aborts on allocation failure, matching the module's OOM policy. */
token_t *token_data_create(c_token_types kind, int lin, int col, int len) {
  token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data));
  if (token == NULL) {
    fputs("Out of memory\n", stderr);
    exit(1);
  }
  token->magic = TOK_MAGIC_1;
  token->line = lin;
  token->column = col;
  column += len;
  token->kind = kind;
  return token;
}
+
+token_t *token_create(c_token_types kind, int lin, int col, int len) {
+ token_t *token = malloc(sizeof(token_t));
+ if (token == NULL) {
+ fputs("Out of memory\n", stderr);
+ exit(1);
+ }
+ token->magic = TOK_MAGIC_2;
+ token->line = lin;
+ token->column = col;
+ column += len;
+ token->kind = kind;
+ return token;
+}
+
/* Create an integer-constant token carrying value `i`. */
token_t *token_create_int(c_token_types kind, int lin, int col, int64_t i, int len) {
  token_t *token = token_data_create(kind, lin, col, len);
  token_data(token)->data.i = i;
  return token;
}
+
/* Create a floating-point-constant token carrying value `f`. */
token_t *token_create_float(c_token_types kind, int lin, int col, double f, int len) {
  token_t *token = token_data_create(kind, lin, col, len);
  token_data(token)->data.f = f;
  return token;
}
+
/* Create a character-constant token carrying value `c`. */
token_t *token_create_char(c_token_types kind, int lin, int col, char c, int len) {
  token_t *token = token_data_create(kind, lin, col, len);
  token_data(token)->data.c = c;
  return token;
}
+
+void token_destroy(token_t *token) {
+ if (token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2) {
+ if (token->kind == TOK_CONST_STRING_ASCII) {
+ free((char *)token_data(token)->data.s);
+ }
+ free(token);
+ } else {
+ fputs("Corrupt token\n", stderr);
+ exit(1);
+ }
+}
+
+/* Token Create String */
+/* String Comparison */
/* Key comparator for the string table; delegates to strcmp semantics. */
static int cmp_string(void *key1, void *key2) {
  const char *a = key1;
  const char *b = key2;
  return strcmp(a, b);
}
+
+/* Hash Function */
/* ELF-style string hash (from glibc). The first five characters are folded
   in without overflow mixing — five 4-bit shifts of byte values cannot reach
   the top nibble — so short strings return the raw fold, unmasked, exactly
   as before; longer strings mix the top nibble per character and mask the
   result to 28 bits. */
static unsigned int hash_string(void *key) {
  char *p = key;
  unsigned long hash = *p;
  if (hash == 0)
    return 0;
  int k = 1;
  while (k <= 4 && p[k] != 0) {
    hash = (hash << 4) + p[k];
    k++;
  }
  /* Fewer than five characters: no mixing or masking needed. */
  if (k <= 4)
    return hash;
  for (p += 5; *p != 0; p++) {
    hash = (hash << 4) + *p;
    unsigned long hi = hash & 0xf0000000l;
    hash ^= hi >> 24;
  }
  return hash & 0x0fffffffl;
}
+
+/* String Destructor */
/* Hash-table destructor for interned strings. Key and value point to the
   same allocation (see token_create_string), so only the key pass frees. */
static void dtor_string(void *value, int is_key) {
  if (is_key) {
    free(value); // Since the key and value are the same, we only need to free once.
  }
}
+
+hash_table_t *string_table;
+token_t *token_create_string(c_token_types kind, int lin, int col,
+ const char *s, int len) {
+ if (string_table == NULL) {
+ string_table = hash_table_create(2048, cmp_string, hash_string, dtor_string);
+ }
+ token_t *token = token_data_create(kind, lin, col, len);
+ char *key = hash_table_get(string_table, (void *)s);
+ if (key == NULL) {
+ key = strdup(s);
+ hash_table_put(string_table, key, key);
+ }
+ token_data(token)->data.s = key;
+ return token;
+}
+
+/* Token Debugging */
/* Map a token kind to its enumerator name for debug output.
   Purely mechanical: one case per c_token_types value, in enum order.
   Returns "UNKNOWN" for any value outside the enum. */
const char *token_name_from_type(c_token_types type) {
  switch (type) {
  // Control keywords
  case TOK_CTK_IF:
    return "TOK_CTK_IF";
  case TOK_CTK_ELSE:
    return "TOK_CTK_ELSE";
  case TOK_CTK_SWITCH:
    return "TOK_CTK_SWITCH";
  case TOK_CTK_CASE:
    return "TOK_CTK_CASE";
  case TOK_CTK_DEFAULT:
    return "TOK_CTK_DEFAULT";
  case TOK_CTK_WHILE:
    return "TOK_CTK_WHILE";
  case TOK_CTK_DO:
    return "TOK_CTK_DO";
  case TOK_CTK_FOR:
    return "TOK_CTK_FOR";
  case TOK_CTK_CONTINUE:
    return "TOK_CTK_CONTINUE";
  case TOK_CTK_BREAK:
    return "TOK_CTK_BREAK";
  case TOK_CTK_RETURN:
    return "TOK_CTK_RETURN";
  case TOK_CTK_GOTO:
    return "TOK_CTK_GOTO";
  // Type keywords
  case TOK_TK_VOID:
    return "TOK_TK_VOID";
  case TOK_TK_CHAR:
    return "TOK_TK_CHAR";
  case TOK_TK_SHORT:
    return "TOK_TK_SHORT";
  case TOK_TK_INT:
    return "TOK_TK_INT";
  case TOK_TK_LONG:
    return "TOK_TK_LONG";
  case TOK_TK_FLOAT:
    return "TOK_TK_FLOAT";
  case TOK_TK_DOUBLE:
    return "TOK_TK_DOUBLE";
  case TOK_TK_SIGNED:
    return "TOK_TK_SIGNED";
  case TOK_TK_UNSIGNED:
    return "TOK_TK_UNSIGNED";
  case TOK_TK_STRUCT:
    return "TOK_TK_STRUCT";
  case TOK_TK_UNION:
    return "TOK_TK_UNION";
  case TOK_TK_ENUM:
    return "TOK_TK_ENUM";
  case TOK_TK_TYPEDEF:
    return "TOK_TK_TYPEDEF";
  // Storage class/specifier keywords
  case TOK_SCSK_AUTO:
    return "TOK_SCSK_AUTO";
  case TOK_SCSK_REGISTER:
    return "TOK_SCSK_REGISTER";
  case TOK_SCSK_STATIC:
    return "TOK_SCSK_STATIC";
  case TOK_SCSK_EXTERN:
    return "TOK_SCSK_EXTERN";
  case TOK_SCSK_CONST:
    return "TOK_SCSK_CONST";
  case TOK_SCSK_VOLATILE:
    return "TOK_SCSK_VOLATILE";
  case TOK_MK_SIZEOF:
    return "TOK_MK_SIZEOF";
  // Operators
  case TOK_OP_ADD:
    return "TOK_OP_ADD";
  case TOK_OP_SUB:
    return "TOK_OP_SUB";
  case TOK_OP_MUL:
    return "TOK_OP_MUL";
  case TOK_OP_DIV:
    return "TOK_OP_DIV";
  case TOK_OP_MOD:
    return "TOK_OP_MOD";
  case TOK_OP_BIT_AND:
    return "TOK_OP_BIT_AND";
  case TOK_OP_BIT_OR:
    return "TOK_OP_BIT_OR";
  case TOK_OP_BIT_XOR:
    return "TOK_OP_BIT_XOR";
  case TOK_OP_BIT_NOT:
    return "TOK_OP_BIT_NOT";
  case TOK_OP_LSHIFT:
    return "TOK_OP_LSHIFT";
  case TOK_OP_RSHIFT:
    return "TOK_OP_RSHIFT";
  case TOK_OP_NOT:
    return "TOK_OP_NOT";
  case TOK_OP_ASSIGN:
    return "TOK_OP_ASSIGN";
  case TOK_OP_LT:
    return "TOK_OP_LT";
  case TOK_OP_GT:
    return "TOK_OP_GT";
  case TOK_OP_INC:
    return "TOK_OP_INC";
  case TOK_OP_DEC:
    return "TOK_OP_DEC";
  case TOK_OP_EQ:
    return "TOK_OP_EQ";
  case TOK_OP_NE:
    return "TOK_OP_NE";
  case TOK_OP_LE:
    return "TOK_OP_LE";
  case TOK_OP_GE:
    return "TOK_OP_GE";
  case TOK_OP_AND:
    return "TOK_OP_AND";
  case TOK_OP_OR:
    return "TOK_OP_OR";
  case TOK_OP_MEMBER_POINTER:
    return "TOK_OP_MEMBER_POINTER";
  case TOK_OP_MEMBER:
    return "TOK_OP_MEMBER";
  case TOK_OP_COND_DECISION:
    return "TOK_OP_COND_DECISION";
  case TOK_OP_COND:
    return "TOK_OP_COND";
  case TOK_OP_ASSIGN_ADD:
    return "TOK_OP_ASSIGN_ADD";
  case TOK_OP_ASSIGN_SUB:
    return "TOK_OP_ASSIGN_SUB";
  case TOK_OP_ASSIGN_MUL:
    return "TOK_OP_ASSIGN_MUL";
  case TOK_OP_ASSIGN_DIV:
    return "TOK_OP_ASSIGN_DIV";
  case TOK_OP_ASSIGN_MOD:
    return "TOK_OP_ASSIGN_MOD";
  case TOK_OP_ASSIGN_BITAND:
    return "TOK_OP_ASSIGN_BITAND";
  case TOK_OP_ASSIGN_BITOR:
    return "TOK_OP_ASSIGN_BITOR";
  case TOK_OP_ASSIGN_BITXOR:
    return "TOK_OP_ASSIGN_BITXOR";
  case TOK_OP_ASSIGN_LSHIFT:
    return "TOK_OP_ASSIGN_LSHIFT";
  case TOK_OP_ASSIGN_RSHIFT:
    return "TOK_OP_ASSIGN_RSHIFT";
  case TOK_SEP_HASH:
    return "TOK_SEP_HASH";
  // Identifiers and constants
  case TOK_ID:
    return "TOK_ID";
  case TOK_CONST_INTEGER_U32:
    return "TOK_CONST_INTEGER_U32";
  case TOK_CONST_INTEGER_U64:
    return "TOK_CONST_INTEGER_U64";
  case TOK_CONST_INTEGER_S32:
    return "TOK_CONST_INTEGER_S32";
  case TOK_CONST_INTEGER_S64:
    return "TOK_CONST_INTEGER_S64";
  case TOK_CONST_FLOAT_32:
    return "TOK_CONST_FLOAT_32";
  case TOK_CONST_FLOAT_64:
    return "TOK_CONST_FLOAT_64";
  case TOK_CONST_CHAR:
    return "TOK_CONST_CHAR";
  case TOK_CONST_STRING_ASCII:
    return "TOK_CONST_STRING_ASCII";
  // Special markers
  case TOK_SPECIAL_EOF:
    return "TOK_SPECIAL_EOF";
  case TOK_SPECIAL_ERROR:
    return "TOK_SPECIAL_ERROR";
  // Separators
  case TOK_SEP_LEFT_PAREN:
    return "TOK_SEP_LEFT_PAREN";
  case TOK_SEP_RIGHT_PAREN:
    return "TOK_SEP_RIGHT_PAREN";
  case TOK_SEP_LEFT_BRACKET:
    return "TOK_SEP_LEFT_BRACKET";
  case TOK_SEP_RIGHT_BRACKET:
    return "TOK_SEP_RIGHT_BRACKET";
  case TOK_SEP_LEFT_BRACE:
    return "TOK_SEP_LEFT_BRACE";
  case TOK_SEP_RIGHT_BRACE:
    return "TOK_SEP_RIGHT_BRACE";
  case TOK_SEP_COMMA:
    return "TOK_SEP_COMMA";
  case TOK_SEP_SEMICOLON:
    return "TOK_SEP_SEMICOLON";
  case TOK_SEP_DOT:
    return "TOK_SEP_DOT";
  case TOK_SEP_ELLIPSIS:
    return "TOK_SEP_ELLIPSIS";
  }
  return "UNKNOWN";
}
+
/* Return the single-letter escape code for ch, or 0 if ch needs none. */
static char escape_code(char ch) {
  switch (ch) {
  case '\a':
    return 'a';
  case '\b':
    return 'b';
  case '\f':
    return 'f';
  case '\n':
    return 'n';
  case '\r':
    return 'r';
  case '\t':
    return 't';
  case '\v':
    return 'v';
  case '\\':
    return '\\';
  case '\'':
    return '\'';
  case '"':
    return '"';
  default:
    return 0;
  }
}

/* Produce a heap-allocated copy of `str` with control characters, quotes
   and backslashes rewritten as C escape sequences. Caller frees the
   result. Aborts on allocation failure. */
char *re_escape_string(const char *str) {
  size_t len = strlen(str);
  /* Worst case every character escapes to two bytes, plus the NUL. */
  char *out = malloc(len * 2 + 1);
  if (out == NULL) {
    fprintf(stderr, "Out of memory. Cannot escape string\n");
    exit(1);
  }
  char *w = out;
  for (size_t j = 0; j < len; j++) {
    char code = escape_code(str[j]);
    if (code != 0) {
      *w++ = '\\';
      *w++ = code;
    } else {
      *w++ = str[j];
    }
  }
  *w = '\0';
  return out;
}
+
+void print_token(token_t *tok) {
+ if (tok == NULL) {
+ printf("NULL\n");
+ return;
+ }
+ const char *name = token_name_from_type(tok->kind);
+ switch (tok->kind) {
+ case TOK_ID:
+ case TOK_CONST_STRING_ASCII: {
+ char *escaped = re_escape_string(token_string(tok));
+ printf("%s: \"%s\"@%d:%d\n", name, escaped, tok->line, tok->column);
+ free(escaped);
+ break;
+ }
+ case TOK_CONST_CHAR:
+ printf("%s: '%c'@%d:%d\n", name, token_char(tok), tok->line, tok->column);
+ break;
+ case TOK_CONST_INTEGER_S32:
+ case TOK_CONST_INTEGER_U32:
+ case TOK_CONST_INTEGER_S64:
+ case TOK_CONST_INTEGER_U64:
+ printf("%s: %ld@%d:%d\n", name, token_int(tok), tok->line, tok->column);
+ break;
+ case TOK_CONST_FLOAT_32:
+ case TOK_CONST_FLOAT_64:
+ printf("%s: %f@%d:%d\n", name, token_float(tok), tok->line, tok->column);
+ break;
+ default:
+ printf("%s@%d:%d\n", name, tok->line, tok->column);
+ break;
+ }
+}
+
+
diff --git a/projects/cminus/code/token.h b/projects/cminus/code/token.h
@@ -0,0 +1,157 @@
+/* token.h */
+#ifndef TOKEN_H
+#define TOKEN_H
+#include <stdint.h> // We use this for int64_t
+#include "hash_table.h" // We need this for the string table
+/* Token Types */
+typedef enum {
+ // Control Keywords
+ TOK_CTK_IF,
+ TOK_CTK_ELSE,
+ TOK_CTK_SWITCH,
+ TOK_CTK_CASE,
+ TOK_CTK_DEFAULT,
+ TOK_CTK_WHILE,
+ TOK_CTK_DO,
+ TOK_CTK_FOR,
+ TOK_CTK_CONTINUE,
+ TOK_CTK_BREAK,
+ TOK_CTK_RETURN,
+ TOK_CTK_GOTO,
+
+ // Type Keywords
+ TOK_TK_VOID,
+ TOK_TK_CHAR,
+ TOK_TK_SHORT,
+ TOK_TK_INT,
+ TOK_TK_LONG,
+ TOK_TK_FLOAT,
+ TOK_TK_DOUBLE,
+ TOK_TK_SIGNED,
+ TOK_TK_UNSIGNED,
+ TOK_TK_STRUCT,
+ TOK_TK_UNION,
+ TOK_TK_ENUM,
+ TOK_TK_TYPEDEF,
+
+ // Storage Class/Specifier Keywords
+ TOK_SCSK_AUTO,
+ TOK_SCSK_REGISTER,
+ TOK_SCSK_STATIC,
+ TOK_SCSK_EXTERN,
+ TOK_SCSK_CONST,
+ TOK_SCSK_VOLATILE,
+
+ // Misc Keywords
+ TOK_MK_SIZEOF,
+
+ // Operators
+ TOK_OP_ADD, // +
+ TOK_OP_SUB, // -
+ TOK_OP_MUL, // *
+ TOK_OP_DIV, // /
+ TOK_OP_MOD, // %
+ TOK_OP_BIT_AND, // &
+ TOK_OP_BIT_OR, // |
+ TOK_OP_BIT_XOR, // ^
+ TOK_OP_BIT_NOT, // ~
+ TOK_OP_LSHIFT, // <<
+ TOK_OP_RSHIFT, // >>
+ TOK_OP_NOT, // !
+ TOK_OP_ASSIGN, // =
+ TOK_OP_LT, // <
+ TOK_OP_GT, // >
+ TOK_OP_INC, // ++
+ TOK_OP_DEC, // --
+ TOK_OP_EQ, // ==
+ TOK_OP_NE, // !=
+ TOK_OP_LE, // <=
+ TOK_OP_GE, // >=
+ TOK_OP_AND, // &&
+ TOK_OP_OR, // ||
+ TOK_OP_MEMBER_POINTER, // ->
+ TOK_OP_MEMBER, // .
+ TOK_OP_COND_DECISION, // :
+ TOK_OP_COND, // ?
+ TOK_OP_ASSIGN_ADD, // +=
+ TOK_OP_ASSIGN_SUB, // -=
+ TOK_OP_ASSIGN_MUL, // *=
+ TOK_OP_ASSIGN_DIV, // /=
+ TOK_OP_ASSIGN_MOD, // %=
+ TOK_OP_ASSIGN_BITAND, // &=
+ TOK_OP_ASSIGN_BITOR, // |=
+ TOK_OP_ASSIGN_BITXOR, // ^=
+ TOK_OP_ASSIGN_LSHIFT, // <<=
+ TOK_OP_ASSIGN_RSHIFT, // >>=
+
+ // Separators
+ TOK_SEP_LEFT_PAREN, // (
+ TOK_SEP_RIGHT_PAREN, // )
+ TOK_SEP_LEFT_BRACKET, // [
+ TOK_SEP_RIGHT_BRACKET, // ]
+ TOK_SEP_LEFT_BRACE, // {
+ TOK_SEP_RIGHT_BRACE, // }
+ TOK_SEP_COMMA, // ,
+ TOK_SEP_SEMICOLON, // ;
+ TOK_SEP_DOT, // .
+ TOK_SEP_ELLIPSIS, // ...
+ TOK_SEP_HASH, // #
+
+ // Identifiers
+ TOK_ID,
+
+ // Constants
+ TOK_CONST_INTEGER_U32, // u
+ TOK_CONST_INTEGER_U64, // ul
+ TOK_CONST_INTEGER_S32, // (no suffix)
+ TOK_CONST_INTEGER_S64, // l
+ TOK_CONST_FLOAT_32, // f
+ TOK_CONST_FLOAT_64, // (no suffix)
+ TOK_CONST_CHAR, // 'c'
+ TOK_CONST_STRING_ASCII, // "string" (width of 8 bits)
+
+ // Special
+ TOK_SPECIAL_EOF,
+ TOK_SPECIAL_ERROR,
+} c_token_types;
+
+/* Opaque Token Type */
+typedef struct token token_t;
+
+/* Token Creation and Destruction Interface */
+token_t *token_data_create(c_token_types kind, int lin, int col, int len);
+
+token_t *token_create(c_token_types kind, int lin, int col, int len);
+
+token_t *token_create_int(c_token_types kind, int lin, int col, int64_t i, int len);
+
+token_t *token_create_float(c_token_types kind, int lin, int col, double f, int len);
+
+token_t *token_create_char(c_token_types kind, int lin, int col, char c, int len);
+
+token_t *token_create_string(c_token_types kind, int lin, int col, const char *s, int len);
+
+void token_destroy(token_t *token);
+
+/* Token Interface */
+c_token_types token_type(token_t *token);
+
+int64_t token_int(token_t *token);
+
+double token_float(token_t *token);
+
+const char *token_string(token_t *token);
+
+char token_char(token_t *token);
+
+int token_line(token_t *token);
+
+int token_column(token_t *token);
+
+void print_token(token_t *tok);
+
+extern hash_table_t *string_table;
+extern int column;
+extern int line;
+#endif
+
diff --git a/projects/cminus/code/tokenizer.c b/projects/cminus/code/tokenizer.c
@@ -1,6 +1,4 @@
-#include "tokenizer.h"
-#include "hash_table.h"
-#include "input.h"
+/* tokenizer.c */
#include <assert.h>
#include <ctype.h>
#include <errno.h>
@@ -11,203 +9,247 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-// Token data
-#define TOK_MAGIC_1 0x544F4B454E544F4Bul // "TOKENTOK"
-#define TOK_MAGIC_2 0x544F4B544F4B454Eul // "TOKTOKEN"
-struct token {
- long magic;
- int line;
- int column;
- short kind;
- long opt_data[0];
-};
-
-typedef struct token token_t;
-
-struct token_data {
- union {
- long long i;
- double f;
- const char *s;
- char c;
- } data;
-};
-
-typedef struct token_data token_data_t;
-
-// Token creation
-int line = 1;
-int column = 1;
-char file_name[1024];
-#define token_data(token) ((struct token_data *)((token)->opt_data))
-static token_t *token_data_create(c_token_types kind, int lin, int col,
- int len) {
- token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data));
- if (token == NULL) {
- fputs("Out of memory\n", stderr);
- exit(1);
- }
- token->magic = TOK_MAGIC_1;
- token->line = lin;
- token->column = col;
- column += len;
- token->kind = kind;
- return token;
-}
-
-static token_t *token_create(c_token_types kind, int lin, int col, int len) {
- token_t *token = malloc(sizeof(token_t));
- if (token == NULL) {
- fputs("Out of memory\n", stderr);
- exit(1);
- }
- token->magic = TOK_MAGIC_2;
- token->line = lin;
- token->column = col;
- column += len;
- token->kind = kind;
- return token;
-}
-
-static token_t *token_create_int(c_token_types kind, int lin, int col,
- int64_t i, int len) {
- token_t *token = token_data_create(kind, lin, col, len);
- token_data(token)->data.i = i;
- return token;
+#include "tokenizer.h"
+#include "token.h"
+#include "input.h"
+token_t *left_stack[8];
+int left_stack_pos = 0;
+/* Utility Functions */
+void init_tokenizer(const char *filename) {
+ input_init(filename);
}
-static token_t *token_create_float(c_token_types kind, int lin, int col,
- double f, int len) {
- token_t *token = token_data_create(kind, lin, col, len);
- token_data(token)->data.f = f;
- return token;
+void destroy_tokenizer(void) {
+ input_destroy();
}
-static token_t *token_create_char(c_token_types kind, int lin, int col, char c,
- int len) {
- token_t *token = token_data_create(kind, lin, col, len);
- token_data(token)->data.c = c;
- return token;
+void reject_token(token_t *token) {
+ left_stack[left_stack_pos++] = token;
}
-static unsigned int hash_string(void *key) {
- unsigned long hash = 0, hi = 0;
- char *p = key;
- hash = *p;
- if (hash != 0 && p[1] != 0) {
- hash = (hash << 4) + p[1];
- if (p[2] != 0) {
- hash = (hash << 4) + p[2];
- if (p[3] != 0) {
- hash = (hash << 4) + p[3];
- if (p[4] != 0) {
- hash = (hash << 4) + p[4];
- p += 5;
- while (*p != 0) {
- hash = (hash << 4) + *p++;
- hi = hash & 0xf0000000l;
- hash ^= hi >> 24;
- }
- hash &= 0x0fffffffl;
- }
- }
- }
+token_t *peek_token(void) {
+ if (left_stack_pos > 0) {
+ return left_stack[left_stack_pos - 1];
}
- return hash;
-}
-
-static int cmp_string(void *key1, void *key2) {
- return strcmp((char *)key1, (char *)key2);
+ token_t *token = next_token();
+ reject_token(token);
+ return token;
}
-static hash_table_t *string_table;
-static token_t *token_create_string(c_token_types kind, int lin, int col,
- const char *s, int len) {
- if (string_table == NULL) {
- string_table = hash_table_create(2048, cmp_string, hash_string);
- }
- token_t *token = token_data_create(kind, lin, col, len);
- char *key = hash_table_get(string_table, (void *)s);
- if (key == NULL) {
- key = strdup(s);
- hash_table_put(string_table, key, key, 1);
+/* Stringify Type */
+const char *stringify_type(c_token_types type) {
+ switch (type) {
+ case TOK_CTK_IF:
+ return "if";
+ case TOK_CTK_ELSE:
+ return "else";
+ case TOK_CTK_SWITCH:
+ return "switch";
+ case TOK_CTK_CASE:
+ return "case";
+ case TOK_CTK_DEFAULT:
+ return "default";
+ case TOK_CTK_WHILE:
+ return "while";
+ case TOK_CTK_DO:
+ return "do";
+ case TOK_CTK_FOR:
+ return "for";
+ case TOK_CTK_CONTINUE:
+ return "continue";
+ case TOK_CTK_BREAK:
+ return "break";
+ case TOK_CTK_RETURN:
+ return "return";
+ case TOK_CTK_GOTO:
+ return "goto";
+ case TOK_TK_VOID:
+ return "void";
+ case TOK_TK_CHAR:
+ return "char";
+ case TOK_TK_SHORT:
+ return "short";
+ case TOK_TK_INT:
+ return "int";
+ case TOK_TK_LONG:
+ return "long";
+ case TOK_TK_FLOAT:
+ return "float";
+ case TOK_TK_DOUBLE:
+ return "double";
+ case TOK_TK_SIGNED:
+ return "signed";
+ case TOK_TK_UNSIGNED:
+ return "unsigned";
+ case TOK_TK_STRUCT:
+ return "struct";
+ case TOK_TK_UNION:
+ return "union";
+ case TOK_TK_ENUM:
+ return "enum";
+ case TOK_TK_TYPEDEF:
+ return "typedef";
+ case TOK_SCSK_AUTO:
+ return "auto";
+ case TOK_SCSK_REGISTER:
+ return "register";
+ case TOK_SCSK_STATIC:
+ return "static";
+ case TOK_SCSK_EXTERN:
+ return "extern";
+ case TOK_SCSK_CONST:
+ return "const";
+ case TOK_SCSK_VOLATILE:
+ return "volatile";
+ case TOK_MK_SIZEOF:
+ return "sizeof";
+ case TOK_OP_ADD:
+ return "+";
+ case TOK_OP_SUB:
+ return "-";
+ case TOK_OP_MUL:
+ return "*";
+ case TOK_OP_DIV:
+ return "/";
+ case TOK_OP_MOD:
+ return "%";
+ case TOK_OP_BIT_AND:
+ return "&";
+ case TOK_OP_BIT_OR:
+ return "|";
+ case TOK_OP_BIT_XOR:
+ return "^";
+ case TOK_OP_BIT_NOT:
+ return "~";
+ case TOK_OP_LSHIFT:
+ return "<<";
+ case TOK_OP_RSHIFT:
+ return ">>";
+ case TOK_OP_NOT:
+ return "!";
+ case TOK_OP_ASSIGN:
+ return "=";
+ case TOK_OP_LT:
+ return "<";
+ case TOK_OP_GT:
+ return ">";
+ case TOK_OP_INC:
+ return "++";
+ case TOK_OP_DEC:
+ return "--";
+ case TOK_OP_EQ:
+ return "==";
+ case TOK_OP_NE:
+ return "!=";
+ case TOK_OP_LE:
+ return "<=";
+ case TOK_OP_GE:
+ return ">=";
+ case TOK_OP_AND:
+ return "&&";
+ case TOK_OP_OR:
+ return "||";
+ case TOK_OP_MEMBER_POINTER:
+ return "->";
+ case TOK_OP_MEMBER:
+ return ".";
+ case TOK_OP_COND_DECISION:
+ return ":";
+ case TOK_OP_COND:
+ return "?";
+ case TOK_OP_ASSIGN_ADD:
+ return "+=";
+ case TOK_OP_ASSIGN_SUB:
+ return "-=";
+ case TOK_OP_ASSIGN_MUL:
+ return "*=";
+ case TOK_OP_ASSIGN_DIV:
+ return "/=";
+ case TOK_OP_ASSIGN_MOD:
+ return "%=";
+ case TOK_OP_ASSIGN_BITAND:
+ return "&=";
+ case TOK_OP_ASSIGN_BITOR:
+ return "|=";
+ case TOK_OP_ASSIGN_BITXOR:
+ return "^=";
+ case TOK_OP_ASSIGN_LSHIFT:
+ return "<<=";
+ case TOK_OP_ASSIGN_RSHIFT:
+ return ">>=";
+ case TOK_SEP_HASH:
+ return "#";
+ case TOK_ID:
+ return "identifier";
+ case TOK_CONST_INTEGER_U32:
+ case TOK_CONST_INTEGER_U64:
+ case TOK_CONST_INTEGER_S32:
+ case TOK_CONST_INTEGER_S64:
+ return "integer constant";
+ case TOK_CONST_FLOAT_32:
+ case TOK_CONST_FLOAT_64:
+ return "floating constant";
+ case TOK_CONST_CHAR:
+ return "character constant";
+ case TOK_CONST_STRING_ASCII:
+ return "string constant";
+ case TOK_SPECIAL_EOF:
+ return "EOF";
+ case TOK_SPECIAL_ERROR:
+ return "error";
+ case TOK_SEP_LEFT_PAREN:
+ return "(";
+ case TOK_SEP_RIGHT_PAREN:
+ return ")";
+ case TOK_SEP_LEFT_BRACKET:
+ return "[";
+ case TOK_SEP_RIGHT_BRACKET:
+ return "]";
+ case TOK_SEP_LEFT_BRACE:
+ return "{";
+ case TOK_SEP_RIGHT_BRACE:
+ return "}";
+ case TOK_SEP_COMMA:
+ return ",";
+ case TOK_SEP_SEMICOLON:
+ return ";";
+ case TOK_SEP_DOT:
+ return ".";
+ case TOK_SEP_ELLIPSIS:
+ return "...";
}
- token_data(token)->data.s = key;
- return token;
-}
-
-// External token operations
-c_token_types token_type(token_t *token) {
- assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
- return token->kind;
-}
-
-int64_t token_int(token_t *token) {
- assert(token->kind == TOK_CONST_INTEGER_U32 ||
- token->kind == TOK_CONST_INTEGER_U64 ||
- token->kind == TOK_CONST_INTEGER_S32 ||
- token->kind == TOK_CONST_INTEGER_S64);
- assert(token->magic == TOK_MAGIC_1);
- return token_data(token)->data.i;
-}
-
-double token_float(token_t *token) {
- assert(token->kind == TOK_CONST_FLOAT_32 ||
- token->kind == TOK_CONST_FLOAT_64);
- assert(token->magic == TOK_MAGIC_1);
- return token_data(token)->data.f;
-}
-
-const char *token_string(token_t *token) {
- assert(token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID);
- assert(token->magic == TOK_MAGIC_1);
- return token_data(token)->data.s;
-}
-
-char token_char(token_t *token) {
- assert(token->kind == TOK_CONST_CHAR);
- assert(token->magic == TOK_MAGIC_1);
- return token_data(token)->data.c;
-}
-
-int token_line(token_t *token) {
- assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
- return token->line;
-}
-
-int token_column(token_t *token) {
- assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
- return token->column;
-}
-
-void token_destroy(token_t *token) {
- assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
- // Don't free the string table, it's a global variable
- free(token);
+ return "UNKNOWN";
}
-token_t *unget_token = NULL;
-void tokenizer_init(const char *filename) { input_init(filename); }
-
-void tokenizer_unget(token_t *token) {
- assert(unget_token == NULL);
- unget_token = token;
-}
-void string_table_dtor(void *value, int is_key) {
- if (is_key) {
- free(value);
+void consume(c_token_types kind) {
+ token_t *token = next_token();
+ if (token_type(token) != kind) {
+ fprintf(stderr, "Error: Expected token of type \"%s\", got \"%s\"\n", stringify_type(kind), stringify_type(token_type(token)));
+ exit(1);
}
+ token_destroy(token);
}
-void tokenizer_destroy(void) {
- input_destroy();
- if (string_table != NULL) {
- hash_table_destroy(string_table, string_table_dtor);
+void consume_alt(c_token_types *kinds, int n) {
+ token_t *token = next_token();
+ for (int i = 0; i < n; i++) {
+ if (token_type(token) == kinds[i]) {
+ token_destroy(token);
+ return;
+ }
+ }
+ fprintf(stderr, "Error: Expected one of the following tokens: ");
+ for (int i = 0; i < n; i++) {
+ fprintf(stderr, "\"%s\" ", stringify_type(kinds[i]));
}
+ fprintf(stderr, "got \"%s\"\n", stringify_type(token_type(token)));
+ exit(1);
}
-// Error handling
+/* Tokenization Function */
+char file_name[1024];
+/* Warning/Error Functions */
void tok_error(const char *fmt, ...) {
va_list args;
va_start(args, fmt);
@@ -226,11 +268,11 @@ void tok_warn(const char *fmt, ...) {
va_end(args);
}
-// Tokenizer
+/* Skip Whitespace */
static token_t *skip_whitespace(void) {
int c;
while ((c = input_getc()) != EOF) {
- if (isspace(c)) {
+ if (isspace(c)) { // Whitespace
if (c == '\n') {
line++;
column = 1;
@@ -254,7 +296,7 @@ static token_t *skip_whitespace(void) {
if (c == EOF) {
return NULL;
}
- } else if (c == '/') {
+ } else if (c == '/') { // Comment
c = input_getc();
if (c == '/') {
while ((c = input_getc()) != EOF && c != '\n') {
@@ -265,7 +307,7 @@ static token_t *skip_whitespace(void) {
}
line++;
column = 1;
- } else if (c == '*') {
+ } else if (c == '*') { // Multiline comment
while ((c = input_getc()) != EOF) {
if (c == '*') {
c = input_getc();
@@ -282,7 +324,7 @@ static token_t *skip_whitespace(void) {
if (c == EOF) {
return NULL;
}
- } else {
+ } else { // Handled here to simplify the code.
if (c == '=')
return token_create(TOK_OP_ASSIGN_DIV, line, column, 2);
input_ungetc(c);
@@ -296,6 +338,8 @@ static token_t *skip_whitespace(void) {
return NULL;
}
+/* Tokenize Identifier */
+/* Get Keyword */
c_token_types get_keyword(const char *buf, int len) {
switch (buf[0]) {
case 'a':
@@ -517,360 +561,216 @@ static token_t *read_identifier(void) {
return token_create_string(kind, line, column, buf, i);
}
-token_t *read_operator(void) {
+/* Tokenize Number */
+static token_t *read_number(void) {
int c;
+ char buf[1024];
+ int i = 0;
c = input_getc();
- switch (c) {
- case '!': {
- c = input_getc();
- if (c == '=')
- return token_create(TOK_OP_NE, line, column, 2);
- input_ungetc(c);
- return token_create(TOK_OP_NOT, line, column, 1);
- }
- case '%': {
- c = input_getc();
- if (c == '=')
- return token_create(TOK_OP_ASSIGN_MOD, line, column, 2);
- input_ungetc(c);
- return token_create(TOK_OP_MOD, line, column, 1);
- }
- case '&': {
- c = input_getc();
- if (c == '&')
- return token_create(TOK_OP_AND, line, column, 2);
- if (c == '=')
- return token_create(TOK_OP_ASSIGN_BITAND, line, column, 2);
- input_ungetc(c);
- return token_create(TOK_OP_BIT_AND, line, column, 1);
- }
- case '(':
- return token_create(TOK_SEP_LEFT_PAREN, line, column, 1);
- case ')':
- return token_create(TOK_SEP_RIGHT_PAREN, line, column, 1);
- case '*': {
- c = input_getc();
- if (c == '=')
- return token_create(TOK_OP_ASSIGN_MUL, line, column, 2);
- input_ungetc(c);
- return token_create(TOK_OP_MUL, line, column, 1);
- }
- case '+': {
- c = input_getc();
- if (c == '+')
- return token_create(TOK_OP_INC, line, column, 2);
- if (c == '=')
- return token_create(TOK_OP_ASSIGN_ADD, line, column, 2);
- input_ungetc(c);
- return token_create(TOK_OP_ADD, line, column, 2);
- }
- case ',':
- return token_create(TOK_SEP_COMMA, line, column, 1);
- case '-': {
- c = input_getc();
- if (c == '-')
- return token_create(TOK_OP_DEC, line, column, 2);
- if (c == '=')
- return token_create(TOK_OP_ASSIGN_SUB, line, column, 2);
- if (c == '>')
- return token_create(TOK_OP_MEMBER_POINTER, line, column, 2);
- input_ungetc(c);
- return token_create(TOK_OP_SUB, line, column, 1);
- }
- case '.': {
- c = input_getc();
+ /* Check for valid prefix */
+ // If we don't have a digit or decimal point, it's not a number
+ if (!isdigit(c) && c != '.') {
+ input_ungetc(c);
+ return NULL;
+ }
+ // Decimal point followed by non-digit is a struct member
if (c == '.') {
+ char cnext = input_getc();
+ if (!isdigit(cnext)) {
+ input_ungetc(cnext);
+ return token_create(TOK_OP_MEMBER, line, column, 1);
+ }
+ input_ungetc(cnext);
+ }
+
+ int radix = 10;
+ /* Process Radix */
+ // Check for hex and octal.
+ if (c == '0') {
+ char cnext = input_getc();
+ if (cnext == 'x' || cnext == 'X') {
+ // Hex, discard the 0x
+ radix = 16;
+ } else {
+ // Octal, discard the 0
+ input_ungetc(cnext);
+ radix = 8;
+ }
+ } else {
+ // Decimal, append the first digit
+ buf[i++] = c;
+ }
+
+ int is_float = 0;
+ /* Read Number Loop */
+ while ((c = input_getc()) != EOF) {
+ // Since there can be multiple writes to the buffer, we want to make sure we
+ // don't overflow by giving a 4 byte pad
+ if (i > 1020) {
+ tok_error("Number too long\n");
+ return NULL;
+ }
+ // Valid digits for the radix: 0-9 for decimal, 0-7 for octal, 0-9 and
+ // a-f/A-F for hex
+ if ((radix == 10 && isdigit(c)) || (radix == 16 && isxdigit(c)) ||
+ (radix == 8 && c >= '0' && c <= '7')) {
+ buf[i++] = c;
+ // Decimal point and not a float yet, must be a float
+ } else if (c == '.' && !is_float) {
+ is_float = 1;
+ if (radix != 10) {
+ tok_error("Invalid floating point constant, expected decimal, got %s\n",
+ radix == 16 ? "hexadecimal" : "octal");
+ return NULL;
+ }
+ buf[i++] = c;
+ }
+ // Exponent on the end of a constant. (By standard this forces it to be a
+ // float)
+ else if (c == 'e' || c == 'E') {
+ buf[i++] = c;
+ c = input_getc();
+ // Sign on the exponent
+ if (c == '+' || c == '-') {
+ buf[i++] = c;
+ c = input_getc();
+ }
+ // Exponent must be a digit, I.E no 1e1.2
+ if (!isdigit(c)) {
+ tok_error("Invalid floating point exponent\n");
+ return NULL;
+ }
+ buf[i++] = c;
+ is_float = 1;
+ } else {
+ // Reached the end, unget the character so other functions can read it
+ input_ungetc(c);
+ break;
+ }
+ }
+
+ buf[i] = '\0';
+ /* Process Suffixes */
+ int is_unsigned = 0;
+ int is_long = 0;
+ int is_single = 0;
+ while (1) {
c = input_getc();
- if (c == '.') {
- return token_create(TOK_SEP_ELLIPSIS, line, column, 3);
+ if (c == 'u' || c == 'U') {
+ if (is_unsigned) {
+ tok_warn(
+ "Warning: Duplicate suffix 'u' for integer constant ignored\n");
+ }
+ is_unsigned = 1;
+ } else if (c == 'l' || c == 'L') {
+ if (is_long) {
+ tok_warn(
+ "Warning: Duplicate suffix 'l' for integer constant ignored\n");
+ }
+ is_long = 1;
+ } else if (c == 'f' || c == 'F') {
+ if (is_single) {
+ tok_warn("Warning: Duplicate suffix 'f' for floating point constant "
+ "ignored\n");
+ }
+ is_single = 1;
} else {
- // Bail out, can't store more than one unget
- tok_error("Unexpected character '.' at line %d, column %d\n", line,
- column);
- exit(1);
+ input_ungetc(c);
+ break;
}
}
- return token_create('.', line, column, 1);
- }
- case '/': {
- c = input_getc();
- if (c == '=')
- return token_create(TOK_OP_ASSIGN_DIV, line, column, 2);
- input_ungetc(c);
- return token_create(TOK_OP_DIV, line, column, 1);
- }
- case ':':
- return token_create(TOK_OP_COND_DECISION, line, column, 1);
- case ';':
- return token_create(TOK_SEP_SEMICOLON, line, column, 1);
- case '<': {
- c = input_getc();
- if (c == '<') {
- c = input_getc();
- if (c == '=')
- return token_create(TOK_OP_ASSIGN_LSHIFT, line, column, 3);
- input_ungetc(c);
- return token_create(TOK_OP_LSHIFT, line, column, 2);
- }
- if (c == '=')
- return token_create(TOK_OP_LE, line, column, 2);
- input_ungetc(c);
- return token_create(TOK_OP_LT, line, column, 1);
- }
- case '=': {
- c = input_getc();
- if (c == '=')
- return token_create(TOK_OP_ASSIGN, line, column, 2);
- input_ungetc(c);
- return token_create(TOK_OP_ASSIGN, line, column, 1);
- }
- case '>': {
- c = input_getc();
- if (c == '>') {
- c = input_getc();
- if (c == '=')
- return token_create(TOK_OP_ASSIGN_RSHIFT, line, column, 3);
- input_ungetc(c);
- return token_create(TOK_OP_RSHIFT, line, column, 2);
- }
- if (c == '=')
- return token_create(TOK_OP_GE, line, column, 2);
- input_ungetc(c);
- return token_create(TOK_OP_GT, line, column, 1);
- }
- case '?':
- return token_create(TOK_OP_COND, line, column, 1);
- case '[':
- return token_create(TOK_SEP_LEFT_BRACKET, line, column, 1);
- case ']':
- return token_create(TOK_SEP_RIGHT_BRACKET, line, column, 1);
- case '^': {
- c = input_getc();
- if (c == '=')
- return token_create(TOK_OP_ASSIGN_BITXOR, line, column, 2);
- input_ungetc(c);
- return token_create(TOK_OP_BIT_XOR, line, column, 1);
- }
- case '{':
- return token_create(TOK_SEP_LEFT_BRACE, line, column, 1);
- case '|': {
- c = input_getc();
- if (c == '|')
- return token_create(TOK_OP_OR, line, column, 2);
- if (c == '=')
- return token_create(TOK_OP_ASSIGN_BITOR, line, column, 2);
- input_ungetc(c);
- return token_create(TOK_OP_BIT_OR, line, column, 1);
- }
- case '}':
- return token_create(TOK_SEP_RIGHT_BRACE, line, column, 1);
- case '~':
- return token_create(TOK_OP_BIT_NOT, line, column, 1);
- default:
- input_ungetc(c);
- return NULL;
- }
-}
-static token_t *read_number(void) {
- int c;
- char buf[1024];
- int i = 0;
- c = input_getc();
- // If we don't have a digit or decimal point, it's not a number
- if (!isdigit(c) && c != '.') {
- input_ungetc(c);
- return NULL;
- }
- // Decimal point followed by non-digit is a struct member
- if (c == '.') {
- char cnext = input_getc();
- if (!isdigit(cnext)) {
- input_ungetc(cnext);
- return token_create(TOK_OP_MEMBER, line, column, 1);
- }
- input_ungetc(cnext);
- }
- int radix = 10;
- int is_float = 0;
- // Check for hex and octal.
- if (c == '0') {
- char cnext = input_getc();
- if (cnext == 'x' || cnext == 'X') {
- // Hex, discard the 0x
- radix = 16;
- } else {
- // Octal, discard the 0
- input_ungetc(cnext);
- radix = 8;
+ /* Check for conflicting suffixes */
+ if (is_single && is_long) {
+ tok_warn("Warning: Invalid suffixes 'l' and 'f' for floating point "
+ "constant. Ignoring 'l'\n");
+ is_long = 0;
}
- } else {
- // Decimal, append the first digit
- buf[i++] = c;
- }
- // Read the rest of the number
- while ((c = input_getc()) != EOF) {
- // Since there can be multiple writes to the buffer, we want to make sure we
- // don't overflow by giving a 4 byte pad
- if (i > 1020) {
- tok_error("Number too long\n");
- return NULL;
- }
- // Valid digits for the radix: 0-9 for decimal, 0-7 for octal, 0-9 and
- // a-f/A-F for hex
- if ((radix == 10 && isdigit(c)) || (radix == 16 && isxdigit(c)) ||
- (radix == 8 && c >= '0' && c <= '7')) {
- buf[i++] = c;
- // Decimal point and not a float yet, must be a float
- } else if (c == '.' && !is_float) {
- is_float = 1;
- if (radix != 10) {
- tok_error("Invalid floating point constant, expected decimal, got %s\n",
- radix == 16 ? "hexadecimal" : "octal");
- return NULL;
- }
- buf[i++] = c;
- }
- // Exponent on the end of a constant. (By standard this forces it to be a
- // float)
- else if (c == 'e' || c == 'E') {
- buf[i++] = c;
- c = input_getc();
- // Sign on the exponent
- if (c == '+' || c == '-') {
- buf[i++] = c;
- c = input_getc();
- }
- // Exponent must be a digit, I.E no 1e1.2
- if (!isdigit(c)) {
- tok_error("Invalid floating point exponent\n");
- return NULL;
- }
- buf[i++] = c;
- is_float = 1;
- } else {
- // Reached the end, unget the character so other functions can read it
- input_ungetc(c);
- break;
- }
- }
- buf[i] = '\0';
- // Check for suffixes
- int is_unsigned = 0;
- int is_long = 0;
- int is_single = 0;
- // Loop to get all possible suffixes. Warn when duplicated.
- while (1) {
- c = input_getc();
- if (c == 'u' || c == 'U') {
- if (is_unsigned) {
- tok_warn(
- "Warning: Duplicate suffix 'u' for integer constant ignored\n");
- }
- is_unsigned = 1;
- } else if (c == 'l' || c == 'L') {
- if (is_long) {
- tok_warn(
- "Warning: Duplicate suffix 'l' for integer constant ignored\n");
- }
- is_long = 1;
- } else if (c == 'f' || c == 'F') {
- if (is_single) {
- tok_warn("Warning: Duplicate suffix 'f' for floating point constant "
- "ignored\n");
- }
- is_single = 1;
- } else {
- input_ungetc(c);
- break;
- }
- }
- // Resolve invalid suffixes. Doesn't error because you can still compile with
- // them.
- if (is_single && is_long) {
- tok_warn("Warning: Invalid suffixes 'l' and 'f' for floating point "
- "constant. Ignoring 'l'\n");
- is_long = 0;
- }
- if (is_single && is_unsigned) {
- tok_warn("Warning: Invalid suffixes 'u' and 'f' for floating point "
- "constant. Ignoring 'u'\n");
- is_unsigned = 0;
- }
- if (is_single && !is_float) {
- tok_warn(
- "Warning: Invalid suffix 'f' for integer constant. Ignoring 'f'\n");
- is_single = 0;
- }
- // use the strtox functions to convert the string to a number
- if (is_float) {
- errno = 0;
- // Strtod generates a unix-style error when it's given something out of
- // range, so we want to get on top of that quickly instead of ignoring it
- // That way we can avoid some nasty NAN-propagation in the constant folder.
- double f = strtod(buf, NULL);
- if (errno == ERANGE) {
- tok_error("Floating point constant out of range\n");
- return NULL;
+ if (is_single && is_unsigned) {
+ tok_warn("Warning: Invalid suffixes 'u' and 'f' for floating point "
+ "constant. Ignoring 'u'\n");
+ is_unsigned = 0;
}
- // Warn if the constant is out of range for a float, I.E it's too big or too
- // small
- if (is_single && (f < FLT_MIN || f > FLT_MAX)) {
+ if (is_single && !is_float) {
tok_warn(
- "Warning: Floating point constant %f is out of range for float\n", f);
- }
- // Warn if the constant is too precise for a float
- if (is_single && fabs((double)((float)f) - f) >= FLT_EPSILON) {
- tok_warn("Warning: Converting double precision floating point constant "
- "%f to float loses "
- "precision\n",
- f);
+ "Warning: Invalid suffix 'f' for integer constant. Ignoring 'f'\n");
+ is_single = 0;
}
- return token_create_float(is_single ? TOK_CONST_FLOAT_32
- : TOK_CONST_FLOAT_64,
- line, column, f, i);
- } else {
- errno = 0;
- uint64_t int_ = strtoull(buf, NULL, radix);
- // Same as above, but for integers
- if (errno == ERANGE) {
- tok_error("Integer constant out of range\n");
- return NULL;
- }
- if (is_unsigned) {
- if (is_long) {
- return token_create_int(TOK_CONST_INTEGER_U64, line, column, int_, i);
- } else {
- if (int_ > UINT32_MAX) {
- tok_warn(
- "Warning: Integer constant %lld is out of range for unsigned "
- "int\n",
- int_);
+
+ if (is_float) {
+ /* Convert to float */
+ errno = 0;
+ // Strtod generates a unix-style error when it's given something out of
+ // range, so we want to get on top of that quickly instead of ignoring it
+ // That way we can avoid some nasty NAN-propagation in the constant folder.
+ double f = strtod(buf, NULL);
+ if (errno == ERANGE) {
+ tok_error("Floating point constant out of range\n");
+ return NULL;
}
- return token_create_int(TOK_CONST_INTEGER_U32, line, column, int_, i);
- }
- } else {
- if (is_long) {
- // If the highest bit is set, that means this will overflow a signed
- // long (Due to two's complement)
- if (int_ & (1UL << 63)) {
+ // Warn if the constant is out of range for a float, I.E it's too big or too
+ // small
+ if (is_single && (f < FLT_MIN || f > FLT_MAX)) {
tok_warn(
- "Warning: Integer constant %lld is out of range for long long\n",
- i);
+ "Warning: Floating point constant %f is out of range for float\n", f);
}
- return token_create_int(TOK_CONST_INTEGER_S64, line, column, int_, i);
- } else {
- if (int_ & (1UL << 31)) {
- tok_warn("Warning: Integer constant %lld is out of range for int\n",
- int_);
+ // Warn if the constant is too precise for a float
+ if (is_single && fabs((double)((float)f) - f) >= FLT_EPSILON) {
+ tok_warn("Warning: Converting double precision floating point constant "
+ "%f to float loses "
+ "precision\n",
+ f);
}
- return token_create_int(TOK_CONST_INTEGER_S32, line, column, int_, i);
- }
- }
+ return token_create_float(is_single ? TOK_CONST_FLOAT_32
+ : TOK_CONST_FLOAT_64,
+ line, column, f, i);
+
+ } else {
+ /* Convert to integer */
+ errno = 0;
+ uint64_t int_ = strtoull(buf, NULL, radix);
+ // Same as above, but for integers
+ if (errno == ERANGE) {
+ tok_error("Integer constant out of range\n");
+ return NULL;
+ }
+ if (is_unsigned) {
+ if (is_long) {
+ return token_create_int(TOK_CONST_INTEGER_U64, line, column, int_, i);
+ } else {
+ if (int_ > UINT32_MAX) {
+ tok_warn(
+ "Warning: Integer constant %lld is out of range for unsigned "
+ "int\n",
+ int_);
+ }
+ return token_create_int(TOK_CONST_INTEGER_U32, line, column, int_, i);
+ }
+ } else {
+ if (is_long) {
+ // If the highest bit is set, that means this will overflow a signed
+ // long (Due to two's complement)
+ if (int_ & (1UL << 63)) {
+ tok_warn(
+ "Warning: Integer constant %lld is out of range for long long\n",
+ int_);
+ }
+ return token_create_int(TOK_CONST_INTEGER_S64, line, column, int_, i);
+ } else {
+ if (int_ & (1UL << 31)) {
+ tok_warn("Warning: Integer constant %lld is out of range for int\n",
+ int_);
+ }
+ return token_create_int(TOK_CONST_INTEGER_S32, line, column, int_, i);
+ }
+ }
+
}
return NULL;
}
+/* Tokenize String */
+/* Read Escape Sequence */
static char read_escape_sequence(int *len) {
int c = input_getc();
*len += 1;
@@ -930,33 +830,6 @@ static char read_escape_sequence(int *len) {
}
}
-static token_t *read_char_constant(void) {
- int c;
- int len = 0;
- c = input_getc();
- if (c != '\'') {
- input_ungetc(c);
- return NULL;
- }
- len++;
- c = input_getc();
- if (c == '\'') {
- tok_error("Empty character constant\n");
- return NULL;
- }
- if (c == '\\') {
- c = read_escape_sequence(&len);
- }
- int val = c;
- c = input_getc();
- if (c != '\'') {
- tok_error("Expected closing quote for character constant\n");
- return NULL;
- }
- len++;
- return token_create_char(TOK_CONST_CHAR, line, column, val, len);
-}
-
static token_t *read_string_literal(void) {
int c;
c = input_getc();
@@ -965,8 +838,6 @@ static token_t *read_string_literal(void) {
return NULL;
}
int i = 0;
- // Malloc is used for the buf here, the pointer stays function local as string
- // interning duplicates the string
char s_buf[512];
char *buf = s_buf;
int len = 512;
@@ -1015,11 +886,196 @@ static token_t *read_string_literal(void) {
return tok;
}
-token_t *read_token(void) {
- if (unget_token != NULL) {
- token_t *tok = unget_token;
- unget_token = NULL;
- return tok;
+/* Tokenize Character */
+static token_t *read_char_constant(void) {
+ int c;
+ int len = 0;
+ c = input_getc();
+ if (c != '\'') {
+ input_ungetc(c);
+ return NULL;
+ }
+ len++;
+ c = input_getc();
+ if (c == '\'') {
+ tok_error("Empty character constant\n");
+ return NULL;
+ }
+ if (c == '\\') {
+ c = read_escape_sequence(&len);
+ }
+ int val = c;
+ c = input_getc();
+ if (c != '\'') {
+ tok_error("Expected closing quote for character constant\n");
+ return NULL;
+ }
+ len++;
+ return token_create_char(TOK_CONST_CHAR, line, column, val, len);
+}
+
+/* Tokenize Operator */
+
+token_t *read_operator(void) {
+ int c;
+ c = input_getc();
+ switch (c) {
+ case '!': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_NE, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_NOT, line, column, 1);
+ }
+ case '%': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_MOD, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_MOD, line, column, 1);
+ }
+ case '&': {
+ c = input_getc();
+ if (c == '&')
+ return token_create(TOK_OP_AND, line, column, 2);
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_BITAND, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_BIT_AND, line, column, 1);
+ }
+ case '(':
+ return token_create(TOK_SEP_LEFT_PAREN, line, column, 1);
+ case ')':
+ return token_create(TOK_SEP_RIGHT_PAREN, line, column, 1);
+ case '*': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_MUL, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_MUL, line, column, 1);
+ }
+ case '+': {
+ c = input_getc();
+ if (c == '+')
+ return token_create(TOK_OP_INC, line, column, 2);
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_ADD, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_ADD, line, column, 1);
+ }
+ case ',':
+ return token_create(TOK_SEP_COMMA, line, column, 1);
+ case '-': {
+ c = input_getc();
+ if (c == '-')
+ return token_create(TOK_OP_DEC, line, column, 2);
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_SUB, line, column, 2);
+ if (c == '>')
+ return token_create(TOK_OP_MEMBER_POINTER, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_SUB, line, column, 1);
+ }
+ case '.': {
+ c = input_getc();
+ if (c == '.') {
+ c = input_getc();
+ if (c == '.') {
+ return token_create(TOK_SEP_ELLIPSIS, line, column, 3);
+ } else {
+ // Bail out, can't store more than one unget
+ tok_error("Unexpected character '.' at line %d, column %d\n", line,
+ column);
+ exit(1);
+ }
+ }
+ input_ungetc(c); return token_create(TOK_OP_MEMBER, line, column, 1);
+ }
+ case '/': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_DIV, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_DIV, line, column, 1);
+ }
+ case ':':
+ return token_create(TOK_OP_COND_DECISION, line, column, 1);
+ case ';':
+ return token_create(TOK_SEP_SEMICOLON, line, column, 1);
+ case '<': {
+ c = input_getc();
+ if (c == '<') {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_LSHIFT, line, column, 3);
+ input_ungetc(c);
+ return token_create(TOK_OP_LSHIFT, line, column, 2);
+ }
+ if (c == '=')
+ return token_create(TOK_OP_LE, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_LT, line, column, 1);
+ }
+ case '=': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_EQ, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_ASSIGN, line, column, 1);
+ }
+ case '>': {
+ c = input_getc();
+ if (c == '>') {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_RSHIFT, line, column, 3);
+ input_ungetc(c);
+ return token_create(TOK_OP_RSHIFT, line, column, 2);
+ }
+ if (c == '=')
+ return token_create(TOK_OP_GE, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_GT, line, column, 1);
+ }
+ case '?':
+ return token_create(TOK_OP_COND, line, column, 1);
+ case '[':
+ return token_create(TOK_SEP_LEFT_BRACKET, line, column, 1);
+ case ']':
+ return token_create(TOK_SEP_RIGHT_BRACKET, line, column, 1);
+ case '^': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_BITXOR, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_BIT_XOR, line, column, 1);
+ }
+ case '{':
+ return token_create(TOK_SEP_LEFT_BRACE, line, column, 1);
+ case '|': {
+ c = input_getc();
+ if (c == '|')
+ return token_create(TOK_OP_OR, line, column, 2);
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_BITOR, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_BIT_OR, line, column, 1);
+ }
+ case '}':
+ return token_create(TOK_SEP_RIGHT_BRACE, line, column, 1);
+ case '~':
+ return token_create(TOK_OP_BIT_NOT, line, column, 1);
+ default:
+ input_ungetc(c);
+ return NULL;
+ }
+
+ return NULL;
+}
+
+token_t *next_token(void) {
+ if (left_stack_pos > 0) {
+ return left_stack[--left_stack_pos];
}
token_t *tok = skip_whitespace();
if (tok != NULL) {
@@ -1052,291 +1108,11 @@ token_t *read_token(void) {
tok_warn(
"Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c,
line, column);
- return read_token();
+ return next_token();
}
-const char *token_name_from_type(c_token_types type) {
- switch (type) {
- case TOK_CTK_IF:
- return "TOK_CTK_IF";
- case TOK_CTK_ELSE:
- return "TOK_CTK_ELSE";
- case TOK_CTK_SWITCH:
- return "TOK_CTK_SWITCH";
- case TOK_CTK_CASE:
- return "TOK_CTK_CASE";
- case TOK_CTK_DEFAULT:
- return "TOK_CTK_DEFAULT";
- case TOK_CTK_WHILE:
- return "TOK_CTK_WHILE";
- case TOK_CTK_DO:
- return "TOK_CTK_DO";
- case TOK_CTK_FOR:
- return "TOK_CTK_FOR";
- case TOK_CTK_CONTINUE:
- return "TOK_CTK_CONTINUE";
- case TOK_CTK_BREAK:
- return "TOK_CTK_BREAK";
- case TOK_CTK_RETURN:
- return "TOK_CTK_RETURN";
- case TOK_CTK_GOTO:
- return "TOK_CTK_GOTO";
- case TOK_TK_VOID:
- return "TOK_TK_VOID";
- case TOK_TK_CHAR:
- return "TOK_TK_CHAR";
- case TOK_TK_SHORT:
- return "TOK_TK_SHORT";
- case TOK_TK_INT:
- return "TOK_TK_INT";
- case TOK_TK_LONG:
- return "TOK_TK_LONG";
- case TOK_TK_FLOAT:
- return "TOK_TK_FLOAT";
- case TOK_TK_DOUBLE:
- return "TOK_TK_DOUBLE";
- case TOK_TK_SIGNED:
- return "TOK_TK_SIGNED";
- case TOK_TK_UNSIGNED:
- return "TOK_TK_UNSIGNED";
- case TOK_TK_STRUCT:
- return "TOK_TK_STRUCT";
- case TOK_TK_UNION:
- return "TOK_TK_UNION";
- case TOK_TK_ENUM:
- return "TOK_TK_ENUM";
- case TOK_TK_TYPEDEF:
- return "TOK_TK_TYPEDEF";
- case TOK_SCSK_AUTO:
- return "TOK_SCSK_AUTO";
- case TOK_SCSK_REGISTER:
- return "TOK_SCSK_REGISTER";
- case TOK_SCSK_STATIC:
- return "TOK_SCSK_STATIC";
- case TOK_SCSK_EXTERN:
- return "TOK_SCSK_EXTERN";
- case TOK_SCSK_CONST:
- return "TOK_SCSK_CONST";
- case TOK_SCSK_VOLATILE:
- return "TOK_SCSK_VOLATILE";
- case TOK_MK_SIZEOF:
- return "TOK_MK_SIZEOF";
- case TOK_OP_ADD:
- return "TOK_OP_ADD";
- case TOK_OP_SUB:
- return "TOK_OP_SUB";
- case TOK_OP_MUL:
- return "TOK_OP_MUL";
- case TOK_OP_DIV:
- return "TOK_OP_DIV";
- case TOK_OP_MOD:
- return "TOK_OP_MOD";
- case TOK_OP_BIT_AND:
- return "TOK_OP_BIT_AND";
- case TOK_OP_BIT_OR:
- return "TOK_OP_BIT_OR";
- case TOK_OP_BIT_XOR:
- return "TOK_OP_BIT_XOR";
- case TOK_OP_BIT_NOT:
- return "TOK_OP_BIT_NOT";
- case TOK_OP_LSHIFT:
- return "TOK_OP_LSHIFT";
- case TOK_OP_RSHIFT:
- return "TOK_OP_RSHIFT";
- case TOK_OP_NOT:
- return "TOK_OP_NOT";
- case TOK_OP_ASSIGN:
- return "TOK_OP_ASSIGN";
- case TOK_OP_LT:
- return "TOK_OP_LT";
- case TOK_OP_GT:
- return "TOK_OP_GT";
- case TOK_OP_INC:
- return "TOK_OP_INC";
- case TOK_OP_DEC:
- return "TOK_OP_DEC";
- case TOK_OP_EQ:
- return "TOK_OP_EQ";
- case TOK_OP_NE:
- return "TOK_OP_NE";
- case TOK_OP_LE:
- return "TOK_OP_LE";
- case TOK_OP_GE:
- return "TOK_OP_GE";
- case TOK_OP_AND:
- return "TOK_OP_AND";
- case TOK_OP_OR:
- return "TOK_OP_OR";
- case TOK_OP_MEMBER_POINTER:
- return "TOK_OP_MEMBER_POINTER";
- case TOK_OP_MEMBER:
- return "TOK_OP_MEMBER";
- case TOK_OP_COND_DECISION:
- return "TOK_OP_COND_DECISION";
- case TOK_OP_COND:
- return "TOK_OP_COND";
- case TOK_OP_ASSIGN_ADD:
- return "TOK_OP_ASSIGN_ADD";
- case TOK_OP_ASSIGN_SUB:
- return "TOK_OP_ASSIGN_SUB";
- case TOK_OP_ASSIGN_MUL:
- return "TOK_OP_ASSIGN_MUL";
- case TOK_OP_ASSIGN_DIV:
- return "TOK_OP_ASSIGN_DIV";
- case TOK_OP_ASSIGN_MOD:
- return "TOK_OP_ASSIGN_MOD";
- case TOK_OP_ASSIGN_BITAND:
- return "TOK_OP_ASSIGN_BITAND";
- case TOK_OP_ASSIGN_BITOR:
- return "TOK_OP_ASSIGN_BITOR";
- case TOK_OP_ASSIGN_BITXOR:
- return "TOK_OP_ASSIGN_BITXOR";
- case TOK_OP_ASSIGN_LSHIFT:
- return "TOK_OP_ASSIGN_LSHIFT";
- case TOK_OP_ASSIGN_RSHIFT:
- return "TOK_OP_ASSIGN_RSHIFT";
- case TOK_SEP_HASH:
- return "TOK_SEP_HASH";
- case TOK_ID:
- return "TOK_ID";
- case TOK_CONST_INTEGER_U32:
- return "TOK_CONST_INTEGER_U32";
- case TOK_CONST_INTEGER_U64:
- return "TOK_CONST_INTEGER_U64";
- case TOK_CONST_INTEGER_S32:
- return "TOK_CONST_INTEGER_S32";
- case TOK_CONST_INTEGER_S64:
- return "TOK_CONST_INTEGER_S64";
- case TOK_CONST_FLOAT_32:
- return "TOK_CONST_FLOAT_32";
- case TOK_CONST_FLOAT_64:
- return "TOK_CONST_FLOAT_64";
- case TOK_CONST_CHAR:
- return "TOK_CONST_CHAR";
- case TOK_CONST_STRING_ASCII:
- return "TOK_CONST_STRING_ASCII";
- case TOK_SPECIAL_EOF:
- return "TOK_SPECIAL_EOF";
- case TOK_SPECIAL_ERROR:
- return "TOK_SPECIAL_ERROR";
- case TOK_SEP_LEFT_PAREN:
- return "TOK_SEP_LEFT_PAREN";
- case TOK_SEP_RIGHT_PAREN:
- return "TOK_SEP_RIGHT_PAREN";
- case TOK_SEP_LEFT_BRACKET:
- return "TOK_SEP_LEFT_BRACKET";
- case TOK_SEP_RIGHT_BRACKET:
- return "TOK_SEP_RIGHT_BRACKET";
- case TOK_SEP_LEFT_BRACE:
- return "TOK_SEP_LEFT_BRACE";
- case TOK_SEP_RIGHT_BRACE:
- return "TOK_SEP_RIGHT_BRACE";
- case TOK_SEP_COMMA:
- return "TOK_SEP_COMMA";
- case TOK_SEP_SEMICOLON:
- return "TOK_SEP_SEMICOLON";
- case TOK_SEP_DOT:
- return "TOK_SEP_DOT";
- case TOK_SEP_ELLIPSIS:
- return "TOK_SEP_ELLIPSIS";
- }
- return "UNKNOWN";
-}
-
-char *re_escape_string(const char *str) {
- int len = strlen(str);
- char *buf = malloc(len * 2 + 1);
- if (buf == NULL) {
- fprintf(stderr, "Out of memory. Cannot escape string\n");
- exit(1);
- }
- int i = 0;
- for (int j = 0; j < len; j++) {
- switch (str[j]) {
- case '\a':
- buf[i++] = '\\';
- buf[i++] = 'a';
- break;
- case '\b':
- buf[i++] = '\\';
- buf[i++] = 'b';
- break;
- case '\f':
- buf[i++] = '\\';
- buf[i++] = 'f';
- break;
- case '\n':
- buf[i++] = '\\';
- buf[i++] = 'n';
- break;
- case '\r':
- buf[i++] = '\\';
- buf[i++] = 'r';
- break;
- case '\t':
- buf[i++] = '\\';
- buf[i++] = 't';
- break;
- case '\v':
- buf[i++] = '\\';
- buf[i++] = 'v';
- break;
- case '\\':
- buf[i++] = '\\';
- buf[i++] = '\\';
- break;
- case '\'':
- buf[i++] = '\\';
- buf[i++] = '\'';
- break;
- case '"':
- buf[i++] = '\\';
- buf[i++] = '"';
- break;
- default:
- buf[i++] = str[j];
- break;
- }
- }
- buf[i] = '\0';
- return buf;
-}
-
-void print_token(token_t *tok) {
- if (tok == NULL) {
- printf("NULL\n");
- return;
- }
- const char *name = token_name_from_type(tok->kind);
- switch (tok->kind) {
- case TOK_ID:
- case TOK_CONST_STRING_ASCII: {
- char *escaped = re_escape_string(token_string(tok));
- printf("%s: \"%s\"@%d:%d\n", name, escaped, tok->line, tok->column);
- free(escaped);
- break;
- }
- case TOK_CONST_CHAR:
- printf("%s: '%c'@%d:%d\n", name, token_char(tok), tok->line, tok->column);
- break;
- case TOK_CONST_INTEGER_S32:
- case TOK_CONST_INTEGER_U32:
- case TOK_CONST_INTEGER_S64:
- case TOK_CONST_INTEGER_U64:
- printf("%s: %ld@%d:%d\n", name, token_int(tok), tok->line, tok->column);
- break;
- case TOK_CONST_FLOAT_32:
- case TOK_CONST_FLOAT_64:
- printf("%s: %f@%d:%d\n", name, token_float(tok), tok->line, tok->column);
- break;
- default:
- printf("%s@%d:%d\n", name, tok->line, tok->column);
- break;
- }
-}
-
-#ifdef TOK_TEST
-
+#ifdef TEST_TOKENIZER
+/* Run Test */
char *preprocess(char *in) {
char *output_name = malloc(1024);
snprintf(output_name, 1024, "%s.preprocessed", in);
@@ -1347,7 +1123,7 @@ char *preprocess(char *in) {
return output_name;
}
-// Tokenize ourselves
+// Tokenize the input file
int main(int argc, char **argv) {
if (argc != 2) {
fprintf(stderr, "Usage: %s <input.c>\n", argv[0]);
@@ -1355,16 +1131,18 @@ int main(int argc, char **argv) {
}
char *input_name = argv[1];
char *preprocessed = preprocess(input_name);
- tokenizer_init(preprocessed);
+ init_tokenizer(preprocessed);
token_t *tok;
- while ((tok = read_token()) != NULL) {
+ while ((tok = next_token()) != NULL) {
print_token(tok);
token_destroy(tok);
}
- tokenizer_destroy();
+ destroy_tokenizer();
remove(preprocessed);
free(preprocessed);
return 0;
}
#endif
+
+
diff --git a/projects/cminus/code/tokenizer.h b/projects/cminus/code/tokenizer.h
@@ -1,132 +1,16 @@
+/* tokenizer.h */
#ifndef TOKENIZER_H
#define TOKENIZER_H
-#include <stdint.h>
-
-typedef struct token token_t;
-typedef enum {
- // Control Keywords
- TOK_CTK_IF,
- TOK_CTK_ELSE,
- TOK_CTK_SWITCH,
- TOK_CTK_CASE,
- TOK_CTK_DEFAULT,
- TOK_CTK_WHILE,
- TOK_CTK_DO,
- TOK_CTK_FOR,
- TOK_CTK_CONTINUE,
- TOK_CTK_BREAK,
- TOK_CTK_RETURN,
- TOK_CTK_GOTO,
-
- // Type Keywords
- TOK_TK_VOID,
- TOK_TK_CHAR,
- TOK_TK_SHORT,
- TOK_TK_INT,
- TOK_TK_LONG,
- TOK_TK_FLOAT,
- TOK_TK_DOUBLE,
- TOK_TK_SIGNED,
- TOK_TK_UNSIGNED,
- TOK_TK_STRUCT,
- TOK_TK_UNION,
- TOK_TK_ENUM,
- TOK_TK_TYPEDEF,
-
- // Storage Class/Specifier Keywords
- TOK_SCSK_AUTO,
- TOK_SCSK_REGISTER,
- TOK_SCSK_STATIC,
- TOK_SCSK_EXTERN,
- TOK_SCSK_CONST,
- TOK_SCSK_VOLATILE,
-
- // Misc Keywords
- TOK_MK_SIZEOF,
-
- // Operators
- TOK_OP_ADD, // +
- TOK_OP_SUB, // -
- TOK_OP_MUL, // *
- TOK_OP_DIV, // /
- TOK_OP_MOD, // %
- TOK_OP_BIT_AND, // &
- TOK_OP_BIT_OR, // |
- TOK_OP_BIT_XOR, // ^
- TOK_OP_BIT_NOT, // ~
- TOK_OP_LSHIFT, // <<
- TOK_OP_RSHIFT, // >>
- TOK_OP_NOT, // !
- TOK_OP_ASSIGN, // =
- TOK_OP_LT, // <
- TOK_OP_GT, // >
- TOK_OP_INC, // ++
- TOK_OP_DEC, // --
- TOK_OP_EQ, // ==
- TOK_OP_NE, // !=
- TOK_OP_LE, // <=
- TOK_OP_GE, // >=
- TOK_OP_AND, // &&
- TOK_OP_OR, // ||
- TOK_OP_MEMBER_POINTER, // ->
- TOK_OP_MEMBER, // .
- TOK_OP_COND_DECISION, // :
- TOK_OP_COND, // ?
- TOK_OP_ASSIGN_ADD, // +=
- TOK_OP_ASSIGN_SUB, // -=
- TOK_OP_ASSIGN_MUL, // *=
- TOK_OP_ASSIGN_DIV, // /=
- TOK_OP_ASSIGN_MOD, // %=
- TOK_OP_ASSIGN_BITAND, // &=
- TOK_OP_ASSIGN_BITOR, // |=
- TOK_OP_ASSIGN_BITXOR, // ^=
- TOK_OP_ASSIGN_LSHIFT, // <<=
- TOK_OP_ASSIGN_RSHIFT, // >>=
-
- // Separators
- TOK_SEP_LEFT_PAREN, // (
- TOK_SEP_RIGHT_PAREN, // )
- TOK_SEP_LEFT_BRACKET, // [
- TOK_SEP_RIGHT_BRACKET, // ]
- TOK_SEP_LEFT_BRACE, // {
- TOK_SEP_RIGHT_BRACE, // }
- TOK_SEP_COMMA, // ,
- TOK_SEP_SEMICOLON, // ;
- TOK_SEP_DOT, // .
- TOK_SEP_ELLIPSIS, // ...
- TOK_SEP_HASH, // #
-
- // Identifiers
- TOK_ID, // Any identifier not ending in _t
-
- // Constants
- TOK_CONST_INTEGER_U32, // u
- TOK_CONST_INTEGER_U64, // ul
- TOK_CONST_INTEGER_S32, // (no suffix)
- TOK_CONST_INTEGER_S64, // l
- TOK_CONST_FLOAT_32, // f
- TOK_CONST_FLOAT_64, // (no suffix)
- TOK_CONST_CHAR, // 'c'
- TOK_CONST_STRING_ASCII, // "string" (width of 8 bits)
-
- // Special
- TOK_SPECIAL_EOF,
- TOK_SPECIAL_ERROR,
-} c_token_types;
-
-
-void tokenizer_init(const char *filename);
-token_t *tokenizer_get(void);
-void tokenizer_unget(token_t *token);
-void tokenizer_destroy(void);
-
-c_token_types token_type(token_t *token);
-int64_t token_int(token_t *token);
-double token_float(token_t *token);
-const char *token_string(token_t *token);
-char token_char(token_t *token);
-int token_line(token_t *token);
-int token_column(token_t *token);
-void token_destroy(token_t *token);
+#include "token.h"
+#include "input.h"
+/* Tokenization Interface */
+void init_tokenizer(const char *filename);
+void destroy_tokenizer(void);
+token_t *next_token(void);
+void reject_token(token_t *token);
+token_t *peek_token(void);
+void consume(c_token_types kind);
+void consume_alt(c_token_types *kinds, int n);
#endif
+
diff --git a/projects/cminus/lexer.html b/projects/cminus/lexer.html
@@ -1,269 +1,391 @@
<!DOCTYPE html>
-<html lang="en">
-
+<html>
<head>
- <meta charset="utf-8">
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
- <meta name="description" content="Welcome to my alright blog">
- <meta name="author" content="Reagan Fischer">
- <title>Lexer</title>
- <link rel="icon" type="image/x-icon" href="/images/win95.png">
- <link rel="apple-touch-icon" href="/images/win95.png">
- <link rel="stylesheet" href="/projects/style.css">
- <script src="/js/prettify/run_prettify.js"></script>
+<meta charset="utf-8">
+<title>Lexer</title>
+<script>
+!function(){var q=null;window.PR_SHOULD_USE_CONTINUATION=!0;
+(function(){function R(a){function d(e){var b=e.charCodeAt(0);if(b!==92)return b;var a=e.charAt(1);return(b=r[a])?b:"0"<=a&&a<="7"?parseInt(e.substring(1),8):a==="u"||a==="x"?parseInt(e.substring(2),16):e.charCodeAt(1)}function g(e){if(e<32)return(e<16?"\\x0":"\\x")+e.toString(16);e=String.fromCharCode(e);return e==="\\"||e==="-"||e==="]"||e==="^"?"\\"+e:e}function b(e){var b=e.substring(1,e.length-1).match(/\\u[\dA-Fa-f]{4}|\\x[\dA-Fa-f]{2}|\\[0-3][0-7]{0,2}|\\[0-7]{1,2}|\\[\S\s]|[^\\]/g),e=[],a=
+b[0]==="^",c=["["];a&&c.push("^");for(var a=a?1:0,f=b.length;a<f;++a){var h=b[a];if(/\\[bdsw]/i.test(h))c.push(h);else{var h=d(h),l;a+2<f&&"-"===b[a+1]?(l=d(b[a+2]),a+=2):l=h;e.push([h,l]);l<65||h>122||(l<65||h>90||e.push([Math.max(65,h)|32,Math.min(l,90)|32]),l<97||h>122||e.push([Math.max(97,h)&-33,Math.min(l,122)&-33]))}}e.sort(function(e,a){return e[0]-a[0]||a[1]-e[1]});b=[];f=[];for(a=0;a<e.length;++a)h=e[a],h[0]<=f[1]+1?f[1]=Math.max(f[1],h[1]):b.push(f=h);for(a=0;a<b.length;++a)h=b[a],c.push(g(h[0])),
+h[1]>h[0]&&(h[1]+1>h[0]&&c.push("-"),c.push(g(h[1])));c.push("]");return c.join("")}function s(e){for(var a=e.source.match(/\[(?:[^\\\] ]|\\[\S\s])*]|\\u[\dA-Fa-f]{4}|\\x[\dA-Fa-f]{2}|\\\d+|\\[^\dux]|\(\?[!:=]|[()^]|[^()[\\^]+/g),c=a.length,d=[],f=0,h=0;f<c;++f){var l=a[f];l==="("?++h:"\\"===l.charAt(0)&&(l=+l.substring(1))&&(l<=h?d[l]=-1:a[f]=g(l))}for(f=1;f<d.length;++f)-1===d[f]&&(d[f]=++x);for(h=f=0;f<c;++f)l=a[f],l==="("?(++h,d[h]||(a[f]="(?:")):"\\"===l.charAt(0)&&(l=+l.substring(1))&&l<=h&&
+(a[f]="\\"+d[l]);for(f=0;f<c;++f)"^"===a[f]&&"^"!==a[f+1]&&(a[f]="");if(e.ignoreCase&&m)for(f=0;f<c;++f)l=a[f],e=l.charAt(0),l.length>=2&&e==="["?a[f]=b(l):e!=="\\"&&(a[f]=l.replace(/[A-Za-z]/g,function(a){a=a.charCodeAt(0);return"["+String.fromCharCode(a&-33,a|32)+"]"}));return a.join("")}for(var x=0,m=!1,j=!1,k=0,c=a.length;k<c;++k){var i=a[k];if(i.ignoreCase)j=!0;else if(/[a-z]/i.test(i.source.replace(/\\u[\da-f]{4}|\\x[\da-f]{2}|\\[^UXux]/gi,""))){m=!0;j=!1;break}}for(var r={b:8,t:9,n:10,v:11,
+f:12,r:13},n=[],k=0,c=a.length;k<c;++k){i=a[k];if(i.global||i.multiline)throw Error(""+i);n.push("(?:"+s(i)+")")}return RegExp(n.join("|"),j?"gi":"g")}function S(a,d){function g(a){var c=a.nodeType;if(c==1){if(!b.test(a.className)){for(c=a.firstChild;c;c=c.nextSibling)g(c);c=a.nodeName.toLowerCase();if("br"===c||"li"===c)s[j]="\n",m[j<<1]=x++,m[j++<<1|1]=a}}else if(c==3||c==4)c=a.nodeValue,c.length&&(c=d?c.replace(/\r\n?/g,"\n"):c.replace(/[\t\n\r ]+/g," "),s[j]=c,m[j<<1]=x,x+=c.length,m[j++<<1|1]=
+a)}var b=/(?:^|\s)nocode(?:\s|$)/,s=[],x=0,m=[],j=0;g(a);return{a:s.join("").replace(/\n$/,""),d:m}}function H(a,d,g,b){d&&(a={a:d,e:a},g(a),b.push.apply(b,a.g))}function T(a){for(var d=void 0,g=a.firstChild;g;g=g.nextSibling)var b=g.nodeType,d=b===1?d?a:g:b===3?U.test(g.nodeValue)?a:d:d;return d===a?void 0:d}function D(a,d){function g(a){for(var j=a.e,k=[j,"pln"],c=0,i=a.a.match(s)||[],r={},n=0,e=i.length;n<e;++n){var z=i[n],w=r[z],t=void 0,f;if(typeof w==="string")f=!1;else{var h=b[z.charAt(0)];
+if(h)t=z.match(h[1]),w=h[0];else{for(f=0;f<x;++f)if(h=d[f],t=z.match(h[1])){w=h[0];break}t||(w="pln")}if((f=w.length>=5&&"lang-"===w.substring(0,5))&&!(t&&typeof t[1]==="string"))f=!1,w="src";f||(r[z]=w)}h=c;c+=z.length;if(f){f=t[1];var l=z.indexOf(f),B=l+f.length;t[2]&&(B=z.length-t[2].length,l=B-f.length);w=w.substring(5);H(j+h,z.substring(0,l),g,k);H(j+h+l,f,I(w,f),k);H(j+h+B,z.substring(B),g,k)}else k.push(j+h,w)}a.g=k}var b={},s;(function(){for(var g=a.concat(d),j=[],k={},c=0,i=g.length;c<i;++c){var r=
+g[c],n=r[3];if(n)for(var e=n.length;--e>=0;)b[n.charAt(e)]=r;r=r[1];n=""+r;k.hasOwnProperty(n)||(j.push(r),k[n]=q)}j.push(/[\S\s]/);s=R(j)})();var x=d.length;return g}function v(a){var d=[],g=[];a.tripleQuotedStrings?d.push(["str",/^(?:'''(?:[^'\\]|\\[\S\s]|''?(?=[^']))*(?:'''|$)|"""(?:[^"\\]|\\[\S\s]|""?(?=[^"]))*(?:"""|$)|'(?:[^'\\]|\\[\S\s])*(?:'|$)|"(?:[^"\\]|\\[\S\s])*(?:"|$))/,q,"'\""]):a.multiLineStrings?d.push(["str",/^(?:'(?:[^'\\]|\\[\S\s])*(?:'|$)|"(?:[^"\\]|\\[\S\s])*(?:"|$)|`(?:[^\\`]|\\[\S\s])*(?:`|$))/,
+q,"'\"`"]):d.push(["str",/^(?:'(?:[^\n\r'\\]|\\.)*(?:'|$)|"(?:[^\n\r"\\]|\\.)*(?:"|$))/,q,"\"'"]);a.verbatimStrings&&g.push(["str",/^@"(?:[^"]|"")*(?:"|$)/,q]);var b=a.hashComments;b&&(a.cStyleComments?(b>1?d.push(["com",/^#(?:##(?:[^#]|#(?!##))*(?:###|$)|.*)/,q,"#"]):d.push(["com",/^#(?:(?:define|e(?:l|nd)if|else|error|ifn?def|include|line|pragma|undef|warning)\b|[^\n\r]*)/,q,"#"]),g.push(["str",/^<(?:(?:(?:\.\.\/)*|\/?)(?:[\w-]+(?:\/[\w-]+)+)?[\w-]+\.h(?:h|pp|\+\+)?|[a-z]\w*)>/,q])):d.push(["com",
+/^#[^\n\r]*/,q,"#"]));a.cStyleComments&&(g.push(["com",/^\/\/[^\n\r]*/,q]),g.push(["com",/^\/\*[\S\s]*?(?:\*\/|$)/,q]));if(b=a.regexLiterals){var s=(b=b>1?"":"\n\r")?".":"[\\S\\s]";g.push(["lang-regex",RegExp("^(?:^^\\.?|[+-]|[!=]=?=?|\\#|%=?|&&?=?|\\(|\\*=?|[+\\-]=|->|\\/=?|::?|<<?=?|>>?>?=?|,|;|\\?|@|\\[|~|{|\\^\\^?=?|\\|\\|?=?|break|case|continue|delete|do|else|finally|instanceof|return|throw|try|typeof)\\s*("+("/(?=[^/*"+b+"])(?:[^/\\x5B\\x5C"+b+"]|\\x5C"+s+"|\\x5B(?:[^\\x5C\\x5D"+b+"]|\\x5C"+
+s+")*(?:\\x5D|$))+/")+")")])}(b=a.types)&&g.push(["typ",b]);b=(""+a.keywords).replace(/^ | $/g,"");b.length&&g.push(["kwd",RegExp("^(?:"+b.replace(/[\s,]+/g,"|")+")\\b"),q]);d.push(["pln",/^\s+/,q," \r\n\t\u00a0"]);b="^.[^\\s\\w.$@'\"`/\\\\]*";a.regexLiterals&&(b+="(?!s*/)");g.push(["lit",/^@[$_a-z][\w$@]*/i,q],["typ",/^(?:[@_]?[A-Z]+[a-z][\w$@]*|\w+_t\b)/,q],["pln",/^[$_a-z][\w$@]*/i,q],["lit",/^(?:0x[\da-f]+|(?:\d(?:_\d+)*\d*(?:\.\d*)?|\.\d\+)(?:e[+-]?\d+)?)[a-z]*/i,q,"0123456789"],["pln",/^\\[\S\s]?/,
+q],["pun",RegExp(b),q]);return D(d,g)}function J(a,d,g){function b(a){var c=a.nodeType;if(c==1&&!x.test(a.className))if("br"===a.nodeName)s(a),a.parentNode&&a.parentNode.removeChild(a);else for(a=a.firstChild;a;a=a.nextSibling)b(a);else if((c==3||c==4)&&g){var d=a.nodeValue,i=d.match(m);if(i)c=d.substring(0,i.index),a.nodeValue=c,(d=d.substring(i.index+i[0].length))&&a.parentNode.insertBefore(j.createTextNode(d),a.nextSibling),s(a),c||a.parentNode.removeChild(a)}}function s(a){function b(a,c){var d=
+c?a.cloneNode(!1):a,e=a.parentNode;if(e){var e=b(e,1),g=a.nextSibling;e.appendChild(d);for(var i=g;i;i=g)g=i.nextSibling,e.appendChild(i)}return d}for(;!a.nextSibling;)if(a=a.parentNode,!a)return;for(var a=b(a.nextSibling,0),d;(d=a.parentNode)&&d.nodeType===1;)a=d;c.push(a)}for(var x=/(?:^|\s)nocode(?:\s|$)/,m=/\r\n?|\n/,j=a.ownerDocument,k=j.createElement("li");a.firstChild;)k.appendChild(a.firstChild);for(var c=[k],i=0;i<c.length;++i)b(c[i]);d===(d|0)&&c[0].setAttribute("value",d);var r=j.createElement("ol");
+r.className="linenums";for(var d=Math.max(0,d-1|0)||0,i=0,n=c.length;i<n;++i)k=c[i],k.className="L"+(i+d)%10,k.firstChild||k.appendChild(j.createTextNode("\u00a0")),r.appendChild(k);a.appendChild(r)}function p(a,d){for(var g=d.length;--g>=0;){var b=d[g];F.hasOwnProperty(b)?E.console&&console.warn("cannot override language handler %s",b):F[b]=a}}function I(a,d){if(!a||!F.hasOwnProperty(a))a=/^\s*</.test(d)?"default-markup":"default-code";return F[a]}function K(a){var d=a.h;try{var g=S(a.c,a.i),b=g.a;
+a.a=b;a.d=g.d;a.e=0;I(d,b)(a);var s=/\bMSIE\s(\d+)/.exec(navigator.userAgent),s=s&&+s[1]<=8,d=/\n/g,x=a.a,m=x.length,g=0,j=a.d,k=j.length,b=0,c=a.g,i=c.length,r=0;c[i]=m;var n,e;for(e=n=0;e<i;)c[e]!==c[e+2]?(c[n++]=c[e++],c[n++]=c[e++]):e+=2;i=n;for(e=n=0;e<i;){for(var p=c[e],w=c[e+1],t=e+2;t+2<=i&&c[t+1]===w;)t+=2;c[n++]=p;c[n++]=w;e=t}c.length=n;var f=a.c,h;if(f)h=f.style.display,f.style.display="none";try{for(;b<k;){var l=j[b+2]||m,B=c[r+2]||m,t=Math.min(l,B),A=j[b+1],G;if(A.nodeType!==1&&(G=x.substring(g,
+t))){s&&(G=G.replace(d,"\r"));A.nodeValue=G;var L=A.ownerDocument,o=L.createElement("span");o.className=c[r+1];var v=A.parentNode;v.replaceChild(o,A);o.appendChild(A);g<l&&(j[b+1]=A=L.createTextNode(x.substring(t,l)),v.insertBefore(A,o.nextSibling))}g=t;g>=l&&(b+=2);g>=B&&(r+=2)}}finally{if(f)f.style.display=h}}catch(u){E.console&&console.log(u&&u.stack||u)}}var E=window,y=["break,continue,do,else,for,if,return,while"],C=[[y,"auto,case,char,const,default,double,enum,extern,float,goto,inline,int,long,register,short,signed,sizeof,static,struct,switch,typedef,union,unsigned,void,volatile"],
+"catch,class,delete,false,import,new,operator,private,protected,public,this,throw,true,try,typeof"],M=[C,"alignof,align_union,asm,axiom,bool,concept,concept_map,const_cast,constexpr,decltype,delegate,dynamic_cast,explicit,export,friend,generic,late_check,mutable,namespace,nullptr,property,reinterpret_cast,static_assert,static_cast,template,typeid,typename,using,virtual,where"],V=[C,"abstract,assert,boolean,byte,extends,final,finally,implements,import,instanceof,interface,null,native,package,strictfp,super,synchronized,throws,transient"],
+N=[C,"abstract,as,base,bool,by,byte,checked,decimal,delegate,descending,dynamic,event,finally,fixed,foreach,from,group,implicit,in,interface,internal,into,is,let,lock,null,object,out,override,orderby,params,partial,readonly,ref,sbyte,sealed,stackalloc,string,select,uint,ulong,unchecked,unsafe,ushort,var,virtual,where"],C=[C,"debugger,eval,export,function,get,null,set,undefined,var,with,Infinity,NaN"],O=[y,"and,as,assert,class,def,del,elif,except,exec,finally,from,global,import,in,is,lambda,nonlocal,not,or,pass,print,raise,try,with,yield,False,True,None"],
+P=[y,"alias,and,begin,case,class,def,defined,elsif,end,ensure,false,in,module,next,nil,not,or,redo,rescue,retry,self,super,then,true,undef,unless,until,when,yield,BEGIN,END"],W=[y,"as,assert,const,copy,drop,enum,extern,fail,false,fn,impl,let,log,loop,match,mod,move,mut,priv,pub,pure,ref,self,static,struct,true,trait,type,unsafe,use"],y=[y,"case,done,elif,esac,eval,fi,function,in,local,set,then,until"],Q=/^(DIR|FILE|vector|(de|priority_)?queue|list|stack|(const_)?iterator|(multi)?(set|map)|bitset|u?(int|float)\d*)\b/,
+U=/\S/,X=v({keywords:[M,N,C,"caller,delete,die,do,dump,elsif,eval,exit,foreach,for,goto,if,import,last,local,my,next,no,our,print,package,redo,require,sub,undef,unless,until,use,wantarray,while,BEGIN,END",O,P,y],hashComments:!0,cStyleComments:!0,multiLineStrings:!0,regexLiterals:!0}),F={};p(X,["default-code"]);p(D([],[["pln",/^[^<?]+/],["dec",/^<!\w[^>]*(?:>|$)/],["com",/^<\!--[\S\s]*?(?:--\>|$)/],["lang-",/^<\?([\S\s]+?)(?:\?>|$)/],["lang-",/^<%([\S\s]+?)(?:%>|$)/],["pun",/^(?:<[%?]|[%?]>)/],["lang-",
+/^<xmp\b[^>]*>([\S\s]+?)<\/xmp\b[^>]*>/i],["lang-js",/^<script\b[^>]*>([\S\s]*?)(<\/script\b[^>]*>)/i],["lang-css",/^<style\b[^>]*>([\S\s]*?)(<\/style\b[^>]*>)/i],["lang-in.tag",/^(<\/?[a-z][^<>]*>)/i] ]),["default-markup","htm","html","mxml","xhtml","xml","xsl"]);p(D([["pln",/^\s+/,q," \t\r\n"],["atv",/^(?:"[^"]*"?|'[^']*'?)/,q,"\"'"] ],[["tag",/^^<\/?[a-z](?:[\w-.:]*\w)?|\/?>$/i],["atn",/^(?!style[\s=]|on)[a-z](?:[\w:-]*\w)?/i],["lang-uq.val",/^=\s*([^\s"'>]*(?:[^\s"'/>]|\/(?=\s)))/],["pun",/^[/<->]+/],
+["lang-js",/^on\w+\s*=\s*"([^"]+)"/i],["lang-js",/^on\w+\s*=\s*'([^']+)'/i],["lang-js",/^on\w+\s*=\s*([^\s"'>]+)/i],["lang-css",/^style\s*=\s*"([^"]+)"/i],["lang-css",/^style\s*=\s*'([^']+)'/i],["lang-css",/^style\s*=\s*([^\s"'>]+)/i] ]),["in.tag"]);p(D([],[["atv",/^[\S\s]+/] ]),["uq.val"]);p(v({keywords:M,hashComments:!0,cStyleComments:!0,types:Q}),["c","cc","cpp","cxx","cyc","m"]);p(v({keywords:"null,true,false"}),["json"]);p(v({keywords:N,hashComments:!0,cStyleComments:!0,verbatimStrings:!0,types:Q}),
+["cs"]);p(v({keywords:V,cStyleComments:!0}),["java"]);p(v({keywords:y,hashComments:!0,multiLineStrings:!0}),["bash","bsh","csh","sh"]);p(v({keywords:O,hashComments:!0,multiLineStrings:!0,tripleQuotedStrings:!0}),["cv","py","python"]);p(v({keywords:"caller,delete,die,do,dump,elsif,eval,exit,foreach,for,goto,if,import,last,local,my,next,no,our,print,package,redo,require,sub,undef,unless,until,use,wantarray,while,BEGIN,END",hashComments:!0,multiLineStrings:!0,regexLiterals:2}),["perl","pl","pm"]);p(v({keywords:P,
+hashComments:!0,multiLineStrings:!0,regexLiterals:!0}),["rb","ruby"]);p(v({keywords:C,cStyleComments:!0,regexLiterals:!0}),["javascript","js"]);p(v({keywords:"all,and,by,catch,class,else,extends,false,finally,for,if,in,is,isnt,loop,new,no,not,null,of,off,on,or,return,super,then,throw,true,try,unless,until,when,while,yes",hashComments:3,cStyleComments:!0,multilineStrings:!0,tripleQuotedStrings:!0,regexLiterals:!0}),["coffee"]);p(v({keywords:W,cStyleComments:!0,multilineStrings:!0}),["rc","rs","rust"]);
+p(D([],[["str",/^[\S\s]+/] ]),["regex"]);var Y=E.PR={createSimpleLexer:D,registerLangHandler:p,sourceDecorator:v,PR_ATTRIB_NAME:"atn",PR_ATTRIB_VALUE:"atv",PR_COMMENT:"com",PR_DECLARATION:"dec",PR_KEYWORD:"kwd",PR_LITERAL:"lit",PR_NOCODE:"nocode",PR_PLAIN:"pln",PR_PUNCTUATION:"pun",PR_SOURCE:"src",PR_STRING:"str",PR_TAG:"tag",PR_TYPE:"typ",prettyPrintOne:E.prettyPrintOne=function(a,d,g){var b=document.createElement("div");b.innerHTML="<pre>"+a+"</pre>";b=b.firstChild;g&&J(b,g,!0);K({h:d,j:g,c:b,i:1});
+return b.innerHTML},prettyPrint:E.prettyPrint=function(a,d){function g(){for(var b=E.PR_SHOULD_USE_CONTINUATION?c.now()+250:Infinity;i<p.length&&c.now()<b;i++){for(var d=p[i],j=h,k=d;k=k.previousSibling;){var m=k.nodeType,o=(m===7||m===8)&&k.nodeValue;if(o?!/^\??prettify\b/.test(o):m!==3||/\S/.test(k.nodeValue))break;if(o){j={};o.replace(/\b(\w+)=([\w%+\-.:]+)/g,function(a,b,c){j[b]=c});break}}k=d.className;if((j!==h||e.test(k))&&!v.test(k)){m=!1;for(o=d.parentNode;o;o=o.parentNode)if(f.test(o.tagName)&&
+o.className&&e.test(o.className)){m=!0;break}if(!m){d.className+=" prettyprinted";m=j.lang;if(!m){var m=k.match(n),y;if(!m&&(y=T(d))&&t.test(y.tagName))m=y.className.match(n);m&&(m=m[1])}if(w.test(d.tagName))o=1;else var o=d.currentStyle,u=s.defaultView,o=(o=o?o.whiteSpace:u&&u.getComputedStyle?u.getComputedStyle(d,q).getPropertyValue("white-space"):0)&&"pre"===o.substring(0,3);u=j.linenums;if(!(u=u==="true"||+u))u=(u=k.match(/\blinenums\b(?::(\d+))?/))?u[1]&&u[1].length?+u[1]:!0:!1;u&&J(d,u,o);r=
+{h:m,c:d,j:u,i:o};K(r)}}}i<p.length?setTimeout(g,250):"function"===typeof a&&a()}for(var b=d||document.body,s=b.ownerDocument||document,b=[b.getElementsByTagName("pre"),b.getElementsByTagName("code"),b.getElementsByTagName("xmp")],p=[],m=0;m<b.length;++m)for(var j=0,k=b[m].length;j<k;++j)p.push(b[m][j]);var b=q,c=Date;c.now||(c={now:function(){return+new Date}});var i=0,r,n=/\blang(?:uage)?-([\w.]+)(?!\S)/,e=/\bprettyprint\b/,v=/\bprettyprinted\b/,w=/pre|xmp/i,t=/^code$/i,f=/^(?:pre|code|xmp)$/i,
+h={};g()}};typeof define==="function"&&define.amd&&define("google-code-prettify",[],function(){return Y})})();}()
+</script>
+<style>
+.pln{color:#1b181b}.str{color:#918b3b}.kwd{color:#7b59c0}.com{color:#9e8f9e}.typ{color:#516aec}.lit{color:#a65926}.clo,.opn,.pun{color:#1b181b}.tag{color:#ca402b}.atn{color:#a65926}.atv{color:#159393}.dec{color:#a65926}.var{color:#ca402b}.fun{color:#516aec}pre.prettyprint{background:#f7f3f7;color:#ab9bab;font-family:Menlo,Consolas,"Bitstream Vera Sans Mono","DejaVu Sans Mono",Monaco,monospace;font-size:12px;line-height:1.5;border:1px solid #d8cad8;padding:10px}ol.linenums{margin-top:0;margin-bottom:0}
+body{min-width:200px;max-width:850px;margin:0 auto;padding:30px;}.chapter-nav{font-size: 10pt;}a:link,a:visited{color:#00f}.codeblock_name,code,pre.prettyprint{font-family:Monaco,"Lucida Console",monospace}body{font-size:14pt}.codeblock_name,.math,.seealso,code{font-size:10pt}.codeblock{page-break-inside:avoid;padding-bottom:15px}.math{text-indent:0}pre.prettyprint{font-size:10pt;padding:10px;border-radius:10px;border:none;white-space:pre-wrap}.codeblock_name{margin-top:1.25em;display:block}a:link{text-decoration:none}a:link:not(.lit):hover{color:#00f;text-decoration:underline}a:link:active{color:red}h4{padding-right:1.25em}h4.noheading{margin-bottom:0}h1{text-align:center}code{padding:2px}pre{-moz-tab-size:4;-o-tab-size:4;tab-size:4}p:not(.notp){margin:0;text-indent:2em}.two-col{list-style-type:none}.two-col li:before{content:'-';padding:5px;margin-right:5px;color:orange;background-color:#fff;display:inline-block}@media print{body{font-size:10pt}pre.prettyprint{font-size:8pt}.seealso{font-size:9pt}.codeblock_name,.math,code{font-size:8pt}.math{text-indent:0}}
+/* code blocks (Style from jmeiners.com/lc3-vm, CC BY-NC-SA 4.0, used with attribution) */
+
+/* Quotes and Block Quotes */
+blockquote {
+ margin: 1.5em 10px;
+ padding: 0.5em 10px;
+ border-left: 5px solid #ccc;
+ color: #666;
+ background-color: #f9f9f9;
+ font-style: italic;
+}
+
+blockquote p {
+ margin: 0;
+ font-size: 1.2em;
+}
+
+q {
+ quotes: "“" "”" "‘" "’";
+ font-style: italic;
+}
+
+q::before {
+ content: open-quote;
+}
+
+q::after {
+ content: close-quote;
+}
+
+/*! Color themes for Google Code Prettify | MIT License | github.com/jmblog/color-themes-for-google-code-prettify */
+.prettyprint {
+ background: #f5f7ff;
+ font-family: Menlo, "Bitstream Vera Sans Mono", "DejaVu Sans Mono", Monaco, Consolas, monospace;
+ border: 0 !important;
+}
+
+.pln {
+ color: #202746;
+}
+
+/* Specify class=linenums on a pre to get line numbering */
+ol.linenums {
+ margin-top: 0;
+ margin-bottom: 0;
+ color: #202746;
+}
+
+li.L0,
+li.L1,
+li.L2,
+li.L3,
+li.L4,
+li.L5,
+li.L6,
+li.L7,
+li.L8,
+li.L9 {
+ padding-left: 1em;
+ background-color: #f5f7ff;
+ list-style-type: decimal;
+}
+
+@media screen {
+
+ /* string content */
+
+ .str {
+ color: #ac9739;
+ }
+
+ /* keyword */
+
+ .kwd {
+ color: #6679cc;
+ }
+
+ /* comment */
+
+ .com {
+ color: #202746;
+ }
+
+ /* type name */
+
+ .typ {
+ color: #3d8fd1;
+ }
+
+ /* literal value */
+
+ .lit {
+ color: #c76b29;
+ }
+
+ /* punctuation */
+
+ .pun {
+ color: #202746;
+ }
+
+ /* lisp open bracket */
+
+ .opn {
+ color: #202746;
+ }
+
+ /* lisp close bracket */
+
+ .clo {
+ color: #202746;
+ }
+
+ /* markup tag name */
+
+ .tag {
+ color: #c94922;
+ }
+
+ /* markup attribute name */
+
+ .atn {
+ color: #c76b29;
+ }
+
+ /* markup attribute value */
+
+ .atv {
+ color: #22a2c9;
+ }
+
+ /* declaration */
+
+ .dec {
+ color: #c76b29;
+ }
+
+ /* variable name */
+
+ .var {
+ color: #c94922;
+ }
+
+ /* function name */
+
+ .fun {
+ color: #3d8fd1;
+ }
+}</style>
</head>
+<body onload="prettyPrint()">
+<section>
+<h1>Lexer</h1>
+<a name="1:1"><div class="section"><h4>1. General Project Structure</h4></a>
+<p>Since this is the first article, I'll outline the project structure for the C- compiler.
+</p>
+<p>The project has a series of pretty typical stages:
+</p>
+<ol>
+<li>The lexer. This takes a file as input and emits a series of tokens (Its input is already preprocessed, I outsource that to "gcc -E").
+</li>
+<li>The parser. This takes the tokens and builds an abstract syntax tree (AST).
+</li>
+<li>The symbol table. This exists in a sort of in-between space next to the lexer and parser. It's used to store information about variables and functions.
+</li>
+<li>The type checker. This is used to ensure that the types of variables and functions are correct.
+</li>
+<li>The code generator. This takes the AST and generates an intermediate representation (IR).
+</li>
+<li>The optimizer. This takes the IR and optimizes it. This'll be broken up into a few stages.
+</li>
+<li>The lowerer. This takes the IR and lowers it to a simpler IR.
+</li>
+<li>The register allocator. This takes the IR, which has instructions in an infinite number of registers, and assigns them to a finite number of registers.
+</li>
+<li>The code emitter. This takes the IR and emits RISC-V assembly.
+</li>
+</ol>
+<p>As far as possible, I'd like to keep each of these stages separate. One benefit of this is that it simplifies memory management greatly. I plan to use an arena allocator for each stage, and by making sure the only thing on the actual heap is the output of the stage, and all temporary data is stored in the arena, I can free all the memory used by a stage by simply freeing the arena.
+</p>
-<body style="background-color: #cedeff; font-family: Arial, sans-serif;">
- <header style="text-align: center; padding: 20px;">
- <h1>Lexer</h1>
- <div style="text-align: center; margin-top: 10px;">
- <img src="/images/line.gif" alt="decorative line">
- </div>
- </header>
- <noscript>
- <div style="background-color: #ffcccc; padding: 10px; text-align: center;">
- <p><img src="/images/netscapenow.gif" alt="Netscape badge" width="88" height="31">
- This site might look pretty old, but it still uses JS for syntax highlighting. If you'd like to see code
- examples that don't look monochrome, please enable JS. or don't. I'm not your dad.</p>
- </div>
- </noscript>
- <main style="padding: 20px;">
- <h2>General Project Structure</h2>
- <p>Since this is the first article, I'll outline the project structure for the C- interpreter.</p>
- <p>The project has a series of pretty typical stages:</p>
- <ol>
- <li>The lexer. This takes a file as input and emits a series of tokens (Its input is already preprocessed, I
- outsource that to "gcc -E"). </li>
- <li>The parser. This takes the tokens and builds an abstract syntax tree (AST). </li>
- <li>The symbol table. This exists in a sort of in-between space next to the lexer and parser. It's used to store
- information about variables and functions. </li>
- <li>The type checker. This is used to ensure that the types of variables and functions are correct. </li>
- <li>The code generator. This takes the AST and generates an intermediate representation (IR). </li>
- <li>The optimizer. This takes the IR and optimizes it. </li>
- <li>The VM. This takes the optimized IR and executes it. </li>
- <li>The garbage collector. This is used to manage memory. </li>
- </ol>
- <p>As far as possible, I'd like to keep each of these stages separate. One benefit of this is that it simplifies
- memory management greatly. I plan to use an arena allocator for each stage, and by making sure the only thing on
- the actual heap is the output of the stage, and all temporary data is stored in the arena, I can free all the
- memory used by a stage by simply freeing the arena. </p>
- <h2>Some Rules</h2>
- <p>Here are some rules (more like guidelines) that I plan to follow for this project; they're mostly just to keep
- things simple and consistent. </p>
- <h4>1. PROGRAM LIKE IT'S 1999</h4>
- <blockquote>
- 640 KB ought to be enough for anybody. - Bill Gates
- </blockquote>
- <p>Maybe not that little, But I'm going to try to keep the project as simple
- as possible, 640 KB probably won't be enough, but I'll still aim for less than 10 MB of memory
- usage. </p>
- <p>This places a lot of constraints on the project, but I think it's a good exercise in minimalism. </p>
- <p>Some consequences of this are that I'll have to use memory-wise algorithms, be very careful about program
- structure, and avoid some of the bigger libraries (which will help with making this project self-hosting in the
- future). </p>
- <h4>2. PROGRAM IN C++--</h4>
- <p>I'm not a big fan of C++, but its class system helps prevent a lot of ugly bugs. To that end, I'm going to try
- and keep data structures out of header files, and only expose functions that operate on those data structures, to
- create a sort of approximation of a class. This has a few benefits: </p>
- <ul>
- <li>Quicker compilation. A change to a data structure will only require one file to be recompiled, rather than
- every file that includes the header. </li>
- <li>Less chance of bugs. If a function is the only way to interact with a data structure, then it's much harder to
- misuse that data structure. </li>
- <li>Run time type checking. I can include some sort of tag in the first field of every data structure to ensure
- that the correct functions are being called. </li>
- </ul>
- <h4>3. DON'T GET FANCY</h4>
- <p>My goal here isn't to write the fastest interpreter in the world, or the most complete. I just want to make
- something that works and can be understood by someone else. </p>
- <p>That means I'm going to avoid a lot of the tricks that are used in production interpreters, and focus more on
- simplicity and readability. </p>
- <h4>4. DESIGN FOR DEBUGGING</h4>
- <p>This code is going to be peppered with asserts and contain mechanisms to print out the state of the program at
- any point. </p>
- <p>This might be painful, but it'll make debugging a lot simpler and let users look under the hood.</p>
- <h4>5. SMART DATA, STUPID CODE</h4>
- <p>A lot of times, the right data structure can replace 50-100 lines of procedural code. I'm going to try and design
- data structures which make the algorithms as simple as possible. </p>
- <p>For example, instead of writing 50-100 lines of code to hold every keyword in the language, I can just use a
- simple hash table. </p>
- <h4>Misc</h4>
- <p>The code on the blog is not the full code. I'm leaving out stuff like header guards and includes for brevity. The
- full code is available on the <a href="git.reagancfischer.dev/cminus.git">git repo</a>.</p>
- <h2>The Lexer</h2>
- <p>A lexical analyzer reads source code and produces tokens, which are the smallest unit of meaning in a language.
- For example, in the C programming language, the tokens are things like keywords (if, else, while, etc.),
- identifiers (variable names), numbers, and punctuation (braces, semicolons, etc.). </p>
- <p>Given a string like <code class="prettyprint">int main() { return 0; }</code>, the lexer would produce a series
- of tokens like <code>INT</code>, <code>IDENTIFIER(main)</code>, <code>LPAREN</code>, <code>RPAREN</code>,
- <code>LBRACE</code>, <code>RETURN</code>, <code>INTCONSTANT(0)</code>, <code>SEMICOLON</code>,
- <code>RBRACE</code>.
- </p>
- <h3>Design</h3>
- <p>I'll break the lexer up into two modules, <code>tokenizer.c</code> and <code>input.c</code>. The input module
- will
- be
- responsible for reading the file and providing characters to the tokenizer, while the tokenizer module will be
- responsible
- for producing tokens. </p>
- <h3>Input</h3>
- <h4>Input Interface</h4>
- <p>Input will provide a simple interface for reading characters from a file. The stream itself is deliberately
- hidden from the tokenizer, so that the tokenizer doesn't have to worry about buffering or anything like that. </p>
- <div class="code-block">
- <span class="file-name">input.h</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":input-interface" href="#:input-interface">Input
- interface</a></em></strong></span>
- <pre class="prettyprint"><code class="">void input_init(const char *filename);
-int input_getc(void);
-void input_ungetc(int c);
-void input_destroy(void);</code></pre>
- </div>
- <p>When the program wants to start reading a file, it calls <code>input_init</code> with the filename. It can then
- call <code>input_getc</code> to get the next character in the file. If there's no more input,
- <code>input_getc</code>
- will return <code>EOF</code>.
- </p>
- <p>There's also a <code>input_ungetc</code> function, which allows the program to put a character back into the
- stream. I'll only allow one character to be put back, but that should be enough for the tokenizer. </p>
- <p>Finally, when the program is done reading the file, it should call <code>input_destroy</code> to clean up. </p>
- <h4>Input Design Decisions</h4>
- <p>Per rule 1, we're trying to keep memory usage low. That means that instead of reading the entire file into
- memory, we'll need to read it in chunks. There are a couple of choices for how to do this:</p>
- <ol>
- <li>Read a line at a time. This is a more natural approach, but it has two drawbacks. First, it requires a large
- buffer to store the line (C normally specifies BUFSIZ as 8192 bytes). Second, if the line is longer than BUFSIZ,
- we'll have to read the line in chunks anyway. </li>
- <li>Choose some arbitrary buffer size and read that many bytes at a time. This is the approach I'm going to take.
- It's a little less natural, but it's more memory efficient. </li>
- </ol>
- <p>Input will read chunks of 128 bytes at a time, reusing the same static buffer. This limitation is not visible to
- the tokenizer, which will only see the <code>input_getc</code> interface. </p>
- <p>When the buffer is exhausted, <code>input_getc</code> will call <code>nextline</code>, which will read the next
- chunk of the file. </p>
- <h4>Input Implementation</h4>
- <p>The implementation of the input module is pretty straightforward. We have the following data structures and
- defines as globals: </p>
- <div class="code-block">
- <span class="file-name">input.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":input-data" href="#:input-data">Input
- data</a></em></strong></span>
- <pre class="prettyprint"><code class="">#define CHUNK_SIZE 128
-static char buffer[CHUNK_SIZE];
-static int buffer_pos = 0;
-static int buffer_size = 0;
-static char unget_buffer = '\0';
-static char unget_buffer_stack[8];
-static int unget_buffer_stack_pos = 0;
+</div>
+<a name="1:2"><div class="section"><h4>2. Some Rules</h4></a>
+<p>Here are some rules (more like guidelines) that I plan to follow for this project; they're mostly just to keep things simple and consistent.
+</p>
+<ol>
+<li><p>PROGRAM LIKE IT'S 1999
+</p>
+</li>
+</ol>
+<blockquote><p> 640 KB ought to be enough for anybody. - Bill Gates
+</p>
+</blockquote>
+<p>Maybe not that little, but I'm going to try to keep the project as simple as possible. 640 KB probably won't be enough, but I'll still aim for less than 10 MB of memory usage.
+</p>
+<p>This places a lot of constraints on the project, but I think it's a good exercise in minimalism.
+</p>
+<p>Some consequences of this are that I'll have to use memory-efficient algorithms, be very careful about program structure, and avoid some of the bigger libraries (which will help with making this project self-hosting in the future).
+</p>
+<ol>
+<li><p>PROGRAM IN C++--
+</p>
+</li>
+</ol>
+<p>I'm not a big fan of C++, but its class system helps prevent a lot of ugly bugs. To that end, I'm going to try and keep data structures out of header files, and only expose functions that operate on those data structures, to create a sort of approximation of a class. This has a few benefits:
+</p>
+<ul>
+<li>Quicker compilation. A change to a data structure will only require one file to be recompiled, rather than every file that includes the header.
+</li>
+<li>Less chance of bugs. If a function is the only way to interact with a data structure, then it's much harder to misuse that data structure.
+</li>
+<li>Run time type checking. I can include some sort of tag in the first field of every data structure to ensure that the correct functions are being called.
+</li>
+</ul>
+<ol>
+<li><p>DON'T GET FANCY
+</p>
+</li>
+</ol>
+<p>My goal here isn't to write the fastest interpreter in the world, or the most complete. I just want to make something that works and can be understood by someone else.
+</p>
+<p>That means I'm going to avoid a lot of the tricks that are used in production interpreters, and focus more on simplicity and readability.
+</p>
+<ol>
+<li><p>DESIGN FOR DEBUGGING
+</p>
+</li>
+</ol>
+<p>This code is going to be peppered with asserts and contain mechanisms to print out the state of the program at any point.
+</p>
+<p>This might be painful, but it'll make debugging a lot simpler and let users look under the hood.
+</p>
+<ol>
+<li><p>SMART DATA, STUPID CODE
+</p>
+</li>
+</ol>
+<p>A lot of times, the right data structure can replace 50-100 lines of procedural code. I'm going to try and design data structures which make the algorithms as simple as possible.
+</p>
+<p>For example, instead of writing 50-100 lines of code to hold every keyword in the language, I can just use a simple hash table.
+</p>
-static FILE *file = NULL;
-</code></pre>
- </div>
- <p>When the program calls <code>input_init</code>, we open the file.</p>
- <div class="code-block">
- <span class="file-name">input.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":input-init" href="#:input-init">Input
- initialization</a></em></strong></span>
- <pre class="prettyprint"><code class="">void input_init(const char *filename) {
- file = fopen(filename, "r");
- if (file == NULL) {
- fprintf(stderr, "Error: Cannot open file %s\n", filename);
- exit(1);
- }
-}</code></pre>
- </div>
- <p>When the program calls <code>input_getc</code>, we return the next character in the buffer. If the buffer is
- exhausted, we call <code>nextline</code>. We also track the line and column. </p>
- <div class="code-block">
- <span class="file-name">input.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":input-getc" href="#:input-getc">Input
- getc</a></em></strong></span>
- <pre class="prettyprint"><code class="">int input_getc(void) {
- if (unget_buffer_stack_pos > 0) {
- return unget_buffer_stack[--unget_buffer_stack_pos];
- }
- if (buffer_pos == buffer_size) {
- buffer_size = fread(buffer, 1, CHUNK_SIZE, file);
- buffer_pos = 0;
- }
- if (buffer_size == 0) {
- return EOF;
- }
- char c = buffer[buffer_pos++];
- return c;
-}</code></pre>
- </div>
- <p>When the program calls <code>input_ungetc</code>, we save the character in the <code>unget_buffer</code>. </p>
- <div class="code-block">
- <span class="file-name">input.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":input-ungetc" href="#:input-ungetc">Input
- ungetc</a></em></strong></span>
- <pre
- class="prettyprint"><code class="">void input_ungetc(int c) { unget_buffer_stack[unget_buffer_stack_pos++] = c; }</code></pre>
- </div>
- <p>Since we're not using dynamic memory allocation, cleanup is pretty simple.</p>
- <div class="code-block">
- <span class="file-name">input.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":input-destroy" href="#:input-destroy">Input
- destroy</a></em></strong></span>
- <pre class="prettyprint"><code class="">void input_destroy(void) {
- fclose(file);
-}</code></pre>
- </div>
- <p>Finally, the <code>nextline</code> function reads the next chunk of the file into the buffer. </p>
- <div class="code-block">
- <span class="file-name">input.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":input-nextline" href="#:input-nextline">Input
- nextline</a></em></strong></span>
- <pre class="prettyprint"><code class="">static void nextline(void) {
- buffer_size = fread(buffer, 1, CHUNK_SIZE, file);
- buffer_pos = 0;
-}</code></pre>
- </div>
-
- <h4>Summary</h4>
- <p>The input module's full code is available <a href="/projects/cminus/code/input.c">here for the implementation</a>
- and
- <a href="/projects/cminus/code/input.h">here for the interface</a>.
- </p>
-
- <h3>Tokenizer</h3>
- <h4>Tokenizer Interface</h4>
- <p>As discussed earlier, the tokenizer interface will consist of two parts. First, functions for initializing the
- lexer and getting tokens. Second, functions for reading from a given token. </p>
- <div class="code-block">
- <span class="file-name">tokenizer.h</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":tokenizer-interface" href="#:tokenizer-interface">Tokenizer
- interface</a></em></strong></span>
- <pre class="prettyprint"><code class="">void tokenizer_init(const char *filename);
-token_t *tokenizer_get(void);
-void tokenizer_unget(token_t *token);
-void tokenizer_destroy(void);
+</div>
+<a name="1:3"><div class="section"><h4>3. Misc</h4></a>
+<p>THIS IS A LITERATE PROGRAM! Go to <a href="https://reagancfischer.dev/projects/cminus/code/lexer.lit">this link</a> to see the file that generated this HTML.
+</p>
+
+</div>
+<a name="1:4"><div class="section"><h4>4. The Lexer</h4></a>
+<p>A lexical analyzer reads source code and produces tokens, which are the smallest unit of meaning in a language. For example, in the C programming language, the tokens are things like keywords (if, else, while, etc.), identifiers (variable names), numbers, and punctuation (braces, semicolons, etc.).
+</p>
+<p>Given a string like <code>int main() { return 0; }</code>, the lexer would produce a series of tokens like <code>INT</code>, <code>IDENTIFIER(main)</code>, <code>LPAREN</code>, <code>RPAREN</code>, <code>LBRACE</code>, <code>RETURN</code>, <code>INTCONSTANT(0)</code>, <code>SEMICOLON</code>, <code>RBRACE</code>.
+</p>
+
+</div>
+<a name="1:5"><div class="section"><h4>5. Design</h4></a>
+<p>I'll break the lexer up into a couple of modules. <code>token.c</code> will contain the token data structure and functions to create and destroy tokens. <code>input.c</code> will contain the input data structure and functions to read from the input file. <code>tokenizer.c</code> will contain the main lexer logic.
+</p>
+
+</div>
+<a name="1:6"><div class="section"><h4>6. Token Interface</h4></a>
+<p>Tokens are the smallest unit of meaning in a language. They're used by the parser to build an abstract syntax tree (AST). We'll need a couple of things to represent a token:
+</p>
+<ul>
+<li>The type of token. This will be an enum, with values like <code>TOK_CTK_IF</code> or <code>TOK_CONST_INTEGER_U32</code>.
+</li>
+<li>The value of the token. Some tokens, like keywords, don't have a value. Others, like identifiers or constants, do.
+</li>
+<li>The line and column of the token. This is used for error messages.
+</li>
+</ul>
+<p>As I mentioned earlier, we're trying to implement a sort of class system in C. For that, we'll need to hide the token implementation details behind an opaque pointer. We could just have a <code>void</code> pointer, but that stops us from being able to use compile-time type checking. Instead, we'll use a forward declaration of the token type in the header file, and then define the token type in the implementation file.
+</p>
+
+</div>
+<a name="1:7"><div class="section"><h4 class="noheading">7. </h4></a>
+
+<div class="codeblock">
+<span class="codeblock_name">{Opaque Token Type <a href="lexer.html#1:7">7</a>}</span>
+<pre class="prettyprint lang-c">
+typedef struct token token_t;
+</pre>
+
+
+<p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:38">38</a></p>
+</div>
+</div>
+<a name="1:8"><div class="section"><h4 class="noheading">8. </h4></a>
+<p>We'll need a couple of functions to create and destroy tokens.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Token Creation and Destruction Interface <a href="lexer.html#1:8">8</a>}</span>
+<pre class="prettyprint lang-c">
+token_t *token_data_create(c_token_types kind, int lin, int col, int len);
+
+token_t *token_create(c_token_types kind, int lin, int col, int len);
+token_t *token_create_int(c_token_types kind, int lin, int col, int64_t i, int len);
+
+token_t *token_create_float(c_token_types kind, int lin, int col, double f, int len);
+
+token_t *token_create_char(c_token_types kind, int lin, int col, char c, int len);
+
+token_t *token_create_string(c_token_types kind, int lin, int col, const char *s, int len);
+
+void token_destroy(token_t *token);
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:38">38</a></p>
+</div>
+</div>
+<a name="1:9"><div class="section"><h4 class="noheading">9. </h4></a>
+<p>We'll also need some functions to access the token data.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Token Interface <a href="lexer.html#1:9">9</a>}</span>
+<pre class="prettyprint lang-c">
c_token_types token_type(token_t *token);
+
int64_t token_int(token_t *token);
+
double token_float(token_t *token);
+
const char *token_string(token_t *token);
+
char token_char(token_t *token);
+
int token_line(token_t *token);
+
int token_column(token_t *token);
-void token_destroy(token_t *token);</code></pre>
- </div>
- <p>I'll also define an opaque struct for tokens and an enum for token types. </p>
- <div class="code-block">
- <span class="file-name">tokenizer.h</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":token-types" href="#:token-types">Token
- types</a></em></strong></span>
- <pre class="prettyprint"><code class="">typedef struct token token_t;
+
+void print_token(token_t *tok);
+</pre>
+
+
+<p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:38">38</a></p>
+</div>
+</div>
+<a name="1:10"><div class="section"><h4 class="noheading">10. </h4></a>
+<p>We'll need some types to represent the different kinds of tokens.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Token Types <a href="lexer.html#1:10">10</a>}</span>
+<pre class="prettyprint lang-c">
typedef enum {
// Control Keywords
TOK_CTK_IF,
@@ -311,25 +433,25 @@ typedef enum {
TOK_OP_MUL, // *
TOK_OP_DIV, // /
TOK_OP_MOD, // %
- TOK_OP_BIT_AND, // &
+ TOK_OP_BIT_AND, // &
TOK_OP_BIT_OR, // |
TOK_OP_BIT_XOR, // ^
TOK_OP_BIT_NOT, // ~
- TOK_OP_LSHIFT, // <<
- TOK_OP_RSHIFT, // >>
+ TOK_OP_LSHIFT, // <<
+ TOK_OP_RSHIFT, // >>
TOK_OP_NOT, // !
TOK_OP_ASSIGN, // =
- TOK_OP_LT, // <
- TOK_OP_GT, // >
+ TOK_OP_LT, // <
+ TOK_OP_GT, // >
TOK_OP_INC, // ++
TOK_OP_DEC, // --
TOK_OP_EQ, // ==
TOK_OP_NE, // !=
- TOK_OP_LE, // <=
- TOK_OP_GE, // >=
- TOK_OP_AND, // &&
+ TOK_OP_LE, // <=
+ TOK_OP_GE, // >=
+ TOK_OP_AND, // &&
TOK_OP_OR, // ||
- TOK_OP_MEMBER_POINTER, // ->
+ TOK_OP_MEMBER_POINTER, // ->
TOK_OP_MEMBER, // .
TOK_OP_COND_DECISION, // :
TOK_OP_COND, // ?
@@ -338,11 +460,11 @@ typedef enum {
TOK_OP_ASSIGN_MUL, // *=
TOK_OP_ASSIGN_DIV, // /=
TOK_OP_ASSIGN_MOD, // %=
- TOK_OP_ASSIGN_BITAND, // &=
+ TOK_OP_ASSIGN_BITAND, // &=
TOK_OP_ASSIGN_BITOR, // |=
TOK_OP_ASSIGN_BITXOR, // ^=
- TOK_OP_ASSIGN_LSHIFT, // <<=
- TOK_OP_ASSIGN_RSHIFT, // >>=
+ TOK_OP_ASSIGN_LSHIFT, // <<=
+ TOK_OP_ASSIGN_RSHIFT, // >>=
// Separators
TOK_SEP_LEFT_PAREN, // (
@@ -358,7 +480,7 @@ typedef enum {
TOK_SEP_HASH, // #
// Identifiers
- TOK_ID, // Any identifier not ending in _t
+ TOK_ID,
// Constants
TOK_CONST_INTEGER_U32, // u
@@ -373,478 +495,1716 @@ typedef enum {
// Special
TOK_SPECIAL_EOF,
TOK_SPECIAL_ERROR,
-} c_token_types;</code></pre>
- </div>
- <h4>Tokenizer Design Decisions</h4>
- <p>A lot of introductory compiler courses focus on the design of deterministic finite automatas for lexing. While
- that's definitely interesting from a CS perspective, for this project I'll be focusing more on simplicity and
- readability, and so I'll write a lexer by hand. </p>
- <p>I'll implement the token API first, and then write the tokenizer. This'll be a 'one-at-a-time' tokenizer, and won't return a list of tokens. </p>
- <h4>Token Implementation</h4>
- <p>The token struct is pretty simple. It contains the token information, some metadata about the token, and the
- optional value of the token. </p>
- <div class="code-block">
- <span class="file-name">tokenizer.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":token-struct" href="#:token-struct">Token
- struct</a></em></strong></span>
- <pre class="prettyprint"><code class="">#define TOK_MAGIC_1 0x544F4B454E544F4Bul // "TOKENTOK"
- #define TOK_MAGIC_2 0x544F4B544F4B454Eul // "TOKTOKEN"
- struct token {
- long magic;
- int line;
- int column;
- short kind;
- long opt_data[0];
- };
-
- typedef struct token token_t;
-
- struct token_data {
- union {
- long long i;
- double f;
- const char *s;
- char c;
- } data;
- };
-
- typedef struct token_data token_data_t;</code></pre>
- </div>
- <p>By keeping the token data off of simple tokens (like keywords and operators), we save 8 bytes per token. This
- will add up quickly, as the lexer will produce a lot of tokens. </p>
- <p>Now that we have the token struct, we can implement some internal utilities for working with tokens. </p>
- <div class="code-block">
- <span class="file-name">tokenizer.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":token-internals" href="#:token-internals">Token
- Internals</a></em></strong></span>
- <pre class="prettyprint"><code class="">#define token_data(token) ((struct token_data *)((token)->opt_data))
-static token_t *token_data_create(c_token_types kind, int lin, int col,
- int len) {
- token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data));
- if (token == NULL) {
- fputs("Out of memory\n", stderr);
- exit(1);
- }
- token->magic = TOK_MAGIC_1;
- token->line = lin;
- token->column = col;
- column += len;
- token->kind = kind;
- return token;
-}
+} c_token_types;
+</pre>
-static token_t *token_create(c_token_types kind, int lin, int col, int len) {
- token_t *token = malloc(sizeof(token_t));
- if (token == NULL) {
- fputs("Out of memory\n", stderr);
- exit(1);
- }
- token->magic = TOK_MAGIC_2;
- token->line = lin;
- token->column = col;
- column += len;
- token->kind = kind;
- return token;
-}
-static token_t *token_create_int(c_token_types kind, int lin, int col,
- int64_t i, int len) {
- token_t *token = token_data_create(kind, lin, col, len);
- token_data(token)->data.i = i;
- return token;
-}
+<p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:38">38</a></p>
+</div>
+</div>
+<a name="1:11"><div class="section"><h4 class="noheading">11. </h4></a>
+<p>We bring this all together in <code>token.h</code>. Line and column are exposed as global variables because <code>skip_whitespace</code> will need to update them.
+</p>
-static token_t *token_create_float(c_token_types kind, int lin, int col,
- double f, int len) {
- token_t *token = token_data_create(kind, lin, col, len);
- token_data(token)->data.f = f;
- return token;
-}
+<div class="codeblock">
+<span class="codeblock_name">{<strong>token.h</strong> <a href="lexer.html#1:11">11</a>}</span>
+<pre class="prettyprint lang-c">
+#ifndef TOKEN_H
+#define TOKEN_H
+#include <stdint.h> // We use this for int64_t
+<span class="nocode pln">{Token Types, <a href="lexer.html#1:10">10</a>}</span>
+<span class="nocode pln">{Opaque Token Type, <a href="lexer.html#1:7">7</a>}</span>
+<span class="nocode pln">{Token Creation and Destruction, <a href="lexer.html#1:16">16</a>}</span>
+<span class="nocode pln">{Token Interface, <a href="lexer.html#1:9">9</a>}</span>
+extern int column;
+extern int line;
+#endif
+</pre>
-static token_t *token_create_char(c_token_types kind, int lin, int col, char c,
- int len) {
- token_t *token = token_data_create(kind, lin, col, len);
- token_data(token)->data.c = c;
- return token;
-}
+<p class="seealso">Redefined in section <a href="lexer.html#1:38">38</a></p>
-static token_t *token_create_string(c_token_types kind, int lin, int col,
- const char *s, int len) {
- // Not yet
-}
-
-</code></pre>
- </div>
- <p>You might wonder why token_create_string was left out. Before you ask that, try counting up all the repeated
- strings in the code so far. Look at printf, for example. Let's say it's called 20 seperate times throughout the
- code. that's <math>20 * strlen("printf") = 140</math> bytes for the strings alone. That's a lot of duplicated
- memory.</p>
- <p>If you've got some CS under your belt, you're probably screaming "hash table" at your screen right now. And
- you're
- right. I'll be implementing a basic hash table local to the lexer to keep track of strings in memory.</p>
- <h4>Hash tables</h4>
- <p>If you're not familiar with hash tables, the basic idea is that you have a function that takes a key and returns
- a number. You then use that number to index into an array. The function calculates a (hopefully) unique value for
- each input, which means every value has its own unique array index. this means you can get an O(1) lookup for any
- value in the table. However, we don't have enough memory and hash functions aren't good enough for this ideal
- case, so we'll get collisions. I'll be using a linked hash table, which means that each array entry is a linked
- list consisting of the data and the next item in the chain. </p>
- <p>I'm sure I'll want to use a hash table again in the future, and not necessarily for strings or with strcmp as the
- key. So I'll make the hash table generic and a separate module. </p>
- <div class="code-block">
- <span class="file-name">hash_table.h</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":hash-table-interface" href="#:hash-table-interface">Hash
- table interface</a></em></strong></span>
- <pre class="prettyprint"><code class="">typedef struct hash_table hash_table_t;
-typedef struct hash_table_entry hash_table_entry_t;
-typedef int (*hash_table_cmp_fn)(void *key1, void *key2);
-typedef unsigned int (*hash_table_hash_fn)(void *key);
-hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp,
- hash_table_hash_fn hash);
+</div>
+</div>
+<a name="1:12"><div class="section"><h4>12. Token Implementation</h4></a>
+<p>Now that we have the interface, we can implement the token data structure. We'll need a couple of things:
+</p>
+<ul>
+<li>The token type.
+</li>
+<li>A way to store extra data.
+</li>
+<li>Implementations of the functions we defined in the interface.
+</li>
+</ul>
-typedef void (*hash_table_dtor)(void *value, int is_key);
-void hash_table_destroy(hash_table_t *table, hash_table_dtor dtor);
-void *hash_table_get(hash_table_t *table, void *key);
-int hash_table_put(hash_table_t *table, void *key, void *value, int replace);
-void hash_table_remove(hash_table_t *table, void *key);</code></pre>
- </div>
- <p>Now that we have the hash table interface, we can implement the generic hash table. </p>
- <div class="code-block">
- <span class="file-name">hash_table.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":string-hash-table" href="#:string-hash-table">String
- hash table</a></em></strong></span>
- <pre class="prettyprint"><code class="">struct hash_table {
- hash_table_entry_t **entries;
- int size;
- hash_table_cmp_fn cmp;
- hash_table_hash_fn hash;
-};
-struct hash_table_entry {
- void *key;
- void *value;
- hash_table_entry_t *next;
+</div>
+<a name="1:13"><div class="section"><h4 class="noheading">13. </h4></a>
+<p>One problem is that we haven't defined a way to verify that the token we're getting isn't corrupt. We'll use a tag for that.
+</p>
+<p>You might notice that a zero-length array is used in the token data structure. This is a GCC extension that allows us to allocate memory for the token data structure and the token data in one allocation. This is a bit of a hack, but it's a common pattern in C code.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Token Data Structure <a href="lexer.html#1:13">13</a>}</span>
+<pre class="prettyprint lang-c">
+#define TOK_MAGIC_1 0x544F4B454E544F4Bul // "TOKENTOK"
+#define TOK_MAGIC_2 0x544F4B544F4B454Eul // "TOKTOKEN"
+
+struct token {
+ long magic;
+ int line;
+ int column;
+ short kind;
+ long opt_data[0];
};
-hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp,
- hash_table_hash_fn hash) {
- hash_table_t *table = malloc(sizeof(hash_table_t));
- if (table == NULL) {
- return NULL;
- }
- table->entries = calloc(size, sizeof(hash_table_entry_t *));
- if (table->entries == NULL) {
- free(table);
- return NULL;
- }
- table->size = size;
- table->cmp = cmp;
- table->hash = hash;
- return table;
-}
+typedef struct token token_t;
-void hash_table_destroy(hash_table_t *table, hash_table_dtor dtor) {
- for (int i = 0; i < table->size; i++) {
- hash_table_entry_t *entry = table->entries[i];
- while (entry != NULL) {
- hash_table_entry_t *next = entry->next;
- if (dtor != NULL) {
- dtor(entry->key, 1);
- dtor(entry->value, 0);
- }
- free(entry);
- entry = next;
- }
- }
- free(table->entries);
- free(table);
-}
+struct token_data {
+ union {
+ int64_t i;
+ double f;
+ const char *s;
+ char c;
+ } data;
+};
-void *hash_table_get(hash_table_t *table, void *key) {
- unsigned long hash = table->hash(key) % table->size;
- hash_table_entry_t *entry = table->entries[hash];
- while (entry != NULL) {
- if (table->cmp(entry->key, key) == 0) {
- return entry->value;
- }
- entry = entry->next;
- }
- return NULL;
-}
+typedef struct token_data token_data_t;
+int column = 1;
+int line = 1;
+</pre>
-int hash_table_put(hash_table_t *table, void *key, void *value, int replace) {
- unsigned long hash = table->hash(key) % table->size;
- hash_table_entry_t *entry = table->entries[hash];
- while (entry != NULL) {
- if (table->cmp(entry->key, key) == 0) {
- if (replace) {
- entry->value = value;
- return 0;
- } else {
- return 1;
- }
- }
- entry = entry->next;
- }
- entry = malloc(sizeof(hash_table_entry_t));
- if (entry == NULL) {
- fprintf(stderr, "Error: Out of memory. Could not allocate hash table entry\n");
- exit(1);
- }
- entry->key = key;
- entry->value = value;
- entry->next = table->entries[hash];
- table->entries[hash] = entry;
- return 0;
-}
-void hash_table_remove(hash_table_t *table, void *key) {
- unsigned long hash = table->hash(key) % table->size;
- hash_table_entry_t *entry = table->entries[hash];
- hash_table_entry_t *prev = NULL;
- while (entry != NULL) {
- if (table->cmp(entry->key, key) == 0) {
- if (prev == NULL) {
- table->entries[hash] = entry->next;
- } else {
- prev->next = entry->next;
- }
- free(entry);
- return;
- }
- prev = entry;
- entry = entry->next;
- }
-}</code></pre>
- </div>
- <p>One implementation detail of note is that hash_table_put returns 1 if the key already exists in the table, and,
- unless replace is set to 1, will not replace the value. </p>
- <p>This is useful as our token_create_string function can just check if the string already exists in the table and
- free the memory if it does. </p>
- <h4>String hashing</h4>
- <p>Now we can implement the string hash table for the tokenizer.</p>
- <p>A lot of research has been done on hash functions (See <a href="https://www.strchr.com/hash_functions">here for a
- good review of theory and comparison of various functions</a>). For this project, I'll be using the ELF hash
- function from glibc, which
- </p>
- <div class="code-block">
- <span class="file-name">tokenizer.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":string-hash-function" href="#:string-hash-function">String
- hash function</a></em></strong></span>
- <pre class="prettyprint"><code class="">static unsigned int hash_string(void *key) {
- unsigned long hash = 0, hi = 0;
- char *p = key;
- hash = *p;
- if (hash != 0 && p[1] != 0) {
- hash = (hash << 4) + p[1];
- if (p[2] != 0) {
- hash = (hash << 4) + p[2];
- if (p[3] != 0) {
- hash = (hash << 4) + p[3];
- if (p[4] != 0) {
- hash = (hash << 4) + p[4];
- p += 5;
- while (*p != 0) {
- hash = (hash << 4) + *p++;
- hi = hash & 0xf0000000l;
- hash ^= hi >> 24;
- }
- hash &= 0x0fffffffl;
- }
- }
- }
- }
- return hash;
-}</code></pre>
- </div>
- <p>We'll also need a comparison function for strings. </p>
- <div class="code-block">
- <span class="file-name">tokenizer.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":string-compare-function" href="#:string-compare-function">String
- compare function</a></em></strong></span>
- <pre class="prettyprint"><code class="">static int cmp_string(void *key1, void *key2) {
- return strcmp((char *)key1, (char *)key2);
-}
-</code></pre>
- </div>
- <p>Now that we have the hash table, we can implement the string creation function. </p>
- <div class="code-block">
- <span class="file-name">tokenizer.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":string-creation" href="#:string-creation">String
- creation</a></em></strong></span>
- <pre class="prettyprint"><code class="">static hash_table_t *string_table;
-static token_t *token_create_string(c_token_types kind, int lin, int col,
- const char *s, int len) {
- if (string_table == NULL) {
- string_table = hash_table_create(2048, cmp_string, hash_string);
- }
- token_t *token = token_data_create(kind, lin, col, len);
- char *key = hash_table_get(string_table, (void *)s);
- if (key == NULL) {
- key = strdup(s);
- hash_table_put(string_table, key, key, 1);
- }
- token_data(token)->data.s = key;
- return token;
-}
-</code></pre>
- </div>
+<p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p>
+</div>
+</div>
+<a name="1:14"><div class="section"><h4 class="noheading">14. </h4></a>
+<p>We'll need to implement an interface for accessing the token data and a macro for accessing optional data.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Token Data Access <a href="lexer.html#1:14">14</a>}</span>
+<pre class="prettyprint lang-c">
+#define token_data(token) ((struct token_data *)((token)->opt_data))
- <h4>External Token API</h4>
- <p>Sticking with the theme of keeping data structures out of headers, the parser will have to access tokens using
- functions defined in the tokenizer. We'll implement those now. </p>
- <div class="code-block">
- <span class="file-name">tokenizer.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":external-token-api" href="#:external-token-api">External
- token API</a></em></strong></span>
- <pre class="prettyprint"><code class="">c_token_types token_type(token_t *token) {
- assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
- return token->kind;
+c_token_types token_type(token_t *token) {
+ assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
+ return token->kind;
}
int64_t token_int(token_t *token) {
- assert(token->kind == TOK_CONST_INTEGER_U32 ||
- token->kind == TOK_CONST_INTEGER_U64 ||
- token->kind == TOK_CONST_INTEGER_S32 ||
- token->kind == TOK_CONST_INTEGER_S64);
- assert(token->magic == TOK_MAGIC_1);
- return token_data(token)->data.i;
+ assert(token->kind == TOK_CONST_INTEGER_U32 ||
+ token->kind == TOK_CONST_INTEGER_U64 ||
+ token->kind == TOK_CONST_INTEGER_S32 ||
+ token->kind == TOK_CONST_INTEGER_S64);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.i;
}
double token_float(token_t *token) {
- assert(token->kind == TOK_CONST_FLOAT_32 ||
- token->kind == TOK_CONST_FLOAT_64);
- assert(token->magic == TOK_MAGIC_1);
- return token_data(token)->data.f;
+ assert(token->kind == TOK_CONST_FLOAT_32 ||
+ token->kind == TOK_CONST_FLOAT_64);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.f;
}
const char *token_string(token_t *token) {
- assert(token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID);
- assert(token->magic == TOK_MAGIC_1);
- return token_data(token)->data.s;
+ assert(token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.s;
}
char token_char(token_t *token) {
- assert(token->kind == TOK_CONST_CHAR);
- assert(token->magic == TOK_MAGIC_1);
- return token_data(token)->data.c;
+ assert(token->kind == TOK_CONST_CHAR);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.c;
}
int token_line(token_t *token) {
- assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
- return token->line;
+ assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
+ return token->line;
}
int token_column(token_t *token) {
- assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
- return token->column;
+ assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
+ return token->column;
}
+</pre>
-void token_destroy(token_t *token) {
- assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
- // Don't free the string table, it's a global variable
- free(token);
-}</code></pre>
- </div>
- <p>You might notice that this code uses a lot of asserts. I plan to do this for all functions which access internal
- data structures to help reduce bugs. </p>
- <h4>Tokenizer Implementation</h4>
- <p>I'll implement the less interesting functions first. </p>
- <div class="code-block">
- <span class="file-name">tokenizer.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":tokenizer-implementation" href="#:tokenizer-implementation">Tokenizer
- implementation</a></em></strong></span>
- <pre class="prettyprint"><code class="">int line = 1;
-int column = 1;
-char file_name[1024];
-token_t *unget_token = NULL;
-void tokenizer_init(const char *filename)
-{
- input_init(filename);
-}
-void tokenizer_unget(token_t *token)
-{
- assert(unget_token == NULL);
- unget_token = token;
-}
+<p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p>
+</div>
+</div>
+<a name="1:15"><div class="section"><h4 class="noheading">15. </h4></a>
+<p>For debugging, I'll add a function to print the token type.
+</p>
-void tokenizer_destroy(void)
-{
- input_destroy();
- if (string_table != NULL)
- {
- hash_table_destroy(string_table);
- }
+<div class="codeblock">
+<span class="codeblock_name">{Token Debugging <a href="lexer.html#1:15">15</a>}</span>
+<pre class="prettyprint lang-c">
+const char *token_name_from_type(c_token_types type) {
+ switch (type) {
+ case TOK_CTK_IF:
+ return "TOK_CTK_IF";
+ case TOK_CTK_ELSE:
+ return "TOK_CTK_ELSE";
+ case TOK_CTK_SWITCH:
+ return "TOK_CTK_SWITCH";
+ case TOK_CTK_CASE:
+ return "TOK_CTK_CASE";
+ case TOK_CTK_DEFAULT:
+ return "TOK_CTK_DEFAULT";
+ case TOK_CTK_WHILE:
+ return "TOK_CTK_WHILE";
+ case TOK_CTK_DO:
+ return "TOK_CTK_DO";
+ case TOK_CTK_FOR:
+ return "TOK_CTK_FOR";
+ case TOK_CTK_CONTINUE:
+ return "TOK_CTK_CONTINUE";
+ case TOK_CTK_BREAK:
+ return "TOK_CTK_BREAK";
+ case TOK_CTK_RETURN:
+ return "TOK_CTK_RETURN";
+ case TOK_CTK_GOTO:
+ return "TOK_CTK_GOTO";
+ case TOK_TK_VOID:
+ return "TOK_TK_VOID";
+ case TOK_TK_CHAR:
+ return "TOK_TK_CHAR";
+ case TOK_TK_SHORT:
+ return "TOK_TK_SHORT";
+ case TOK_TK_INT:
+ return "TOK_TK_INT";
+ case TOK_TK_LONG:
+ return "TOK_TK_LONG";
+ case TOK_TK_FLOAT:
+ return "TOK_TK_FLOAT";
+ case TOK_TK_DOUBLE:
+ return "TOK_TK_DOUBLE";
+ case TOK_TK_SIGNED:
+ return "TOK_TK_SIGNED";
+ case TOK_TK_UNSIGNED:
+ return "TOK_TK_UNSIGNED";
+ case TOK_TK_STRUCT:
+ return "TOK_TK_STRUCT";
+ case TOK_TK_UNION:
+ return "TOK_TK_UNION";
+ case TOK_TK_ENUM:
+ return "TOK_TK_ENUM";
+ case TOK_TK_TYPEDEF:
+ return "TOK_TK_TYPEDEF";
+ case TOK_SCSK_AUTO:
+ return "TOK_SCSK_AUTO";
+ case TOK_SCSK_REGISTER:
+ return "TOK_SCSK_REGISTER";
+ case TOK_SCSK_STATIC:
+ return "TOK_SCSK_STATIC";
+ case TOK_SCSK_EXTERN:
+ return "TOK_SCSK_EXTERN";
+ case TOK_SCSK_CONST:
+ return "TOK_SCSK_CONST";
+ case TOK_SCSK_VOLATILE:
+ return "TOK_SCSK_VOLATILE";
+ case TOK_MK_SIZEOF:
+ return "TOK_MK_SIZEOF";
+ case TOK_OP_ADD:
+ return "TOK_OP_ADD";
+ case TOK_OP_SUB:
+ return "TOK_OP_SUB";
+ case TOK_OP_MUL:
+ return "TOK_OP_MUL";
+ case TOK_OP_DIV:
+ return "TOK_OP_DIV";
+ case TOK_OP_MOD:
+ return "TOK_OP_MOD";
+ case TOK_OP_BIT_AND:
+ return "TOK_OP_BIT_AND";
+ case TOK_OP_BIT_OR:
+ return "TOK_OP_BIT_OR";
+ case TOK_OP_BIT_XOR:
+ return "TOK_OP_BIT_XOR";
+ case TOK_OP_BIT_NOT:
+ return "TOK_OP_BIT_NOT";
+ case TOK_OP_LSHIFT:
+ return "TOK_OP_LSHIFT";
+ case TOK_OP_RSHIFT:
+ return "TOK_OP_RSHIFT";
+ case TOK_OP_NOT:
+ return "TOK_OP_NOT";
+ case TOK_OP_ASSIGN:
+ return "TOK_OP_ASSIGN";
+ case TOK_OP_LT:
+ return "TOK_OP_LT";
+ case TOK_OP_GT:
+ return "TOK_OP_GT";
+ case TOK_OP_INC:
+ return "TOK_OP_INC";
+ case TOK_OP_DEC:
+ return "TOK_OP_DEC";
+ case TOK_OP_EQ:
+ return "TOK_OP_EQ";
+ case TOK_OP_NE:
+ return "TOK_OP_NE";
+ case TOK_OP_LE:
+ return "TOK_OP_LE";
+ case TOK_OP_GE:
+ return "TOK_OP_GE";
+ case TOK_OP_AND:
+ return "TOK_OP_AND";
+ case TOK_OP_OR:
+ return "TOK_OP_OR";
+ case TOK_OP_MEMBER_POINTER:
+ return "TOK_OP_MEMBER_POINTER";
+ case TOK_OP_MEMBER:
+ return "TOK_OP_MEMBER";
+ case TOK_OP_COND_DECISION:
+ return "TOK_OP_COND_DECISION";
+ case TOK_OP_COND:
+ return "TOK_OP_COND";
+ case TOK_OP_ASSIGN_ADD:
+ return "TOK_OP_ASSIGN_ADD";
+ case TOK_OP_ASSIGN_SUB:
+ return "TOK_OP_ASSIGN_SUB";
+ case TOK_OP_ASSIGN_MUL:
+ return "TOK_OP_ASSIGN_MUL";
+ case TOK_OP_ASSIGN_DIV:
+ return "TOK_OP_ASSIGN_DIV";
+ case TOK_OP_ASSIGN_MOD:
+ return "TOK_OP_ASSIGN_MOD";
+ case TOK_OP_ASSIGN_BITAND:
+ return "TOK_OP_ASSIGN_BITAND";
+ case TOK_OP_ASSIGN_BITOR:
+ return "TOK_OP_ASSIGN_BITOR";
+ case TOK_OP_ASSIGN_BITXOR:
+ return "TOK_OP_ASSIGN_BITXOR";
+ case TOK_OP_ASSIGN_LSHIFT:
+ return "TOK_OP_ASSIGN_LSHIFT";
+ case TOK_OP_ASSIGN_RSHIFT:
+ return "TOK_OP_ASSIGN_RSHIFT";
+ case TOK_SEP_HASH:
+ return "TOK_SEP_HASH";
+ case TOK_ID:
+ return "TOK_ID";
+ case TOK_CONST_INTEGER_U32:
+ return "TOK_CONST_INTEGER_U32";
+ case TOK_CONST_INTEGER_U64:
+ return "TOK_CONST_INTEGER_U64";
+ case TOK_CONST_INTEGER_S32:
+ return "TOK_CONST_INTEGER_S32";
+ case TOK_CONST_INTEGER_S64:
+ return "TOK_CONST_INTEGER_S64";
+ case TOK_CONST_FLOAT_32:
+ return "TOK_CONST_FLOAT_32";
+ case TOK_CONST_FLOAT_64:
+ return "TOK_CONST_FLOAT_64";
+ case TOK_CONST_CHAR:
+ return "TOK_CONST_CHAR";
+ case TOK_CONST_STRING_ASCII:
+ return "TOK_CONST_STRING_ASCII";
+ case TOK_SPECIAL_EOF:
+ return "TOK_SPECIAL_EOF";
+ case TOK_SPECIAL_ERROR:
+ return "TOK_SPECIAL_ERROR";
+ case TOK_SEP_LEFT_PAREN:
+ return "TOK_SEP_LEFT_PAREN";
+ case TOK_SEP_RIGHT_PAREN:
+ return "TOK_SEP_RIGHT_PAREN";
+ case TOK_SEP_LEFT_BRACKET:
+ return "TOK_SEP_LEFT_BRACKET";
+ case TOK_SEP_RIGHT_BRACKET:
+ return "TOK_SEP_RIGHT_BRACKET";
+ case TOK_SEP_LEFT_BRACE:
+ return "TOK_SEP_LEFT_BRACE";
+ case TOK_SEP_RIGHT_BRACE:
+ return "TOK_SEP_RIGHT_BRACE";
+ case TOK_SEP_COMMA:
+ return "TOK_SEP_COMMA";
+ case TOK_SEP_SEMICOLON:
+ return "TOK_SEP_SEMICOLON";
+ case TOK_SEP_DOT:
+ return "TOK_SEP_DOT";
+ case TOK_SEP_ELLIPSIS:
+ return "TOK_SEP_ELLIPSIS";
+ }
+ return "UNKNOWN";
}
-token_t *read_token(void) {
- if (unget_token != NULL) {
- token_t *tok = unget_token;
- unget_token = NULL;
- return tok;
- }
- // What goes here?
- int c = input_getc();
- if (c == EOF) {
- return NULL;
+char *re_escape_string(const char *str) {
+ int len = strlen(str);
+ char *buf = malloc(len * 2 + 1);
+ if (buf == NULL) {
+ fprintf(stderr, "Out of memory. Cannot escape string\n");
+ exit(1);
}
- tok_warn(
- "Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c,
- line, column);
- return read_token();
-}
-</code></pre>
- </div>
- <p>Now we can implement the meat of the tokenizer. The general structure is as follows: </p>
- <ol>
- <li>Read a character from the input stream. </li>
- <li>If whitespace, skip it. </li>
- <li>If a digit or decimal point, read a number. </li>
- <li>If a letter, read an identifier or keyword. </li>
- <li>If a quote, read a string. </li>
- <li>If a single quote, read a char.</li>
- <li>If not any of the above, read an operator or separator. </li>
- <li>Return the token if it matches any of the above, otherwise return an error or EOF token. </li>
- </ol>
- <p>Each of the functions for reading a token will return a token or <code>NULL</code> if it's not the right type.
- </p>
- <p>Let's first write a function to skip whitespace and comments. The rule in C is that whitespace is ignored except
- when it separates tokens, any character sequence starting with <code>//</code> is a comment until the end of the
- line, and any character sequence starting with <code>/*</code> is a comment until the next <code>*/</code>. </p>
- <div class="code-block">
- <span class="file-name">tokenizer.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":skip-whitespace" href="#:skip-whitespace">Skip
- whitespace</a></em></strong></span>
- <pre class="prettyprint"><code class="">static token_t *skip_whitespace(void) {
- int c;
- while ((c = input_getc()) != EOF) {
- if (isspace(c)) {
- if (c == '\n') {
- line++;
- column = 1;
- } else {
- column++;
- }
- } else if (c == '#') // GCC preprocessor line control directive.
- {
- char buf[512];
- int i = 0;
- while ((c = input_getc()) != EOF && c != '\n') {
- buf[i++] = c;
- column++;
+ int i = 0;
+ for (int j = 0; j < len; j++) {
+ switch (str[j]) {
+ case '\a':
+ buf[i++] = '\\';
+ buf[i++] = 'a';
+ break;
+ case '\b':
+ buf[i++] = '\\';
+ buf[i++] = 'b';
+ break;
+ case '\f':
+ buf[i++] = '\\';
+ buf[i++] = 'f';
+ break;
+ case '\n':
+ buf[i++] = '\\';
+ buf[i++] = 'n';
+ break;
+ case '\r':
+ buf[i++] = '\\';
+ buf[i++] = 'r';
+ break;
+ case '\t':
+ buf[i++] = '\\';
+ buf[i++] = 't';
+ break;
+ case '\v':
+ buf[i++] = '\\';
+ buf[i++] = 'v';
+ break;
+ case '\\':
+ buf[i++] = '\\';
+ buf[i++] = '\\';
+ break;
+ case '\'':
+ buf[i++] = '\\';
+ buf[i++] = '\'';
+ break;
+ case '"':
+ buf[i++] = '\\';
+ buf[i++] = '"';
+ break;
+ default:
+ buf[i++] = str[j];
+ break;
+ }
+ }
+ buf[i] = '\0';
+ return buf;
+}
+
+void print_token(token_t *tok) {
+ if (tok == NULL) {
+ printf("NULL\n");
+ return;
+ }
+ const char *name = token_name_from_type(tok->kind);
+ switch (tok->kind) {
+ case TOK_ID:
+ case TOK_CONST_STRING_ASCII: {
+ char *escaped = re_escape_string(token_string(tok));
+ printf("%s: \"%s\"@%d:%d\n", name, escaped, tok->line, tok->column);
+ free(escaped);
+ break;
+ }
+ case TOK_CONST_CHAR:
+ printf("%s: '%c'@%d:%d\n", name, token_char(tok), tok->line, tok->column);
+ break;
+ case TOK_CONST_INTEGER_S32:
+ case TOK_CONST_INTEGER_U32:
+ case TOK_CONST_INTEGER_S64:
+ case TOK_CONST_INTEGER_U64:
+ printf("%s: %ld@%d:%d\n", name, token_int(tok), tok->line, tok->column);
+ break;
+ case TOK_CONST_FLOAT_32:
+ case TOK_CONST_FLOAT_64:
+ printf("%s: %f@%d:%d\n", name, token_float(tok), tok->line, tok->column);
+ break;
+ default:
+ printf("%s@%d:%d\n", name, tok->line, tok->column);
+ break;
+ }
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p>
+</div>
+</div>
+<a name="1:16"><div class="section"><h4 class="noheading">16. </h4></a>
+<p>Now we can implement the functions we defined in the interface.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Token Creation and Destruction <a href="lexer.html#1:16">16</a>}</span>
+<pre class="prettyprint lang-c">
+token_t *token_data_create(c_token_types kind, int lin, int col, int len) {
+ token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data));
+ if (token == NULL) {
+ fputs("Out of memory\n", stderr);
+ exit(1);
+ }
+ token->magic = TOK_MAGIC_1;
+ token->line = lin;
+ token->column = col;
+ column += len;
+ token->kind = kind;
+ return token;
+}
+
+token_t *token_create(c_token_types kind, int lin, int col, int len) {
+ token_t *token = malloc(sizeof(token_t));
+ if (token == NULL) {
+ fputs("Out of memory\n", stderr);
+ exit(1);
+ }
+ token->magic = TOK_MAGIC_2;
+ token->line = lin;
+ token->column = col;
+ column += len;
+ token->kind = kind;
+ return token;
+}
+
+token_t *token_create_int(c_token_types kind, int lin, int col, int64_t i, int len) {
+ token_t *token = token_data_create(kind, lin, col, len);
+ token_data(token)->data.i = i;
+ return token;
+}
+
+token_t *token_create_float(c_token_types kind, int lin, int col, double f, int len) {
+ token_t *token = token_data_create(kind, lin, col, len);
+ token_data(token)->data.f = f;
+ return token;
+}
+
+token_t *token_create_char(c_token_types kind, int lin, int col, char c, int len) {
+ token_t *token = token_data_create(kind, lin, col, len);
+ token_data(token)->data.c = c;
+ return token;
+}
+
+void token_destroy(token_t *token) {
+ if (token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2) {
+ if (token->kind == TOK_CONST_STRING_ASCII) {
+ free((char *)token_data(token)->data.s);
+ }
+ free(token);
+ } else {
+ fputs("Corrupt token\n", stderr);
+ exit(1);
+ }
+}
+</pre>
+
+
+<p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:39">39</a></p>
+</div>
+</div>
+<a name="1:17"><div class="section"><h4 class="noheading">17. </h4></a>
+<p><code>token_create_string</code> can be implemented either the easy way or the right way. Let's try the easy way.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Token Create String <a href="lexer.html#1:17">17</a>}</span>
+<pre class="prettyprint lang-c">
+token_t *token_create_string(c_token_types kind, int lin, int col, const char *s, int len) {
+ token_t *token = token_create(kind, lin, col, len);
+ token_data(token)->data.s = strdup(s);
+ return token;
+}
+</pre>
+
+<p class="seealso">Redefined in section <a href="lexer.html#1:37">37</a></p>
+<p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p>
+</div>
+</div>
+<a name="1:18"><div class="section"><h4 class="noheading">18. </h4></a>
+<p>There's an issue with this approach. <code>token_create_string</code> will be called for every identifier and every string in a program. Imagine a large program, say a shell, with a bunch of user input and output. That program will likely have 20-40 calls to <code>fprintf</code>, <code>fscanf</code>, <code>strchr</code>, <code>strtok</code>, each. We create a new string for each of those calls. That's a lot of duplicates, and can quickly add up to a lot of memory usage.
+</p>
+<p>To fix this, we use a hash table to store the strings. We'll define a hash table in <code>hash_table.h</code> and <code>hash_table.c</code>.
+</p>
+
+</div>
+<a name="1:19"><div class="section"><h4>19. Hash Table</h4></a>
+<p>A hash table is a data structure that maps keys to values. It's commonly used to store information, such as variables and functions in a symbol table. To implement a generic hash table, we'll need several things:
+</p>
+<ul>
+<li>A function to hash the keys.
+</li>
+<li>A function to compare keys.
+</li>
+<li>An opaque type for the hash table.
+</li>
+<li>A function to destroy deleted keys and values.
+</li>
+</ul>
+<p>Let's start with the interface.
+</p>
+
+</div>
+<a name="1:20"><div class="section"><h4 class="noheading">20. </h4></a>
+
+<div class="codeblock">
+<span class="codeblock_name">{Hash Table Opaque Types <a href="lexer.html#1:20">20</a>}</span>
+<pre class="prettyprint lang-c">
+typedef struct hash_table hash_table_t;
+typedef int (*hash_table_cmp_fn)(void *key1, void *key2);
+typedef unsigned int (*hash_table_hash_fn)(void *key);
+typedef void (*hash_table_dtor)(void *value, int is_key);
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:23">23</a></p>
+</div>
+</div>
+<a name="1:21"><div class="section"><h4 class="noheading">21. </h4></a>
+
+<div class="codeblock">
+<span class="codeblock_name">{Hash Table Creation and Destruction <a href="lexer.html#1:21">21</a>}</span>
+<pre class="prettyprint lang-c">
+hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash, hash_table_dtor dtor);
+void hash_table_destroy(hash_table_t *table);
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:23">23</a></p>
+</div>
+</div>
+<a name="1:22"><div class="section"><h4 class="noheading">22. </h4></a>
+
+<div class="codeblock">
+<span class="codeblock_name">{Hash Table Access <a href="lexer.html#1:22">22</a>}</span>
+<pre class="prettyprint lang-c">
+void *hash_table_get(hash_table_t *table, void *key);
+void hash_table_put(hash_table_t *table, void *key, void *value);
+void hash_table_remove(hash_table_t *table, void *key);
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:23">23</a></p>
+</div>
+</div>
+<a name="1:23"><div class="section"><h4 class="noheading">23. </h4></a>
+
+<div class="codeblock">
+<span class="codeblock_name">{<strong>hash_table.h</strong> <a href="lexer.html#1:23">23</a>}</span>
+<pre class="prettyprint lang-c">
+#ifndef HASH_TABLE_H
+#define HASH_TABLE_H
+<span class="nocode pln">{Hash Table Opaque Types, <a href="lexer.html#1:20">20</a>}</span>
+<span class="nocode pln">{Hash Table Creation and Destruction, <a href="lexer.html#1:21">21</a>}</span>
+<span class="nocode pln">{Hash Table Access, <a href="lexer.html#1:22">22</a>}</span>
+#endif
+</pre>
+
+
+
+</div>
+</div>
+<a name="1:24"><div class="section"><h4 class="noheading">24. </h4></a>
+<p>Let's implement the hash table now.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{<strong>hash_table.c</strong> <a href="lexer.html#1:24">24</a>}</span>
+<pre class="prettyprint lang-c">
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "hash_table.h"
+
+<span class="nocode pln">{Hash Table Data Structure, <a href="lexer.html#1:25">25</a>}</span>
+<span class="nocode pln">{Hash Table Entry Data Structure, <a href="lexer.html#1:26">26</a>}</span>
+
+hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash, hash_table_dtor dtor) {
+<span class="nocode pln"> {Allocate and Initialize Hash Table, <a href="lexer.html#1:27">27</a>}</span>
+ return table;
+}
+
+void hash_table_destroy(hash_table_t *table) {
+<span class="nocode pln"> {Destroy Entries, <a href="lexer.html#1:28">28</a>}</span>
+ free(table->entries);
+ free(table);
+}
+
+void *hash_table_get(hash_table_t *table, void *key) {
+<span class="nocode pln"> {Get Entry By Hash, <a href="lexer.html#1:29">29</a>}</span>
+<span class="nocode pln"> {Loop Through Entries and Return Value if Match, <a href="lexer.html#1:33">33</a>}</span>
+ return NULL;
+}
+
+void hash_table_put(hash_table_t *table, void *key, void *value) {
+<span class="nocode pln"> {Get Entry By Hash, <a href="lexer.html#1:29">29</a>}</span>
+<span class="nocode pln"> {Loop Through Entries and Replace Value if Key Matches, <a href="lexer.html#1:30">30</a>}</span>
+<span class="nocode pln"> {Allocate New Entry if No Match, <a href="lexer.html#1:31">31</a>}</span>
+}
+
+void hash_table_remove(hash_table_t *table, void *key) {
+<span class="nocode pln"> {Get Entry By Hash, <a href="lexer.html#1:29">29</a>}</span>
+<span class="nocode pln"> {Loop Through Entries and Remove Entry if Key Matches, <a href="lexer.html#1:32">32</a>}</span>
+}
+
+#ifdef TEST_HASH_TABLE
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+int string_cmp(void *key1, void *key2) {
+ return strcmp((char *)key1, (char *)key2);
+}
+
+unsigned long string_hash(void *key) {
+ unsigned long hash = 5381;
+ char *str = (char *)key;
+ while (*str != '\0') {
+ hash = ((hash << 5) + hash) + *str;
+ str++;
+ }
+ return hash;
+}
+
+int main() {
+ hash_table_t *table = hash_table_create(16, string_cmp, string_hash, NULL);
+ hash_table_put(table, "foo", "bar");
+ hash_table_put(table, "foo", "baz");
+ assert(strcmp((char *)hash_table_get(table, "foo"), "baz") == 0);
+ hash_table_remove(table, "foo");
+ assert(hash_table_get(table, "foo") == NULL);
+ hash_table_destroy(table);
+ return 0;
+}
+#endif
+</pre>
+
+
+
+</div>
+</div>
+<a name="1:25"><div class="section"><h4 class="noheading">25. </h4></a>
+<p>For the hash table data structure, we'll define a pointer to an array of entries, the size of the array, and the hash/comparison functions.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Hash Table Data Structure <a href="lexer.html#1:25">25</a>}</span>
+<pre class="prettyprint lang-c">
+struct hash_table {
+ struct hash_table_entry **entries;
+ int size;
+ hash_table_cmp_fn cmp;
+ hash_table_hash_fn hash;
+ hash_table_dtor dtor;
+};
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:24">24</a></p>
+</div>
+</div>
+<a name="1:26"><div class="section"><h4 class="noheading">26. </h4></a>
+<p>Entries in the hash table will have a key, a value, and a link to the next entry in the chain.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Hash Table Entry Data Structure <a href="lexer.html#1:26">26</a>}</span>
+<pre class="prettyprint lang-c">
+struct hash_table_entry {
+ void *key;
+ void *value;
+ struct hash_table_entry *next;
+};
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:24">24</a></p>
+</div>
+</div>
+<a name="1:27"><div class="section"><h4 class="noheading">27. </h4></a>
+<p>Allocating a hash table involves allocating memory for the hash table itself and the entries, zeroing out the entries, and setting the hash and comparison functions.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Allocate and Initialize Hash Table <a href="lexer.html#1:27">27</a>}</span>
+<pre class="prettyprint lang-c">
+hash_table_t *table = malloc(sizeof(struct hash_table));
+if (table == NULL) {
+ fputs("Out of memory, could not allocate hash table\n", stderr);
+ exit(1);
+}
+table->entries = calloc(size, sizeof(struct hash_table_entry *));
+if (table->entries == NULL) {
+ fputs("Out of memory, could not allocate hash table entries\n", stderr);
+ exit(1);
+}
+table->size = size;
+table->cmp = cmp;
+table->hash = hash;
+table->dtor = dtor;
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:24">24</a></p>
+</div>
+</div>
+<a name="1:28"><div class="section"><h4 class="noheading">28. </h4></a>
+<p>To destroy a hash table, we loop through the entries, freeing the keys and values, and then free the entries and the table itself.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Destroy Entries <a href="lexer.html#1:28">28</a>}</span>
+<pre class="prettyprint lang-c">
+for (int i = 0; i < table->size; i++) {
+ struct hash_table_entry *entry = table->entries[i];
+ while (entry != NULL) {
+ struct hash_table_entry *next = entry->next;
+ if (table->dtor != NULL) {
+ table->dtor(entry->key, 1);
+ table->dtor(entry->value, 0);
+ }
+ free(entry);
+ entry = next;
+ }
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:24">24</a></p>
+</div>
+</div>
+<a name="1:29"><div class="section"><h4 class="noheading">29. </h4></a>
+<p>To locate the bucket for a key, we hash the key modulo the table size and take the head of that bucket's entry chain. This fragment is shared by get, put, and remove.
</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Get Entry By Hash <a href="lexer.html#1:29">29</a>}</span>
+<pre class="prettyprint lang-c">
+unsigned int hash = table->hash(key) % table->size;
+struct hash_table_entry *entry = table->entries[hash];
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:24">24</a></p>
+</div>
+</div>
+<a name="1:30"><div class="section"><h4 class="noheading">30. </h4></a>
+<p>To put an entry in the hash table, we hash the key, loop through the entries, and replace the value if we find a match.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Loop Through Entries and Replace Value if Key Matches <a href="lexer.html#1:30">30</a>}</span>
+<pre class="prettyprint lang-c">
+while (entry != NULL) {
+ if (table->cmp(entry->key, key) == 0) {
+ entry->value = value;
+ return;
+ }
+ entry = entry->next;
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:24">24</a></p>
+</div>
+</div>
+<a name="1:31"><div class="section"><h4 class="noheading">31. </h4></a>
+<p>If we don't find a match, we allocate a new entry, set the key and value, and insert it at the head of the linked list.
+</p>
+<p>This exploits a property in computer science called locality of reference. The gist of that is that when you write to a piece of memory, you're likely to read from it again soon. By putting the new entry at the head of the linked list, we increase the chances that we'll find it quickly next time.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Allocate New Entry if No Match <a href="lexer.html#1:31">31</a>}</span>
+<pre class="prettyprint lang-c">
+struct hash_table_entry *new_entry = malloc(sizeof(struct hash_table_entry));
+if (new_entry == NULL) {
+ fputs("Out of memory, could not allocate hash table entry\n", stderr);
+ exit(1);
+}
+new_entry->key = key;
+new_entry->value = value;
+new_entry->next = table->entries[hash];
+table->entries[hash] = new_entry;
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:24">24</a></p>
+</div>
+</div>
+<a name="1:32"><div class="section"><h4 class="noheading">32. </h4></a>
+<p>To remove an entry from the hash table, we hash the key, loop through the entries, and remove the entry if we find a match.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Loop Through Entries and Remove Entry if Key Matches <a href="lexer.html#1:32">32</a>}</span>
+<pre class="prettyprint lang-c">
+struct hash_table_entry *prev = NULL;
+while (entry != NULL) {
+ if (table->cmp(entry->key, key) == 0) {
+ if (prev == NULL) {
+ table->entries[hash] = entry->next;
+ } else {
+ prev->next = entry->next;
+ }
+ if (table->dtor != NULL) {
+ table->dtor(entry->key, 1);
+ table->dtor(entry->value, 0);
+ }
+ free(entry);
+ return;
+ }
+ prev = entry;
+ entry = entry->next;
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:24">24</a></p>
+</div>
+</div>
+<a name="1:33"><div class="section"><h4 class="noheading">33. </h4></a>
+<p>To find a value associated with a given key in the hash table, we hash the key, loop through the entries in its bucket, and return the value if a match is found.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Loop Through Entries and Return Value if Match <a href="lexer.html#1:33">33</a>}</span>
+<pre class="prettyprint lang-c">
+while (entry != NULL) {
+ if (table->cmp(entry->key, key) == 0) {
+ return entry->value;
+ }
+ entry = entry->next;
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:24">24</a></p>
+</div>
+</div>
+<a name="1:34"><div class="section"><h4 class="noheading">34. </h4></a>
+<p>We're now almost ready to implement <code>token_create_string</code> the right way. First, we'll need a good hash function.
+</p>
+<p>Hash functions are a very interesting topic and there's a lot of good research on them. The hash function we use should be fast, have a low collision rate, and be able to handle strings of any length.
+</p>
+<p>We can't just sum the characters in a string, because that would mean that "stop" and "pots" would have the same hash. Multiplying has the same problem. If we take each to the power of its position in the string, we get a better distribution, but it's still awful.
+</p>
+<p>Using a simple Python program, I brute-forced all possible 4-character strings and ran our power-hash function on them. The result showed that the 456976 possible strings produced only 376 distinct hash values. That's a collision rate of about 99.92%!
+</p>
+<p>Instead of trying to come up with a new hash function, we can use one that's been well-tested and is known to work well.
+</p>
+<p>The first time I wrote this, I used the hash function from the 'Red Dragon Book' (Compilers: Principles, Techniques, and Tools).
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Hash Function <a href="lexer.html#1:34">34</a>}</span>
+<pre class="prettyprint lang-c">
+static unsigned long hash_string(void *key) {
+  unsigned long hash = 0, g;
+  char *p = key;
+  while (*p) {
+    hash = (hash << 4) + *p++;
+    if ((g = hash & 0xf0000000) != 0) {
+      hash ^= g >> 24;
+      hash ^= g;
+    }
+  }
+  return hash;
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:37">37</a></p>
+</div>
+<p>This does more work per character than it needs to. We can do better. Let's use its descendant, the ELF hash, in the unrolled form found in glibc's dynamic linker.
+</p>
+<p>As you can see in the code below, this function avoids extra operations and should be much faster.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Hash Function <a href="lexer.html#1:34">34</a>} :=</span>
+<pre class="prettyprint lang-c">
+/* ELF hash in the unrolled style of glibc's _dl_elf_hash.  The first
+   four characters are folded in directly (their high nibble can never
+   overflow a 32-bit accumulator), so only strings of five or more
+   characters need the high-bit correction loop. */
+static unsigned int hash_string(void *key) {
+  unsigned long hash = 0, hi = 0;
+  char *p = key;
+  hash = *p;
+  if (hash != 0 && p[1] != 0) {
+    hash = (hash << 4) + p[1];
+    if (p[2] != 0) {
+      hash = (hash << 4) + p[2];
+      if (p[3] != 0) {
+        hash = (hash << 4) + p[3];
+        if (p[4] != 0) {
+          hash = (hash << 4) + p[4];
+          p += 5;
+          while (*p != 0) {
+            hash = (hash << 4) + *p++;
+            /* Fold the top nibble back into the low bits so long
+               strings keep mixing instead of shifting data out. */
+            hi = hash & 0xf0000000l;
+            hash ^= hi >> 24;
+          }
+          hash &= 0x0fffffffl;
+        }
+      }
+    }
+  }
+  return hash;
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:37">37</a></p>
+</div>
+</div>
+<a name="1:35"><div class="section"><h4 class="noheading">35. </h4></a>
+<p>We also need a comparison function for strings.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{String Comparison <a href="lexer.html#1:35">35</a>}</span>
+<pre class="prettyprint lang-c">
+/* strcmp adapter matching the hash_table_cmp_fn signature; returns 0
+   when the two keys are equal strings. */
+static int cmp_string(void *key1, void *key2) {
+  return strcmp((char *)key1, (char *)key2);
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:37">37</a></p>
+</div>
+</div>
+<a name="1:36"><div class="section"><h4 class="noheading">36. </h4></a>
+<p>Finally, we'll need a destructor for entries.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{String Destructor <a href="lexer.html#1:36">36</a>}</span>
+<pre class="prettyprint lang-c">
+/* Entry destructor for the string table.  Each entry stores the same
+   pointer as both key and value, so it must be freed exactly once --
+   on the key callback -- to avoid a double free. */
+static void dtor_string(void *value, int is_key) {
+  if (is_key) {
+    free(value); // Since the key and value are the same, we only need to free once.
+  }
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:37">37</a></p>
+</div>
+</div>
+<a name="1:37"><div class="section"><h4 class="noheading">37. </h4></a>
+<p>Now we can implement <code>token_create_string</code> the right way.
+</p>
+<p>You might notice that we're using the same key and value. This way of using a hash table is normally called a set. We're using it to store strings, but we could use it to store anything we want to deduplicate.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Token Create String <a href="lexer.html#1:17">17</a>} :=</span>
+<pre class="prettyprint lang-c">
+<span class="nocode pln">{String Comparison, <a href="lexer.html#1:35">35</a>}</span>
+<span class="nocode pln">{Hash Function, <a href="lexer.html#1:34">34</a>}</span>
+<span class="nocode pln">{String Destructor, <a href="lexer.html#1:36">36</a>}</span>
+hash_table_t *string_table;
+/* Create a string/identifier token whose text is interned in the global
+   string table, so identical spellings share one heap allocation.  The
+   table is created lazily on first use and owns the interned strings. */
+token_t *token_create_string(c_token_types kind, int lin, int col,
+                             const char *s, int len) {
+  if (string_table == NULL) {
+    string_table = hash_table_create(2048, cmp_string, hash_string, dtor_string);
+  }
+  token_t *token = token_data_create(kind, lin, col, len);
+  char *key = hash_table_get(string_table, (void *)s);
+  if (key == NULL) {
+    key = strdup(s);
+    /* strdup can fail; follow the project's abort-on-OOM policy instead
+       of storing a NULL key in the table. */
+    if (key == NULL) {
+      fputs("Out of memory, could not intern string\n", stderr);
+      exit(1);
+    }
+    hash_table_put(string_table, key, key);
+  }
+  token_data(token)->data.s = key;
+  return token;
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p>
+</div>
+</div>
+<a name="1:38"><div class="section"><h4 class="noheading">38. </h4></a>
+<p>We'll add an external declaration for <code>string_table</code> in <code>token.h</code> so other programs can take advantage of it.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{<strong>token.h</strong> <a href="lexer.html#1:11">11</a>} :=</span>
+<pre class="prettyprint lang-c">
+#ifndef TOKEN_H
+#define TOKEN_H
+#include <stdint.h> // We use this for int64_t
+#include "hash_table.h" // We need this for the string table
+<span class="nocode pln">{Token Types, <a href="lexer.html#1:10">10</a>}</span>
+<span class="nocode pln">{Opaque Token Type, <a href="lexer.html#1:7">7</a>}</span>
+<span class="nocode pln">{Token Creation and Destruction Interface, <a href="lexer.html#1:8">8</a>}</span>
+<span class="nocode pln">{Token Interface, <a href="lexer.html#1:9">9</a>}</span>
+extern hash_table_t *string_table;
+extern int column;
+extern int line;
+#endif
+</pre>
+
+
+
+</div>
+</div>
+<a name="1:39"><div class="section"><h4 class="noheading">39. </h4></a>
+<p>Finally, we implement the token data structure in <code>token.c</code>.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{<strong>token.c</strong> <a href="lexer.html#1:39">39</a>}</span>
+<pre class="prettyprint lang-c">
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include "token.h"
+#include "hash_table.h"
+<span class="nocode pln">{Token Data Structure, <a href="lexer.html#1:13">13</a>}</span>
+<span class="nocode pln">{Token Data Access, <a href="lexer.html#1:14">14</a>}</span>
+<span class="nocode pln">{Token Creation and Destruction, <a href="lexer.html#1:16">16</a>}</span>
+<span class="nocode pln">{Token Create String, <a href="lexer.html#1:17">17</a>}</span>
+<span class="nocode pln">{Token Debugging, <a href="lexer.html#1:15">15</a>}</span>
+</pre>
+
+
+
+</div>
+</div>
+<a name="1:40"><div class="section"><h4>40. Input</h4></a>
+<p>Input will provide a simple interface for reading characters from a file. The stream itself is deliberately hidden from the tokenizer, so that the tokenizer doesn't have to worry about buffering or anything like that.
+</p>
+
+</div>
+<a name="1:41"><div class="section"><h4 class="noheading">41. </h4></a>
+
+<div class="codeblock">
+<span class="codeblock_name">{Input Interface <a href="lexer.html#1:41">41</a>}</span>
+<pre class="prettyprint lang-c">
+void input_init(const char *filename);
+int input_getc(void);
+void input_ungetc(int c);
+void input_destroy(void);
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:50">50</a></p>
+</div>
+<p>When the program wants to start reading a file, it calls <code>input_init</code> with the filename. It can then call <code>input_getc</code> to get the next character in the file. If there's no more input, <code>input_getc</code> will return <code>EOF</code>.
+</p>
+<p>There's also an <code>input_ungetc</code> function, which allows the program to put a character back into the stream. I'll only allow a small, fixed number of characters (eight) to be put back, but that should be enough for the tokenizer.
+</p>
+<p>Finally, when the program is done reading the file, it should call <code>input_destroy</code> to clean up.
+</p>
+
+</div>
+<a name="1:42"><div class="section"><h4>42. Input Design Decisions</h4></a>
+<p>Per rule 1, we're trying to keep memory usage low. That means that instead of reading the entire file into memory, we'll need to read it in chunks. There are a couple of choices for how to do this:
+</p>
+<ol>
+<li><p><strong>Read a line at a time</strong>: This is a more natural approach, but it has two drawbacks. First, it requires a large buffer to store the line (<code>BUFSIZ</code> is implementation-defined; glibc sets it to 8192 bytes). Second, if the line is longer than the buffer, we'll have to read the line in chunks anyway.
+</p>
+</li>
+<li><p><strong>Choose some arbitrary buffer size and read that many bytes at a time</strong>: This is the approach I'm going to take. It's a little less natural, but it's more memory efficient.
+</p>
+</li>
+</ol>
+<p>Input will read chunks of 128 bytes at a time, reusing the same static buffer. This limitation is not visible to the tokenizer, which will only see the <code>input_getc</code> interface.
+</p>
+<p>When the buffer is exhausted, <code>input_getc</code> will call <code>nextline</code>, which will read the next chunk of the file.
+</p>
+
+</div>
+<a name="1:43"><div class="section"><h4>43. Input Implementation</h4></a>
+<p>The implementation of the input module is pretty straightforward. We have the following data structures and defines as globals:
+</p>
+
+</div>
+<a name="1:44"><div class="section"><h4 class="noheading">44. </h4></a>
+
+<div class="codeblock">
+<span class="codeblock_name">{Input Data <a href="lexer.html#1:44">44</a>}</span>
+<pre class="prettyprint lang-c">
+#define CHUNK_SIZE 128             /* Bytes read from the file per fread() call. */
+static char buffer[CHUNK_SIZE];    /* Current chunk of the input file. */
+static int buffer_pos = 0;         /* Index of the next unread byte in buffer. */
+static int buffer_size = 0;        /* Number of valid bytes currently in buffer. */
+static char unget_buffer_stack[8]; /* Pushed-back characters, LIFO order. */
+static int unget_buffer_stack_pos = 0; /* Count of pending pushed-back characters. */
+
+static FILE *file = NULL;          /* Stream opened by input_init(). */
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:49">49</a></p>
+</div>
+<p>When the program calls <code>input_init</code>, we open the file.
+</p>
+
+</div>
+<a name="1:45"><div class="section"><h4 class="noheading">45. </h4></a>
+
+<div class="codeblock">
+<span class="codeblock_name">{Input Initialization <a href="lexer.html#1:45">45</a>}</span>
+<pre class="prettyprint lang-c">
+/* Open `filename` for reading; prints an error and exits on failure.
+   Must be called before any input_getc() call. */
+void input_init(const char *filename) {
+  file = fopen(filename, "r");
+  if (file == NULL) {
+    fprintf(stderr, "Error: Cannot open file %s\n", filename);
+    exit(1);
+  }
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:49">49</a></p>
+</div>
+<p>When the program calls <code>input_getc</code>, we return the next character in the buffer. If the buffer is exhausted, we call <code>nextline</code>. We also track the line and column.
+</p>
+
+</div>
+<a name="1:46"><div class="section"><h4 class="noheading">46. </h4></a>
+
+<div class="codeblock">
+<span class="codeblock_name">{Input Get Character <a href="lexer.html#1:46">46</a>}</span>
+<pre class="prettyprint lang-c">
+/* Return the next input character as a non-negative value, or EOF at
+   end of file.  Characters pushed back with input_ungetc() are returned
+   first, most recently pushed first.  The chunk buffer is refilled
+   transparently when exhausted. */
+int input_getc(void) {
+  if (unget_buffer_stack_pos > 0) {
+    return unget_buffer_stack[--unget_buffer_stack_pos];
+  }
+  if (buffer_pos == buffer_size) {
+    buffer_size = fread(buffer, 1, CHUNK_SIZE, file);
+    buffer_pos = 0;
+  }
+  if (buffer_size == 0) {
+    return EOF;
+  }
+  /* Convert through unsigned char: if plain char is signed, a byte like
+     0xFF would sign-extend to -1 and be indistinguishable from EOF. */
+  return (unsigned char)buffer[buffer_pos++];
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:49">49</a></p>
+</div>
+<p>When the program calls <code>input_ungetc</code>, we save the character in the <code>unget_buffer</code>.
+</p>
+
+</div>
+<a name="1:47"><div class="section"><h4 class="noheading">47. </h4></a>
+
+<div class="codeblock">
+<span class="codeblock_name">{Input Unget Character <a href="lexer.html#1:47">47</a>}</span>
+<pre class="prettyprint lang-c">
+/* Push `c` back so the next input_getc() returns it.  The stack holds
+   at most eight characters; overflowing it is a lexer bug, so fail
+   loudly instead of writing out of bounds. */
+void input_ungetc(int c) {
+  if (unget_buffer_stack_pos >= (int)sizeof(unget_buffer_stack)) {
+    fputs("Internal error: input_ungetc stack overflow\n", stderr);
+    exit(1);
+  }
+  unget_buffer_stack[unget_buffer_stack_pos++] = c;
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:49">49</a></p>
+</div>
+<p>Since we're not using dynamic memory allocation, cleanup is pretty simple.
+</p>
+
+</div>
+<a name="1:48"><div class="section"><h4 class="noheading">48. </h4></a>
+
+<div class="codeblock">
+<span class="codeblock_name">{Input Destroy <a href="lexer.html#1:48">48</a>}</span>
+<pre class="prettyprint lang-c">
+void input_destroy(void) {
+ fclose(file);
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:49">49</a></p>
+</div>
+</div>
+<a name="1:49"><div class="section"><h4 class="noheading">49. </h4></a>
+<p>We put the whole thing together in <code>input.c</code>.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{<strong>input.c</strong> <a href="lexer.html#1:49">49</a>}</span>
+<pre class="prettyprint lang-c">
+#include <stdio.h>
+#include <stdlib.h>
+#include "input.h"
+<span class="nocode pln">{Input Data, <a href="lexer.html#1:44">44</a>}</span>
+<span class="nocode pln">{Input Initialization, <a href="lexer.html#1:45">45</a>}</span>
+<span class="nocode pln">{Input Get Character, <a href="lexer.html#1:46">46</a>}</span>
+<span class="nocode pln">{Input Unget Character, <a href="lexer.html#1:47">47</a>}</span>
+<span class="nocode pln">{Input Destroy, <a href="lexer.html#1:48">48</a>}</span>
+</pre>
+
+
+
+</div>
+</div>
+<a name="1:50"><div class="section"><h4 class="noheading">50. </h4></a>
+<p>We expose the input interface in <code>input.h</code> so other modules can take advantage of it.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{<strong>input.h</strong> <a href="lexer.html#1:50">50</a>}</span>
+<pre class="prettyprint lang-c">
+#ifndef INPUT_H
+#define INPUT_H
+<span class="nocode pln">{Input Interface, <a href="lexer.html#1:41">41</a>}</span>
+#endif
+</pre>
+
+
+
+</div>
+</div>
+<a name="1:51"><div class="section"><h4 class="noheading">51. </h4></a>
+<p>We'll implement the lexer interface in <code>tokenizer.h</code>
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{<strong>tokenizer.h</strong> <a href="lexer.html#1:51">51</a>}</span>
+<pre class="prettyprint lang-c">
+#ifndef TOKENIZER_H
+#define TOKENIZER_H
+#include "token.h"
+#include "input.h"
+<span class="nocode pln">{Tokenization Interface, <a href="lexer.html#1:52">52</a>}</span>
+#endif
+</pre>
+
+
+
+</div>
+</div>
+<a name="1:52"><div class="section"><h4 class="noheading">52. </h4></a>
+<p>The tokenization interface will have a couple of functions. <code>next_token</code> will return the next token in the input stream, <code>init_tokenizer</code> will initialize the tokenizer, and <code>destroy_tokenizer</code> will clean up.
+</p>
+<p>We'll also have some helper functions for lookahead and matching.
+</p>
+<p><code>peek_token</code> will return the next token without consuming it (it technically does advance the input stream, but it saves the token so it can be reused).
+</p>
+<p><code>consume</code> will consume the next token if it matches a given kind. If it doesn't match, it will print an error message and exit.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Tokenization Interface <a href="lexer.html#1:52">52</a>}</span>
+<pre class="prettyprint lang-c">
+void init_tokenizer(const char *filename);
+void destroy_tokenizer(void);
+token_t *next_token(void);
+void reject_token(token_t *token);
+token_t *peek_token(void);
+void consume(c_token_types kind);
+void consume_alt(c_token_types *kinds, int n);
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:51">51</a></p>
+</div>
+</div>
+<a name="1:53"><div class="section"><h4 class="noheading">53. </h4></a>
+<p>Now we can finally implement the tokenizer.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{<strong>tokenizer.c</strong> <a href="lexer.html#1:53">53</a>}</span>
+<pre class="prettyprint lang-c">
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <float.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "tokenizer.h"
+#include "token.h"
+#include "input.h"
+token_t *left_stack[8];
+int left_stack_pos = 0;
+<span class="nocode pln">{Utility Functions, <a href="lexer.html#1:54">54</a>}</span>
+<span class="nocode pln">{Tokenization Function, <a href="lexer.html#1:56">56</a>}</span>
+</pre>
+
+
+
+</div>
+</div>
+<a name="1:54"><div class="section"><h4 class="noheading">54. </h4></a>
+<p>Utility functions are everything that doesn't directly tokenize the input.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Utility Functions <a href="lexer.html#1:54">54</a>}</span>
+<pre class="prettyprint lang-c">
+/* Prepare the tokenizer to read tokens from `filename`. */
+void init_tokenizer(const char *filename) {
+  input_init(filename);
+}
+
+/* Release the tokenizer's resources (closes the input stream). */
+void destroy_tokenizer(void) {
+  input_destroy();
+}
+
+/* Push `token` back so the next next_token() call returns it.  The
+   stack holds at most eight tokens; overflowing it is a caller bug, so
+   fail loudly instead of writing past the array. */
+void reject_token(token_t *token) {
+  if (left_stack_pos >= (int)(sizeof(left_stack) / sizeof(left_stack[0]))) {
+    fputs("Internal error: reject_token stack overflow\n", stderr);
+    exit(1);
+  }
+  left_stack[left_stack_pos++] = token;
+}
+
+/* Return the next token without consuming it: reads one token and
+   immediately pushes it back onto the reject stack.
+   NOTE(review): at end of input next_token() returns NULL, which is
+   pushed and later popped back as NULL -- confirm callers handle it. */
+token_t *peek_token(void) {
+  if (left_stack_pos > 0) {
+    return left_stack[left_stack_pos - 1];
+  }
+  token_t *token = next_token();
+  reject_token(token);
+  return token;
+}
+
+<span class="nocode pln">{Stringify Type, <a href="lexer.html#1:55">55</a>}</span>
+
+/* Read the next token and require it to be of type `kind`; prints a
+   diagnostic and exits on mismatch.  The matched token is destroyed. */
+void consume(c_token_types kind) {
+  token_t *token = next_token();
+  /* next_token() returns NULL at end of input; report that explicitly
+     instead of dereferencing NULL in token_type(). */
+  if (token == NULL) {
+    fprintf(stderr, "Error: Expected token of type \"%s\", got end of input\n",
+            stringify_type(kind));
+    exit(1);
+  }
+  if (token_type(token) != kind) {
+    fprintf(stderr, "Error: Expected token of type \"%s\", got \"%s\"\n", stringify_type(kind), stringify_type(token_type(token)));
+    exit(1);
+  }
+  token_destroy(token);
+}
+
+/* Read the next token and require it to match one of the `n` types in
+   `kinds`; prints a diagnostic listing the alternatives and exits on
+   mismatch.  The matched token is destroyed. */
+void consume_alt(c_token_types *kinds, int n) {
+  token_t *token = next_token();
+  /* next_token() returns NULL at end of input; report that explicitly
+     instead of dereferencing NULL in token_type(). */
+  if (token == NULL) {
+    fprintf(stderr, "Error: Unexpected end of input\n");
+    exit(1);
+  }
+  for (int i = 0; i < n; i++) {
+    if (token_type(token) == kinds[i]) {
+      token_destroy(token);
+      return;
+    }
+  }
+  fprintf(stderr, "Error: Expected one of the following tokens: ");
+  for (int i = 0; i < n; i++) {
+    fprintf(stderr, "\"%s\" ", stringify_type(kinds[i]));
+  }
+  fprintf(stderr, "got \"%s\"\n", stringify_type(token_type(token)));
+  exit(1);
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:53">53</a></p>
+</div>
+</div>
+<a name="1:55"><div class="section"><h4 class="noheading">55. </h4></a>
+<p>We'll need a helper function to convert token types to strings. It's pretty simple, just tedious.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Stringify Type <a href="lexer.html#1:55">55</a>}</span>
+<pre class="prettyprint lang-c">
+/* Map a token type to a human-readable name for diagnostics.  Keywords
+   and operators map to their source spelling; literal classes map to a
+   descriptive phrase. */
+const char *stringify_type(c_token_types type) {
+  switch (type) {
+  case TOK_CTK_IF:
+    return "if";
+  case TOK_CTK_ELSE:
+    return "else";
+  case TOK_CTK_SWITCH:
+    return "switch";
+  case TOK_CTK_CASE:
+    return "case";
+  case TOK_CTK_DEFAULT:
+    return "default";
+  case TOK_CTK_WHILE:
+    return "while";
+  case TOK_CTK_DO:
+    return "do";
+  case TOK_CTK_FOR:
+    return "for";
+  case TOK_CTK_CONTINUE:
+    return "continue";
+  case TOK_CTK_BREAK:
+    return "break";
+  case TOK_CTK_RETURN:
+    return "return";
+  case TOK_CTK_GOTO:
+    return "goto";
+  case TOK_TK_VOID:
+    return "void";
+  case TOK_TK_CHAR:
+    return "char";
+  case TOK_TK_SHORT:
+    return "short";
+  case TOK_TK_INT:
+    return "int";
+  case TOK_TK_LONG:
+    return "long";
+  case TOK_TK_FLOAT:
+    return "float";
+  case TOK_TK_DOUBLE:
+    return "double";
+  case TOK_TK_SIGNED:
+    return "signed";
+  case TOK_TK_UNSIGNED:
+    return "unsigned";
+  case TOK_TK_STRUCT:
+    return "struct";
+  case TOK_TK_UNION:
+    return "union";
+  case TOK_TK_ENUM:
+    return "enum";
+  case TOK_TK_TYPEDEF:
+    return "typedef";
+  case TOK_SCSK_AUTO:
+    return "auto";
+  case TOK_SCSK_REGISTER:
+    return "register";
+  case TOK_SCSK_STATIC:
+    return "static";
+  case TOK_SCSK_EXTERN:
+    return "extern";
+  case TOK_SCSK_CONST:
+    return "const";
+  case TOK_SCSK_VOLATILE:
+    return "volatile";
+  case TOK_MK_SIZEOF:
+    return "sizeof";
+  case TOK_OP_ADD:
+    return "+";
+  case TOK_OP_SUB:
+    return "-";
+  case TOK_OP_MUL:
+    return "*";
+  case TOK_OP_DIV:
+    return "/";
+  case TOK_OP_MOD:
+    return "%";
+  case TOK_OP_BIT_AND:
+    return "&";
+  case TOK_OP_BIT_OR:
+    return "|";
+  case TOK_OP_BIT_XOR:
+    return "^";
+  case TOK_OP_BIT_NOT:
+    return "~";
+  case TOK_OP_LSHIFT:
+    return "<<";
+  case TOK_OP_RSHIFT:
+    return ">>";
+  case TOK_OP_NOT:
+    return "!";
+  case TOK_OP_ASSIGN:
+    return "=";
+  case TOK_OP_LT:
+    return "<";
+  case TOK_OP_GT:
+    return ">";
+  case TOK_OP_INC:
+    return "++";
+  case TOK_OP_DEC:
+    return "--";
+  case TOK_OP_EQ:
+    return "==";
+  case TOK_OP_NE:
+    return "!=";
+  case TOK_OP_LE:
+    return "<=";
+  case TOK_OP_GE:
+    return ">=";
+  case TOK_OP_AND:
+    return "&&";
+  case TOK_OP_OR:
+    return "||";
+  case TOK_OP_MEMBER_POINTER:
+    return "->";
+  case TOK_OP_MEMBER:
+    return ".";
+  case TOK_OP_COND_DECISION:
+    return ":";
+  case TOK_OP_COND:
+    return "?";
+  case TOK_OP_ASSIGN_ADD:
+    return "+=";
+  case TOK_OP_ASSIGN_SUB:
+    return "-=";
+  case TOK_OP_ASSIGN_MUL:
+    return "*=";
+  case TOK_OP_ASSIGN_DIV:
+    return "/=";
+  case TOK_OP_ASSIGN_MOD:
+    return "%=";
+  case TOK_OP_ASSIGN_BITAND:
+    return "&=";
+  case TOK_OP_ASSIGN_BITOR:
+    return "|=";
+  case TOK_OP_ASSIGN_BITXOR:
+    return "^=";
+  case TOK_OP_ASSIGN_LSHIFT:
+    return "<<=";
+  case TOK_OP_ASSIGN_RSHIFT:
+    return ">>=";
+  case TOK_SEP_HASH:
+    return "#";
+  case TOK_ID:
+    return "identifier";
+  case TOK_CONST_INTEGER_U32:
+  case TOK_CONST_INTEGER_U64:
+  case TOK_CONST_INTEGER_S32:
+  case TOK_CONST_INTEGER_S64:
+    return "integer constant";
+  case TOK_CONST_FLOAT_32:
+  case TOK_CONST_FLOAT_64:
+    return "floating constant";
+  case TOK_CONST_CHAR:
+    return "character constant";
+  case TOK_CONST_STRING_ASCII:
+    return "string constant";
+  case TOK_SPECIAL_EOF:
+    return "EOF";
+  case TOK_SPECIAL_ERROR:
+    return "error";
+  case TOK_SEP_LEFT_PAREN:
+    return "(";
+  case TOK_SEP_RIGHT_PAREN:
+    return ")";
+  case TOK_SEP_LEFT_BRACKET:
+    return "[";
+  case TOK_SEP_RIGHT_BRACKET:
+    return "]";
+  case TOK_SEP_LEFT_BRACE:
+    return "{";
+  case TOK_SEP_RIGHT_BRACE:
+    return "}";
+  case TOK_SEP_COMMA:
+    return ",";
+  case TOK_SEP_SEMICOLON:
+    return ";";
+  case TOK_SEP_DOT:
+    return ".";
+  case TOK_SEP_ELLIPSIS:
+    return "...";
+  }
+  /* Unreachable when every enumerator is handled above; kept as a safety net. */
+  return "UNKNOWN";
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:54">54</a></p>
+</div>
+</div>
+<a name="1:56"><div class="section"><h4 class="noheading">56. </h4></a>
+<p>Now we can implement the tokenization function. The pattern is pretty simple: we call each of the tokenization functions in turn until we find a match. If we don't find a match, we print an error message and exit.
+You might wonder why skip_whitespace can return a token. This makes handling the divide operator easier as comments also start with a slash.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Tokenization Function <a href="lexer.html#1:56">56</a>}</span>
+<pre class="prettyprint lang-c">
+char file_name[1024];
+<span class="nocode pln">{Warning/Error Functions, <a href="lexer.html#1:57">57</a>}</span>
+<span class="nocode pln">{Skip Whitespace, <a href="lexer.html#1:58">58</a>}</span>
+<span class="nocode pln">{Tokenize Identifier, <a href="lexer.html#1:59">59</a>}</span>
+<span class="nocode pln">{Tokenize Number, <a href="lexer.html#1:62">62</a>}</span>
+<span class="nocode pln">{Tokenize String, <a href="lexer.html#1:71">71</a>}</span>
+<span class="nocode pln">{Tokenize Character, <a href="lexer.html#1:70">70</a>}</span>
+<span class="nocode pln">{Tokenize Operator, <a href="lexer.html#1:61">61</a>}</span>
+/* Return the next token from the input stream, or NULL at end of input.
+   Tokens pushed back with reject_token() are returned first.  Each
+   category reader returns NULL when its lookahead does not match, so
+   the readers are tried in order; skip_whitespace() comes first and may
+   itself return a token because '/' must be disambiguated between
+   comments and the divide operator there.  Unrecognized characters are
+   warned about and skipped via tail recursion. */
+token_t *next_token(void) {
+  if (left_stack_pos > 0) {
+    return left_stack[--left_stack_pos];
+  }
+  token_t *tok = skip_whitespace();
+  if (tok != NULL) {
+    return tok;
+  }
+  tok = read_identifier();
+  if (tok != NULL) {
+    return tok;
+  }
+  tok = read_number();
+  if (tok != NULL) {
+    return tok;
+  }
+  tok = read_char_constant();
+  if (tok != NULL) {
+    return tok;
+  }
+  tok = read_string_literal();
+  if (tok != NULL) {
+    return tok;
+  }
+  tok = read_operator();
+  if (tok != NULL) {
+    return tok;
+  }
+  int c = input_getc();
+  if (c == EOF) {
+    return NULL; /* End of input; callers must check for NULL. */
+  }
+  tok_warn(
+      "Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c,
+      line, column);
+  return next_token();
+}
+
+#ifdef TEST_TOKENIZER
+<span class="nocode pln">{Run Test, <a href="lexer.html#1:73">73</a>}</span>
+#endif
+
+#ifdef TEST_TOKENIZER
+<span class="nocode pln">{Run Test, <a href="lexer.html#1:73">73</a>}</span>
+#endif
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:53">53</a></p>
+</div>
+</div>
+<a name="1:57"><div class="section"><h4 class="noheading">57. </h4></a>
+<p>We'll need a couple of helper functions to skip whitespace and print warnings/errors.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Warning/Error Functions <a href="lexer.html#1:57">57</a>}</span>
+<pre class="prettyprint lang-c">
+/* Print a printf-style error message prefixed with the current file
+   name, line, and column.  Does not exit; callers decide whether to
+   abort. */
+void tok_error(const char *fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+  fprintf(stderr, "Error in file %s at line %d, column %d: ", file_name, line,
+          column);
+  vfprintf(stderr, fmt, args);
+  va_end(args);
+}
+
+/* Print a printf-style warning message prefixed with the current file
+   name, line, and column.  Tokenization continues afterwards. */
+void tok_warn(const char *fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+  fprintf(stderr, "Warning in file %s at line %d, column %d: ", file_name, line,
+          column);
+  vfprintf(stderr, fmt, args);
+  va_end(args);
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p>
+</div>
+</div>
+<a name="1:58"><div class="section"><h4 class="noheading">58. </h4></a>
+<p>The <code>skip_whitespace</code> function is pretty simple. It just skips over any comments, whitespace, and line directives.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Skip Whitespace <a href="lexer.html#1:58">58</a>}</span>
+<pre class="prettyprint lang-c">
+static token_t *skip_whitespace(void) {
+ int c;
+ while ((c = input_getc()) != EOF) {
+ if (isspace(c)) { // Whitespace
+ if (c == '\n') {
+ line++;
+ column = 1;
+ } else {
+ column++;
+ }
+ } else if (c == '#') // GCC preprocessor line control directive.
+ {
+ char buf[512];
+ int i = 0;
+ while ((c = input_getc()) != EOF && c != '\n') {
+ buf[i++] = c;
+ column++;
}
buf[i] = '\0';
- if (sscanf(buf, "%d \"%[^\"]\"", &line, file_name) == 2) {
+ if (sscanf(buf, "%d \"%[^\"]\"", &line, file_name) == 2) {
column = 1;
} else {
tok_error("Invalid #line directive\n");
@@ -852,10 +2212,10 @@ token_t *read_token(void) {
if (c == EOF) {
return NULL;
}
- } else if (c == '/') {
+ } else if (c == '/') { // Comment
c = input_getc();
if (c == '/') {
- while ((c = input_getc()) != EOF && c != '\n') {
+ while ((c = input_getc()) != EOF && c != '\n') {
column++;
}
if (c == EOF) {
@@ -863,7 +2223,7 @@ token_t *read_token(void) {
}
line++;
column = 1;
- } else if (c == '*') {
+ } else if (c == '*') { // Multiline comment
while ((c = input_getc()) != EOF) {
if (c == '*') {
c = input_getc();
@@ -880,7 +2240,7 @@ token_t *read_token(void) {
if (c == EOF) {
return NULL;
}
- } else {
+ } else { // Handled here to simplify the code.
if (c == '=')
return token_create(TOK_OP_ASSIGN_DIV, line, column, 2);
input_ungetc(c);
@@ -892,32 +2252,38 @@ token_t *read_token(void) {
}
}
return NULL;
-}</code></pre>
- </div>
- <p>Now we'll implement code to recognize keywords and identifiers. The rules for identifiers are that they must
- start with a letter or underscore and can contain letters, digits, and underscores. </p>
- <div class="code-block">
- <span class="file-name">tokenizer.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":read-identifier" href="#:read-identifier">Read
- identifier</a></em></strong></span>
- <pre class="prettyprint"><code class="">static token_t *read_identifier(void) {
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p>
+</div>
+</div>
+<a name="1:59"><div class="section"><h4 class="noheading">59. </h4></a>
+<p>The <code>read_identifier</code> function reads an identifier from the input stream. C identifiers can contain letters, digits, and underscores, but they can't start with a digit.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Tokenize Identifier <a href="lexer.html#1:59">59</a>}</span>
+<pre class="prettyprint lang-c">
+<span class="nocode pln">{Get Keyword, <a href="lexer.html#1:60">60</a>}</span>
+static token_t *read_identifier(void) {
int c;
char buf[1024];
int i = 0;
c = input_getc();
- if (!isalpha(c) && c != '_') {
+ if (!isalpha(c) && c != '_') {
input_ungetc(c);
return NULL;
}
buf[i++] = c;
while ((c = input_getc()) != EOF) {
- if (!isalnum(c) && c != '_') {
+ if (!isalnum(c) && c != '_') {
input_ungetc(c);
break;
}
buf[i++] = c;
- if (i >= 1008) {
+ if (i >= 1008) {
tok_error("Identifier too long\n");
exit(1);
}
@@ -930,81 +2296,29 @@ token_t *read_token(void) {
return token_create(kind, line, column, i);
}
return token_create_string(kind, line, column, buf, i);
-}</code></pre>
- </div>
- <p> We have to match the following keywords: </p>
- <ul>
- <li>Control Keywords: <code>if</code>, <code>else</code>, <code>switch</code>, <code>case</code>,
- <code>default</code>, <code>while</code>, <code>do</code>, <code>for</code>, <code>continue</code>,
- <code>return</code>, <code>goto</code>
- </li>
- <li>Type Keywords: <code>void</code>, <code>char</code>, <code>short</code>, <code>int</code>, <code>long</code>,
- <code>float</code>, <code>double</code>, <code>unsigned</code>, <code>struct</code>, <code>union</code>,
- <code>enum</code>, <code>typedef</code>
- </li>
- <li>Storage Class/Specifier Keywords: <code>auto</code>, <code>register</code>, <code>static</code>,
- <code>extern</code>, <code>const</code>, <code>volatile</code>
- </li>
- <li>Misc Keywords: <code>sizeof</code></li>
- </ul>
- <p>For recognizing keywords, I'll use a decision tree. This is a simple and space-efficient way to recognize
- keywords. Originally I wanted to use a trie, but those end up using a lot of memory for a small number of
- items. </p>
- <p> To build that, we first need to figure out what keywords have common prefixes. We can do that by sorting them:
- </p>
- <div class="code-block">
- <span class="file-name">N/A</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":keyword-sort" href="#:keyword-sort">Keyword
- sort</a></em></strong></span>
- <pre class="prettyprint"><code class="">auto
-break
-case
-char
-const
-continue
-default
-do
-double
-else
-enum
-extern
-float
-for
-goto
-if
-int
-long
-register
-return
-short
-signed
-sizeof
-static
-struct
-switch
-typedef
-union
-unsigned
-void
-volatile
-while</code></pre>
- </div>
- <p>From this, the organization is pretty clear. </p>
- <div class="code-block">
- <span class="file-name">N/A</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":keyword-decision-tree" href="#:keyword-decision-tree">Keyword
- decision tree</a></em></strong></span>
- <pre class="prettyprint"><code class="">c_token_types get_keyword(const char *buf, int len) {
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p>
+</div>
+</div>
+<a name="1:60"><div class="section"><h4 class="noheading">60. </h4></a>
+<p>The <code>get_keyword</code> function is a simple decision tree for identifying keywords. The code is pretty tedious, but it works.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Get Keyword <a href="lexer.html#1:60">60</a>}</span>
+<pre class="prettyprint lang-c">
+c_token_types get_keyword(const char *buf, int len) {
switch (buf[0]) {
case 'a':
- if (len == 4 && buf[1] == 'u' && buf[2] == 't' && buf[3] == 'o')
+ if (len == 4 && buf[1] == 'u' && buf[2] == 't' && buf[3] == 'o')
return TOK_SCSK_AUTO;
break;
case 'b':
- if (len == 5 && buf[1] == 'r' && buf[2] == 'e' && buf[3] == 'a' &&
+ if (len == 5 && buf[1] == 'r' && buf[2] == 'e' && buf[3] == 'a' &&
buf[4] == 'k')
return TOK_CTK_BREAK;
break;
@@ -1012,18 +2326,18 @@ while</code></pre>
case 'c':
switch (buf[1]) {
case 'a':
- if (len == 4 && buf[2] == 's' && buf[3] == 'e')
+ if (len == 4 && buf[2] == 's' && buf[3] == 'e')
return TOK_CTK_CASE;
break;
case 'h':
- if (len == 4 && buf[2] == 'a' && buf[3] == 'r')
+ if (len == 4 && buf[2] == 'a' && buf[3] == 'r')
return TOK_TK_CHAR;
break;
case 'o':
- if (len == 5 && buf[2] == 'n' && buf[3] == 's' && buf[4] == 't')
+ if (len == 5 && buf[2] == 'n' && buf[3] == 's' && buf[4] == 't')
return TOK_SCSK_CONST;
- if (len == 8 && buf[2] == 'n' && buf[3] == 't' && buf[4] == 'i' &&
- buf[5] == 'n' && buf[6] == 'u' && buf[7] == 'e')
+ if (len == 8 && buf[2] == 'n' && buf[3] == 't' && buf[4] == 'i' &&
+ buf[5] == 'n' && buf[6] == 'u' && buf[7] == 'e')
return TOK_CTK_CONTINUE;
break;
}
@@ -1032,14 +2346,14 @@ while</code></pre>
case 'd':
switch (buf[1]) {
case 'e':
- if (len == 7 && buf[2] == 'f' && buf[3] == 'a' && buf[4] == 'u' &&
- buf[5] == 'l' && buf[6] == 't')
+ if (len == 7 && buf[2] == 'f' && buf[3] == 'a' && buf[4] == 'u' &&
+ buf[5] == 'l' && buf[6] == 't')
return TOK_CTK_DEFAULT;
break;
case 'o':
- if (len == 2 && buf[2] == '\0')
+ if (len == 2 && buf[2] == '\0')
return TOK_CTK_DO;
- if (len == 6 && buf[2] == 'u' && buf[3] == 'b' && buf[4] == 'l' &&
+ if (len == 6 && buf[2] == 'u' && buf[3] == 'b' && buf[4] == 'l' &&
buf[5] == 'e')
return TOK_TK_DOUBLE;
break;
@@ -1049,15 +2363,15 @@ while</code></pre>
case 'e':
switch (buf[1]) {
case 'l':
- if (len == 4 && buf[2] == 's' && buf[3] == 'e')
+ if (len == 4 && buf[2] == 's' && buf[3] == 'e')
return TOK_CTK_ELSE;
break;
case 'n':
- if (len == 4 && buf[2] == 'u' && buf[3] == 'm')
+ if (len == 4 && buf[2] == 'u' && buf[3] == 'm')
return TOK_TK_ENUM;
break;
case 'x':
- if (len == 6 && buf[2] == 't' && buf[3] == 'e' && buf[4] == 'r' &&
+ if (len == 6 && buf[2] == 't' && buf[3] == 'e' && buf[4] == 'r' &&
buf[5] == 'n')
return TOK_SCSK_EXTERN;
break;
@@ -1067,46 +2381,46 @@ while</code></pre>
case 'f':
switch (buf[1]) {
case 'l':
- if (len == 5 && buf[2] == 'o' && buf[3] == 'a' && buf[4] == 't')
+ if (len == 5 && buf[2] == 'o' && buf[3] == 'a' && buf[4] == 't')
return TOK_TK_FLOAT;
break;
case 'o':
- if (len == 3 && buf[2] == 'r')
+ if (len == 3 && buf[2] == 'r')
return TOK_CTK_FOR;
break;
}
break;
case 'g':
- if (len == 4 && buf[1] == 'o' && buf[2] == 't' && buf[3] == 'o')
+ if (len == 4 && buf[1] == 'o' && buf[2] == 't' && buf[3] == 'o')
return TOK_CTK_GOTO;
break;
case 'i':
switch (buf[1]) {
case 'f':
- if (len == 2 && buf[2] == '\0')
+ if (len == 2 && buf[2] == '\0')
return TOK_CTK_IF;
break;
case 'n':
- if (len == 3 && buf[2] == 't')
+ if (len == 3 && buf[2] == 't')
return TOK_TK_INT;
break;
}
break;
case 'l':
- if (len == 4 && buf[1] == 'o' && buf[2] == 'n' && buf[3] == 'g')
+ if (len == 4 && buf[1] == 'o' && buf[2] == 'n' && buf[3] == 'g')
return TOK_TK_LONG;
break;
case 'r':
switch (buf[1]) {
case 'e':
- if (len == 8 && buf[2] == 'g' && buf[3] == 'i' && buf[4] == 's' &&
- buf[5] == 't' && buf[6] == 'e' && buf[7] == 'r')
+ if (len == 8 && buf[2] == 'g' && buf[3] == 'i' && buf[4] == 's' &&
+ buf[5] == 't' && buf[6] == 'e' && buf[7] == 'r')
return TOK_SCSK_REGISTER;
- if (len == 6 && buf[2] == 't' && buf[3] == 'u' && buf[4] == 'r' &&
+ if (len == 6 && buf[2] == 't' && buf[3] == 'u' && buf[4] == 'r' &&
buf[5] == 'n')
return TOK_CTK_RETURN;
break;
@@ -1116,28 +2430,28 @@ while</code></pre>
case 's':
switch (buf[1]) {
case 'h':
- if (len == 5 && buf[2] == 'o' && buf[3] == 'r' && buf[4] == 't')
+ if (len == 5 && buf[2] == 'o' && buf[3] == 'r' && buf[4] == 't')
return TOK_TK_SHORT;
break;
case 't':
- if (len == 6 && buf[2] == 'a' && buf[3] == 't' && buf[4] == 'i' &&
+ if (len == 6 && buf[2] == 'a' && buf[3] == 't' && buf[4] == 'i' &&
buf[5] == 'c')
return TOK_SCSK_STATIC;
break;
case 'i':
- if (len == 6 && buf[2] == 'g' && buf[3] == 'n' && buf[4] == 'e' &&
+ if (len == 6 && buf[2] == 'g' && buf[3] == 'n' && buf[4] == 'e' &&
buf[5] == 'd')
return TOK_TK_SIGNED;
- if (len == 6 && buf[2] == 'z' && buf[3] == 'e' && buf[4] == 'o' &&
+ if (len == 6 && buf[2] == 'z' && buf[3] == 'e' && buf[4] == 'o' &&
buf[5] == 'f')
return TOK_MK_SIZEOF;
break;
case 'r':
- if (len == 6 && buf[2] == 'u' && buf[3] == 'c' && buf[4] == 't')
+ if (len == 6 && buf[2] == 'u' && buf[3] == 'c' && buf[4] == 't')
return TOK_TK_STRUCT;
break;
case 'w':
- if (len == 6 && buf[2] == 'i' && buf[3] == 't' && buf[4] == 'c' &&
+ if (len == 6 && buf[2] == 'i' && buf[3] == 't' && buf[4] == 'c' &&
buf[5] == 'h')
return TOK_CTK_SWITCH;
break;
@@ -1145,18 +2459,18 @@ while</code></pre>
break;
case 't':
- if (len == 7 && buf[1] == 'y' && buf[2] == 'p' && buf[3] == 'e' &&
- buf[4] == 'd' && buf[5] == 'e' && buf[6] == 'f')
+ if (len == 7 && buf[1] == 'y' && buf[2] == 'p' && buf[3] == 'e' &&
+ buf[4] == 'd' && buf[5] == 'e' && buf[6] == 'f')
return TOK_TK_TYPEDEF;
break;
case 'u':
switch (buf[1]) {
case 'n':
- if (len == 5 && buf[2] == 'i' && buf[3] == 'o' && buf[4] == 'n')
+ if (len == 5 && buf[2] == 'i' && buf[3] == 'o' && buf[4] == 'n')
return TOK_TK_UNION;
- if (len == 8 && buf[2] == 's' && buf[3] == 'i' && buf[4] == 'g' &&
- buf[5] == 'n' && buf[6] == 'e' && buf[7] == 'd')
+ if (len == 8 && buf[2] == 's' && buf[3] == 'i' && buf[4] == 'g' &&
+ buf[5] == 'n' && buf[6] == 'e' && buf[7] == 'd')
return TOK_TK_UNSIGNED;
break;
}
@@ -1165,17 +2479,17 @@ while</code></pre>
case 'v':
switch (buf[1]) {
case 'o':
- if (len == 4 && buf[2] == 'i' && buf[3] == 'd')
+ if (len == 4 && buf[2] == 'i' && buf[3] == 'd')
return TOK_TK_VOID;
- if (len == 8 && buf[2] == 'l' && buf[3] == 'a' && buf[4] == 't' &&
- buf[5] == 'i' && buf[6] == 'l' && buf[7] == 'e')
+ if (len == 8 && buf[2] == 'l' && buf[3] == 'a' && buf[4] == 't' &&
+ buf[5] == 'i' && buf[6] == 'l' && buf[7] == 'e')
return TOK_SCSK_VOLATILE;
break;
}
break;
case 'w':
- if (len == 5 && buf[1] == 'h' && buf[2] == 'i' && buf[3] == 'l' &&
+ if (len == 5 && buf[1] == 'h' && buf[2] == 'i' && buf[3] == 'l' &&
buf[4] == 'e')
return TOK_CTK_WHILE;
break;
@@ -1184,71 +2498,22 @@ while</code></pre>
return TOK_ID;
}
return TOK_ID;
-}</code></pre>
- </div>
- <p> It's very tedious, but it performs pretty well. You might wonder why I didn't use strcmp, and the answer is that
- this approach turned out faster in testing, plus, considering I want this to be self-hosting, I can't rely on
- optimizations in the standard library. </p>
- <p>We'll use the same approach for recognizing operators. Let's list both out (Table 2-3 in the C
- Reference Manual). </p>
- <div class="code-block">
- <span class="file-name">N/A</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":operator" href="#:operators">Operators</a></em></strong></span>
- <pre class="prettyprint"><code class="">!
-!=
-%
-%=
-&
-&&
-&=
-(
-)
-*
-*=
-+
-++
-+=
-,
--
---
--=
-->
-.
-...
-/
-/=
-:
-;
-<
-<<
-<<=
-<=
-=
-==
->
->=
->>
->>=
-?
-[
-]
-^
-^=
-{
-|
-|=
-||
-}
-~</code></pre>
- </div>
- <p>And the decision tree for operators: </p>
- <div class="code-block">
- <span class="file-name">tokenizer.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":operator-decision-tree" href="#:operator-decision-tree">Operator
- decision tree</a></em></strong></span>
- <pre class="prettyprint"><code class="">token_t *read_operator(void) {
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:59">59</a></p>
+</div>
+</div>
+<a name="1:61"><div class="section"><h4 class="noheading">61. </h4></a>
+<p>The <code>read_operator</code> function works similarly to the <code>read_identifier</code> function. It uses a decision tree to identify operators.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Tokenize Operator <a href="lexer.html#1:61">61</a>}</span>
+<pre class="prettyprint lang-c">
+
+token_t *read_operator(void) {
int c;
c = input_getc();
switch (c) {
@@ -1266,9 +2531,9 @@ while</code></pre>
input_ungetc(c);
return token_create(TOK_OP_MOD, line, column, 1);
}
- case '&': {
+ case '&': {
c = input_getc();
- if (c == '&')
+ if (c == '&')
return token_create(TOK_OP_AND, line, column, 2);
if (c == '=')
return token_create(TOK_OP_ASSIGN_BITAND, line, column, 2);
@@ -1303,7 +2568,7 @@ while</code></pre>
return token_create(TOK_OP_DEC, line, column, 2);
if (c == '=')
return token_create(TOK_OP_ASSIGN_SUB, line, column, 2);
- if (c == '>')
+ if (c == '>')
return token_create(TOK_OP_MEMBER_POINTER, line, column, 2);
input_ungetc(c);
return token_create(TOK_OP_SUB, line, column, 1);
@@ -1334,9 +2599,9 @@ while</code></pre>
return token_create(TOK_OP_COND_DECISION, line, column, 1);
case ';':
return token_create(TOK_SEP_SEMICOLON, line, column, 1);
- case '<': {
+ case '<': {
c = input_getc();
- if (c == '<') {
+ if (c == '<') {
c = input_getc();
if (c == '=')
return token_create(TOK_OP_ASSIGN_LSHIFT, line, column, 3);
@@ -1355,9 +2620,9 @@ while</code></pre>
input_ungetc(c);
return token_create(TOK_OP_ASSIGN, line, column, 1);
}
- case '>': {
+ case '>': {
c = input_getc();
- if (c == '>') {
+ if (c == '>') {
c = input_getc();
if (c == '=')
return token_create(TOK_OP_ASSIGN_RSHIFT, line, column, 3);
@@ -1401,84 +2666,59 @@ while</code></pre>
input_ungetc(c);
return NULL;
}
+
+ return NULL;
}
-</code></pre>
- </div>
- <p>Now we can implement the functions for reading numbers. C recognizes the following grammar for numbers (if you
- haven't seen a formal grammar before, you read '::=' as "can be", '|' as "or", and '[]' as "optional", so an
- integer constant can be a decimal constant with an optional suffix, an octal constant with an optional suffix, or
- a hexadecimal constant with an optional suffix): </p>
- <div
- style="color: #333; background-color: #f8f8f8; border: 1px solid #ccc; border-radius: 4px; margin: 20px 0; padding: 20px;">
- <pre class="prettyprint"><code class="">constant ::= integer-constant | floating-constant
-
- integer-constant ::= decimal-constant [integer-suffix]
- | octal-constant [integer-suffix]
- | hexadecimal-constant [integer-suffix]
-
- decimal-constant ::= nonzero-digit [digit-sequence] | '0'
-
- octal-constant ::= '0' octal-digit-sequence
-
- hexadecimal-constant ::= '0x' hexadecimal-digit-sequence
- | '0X' hexadecimal-digit-sequence
-
- integer-suffix ::= 'u' | 'U' | 'l' | 'L'
- | 'ul' | 'UL' | 'lu' | 'LU'
-
- floating-constant ::= fractional-constant [exponent-part] [floating-suffix]
- | digit-sequence exponent-part [floating-suffix]
-
- fractional-constant ::= digit-sequence '.' [digit-sequence]
- | '.' digit-sequence
-
- exponent-part ::= 'e' [sign] digit-sequence
- | 'E' [sign] digit-sequence
-
- sign ::= '+' | '-'
-
- floating-suffix ::= 'f' | 'F'
-
- digit-sequence ::= digit [digit-sequence]
-
- nonzero-digit ::= '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
-
- octal-digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7'
-
- hexadecimal-digit ::= digit
- | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'
- | 'A' | 'B' | 'C' | 'D' | 'E' | 'F'
-
- digit ::= '0' | nonzero-digit
-</code></pre>
- </div>
- <p>Where the following semantics apply: </p>
- <ul>
- <li>Integer constants are evaluated according to their radix, where <code>decimal-constant</code> is base 10,
- <code>octal-constant</code> is base 8, and <code>hexadecimal-constant</code> is base 16.
- </li>
- <li>Integer suffixes are used to specify the type of the constant. A suffix of <code>u</code> or <code>U</code>
- specifies an unsigned integer, and a suffix of <code>l</code> or <code>L</code> specifies a long integer. </li>
- <li>Suffixes can be combined, so <code>ul</code> specifies an unsigned long integer. </li>
- <li>Floating constants are evaluated as double precision unless a suffix of <code>f</code> or <code>F</code> is
- used, in which case they are evaluated as single precision. </li>
- </ul>
- <p> Looking at these rules, we can see that all valid integer constants start with a digit from 0-9 or a period.
- Since periods are also used for struct members, we'll have to handle that as well. </p>
- <p> The code for reading numbers is pretty complex, so I'll comment it heavily. </p>
- <div class="code-block">
- <span class="file-name">tokenizer.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":read-number" href="#:read-number">Read
- number</a></em></strong></span>
- <pre class="prettyprint"><code class="">
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p>
+</div>
+</div>
+<a name="1:62"><div class="section"><h4 class="noheading">62. </h4></a>
+<p>The <code>read_number</code> function reads a number from the input stream. It can be an integer or a floating-point number.
+</p>
+<p>I've broken it up a bit to make it easier to read.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Tokenize Number <a href="lexer.html#1:62">62</a>}</span>
+<pre class="prettyprint lang-c">
static token_t *read_number(void) {
int c;
char buf[1024];
int i = 0;
c = input_getc();
- // If we don't have a digit or decimal point, it's not a number
- if (!isdigit(c) && c != '.') {
+<span class="nocode pln"> {Check for valid prefix, <a href="lexer.html#1:63">63</a>}</span>
+ int radix = 10;
+<span class="nocode pln"> {Process Radix, <a href="lexer.html#1:64">64</a>}</span>
+ int is_float = 0;
+<span class="nocode pln"> {Read Number Loop, <a href="lexer.html#1:65">65</a>}</span>
+ buf[i] = '\0';
+<span class="nocode pln"> {Process Suffixes, <a href="lexer.html#1:66">66</a>}</span>
+<span class="nocode pln"> {Check for conflicting suffixes, <a href="lexer.html#1:67">67</a>}</span>
+ if (is_float) {
+<span class="nocode pln"> {Convert to float, <a href="lexer.html#1:68">68</a>}</span>
+ } else {
+<span class="nocode pln"> {Convert to integer, <a href="lexer.html#1:69">69</a>}</span>
+ }
+ return NULL;
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p>
+</div>
+</div>
+<a name="1:63"><div class="section"><h4 class="noheading">63. </h4></a>
+<p>To determine if a character is a valid prefix for a number, we need to check if it's a digit or a period followed by a digit.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Check for valid prefix <a href="lexer.html#1:63">63</a>}</span>
+<pre class="prettyprint lang-c">
+ // If we don't have a digit or decimal point, it's not a number
+ if (!isdigit(c) && c != '.') {
input_ungetc(c);
return NULL;
}
@@ -1491,8 +2731,19 @@ static token_t *read_number(void) {
}
input_ungetc(cnext);
}
- int radix = 10;
- int is_float = 0;
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:62">62</a></p>
+</div>
+</div>
+<a name="1:64"><div class="section"><h4 class="noheading">64. </h4></a>
+<p>A C constant starting with a zero is either an octal or hexadecimal constant. We need to check the next character to determine which one it is.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Process Radix <a href="lexer.html#1:64">64</a>}</span>
+<pre class="prettyprint lang-c">
// Check for hex and octal.
if (c == '0') {
char cnext = input_getc();
@@ -1508,21 +2759,31 @@ static token_t *read_number(void) {
// Decimal, append the first digit
buf[i++] = c;
}
- // Read the rest of the number
- while ((c = input_getc()) != EOF) {
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:62">62</a></p>
+</div>
+</div>
+<a name="1:65"><div class="section"><h4 class="noheading">65. </h4></a>
+
+<div class="codeblock">
+<span class="codeblock_name">{Read Number Loop <a href="lexer.html#1:65">65</a>}</span>
+<pre class="prettyprint lang-c">
+ while ((c = input_getc()) != EOF) {
// Since there can be multiple writes to the buffer, we want to make sure we
// don't overflow by giving a 4 byte pad
- if (i > 1020) {
+ if (i > 1020) {
tok_error("Number too long\n");
return NULL;
}
// Valid digits for the radix: 0-9 for decimal, 0-7 for octal, 0-9 and
// a-f/A-F for hex
- if ((radix == 10 && isdigit(c)) || (radix == 16 && isxdigit(c)) ||
- (radix == 8 && c >= '0' && c <= '7')) {
+ if ((radix == 10 && isdigit(c)) || (radix == 16 && isxdigit(c)) ||
+ (radix == 8 && c >= '0' && c <= '7')) {
buf[i++] = c;
// Decimal point and not a float yet, must be a float
- } else if (c == '.' && !is_float) {
+ } else if (c == '.' && !is_float) {
is_float = 1;
if (radix != 10) {
tok_error("Invalid floating point constant, expected decimal, got %s\n",
@@ -1554,12 +2815,22 @@ static token_t *read_number(void) {
break;
}
}
- buf[i] = '\0';
- // Check for suffixes
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:62">62</a></p>
+</div>
+</div>
+<a name="1:66"><div class="section"><h4 class="noheading">66. </h4></a>
+<p>C constants can have suffixes to indicate their type. We need to check for these suffixes and set the appropriate flags.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Process Suffixes <a href="lexer.html#1:66">66</a>}</span>
+<pre class="prettyprint lang-c">
int is_unsigned = 0;
int is_long = 0;
int is_single = 0;
- // Loop to get all possible suffixes. Warn when duplicated.
while (1) {
c = input_getc();
if (c == 'u' || c == 'U') {
@@ -1585,25 +2856,47 @@ static token_t *read_number(void) {
break;
}
}
- // Resolve invalid suffixes. Doesn't error because you can still compile with
- // them.
- if (is_single && is_long) {
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:62">62</a></p>
+</div>
+</div>
+<a name="1:67"><div class="section"><h4 class="noheading">67. </h4></a>
+<p>If we find conflicting suffixes, we print a warning and ignore the offending suffix.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Check for conflicting suffixes <a href="lexer.html#1:67">67</a>}</span>
+<pre class="prettyprint lang-c">
+ if (is_single && is_long) {
tok_warn("Warning: Invalid suffixes 'l' and 'f' for floating point "
"constant. Ignoring 'l'\n");
is_long = 0;
}
- if (is_single && is_unsigned) {
+ if (is_single && is_unsigned) {
tok_warn("Warning: Invalid suffixes 'u' and 'f' for floating point "
"constant. Ignoring 'u'\n");
is_unsigned = 0;
}
- if (is_single && !is_float) {
+ if (is_single && !is_float) {
tok_warn(
"Warning: Invalid suffix 'f' for integer constant. Ignoring 'f'\n");
is_single = 0;
}
- // use the strtox functions to convert the string to a number
- if (is_float) {
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:62">62</a></p>
+</div>
+</div>
+<a name="1:68"><div class="section"><h4 class="noheading">68. </h4></a>
+<p>If the constant is a floating-point number, we convert it to a floating-point value. We need to make sure that the number is in range for the given type and check for errors from strtod.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Convert to float <a href="lexer.html#1:68">68</a>}</span>
+<pre class="prettyprint lang-c">
errno = 0;
// Strtod generates a unix-style error when it's given something out of
// range, so we want to get on top of that quickly instead of ignoring it
@@ -1615,12 +2908,12 @@ static token_t *read_number(void) {
}
// Warn if the constant is out of range for a float, I.E it's too big or too
// small
- if (is_single && (f < FLT_MIN || f > FLT_MAX)) {
+ if (is_single && (f < FLT_MIN || f > FLT_MAX)) {
tok_warn(
"Warning: Floating point constant %f is out of range for float\n", f);
}
// Warn if the constant is too precise for a float
- if (is_single && fabs((double)((float)f) - f) >= FLT_EPSILON) {
+ if (is_single && fabs((double)((float)f) - f) >= FLT_EPSILON) {
tok_warn("Warning: Converting double precision floating point constant "
"%f to float loses "
"precision\n",
@@ -1629,7 +2922,19 @@ static token_t *read_number(void) {
return token_create_float(is_single ? TOK_CONST_FLOAT_32
: TOK_CONST_FLOAT_64,
line, column, f, i);
- } else {
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:62">62</a></p>
+</div>
+</div>
+<a name="1:69"><div class="section"><h4 class="noheading">69. </h4></a>
+<p>If the constant is an integer, we convert it to an integer. We need to make sure that the number is in range for the given type and check for errors from strtoull.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Convert to integer <a href="lexer.html#1:69">69</a>}</span>
+<pre class="prettyprint lang-c">
errno = 0;
uint64_t int_ = strtoull(buf, NULL, radix);
// Same as above, but for integers
@@ -1641,7 +2946,7 @@ static token_t *read_number(void) {
if (is_long) {
return token_create_int(TOK_CONST_INTEGER_U64, line, column, int_, i);
} else {
- if (int_ > UINT32_MAX) {
+ if (int_ > UINT32_MAX) {
tok_warn(
"Warning: Integer constant %lld is out of range for unsigned "
"int\n",
@@ -1653,113 +2958,34 @@ static token_t *read_number(void) {
if (is_long) {
// If the highest bit is set, that means this will overflow a signed
// long (Due to two's complement)
- if (int_ & (1UL << 63)) {
+ if (int_ & (1UL << 63)) {
tok_warn(
"Warning: Integer constant %lld is out of range for long long\n",
i);
}
return token_create_int(TOK_CONST_INTEGER_S64, line, column, int_, i);
} else {
- if (int_ & (1UL << 31)) {
+ if (int_ & (1UL << 31)) {
tok_warn("Warning: Integer constant %lld is out of range for int\n",
int_);
}
return token_create_int(TOK_CONST_INTEGER_S32, line, column, int_, i);
}
}
- }
- return NULL;
-}</code></pre>
- </div>
- <p>Now let's implement char constants. The rules for those are pretty simple, as I'm not doing multi-byte or wide
- characters. </p>
- <div
- style="color: #333; background-color: #f8f8f8; border: 1px solid #ccc; border-radius: 4px; margin: 20px 0; padding: 20px;">
- <pre class="prettyprint"><code class="">character-constant ::= '\'' c-char-sequence '\''
-c-char-sequence ::= c-char | c-char-sequence c-char
-c-char ::= any member of the source character set except the single-quote ', backslash \, or new-line character | escape-sequence
-
-escape-sequence ::= simple-escape-sequence | octal-escape-sequence | hexadecimal-escape-sequence
-simple-escape-sequence ::= '\' ( 'a' | 'b' | 'f' | 'n' | 'r' | 't' | 'v' | '\'' | '"' | '?' | '\')
-octal-escape-sequence ::= '\' octal-digit | '\' octal-digit octal-digit | '\' octal-digit octal-digit octal-digit
-hexadecimal-escape-sequence ::= '\' 'x' hexadecimal-digit | hexadecimal-escape-sequence hexadecimal-digit
-
-octal-digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7'
-hexadecimal-digit ::= digit | 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'A' | 'B' | 'C' | 'D' | 'E' | 'F'
-digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'</code></pre>
- </div>
- <p>We start with the code to read escape sequences (which can be reused for string literals). </p>
- <div class="code-block">
- <span class="file-name">tokenizer.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":read-esacpe-sequence" href="#:read-escape-sequence">Read
- escape sequence</a></em></strong></span>
- <pre class="prettyprint"><code class="">static char read_escape_sequence(int *len) {
- int c = input_getc();
- *len += 1;
- switch (c) {
- case 'a':
- return '\a';
- case 'b':
- return '\b';
- case 'f':
- return '\f';
- case 'n':
- return '\n';
- case 'r':
- return '\r';
- case 't':
- return '\t';
- case 'v':
- return '\v';
- case '\'':
- return '\'';
- case '"':
- return '"';
- case '?':
- return '?';
- case '\\':
- return '\\';
- case '0':
- return '\0';
- case 'x': {
- c = input_getc();
- if (!isxdigit(c)) {
- tok_error("Invalid hexadecimal escape sequence\n");
- return 0;
- }
- int val = 0;
- while (isxdigit(c)) {
- *len += 1;
- val = val * 16 + (isdigit(c) ? c - '0' : tolower(c) - 'a' + 10);
- c = input_getc();
- }
- input_ungetc(c);
- return (char)val;
- }
- default:
- if (!isdigit(c)) {
- tok_error("Invalid escape sequence\n");
- return 0;
- }
- int val = 0;
- while (isdigit(c)) {
- *len += 1;
- val = val * 8 + c - '0';
- c = input_getc();
- }
- input_ungetc(c);
- return (char)val;
- }
-}</code></pre>
- </div>
- <p>Now we can implement the code to read character constants. </p>
- <div class="code-block">
- <span class="file-name">tokenizer.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":read-char-constant" href="#:read-char-constant">Read
- character constant</a></em></strong></span>
- <pre class="prettyprint"><code class="">static token_t *read_char_constant(void) {
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:62">62</a></p>
+</div>
+</div>
+<a name="1:70"><div class="section"><h4 class="noheading">70. </h4></a>
+<p>The <code>read_char_constant</code> function reads a character constant from the input stream. It can be a single character or a multi-character escape sequence.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Tokenize Character <a href="lexer.html#1:70">70</a>}</span>
+<pre class="prettyprint lang-c">
+static token_t *read_char_constant(void) {
int c;
int len = 0;
c = input_getc();
@@ -1774,7 +3000,7 @@ digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'</code></pre>
return NULL;
}
if (c == '\\') {
- c = read_escape_sequence(&len);
+ c = read_escape_sequence(&len);
}
int val = c;
c = input_getc();
@@ -1784,24 +3010,25 @@ digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'</code></pre>
}
len++;
return token_create_char(TOK_CONST_CHAR, line, column, val, len);
-}</code></pre>
- </div>
- <p>Finally, we can implement string literals. </p>
- <p>The grammar for string literals is pretty simple. </p>
- <div
- style="color: #333; background-color: #f8f8f8; border: 1px solid #ccc; border-radius: 4px; margin: 20px 0; padding: 20px;">
- <pre
- class="prettyprint"><code class="">string-literal ::= '"' s-char-sequence '"'
-s-char-sequence ::= s-char | s-char-sequence s-char
-s-char ::= any member of the source character set except the double-quote ", backslash \, or new-line character | escape-sequence</code></pre>
- </div>
- <p>What that means for us is that we just have to skip over the string until we hit an unescaped double quote. </p>
- <div class="code-block">
- <span class="file-name">tokenizer.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":read-string-literal" href="#:read-string-literal">Read
- string literal</a></em></strong></span>
- <pre class="prettyprint"><code class="">static token_t *read_string_literal(void) {
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p>
+</div>
+</div>
+<a name="1:71"><div class="section"><h4 class="noheading">71. </h4></a>
+<p>The <code>read_string_literal</code> function reads a string literal from the input stream.
+</p>
+<p>For this function, an automatic-lifetime buffer is used to store the string until it becomes too large. At that point, a heap-allocated buffer is used.
+This way we can avoid unnecessary heap allocations for small strings.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Tokenize String <a href="lexer.html#1:71">71</a>}</span>
+<pre class="prettyprint lang-c">
+<span class="nocode pln">{Read Escape Sequence, <a href="lexer.html#1:72">72</a>}</span>
+static token_t *read_string_literal(void) {
int c;
c = input_getc();
if (c != '"') {
@@ -1809,23 +3036,22 @@ s-char ::= any member of the source character set except the double-quote ", bac
return NULL;
}
int i = 0;
- // Malloc is used for the buf here, the pointer stays function local as string
- // interning duplicates the string
char s_buf[512];
char *buf = s_buf;
int len = 512;
+ int esc_pad = 0;
while ((c = input_getc()) != EOF) {
if (c == '"') {
// Implicit skip of closing quote
break;
}
if (c == '\\') {
- c = read_escape_sequence();
+ c = read_escape_sequence(&esc_pad);
if (c == 0) {
return NULL;
}
}
- if (i >= len) {
+ if (i >= len) {
if (buf == s_buf) {
buf = malloc(1024);
if (buf == NULL) {
@@ -1850,249 +3076,195 @@ s-char ::= any member of the source character set except the double-quote ", bac
return NULL;
}
- token_t *tok =
- token_create_string(TOK_CONST_STRING_ASCII, line, column, buf, i);
+ token_t *tok = token_create_string(TOK_CONST_STRING_ASCII, line, column, buf,
+ i + esc_pad + 2);
if (buf != s_buf) {
free(buf);
}
return tok;
-}</code></pre>
- </div>
- <p>This function uses a smaller buffer on the stack to store the string, and if it grows too large, it allocates a
- larger buffer on the heap. This helps to avoid unnecessary allocations for small strings. </p>
- <p>Let's recap. We've implemented code for recognizing keywords, operators, numbers, character constants, and string
- literals. That's everything we need for the main function. We just call them all in order. </p>
- <div class="code-block">
- <span class="file-name">tokenizer.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":main-loop" href="#:main-loop">Main
- loop</a></em></strong></span>
- <pre class="prettyprint"><code class="">token_t *read_token(void) {
- if (unget_token != NULL) {
- token_t *tok = unget_token;
- unget_token = NULL;
- return tok;
- }
- token_t *tok = skip_whitespace();
- if (tok != NULL) {
- return tok;
- }
- tok = read_identifier();
- if (tok != NULL) {
- return tok;
- }
- tok = read_number();
- if (tok != NULL) {
- return tok;
- }
- tok = read_char_constant();
- if (tok != NULL) {
- return tok;
- }
- tok = read_string_literal();
- if (tok != NULL) {
- return tok;
- }
- tok = read_operator();
- if (tok != NULL) {
- return tok;
- }
- int c = input_getc();
- if (c == EOF) {
- return NULL;
- }
- tok_warn(
- "Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c,
- line, column);
- return read_token();
-}
-</code></pre>
- </div>
- <p>With that, we're basically done.</p>
- <h3>Error Handling</h3>
- <p>One thing we haven't covered yet is error handling. We've used <code>tok_error</code> and <code>tok_warn</code>
- to print errors and warnings, but we haven't actually implemented them. Let's do that now. </p>
- <div class="code-block">
- <span class="file-name">tokenizer.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":error-handling" href="#:error-handling">Error
- handling</a></em></strong></span>
- <pre class="prettyprint"><code class="">void tok_error(const char *fmt, ...) {
- va_list args;
- va_start(args, fmt);
- fprintf(stderr, "Error in file %s at line %d, column %d: ", file_name, line,
- column);
- vfprintf(stderr, fmt, args);
- va_end(args);
}
+</pre>
-void tok_warn(const char *fmt, ...) {
- va_list args;
- va_start(args, fmt);
- fprintf(stderr, "Warning in file %s at line %d, column %d: ", file_name, line,
- column);
- vfprintf(stderr, fmt, args);
- va_end(args);
-}</code></pre>
- </div>
- <h3>Debugging</h3>
- <p>It'd be helpful to have a way to print out the tokens we're reading. Let's add a function for that. </p>
- <div class="code-block">
- <span class="file-name">tokenizer.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":print-token" href="#:print-token">Print
- token</a></em></strong></span>
- <pre class="prettyprint"><code class="">char *re_escape_string(const char *str) {
- int len = strlen(str);
- char *buf = malloc(len * 2 + 1);
- if (buf == NULL) {
- fprintf(stderr, "Out of memory. Cannot escape string\n");
- exit(1);
+
+<p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p>
+</div>
+</div>
+<a name="1:72"><div class="section"><h4 class="noheading">72. </h4></a>
+<p>Escape sequences in C can either be single characters or octal/hexadecimal values. We need to handle both cases.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Read Escape Sequence <a href="lexer.html#1:72">72</a>}</span>
+<pre class="prettyprint lang-c">
+static char read_escape_sequence(int *len) {
+ int c = input_getc();
+ *len += 1;
+ switch (c) {
+ case 'a':
+ return '\a';
+ case 'b':
+ return '\b';
+ case 'f':
+ return '\f';
+ case 'n':
+ return '\n';
+ case 'r':
+ return '\r';
+ case 't':
+ return '\t';
+ case 'v':
+ return '\v';
+ case '\'':
+ return '\'';
+ case '"':
+ return '"';
+ case '?':
+ return '?';
+ case '\\':
+ return '\\';
+ case '0':
+ return '\0';
+ case 'x': {
+ c = input_getc();
+ if (!isxdigit(c)) {
+ tok_error("Invalid hexadecimal escape sequence\n");
+ return 0;
+ }
+ int val = 0;
+ while (isxdigit(c)) {
+ *len += 1;
+ val = val * 16 + (isdigit(c) ? c - '0' : tolower(c) - 'a' + 10);
+ c = input_getc();
+ }
+ input_ungetc(c);
+ return (char)val;
}
- int i = 0;
- for (int j = 0; j < len; j++) {
- switch (str[j]) {
- case '\a':
- buf[i++] = '\\';
- buf[i++] = 'a';
- break;
- case '\b':
- buf[i++] = '\\';
- buf[i++] = 'b';
- break;
- case '\f':
- buf[i++] = '\\';
- buf[i++] = 'f';
- break;
- case '\n':
- buf[i++] = '\\';
- buf[i++] = 'n';
- break;
- case '\r':
- buf[i++] = '\\';
- buf[i++] = 'r';
- break;
- case '\t':
- buf[i++] = '\\';
- buf[i++] = 't';
- break;
- case '\v':
- buf[i++] = '\\';
- buf[i++] = 'v';
- break;
- case '\\':
- buf[i++] = '\\';
- buf[i++] = '\\';
- break;
- case '\'':
- buf[i++] = '\\';
- buf[i++] = '\'';
- break;
- case '"':
- buf[i++] = '\\';
- buf[i++] = '"';
- break;
- default:
- buf[i++] = str[j];
- break;
+ default:
+ if (!isdigit(c)) {
+ tok_error("Invalid escape sequence\n");
+ return 0;
+ }
+ int val = 0;
+ while (isdigit(c)) {
+ *len += 1;
+ val = val * 8 + c - '0';
+ c = input_getc();
}
+ input_ungetc(c);
+ return (char)val;
}
- buf[i] = '\0';
- return buf;
}
+</pre>
-void print_token(token_t *tok) {
- if (tok == NULL) {
- printf("NULL\n");
- return;
- }
- const char *name = token_name_from_type(tok->kind);
- switch (tok->kind) {
- case TOK_ID:
- case TOK_CONST_STRING_ASCII: {
- char *escaped = re_escape_string(token_string(tok));
- printf("%s: \"%s\"@%d:%d\n", name, escaped, tok->line, tok->column);
- free(escaped);
- break;
- }
- case TOK_CONST_CHAR:
- printf("%s: '%c'@%d:%d\n", name, token_char(tok), tok->line, tok->column);
- break;
- case TOK_CONST_INTEGER_S32:
- case TOK_CONST_INTEGER_U32:
- case TOK_CONST_INTEGER_S64:
- case TOK_CONST_INTEGER_U64:
- printf("%s: %ld@%d:%d\n", name, token_int(tok), tok->line, tok->column);
- break;
- case TOK_CONST_FLOAT_32:
- case TOK_CONST_FLOAT_64:
- printf("%s: %f@%d:%d\n", name, token_float(tok), tok->line, tok->column);
- break;
- default:
- printf("%s@%d:%d\n", name, tok->line, tok->column);
- break;
- }
-}</code></pre>
- </div>
- <p>The name of the token is retrieved using <code>token_name_from_type</code>, which is a simple switch statement.
- It's long and boring, check the source code if you're interested. </p>
- <h3>Bugs/Errata</h3>
- <p> I wrote this code in a single sitting, so there are bound to be bugs. I'll list them here as I find them. The
- code you see here is the final version, with all bugs fixed. </p>
- <ul>
- <li> had <code>buffer_pos == buffer_size - 1</code>, left in from trying to plug some code for
- lookahead in, didn't work out, but I forgot to remove it, causes fallthrough to <code>buffer_size == 0</code>
- check which if true returns EOF, preventing input initialization. Fixed by changing to
- <code>buffer_pos == buffer_size</code>.
- </li>
- <li>assertion <code>token->kind == TOK_CONST_STRING_ASCII</code> failed in token_string. Forgot
- to expand check for identifiers which also use token_string. Fixed by changing to
- <code>token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID || token->kind == TOK_TID</code>.
- </li>
- <li> token_create_string - call to <code>hash_table_get</code> with freed key. Fixed by moving
- the call to free after
- the call to <code>hash_table_get</code>.</li>
- <li>ibid - Design of hash table and call to <code>hash_table_get</code> in token_create_string created
- double free. Fixed by
- rewriting part of function.</li>
- <li>Tokenizer missing code to handle GCC preprocessor line directives. Fixed by adding code to handle them.</li>
- <li>Destructor for string literals missing in tokenizer teardown, added it in.</li>
- <li> read_number - check <code>int_ > INT32_MAX</code> does not work due to some weird casting.
- Added explicit cast.</li>
- <li>read_char_constant - Forgot to handle '\0'. Added.</li>
- <li>skip_whitespace - When a division operator occurs in code, skip_whitespace assumes it's a comment. Fixed by
- adding a check for division operator.</li>
- <li>hash_string - Optimization, not a bug, Dragon Book hash function not very fast due to misprediction. Replaced
- with ELF hash function.</li>
- <li>read_identifier - strlen gets called 3 times even though we already get the string len by incrementing an
- array index. Ew. Used i instead of strlen.</li>
- <li>read_identifier - stringized version of keywords stored, not needed. Code added to call token_create instead
- of token_create_string for keywords.</li>
- <li>Everywhere - Checks added for memory allocation failure.</li>
- <li>Not a bug. Removed the seperate token type for TID. Will try to handle in the parser.</li>
- </ul>
- <h3>Conclusion</h3>
- <p>That's it! The C Minus tokenizer is complete. It's hopefully pretty understandable, and given the testing I've put it through, it should be pretty robust. </p>
- <p>Next time, we'll start on the parser. </p>
- <h1>Source code, biblography</h1>
- <p>Source code for the tokenizer is available <a href="/projects/cminus/code/tokenizer.c">here</a>, header file is
- available <a href="/projects/cminus/code/tokenizer.h">here</a>. </p>
- <p>Source code for the input module is available <a href="/projects/cminus/code/input.c">here</a>, header file is
- available <a href="/projects/cminus/code/input.h">here</a>. </p>
- <p>Source code for the hash table is available <a href="/projects/cminus/code/hash_table.c">here</a>, header file is
- available <a href="/projects/cminus/code/hash_table.h">here</a>. </p>
- <p>A lot of the logic for this project is from either the Dragon Book, Engineering a Compiler, or LCC: A Retargetable
- Compiler for ANSI C. Grammars are from The C Reference Manual.</p>
- <p> I got the idea for using zero-width arrays for optional content (the struct hack) from hacking on some weird virtio drivers (they seem to love it).</p>
- <p> Crafting Interpreters by Bob Nystrom inspired me to do this project, so if you see any similarities, there's probably some unintentional influence there.</p>
- <p> The code for the ELF hash function is from the glibc source code.</p>
- <p> The idea for decision trees came from LCC.</p>
- </main>
-
- <footer style=" text-align: center; padding: 20px;">
- <p>© 2021 Reagan Fischer. If for some reason you want to use my AMAZING code (lol), it's available under the MIT
+
+<p class="seealso">Used in section <a href="lexer.html#1:71">71</a></p>
+</div>
+</div>
+<a name="1:73"><div class="section"><h4 class="noheading">73. </h4></a>
+<p>Finally, I'll add some code for running the tokenizer as its own program. This way we can test it out.
+</p>
+
+<div class="codeblock">
+<span class="codeblock_name">{Run Test <a href="lexer.html#1:73">73</a>}</span>
+<pre class="prettyprint lang-c">
+char *preprocess(char *in) {
+ char *output_name = malloc(1024);
+ snprintf(output_name, 1024, "%s.preprocessed", in);
+ char *command = malloc(2048);
+ snprintf(command, 2048, "gcc -E -xc %s -o %s", in, output_name);
+ system(command);
+ free(command);
+ return output_name;
+}
+
+// Tokenize the input file
+int main(int argc, char **argv) {
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s <input.c>\n", argv[0]);
+ return 1;
+ }
+ char *input_name = argv[1];
+ char *preprocessed = preprocess(input_name);
+ init_tokenizer(preprocessed);
+ token_t *tok;
+ while ((tok = next_token()) != NULL) {
+ print_token(tok);
+ token_destroy(tok);
+ }
+ destroy_tokenizer();
+ remove(preprocessed);
+ free(preprocessed);
+ return 0;
+}
+</pre>
+
+
+<p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p>
+</div>
+<h3> Bugs/Errata</h3>
+<p>I wrote this code in a single sitting, so there are bound to be bugs. I'll list them here as I find them. The code you see here is the final version, with all bugs fixed.
+</p>
+<ul>
+<li>had <code>buffer_pos == buffer_size - 1</code>, left in from trying to plug some code for lookahead in, didn't work out, but I forgot to remove it, causes fallthrough to <code>buffer_size == 0</code> check which if true returns EOF, preventing input initialization. Fixed by changing to <code>buffer_pos == buffer_size</code>.
+</li>
+<li>assertion <code>token->kind == TOK_CONST_STRING_ASCII</code> failed in token_string. Forgot to expand check for identifiers which also use token_string. Fixed by changing to <code>token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID || token->kind == TOK_TID</code>.
+</li>
+<li>token_create_string - call to <code>hash_table_get</code> with freed key. Fixed by moving the call to free after the call to <code>hash_table_get</code>.
+</li>
+<li>ibid - Design of hash table and call to <code>hash_table_get</code> in token_create_string created double free. Fixed by rewriting part of function.
+</li>
+<li>Tokenizer missing code to handle GCC preprocessor line directives. Fixed by adding code to handle them.
+</li>
+<li>Destructor for string literals missing in tokenizer teardown, added it in.
+</li>
+<li>read_number - check <code>int_ > INT32_MAX</code> does not work due to some weird casting. Added explicit cast.
+</li>
+<li>read_char_constant - Forgot to handle '\\0'. Added.
+</li>
+<li>skip_whitespace - When a division operator occurs in code, skip_whitespace assumes it's a comment. Fixed by adding a check for division operator.
+</li>
+<li>hash_string - Optimization, not a bug, Dragon Book hash function not very fast due to misprediction. Replaced with ELF hash function.
+</li>
+<li>read_identifier - strlen gets called 3 times even though we already get the string len by incrementing an array index. Ew. Used i instead of strlen.
+</li>
+<li>read_identifier - stringized version of keywords stored, not needed. Code added to call token_create instead of token_create_string for keywords.
+</li>
+<li>Everywhere - Checks added for memory allocation failure.
+</li>
+<li>Not a bug. Removed the separate token type for TID. Will try to handle in the parser.
+</li>
+</ul>
+<h3> Conclusion</h3>
+<p>That's it! The C Minus tokenizer is complete. It's hopefully pretty understandable, and given the testing I've put it through, it should be pretty robust.
+</p>
+<p>Next time, we'll start on the parser.
+</p>
+<h1> Source code, bibliography</h1>
+<p>Source code for the tokenizer is available <a href="/projects/cminus/code/tokenizer.c">here</a>, header file is available <a href="/projects/cminus/code/tokenizer.h">here</a>.
+</p>
+<p>Source code for the input module is available <a href="/projects/cminus/code/input.c">here</a>, header file is available <a href="/projects/cminus/code/input.h">here</a>.
+</p>
+<p>Source code for the hash table is available <a href="/projects/cminus/code/hash_table.c">here</a>, header file is available <a href="/projects/cminus/code/hash_table.h">here</a>.
+</p>
+<p>Source code for the token module is available <a href="/projects/cminus/code/token.c">here</a>, header file is available <a href="/projects/cminus/code/token.h">here</a>.
+</p>
+<p>A lot of the logic for this project is from either the Dragon Book, Engineering a Compiler, or LCC: A Retargetable Compiler for ANSI C. Grammars are from The C Reference Manual.
+</p>
+<p>I got the idea for using zero-width arrays for optional content (the struct hack) from hacking on some weird virtio drivers (they seem to love it).
+</p>
+<p>Crafting Interpreters by Bob Nystrom inspired me to do this project, so if you see any similarities, there's probably some unintentional influence there.
+</p>
+<p>The code for the ELF hash function is from the glibc source code.
+</p>
+<p>The idea for decision trees came from LCC.
+</p>
+<p>Literate programming rendered using <a href="https://github.com/zyedidia/Literate">literate</a>.
+</p>
+<p> <footer style=" text-align: center; padding: 20px;">
+ <p>© 2024 Reagan Fischer. If for some reason you want to use my AMAZING code (lol), it's available under the MIT
license <a href="/projects/cminus/code/LICENSE.md">here</a>.</p>
</footer>
-</body>
+</p>
-</html>-
\ No newline at end of file
+</div>
+</body>