commit 80be860aa6798159a32a91aa6a2d5ef46680e4ee
parent c3abcec683384c5aaa5140189f4226cafed68239
Author: Reagan <rfische2@uccs.edu>
Date: Thu, 22 Aug 2024 01:24:54 -0600
Lexer
Diffstat:
8 files changed, 2794 insertions(+), 436 deletions(-)
diff --git a/blog.html b/blog.html
@@ -39,7 +39,6 @@
</nav>
</header>
- <img>
<img alt="Under Construction" src="images/page-under-construction.gif">
<br>
Coming soon...
diff --git a/projects/cminus/code/hash_table.c b/projects/cminus/code/hash_table.c
@@ -1,4 +1,5 @@
#include "hash_table.h"
+#include <stdio.h>
#include <stdlib.h>
struct hash_table {
hash_table_entry_t **entries;
@@ -29,11 +30,15 @@ hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp,
return table;
}
-void hash_table_destroy(hash_table_t *table) {
+void hash_table_destroy(hash_table_t *table, hash_table_dtor dtor) {
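+  // dtor may be NULL; otherwise it is called once per entry for the key (is_key = 1) and once for the value (is_key = 0).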
for (int i = 0; i < table->size; i++) {
hash_table_entry_t *entry = table->entries[i];
while (entry != NULL) {
hash_table_entry_t *next = entry->next;
+ if (dtor != NULL) {
+ dtor(entry->key, 1);
+ dtor(entry->value, 0);
+ }
free(entry);
entry = next;
}
@@ -69,6 +74,10 @@ int hash_table_put(hash_table_t *table, void *key, void *value, int replace) {
entry = entry->next;
}
entry = malloc(sizeof(hash_table_entry_t));
+ if (entry == NULL) {
+ fprintf(stderr, "Error: Out of memory. Could not allocate hash table entry\n");
+ exit(1);
+ }
entry->key = key;
entry->value = value;
entry->next = table->entries[hash];
@@ -120,4 +129,4 @@ int main() {
-  hash_table_destroy(table);
+  hash_table_destroy(table, NULL);
return 0;
}
-#endif-
\ No newline at end of file
+#endif
diff --git a/projects/cminus/code/hash_table.h b/projects/cminus/code/hash_table.h
@@ -3,11 +3,13 @@
typedef struct hash_table hash_table_t;
typedef struct hash_table_entry hash_table_entry_t;
typedef int (*hash_table_cmp_fn)(void *key1, void *key2);
-typedef unsigned long (*hash_table_hash_fn)(void *key);
+typedef unsigned int (*hash_table_hash_fn)(void *key);
hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp,
hash_table_hash_fn hash);
-void hash_table_destroy(hash_table_t *table);
+
+typedef void (*hash_table_dtor)(void *value, int is_key);
+void hash_table_destroy(hash_table_t *table, hash_table_dtor dtor);
void *hash_table_get(hash_table_t *table, void *key);
int hash_table_put(hash_table_t *table, void *key, void *value, int replace);
void hash_table_remove(hash_table_t *table, void *key);
-#endif-
\ No newline at end of file
+#endif
diff --git a/projects/cminus/code/input.c b/projects/cminus/code/input.c
@@ -5,7 +5,8 @@
static char buffer[CHUNK_SIZE];
static int buffer_pos = 0;
static int buffer_size = 0;
-static char unget_buffer = '\0';
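+// Small fixed-depth stack of pushed-back characters (replaces the old single-character unget buffer).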
+static char unget_buffer_stack[8];
+static int unget_buffer_stack_pos = 0;
static FILE *file = NULL;
@@ -18,10 +19,8 @@ void input_init(const char *filename) {
}
int input_getc(void) {
- if (unget_buffer != '\0') {
- char c = unget_buffer;
- unget_buffer = '\0';
- return c;
+ if (unget_buffer_stack_pos > 0) {
+ return unget_buffer_stack[--unget_buffer_stack_pos];
}
if (buffer_pos == buffer_size) {
buffer_size = fread(buffer, 1, CHUNK_SIZE, file);
@@ -30,10 +29,11 @@ int input_getc(void) {
if (buffer_size == 0) {
return EOF;
}
- return buffer[buffer_pos++];
+ char c = buffer[buffer_pos++];
+ return c;
}
-void input_ungetc(int c) { unget_buffer = c; }
+void input_ungetc(int c) { unget_buffer_stack[unget_buffer_stack_pos++] = c; }
void input_destroy(void) { fclose(file); }
@@ -52,4 +52,4 @@ int main() {
input_destroy();
return 0;
}
-#endif-
\ No newline at end of file
+#endif
diff --git a/projects/cminus/code/input.h b/projects/cminus/code/input.h
@@ -4,4 +4,4 @@ void input_init(const char *filename);
int input_getc(void);
void input_ungetc(int c);
void input_destroy(void);
-#endif-
\ No newline at end of file
+#endif
diff --git a/projects/cminus/code/tokenizer.c b/projects/cminus/code/tokenizer.c
@@ -1,18 +1,25 @@
#include "tokenizer.h"
#include "hash_table.h"
+#include "input.h"
#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <float.h>
+#include <math.h>
+#include <stdarg.h>
#include <stdint.h>
+#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Token data
-#define TOK_MAGIC_1 0x544F4B454E544F4B // "TOKENTOK"
-#define TOK_MAGIC_2 0x544F4B544F4B454E // "TOKTOKEN"
+#define TOK_MAGIC_1 0x544F4B454E544F4Bul // "TOKENTOK"
+#define TOK_MAGIC_2 0x544F4B544F4B454Eul // "TOKTOKEN"
struct token {
long magic;
int line;
int column;
short kind;
- char opt_data[0];
+ long opt_data[0];
};
typedef struct token token_t;
@@ -29,48 +36,81 @@ struct token_data {
typedef struct token_data token_data_t;
// Token creation
+int line = 1;
+int column = 1;
+char file_name[1024];
#define token_data(token) ((struct token_data *)((token)->opt_data))
-static token_t *token_data_create(c_token_types kind, int line, int column) {
+static token_t *token_data_create(c_token_types kind, int lin, int col,
+ int len) {
token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data));
+ if (token == NULL) {
+ fputs("Out of memory\n", stderr);
+ exit(1);
+ }
token->magic = TOK_MAGIC_1;
- token->line = line;
- token->column = column;
+ token->line = lin;
+ token->column = col;
+ column += len;
token->kind = kind;
return token;
}
-static token_t *token_create(c_token_types kind, int line, int column) {
+static token_t *token_create(c_token_types kind, int lin, int col, int len) {
token_t *token = malloc(sizeof(token_t));
+ if (token == NULL) {
+ fputs("Out of memory\n", stderr);
+ exit(1);
+ }
token->magic = TOK_MAGIC_2;
- token->line = line;
- token->column = column;
+ token->line = lin;
+ token->column = col;
+ column += len;
token->kind = kind;
return token;
}
-static token_t *token_create_int(c_token_types kind, int line, int column,
- int64_t i) {
- token_t *token = token_data_create(kind, line, column);
+static token_t *token_create_int(c_token_types kind, int lin, int col,
+ int64_t i, int len) {
+ token_t *token = token_data_create(kind, lin, col, len);
token_data(token)->data.i = i;
return token;
}
-static token_t *token_create_float(c_token_types kind, int line, int column,
- double f) {
- token_t *token = token_data_create(kind, line, column);
+static token_t *token_create_float(c_token_types kind, int lin, int col,
+ double f, int len) {
+ token_t *token = token_data_create(kind, lin, col, len);
token_data(token)->data.f = f;
return token;
}
-#define PRIME 211
-static unsigned long hash_string(void *key) {
- unsigned long hash = 0, g;
+static token_t *token_create_char(c_token_types kind, int lin, int col, char c,
+ int len) {
+ token_t *token = token_data_create(kind, lin, col, len);
+ token_data(token)->data.c = c;
+ return token;
+}
+
+static unsigned int hash_string(void *key) {
+ unsigned long hash = 0, hi = 0;
char *p = key;
- while (*p) {
- hash = (hash << 4) + *p++;
- if ((g = hash & 0xf0000000) != 0) {
- hash ^= g >> 24;
- hash ^= g;
+ hash = *p;
+ if (hash != 0 && p[1] != 0) {
+ hash = (hash << 4) + p[1];
+ if (p[2] != 0) {
+ hash = (hash << 4) + p[2];
+ if (p[3] != 0) {
+ hash = (hash << 4) + p[3];
+ if (p[4] != 0) {
+ hash = (hash << 4) + p[4];
+ p += 5;
+ while (*p != 0) {
+ hash = (hash << 4) + *p++;
+ hi = hash & 0xf0000000l;
+ hash ^= hi >> 24;
+ }
+ hash &= 0x0fffffffl;
+ }
+ }
}
}
return hash;
@@ -81,30 +121,22 @@ static int cmp_string(void *key1, void *key2) {
}
static hash_table_t *string_table;
-static token_t *token_create_string(c_token_types kind, int line, int column,
- const char *s) {
+static token_t *token_create_string(c_token_types kind, int lin, int col,
+ const char *s, int len) {
if (string_table == NULL) {
- string_table = hash_table_create(1024, cmp_string, hash_string);
+ string_table = hash_table_create(2048, cmp_string, hash_string);
}
- token_t *token = token_data_create(kind, line, column);
- char *key = strdup(s);
- if (hash_table_put(string_table, key, key, 0)) {
- free(key);
- key = hash_table_get(string_table, key);
+ token_t *token = token_data_create(kind, lin, col, len);
+ char *key = hash_table_get(string_table, (void *)s);
+ if (key == NULL) {
+ key = strdup(s);
+ hash_table_put(string_table, key, key, 1);
}
token_data(token)->data.s = key;
return token;
}
-static token_t *token_create_char(c_token_types kind, int line, int column,
- char c) {
- token_t *token = token_data_create(kind, line, column);
- token_data(token)->data.c = c;
- return token;
-}
-
// External token operations
-
c_token_types token_type(token_t *token) {
assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
return token->kind;
@@ -127,7 +159,7 @@ double token_float(token_t *token) {
}
const char *token_string(token_t *token) {
- assert(token->kind == TOK_CONST_STRING_ASCII);
+ assert(token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID);
assert(token->magic == TOK_MAGIC_1);
return token_data(token)->data.s;
}
@@ -150,10 +182,1189 @@ int token_column(token_t *token) {
void token_destroy(token_t *token) {
assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
- if (token->magic == TOK_MAGIC_1) {
- if (token->kind == TOK_CONST_STRING_ASCII) {
- free((void *)token_data(token)->data.s);
+  // Don't free the string data here; interned strings are owned by the global string table
+ free(token);
+}
+
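+// One token of pushback for the parser; tokenizer_unget() asserts the slot is empty.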
+token_t *unget_token = NULL;
+void tokenizer_init(const char *filename) { input_init(filename); }
+
+void tokenizer_unget(token_t *token) {
+ assert(unget_token == NULL);
+ unget_token = token;
+}
+
+void string_table_dtor(void *value, int is_key) {
+ if (is_key) {
+ free(value);
+ }
+}
+
+void tokenizer_destroy(void) {
+ input_destroy();
+ if (string_table != NULL) {
+ hash_table_destroy(string_table, string_table_dtor);
+ }
+}
+
+// Error handling
+void tok_error(const char *fmt, ...) {
+ va_list args;
+ va_start(args, fmt);
+ fprintf(stderr, "Error in file %s at line %d, column %d: ", file_name, line,
+ column);
+ vfprintf(stderr, fmt, args);
+ va_end(args);
+}
+
+void tok_warn(const char *fmt, ...) {
+ va_list args;
+ va_start(args, fmt);
+ fprintf(stderr, "Warning in file %s at line %d, column %d: ", file_name, line,
+ column);
+ vfprintf(stderr, fmt, args);
+ va_end(args);
+}
+
+// Tokenizer
+static token_t *skip_whitespace(void) {
+ int c;
+ while ((c = input_getc()) != EOF) {
+ if (isspace(c)) {
+ if (c == '\n') {
+ line++;
+ column = 1;
+ } else {
+ column++;
+ }
+ } else if (c == '#') // GCC preprocessor line control directive.
+ {
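+      // Linemarkers emitted by the preprocessor look like: # <line> "<file>" [flags]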
+ char buf[512];
+ int i = 0;
+ while ((c = input_getc()) != EOF && c != '\n') {
+ buf[i++] = c;
+ column++;
+ }
+ buf[i] = '\0';
+      if (sscanf(buf, "%d \"%1023[^\"]\"", &line, file_name) == 2) {
+ column = 1;
+ } else {
+ tok_error("Invalid #line directive\n");
+ }
+ if (c == EOF) {
+ return NULL;
+ }
+ } else if (c == '/') {
+ c = input_getc();
+ if (c == '/') {
+ while ((c = input_getc()) != EOF && c != '\n') {
+ column++;
+ }
+ if (c == EOF) {
+ return NULL;
+ }
+ line++;
+ column = 1;
+ } else if (c == '*') {
+ while ((c = input_getc()) != EOF) {
+ if (c == '*') {
+ c = input_getc();
+ if (c == '/') {
+ break;
+ }
+ } else if (c == '\n') {
+ line++;
+ column = 1;
+ } else {
+ column++;
+ }
+ }
+ if (c == EOF) {
+ return NULL;
+ }
+ } else {
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_DIV, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_DIV, line, column, 1);
+ }
+ } else {
+ input_ungetc(c);
+ return NULL;
}
}
- free(token);
+ return NULL;
+}
+
+c_token_types get_keyword(const char *buf, int len) {
+ switch (buf[0]) {
+ case 'a':
+ if (len == 4 && buf[1] == 'u' && buf[2] == 't' && buf[3] == 'o')
+ return TOK_SCSK_AUTO;
+ break;
+
+ case 'b':
+ if (len == 5 && buf[1] == 'r' && buf[2] == 'e' && buf[3] == 'a' &&
+ buf[4] == 'k')
+ return TOK_CTK_BREAK;
+ break;
+
+ case 'c':
+ switch (buf[1]) {
+ case 'a':
+ if (len == 4 && buf[2] == 's' && buf[3] == 'e')
+ return TOK_CTK_CASE;
+ break;
+ case 'h':
+ if (len == 4 && buf[2] == 'a' && buf[3] == 'r')
+ return TOK_TK_CHAR;
+ break;
+ case 'o':
+ if (len == 5 && buf[2] == 'n' && buf[3] == 's' && buf[4] == 't')
+ return TOK_SCSK_CONST;
+ if (len == 8 && buf[2] == 'n' && buf[3] == 't' && buf[4] == 'i' &&
+ buf[5] == 'n' && buf[6] == 'u' && buf[7] == 'e')
+ return TOK_CTK_CONTINUE;
+ break;
+ }
+ break;
+
+ case 'd':
+ switch (buf[1]) {
+ case 'e':
+ if (len == 7 && buf[2] == 'f' && buf[3] == 'a' && buf[4] == 'u' &&
+ buf[5] == 'l' && buf[6] == 't')
+ return TOK_CTK_DEFAULT;
+ break;
+ case 'o':
+ if (len == 2 && buf[2] == '\0')
+ return TOK_CTK_DO;
+ if (len == 6 && buf[2] == 'u' && buf[3] == 'b' && buf[4] == 'l' &&
+ buf[5] == 'e')
+ return TOK_TK_DOUBLE;
+ break;
+ }
+ break;
+
+ case 'e':
+ switch (buf[1]) {
+ case 'l':
+ if (len == 4 && buf[2] == 's' && buf[3] == 'e')
+ return TOK_CTK_ELSE;
+ break;
+ case 'n':
+ if (len == 4 && buf[2] == 'u' && buf[3] == 'm')
+ return TOK_TK_ENUM;
+ break;
+ case 'x':
+ if (len == 6 && buf[2] == 't' && buf[3] == 'e' && buf[4] == 'r' &&
+ buf[5] == 'n')
+ return TOK_SCSK_EXTERN;
+ break;
+ }
+ break;
+
+ case 'f':
+ switch (buf[1]) {
+ case 'l':
+ if (len == 5 && buf[2] == 'o' && buf[3] == 'a' && buf[4] == 't')
+ return TOK_TK_FLOAT;
+ break;
+ case 'o':
+ if (len == 3 && buf[2] == 'r')
+ return TOK_CTK_FOR;
+ break;
+ }
+ break;
+
+ case 'g':
+ if (len == 4 && buf[1] == 'o' && buf[2] == 't' && buf[3] == 'o')
+ return TOK_CTK_GOTO;
+ break;
+
+ case 'i':
+ switch (buf[1]) {
+ case 'f':
+ if (len == 2 && buf[2] == '\0')
+ return TOK_CTK_IF;
+ break;
+ case 'n':
+ if (len == 3 && buf[2] == 't')
+ return TOK_TK_INT;
+ break;
+ }
+ break;
+
+ case 'l':
+ if (len == 4 && buf[1] == 'o' && buf[2] == 'n' && buf[3] == 'g')
+ return TOK_TK_LONG;
+ break;
+
+ case 'r':
+ switch (buf[1]) {
+ case 'e':
+ if (len == 8 && buf[2] == 'g' && buf[3] == 'i' && buf[4] == 's' &&
+ buf[5] == 't' && buf[6] == 'e' && buf[7] == 'r')
+ return TOK_SCSK_REGISTER;
+ if (len == 6 && buf[2] == 't' && buf[3] == 'u' && buf[4] == 'r' &&
+ buf[5] == 'n')
+ return TOK_CTK_RETURN;
+ break;
+ }
+ break;
+
+ case 's':
+ switch (buf[1]) {
+ case 'h':
+ if (len == 5 && buf[2] == 'o' && buf[3] == 'r' && buf[4] == 't')
+ return TOK_TK_SHORT;
+ break;
+ case 't':
+ if (len == 6 && buf[2] == 'a' && buf[3] == 't' && buf[4] == 'i' &&
+ buf[5] == 'c')
+ return TOK_SCSK_STATIC;
+ break;
+ case 'i':
+ if (len == 6 && buf[2] == 'g' && buf[3] == 'n' && buf[4] == 'e' &&
+ buf[5] == 'd')
+ return TOK_TK_SIGNED;
+ if (len == 6 && buf[2] == 'z' && buf[3] == 'e' && buf[4] == 'o' &&
+ buf[5] == 'f')
+ return TOK_MK_SIZEOF;
+ break;
+ case 'r':
+ if (len == 6 && buf[2] == 'u' && buf[3] == 'c' && buf[4] == 't')
+ return TOK_TK_STRUCT;
+ break;
+ case 'w':
+ if (len == 6 && buf[2] == 'i' && buf[3] == 't' && buf[4] == 'c' &&
+ buf[5] == 'h')
+ return TOK_CTK_SWITCH;
+ break;
+ }
+ break;
+
+ case 't':
+ if (len == 7 && buf[1] == 'y' && buf[2] == 'p' && buf[3] == 'e' &&
+ buf[4] == 'd' && buf[5] == 'e' && buf[6] == 'f')
+ return TOK_TK_TYPEDEF;
+ break;
+
+ case 'u':
+ switch (buf[1]) {
+ case 'n':
+ if (len == 5 && buf[2] == 'i' && buf[3] == 'o' && buf[4] == 'n')
+ return TOK_TK_UNION;
+ if (len == 8 && buf[2] == 's' && buf[3] == 'i' && buf[4] == 'g' &&
+ buf[5] == 'n' && buf[6] == 'e' && buf[7] == 'd')
+ return TOK_TK_UNSIGNED;
+ break;
+ }
+ break;
+
+ case 'v':
+ switch (buf[1]) {
+ case 'o':
+ if (len == 4 && buf[2] == 'i' && buf[3] == 'd')
+ return TOK_TK_VOID;
+ if (len == 8 && buf[2] == 'l' && buf[3] == 'a' && buf[4] == 't' &&
+ buf[5] == 'i' && buf[6] == 'l' && buf[7] == 'e')
+ return TOK_SCSK_VOLATILE;
+ break;
+ }
+ break;
+
+ case 'w':
+ if (len == 5 && buf[1] == 'h' && buf[2] == 'i' && buf[3] == 'l' &&
+ buf[4] == 'e')
+ return TOK_CTK_WHILE;
+ break;
+
+ default:
+ return TOK_ID;
+ }
+ return TOK_ID;
+}
+
+static token_t *read_identifier(void) {
+ int c;
+ char buf[1024];
+ int i = 0;
+ c = input_getc();
+ if (!isalpha(c) && c != '_') {
+ input_ungetc(c);
+ return NULL;
+ }
+ buf[i++] = c;
+ while ((c = input_getc()) != EOF) {
+ if (!isalnum(c) && c != '_') {
+ input_ungetc(c);
+ break;
+ }
+ buf[i++] = c;
+ if (i >= 1008) {
+ tok_error("Identifier too long\n");
+ exit(1);
+ }
+ }
+ buf[i] = '\0';
+ column += i;
+ // Check if it's a keyword
+ c_token_types kind = get_keyword(buf, i);
+ if (kind != TOK_ID) {
+ return token_create(kind, line, column, i);
+ }
+ return token_create_string(kind, line, column, buf, i);
}
+
+token_t *read_operator(void) {
+ int c;
+ c = input_getc();
+ switch (c) {
+ case '!': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_NE, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_NOT, line, column, 1);
+ }
+ case '%': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_MOD, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_MOD, line, column, 1);
+ }
+ case '&': {
+ c = input_getc();
+ if (c == '&')
+ return token_create(TOK_OP_AND, line, column, 2);
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_BITAND, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_BIT_AND, line, column, 1);
+ }
+ case '(':
+ return token_create(TOK_SEP_LEFT_PAREN, line, column, 1);
+ case ')':
+ return token_create(TOK_SEP_RIGHT_PAREN, line, column, 1);
+ case '*': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_MUL, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_MUL, line, column, 1);
+ }
+ case '+': {
+ c = input_getc();
+ if (c == '+')
+ return token_create(TOK_OP_INC, line, column, 2);
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_ADD, line, column, 2);
+ input_ungetc(c);
+    return token_create(TOK_OP_ADD, line, column, 1);
+ }
+ case ',':
+ return token_create(TOK_SEP_COMMA, line, column, 1);
+ case '-': {
+ c = input_getc();
+ if (c == '-')
+ return token_create(TOK_OP_DEC, line, column, 2);
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_SUB, line, column, 2);
+ if (c == '>')
+ return token_create(TOK_OP_MEMBER_POINTER, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_SUB, line, column, 1);
+ }
+ case '.': {
+ c = input_getc();
+ if (c == '.') {
+ c = input_getc();
+ if (c == '.') {
+ return token_create(TOK_SEP_ELLIPSIS, line, column, 3);
+ } else {
+        // '..' is not a valid token on its own, so bail out rather than trying to recover
+ tok_error("Unexpected character '.' at line %d, column %d\n", line,
+ column);
+ exit(1);
+ }
+ }
+    return token_create(TOK_OP_MEMBER, line, column, 1);
+ }
+ case '/': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_DIV, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_DIV, line, column, 1);
+ }
+ case ':':
+ return token_create(TOK_OP_COND_DECISION, line, column, 1);
+ case ';':
+ return token_create(TOK_SEP_SEMICOLON, line, column, 1);
+ case '<': {
+ c = input_getc();
+ if (c == '<') {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_LSHIFT, line, column, 3);
+ input_ungetc(c);
+ return token_create(TOK_OP_LSHIFT, line, column, 2);
+ }
+ if (c == '=')
+ return token_create(TOK_OP_LE, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_LT, line, column, 1);
+ }
+ case '=': {
+ c = input_getc();
+ if (c == '=')
+      return token_create(TOK_OP_EQ, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_ASSIGN, line, column, 1);
+ }
+ case '>': {
+ c = input_getc();
+ if (c == '>') {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_RSHIFT, line, column, 3);
+ input_ungetc(c);
+ return token_create(TOK_OP_RSHIFT, line, column, 2);
+ }
+ if (c == '=')
+ return token_create(TOK_OP_GE, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_GT, line, column, 1);
+ }
+ case '?':
+ return token_create(TOK_OP_COND, line, column, 1);
+ case '[':
+ return token_create(TOK_SEP_LEFT_BRACKET, line, column, 1);
+ case ']':
+ return token_create(TOK_SEP_RIGHT_BRACKET, line, column, 1);
+ case '^': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_BITXOR, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_BIT_XOR, line, column, 1);
+ }
+ case '{':
+ return token_create(TOK_SEP_LEFT_BRACE, line, column, 1);
+ case '|': {
+ c = input_getc();
+ if (c == '|')
+ return token_create(TOK_OP_OR, line, column, 2);
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_BITOR, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_BIT_OR, line, column, 1);
+ }
+ case '}':
+ return token_create(TOK_SEP_RIGHT_BRACE, line, column, 1);
+ case '~':
+ return token_create(TOK_OP_BIT_NOT, line, column, 1);
+ default:
+ input_ungetc(c);
+ return NULL;
+ }
+}
+
+static token_t *read_number(void) {
+ int c;
+ char buf[1024];
+ int i = 0;
+ c = input_getc();
+ // If we don't have a digit or decimal point, it's not a number
+ if (!isdigit(c) && c != '.') {
+ input_ungetc(c);
+ return NULL;
+ }
+ // Decimal point followed by non-digit is a struct member
+ if (c == '.') {
+ char cnext = input_getc();
+ if (!isdigit(cnext)) {
+ input_ungetc(cnext);
+ return token_create(TOK_OP_MEMBER, line, column, 1);
+ }
+ input_ungetc(cnext);
+ }
+ int radix = 10;
+ int is_float = 0;
+ // Check for hex and octal.
+ if (c == '0') {
+ char cnext = input_getc();
+ if (cnext == 'x' || cnext == 'X') {
+ // Hex, discard the 0x
+ radix = 16;
+ } else {
+ // Octal, discard the 0
+ input_ungetc(cnext);
+ radix = 8;
+ }
+ } else {
+ // Decimal, append the first digit
+ buf[i++] = c;
+ }
+ // Read the rest of the number
+ while ((c = input_getc()) != EOF) {
+ // Since there can be multiple writes to the buffer, we want to make sure we
+ // don't overflow by giving a 4 byte pad
+ if (i > 1020) {
+ tok_error("Number too long\n");
+ return NULL;
+ }
+ // Valid digits for the radix: 0-9 for decimal, 0-7 for octal, 0-9 and
+ // a-f/A-F for hex
+ if ((radix == 10 && isdigit(c)) || (radix == 16 && isxdigit(c)) ||
+ (radix == 8 && c >= '0' && c <= '7')) {
+ buf[i++] = c;
+ // Decimal point and not a float yet, must be a float
+ } else if (c == '.' && !is_float) {
+ is_float = 1;
+ if (radix != 10) {
+ tok_error("Invalid floating point constant, expected decimal, got %s\n",
+ radix == 16 ? "hexadecimal" : "octal");
+ return NULL;
+ }
+ buf[i++] = c;
+ }
+ // Exponent on the end of a constant. (By standard this forces it to be a
+ // float)
+ else if (c == 'e' || c == 'E') {
+ buf[i++] = c;
+ c = input_getc();
+ // Sign on the exponent
+ if (c == '+' || c == '-') {
+ buf[i++] = c;
+ c = input_getc();
+ }
+ // Exponent must be a digit, I.E no 1e1.2
+ if (!isdigit(c)) {
+ tok_error("Invalid floating point exponent\n");
+ return NULL;
+ }
+ buf[i++] = c;
+ is_float = 1;
+ } else {
+ // Reached the end, unget the character so other functions can read it
+ input_ungetc(c);
+ break;
+ }
+ }
+ buf[i] = '\0';
+ // Check for suffixes
+ int is_unsigned = 0;
+ int is_long = 0;
+ int is_single = 0;
+ // Loop to get all possible suffixes. Warn when duplicated.
+ while (1) {
+ c = input_getc();
+ if (c == 'u' || c == 'U') {
+ if (is_unsigned) {
+ tok_warn(
+ "Warning: Duplicate suffix 'u' for integer constant ignored\n");
+ }
+ is_unsigned = 1;
+ } else if (c == 'l' || c == 'L') {
+ if (is_long) {
+ tok_warn(
+ "Warning: Duplicate suffix 'l' for integer constant ignored\n");
+ }
+ is_long = 1;
+ } else if (c == 'f' || c == 'F') {
+ if (is_single) {
+ tok_warn("Warning: Duplicate suffix 'f' for floating point constant "
+ "ignored\n");
+ }
+ is_single = 1;
+ } else {
+ input_ungetc(c);
+ break;
+ }
+ }
+ // Resolve invalid suffixes. Doesn't error because you can still compile with
+ // them.
+ if (is_single && is_long) {
+ tok_warn("Warning: Invalid suffixes 'l' and 'f' for floating point "
+ "constant. Ignoring 'l'\n");
+ is_long = 0;
+ }
+ if (is_single && is_unsigned) {
+ tok_warn("Warning: Invalid suffixes 'u' and 'f' for floating point "
+ "constant. Ignoring 'u'\n");
+ is_unsigned = 0;
+ }
+ if (is_single && !is_float) {
+ tok_warn(
+ "Warning: Invalid suffix 'f' for integer constant. Ignoring 'f'\n");
+ is_single = 0;
+ }
+ // use the strtox functions to convert the string to a number
+ if (is_float) {
+ errno = 0;
+ // Strtod generates a unix-style error when it's given something out of
+ // range, so we want to get on top of that quickly instead of ignoring it
+ // That way we can avoid some nasty NAN-propagation in the constant folder.
+ double f = strtod(buf, NULL);
+ if (errno == ERANGE) {
+ tok_error("Floating point constant out of range\n");
+ return NULL;
+ }
+ // Warn if the constant is out of range for a float, I.E it's too big or too
+ // small
+ if (is_single && (f < FLT_MIN || f > FLT_MAX)) {
+ tok_warn(
+ "Warning: Floating point constant %f is out of range for float\n", f);
+ }
+ // Warn if the constant is too precise for a float
+ if (is_single && fabs((double)((float)f) - f) >= FLT_EPSILON) {
+ tok_warn("Warning: Converting double precision floating point constant "
+ "%f to float loses "
+ "precision\n",
+ f);
+ }
+ return token_create_float(is_single ? TOK_CONST_FLOAT_32
+ : TOK_CONST_FLOAT_64,
+ line, column, f, i);
+ } else {
+ errno = 0;
+ uint64_t int_ = strtoull(buf, NULL, radix);
+ // Same as above, but for integers
+ if (errno == ERANGE) {
+ tok_error("Integer constant out of range\n");
+ return NULL;
+ }
+ if (is_unsigned) {
+ if (is_long) {
+ return token_create_int(TOK_CONST_INTEGER_U64, line, column, int_, i);
+ } else {
+ if (int_ > UINT32_MAX) {
+ tok_warn(
+ "Warning: Integer constant %lld is out of range for unsigned "
+ "int\n",
+ int_);
+ }
+ return token_create_int(TOK_CONST_INTEGER_U32, line, column, int_, i);
+ }
+ } else {
+ if (is_long) {
+ // If the highest bit is set, that means this will overflow a signed
+ // long (Due to two's complement)
+        if (int_ & (1ULL << 63)) {
+ tok_warn(
+ "Warning: Integer constant %lld is out of range for long long\n",
+            int_);
+ }
+ return token_create_int(TOK_CONST_INTEGER_S64, line, column, int_, i);
+ } else {
+ if (int_ & (1UL << 31)) {
+ tok_warn("Warning: Integer constant %lld is out of range for int\n",
+ int_);
+ }
+ return token_create_int(TOK_CONST_INTEGER_S32, line, column, int_, i);
+ }
+ }
+ }
+ return NULL;
+}
+
+static char read_escape_sequence(int *len) {
+ int c = input_getc();
+ *len += 1;
+ switch (c) {
+ case 'a':
+ return '\a';
+ case 'b':
+ return '\b';
+ case 'f':
+ return '\f';
+ case 'n':
+ return '\n';
+ case 'r':
+ return '\r';
+ case 't':
+ return '\t';
+ case 'v':
+ return '\v';
+ case '\'':
+ return '\'';
+ case '"':
+ return '"';
+ case '?':
+ return '?';
+ case '\\':
+ return '\\';
+ case '0':
+ return '\0';
+ case 'x': {
+ c = input_getc();
+ if (!isxdigit(c)) {
+ tok_error("Invalid hexadecimal escape sequence\n");
+ return 0;
+ }
+ int val = 0;
+ while (isxdigit(c)) {
+ *len += 1;
+ val = val * 16 + (isdigit(c) ? c - '0' : tolower(c) - 'a' + 10);
+ c = input_getc();
+ }
+ input_ungetc(c);
+ return (char)val;
+ }
+ default:
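+    // Anything that isn't a named escape must be an octal escape (\nnn)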
+ if (!isdigit(c)) {
+ tok_error("Invalid escape sequence\n");
+ return 0;
+ }
+ int val = 0;
+ while (isdigit(c)) {
+ *len += 1;
+ val = val * 8 + c - '0';
+ c = input_getc();
+ }
+ input_ungetc(c);
+ return (char)val;
+ }
+}
+
+static token_t *read_char_constant(void) {
+ int c;
+ int len = 0;
+ c = input_getc();
+ if (c != '\'') {
+ input_ungetc(c);
+ return NULL;
+ }
+ len++;
+ c = input_getc();
+ if (c == '\'') {
+ tok_error("Empty character constant\n");
+ return NULL;
+ }
+ if (c == '\\') {
+ c = read_escape_sequence(&len);
+ }
+ int val = c;
+ c = input_getc();
+ if (c != '\'') {
+ tok_error("Expected closing quote for character constant\n");
+ return NULL;
+ }
+ len++;
+ return token_create_char(TOK_CONST_CHAR, line, column, val, len);
+}
+
+static token_t *read_string_literal(void) {
+ int c;
+ c = input_getc();
+ if (c != '"') {
+ input_ungetc(c);
+ return NULL;
+ }
+ int i = 0;
+ // Malloc is used for the buf here, the pointer stays function local as string
+ // interning duplicates the string
+ char s_buf[512];
+ char *buf = s_buf;
+ int len = 512;
+ int esc_pad = 0;
+ while ((c = input_getc()) != EOF) {
+ if (c == '"') {
+ // Implicit skip of closing quote
+ break;
+ }
+ if (c == '\\') {
+ c = read_escape_sequence(&esc_pad);
+ if (c == 0) {
+ return NULL;
+ }
+ }
+ if (i >= len) {
+ if (buf == s_buf) {
+ buf = malloc(1024);
+ if (buf == NULL) {
+ fputs("Out of memory. Could not parse string literal.\n", stderr);
+ exit(1);
+ }
+ memcpy(buf, s_buf, 512);
+ len *= 2;
+ } else {
+ len *= 2;
+ buf = realloc(buf, len);
+ }
+ }
+ buf[i++] = c;
+ }
+ buf[i] = '\0';
+ if (c == EOF) {
+ tok_error("Unterminated string literal\n");
+ if (buf != s_buf) {
+ free(buf);
+ }
+ return NULL;
+ }
+
+ token_t *tok = token_create_string(TOK_CONST_STRING_ASCII, line, column, buf,
+ i + esc_pad + 2);
+ if (buf != s_buf) {
+ free(buf);
+ }
+ return tok;
+}
+
+token_t *read_token(void) {
+ if (unget_token != NULL) {
+ token_t *tok = unget_token;
+ unget_token = NULL;
+ return tok;
+ }
+ token_t *tok = skip_whitespace();
+ if (tok != NULL) {
+ return tok;
+ }
+ tok = read_identifier();
+ if (tok != NULL) {
+ return tok;
+ }
+ tok = read_number();
+ if (tok != NULL) {
+ return tok;
+ }
+ tok = read_char_constant();
+ if (tok != NULL) {
+ return tok;
+ }
+ tok = read_string_literal();
+ if (tok != NULL) {
+ return tok;
+ }
+ tok = read_operator();
+ if (tok != NULL) {
+ return tok;
+ }
+ int c = input_getc();
+ if (c == EOF) {
+ return NULL;
+ }
+ tok_warn(
+ "Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c,
+ line, column);
+ return read_token();
+}
+
+const char *token_name_from_type(c_token_types type) {
+ switch (type) {
+ case TOK_CTK_IF:
+ return "TOK_CTK_IF";
+ case TOK_CTK_ELSE:
+ return "TOK_CTK_ELSE";
+ case TOK_CTK_SWITCH:
+ return "TOK_CTK_SWITCH";
+ case TOK_CTK_CASE:
+ return "TOK_CTK_CASE";
+ case TOK_CTK_DEFAULT:
+ return "TOK_CTK_DEFAULT";
+ case TOK_CTK_WHILE:
+ return "TOK_CTK_WHILE";
+ case TOK_CTK_DO:
+ return "TOK_CTK_DO";
+ case TOK_CTK_FOR:
+ return "TOK_CTK_FOR";
+ case TOK_CTK_CONTINUE:
+ return "TOK_CTK_CONTINUE";
+ case TOK_CTK_BREAK:
+ return "TOK_CTK_BREAK";
+ case TOK_CTK_RETURN:
+ return "TOK_CTK_RETURN";
+ case TOK_CTK_GOTO:
+ return "TOK_CTK_GOTO";
+ case TOK_TK_VOID:
+ return "TOK_TK_VOID";
+ case TOK_TK_CHAR:
+ return "TOK_TK_CHAR";
+ case TOK_TK_SHORT:
+ return "TOK_TK_SHORT";
+ case TOK_TK_INT:
+ return "TOK_TK_INT";
+ case TOK_TK_LONG:
+ return "TOK_TK_LONG";
+ case TOK_TK_FLOAT:
+ return "TOK_TK_FLOAT";
+ case TOK_TK_DOUBLE:
+ return "TOK_TK_DOUBLE";
+ case TOK_TK_SIGNED:
+ return "TOK_TK_SIGNED";
+ case TOK_TK_UNSIGNED:
+ return "TOK_TK_UNSIGNED";
+ case TOK_TK_STRUCT:
+ return "TOK_TK_STRUCT";
+ case TOK_TK_UNION:
+ return "TOK_TK_UNION";
+ case TOK_TK_ENUM:
+ return "TOK_TK_ENUM";
+ case TOK_TK_TYPEDEF:
+ return "TOK_TK_TYPEDEF";
+ case TOK_SCSK_AUTO:
+ return "TOK_SCSK_AUTO";
+ case TOK_SCSK_REGISTER:
+ return "TOK_SCSK_REGISTER";
+ case TOK_SCSK_STATIC:
+ return "TOK_SCSK_STATIC";
+ case TOK_SCSK_EXTERN:
+ return "TOK_SCSK_EXTERN";
+ case TOK_SCSK_CONST:
+ return "TOK_SCSK_CONST";
+ case TOK_SCSK_VOLATILE:
+ return "TOK_SCSK_VOLATILE";
+ case TOK_MK_SIZEOF:
+ return "TOK_MK_SIZEOF";
+ case TOK_OP_ADD:
+ return "TOK_OP_ADD";
+ case TOK_OP_SUB:
+ return "TOK_OP_SUB";
+ case TOK_OP_MUL:
+ return "TOK_OP_MUL";
+ case TOK_OP_DIV:
+ return "TOK_OP_DIV";
+ case TOK_OP_MOD:
+ return "TOK_OP_MOD";
+ case TOK_OP_BIT_AND:
+ return "TOK_OP_BIT_AND";
+ case TOK_OP_BIT_OR:
+ return "TOK_OP_BIT_OR";
+ case TOK_OP_BIT_XOR:
+ return "TOK_OP_BIT_XOR";
+ case TOK_OP_BIT_NOT:
+ return "TOK_OP_BIT_NOT";
+ case TOK_OP_LSHIFT:
+ return "TOK_OP_LSHIFT";
+ case TOK_OP_RSHIFT:
+ return "TOK_OP_RSHIFT";
+ case TOK_OP_NOT:
+ return "TOK_OP_NOT";
+ case TOK_OP_ASSIGN:
+ return "TOK_OP_ASSIGN";
+ case TOK_OP_LT:
+ return "TOK_OP_LT";
+ case TOK_OP_GT:
+ return "TOK_OP_GT";
+ case TOK_OP_INC:
+ return "TOK_OP_INC";
+ case TOK_OP_DEC:
+ return "TOK_OP_DEC";
+ case TOK_OP_EQ:
+ return "TOK_OP_EQ";
+ case TOK_OP_NE:
+ return "TOK_OP_NE";
+ case TOK_OP_LE:
+ return "TOK_OP_LE";
+ case TOK_OP_GE:
+ return "TOK_OP_GE";
+ case TOK_OP_AND:
+ return "TOK_OP_AND";
+ case TOK_OP_OR:
+ return "TOK_OP_OR";
+ case TOK_OP_MEMBER_POINTER:
+ return "TOK_OP_MEMBER_POINTER";
+ case TOK_OP_MEMBER:
+ return "TOK_OP_MEMBER";
+ case TOK_OP_COND_DECISION:
+ return "TOK_OP_COND_DECISION";
+ case TOK_OP_COND:
+ return "TOK_OP_COND";
+ case TOK_OP_ASSIGN_ADD:
+ return "TOK_OP_ASSIGN_ADD";
+ case TOK_OP_ASSIGN_SUB:
+ return "TOK_OP_ASSIGN_SUB";
+ case TOK_OP_ASSIGN_MUL:
+ return "TOK_OP_ASSIGN_MUL";
+ case TOK_OP_ASSIGN_DIV:
+ return "TOK_OP_ASSIGN_DIV";
+ case TOK_OP_ASSIGN_MOD:
+ return "TOK_OP_ASSIGN_MOD";
+ case TOK_OP_ASSIGN_BITAND:
+ return "TOK_OP_ASSIGN_BITAND";
+ case TOK_OP_ASSIGN_BITOR:
+ return "TOK_OP_ASSIGN_BITOR";
+ case TOK_OP_ASSIGN_BITXOR:
+ return "TOK_OP_ASSIGN_BITXOR";
+ case TOK_OP_ASSIGN_LSHIFT:
+ return "TOK_OP_ASSIGN_LSHIFT";
+ case TOK_OP_ASSIGN_RSHIFT:
+ return "TOK_OP_ASSIGN_RSHIFT";
+ case TOK_SEP_HASH:
+ return "TOK_SEP_HASH";
+ case TOK_ID:
+ return "TOK_ID";
+ case TOK_CONST_INTEGER_U32:
+ return "TOK_CONST_INTEGER_U32";
+ case TOK_CONST_INTEGER_U64:
+ return "TOK_CONST_INTEGER_U64";
+ case TOK_CONST_INTEGER_S32:
+ return "TOK_CONST_INTEGER_S32";
+ case TOK_CONST_INTEGER_S64:
+ return "TOK_CONST_INTEGER_S64";
+ case TOK_CONST_FLOAT_32:
+ return "TOK_CONST_FLOAT_32";
+ case TOK_CONST_FLOAT_64:
+ return "TOK_CONST_FLOAT_64";
+ case TOK_CONST_CHAR:
+ return "TOK_CONST_CHAR";
+ case TOK_CONST_STRING_ASCII:
+ return "TOK_CONST_STRING_ASCII";
+ case TOK_SPECIAL_EOF:
+ return "TOK_SPECIAL_EOF";
+ case TOK_SPECIAL_ERROR:
+ return "TOK_SPECIAL_ERROR";
+ case TOK_SEP_LEFT_PAREN:
+ return "TOK_SEP_LEFT_PAREN";
+ case TOK_SEP_RIGHT_PAREN:
+ return "TOK_SEP_RIGHT_PAREN";
+ case TOK_SEP_LEFT_BRACKET:
+ return "TOK_SEP_LEFT_BRACKET";
+ case TOK_SEP_RIGHT_BRACKET:
+ return "TOK_SEP_RIGHT_BRACKET";
+ case TOK_SEP_LEFT_BRACE:
+ return "TOK_SEP_LEFT_BRACE";
+ case TOK_SEP_RIGHT_BRACE:
+ return "TOK_SEP_RIGHT_BRACE";
+ case TOK_SEP_COMMA:
+ return "TOK_SEP_COMMA";
+ case TOK_SEP_SEMICOLON:
+ return "TOK_SEP_SEMICOLON";
+ case TOK_SEP_DOT:
+ return "TOK_SEP_DOT";
+ case TOK_SEP_ELLIPSIS:
+ return "TOK_SEP_ELLIPSIS";
+ }
+ return "UNKNOWN";
+}
+
+char *re_escape_string(const char *str) {
+ int len = strlen(str);
+ char *buf = malloc(len * 2 + 1);
+ if (buf == NULL) {
+ fprintf(stderr, "Out of memory. Cannot escape string\n");
+ exit(1);
+ }
+ int i = 0;
+ for (int j = 0; j < len; j++) {
+ switch (str[j]) {
+ case '\a':
+ buf[i++] = '\\';
+ buf[i++] = 'a';
+ break;
+ case '\b':
+ buf[i++] = '\\';
+ buf[i++] = 'b';
+ break;
+ case '\f':
+ buf[i++] = '\\';
+ buf[i++] = 'f';
+ break;
+ case '\n':
+ buf[i++] = '\\';
+ buf[i++] = 'n';
+ break;
+ case '\r':
+ buf[i++] = '\\';
+ buf[i++] = 'r';
+ break;
+ case '\t':
+ buf[i++] = '\\';
+ buf[i++] = 't';
+ break;
+ case '\v':
+ buf[i++] = '\\';
+ buf[i++] = 'v';
+ break;
+ case '\\':
+ buf[i++] = '\\';
+ buf[i++] = '\\';
+ break;
+ case '\'':
+ buf[i++] = '\\';
+ buf[i++] = '\'';
+ break;
+ case '"':
+ buf[i++] = '\\';
+ buf[i++] = '"';
+ break;
+ default:
+ buf[i++] = str[j];
+ break;
+ }
+ }
+ buf[i] = '\0';
+ return buf;
+}
+
+void print_token(token_t *tok) {
+ if (tok == NULL) {
+ printf("NULL\n");
+ return;
+ }
+ const char *name = token_name_from_type(tok->kind);
+ switch (tok->kind) {
+ case TOK_ID:
+ case TOK_CONST_STRING_ASCII: {
+ char *escaped = re_escape_string(token_string(tok));
+ printf("%s: \"%s\"@%d:%d\n", name, escaped, tok->line, tok->column);
+ free(escaped);
+ break;
+ }
+ case TOK_CONST_CHAR:
+ printf("%s: '%c'@%d:%d\n", name, token_char(tok), tok->line, tok->column);
+ break;
+ case TOK_CONST_INTEGER_S32:
+ case TOK_CONST_INTEGER_U32:
+ case TOK_CONST_INTEGER_S64:
+ case TOK_CONST_INTEGER_U64:
+ printf("%s: %ld@%d:%d\n", name, token_int(tok), tok->line, tok->column);
+ break;
+ case TOK_CONST_FLOAT_32:
+ case TOK_CONST_FLOAT_64:
+ printf("%s: %f@%d:%d\n", name, token_float(tok), tok->line, tok->column);
+ break;
+ default:
+ printf("%s@%d:%d\n", name, tok->line, tok->column);
+ break;
+ }
+}
+
+#ifdef TOK_TEST
+
+char *preprocess(char *in) {
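+  // Shell out to gcc -E so the tokenizer only ever sees preprocessed C. Assumes gcc is on the PATH.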
+ char *output_name = malloc(1024);
+ snprintf(output_name, 1024, "%s.preprocessed", in);
+ char *command = malloc(2048);
+ snprintf(command, 2048, "gcc -E -xc %s -o %s", in, output_name);
+ system(command);
+ free(command);
+ return output_name;
+}
+
+// Tokenize ourselves
+int main(int argc, char **argv) {
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s <input.c>\n", argv[0]);
+ return 1;
+ }
+ char *input_name = argv[1];
+ char *preprocessed = preprocess(input_name);
+ tokenizer_init(preprocessed);
+ token_t *tok;
+ while ((tok = read_token()) != NULL) {
+ print_token(tok);
+ token_destroy(tok);
+ }
+ tokenizer_destroy();
+ remove(preprocessed);
+ free(preprocessed);
+ return 0;
+}
+
+#endif
diff --git a/projects/cminus/code/tokenizer.h b/projects/cminus/code/tokenizer.h
@@ -5,17 +5,18 @@
typedef struct token token_t;
typedef enum {
// Control Keywords
- TOK_CTK_IF, // if
- TOK_CTK_ELSE, // else
- TOK_CTK_SWITCH, // switch
- TOK_CTK_CASE, // case
- TOK_CTK_DEFAULT, // default
- TOK_CTK_WHILE, // while
- TOK_CTK_DO, // do
- TOK_CTK_FOR, // for
- TOK_CTK_CONTINUE, // continue
- TOK_CTK_RETURN, // return
- TOK_CTK_GOTO, // goto
+ TOK_CTK_IF,
+ TOK_CTK_ELSE,
+ TOK_CTK_SWITCH,
+ TOK_CTK_CASE,
+ TOK_CTK_DEFAULT,
+ TOK_CTK_WHILE,
+ TOK_CTK_DO,
+ TOK_CTK_FOR,
+ TOK_CTK_CONTINUE,
+ TOK_CTK_BREAK,
+ TOK_CTK_RETURN,
+ TOK_CTK_GOTO,
// Type Keywords
TOK_TK_VOID,
@@ -44,44 +45,43 @@ typedef enum {
TOK_MK_SIZEOF,
// Operators
- TOK_OP_PLUS, // +
- TOK_OP_MINUS, // -
- TOK_OP_ASTERISK, // *
- TOK_OP_SLASH, // /
- TOK_OP_PERCENT, // %
- TOK_OP_AMPERSAND, // &
- TOK_OP_PIPE, // |
- TOK_OP_CARET, // ^
- TOK_OP_TILDE, // ~
- TOK_OP_EXCLAMATION, // !
- TOK_OP_EQUALS, // =
- TOK_OP_LESS_THAN, // <
- TOK_OP_GREATER_THAN, // >
- TOK_OP_PLUS_PLUS, // ++
- TOK_OP_MINUS_MINUS, // --
- TOK_OP_EQUALS_EQUALS, // ==
- TOK_OP_NOT_EQUALS, // !=
- TOK_OP_LESS_EQUALS, // <=
- TOK_OP_GREATER_EQUALS, // >=
- TOK_OP_AMP_AMP, // &&
- TOK_OP_PIPE_PIPE, // ||
- TOK_OP_ARROW, // ->
- TOK_OP_DOT, // .
- TOK_OP_COLON, // :
- TOK_OP_QUESTION, // ?
- TOK_OP_COMMA, // ,
- TOK_OP_SEMICOLON, // ;
- TOK_OP_ELLIPSIS, // ...
- TOK_OP_ASSIGN_PLUS, // +=
- TOK_OP_ASSIGN_MINUS, // -=
- TOK_OP_ASSIGN_ASTERISK, // *=
- TOK_OP_ASSIGN_SLASH, // /=
- TOK_OP_ASSIGN_PERCENT, // %=
- TOK_OP_ASSIGN_AMPERSAND, // &=
- TOK_OP_ASSIGN_PIPE, // |=
- TOK_OP_ASSIGN_CARET, // ^=
- TOK_OP_ASSIGN_LEFT_SHIFT, // <<=
- TOK_OP_ASSIGN_RIGHT_SHIFT, // >>=
+ TOK_OP_ADD, // +
+ TOK_OP_SUB, // -
+ TOK_OP_MUL, // *
+ TOK_OP_DIV, // /
+ TOK_OP_MOD, // %
+ TOK_OP_BIT_AND, // &
+ TOK_OP_BIT_OR, // |
+ TOK_OP_BIT_XOR, // ^
+ TOK_OP_BIT_NOT, // ~
+ TOK_OP_LSHIFT, // <<
+ TOK_OP_RSHIFT, // >>
+ TOK_OP_NOT, // !
+ TOK_OP_ASSIGN, // =
+ TOK_OP_LT, // <
+ TOK_OP_GT, // >
+ TOK_OP_INC, // ++
+ TOK_OP_DEC, // --
+ TOK_OP_EQ, // ==
+ TOK_OP_NE, // !=
+ TOK_OP_LE, // <=
+ TOK_OP_GE, // >=
+ TOK_OP_AND, // &&
+ TOK_OP_OR, // ||
+ TOK_OP_MEMBER_POINTER, // ->
+ TOK_OP_MEMBER, // .
+ TOK_OP_COND_DECISION, // :
+ TOK_OP_COND, // ?
+ TOK_OP_ASSIGN_ADD, // +=
+ TOK_OP_ASSIGN_SUB, // -=
+ TOK_OP_ASSIGN_MUL, // *=
+ TOK_OP_ASSIGN_DIV, // /=
+ TOK_OP_ASSIGN_MOD, // %=
+ TOK_OP_ASSIGN_BITAND, // &=
+ TOK_OP_ASSIGN_BITOR, // |=
+ TOK_OP_ASSIGN_BITXOR, // ^=
+ TOK_OP_ASSIGN_LSHIFT, // <<=
+ TOK_OP_ASSIGN_RSHIFT, // >>=
// Separators
TOK_SEP_LEFT_PAREN, // (
@@ -92,30 +92,29 @@ typedef enum {
TOK_SEP_RIGHT_BRACE, // }
TOK_SEP_COMMA, // ,
TOK_SEP_SEMICOLON, // ;
- TOK_SEP_COLON, // :
TOK_SEP_DOT, // .
TOK_SEP_ELLIPSIS, // ...
TOK_SEP_HASH, // #
// Identifiers
TOK_ID, // Any identifier not ending in _t
- TOK_TID, // Any identifier ending in _t
// Constants
- TOK_CONST_INTEGER_U32, // u
- TOK_CONST_INTEGER_U64, // ul
- TOK_CONST_INTEGER_S32, // (no suffix)
- TOK_CONST_INTEGER_S64, // l
- TOK_CONST_FLOAT_32, // f
- TOK_CONST_FLOAT_64, // (no suffix)
- TOK_CONST_CHAR, // 'c'
- TOK_CONST_STRING_ASCII, // "string" (width of 8 bits)
+ TOK_CONST_INTEGER_U32, // u
+ TOK_CONST_INTEGER_U64, // ul
+ TOK_CONST_INTEGER_S32, // (no suffix)
+ TOK_CONST_INTEGER_S64, // l
+ TOK_CONST_FLOAT_32, // f
+ TOK_CONST_FLOAT_64, // (no suffix)
+ TOK_CONST_CHAR, // 'c'
+ TOK_CONST_STRING_ASCII, // "string" (width of 8 bits)
// Special
TOK_SPECIAL_EOF,
TOK_SPECIAL_ERROR,
} c_token_types;
+
void tokenizer_init(const char *filename);
token_t *tokenizer_get(void);
void tokenizer_unget(token_t *token);
@@ -130,4 +129,4 @@ int token_line(token_t *token);
int token_column(token_t *token);
void token_destroy(token_t *token);
-#endif-
\ No newline at end of file
+#endif
diff --git a/projects/cminus/lexer.html b/projects/cminus/lexer.html
@@ -155,47 +155,46 @@ static char buffer[CHUNK_SIZE];
static int buffer_pos = 0;
static int buffer_size = 0;
static char unget_buffer = '\0';
+static char unget_buffer_stack[8];
+static int unget_buffer_stack_pos = 0;
static FILE *file = NULL;
</code></pre>
</div>
- <p>When the program calls <code>input_init</code>, we open the file and read the first chunk into the buffer. </p>
+ <p>When the program calls <code>input_init</code>, we open the file.</p>
<div class="code-block">
<span class="file-name">input.c</span>
<span class="block-header">
<strong class="block-title"><em><a id=":input-init" href="#:input-init">Input
initialization</a></em></strong></span>
<pre class="prettyprint"><code class="">void input_init(const char *filename) {
- // Open the file and read the first chunk
file = fopen(filename, "r");
if (file == NULL) {
- fprintf(stderr, "Error: could not open file %s\n", filename);
+ fprintf(stderr, "Error: Cannot open file %s\n", filename);
exit(1);
}
- buffer_pos = 0;
- nextline();
}</code></pre>
</div>
<p>When the program calls <code>input_getc</code>, we return the next character in the buffer. If the buffer is
- exhausted, we call <code>nextline</code>. </p>
+    exhausted, we read the next chunk from the file. Line and column tracking now lives in the tokenizer. </p>
<div class="code-block">
<span class="file-name">input.c</span>
<span class="block-header">
<strong class="block-title"><em><a id=":input-getc" href="#:input-getc">Input
getc</a></em></strong></span>
<pre class="prettyprint"><code class="">int input_getc(void) {
- if (unget_buffer != '\0') {
- char c = unget_buffer;
- unget_buffer = '\0';
- return c;
+ if (unget_buffer_stack_pos > 0) {
+ return unget_buffer_stack[--unget_buffer_stack_pos];
}
if (buffer_pos == buffer_size) {
- nextline();
+ buffer_size = fread(buffer, 1, CHUNK_SIZE, file);
+ buffer_pos = 0;
}
if (buffer_size == 0) {
return EOF;
}
- return buffer[buffer_pos++];
+ char c = buffer[buffer_pos++];
+ return c;
}</code></pre>
</div>
  <p>When the program calls <code>input_ungetc</code>, we push the character onto the <code>unget_buffer_stack</code>. </p>
@@ -204,9 +203,8 @@ static FILE *file = NULL;
<span class="block-header">
<strong class="block-title"><em><a id=":input-ungetc" href="#:input-ungetc">Input
ungetc</a></em></strong></span>
- <pre class="prettyprint"><code class="">void input_ungetc(int c) {
- unget_buffer = c;
-}</code></pre>
+ <pre
+ class="prettyprint"><code class="">void input_ungetc(int c) { unget_buffer_stack[unget_buffer_stack_pos++] = c; }</code></pre>
</div>
<p>Since we're not using dynamic memory allocation, cleanup is pretty simple.</p>
<div class="code-block">
@@ -257,8 +255,7 @@ const char *token_string(token_t *token);
char token_char(token_t *token);
int token_line(token_t *token);
int token_column(token_t *token);
-void token_destroy(token_t *token);
-</code></pre>
+void token_destroy(token_t *token);</code></pre>
</div>
<p>I'll also define an opaque struct for tokens and an enum for token types. </p>
<div class="code-block">
@@ -267,19 +264,20 @@ void token_destroy(token_t *token);
<strong class="block-title"><em><a id=":token-types" href="#:token-types">Token
types</a></em></strong></span>
<pre class="prettyprint"><code class="">typedef struct token token_t;
- typedef enum {
+typedef enum {
// Control Keywords
- TOK_CTK_IF, // if
- TOK_CTK_ELSE, // else
- TOK_CTK_SWITCH, // switch
- TOK_CTK_CASE, // case
- TOK_CTK_DEFAULT, // default
- TOK_CTK_WHILE, // while
- TOK_CTK_DO, // do
- TOK_CTK_FOR, // for
- TOK_CTK_CONTINUE, // continue
- TOK_CTK_RETURN, // return
- TOK_CTK_GOTO, // goto
+ TOK_CTK_IF,
+ TOK_CTK_ELSE,
+ TOK_CTK_SWITCH,
+ TOK_CTK_CASE,
+ TOK_CTK_DEFAULT,
+ TOK_CTK_WHILE,
+ TOK_CTK_DO,
+ TOK_CTK_FOR,
+ TOK_CTK_CONTINUE,
+ TOK_CTK_BREAK,
+ TOK_CTK_RETURN,
+ TOK_CTK_GOTO,
// Type Keywords
TOK_TK_VOID,
@@ -308,102 +306,80 @@ void token_destroy(token_t *token);
TOK_MK_SIZEOF,
// Operators
- TOK_OP_PLUS, // +
- TOK_OP_MINUS, // -
- TOK_OP_ASTERISK, // *
- TOK_OP_SLASH, // /
- TOK_OP_PERCENT, // %
- TOK_OP_AMPERSAND, // &
- TOK_OP_PIPE, // |
- TOK_OP_CARET, // ^
- TOK_OP_TILDE, // ~
- TOK_OP_EXCLAMATION, // !
- TOK_OP_EQUALS, // =
- TOK_OP_LESS_THAN, // <
- TOK_OP_GREATER_THAN, // >
- TOK_OP_PLUS_PLUS, // ++
- TOK_OP_MINUS_MINUS, // --
- TOK_OP_EQUALS_EQUALS, // ==
- TOK_OP_NOT_EQUALS, // !=
- TOK_OP_LESS_EQUALS, // <=
- TOK_OP_GREATER_EQUALS, // >=
- TOK_OP_AMP_AMP, // &&
- TOK_OP_PIPE_PIPE, // ||
- TOK_OP_ARROW, // ->
- TOK_OP_DOT, // .
- TOK_OP_COLON, // :
- TOK_OP_QUESTION, // ?
- TOK_OP_COMMA, // ,
- TOK_OP_SEMICOLON, // ;
- TOK_OP_ELLIPSIS, // ...
- TOK_OP_ASSIGN_PLUS, // +=
- TOK_OP_ASSIGN_MINUS, // -=
- TOK_OP_ASSIGN_ASTERISK, // *=
- TOK_OP_ASSIGN_SLASH, // /=
- TOK_OP_ASSIGN_PERCENT, // %=
- TOK_OP_ASSIGN_AMPERSAND, // &=
- TOK_OP_ASSIGN_PIPE, // |=
- TOK_OP_ASSIGN_CARET, // ^=
- TOK_OP_ASSIGN_LEFT_SHIFT, // <<=
- TOK_OP_ASSIGN_RIGHT_SHIFT, // >>=
+ TOK_OP_ADD, // +
+ TOK_OP_SUB, // -
+ TOK_OP_MUL, // *
+ TOK_OP_DIV, // /
+ TOK_OP_MOD, // %
+ TOK_OP_BIT_AND, // &
+ TOK_OP_BIT_OR, // |
+ TOK_OP_BIT_XOR, // ^
+ TOK_OP_BIT_NOT, // ~
+ TOK_OP_LSHIFT, // <<
+ TOK_OP_RSHIFT, // >>
+ TOK_OP_NOT, // !
+ TOK_OP_ASSIGN, // =
+ TOK_OP_LT, // <
+ TOK_OP_GT, // >
+ TOK_OP_INC, // ++
+ TOK_OP_DEC, // --
+ TOK_OP_EQ, // ==
+ TOK_OP_NE, // !=
+ TOK_OP_LE, // <=
+ TOK_OP_GE, // >=
+ TOK_OP_AND, // &&
+ TOK_OP_OR, // ||
+ TOK_OP_MEMBER_POINTER, // ->
+ TOK_OP_MEMBER, // .
+ TOK_OP_COND_DECISION, // :
+ TOK_OP_COND, // ?
+ TOK_OP_ASSIGN_ADD, // +=
+ TOK_OP_ASSIGN_SUB, // -=
+ TOK_OP_ASSIGN_MUL, // *=
+ TOK_OP_ASSIGN_DIV, // /=
+ TOK_OP_ASSIGN_MOD, // %=
+ TOK_OP_ASSIGN_BITAND, // &=
+ TOK_OP_ASSIGN_BITOR, // |=
+ TOK_OP_ASSIGN_BITXOR, // ^=
+ TOK_OP_ASSIGN_LSHIFT, // <<=
+ TOK_OP_ASSIGN_RSHIFT, // >>=
// Separators
- TOK_SEP_LEFT_PAREN, // (
- TOK_SEP_RIGHT_PAREN, // )
- TOK_SEP_LEFT_BRACKET, // [
+ TOK_SEP_LEFT_PAREN, // (
+ TOK_SEP_RIGHT_PAREN, // )
+ TOK_SEP_LEFT_BRACKET, // [
TOK_SEP_RIGHT_BRACKET, // ]
- TOK_SEP_LEFT_BRACE, // {
- TOK_SEP_RIGHT_BRACE, // }
- TOK_SEP_COMMA, // ,
- TOK_SEP_SEMICOLON, // ;
- TOK_SEP_COLON, // :
- TOK_SEP_DOT, // .
- TOK_SEP_ELLIPSIS, // ...
- TOK_SEP_HASH, // #
+ TOK_SEP_LEFT_BRACE, // {
+ TOK_SEP_RIGHT_BRACE, // }
+ TOK_SEP_COMMA, // ,
+ TOK_SEP_SEMICOLON, // ;
+ TOK_SEP_DOT, // .
+ TOK_SEP_ELLIPSIS, // ...
+ TOK_SEP_HASH, // #
// Identifiers
- TOK_ID, // Any identifier not ending in _t
- TOK_TID, // Any identifier ending in _t
+ TOK_ID, // Any identifier not ending in _t
// Constants
- TOK_CONST_INTEGER_U32, // u
- TOK_CONST_INTEGER_U64, // ul
- TOK_CONST_INTEGER_S32, // (no suffix)
- TOK_CONST_INTEGER_S64, // l
- TOK_CONST_FLOAT_32, // f
- TOK_CONST_FLOAT_64, // (no suffix)
- TOK_CONST_CHAR, // 'c'
+ TOK_CONST_INTEGER_U32, // u
+ TOK_CONST_INTEGER_U64, // ul
+ TOK_CONST_INTEGER_S32, // (no suffix)
+ TOK_CONST_INTEGER_S64, // l
+ TOK_CONST_FLOAT_32, // f
+ TOK_CONST_FLOAT_64, // (no suffix)
+ TOK_CONST_CHAR, // 'c'
TOK_CONST_STRING_ASCII, // "string" (width of 8 bits)
- TOK_CONST_STRING_UNICODE, // L"string" (width of 32 bits)
// Special
TOK_SPECIAL_EOF,
TOK_SPECIAL_ERROR,
-} c_token_types;
-</code></pre>
+} c_token_types;</code></pre>
</div>
- <p>One decision of interest here is the separation of identifiers into two categories: <code>TOK_ID</code> and
- <code>TOK_TID</code>. This helps prevent the need for a lexer hack for typedefs, as something like
- <div class="code-block">
- <span class="file-name">N/A</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":vexing-parse" href="#:vexing-parse">The
- Vexing Parse</a></em></strong></span>
- <pre class="prettyprint"><code class="">typedef int foo;
-int main() {
- foo foo;
- foo = 5;
- return foo;
-}</code></pre>
- </div>
- which requires syntactic context to parse correctly is not allowed (since foo is a <code
- class="prettyprint">TOK_ID</code> and not a
- <code class="prettyprint">TOK_TID</code>, the parser will error when it's in the position of a type). </p>
<h4>Tokenizer Design Decisions</h4>
  <p>A lot of introductory compiler courses focus on the design of deterministic finite automata for lexing. While
that's definitely interesting from a CS perspective, for this project I'll be focusing more on simplicity and
readability, and so I'll write a lexer by hand. </p>
- <p>I'll implement the token API first, and then write the tokenizer</p>
+ <p>I'll implement the token API first, and then write the tokenizer. This'll be a 'one-at-a-time' tokenizer, and won't return a list of tokens. </p>
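+  <p>To make the 'one-at-a-time' idea concrete, here's a rough sketch of how a caller might drive the tokenizer
+    using its entry points from <code>tokenizer.c</code>. Only the function names are real; the file name and the
+    surrounding control flow are illustrative. </p>
+  <div class="code-block">
+    <span class="file-name">N/A</span>
+    <span class="block-header">
+      <strong class="block-title"><em><a id=":tokenizer-usage" href="#:tokenizer-usage">Tokenizer
+            usage sketch</a></em></strong></span>
+    <pre class="prettyprint"><code class="">tokenizer_init("input.c");
+token_t *tok;
+while ((tok = read_token()) != NULL) {
+  if (token_type(tok) == TOK_TK_INT) {
+    // A caller can push one token back for a single token of lookahead.
+    tokenizer_unget(tok);
+    tok = read_token();
+  }
+  print_token(tok);
+  token_destroy(tok);
+}
+tokenizer_destroy();</code></pre>
+  </div>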
<h4>Token Implementation</h4>
  <p>The token struct is pretty simple. It contains a magic number for sanity checks, the token's kind, its line
    and column, and an optional value. </p>
@@ -412,30 +388,28 @@ int main() {
<span class="block-header">
<strong class="block-title"><em><a id=":token-struct" href="#:token-struct">Token
struct</a></em></strong></span>
- <pre class="prettyprint"><code class="">
-#define TOK_MAGIC_1 0x544F4B454E544F4B // "TOKENTOK"
-#define TOK_MAGIC_2 0x544F4B544F4B454E // "TOKTOKEN"
-struct token {
- long magic;
- int line;
- int column;
- short kind;
- char opt_data[0];
-};
-
-typedef struct token token_t;
-
-struct token_data {
- union {
- long long i;
- double f;
- const char *s;
- char c;
- } data;
-};
-
-typedef struct token_data token_data_t;
-</code></pre>
+ <pre class="prettyprint"><code class="">#define TOK_MAGIC_1 0x544F4B454E544F4Bul // "TOKENTOK"
+#define TOK_MAGIC_2 0x544F4B544F4B454Eul // "TOKTOKEN"
+struct token {
+  long magic;
+  int line;
+  int column;
+  short kind;
+  long opt_data[0];
+};
+
+typedef struct token token_t;
+
+struct token_data {
+  union {
+    long long i;
+    double f;
+    const char *s;
+    char c;
+  } data;
+};
+
+typedef struct token_data token_data_t;</code></pre>
</div>
<p>By keeping the token data off of simple tokens (like keywords and operators), we save 8 bytes per token. This
will add up quickly, as the lexer will produce a lot of tokens. </p>
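+  <p>As a quick sanity check, the two allocation sizes work out as follows (the byte counts assume a typical 64-bit
+    target, so they're illustrative rather than measured): </p>
+  <div class="code-block">
+    <span class="file-name">N/A</span>
+    <span class="block-header">
+      <strong class="block-title"><em><a id=":token-sizes" href="#:token-sizes">Token
+            sizes</a></em></strong></span>
+    <pre class="prettyprint"><code class="">// token_create():      malloc(sizeof(token_t))                             -> 24 bytes
+// token_data_create(): malloc(sizeof(token_t) + sizeof(struct token_data)) -> 32 bytes
+printf("plain token: %zu, token with data: %zu\n", sizeof(token_t),
+       sizeof(token_t) + sizeof(struct token_data));</code></pre>
+  </div>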
@@ -445,46 +419,62 @@ typedef struct token_data token_data_t;
<span class="block-header">
<strong class="block-title"><em><a id=":token-internals" href="#:token-internals">Token
Internals</a></em></strong></span>
- <pre class="prettyprint"><code class="">#define token_data(token) ((struct token_data *)((token)->opt_data))
-static token_t *token_data_create(c_token_types kind, int line, int column) {
+ <pre class="prettyprint"><code class="">#define token_data(token) ((struct token_data *)((token)->opt_data))
+static token_t *token_data_create(c_token_types kind, int lin, int col,
+ int len) {
token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data));
- token->magic = TOK_MAGIC_1;
- token->line = line;
- token->column = column;
- token->kind = kind;
+ if (token == NULL) {
+ fputs("Out of memory\n", stderr);
+ exit(1);
+ }
+ token->magic = TOK_MAGIC_1;
+ token->line = lin;
+ token->column = col;
+ column += len;
+ token->kind = kind;
return token;
}
-static token_t *token_create(c_token_types kind, int line, int column) {
+static token_t *token_create(c_token_types kind, int lin, int col, int len) {
token_t *token = malloc(sizeof(token_t));
- token->magic = TOK_MAGIC_2;
- token->line = line;
- token->column = column;
- token->kind = kind;
+ if (token == NULL) {
+ fputs("Out of memory\n", stderr);
+ exit(1);
+ }
+ token->magic = TOK_MAGIC_2;
+ token->line = lin;
+ token->column = col;
+ column += len;
+ token->kind = kind;
return token;
}
-static token_t *token_create_int(c_token_types kind, int line, int column, int64_t i) {
- token_t *token = token_data_create(kind, line, column);
- token_data(token)->data.i = i;
+static token_t *token_create_int(c_token_types kind, int lin, int col,
+ int64_t i, int len) {
+ token_t *token = token_data_create(kind, lin, col, len);
+ token_data(token)->data.i = i;
return token;
}
-static token_t *token_create_float(c_token_types kind, int line, int column, double f) {
- token_t *token = token_data_create(kind, line, column);
- token_data(token)->data.f = f;
+static token_t *token_create_float(c_token_types kind, int lin, int col,
+ double f, int len) {
+ token_t *token = token_data_create(kind, lin, col, len);
+ token_data(token)->data.f = f;
return token;
}
-static token_t *token_create_string(c_token_types kind, int line, int column, const char *s) {
- // Not yet
+static token_t *token_create_char(c_token_types kind, int lin, int col, char c,
+ int len) {
+ token_t *token = token_data_create(kind, lin, col, len);
+ token_data(token)->data.c = c;
+ return token;
}
-static token_t *token_create_char(c_token_types kind, int line, int column, char c) {
- token_t *token = token_data_create(kind, line, column);
- token_data(token)->data.c = c;
- return token;
+static token_t *token_create_string(c_token_types kind, int lin, int col,
+ const char *s, int len) {
+ // Not yet
}
+
</code></pre>
</div>
<p>You might wonder why token_create_string was left out. Before you ask that, try counting up all the repeated
@@ -511,13 +501,15 @@ static token_t *token_create_char(c_token_types kind, int line, int column, char
<pre class="prettyprint"><code class="">typedef struct hash_table hash_table_t;
typedef struct hash_table_entry hash_table_entry_t;
typedef int (*hash_table_cmp_fn)(void *key1, void *key2);
-typedef unsigned long (*hash_table_hash_fn)(void *key);
-hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash);
-void hash_table_destroy(hash_table_t *table);
+typedef unsigned int (*hash_table_hash_fn)(void *key);
+hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp,
+ hash_table_hash_fn hash);
+
+typedef void (*hash_table_dtor)(void *value, int is_key);
+void hash_table_destroy(hash_table_t *table, hash_table_dtor dtor);
void *hash_table_get(hash_table_t *table, void *key);
int hash_table_put(hash_table_t *table, void *key, void *value, int replace);
-void hash_table_remove(hash_table_t *table, void *key);
-</code></pre>
+void hash_table_remove(hash_table_t *table, void *key);</code></pre>
</div>
<p>Now that we have the hash table interface, we can implement the generic hash table. </p>
<div class="code-block">
@@ -537,88 +529,96 @@ struct hash_table_entry {
hash_table_entry_t *next;
};
-hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash) {
+hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp,
+ hash_table_hash_fn hash) {
hash_table_t *table = malloc(sizeof(hash_table_t));
if (table == NULL) {
return NULL;
}
- table->entries = calloc(size, sizeof(hash_table_entry_t *));
- if (table->entries == NULL) {
+ table->entries = calloc(size, sizeof(hash_table_entry_t *));
+ if (table->entries == NULL) {
free(table);
return NULL;
}
- table->size = size;
- table->cmp = cmp;
- table->hash = hash;
+ table->size = size;
+ table->cmp = cmp;
+ table->hash = hash;
return table;
}
-void hash_table_destroy(hash_table_t *table) {
- for (int i = 0; i < table->size; i++) {
- hash_table_entry_t *entry = table->entries[i];
+void hash_table_destroy(hash_table_t *table, hash_table_dtor dtor) {
+ for (int i = 0; i < table->size; i++) {
+ hash_table_entry_t *entry = table->entries[i];
while (entry != NULL) {
- hash_table_entry_t *next = entry->next;
+ hash_table_entry_t *next = entry->next;
+ if (dtor != NULL) {
+ dtor(entry->key, 1);
+ dtor(entry->value, 0);
+ }
free(entry);
entry = next;
}
}
- free(table->entries);
+ free(table->entries);
free(table);
}
void *hash_table_get(hash_table_t *table, void *key) {
- unsigned long hash = table->hash(key) % table->size;
- hash_table_entry_t *entry = table->entries[hash];
+ unsigned long hash = table->hash(key) % table->size;
+ hash_table_entry_t *entry = table->entries[hash];
while (entry != NULL) {
- if (table->cmp(entry->key, key) == 0) {
- return entry->value;
+ if (table->cmp(entry->key, key) == 0) {
+ return entry->value;
}
- entry = entry->next;
+ entry = entry->next;
}
return NULL;
}
int hash_table_put(hash_table_t *table, void *key, void *value, int replace) {
- unsigned long hash = table->hash(key) % table->size;
- hash_table_entry_t *entry = table->entries[hash];
+ unsigned long hash = table->hash(key) % table->size;
+ hash_table_entry_t *entry = table->entries[hash];
while (entry != NULL) {
- if (table->cmp(entry->key, key) == 0) {
+ if (table->cmp(entry->key, key) == 0) {
if (replace) {
- entry->value = value;
+ entry->value = value;
return 0;
} else {
- return 1;
+ return 1;
}
}
- entry = entry->next;
+ entry = entry->next;
}
entry = malloc(sizeof(hash_table_entry_t));
- entry->key = key;
- entry->value = value;
- entry->next = table->entries[hash];
- table->entries[hash] = entry;
+ if (entry == NULL) {
+ fprintf(stderr, "Error: Out of memory. Could not allocate hash table entry\n");
+ exit(1);
+ }
+ entry->key = key;
+ entry->value = value;
+ entry->next = table->entries[hash];
+ table->entries[hash] = entry;
return 0;
}
void hash_table_remove(hash_table_t *table, void *key) {
- unsigned long hash = table->hash(key) % table->size;
- hash_table_entry_t *entry = table->entries[hash];
+ unsigned long hash = table->hash(key) % table->size;
+ hash_table_entry_t *entry = table->entries[hash];
hash_table_entry_t *prev = NULL;
while (entry != NULL) {
- if (table->cmp(entry->key, key) == 0) {
+ if (table->cmp(entry->key, key) == 0) {
if (prev == NULL) {
- table->entries[hash] = entry->next;
+ table->entries[hash] = entry->next;
} else {
- prev->next = entry->next;
+ prev->next = entry->next;
}
free(entry);
return;
}
prev = entry;
- entry = entry->next;
+ entry = entry->next;
}
-}
-</code></pre>
+}</code></pre>
</div>
<p>One implementation detail of note is that hash_table_put returns 1 if the key already exists in the table and,
    unless replace is set to 1, leaves the existing value in place. </p>
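+  <p>To make the put/replace semantics concrete, here is a small usage sketch. It is not part of the compiler;
+    <code>cmp_str</code> and <code>hash_str</code> are throwaway helpers written just for this example. </p>
+  <div class="code-block">
+    <span class="file-name">N/A</span>
+    <span class="block-header">
+      <strong class="block-title"><em><a id=":hash-table-usage-sketch" href="#:hash-table-usage-sketch">Hash table usage sketch</a></em></strong></span>
+    <pre class="prettyprint"><code class="">// Usage sketch only; needs string.h and hash_table.h.
+static int cmp_str(void *a, void *b) { return strcmp(a, b); }
+static unsigned int hash_str(void *key) {
+  unsigned int h = 0;
+  for (char *p = key; *p != '\0'; p++)
+    h = h * 31 + (unsigned char)*p;
+  return h;
+}
+
+static void put_example(void) {
+  hash_table_t *t = hash_table_create(64, cmp_str, hash_str);
+  hash_table_put(t, "x", "first", 0);  // new key: inserted, returns 0
+  hash_table_put(t, "x", "second", 0); // key exists, replace == 0: returns 1, value kept
+  // hash_table_get(t, "x") still returns "first" at this point
+  hash_table_put(t, "x", "second", 1); // replace == 1: value overwritten, returns 0
+  hash_table_destroy(t, NULL);         // NULL dtor: keys and values are not freed
+}</code></pre>
+  </div>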
@@ -627,28 +627,39 @@ void hash_table_remove(hash_table_t *table, void *key) {
<h4>String hashing</h4>
<p>Now we can implement the string hash table for the tokenizer.</p>
<p>A lot of research has been done on hash functions (See <a href="https://www.strchr.com/hash_functions">here for a
- good review of theory and comparison of various functions</a>). For this project, I'll be using the hash
- function from the famous "Red Dragon Book" (Aho, Sethi, Ullman, Compilers: Principles, Techniques, and Tools, 1st
- edition). as it's optimized for compiler symbols. </p>
+ good review of theory and comparison of various functions</a>). For this project, I'll be using the ELF hash
+    function from glibc, which is simple and, in my testing, noticeably faster than the Dragon Book hash I had
+    originally used (see the Bugs/Errata section below for why it was swapped in).
+  </p>
<div class="code-block">
<span class="file-name">tokenizer.c</span>
<span class="block-header">
<strong class="block-title"><em><a id=":string-hash-function" href="#:string-hash-function">String
hash function</a></em></strong></span>
- <pre class="prettyprint"><code class="">#define PRIME 211
-static unsigned long hash_string(void *key) {
- unsigned long hash = 0, g;
+ <pre class="prettyprint"><code class="">static unsigned int hash_string(void *key) {
+ unsigned long hash = 0, hi = 0;
char *p = key;
- while (*p) {
- hash = (hash << 4) + *p++;
- if ((g = hash & 0xf0000000) != 0) {
- hash ^= g >> 24;
- hash ^= g;
+ hash = *p;
+ if (hash != 0 && p[1] != 0) {
+ hash = (hash << 4) + p[1];
+ if (p[2] != 0) {
+ hash = (hash << 4) + p[2];
+ if (p[3] != 0) {
+ hash = (hash << 4) + p[3];
+ if (p[4] != 0) {
+ hash = (hash << 4) + p[4];
+ p += 5;
+ while (*p != 0) {
+ hash = (hash << 4) + *p++;
+ hi = hash & 0xf0000000l;
+ hash ^= hi >> 24;
+ }
+ hash &= 0x0fffffffl;
+ }
+ }
}
}
return hash;
-}
-</code></pre>
+}</code></pre>
</div>
<p>We'll also need a comparison function for strings. </p>
<div class="code-block">
@@ -668,17 +679,18 @@ static unsigned long hash_string(void *key) {
<strong class="block-title"><em><a id=":string-creation" href="#:string-creation">String
creation</a></em></strong></span>
<pre class="prettyprint"><code class="">static hash_table_t *string_table;
-static token_t *token_create_string(c_token_types kind, int line, int column, const char *s) {
+static token_t *token_create_string(c_token_types kind, int lin, int col,
+ const char *s, int len) {
if (string_table == NULL) {
- string_table = hash_table_create(1024, cmp_string, hash_string);
+ string_table = hash_table_create(2048, cmp_string, hash_string);
}
- token_t *token = token_data_create(kind, line, column);
- char *key = strdup(s);
- if (hash_table_put(string_table, key, key, 0)) {
- free(key);
- key = hash_table_get(string_table, key);
+ token_t *token = token_data_create(kind, lin, col, len);
+ char *key = hash_table_get(string_table, (void *)s);
+ if (key == NULL) {
+ key = strdup(s);
+ hash_table_put(string_table, key, key, 1);
}
- token_data(token)->data.s = key;
+ token_data(token)->data.s = key;
return token;
}
</code></pre>
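+  <p>A nice side effect of interning is that tokens with the same spelling share one string buffer, so later stages
+    can compare identifiers by pointer instead of by strcmp. A quick sketch of that property (illustrative only;
+    token_create_string is static, so this would have to live inside tokenizer.c, and the positions are made up): </p>
+  <div class="code-block">
+    <span class="file-name">N/A</span>
+    <span class="block-header">
+      <strong class="block-title"><em><a id=":string-interning-sketch" href="#:string-interning-sketch">String interning sketch</a></em></strong></span>
+    <pre class="prettyprint"><code class="">// Illustration of interning; assumes assert.h is included.
+static void interning_demo(void) {
+  token_t *a = token_create_string(TOK_ID, 1, 1, "count", 5);
+  token_t *b = token_create_string(TOK_ID, 9, 5, "count", 5);
+  // Both tokens reference the single interned copy of "count".
+  assert(token_string(a) == token_string(b));
+  token_destroy(a);
+  token_destroy(b);
+}</code></pre>
+  </div>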
@@ -692,57 +704,52 @@ static token_t *token_create_string(c_token_types kind, int line, int column, co
<span class="block-header">
<strong class="block-title"><em><a id=":external-token-api" href="#:external-token-api">External
token API</a></em></strong></span>
- <pre class="prettyprint"><code class="">
-c_token_types token_type(token_t *token) {
- assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
- return token->kind;
+ <pre class="prettyprint"><code class="">c_token_types token_type(token_t *token) {
+ assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
+ return token->kind;
}
int64_t token_int(token_t *token) {
- assert(token->kind == TOK_CONST_INTEGER_U32 ||
- token->kind == TOK_CONST_INTEGER_U64 ||
- token->kind == TOK_CONST_INTEGER_S32 ||
- token->kind == TOK_CONST_INTEGER_S64);
- assert(token->magic == TOK_MAGIC_1);
- return token_data(token)->data.i;
+ assert(token->kind == TOK_CONST_INTEGER_U32 ||
+ token->kind == TOK_CONST_INTEGER_U64 ||
+ token->kind == TOK_CONST_INTEGER_S32 ||
+ token->kind == TOK_CONST_INTEGER_S64);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.i;
}
double token_float(token_t *token) {
- assert(token->kind == TOK_CONST_FLOAT_32 ||
- token->kind == TOK_CONST_FLOAT_64);
- assert(token->magic == TOK_MAGIC_1);
- return token_data(token)->data.f;
+ assert(token->kind == TOK_CONST_FLOAT_32 ||
+ token->kind == TOK_CONST_FLOAT_64);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.f;
}
const char *token_string(token_t *token) {
- assert(token->kind == TOK_CONST_STRING_ASCII);
- assert(token->magic == TOK_MAGIC_1);
- return token_data(token)->data.s;
+ assert(token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.s;
}
char token_char(token_t *token) {
- assert(token->kind == TOK_CONST_CHAR);
- assert(token->magic == TOK_MAGIC_1);
- return token_data(token)->data.c;
+ assert(token->kind == TOK_CONST_CHAR);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.c;
}
int token_line(token_t *token) {
- assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
- return token->line;
+ assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
+ return token->line;
}
int token_column(token_t *token) {
- assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
- return token->column;
+ assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
+ return token->column;
}
void token_destroy(token_t *token) {
- assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
- if (token->magic == TOK_MAGIC_1) {
- if (token->kind == TOK_CONST_STRING_ASCII) {
- free((void *)token_data(token)->data.s);
- }
- }
+ assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
+ // Don't free the string table, it's a global variable
free(token);
}</code></pre>
</div>
@@ -755,7 +762,10 @@ void token_destroy(token_t *token) {
<span class="block-header">
<strong class="block-title"><em><a id=":tokenizer-implementation" href="#:tokenizer-implementation">Tokenizer
implementation</a></em></strong></span>
- <pre class="prettyprint"><code class="">token_t *unget_token = NULL;
+ <pre class="prettyprint"><code class="">int line = 1;
+int column = 1;
+char file_name[1024];
+token_t *unget_token = NULL;
void tokenizer_init(const char *filename)
{
input_init(filename);
@@ -774,7 +784,25 @@ void tokenizer_destroy(void)
{
hash_table_destroy(string_table);
}
-}</code></pre>
+}
+
+token_t *read_token(void) {
+ if (unget_token != NULL) {
+ token_t *tok = unget_token;
+ unget_token = NULL;
+ return tok;
+ }
+ // What goes here?
+ int c = input_getc();
+ if (c == EOF) {
+ return NULL;
+ }
+ tok_warn(
+ "Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c,
+ line, column);
+ return read_token();
+}
+</code></pre>
</div>
<p>Now we can implement the meat of the tokenizer. The general structure is as follows: </p>
<ol>
@@ -797,7 +825,7 @@ void tokenizer_destroy(void)
<span class="block-header">
<strong class="block-title"><em><a id=":skip-whitespace" href="#:skip-whitespace">Skip
whitespace</a></em></strong></span>
- <pre class="prettyprint"><code class="">static void skip_whitespace(void) {
+ <pre class="prettyprint"><code class="">static token_t *skip_whitespace(void) {
int c;
while ((c = input_getc()) != EOF) {
if (isspace(c)) {
@@ -807,6 +835,23 @@ void tokenizer_destroy(void)
} else {
column++;
}
+ } else if (c == '#') // GCC preprocessor line control directive.
+ {
+ char buf[512];
+ int i = 0;
+ while ((c = input_getc()) != EOF && c != '\n') {
+        if (i < 511) // avoid overflowing the 512-byte directive buffer
+          buf[i++] = c;
+ column++;
+ }
+ buf[i] = '\0';
+ if (sscanf(buf, "%d \"%[^\"]\"", &line, file_name) == 2) {
+ column = 1;
+ } else {
+ tok_error("Invalid #line directive\n");
+ }
+ if (c == EOF) {
+ return NULL;
+ }
} else if (c == '/') {
c = input_getc();
if (c == '/') {
@@ -814,7 +859,7 @@ void tokenizer_destroy(void)
column++;
}
if (c == EOF) {
- return;
+ return NULL;
}
line++;
column = 1;
@@ -833,17 +878,20 @@ void tokenizer_destroy(void)
}
}
if (c == EOF) {
- return;
+ return NULL;
}
} else {
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_DIV, line, column, 2);
input_ungetc(c);
- return;
+ return token_create(TOK_OP_DIV, line, column, 1);
}
} else {
input_ungetc(c);
- return;
+ return NULL;
}
}
+ return NULL;
}</code></pre>
</div>
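+  <p>One thing the code above handles that the prose hasn't mentioned yet: the <code>'#'</code> branch recognizes the
+    line markers GCC's preprocessor emits (for example <code># 42 "main.c"</code>) so that errors report positions in
+    the original source file. A standalone sketch of what that sscanf pattern recovers (the directive text here is a
+    made-up example): </p>
+  <div class="code-block">
+    <span class="file-name">N/A</span>
+    <span class="block-header">
+      <strong class="block-title"><em><a id=":line-marker-sketch" href="#:line-marker-sketch">Line marker sketch</a></em></strong></span>
+    <pre class="prettyprint"><code class="">// Standalone sketch; needs stdio.h for sscanf and printf.
+int main(void) {
+  // Everything that follows the '#' of a hypothetical line marker.
+  const char *directive = " 42 \"main.c\"";
+  int line;
+  char file_name[1024];
+  if (sscanf(directive, "%d \"%[^\"]\"", &line, file_name) == 2)
+    printf("now at line %d of %s\n", line, file_name); // now at line 42 of main.c
+  return 0;
+}</code></pre>
+  </div>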
<p>Now we'll implement code to recognize keywords and identifiers. The rules for identifiers are that they must
@@ -858,95 +906,1190 @@ void tokenizer_destroy(void)
char buf[1024];
int i = 0;
c = input_getc();
- if (!isalpha(c) && c != '_') {
+ if (!isalpha(c) && c != '_') {
input_ungetc(c);
return NULL;
}
buf[i++] = c;
while ((c = input_getc()) != EOF) {
- if (!isalnum(c) && c != '_') {
+ if (!isalnum(c) && c != '_') {
input_ungetc(c);
break;
}
buf[i++] = c;
+ if (i >= 1008) {
+ tok_error("Identifier too long\n");
+ exit(1);
+ }
}
buf[i] = '\0';
-
+ column += i;
// Check if it's a keyword
- c_token_types kind = get_keyword(buf);
-
- if (kind == TOK_ID) {
- // Check if it's a typedeffed identifier
- int len = strlen(buf);
- if (len > 2 && buf[len - 2] == '_' && buf[len - 1] == 't') {
- kind = TOK_TID;
- }
+ c_token_types kind = get_keyword(buf, i);
+ if (kind != TOK_ID) {
+ return token_create(kind, line, column, i);
}
- return token_create_string(kind, line, column, buf);
+ return token_create_string(kind, line, column, buf, i);
}</code></pre>
</div>
+ <p> We have to match the following keywords: </p>
+ <ul>
+ <li>Control Keywords: <code>if</code>, <code>else</code>, <code>switch</code>, <code>case</code>,
+      <code>default</code>, <code>while</code>, <code>do</code>, <code>for</code>, <code>break</code>,
+      <code>continue</code>, <code>return</code>, <code>goto</code>
+ </li>
+ <li>Type Keywords: <code>void</code>, <code>char</code>, <code>short</code>, <code>int</code>, <code>long</code>,
+      <code>float</code>, <code>double</code>, <code>signed</code>, <code>unsigned</code>, <code>struct</code>,
+      <code>union</code>, <code>enum</code>, <code>typedef</code>
+ </li>
+ <li>Storage Class/Specifier Keywords: <code>auto</code>, <code>register</code>, <code>static</code>,
+ <code>extern</code>, <code>const</code>, <code>volatile</code>
+ </li>
+ <li>Misc Keywords: <code>sizeof</code></li>
+ </ul>
<p>For recognizing keywords, I'll use a decision tree. This is a simple and space-efficient way to recognize
keywords. Originally I wanted to use a trie, but those end up using a lot of memory for a small number of
items. </p>
- <p> To generate the decision tree, I'll use a simple C program which works off the following template: </p>
- <div class="code-block">
- <span class="file-name">N/A</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":keyword-decision-tree" href="#:keyword-decision-tree">Keyword
- decision tree</a></em></strong></span>
- <pre class="prettyprint"><code class="">c_token_types get_keyword(const char *buf) {
+ <p> To build that, we first need to figure out what keywords have common prefixes. We can do that by sorting them:
+ </p>
+ <div class="code-block">
+ <span class="file-name">N/A</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":keyword-sort" href="#:keyword-sort">Keyword
+ sort</a></em></strong></span>
+ <pre class="prettyprint"><code class="">auto
+break
+case
+char
+const
+continue
+default
+do
+double
+else
+enum
+extern
+float
+for
+goto
+if
+int
+long
+register
+return
+short
+signed
+sizeof
+static
+struct
+switch
+typedef
+union
+unsigned
+void
+volatile
+while</code></pre>
+ </div>
+ <p>From this, the organization is pretty clear. </p>
+ <div class="code-block">
+ <span class="file-name">N/A</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":keyword-decision-tree" href="#:keyword-decision-tree">Keyword
+ decision tree</a></em></strong></span>
+ <pre class="prettyprint"><code class="">c_token_types get_keyword(const char *buf, int len) {
switch (buf[0]) {
- case [first letter]:
- switch (buf[1]) {
- case [second letter]:
- switch (buf[2]) {
- case [third letter]:
- return [keyword];
- default:
- return TOK_ID;
- }
- default:
- return TOK_ID;
+ case 'a':
+ if (len == 4 && buf[1] == 'u' && buf[2] == 't' && buf[3] == 'o')
+ return TOK_SCSK_AUTO;
+ break;
+
+ case 'b':
+ if (len == 5 && buf[1] == 'r' && buf[2] == 'e' && buf[3] == 'a' &&
+ buf[4] == 'k')
+ return TOK_CTK_BREAK;
+ break;
+
+ case 'c':
+ switch (buf[1]) {
+ case 'a':
+ if (len == 4 && buf[2] == 's' && buf[3] == 'e')
+ return TOK_CTK_CASE;
+ break;
+ case 'h':
+ if (len == 4 && buf[2] == 'a' && buf[3] == 'r')
+ return TOK_TK_CHAR;
+ break;
+ case 'o':
+ if (len == 5 && buf[2] == 'n' && buf[3] == 's' && buf[4] == 't')
+ return TOK_SCSK_CONST;
+ if (len == 8 && buf[2] == 'n' && buf[3] == 't' && buf[4] == 'i' &&
+ buf[5] == 'n' && buf[6] == 'u' && buf[7] == 'e')
+ return TOK_CTK_CONTINUE;
+ break;
+ }
+ break;
+
+ case 'd':
+ switch (buf[1]) {
+ case 'e':
+ if (len == 7 && buf[2] == 'f' && buf[3] == 'a' && buf[4] == 'u' &&
+ buf[5] == 'l' && buf[6] == 't')
+ return TOK_CTK_DEFAULT;
+ break;
+ case 'o':
+ if (len == 2 && buf[2] == '\0')
+ return TOK_CTK_DO;
+ if (len == 6 && buf[2] == 'u' && buf[3] == 'b' && buf[4] == 'l' &&
+ buf[5] == 'e')
+ return TOK_TK_DOUBLE;
+ break;
+ }
+ break;
+
+ case 'e':
+ switch (buf[1]) {
+ case 'l':
+ if (len == 4 && buf[2] == 's' && buf[3] == 'e')
+ return TOK_CTK_ELSE;
+ break;
+ case 'n':
+ if (len == 4 && buf[2] == 'u' && buf[3] == 'm')
+ return TOK_TK_ENUM;
+ break;
+ case 'x':
+ if (len == 6 && buf[2] == 't' && buf[3] == 'e' && buf[4] == 'r' &&
+ buf[5] == 'n')
+ return TOK_SCSK_EXTERN;
+ break;
+ }
+ break;
+
+ case 'f':
+ switch (buf[1]) {
+ case 'l':
+ if (len == 5 && buf[2] == 'o' && buf[3] == 'a' && buf[4] == 't')
+ return TOK_TK_FLOAT;
+ break;
+ case 'o':
+ if (len == 3 && buf[2] == 'r')
+ return TOK_CTK_FOR;
+ break;
+ }
+ break;
+
+ case 'g':
+ if (len == 4 && buf[1] == 'o' && buf[2] == 't' && buf[3] == 'o')
+ return TOK_CTK_GOTO;
+ break;
+
+ case 'i':
+ switch (buf[1]) {
+ case 'f':
+ if (len == 2 && buf[2] == '\0')
+ return TOK_CTK_IF;
+ break;
+ case 'n':
+ if (len == 3 && buf[2] == 't')
+ return TOK_TK_INT;
+ break;
+ }
+ break;
+
+ case 'l':
+ if (len == 4 && buf[1] == 'o' && buf[2] == 'n' && buf[3] == 'g')
+ return TOK_TK_LONG;
+ break;
+
+ case 'r':
+ switch (buf[1]) {
+ case 'e':
+ if (len == 8 && buf[2] == 'g' && buf[3] == 'i' && buf[4] == 's' &&
+ buf[5] == 't' && buf[6] == 'e' && buf[7] == 'r')
+ return TOK_SCSK_REGISTER;
+ if (len == 6 && buf[2] == 't' && buf[3] == 'u' && buf[4] == 'r' &&
+ buf[5] == 'n')
+ return TOK_CTK_RETURN;
+ break;
+ }
+ break;
+
+ case 's':
+ switch (buf[1]) {
+ case 'h':
+ if (len == 5 && buf[2] == 'o' && buf[3] == 'r' && buf[4] == 't')
+ return TOK_TK_SHORT;
+ break;
+ case 't':
+ if (len == 6 && buf[2] == 'a' && buf[3] == 't' && buf[4] == 'i' &&
+ buf[5] == 'c')
+ return TOK_SCSK_STATIC;
+ break;
+ case 'i':
+ if (len == 6 && buf[2] == 'g' && buf[3] == 'n' && buf[4] == 'e' &&
+ buf[5] == 'd')
+ return TOK_TK_SIGNED;
+ if (len == 6 && buf[2] == 'z' && buf[3] == 'e' && buf[4] == 'o' &&
+ buf[5] == 'f')
+ return TOK_MK_SIZEOF;
+ break;
+ case 'r':
+ if (len == 6 && buf[2] == 'u' && buf[3] == 'c' && buf[4] == 't')
+ return TOK_TK_STRUCT;
+ break;
+ case 'w':
+ if (len == 6 && buf[2] == 'i' && buf[3] == 't' && buf[4] == 'c' &&
+ buf[5] == 'h')
+ return TOK_CTK_SWITCH;
+ break;
+ }
+ break;
+
+ case 't':
+ if (len == 7 && buf[1] == 'y' && buf[2] == 'p' && buf[3] == 'e' &&
+ buf[4] == 'd' && buf[5] == 'e' && buf[6] == 'f')
+ return TOK_TK_TYPEDEF;
+ break;
+
+ case 'u':
+ switch (buf[1]) {
+ case 'n':
+ if (len == 5 && buf[2] == 'i' && buf[3] == 'o' && buf[4] == 'n')
+ return TOK_TK_UNION;
+ if (len == 8 && buf[2] == 's' && buf[3] == 'i' && buf[4] == 'g' &&
+ buf[5] == 'n' && buf[6] == 'e' && buf[7] == 'd')
+ return TOK_TK_UNSIGNED;
+ break;
+ }
+ break;
+
+ case 'v':
+ switch (buf[1]) {
+ case 'o':
+ if (len == 4 && buf[2] == 'i' && buf[3] == 'd')
+ return TOK_TK_VOID;
+ if (len == 8 && buf[2] == 'l' && buf[3] == 'a' && buf[4] == 't' &&
+ buf[5] == 'i' && buf[6] == 'l' && buf[7] == 'e')
+ return TOK_SCSK_VOLATILE;
+ break;
+ }
+ break;
+
+ case 'w':
+ if (len == 5 && buf[1] == 'h' && buf[2] == 'i' && buf[3] == 'l' &&
+ buf[4] == 'e')
+ return TOK_CTK_WHILE;
+ break;
+
+ default:
+ return TOK_ID;
+ }
+ return TOK_ID;
+}</code></pre>
+ </div>
+  <p> It's very tedious, but it performs well. You might wonder why I didn't use strcmp; the answer is that this
+    approach turned out faster in testing, and since I want this compiler to be self-hosting, I can't rely on
+    optimizations in the standard library. </p>
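+  <p>For contrast, here is roughly what a strcmp-based lookup would look like (a sketch with most of the table
+    elided; the token names are the same ones used in the tree above): </p>
+  <div class="code-block">
+    <span class="file-name">N/A</span>
+    <span class="block-header">
+      <strong class="block-title"><em><a id=":strcmp-keyword-lookup" href="#:strcmp-keyword-lookup">strcmp keyword lookup sketch</a></em></strong></span>
+    <pre class="prettyprint"><code class="">// Hypothetical alternative: a linear strcmp scan instead of the decision tree.
+// Needs string.h; the keyword table is abbreviated here.
+static const struct {
+  const char *name;
+  c_token_types kind;
+} keywords[] = {
+    {"auto", TOK_SCSK_AUTO}, {"break", TOK_CTK_BREAK}, {"case", TOK_CTK_CASE},
+    // ... the remaining keywords ...
+    {"while", TOK_CTK_WHILE},
+};
+
+static c_token_types get_keyword_strcmp(const char *buf, int len) {
+  (void)len; // the decision tree uses the length; strcmp does not need it
+  for (size_t k = 0; k < sizeof(keywords) / sizeof(keywords[0]); k++) {
+    if (strcmp(buf, keywords[k].name) == 0)
+      return keywords[k].kind;
+  }
+  return TOK_ID;
+}</code></pre>
+  </div>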
<p>We'll use the same approach for recognizing operators and separators. Let's list them all out (Table 2-3 in the C
    Reference Manual). </p>
+ <div class="code-block">
+ <span class="file-name">N/A</span>
+ <span class="block-header">
+      <strong class="block-title"><em><a id=":operators" href="#:operators">Operators</a></em></strong></span>
+ <pre class="prettyprint"><code class="">!
+!=
+%
+%=
+&
+&&
+&=
+(
+)
+*
+*=
++
+++
++=
+,
+-
+--
+-=
+->
+.
+...
+/
+/=
+:
+;
+<
+<<
+<<=
+<=
+=
+==
+>
+>=
+>>
+>>=
+?
+[
+]
+^
+^=
+{
+|
+|=
+||
+}
+~</code></pre>
+ </div>
+ <p>And the decision tree for operators: </p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":operator-decision-tree" href="#:operator-decision-tree">Operator
+ decision tree</a></em></strong></span>
+ <pre class="prettyprint"><code class="">token_t *read_operator(void) {
+ int c;
+ c = input_getc();
+ switch (c) {
+ case '!': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_NE, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_NOT, line, column, 1);
+ }
+ case '%': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_MOD, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_MOD, line, column, 1);
+ }
+ case '&': {
+ c = input_getc();
+ if (c == '&')
+ return token_create(TOK_OP_AND, line, column, 2);
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_BITAND, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_BIT_AND, line, column, 1);
+ }
+ case '(':
+ return token_create(TOK_SEP_LEFT_PAREN, line, column, 1);
+ case ')':
+ return token_create(TOK_SEP_RIGHT_PAREN, line, column, 1);
+ case '*': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_MUL, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_MUL, line, column, 1);
+ }
+ case '+': {
+ c = input_getc();
+ if (c == '+')
+ return token_create(TOK_OP_INC, line, column, 2);
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_ADD, line, column, 2);
+ input_ungetc(c);
+      return token_create(TOK_OP_ADD, line, column, 1);
+ }
+ case ',':
+ return token_create(TOK_SEP_COMMA, line, column, 1);
+ case '-': {
+ c = input_getc();
+ if (c == '-')
+ return token_create(TOK_OP_DEC, line, column, 2);
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_SUB, line, column, 2);
+ if (c == '>')
+ return token_create(TOK_OP_MEMBER_POINTER, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_SUB, line, column, 1);
+ }
+ case '.': {
+ c = input_getc();
+ if (c == '.') {
+ c = input_getc();
+ if (c == '.') {
+ return token_create(TOK_SEP_ELLIPSIS, line, column, 3);
+ } else {
+ // Bail out, can't store more than one unget
+ tok_error("Unexpected character '.' at line %d, column %d\n", line,
+ column);
+ exit(1);
}
- default:
- return TOK_ID;
+ }
+    return token_create(TOK_OP_MEMBER, line, column, 1);
+ }
+ case '/': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_DIV, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_DIV, line, column, 1);
+ }
+ case ':':
+ return token_create(TOK_OP_COND_DECISION, line, column, 1);
+ case ';':
+ return token_create(TOK_SEP_SEMICOLON, line, column, 1);
+ case '<': {
+ c = input_getc();
+ if (c == '<') {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_LSHIFT, line, column, 3);
+ input_ungetc(c);
+ return token_create(TOK_OP_LSHIFT, line, column, 2);
+ }
+ if (c == '=')
+ return token_create(TOK_OP_LE, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_LT, line, column, 1);
+ }
+ case '=': {
+ c = input_getc();
+ if (c == '=')
+      return token_create(TOK_OP_EQ, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_ASSIGN, line, column, 1);
+ }
+ case '>': {
+ c = input_getc();
+ if (c == '>') {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_RSHIFT, line, column, 3);
+ input_ungetc(c);
+ return token_create(TOK_OP_RSHIFT, line, column, 2);
+ }
+ if (c == '=')
+ return token_create(TOK_OP_GE, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_GT, line, column, 1);
+ }
+ case '?':
+ return token_create(TOK_OP_COND, line, column, 1);
+ case '[':
+ return token_create(TOK_SEP_LEFT_BRACKET, line, column, 1);
+ case ']':
+ return token_create(TOK_SEP_RIGHT_BRACKET, line, column, 1);
+ case '^': {
+ c = input_getc();
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_BITXOR, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_BIT_XOR, line, column, 1);
+ }
+ case '{':
+ return token_create(TOK_SEP_LEFT_BRACE, line, column, 1);
+ case '|': {
+ c = input_getc();
+ if (c == '|')
+ return token_create(TOK_OP_OR, line, column, 2);
+ if (c == '=')
+ return token_create(TOK_OP_ASSIGN_BITOR, line, column, 2);
+ input_ungetc(c);
+ return token_create(TOK_OP_BIT_OR, line, column, 1);
+ }
+ case '}':
+ return token_create(TOK_SEP_RIGHT_BRACE, line, column, 1);
+ case '~':
+ return token_create(TOK_OP_BIT_NOT, line, column, 1);
+ default:
+ input_ungetc(c);
+ return NULL;
+ }
+}
+</code></pre>
+ </div>
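+  <p>One behaviour worth calling out: each case peeks ahead and tests the longer spellings first, so the tree always
+    takes the longest operator that matches. A few examples of how that splits input (this is source text to be
+    tokenized, not compiler code): </p>
+  <div class="code-block">
+    <span class="file-name">N/A</span>
+    <span class="block-header">
+      <strong class="block-title"><em><a id=":operator-examples" href="#:operator-examples">Operator examples</a></em></strong></span>
+    <pre class="prettyprint"><code class="">// Input source and the operator tokens read_operator produces for it.
+a >>= b;  // '>>=' becomes a single TOK_OP_ASSIGN_RSHIFT, not '>' '>' '='
+x <= y;   // '<=' becomes TOK_OP_LE rather than '<' '='
+p->next;  // '->' becomes TOK_OP_MEMBER_POINTER
+i++;      // '++' becomes TOK_OP_INC</code></pre>
+  </div>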
+ <p>Now we can implement the functions for reading numbers. C recognizes the following grammar for numbers (if you
+ haven't seen a formal grammar before, you read '::=' as "can be", '|' as "or", and '[]' as "optional", so an
+ integer constant can be a decimal constant with an optional suffix, an octal constant with an optional suffix, or
+ a hexadecimal constant with an optional suffix): </p>
+ <div
+ style="color: #333; background-color: #f8f8f8; border: 1px solid #ccc; border-radius: 4px; margin: 20px 0; padding: 20px;">
+ <pre class="prettyprint"><code class="">constant ::= integer-constant | floating-constant
+
+ integer-constant ::= decimal-constant [integer-suffix]
+ | octal-constant [integer-suffix]
+ | hexadecimal-constant [integer-suffix]
+
+ decimal-constant ::= nonzero-digit [digit-sequence] | '0'
+
+ octal-constant ::= '0' octal-digit-sequence
+
+ hexadecimal-constant ::= '0x' hexadecimal-digit-sequence
+ | '0X' hexadecimal-digit-sequence
+
+ integer-suffix ::= 'u' | 'U' | 'l' | 'L'
+ | 'ul' | 'UL' | 'lu' | 'LU'
+
+ floating-constant ::= fractional-constant [exponent-part] [floating-suffix]
+ | digit-sequence exponent-part [floating-suffix]
+
+ fractional-constant ::= digit-sequence '.' [digit-sequence]
+ | '.' digit-sequence
+
+ exponent-part ::= 'e' [sign] digit-sequence
+ | 'E' [sign] digit-sequence
+
+ sign ::= '+' | '-'
+
+ floating-suffix ::= 'f' | 'F'
+
+ digit-sequence ::= digit [digit-sequence]
+
+ nonzero-digit ::= '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
+
+ octal-digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7'
+
+ hexadecimal-digit ::= digit
+ | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'
+ | 'A' | 'B' | 'C' | 'D' | 'E' | 'F'
+
+ digit ::= '0' | nonzero-digit
+</code></pre>
+ </div>
+ <p>Where the following semantics apply: </p>
+ <ul>
+ <li>Integer constants are evaluated according to their radix, where <code>decimal-constant</code> is base 10,
+ <code>octal-constant</code> is base 8, and <code>hexadecimal-constant</code> is base 16.
+ </li>
+ <li>Integer suffixes are used to specify the type of the constant. A suffix of <code>u</code> or <code>U</code>
+ specifies an unsigned integer, and a suffix of <code>l</code> or <code>L</code> specifies a long integer. </li>
+ <li>Suffixes can be combined, so <code>ul</code> specifies an unsigned long integer. </li>
+ <li>Floating constants are evaluated as double precision unless a suffix of <code>f</code> or <code>F</code> is
+ used, in which case they are evaluated as single precision. </li>
+ </ul>
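+  <p>To make those rules concrete, here are a few example literals and the token kinds the tokenizer assigns to them
+    (the kinds are the ones produced by read_number, shown below): </p>
+  <div class="code-block">
+    <span class="file-name">N/A</span>
+    <span class="block-header">
+      <strong class="block-title"><em><a id=":constant-examples" href="#:constant-examples">Constant examples</a></em></strong></span>
+    <pre class="prettyprint"><code class="">// Example constants and the token kinds the suffix rules above give them.
+int a = 10;             // TOK_CONST_INTEGER_S32: plain decimal
+unsigned b = 10u;       // TOK_CONST_INTEGER_U32: 'u' suffix
+unsigned long c = 10ul; // TOK_CONST_INTEGER_U64: 'u' and 'l' suffixes
+int d = 012;            // TOK_CONST_INTEGER_S32: leading 0 means octal (value 10)
+int e = 0x1F;           // TOK_CONST_INTEGER_S32: hexadecimal (value 31)
+double f = 1.5;         // TOK_CONST_FLOAT_64: double precision by default
+float g = 1.5f;         // TOK_CONST_FLOAT_32: 'f' suffix
+double h = 1e3;         // TOK_CONST_FLOAT_64: an exponent forces a floating type</code></pre>
+  </div>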
+  <p> Looking at these rules, we can see that all valid numeric constants start with a digit from 0-9 or a period.
+    Since periods are also used for struct member access, we'll have to handle that case as well. </p>
+ <p> The code for reading numbers is pretty complex, so I'll comment it heavily. </p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":read-number" href="#:read-number">Read
+ number</a></em></strong></span>
+ <pre class="prettyprint"><code class="">
+static token_t *read_number(void) {
+ int c;
+ char buf[1024];
+ int i = 0;
+ c = input_getc();
+ // If we don't have a digit or decimal point, it's not a number
+ if (!isdigit(c) && c != '.') {
+ input_ungetc(c);
+ return NULL;
+ }
+ // Decimal point followed by non-digit is a struct member
+ if (c == '.') {
+ char cnext = input_getc();
+ if (!isdigit(cnext)) {
+ input_ungetc(cnext);
+ return token_create(TOK_OP_MEMBER, line, column, 1);
+ }
+ input_ungetc(cnext);
+ }
+ int radix = 10;
+ int is_float = 0;
+ // Check for hex and octal.
+ if (c == '0') {
+ char cnext = input_getc();
+ if (cnext == 'x' || cnext == 'X') {
+ // Hex, discard the 0x
+ radix = 16;
+ } else {
+ // Octal, discard the 0
+ input_ungetc(cnext);
+ radix = 8;
+ }
+ } else {
+ // Decimal, append the first digit
+ buf[i++] = c;
+ }
+ // Read the rest of the number
+ while ((c = input_getc()) != EOF) {
+ // Since there can be multiple writes to the buffer, we want to make sure we
+ // don't overflow by giving a 4 byte pad
+ if (i > 1020) {
+ tok_error("Number too long\n");
+ return NULL;
+ }
+ // Valid digits for the radix: 0-9 for decimal, 0-7 for octal, 0-9 and
+ // a-f/A-F for hex
+ if ((radix == 10 && isdigit(c)) || (radix == 16 && isxdigit(c)) ||
+ (radix == 8 && c >= '0' && c <= '7')) {
+ buf[i++] = c;
+ // Decimal point and not a float yet, must be a float
+ } else if (c == '.' && !is_float) {
+ is_float = 1;
+ if (radix != 10) {
+ tok_error("Invalid floating point constant, expected decimal, got %s\n",
+ radix == 16 ? "hexadecimal" : "octal");
+ return NULL;
+ }
+ buf[i++] = c;
+ }
+ // Exponent on the end of a constant. (By standard this forces it to be a
+ // float)
+ else if (c == 'e' || c == 'E') {
+ buf[i++] = c;
+ c = input_getc();
+ // Sign on the exponent
+ if (c == '+' || c == '-') {
+ buf[i++] = c;
+ c = input_getc();
+ }
+ // Exponent must be a digit, I.E no 1e1.2
+ if (!isdigit(c)) {
+ tok_error("Invalid floating point exponent\n");
+ return NULL;
+ }
+ buf[i++] = c;
+ is_float = 1;
+ } else {
+ // Reached the end, unget the character so other functions can read it
+ input_ungetc(c);
+ break;
+ }
+ }
+ buf[i] = '\0';
+ // Check for suffixes
+ int is_unsigned = 0;
+ int is_long = 0;
+ int is_single = 0;
+ // Loop to get all possible suffixes. Warn when duplicated.
+ while (1) {
+ c = input_getc();
+ if (c == 'u' || c == 'U') {
+ if (is_unsigned) {
+ tok_warn(
+ "Warning: Duplicate suffix 'u' for integer constant ignored\n");
+ }
+ is_unsigned = 1;
+ } else if (c == 'l' || c == 'L') {
+ if (is_long) {
+ tok_warn(
+ "Warning: Duplicate suffix 'l' for integer constant ignored\n");
+ }
+ is_long = 1;
+ } else if (c == 'f' || c == 'F') {
+ if (is_single) {
+ tok_warn("Warning: Duplicate suffix 'f' for floating point constant "
+ "ignored\n");
+ }
+ is_single = 1;
+ } else {
+ input_ungetc(c);
+ break;
+ }
+ }
+ // Resolve invalid suffixes. Doesn't error because you can still compile with
+ // them.
+ if (is_single && is_long) {
+ tok_warn("Warning: Invalid suffixes 'l' and 'f' for floating point "
+ "constant. Ignoring 'l'\n");
+ is_long = 0;
+ }
+ if (is_single && is_unsigned) {
+ tok_warn("Warning: Invalid suffixes 'u' and 'f' for floating point "
+ "constant. Ignoring 'u'\n");
+ is_unsigned = 0;
+ }
+ if (is_single && !is_float) {
+ tok_warn(
+ "Warning: Invalid suffix 'f' for integer constant. Ignoring 'f'\n");
+ is_single = 0;
}
+ // use the strtox functions to convert the string to a number
+ if (is_float) {
+ errno = 0;
+ // Strtod generates a unix-style error when it's given something out of
+ // range, so we want to get on top of that quickly instead of ignoring it
+ // That way we can avoid some nasty NAN-propagation in the constant folder.
+ double f = strtod(buf, NULL);
+ if (errno == ERANGE) {
+ tok_error("Floating point constant out of range\n");
+ return NULL;
+ }
+ // Warn if the constant is out of range for a float, I.E it's too big or too
+ // small
+ if (is_single && (f < FLT_MIN || f > FLT_MAX)) {
+ tok_warn(
+ "Warning: Floating point constant %f is out of range for float\n", f);
+ }
+ // Warn if the constant is too precise for a float
+ if (is_single && fabs((double)((float)f) - f) >= FLT_EPSILON) {
+ tok_warn("Warning: Converting double precision floating point constant "
+ "%f to float loses "
+ "precision\n",
+ f);
+ }
+ return token_create_float(is_single ? TOK_CONST_FLOAT_32
+ : TOK_CONST_FLOAT_64,
+ line, column, f, i);
+ } else {
+ errno = 0;
+ uint64_t int_ = strtoull(buf, NULL, radix);
+ // Same as above, but for integers
+ if (errno == ERANGE) {
+ tok_error("Integer constant out of range\n");
+ return NULL;
+ }
+ if (is_unsigned) {
+ if (is_long) {
+ return token_create_int(TOK_CONST_INTEGER_U64, line, column, int_, i);
+ } else {
+ if (int_ > UINT32_MAX) {
+ tok_warn(
+ "Warning: Integer constant %lld is out of range for unsigned "
+ "int\n",
+ int_);
+ }
+ return token_create_int(TOK_CONST_INTEGER_U32, line, column, int_, i);
+ }
+ } else {
+ if (is_long) {
+ // If the highest bit is set, that means this will overflow a signed
+ // long (Due to two's complement)
+ if (int_ & (1UL << 63)) {
+ tok_warn(
+ "Warning: Integer constant %lld is out of range for long long\n",
+              int_);
+ }
+ return token_create_int(TOK_CONST_INTEGER_S64, line, column, int_, i);
+ } else {
+ if (int_ & (1UL << 31)) {
+ tok_warn("Warning: Integer constant %lld is out of range for int\n",
+ int_);
+ }
+ return token_create_int(TOK_CONST_INTEGER_S32, line, column, int_, i);
+ }
+ }
+ }
+ return NULL;
}</code></pre>
- </div>
- <p>For example, if we just fed the program the keyword "if", it would output the following: </p>
- <div class="code-block">
- <span class="file-name">N/A</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":if-keyword" href="#:if-keyword">If
- keyword</a></em></strong></span>
- <pre class="prettyprint"><code class="">c_token_types get_keyword(const char *buf) {
- switch (buf[0]) {
- case 'i':
- switch (buf[1]) {
- case 'f':
- switch (buf[2]) {
- case '\0':
- return TOK_CTK_IF;
- default:
- return TOK_ID;
- }
- default:
- return TOK_ID;
+ </div>
+ <p>Now let's implement char constants. The rules for those are pretty simple, as I'm not doing multi-byte or wide
+ characters. </p>
+ <div
+ style="color: #333; background-color: #f8f8f8; border: 1px solid #ccc; border-radius: 4px; margin: 20px 0; padding: 20px;">
+ <pre class="prettyprint"><code class="">character-constant ::= '\'' c-char-sequence '\''
+c-char-sequence ::= c-char | c-char-sequence c-char
+c-char ::= any member of the source character set except the single-quote ', backslash \, or new-line character | escape-sequence
+
+escape-sequence ::= simple-escape-sequence | octal-escape-sequence | hexadecimal-escape-sequence
+simple-escape-sequence ::= '\' ( 'a' | 'b' | 'f' | 'n' | 'r' | 't' | 'v' | '\'' | '"' | '?' | '\\' )
+octal-escape-sequence ::= '\' octal-digit | '\' octal-digit octal-digit | '\' octal-digit octal-digit octal-digit
+hexadecimal-escape-sequence ::= '\' 'x' hexadecimal-digit | hexadecimal-escape-sequence hexadecimal-digit
+
+octal-digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7'
+hexadecimal-digit ::= digit | 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'A' | 'B' | 'C' | 'D' | 'E' | 'F'
+digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'</code></pre>
+ </div>
+ <p>We start with the code to read escape sequences (which can be reused for string literals). </p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+      <strong class="block-title"><em><a id=":read-escape-sequence" href="#:read-escape-sequence">Read
+ escape sequence</a></em></strong></span>
+ <pre class="prettyprint"><code class="">static char read_escape_sequence(int *len) {
+ int c = input_getc();
+ *len += 1;
+ switch (c) {
+ case 'a':
+ return '\a';
+ case 'b':
+ return '\b';
+ case 'f':
+ return '\f';
+ case 'n':
+ return '\n';
+ case 'r':
+ return '\r';
+ case 't':
+ return '\t';
+ case 'v':
+ return '\v';
+ case '\'':
+ return '\'';
+ case '"':
+ return '"';
+ case '?':
+ return '?';
+ case '\\':
+ return '\\';
+ case '0':
+ return '\0';
+ case 'x': {
+ c = input_getc();
+ if (!isxdigit(c)) {
+ tok_error("Invalid hexadecimal escape sequence\n");
+ return 0;
+ }
+ int val = 0;
+ while (isxdigit(c)) {
+ *len += 1;
+ val = val * 16 + (isdigit(c) ? c - '0' : tolower(c) - 'a' + 10);
+ c = input_getc();
+ }
+ input_ungetc(c);
+ return (char)val;
+ }
+ default:
+ if (!isdigit(c)) {
+ tok_error("Invalid escape sequence\n");
+ return 0;
+ }
+ int val = 0;
+ while (isdigit(c)) {
+ *len += 1;
+ val = val * 8 + c - '0';
+ c = input_getc();
+ }
+ input_ungetc(c);
+ return (char)val;
+ }
+}</code></pre>
+ </div>
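+  <p>A few examples of what the escape handling above evaluates to (the surrounding quotes are consumed by the
+    callers, which are shown next): </p>
+  <div class="code-block">
+    <span class="file-name">N/A</span>
+    <span class="block-header">
+      <strong class="block-title"><em><a id=":escape-examples" href="#:escape-examples">Escape examples</a></em></strong></span>
+    <pre class="prettyprint"><code class="">// Example escapes and the byte values read_escape_sequence returns for them.
+char nl  = '\n';   // 10: newline (simple escape)
+char tab = '\t';   //  9: horizontal tab
+char nul = '\0';   //  0: handled by the dedicated '0' case
+char hex = '\x41'; // 65: 'A', hexadecimal escape
+char oct = '\101'; // 65: 'A', octal escape (the default case)</code></pre>
+  </div>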
+ <p>Now we can implement the code to read character constants. </p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":read-char-constant" href="#:read-char-constant">Read
+ character constant</a></em></strong></span>
+ <pre class="prettyprint"><code class="">static token_t *read_char_constant(void) {
+ int c;
+ int len = 0;
+ c = input_getc();
+ if (c != '\'') {
+ input_ungetc(c);
+ return NULL;
+ }
+ len++;
+ c = input_getc();
+ if (c == '\'') {
+ tok_error("Empty character constant\n");
+ return NULL;
+ }
+ if (c == '\\') {
+ c = read_escape_sequence(&len);
+ }
+ int val = c;
+ c = input_getc();
+ if (c != '\'') {
+ tok_error("Expected closing quote for character constant\n");
+ return NULL;
+ }
+ len++;
+ return token_create_char(TOK_CONST_CHAR, line, column, val, len);
+}</code></pre>
+ </div>
+ <p>Finally, we can implement string literals. </p>
+ <p>The grammar for string literals is pretty simple. </p>
+ <div
+ style="color: #333; background-color: #f8f8f8; border: 1px solid #ccc; border-radius: 4px; margin: 20px 0; padding: 20px;">
+ <pre
+ class="prettyprint"><code class="">string-literal ::= '"' s-char-sequence '"'
+s-char-sequence ::= s-char | s-char-sequence s-char
+s-char ::= any member of the source character set except the double-quote ", backslash \, or new-line character | escape-sequence</code></pre>
+ </div>
+ <p>What that means for us is that we just have to skip over the string until we hit an unescaped double quote. </p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":read-string-literal" href="#:read-string-literal">Read
+ string literal</a></em></strong></span>
+ <pre class="prettyprint"><code class="">static token_t *read_string_literal(void) {
+ int c;
+ c = input_getc();
+ if (c != '"') {
+ input_ungetc(c);
+ return NULL;
+ }
+ int i = 0;
+ // Malloc is used for the buf here, the pointer stays function local as string
+ // interning duplicates the string
+ char s_buf[512];
+ char *buf = s_buf;
+ int len = 512;
+ while ((c = input_getc()) != EOF) {
+ if (c == '"') {
+ // Implicit skip of closing quote
+ break;
+ }
+ if (c == '\\') {
+      int esc_len = 0; // read_escape_sequence needs a length out-parameter
+      c = read_escape_sequence(&esc_len);
+ if (c == 0) {
+ return NULL;
}
- default:
- return TOK_ID;
+ }
+ if (i >= len) {
+ if (buf == s_buf) {
+ buf = malloc(1024);
+ if (buf == NULL) {
+ fputs("Out of memory. Could not parse string literal.\n", stderr);
+ exit(1);
+ }
+ memcpy(buf, s_buf, 512);
+ len *= 2;
+ } else {
+ len *= 2;
+ buf = realloc(buf, len);
+ }
+ }
+ buf[i++] = c;
+ }
+ buf[i] = '\0';
+ if (c == EOF) {
+ tok_error("Unterminated string literal\n");
+ if (buf != s_buf) {
+ free(buf);
+ }
+ return NULL;
+ }
+
+ token_t *tok =
+ token_create_string(TOK_CONST_STRING_ASCII, line, column, buf, i);
+ if (buf != s_buf) {
+ free(buf);
+ }
+ return tok;
+}</code></pre>
+ </div>
+ <p>This function uses a smaller buffer on the stack to store the string, and if it grows too large, it allocates a
+ larger buffer on the heap. This helps to avoid unnecessary allocations for small strings. </p>
+  <p>Let's recap. We've implemented code for recognizing keywords and identifiers, operators, numbers, character
+    constants, and string literals. That's everything we need for the main function. We just call them all in order. </p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":main-loop" href="#:main-loop">Main
+ loop</a></em></strong></span>
+ <pre class="prettyprint"><code class="">token_t *read_token(void) {
+ if (unget_token != NULL) {
+ token_t *tok = unget_token;
+ unget_token = NULL;
+ return tok;
+ }
+ token_t *tok = skip_whitespace();
+ if (tok != NULL) {
+ return tok;
+ }
+ tok = read_identifier();
+ if (tok != NULL) {
+ return tok;
}
+ tok = read_number();
+ if (tok != NULL) {
+ return tok;
+ }
+ tok = read_char_constant();
+ if (tok != NULL) {
+ return tok;
+ }
+ tok = read_string_literal();
+ if (tok != NULL) {
+ return tok;
+ }
+ tok = read_operator();
+ if (tok != NULL) {
+ return tok;
+ }
+ int c = input_getc();
+ if (c == EOF) {
+ return NULL;
+ }
+ tok_warn(
+ "Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c,
+ line, column);
+ return read_token();
}
</code></pre>
- </div>
- <p>The program will be placed into the main tokenizer.c file using the HIGH QUALITY TECHNIQUE of including a c file
- in another c file. </p>
- <p> Time to write the program. </p>
-
+ </div>
+ <p>With that, we're basically done.</p>
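+  <p>As a quick sanity check, a minimal driver can run the whole pipeline end to end. This is a sketch rather than
+    part of the tokenizer: <code>test.c</code> is a placeholder input file, and stdio.h and the tokenizer header are
+    assumed. </p>
+  <div class="code-block">
+    <span class="file-name">N/A</span>
+    <span class="block-header">
+      <strong class="block-title"><em><a id=":driver-sketch" href="#:driver-sketch">Driver sketch</a></em></strong></span>
+    <pre class="prettyprint"><code class="">// Minimal driver sketch: lex a file and count the tokens.
+int main(void) {
+  tokenizer_init("test.c");
+  int count = 0;
+  token_t *tok;
+  while ((tok = read_token()) != NULL) {
+    count++;
+    token_destroy(tok);
+  }
+  tokenizer_destroy();
+  printf("%d tokens\n", count);
+  return 0;
+}</code></pre>
+  </div>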
+ <h3>Error Handling</h3>
+ <p>One thing we haven't covered yet is error handling. We've used <code>tok_error</code> and <code>tok_warn</code>
+ to print errors and warnings, but we haven't actually implemented them. Let's do that now. </p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":error-handling" href="#:error-handling">Error
+ handling</a></em></strong></span>
+ <pre class="prettyprint"><code class="">void tok_error(const char *fmt, ...) {
+ va_list args;
+ va_start(args, fmt);
+ fprintf(stderr, "Error in file %s at line %d, column %d: ", file_name, line,
+ column);
+ vfprintf(stderr, fmt, args);
+ va_end(args);
+}
+void tok_warn(const char *fmt, ...) {
+ va_list args;
+ va_start(args, fmt);
+ fprintf(stderr, "Warning in file %s at line %d, column %d: ", file_name, line,
+ column);
+ vfprintf(stderr, fmt, args);
+ va_end(args);
+}</code></pre>
+ </div>
+ <h3>Debugging</h3>
+ <p>It'd be helpful to have a way to print out the tokens we're reading. Let's add a function for that. </p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":print-token" href="#:print-token">Print
+ token</a></em></strong></span>
+ <pre class="prettyprint"><code class="">char *re_escape_string(const char *str) {
+ int len = strlen(str);
+ char *buf = malloc(len * 2 + 1);
+ if (buf == NULL) {
+ fprintf(stderr, "Out of memory. Cannot escape string\n");
+ exit(1);
+ }
+ int i = 0;
+ for (int j = 0; j < len; j++) {
+ switch (str[j]) {
+ case '\a':
+ buf[i++] = '\\';
+ buf[i++] = 'a';
+ break;
+ case '\b':
+ buf[i++] = '\\';
+ buf[i++] = 'b';
+ break;
+ case '\f':
+ buf[i++] = '\\';
+ buf[i++] = 'f';
+ break;
+ case '\n':
+ buf[i++] = '\\';
+ buf[i++] = 'n';
+ break;
+ case '\r':
+ buf[i++] = '\\';
+ buf[i++] = 'r';
+ break;
+ case '\t':
+ buf[i++] = '\\';
+ buf[i++] = 't';
+ break;
+ case '\v':
+ buf[i++] = '\\';
+ buf[i++] = 'v';
+ break;
+ case '\\':
+ buf[i++] = '\\';
+ buf[i++] = '\\';
+ break;
+ case '\'':
+ buf[i++] = '\\';
+ buf[i++] = '\'';
+ break;
+ case '"':
+ buf[i++] = '\\';
+ buf[i++] = '"';
+ break;
+ default:
+ buf[i++] = str[j];
+ break;
+ }
+ }
+ buf[i] = '\0';
+ return buf;
+}
+
+void print_token(token_t *tok) {
+ if (tok == NULL) {
+ printf("NULL\n");
+ return;
+ }
+ const char *name = token_name_from_type(tok->kind);
+ switch (tok->kind) {
+ case TOK_ID:
+ case TOK_CONST_STRING_ASCII: {
+ char *escaped = re_escape_string(token_string(tok));
+ printf("%s: \"%s\"@%d:%d\n", name, escaped, tok->line, tok->column);
+ free(escaped);
+ break;
+ }
+ case TOK_CONST_CHAR:
+ printf("%s: '%c'@%d:%d\n", name, token_char(tok), tok->line, tok->column);
+ break;
+ case TOK_CONST_INTEGER_S32:
+ case TOK_CONST_INTEGER_U32:
+ case TOK_CONST_INTEGER_S64:
+ case TOK_CONST_INTEGER_U64:
+ printf("%s: %ld@%d:%d\n", name, token_int(tok), tok->line, tok->column);
+ break;
+ case TOK_CONST_FLOAT_32:
+ case TOK_CONST_FLOAT_64:
+ printf("%s: %f@%d:%d\n", name, token_float(tok), tok->line, tok->column);
+ break;
+ default:
+ printf("%s@%d:%d\n", name, tok->line, tok->column);
+ break;
+ }
+}</code></pre>
+ </div>
+  <p>The name of the token is retrieved using <code>token_name_from_type</code>, which is a simple switch statement.
+    It's long and boring; check the source code if you're interested. </p>
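+  <p>For a sense of its shape, here is a heavily abbreviated sketch; the names returned are illustrative, and the
+    real switch has a case for every token kind. </p>
+  <div class="code-block">
+    <span class="file-name">N/A</span>
+    <span class="block-header">
+      <strong class="block-title"><em><a id=":token-name-sketch" href="#:token-name-sketch">Token name sketch</a></em></strong></span>
+    <pre class="prettyprint"><code class="">// Abbreviated sketch of token_name_from_type.
+const char *token_name_from_type(c_token_types kind) {
+  switch (kind) {
+  case TOK_ID:                 return "identifier";
+  case TOK_CONST_STRING_ASCII: return "string constant";
+  case TOK_CONST_CHAR:         return "character constant";
+  case TOK_CTK_IF:             return "'if'";
+  case TOK_OP_ADD:             return "'+'";
+  // ... one case per remaining token kind ...
+  default:                     return "unknown token";
+  }
+}</code></pre>
+  </div>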
+ <h3>Bugs/Errata</h3>
+ <p> I wrote this code in a single sitting, so there are bound to be bugs. I'll list them here as I find them. The
+ code you see here is the final version, with all bugs fixed. </p>
+ <ul>
+    <li><code>input_getc</code> had <code>buffer_pos == buffer_size - 1</code>, left over from an attempt at
+      lookahead that didn't work out and that I forgot to remove. It caused fallthrough to the <code>buffer_size == 0</code>
+      check, which, if true, returns EOF, preventing input initialization. Fixed by changing the check to
+      <code>buffer_pos == buffer_size</code>.
+ </li>
+ <li>assertion <code>token->kind == TOK_CONST_STRING_ASCII</code> failed in token_string. Forgot
+ to expand check for identifiers which also use token_string. Fixed by changing to
+ <code>token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID || token->kind == TOK_TID</code>.
+ </li>
+ <li> token_create_string - call to <code>hash_table_get</code> with freed key. Fixed by moving
+ the call to free after
+ the call to <code>hash_table_get</code>.</li>
+ <li>ibid - Design of hash table and call to <code>hash_table_get</code> in token_create_string created
+ double free. Fixed by
+ rewriting part of function.</li>
+ <li>Tokenizer missing code to handle GCC preprocessor line directives. Fixed by adding code to handle them.</li>
+ <li>Destructor for string literals missing in tokenizer teardown, added it in.</li>
+ <li> read_number - check <code>int_ > INT32_MAX</code> does not work due to some weird casting.
+ Added explicit cast.</li>
+ <li>read_char_constant - Forgot to handle '\0'. Added.</li>
+ <li>skip_whitespace - When a division operator occurs in code, skip_whitespace assumes it's a comment. Fixed by
+ adding a check for division operator.</li>
+ <li>hash_string - Optimization, not a bug, Dragon Book hash function not very fast due to misprediction. Replaced
+ with ELF hash function.</li>
+ <li>read_identifier - strlen gets called 3 times even though we already get the string len by incrementing an
+ array index. Ew. Used i instead of strlen.</li>
+ <li>read_identifier - stringized version of keywords stored, not needed. Code added to call token_create instead
+ of token_create_string for keywords.</li>
+ <li>Everywhere - Checks added for memory allocation failure.</li>
+    <li>Not a bug. Removed the separate token type for TID. I'll try to handle it in the parser.</li>
+ </ul>
+ <h3>Conclusion</h3>
+ <p>That's it! The C Minus tokenizer is complete. It's hopefully pretty understandable, and given the testing I've put it through, it should be pretty robust. </p>
+ <p>Next time, we'll start on the parser. </p>
+  <h1>Source code, bibliography</h1>
+ <p>Source code for the tokenizer is available <a href="/projects/cminus/code/tokenizer.c">here</a>, header file is
+ available <a href="/projects/cminus/code/tokenizer.h">here</a>. </p>
+ <p>Source code for the input module is available <a href="/projects/cminus/code/input.c">here</a>, header file is
+ available <a href="/projects/cminus/code/input.h">here</a>. </p>
+ <p>Source code for the hash table is available <a href="/projects/cminus/code/hash_table.c">here</a>, header file is
+ available <a href="/projects/cminus/code/hash_table.h">here</a>. </p>
+ <p>A lot of the logic for this project is from either the Dragon Book, Engineering a Compiler, or LCC: A Retargetable
+ Compiler for ANSI C. Grammars are from The C Reference Manual.</p>
+ <p> I got the idea for using zero-width arrays for optional content (the struct hack) from hacking on some weird virtio drivers (they seem to love it).</p>
+ <p> Crafting Interpreters by Bob Nystrom inspired me to do this project, so if you see any similarities, there's probably some unintentional influence there.</p>
+ <p> The code for the ELF hash function is from the glibc source code.</p>
+ <p> The idea for decision trees came from LCC.</p>
</main>
- <footer style="text-align: center; padding: 20px;">
+ <footer style=" text-align: center; padding: 20px;">
<p>© 2021 Reagan Fischer. If for some reason you want to use my AMAZING code (lol), it's available under the MIT
license <a href="/projects/cminus/code/LICENSE.md">here</a>.</p>
</footer>