website

Website contents
git clone git://git.reagancfischer.dev/website.git
Log | Files | Refs

commit 80be860aa6798159a32a91aa6a2d5ef46680e4ee
parent c3abcec683384c5aaa5140189f4226cafed68239
Author: Reagan <rfische2@uccs.edu>
Date:   Thu, 22 Aug 2024 01:24:54 -0600

Lexer

Diffstat:
Mblog.html | 1-
Mprojects/cminus/code/hash_table.c | 14+++++++++++---
Mprojects/cminus/code/hash_table.h | 9+++++----
Mprojects/cminus/code/input.c | 17++++++++---------
Mprojects/cminus/code/input.h | 3+--
Mprojects/cminus/code/tokenizer.c | 1299++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mprojects/cminus/code/tokenizer.h | 120+++++++++++++++++++++++++++++++++++++++----------------------------------------
Mprojects/cminus/lexer.html | 1767+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
8 files changed, 2794 insertions(+), 436 deletions(-)

diff --git a/blog.html b/blog.html @@ -39,7 +39,6 @@ </nav> </header> - <img> <img alt="Under Construction" src="images/page-under-construction.gif"> <br> Coming soon... diff --git a/projects/cminus/code/hash_table.c b/projects/cminus/code/hash_table.c @@ -1,4 +1,5 @@ #include "hash_table.h" +#include <stdio.h> #include <stdlib.h> struct hash_table { hash_table_entry_t **entries; @@ -29,11 +30,15 @@ hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, return table; } -void hash_table_destroy(hash_table_t *table) { +void hash_table_destroy(hash_table_t *table, hash_table_dtor dtor) { for (int i = 0; i < table->size; i++) { hash_table_entry_t *entry = table->entries[i]; while (entry != NULL) { hash_table_entry_t *next = entry->next; + if (dtor != NULL) { + dtor(entry->key, 1); + dtor(entry->value, 0); + } free(entry); entry = next; } @@ -69,6 +74,10 @@ int hash_table_put(hash_table_t *table, void *key, void *value, int replace) { entry = entry->next; } entry = malloc(sizeof(hash_table_entry_t)); + if (entry == NULL) { + fprintf(stderr, "Error: Out of memory. 
Could not allocate hash table entry\n"); + exit(1); + } entry->key = key; entry->value = value; entry->next = table->entries[hash]; @@ -120,4 +129,4 @@ int main() { hash_table_destroy(table); return 0; } -#endif- \ No newline at end of file +#endif diff --git a/projects/cminus/code/hash_table.h b/projects/cminus/code/hash_table.h @@ -3,11 +3,13 @@ typedef struct hash_table hash_table_t; typedef struct hash_table_entry hash_table_entry_t; typedef int (*hash_table_cmp_fn)(void *key1, void *key2); -typedef unsigned long (*hash_table_hash_fn)(void *key); +typedef unsigned int (*hash_table_hash_fn)(void *key); hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash); -void hash_table_destroy(hash_table_t *table); + +typedef void (*hash_table_dtor)(void *value, int is_key); +void hash_table_destroy(hash_table_t *table, hash_table_dtor dtor); void *hash_table_get(hash_table_t *table, void *key); int hash_table_put(hash_table_t *table, void *key, void *value, int replace); void hash_table_remove(hash_table_t *table, void *key); -#endif- \ No newline at end of file +#endif diff --git a/projects/cminus/code/input.c b/projects/cminus/code/input.c @@ -5,7 +5,8 @@ static char buffer[CHUNK_SIZE]; static int buffer_pos = 0; static int buffer_size = 0; -static char unget_buffer = '\0'; +static char unget_buffer_stack[8]; +static int unget_buffer_stack_pos = 0; static FILE *file = NULL; @@ -18,10 +19,8 @@ void input_init(const char *filename) { } int input_getc(void) { - if (unget_buffer != '\0') { - char c = unget_buffer; - unget_buffer = '\0'; - return c; + if (unget_buffer_stack_pos > 0) { + return unget_buffer_stack[--unget_buffer_stack_pos]; } if (buffer_pos == buffer_size) { buffer_size = fread(buffer, 1, CHUNK_SIZE, file); @@ -30,10 +29,11 @@ int input_getc(void) { if (buffer_size == 0) { return EOF; } - return buffer[buffer_pos++]; + char c = buffer[buffer_pos++]; + return c; } -void input_ungetc(int c) { unget_buffer = c; } +void 
input_ungetc(int c) { unget_buffer_stack[unget_buffer_stack_pos++] = c; } void input_destroy(void) { fclose(file); } @@ -52,4 +52,4 @@ int main() { input_destroy(); return 0; } -#endif- \ No newline at end of file +#endif diff --git a/projects/cminus/code/input.h b/projects/cminus/code/input.h @@ -4,4 +4,4 @@ void input_init(const char *filename); int input_getc(void); void input_ungetc(int c); void input_destroy(void); -#endif- \ No newline at end of file +#endif diff --git a/projects/cminus/code/tokenizer.c b/projects/cminus/code/tokenizer.c @@ -1,18 +1,25 @@ #include "tokenizer.h" #include "hash_table.h" +#include "input.h" #include <assert.h> +#include <ctype.h> +#include <errno.h> +#include <float.h> +#include <math.h> +#include <stdarg.h> #include <stdint.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> // Token data -#define TOK_MAGIC_1 0x544F4B454E544F4B // "TOKENTOK" -#define TOK_MAGIC_2 0x544F4B544F4B454E // "TOKTOKEN" +#define TOK_MAGIC_1 0x544F4B454E544F4Bul // "TOKENTOK" +#define TOK_MAGIC_2 0x544F4B544F4B454Eul // "TOKTOKEN" struct token { long magic; int line; int column; short kind; - char opt_data[0]; + long opt_data[0]; }; typedef struct token token_t; @@ -29,48 +36,81 @@ struct token_data { typedef struct token_data token_data_t; // Token creation +int line = 1; +int column = 1; +char file_name[1024]; #define token_data(token) ((struct token_data *)((token)->opt_data)) -static token_t *token_data_create(c_token_types kind, int line, int column) { +static token_t *token_data_create(c_token_types kind, int lin, int col, + int len) { token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data)); + if (token == NULL) { + fputs("Out of memory\n", stderr); + exit(1); + } token->magic = TOK_MAGIC_1; - token->line = line; - token->column = column; + token->line = lin; + token->column = col; + column += len; token->kind = kind; return token; } -static token_t *token_create(c_token_types kind, int line, int column) { +static token_t 
*token_create(c_token_types kind, int lin, int col, int len) { token_t *token = malloc(sizeof(token_t)); + if (token == NULL) { + fputs("Out of memory\n", stderr); + exit(1); + } token->magic = TOK_MAGIC_2; - token->line = line; - token->column = column; + token->line = lin; + token->column = col; + column += len; token->kind = kind; return token; } -static token_t *token_create_int(c_token_types kind, int line, int column, - int64_t i) { - token_t *token = token_data_create(kind, line, column); +static token_t *token_create_int(c_token_types kind, int lin, int col, + int64_t i, int len) { + token_t *token = token_data_create(kind, lin, col, len); token_data(token)->data.i = i; return token; } -static token_t *token_create_float(c_token_types kind, int line, int column, - double f) { - token_t *token = token_data_create(kind, line, column); +static token_t *token_create_float(c_token_types kind, int lin, int col, + double f, int len) { + token_t *token = token_data_create(kind, lin, col, len); token_data(token)->data.f = f; return token; } -#define PRIME 211 -static unsigned long hash_string(void *key) { - unsigned long hash = 0, g; +static token_t *token_create_char(c_token_types kind, int lin, int col, char c, + int len) { + token_t *token = token_data_create(kind, lin, col, len); + token_data(token)->data.c = c; + return token; +} + +static unsigned int hash_string(void *key) { + unsigned long hash = 0, hi = 0; char *p = key; - while (*p) { - hash = (hash << 4) + *p++; - if ((g = hash & 0xf0000000) != 0) { - hash ^= g >> 24; - hash ^= g; + hash = *p; + if (hash != 0 && p[1] != 0) { + hash = (hash << 4) + p[1]; + if (p[2] != 0) { + hash = (hash << 4) + p[2]; + if (p[3] != 0) { + hash = (hash << 4) + p[3]; + if (p[4] != 0) { + hash = (hash << 4) + p[4]; + p += 5; + while (*p != 0) { + hash = (hash << 4) + *p++; + hi = hash & 0xf0000000l; + hash ^= hi >> 24; + } + hash &= 0x0fffffffl; + } + } } } return hash; @@ -81,30 +121,22 @@ static int cmp_string(void *key1, 
void *key2) { } static hash_table_t *string_table; -static token_t *token_create_string(c_token_types kind, int line, int column, - const char *s) { +static token_t *token_create_string(c_token_types kind, int lin, int col, + const char *s, int len) { if (string_table == NULL) { - string_table = hash_table_create(1024, cmp_string, hash_string); + string_table = hash_table_create(2048, cmp_string, hash_string); } - token_t *token = token_data_create(kind, line, column); - char *key = strdup(s); - if (hash_table_put(string_table, key, key, 0)) { - free(key); - key = hash_table_get(string_table, key); + token_t *token = token_data_create(kind, lin, col, len); + char *key = hash_table_get(string_table, (void *)s); + if (key == NULL) { + key = strdup(s); + hash_table_put(string_table, key, key, 1); } token_data(token)->data.s = key; return token; } -static token_t *token_create_char(c_token_types kind, int line, int column, - char c) { - token_t *token = token_data_create(kind, line, column); - token_data(token)->data.c = c; - return token; -} - // External token operations - c_token_types token_type(token_t *token) { assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2); return token->kind; @@ -127,7 +159,7 @@ double token_float(token_t *token) { } const char *token_string(token_t *token) { - assert(token->kind == TOK_CONST_STRING_ASCII); + assert(token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID); assert(token->magic == TOK_MAGIC_1); return token_data(token)->data.s; } @@ -150,10 +182,1189 @@ int token_column(token_t *token) { void token_destroy(token_t *token) { assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2); - if (token->magic == TOK_MAGIC_1) { - if (token->kind == TOK_CONST_STRING_ASCII) { - free((void *)token_data(token)->data.s); + // Don't free the string table, it's a global variable + free(token); +} + +token_t *unget_token = NULL; +void tokenizer_init(const char *filename) { input_init(filename); } + +void 
tokenizer_unget(token_t *token) { + assert(unget_token == NULL); + unget_token = token; +} + +void string_table_dtor(void *value, int is_key) { + if (is_key) { + free(value); + } +} + +void tokenizer_destroy(void) { + input_destroy(); + if (string_table != NULL) { + hash_table_destroy(string_table, string_table_dtor); + } +} + +// Error handling +void tok_error(const char *fmt, ...) { + va_list args; + va_start(args, fmt); + fprintf(stderr, "Error in file %s at line %d, column %d: ", file_name, line, + column); + vfprintf(stderr, fmt, args); + va_end(args); +} + +void tok_warn(const char *fmt, ...) { + va_list args; + va_start(args, fmt); + fprintf(stderr, "Warning in file %s at line %d, column %d: ", file_name, line, + column); + vfprintf(stderr, fmt, args); + va_end(args); +} + +// Tokenizer +static token_t *skip_whitespace(void) { + int c; + while ((c = input_getc()) != EOF) { + if (isspace(c)) { + if (c == '\n') { + line++; + column = 1; + } else { + column++; + } + } else if (c == '#') // GCC preprocessor line control directive. 
+ { + char buf[512]; + int i = 0; + while ((c = input_getc()) != EOF && c != '\n') { + buf[i++] = c; + column++; + } + buf[i] = '\0'; + if (sscanf(buf, "%d \"%[^\"]\"", &line, file_name) == 2) { + column = 1; + } else { + tok_error("Invalid #line directive\n"); + } + if (c == EOF) { + return NULL; + } + } else if (c == '/') { + c = input_getc(); + if (c == '/') { + while ((c = input_getc()) != EOF && c != '\n') { + column++; + } + if (c == EOF) { + return NULL; + } + line++; + column = 1; + } else if (c == '*') { + while ((c = input_getc()) != EOF) { + if (c == '*') { + c = input_getc(); + if (c == '/') { + break; + } + } else if (c == '\n') { + line++; + column = 1; + } else { + column++; + } + } + if (c == EOF) { + return NULL; + } + } else { + if (c == '=') + return token_create(TOK_OP_ASSIGN_DIV, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_DIV, line, column, 1); + } + } else { + input_ungetc(c); + return NULL; } } - free(token); + return NULL; +} + +c_token_types get_keyword(const char *buf, int len) { + switch (buf[0]) { + case 'a': + if (len == 4 && buf[1] == 'u' && buf[2] == 't' && buf[3] == 'o') + return TOK_SCSK_AUTO; + break; + + case 'b': + if (len == 5 && buf[1] == 'r' && buf[2] == 'e' && buf[3] == 'a' && + buf[4] == 'k') + return TOK_CTK_BREAK; + break; + + case 'c': + switch (buf[1]) { + case 'a': + if (len == 4 && buf[2] == 's' && buf[3] == 'e') + return TOK_CTK_CASE; + break; + case 'h': + if (len == 4 && buf[2] == 'a' && buf[3] == 'r') + return TOK_TK_CHAR; + break; + case 'o': + if (len == 5 && buf[2] == 'n' && buf[3] == 's' && buf[4] == 't') + return TOK_SCSK_CONST; + if (len == 8 && buf[2] == 'n' && buf[3] == 't' && buf[4] == 'i' && + buf[5] == 'n' && buf[6] == 'u' && buf[7] == 'e') + return TOK_CTK_CONTINUE; + break; + } + break; + + case 'd': + switch (buf[1]) { + case 'e': + if (len == 7 && buf[2] == 'f' && buf[3] == 'a' && buf[4] == 'u' && + buf[5] == 'l' && buf[6] == 't') + return TOK_CTK_DEFAULT; + break; + case 'o': 
+ if (len == 2 && buf[2] == '\0') + return TOK_CTK_DO; + if (len == 6 && buf[2] == 'u' && buf[3] == 'b' && buf[4] == 'l' && + buf[5] == 'e') + return TOK_TK_DOUBLE; + break; + } + break; + + case 'e': + switch (buf[1]) { + case 'l': + if (len == 4 && buf[2] == 's' && buf[3] == 'e') + return TOK_CTK_ELSE; + break; + case 'n': + if (len == 4 && buf[2] == 'u' && buf[3] == 'm') + return TOK_TK_ENUM; + break; + case 'x': + if (len == 6 && buf[2] == 't' && buf[3] == 'e' && buf[4] == 'r' && + buf[5] == 'n') + return TOK_SCSK_EXTERN; + break; + } + break; + + case 'f': + switch (buf[1]) { + case 'l': + if (len == 5 && buf[2] == 'o' && buf[3] == 'a' && buf[4] == 't') + return TOK_TK_FLOAT; + break; + case 'o': + if (len == 3 && buf[2] == 'r') + return TOK_CTK_FOR; + break; + } + break; + + case 'g': + if (len == 4 && buf[1] == 'o' && buf[2] == 't' && buf[3] == 'o') + return TOK_CTK_GOTO; + break; + + case 'i': + switch (buf[1]) { + case 'f': + if (len == 2 && buf[2] == '\0') + return TOK_CTK_IF; + break; + case 'n': + if (len == 3 && buf[2] == 't') + return TOK_TK_INT; + break; + } + break; + + case 'l': + if (len == 4 && buf[1] == 'o' && buf[2] == 'n' && buf[3] == 'g') + return TOK_TK_LONG; + break; + + case 'r': + switch (buf[1]) { + case 'e': + if (len == 8 && buf[2] == 'g' && buf[3] == 'i' && buf[4] == 's' && + buf[5] == 't' && buf[6] == 'e' && buf[7] == 'r') + return TOK_SCSK_REGISTER; + if (len == 6 && buf[2] == 't' && buf[3] == 'u' && buf[4] == 'r' && + buf[5] == 'n') + return TOK_CTK_RETURN; + break; + } + break; + + case 's': + switch (buf[1]) { + case 'h': + if (len == 5 && buf[2] == 'o' && buf[3] == 'r' && buf[4] == 't') + return TOK_TK_SHORT; + break; + case 't': + if (len == 6 && buf[2] == 'a' && buf[3] == 't' && buf[4] == 'i' && + buf[5] == 'c') + return TOK_SCSK_STATIC; + break; + case 'i': + if (len == 6 && buf[2] == 'g' && buf[3] == 'n' && buf[4] == 'e' && + buf[5] == 'd') + return TOK_TK_SIGNED; + if (len == 6 && buf[2] == 'z' && buf[3] == 'e' && buf[4] == 
'o' && + buf[5] == 'f') + return TOK_MK_SIZEOF; + break; + case 'r': + if (len == 6 && buf[2] == 'u' && buf[3] == 'c' && buf[4] == 't') + return TOK_TK_STRUCT; + break; + case 'w': + if (len == 6 && buf[2] == 'i' && buf[3] == 't' && buf[4] == 'c' && + buf[5] == 'h') + return TOK_CTK_SWITCH; + break; + } + break; + + case 't': + if (len == 7 && buf[1] == 'y' && buf[2] == 'p' && buf[3] == 'e' && + buf[4] == 'd' && buf[5] == 'e' && buf[6] == 'f') + return TOK_TK_TYPEDEF; + break; + + case 'u': + switch (buf[1]) { + case 'n': + if (len == 5 && buf[2] == 'i' && buf[3] == 'o' && buf[4] == 'n') + return TOK_TK_UNION; + if (len == 8 && buf[2] == 's' && buf[3] == 'i' && buf[4] == 'g' && + buf[5] == 'n' && buf[6] == 'e' && buf[7] == 'd') + return TOK_TK_UNSIGNED; + break; + } + break; + + case 'v': + switch (buf[1]) { + case 'o': + if (len == 4 && buf[2] == 'i' && buf[3] == 'd') + return TOK_TK_VOID; + if (len == 8 && buf[2] == 'l' && buf[3] == 'a' && buf[4] == 't' && + buf[5] == 'i' && buf[6] == 'l' && buf[7] == 'e') + return TOK_SCSK_VOLATILE; + break; + } + break; + + case 'w': + if (len == 5 && buf[1] == 'h' && buf[2] == 'i' && buf[3] == 'l' && + buf[4] == 'e') + return TOK_CTK_WHILE; + break; + + default: + return TOK_ID; + } + return TOK_ID; +} + +static token_t *read_identifier(void) { + int c; + char buf[1024]; + int i = 0; + c = input_getc(); + if (!isalpha(c) && c != '_') { + input_ungetc(c); + return NULL; + } + buf[i++] = c; + while ((c = input_getc()) != EOF) { + if (!isalnum(c) && c != '_') { + input_ungetc(c); + break; + } + buf[i++] = c; + if (i >= 1008) { + tok_error("Identifier too long\n"); + exit(1); + } + } + buf[i] = '\0'; + column += i; + // Check if it's a keyword + c_token_types kind = get_keyword(buf, i); + if (kind != TOK_ID) { + return token_create(kind, line, column, i); + } + return token_create_string(kind, line, column, buf, i); } + +token_t *read_operator(void) { + int c; + c = input_getc(); + switch (c) { + case '!': { + c = input_getc(); + 
if (c == '=') + return token_create(TOK_OP_NE, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_NOT, line, column, 1); + } + case '%': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_MOD, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_MOD, line, column, 1); + } + case '&': { + c = input_getc(); + if (c == '&') + return token_create(TOK_OP_AND, line, column, 2); + if (c == '=') + return token_create(TOK_OP_ASSIGN_BITAND, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_BIT_AND, line, column, 1); + } + case '(': + return token_create(TOK_SEP_LEFT_PAREN, line, column, 1); + case ')': + return token_create(TOK_SEP_RIGHT_PAREN, line, column, 1); + case '*': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_MUL, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_MUL, line, column, 1); + } + case '+': { + c = input_getc(); + if (c == '+') + return token_create(TOK_OP_INC, line, column, 2); + if (c == '=') + return token_create(TOK_OP_ASSIGN_ADD, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_ADD, line, column, 2); + } + case ',': + return token_create(TOK_SEP_COMMA, line, column, 1); + case '-': { + c = input_getc(); + if (c == '-') + return token_create(TOK_OP_DEC, line, column, 2); + if (c == '=') + return token_create(TOK_OP_ASSIGN_SUB, line, column, 2); + if (c == '>') + return token_create(TOK_OP_MEMBER_POINTER, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_SUB, line, column, 1); + } + case '.': { + c = input_getc(); + if (c == '.') { + c = input_getc(); + if (c == '.') { + return token_create(TOK_SEP_ELLIPSIS, line, column, 3); + } else { + // Bail out, can't store more than one unget + tok_error("Unexpected character '.' 
at line %d, column %d\n", line, + column); + exit(1); + } + } + return token_create('.', line, column, 1); + } + case '/': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_DIV, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_DIV, line, column, 1); + } + case ':': + return token_create(TOK_OP_COND_DECISION, line, column, 1); + case ';': + return token_create(TOK_SEP_SEMICOLON, line, column, 1); + case '<': { + c = input_getc(); + if (c == '<') { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_LSHIFT, line, column, 3); + input_ungetc(c); + return token_create(TOK_OP_LSHIFT, line, column, 2); + } + if (c == '=') + return token_create(TOK_OP_LE, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_LT, line, column, 1); + } + case '=': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_ASSIGN, line, column, 1); + } + case '>': { + c = input_getc(); + if (c == '>') { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_RSHIFT, line, column, 3); + input_ungetc(c); + return token_create(TOK_OP_RSHIFT, line, column, 2); + } + if (c == '=') + return token_create(TOK_OP_GE, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_GT, line, column, 1); + } + case '?': + return token_create(TOK_OP_COND, line, column, 1); + case '[': + return token_create(TOK_SEP_LEFT_BRACKET, line, column, 1); + case ']': + return token_create(TOK_SEP_RIGHT_BRACKET, line, column, 1); + case '^': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_BITXOR, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_BIT_XOR, line, column, 1); + } + case '{': + return token_create(TOK_SEP_LEFT_BRACE, line, column, 1); + case '|': { + c = input_getc(); + if (c == '|') + return token_create(TOK_OP_OR, line, column, 2); + if (c == '=') + return token_create(TOK_OP_ASSIGN_BITOR, 
line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_BIT_OR, line, column, 1); + } + case '}': + return token_create(TOK_SEP_RIGHT_BRACE, line, column, 1); + case '~': + return token_create(TOK_OP_BIT_NOT, line, column, 1); + default: + input_ungetc(c); + return NULL; + } +} + +static token_t *read_number(void) { + int c; + char buf[1024]; + int i = 0; + c = input_getc(); + // If we don't have a digit or decimal point, it's not a number + if (!isdigit(c) && c != '.') { + input_ungetc(c); + return NULL; + } + // Decimal point followed by non-digit is a struct member + if (c == '.') { + char cnext = input_getc(); + if (!isdigit(cnext)) { + input_ungetc(cnext); + return token_create(TOK_OP_MEMBER, line, column, 1); + } + input_ungetc(cnext); + } + int radix = 10; + int is_float = 0; + // Check for hex and octal. + if (c == '0') { + char cnext = input_getc(); + if (cnext == 'x' || cnext == 'X') { + // Hex, discard the 0x + radix = 16; + } else { + // Octal, discard the 0 + input_ungetc(cnext); + radix = 8; + } + } else { + // Decimal, append the first digit + buf[i++] = c; + } + // Read the rest of the number + while ((c = input_getc()) != EOF) { + // Since there can be multiple writes to the buffer, we want to make sure we + // don't overflow by giving a 4 byte pad + if (i > 1020) { + tok_error("Number too long\n"); + return NULL; + } + // Valid digits for the radix: 0-9 for decimal, 0-7 for octal, 0-9 and + // a-f/A-F for hex + if ((radix == 10 && isdigit(c)) || (radix == 16 && isxdigit(c)) || + (radix == 8 && c >= '0' && c <= '7')) { + buf[i++] = c; + // Decimal point and not a float yet, must be a float + } else if (c == '.' && !is_float) { + is_float = 1; + if (radix != 10) { + tok_error("Invalid floating point constant, expected decimal, got %s\n", + radix == 16 ? "hexadecimal" : "octal"); + return NULL; + } + buf[i++] = c; + } + // Exponent on the end of a constant. 
(By standard this forces it to be a + // float) + else if (c == 'e' || c == 'E') { + buf[i++] = c; + c = input_getc(); + // Sign on the exponent + if (c == '+' || c == '-') { + buf[i++] = c; + c = input_getc(); + } + // Exponent must be a digit, I.E no 1e1.2 + if (!isdigit(c)) { + tok_error("Invalid floating point exponent\n"); + return NULL; + } + buf[i++] = c; + is_float = 1; + } else { + // Reached the end, unget the character so other functions can read it + input_ungetc(c); + break; + } + } + buf[i] = '\0'; + // Check for suffixes + int is_unsigned = 0; + int is_long = 0; + int is_single = 0; + // Loop to get all possible suffixes. Warn when duplicated. + while (1) { + c = input_getc(); + if (c == 'u' || c == 'U') { + if (is_unsigned) { + tok_warn( + "Warning: Duplicate suffix 'u' for integer constant ignored\n"); + } + is_unsigned = 1; + } else if (c == 'l' || c == 'L') { + if (is_long) { + tok_warn( + "Warning: Duplicate suffix 'l' for integer constant ignored\n"); + } + is_long = 1; + } else if (c == 'f' || c == 'F') { + if (is_single) { + tok_warn("Warning: Duplicate suffix 'f' for floating point constant " + "ignored\n"); + } + is_single = 1; + } else { + input_ungetc(c); + break; + } + } + // Resolve invalid suffixes. Doesn't error because you can still compile with + // them. + if (is_single && is_long) { + tok_warn("Warning: Invalid suffixes 'l' and 'f' for floating point " + "constant. Ignoring 'l'\n"); + is_long = 0; + } + if (is_single && is_unsigned) { + tok_warn("Warning: Invalid suffixes 'u' and 'f' for floating point " + "constant. Ignoring 'u'\n"); + is_unsigned = 0; + } + if (is_single && !is_float) { + tok_warn( + "Warning: Invalid suffix 'f' for integer constant. 
Ignoring 'f'\n"); + is_single = 0; + } + // use the strtox functions to convert the string to a number + if (is_float) { + errno = 0; + // Strtod generates a unix-style error when it's given something out of + // range, so we want to get on top of that quickly instead of ignoring it + // That way we can avoid some nasty NAN-propagation in the constant folder. + double f = strtod(buf, NULL); + if (errno == ERANGE) { + tok_error("Floating point constant out of range\n"); + return NULL; + } + // Warn if the constant is out of range for a float, I.E it's too big or too + // small + if (is_single && (f < FLT_MIN || f > FLT_MAX)) { + tok_warn( + "Warning: Floating point constant %f is out of range for float\n", f); + } + // Warn if the constant is too precise for a float + if (is_single && fabs((double)((float)f) - f) >= FLT_EPSILON) { + tok_warn("Warning: Converting double precision floating point constant " + "%f to float loses " + "precision\n", + f); + } + return token_create_float(is_single ? 
TOK_CONST_FLOAT_32 + : TOK_CONST_FLOAT_64, + line, column, f, i); + } else { + errno = 0; + uint64_t int_ = strtoull(buf, NULL, radix); + // Same as above, but for integers + if (errno == ERANGE) { + tok_error("Integer constant out of range\n"); + return NULL; + } + if (is_unsigned) { + if (is_long) { + return token_create_int(TOK_CONST_INTEGER_U64, line, column, int_, i); + } else { + if (int_ > UINT32_MAX) { + tok_warn( + "Warning: Integer constant %lld is out of range for unsigned " + "int\n", + int_); + } + return token_create_int(TOK_CONST_INTEGER_U32, line, column, int_, i); + } + } else { + if (is_long) { + // If the highest bit is set, that means this will overflow a signed + // long (Due to two's complement) + if (int_ & (1UL << 63)) { + tok_warn( + "Warning: Integer constant %lld is out of range for long long\n", + i); + } + return token_create_int(TOK_CONST_INTEGER_S64, line, column, int_, i); + } else { + if (int_ & (1UL << 31)) { + tok_warn("Warning: Integer constant %lld is out of range for int\n", + int_); + } + return token_create_int(TOK_CONST_INTEGER_S32, line, column, int_, i); + } + } + } + return NULL; +} + +static char read_escape_sequence(int *len) { + int c = input_getc(); + *len += 1; + switch (c) { + case 'a': + return '\a'; + case 'b': + return '\b'; + case 'f': + return '\f'; + case 'n': + return '\n'; + case 'r': + return '\r'; + case 't': + return '\t'; + case 'v': + return '\v'; + case '\'': + return '\''; + case '"': + return '"'; + case '?': + return '?'; + case '\\': + return '\\'; + case '0': + return '\0'; + case 'x': { + c = input_getc(); + if (!isxdigit(c)) { + tok_error("Invalid hexadecimal escape sequence\n"); + return 0; + } + int val = 0; + while (isxdigit(c)) { + *len += 1; + val = val * 16 + (isdigit(c) ? 
c - '0' : tolower(c) - 'a' + 10); + c = input_getc(); + } + input_ungetc(c); + return (char)val; + } + default: + if (!isdigit(c)) { + tok_error("Invalid escape sequence\n"); + return 0; + } + int val = 0; + while (isdigit(c)) { + *len += 1; + val = val * 8 + c - '0'; + c = input_getc(); + } + input_ungetc(c); + return (char)val; + } +} + +static token_t *read_char_constant(void) { + int c; + int len = 0; + c = input_getc(); + if (c != '\'') { + input_ungetc(c); + return NULL; + } + len++; + c = input_getc(); + if (c == '\'') { + tok_error("Empty character constant\n"); + return NULL; + } + if (c == '\\') { + c = read_escape_sequence(&len); + } + int val = c; + c = input_getc(); + if (c != '\'') { + tok_error("Expected closing quote for character constant\n"); + return NULL; + } + len++; + return token_create_char(TOK_CONST_CHAR, line, column, val, len); +} + +static token_t *read_string_literal(void) { + int c; + c = input_getc(); + if (c != '"') { + input_ungetc(c); + return NULL; + } + int i = 0; + // Malloc is used for the buf here, the pointer stays function local as string + // interning duplicates the string + char s_buf[512]; + char *buf = s_buf; + int len = 512; + int esc_pad = 0; + while ((c = input_getc()) != EOF) { + if (c == '"') { + // Implicit skip of closing quote + break; + } + if (c == '\\') { + c = read_escape_sequence(&esc_pad); + if (c == 0) { + return NULL; + } + } + if (i >= len) { + if (buf == s_buf) { + buf = malloc(1024); + if (buf == NULL) { + fputs("Out of memory. 
Could not parse string literal.\n", stderr); + exit(1); + } + memcpy(buf, s_buf, 512); + len *= 2; + } else { + len *= 2; + buf = realloc(buf, len); + } + } + buf[i++] = c; + } + buf[i] = '\0'; + if (c == EOF) { + tok_error("Unterminated string literal\n"); + if (buf != s_buf) { + free(buf); + } + return NULL; + } + + token_t *tok = token_create_string(TOK_CONST_STRING_ASCII, line, column, buf, + i + esc_pad + 2); + if (buf != s_buf) { + free(buf); + } + return tok; +} + +token_t *read_token(void) { + if (unget_token != NULL) { + token_t *tok = unget_token; + unget_token = NULL; + return tok; + } + token_t *tok = skip_whitespace(); + if (tok != NULL) { + return tok; + } + tok = read_identifier(); + if (tok != NULL) { + return tok; + } + tok = read_number(); + if (tok != NULL) { + return tok; + } + tok = read_char_constant(); + if (tok != NULL) { + return tok; + } + tok = read_string_literal(); + if (tok != NULL) { + return tok; + } + tok = read_operator(); + if (tok != NULL) { + return tok; + } + int c = input_getc(); + if (c == EOF) { + return NULL; + } + tok_warn( + "Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c, + line, column); + return read_token(); +} + +const char *token_name_from_type(c_token_types type) { + switch (type) { + case TOK_CTK_IF: + return "TOK_CTK_IF"; + case TOK_CTK_ELSE: + return "TOK_CTK_ELSE"; + case TOK_CTK_SWITCH: + return "TOK_CTK_SWITCH"; + case TOK_CTK_CASE: + return "TOK_CTK_CASE"; + case TOK_CTK_DEFAULT: + return "TOK_CTK_DEFAULT"; + case TOK_CTK_WHILE: + return "TOK_CTK_WHILE"; + case TOK_CTK_DO: + return "TOK_CTK_DO"; + case TOK_CTK_FOR: + return "TOK_CTK_FOR"; + case TOK_CTK_CONTINUE: + return "TOK_CTK_CONTINUE"; + case TOK_CTK_BREAK: + return "TOK_CTK_BREAK"; + case TOK_CTK_RETURN: + return "TOK_CTK_RETURN"; + case TOK_CTK_GOTO: + return "TOK_CTK_GOTO"; + case TOK_TK_VOID: + return "TOK_TK_VOID"; + case TOK_TK_CHAR: + return "TOK_TK_CHAR"; + case TOK_TK_SHORT: + return "TOK_TK_SHORT"; + case TOK_TK_INT: 
+ return "TOK_TK_INT"; + case TOK_TK_LONG: + return "TOK_TK_LONG"; + case TOK_TK_FLOAT: + return "TOK_TK_FLOAT"; + case TOK_TK_DOUBLE: + return "TOK_TK_DOUBLE"; + case TOK_TK_SIGNED: + return "TOK_TK_SIGNED"; + case TOK_TK_UNSIGNED: + return "TOK_TK_UNSIGNED"; + case TOK_TK_STRUCT: + return "TOK_TK_STRUCT"; + case TOK_TK_UNION: + return "TOK_TK_UNION"; + case TOK_TK_ENUM: + return "TOK_TK_ENUM"; + case TOK_TK_TYPEDEF: + return "TOK_TK_TYPEDEF"; + case TOK_SCSK_AUTO: + return "TOK_SCSK_AUTO"; + case TOK_SCSK_REGISTER: + return "TOK_SCSK_REGISTER"; + case TOK_SCSK_STATIC: + return "TOK_SCSK_STATIC"; + case TOK_SCSK_EXTERN: + return "TOK_SCSK_EXTERN"; + case TOK_SCSK_CONST: + return "TOK_SCSK_CONST"; + case TOK_SCSK_VOLATILE: + return "TOK_SCSK_VOLATILE"; + case TOK_MK_SIZEOF: + return "TOK_MK_SIZEOF"; + case TOK_OP_ADD: + return "TOK_OP_ADD"; + case TOK_OP_SUB: + return "TOK_OP_SUB"; + case TOK_OP_MUL: + return "TOK_OP_MUL"; + case TOK_OP_DIV: + return "TOK_OP_DIV"; + case TOK_OP_MOD: + return "TOK_OP_MOD"; + case TOK_OP_BIT_AND: + return "TOK_OP_BIT_AND"; + case TOK_OP_BIT_OR: + return "TOK_OP_BIT_OR"; + case TOK_OP_BIT_XOR: + return "TOK_OP_BIT_XOR"; + case TOK_OP_BIT_NOT: + return "TOK_OP_BIT_NOT"; + case TOK_OP_LSHIFT: + return "TOK_OP_LSHIFT"; + case TOK_OP_RSHIFT: + return "TOK_OP_RSHIFT"; + case TOK_OP_NOT: + return "TOK_OP_NOT"; + case TOK_OP_ASSIGN: + return "TOK_OP_ASSIGN"; + case TOK_OP_LT: + return "TOK_OP_LT"; + case TOK_OP_GT: + return "TOK_OP_GT"; + case TOK_OP_INC: + return "TOK_OP_INC"; + case TOK_OP_DEC: + return "TOK_OP_DEC"; + case TOK_OP_EQ: + return "TOK_OP_EQ"; + case TOK_OP_NE: + return "TOK_OP_NE"; + case TOK_OP_LE: + return "TOK_OP_LE"; + case TOK_OP_GE: + return "TOK_OP_GE"; + case TOK_OP_AND: + return "TOK_OP_AND"; + case TOK_OP_OR: + return "TOK_OP_OR"; + case TOK_OP_MEMBER_POINTER: + return "TOK_OP_MEMBER_POINTER"; + case TOK_OP_MEMBER: + return "TOK_OP_MEMBER"; + case TOK_OP_COND_DECISION: + return "TOK_OP_COND_DECISION"; + case 
TOK_OP_COND: + return "TOK_OP_COND"; + case TOK_OP_ASSIGN_ADD: + return "TOK_OP_ASSIGN_ADD"; + case TOK_OP_ASSIGN_SUB: + return "TOK_OP_ASSIGN_SUB"; + case TOK_OP_ASSIGN_MUL: + return "TOK_OP_ASSIGN_MUL"; + case TOK_OP_ASSIGN_DIV: + return "TOK_OP_ASSIGN_DIV"; + case TOK_OP_ASSIGN_MOD: + return "TOK_OP_ASSIGN_MOD"; + case TOK_OP_ASSIGN_BITAND: + return "TOK_OP_ASSIGN_BITAND"; + case TOK_OP_ASSIGN_BITOR: + return "TOK_OP_ASSIGN_BITOR"; + case TOK_OP_ASSIGN_BITXOR: + return "TOK_OP_ASSIGN_BITXOR"; + case TOK_OP_ASSIGN_LSHIFT: + return "TOK_OP_ASSIGN_LSHIFT"; + case TOK_OP_ASSIGN_RSHIFT: + return "TOK_OP_ASSIGN_RSHIFT"; + case TOK_SEP_HASH: + return "TOK_SEP_HASH"; + case TOK_ID: + return "TOK_ID"; + case TOK_CONST_INTEGER_U32: + return "TOK_CONST_INTEGER_U32"; + case TOK_CONST_INTEGER_U64: + return "TOK_CONST_INTEGER_U64"; + case TOK_CONST_INTEGER_S32: + return "TOK_CONST_INTEGER_S32"; + case TOK_CONST_INTEGER_S64: + return "TOK_CONST_INTEGER_S64"; + case TOK_CONST_FLOAT_32: + return "TOK_CONST_FLOAT_32"; + case TOK_CONST_FLOAT_64: + return "TOK_CONST_FLOAT_64"; + case TOK_CONST_CHAR: + return "TOK_CONST_CHAR"; + case TOK_CONST_STRING_ASCII: + return "TOK_CONST_STRING_ASCII"; + case TOK_SPECIAL_EOF: + return "TOK_SPECIAL_EOF"; + case TOK_SPECIAL_ERROR: + return "TOK_SPECIAL_ERROR"; + case TOK_SEP_LEFT_PAREN: + return "TOK_SEP_LEFT_PAREN"; + case TOK_SEP_RIGHT_PAREN: + return "TOK_SEP_RIGHT_PAREN"; + case TOK_SEP_LEFT_BRACKET: + return "TOK_SEP_LEFT_BRACKET"; + case TOK_SEP_RIGHT_BRACKET: + return "TOK_SEP_RIGHT_BRACKET"; + case TOK_SEP_LEFT_BRACE: + return "TOK_SEP_LEFT_BRACE"; + case TOK_SEP_RIGHT_BRACE: + return "TOK_SEP_RIGHT_BRACE"; + case TOK_SEP_COMMA: + return "TOK_SEP_COMMA"; + case TOK_SEP_SEMICOLON: + return "TOK_SEP_SEMICOLON"; + case TOK_SEP_DOT: + return "TOK_SEP_DOT"; + case TOK_SEP_ELLIPSIS: + return "TOK_SEP_ELLIPSIS"; + } + return "UNKNOWN"; +} + +char *re_escape_string(const char *str) { + int len = strlen(str); + char *buf = malloc(len * 2 + 1); + 
if (buf == NULL) { + fprintf(stderr, "Out of memory. Cannot escape string\n"); + exit(1); + } + int i = 0; + for (int j = 0; j < len; j++) { + switch (str[j]) { + case '\a': + buf[i++] = '\\'; + buf[i++] = 'a'; + break; + case '\b': + buf[i++] = '\\'; + buf[i++] = 'b'; + break; + case '\f': + buf[i++] = '\\'; + buf[i++] = 'f'; + break; + case '\n': + buf[i++] = '\\'; + buf[i++] = 'n'; + break; + case '\r': + buf[i++] = '\\'; + buf[i++] = 'r'; + break; + case '\t': + buf[i++] = '\\'; + buf[i++] = 't'; + break; + case '\v': + buf[i++] = '\\'; + buf[i++] = 'v'; + break; + case '\\': + buf[i++] = '\\'; + buf[i++] = '\\'; + break; + case '\'': + buf[i++] = '\\'; + buf[i++] = '\''; + break; + case '"': + buf[i++] = '\\'; + buf[i++] = '"'; + break; + default: + buf[i++] = str[j]; + break; + } + } + buf[i] = '\0'; + return buf; +} + +void print_token(token_t *tok) { + if (tok == NULL) { + printf("NULL\n"); + return; + } + const char *name = token_name_from_type(tok->kind); + switch (tok->kind) { + case TOK_ID: + case TOK_CONST_STRING_ASCII: { + char *escaped = re_escape_string(token_string(tok)); + printf("%s: \"%s\"@%d:%d\n", name, escaped, tok->line, tok->column); + free(escaped); + break; + } + case TOK_CONST_CHAR: + printf("%s: '%c'@%d:%d\n", name, token_char(tok), tok->line, tok->column); + break; + case TOK_CONST_INTEGER_S32: + case TOK_CONST_INTEGER_U32: + case TOK_CONST_INTEGER_S64: + case TOK_CONST_INTEGER_U64: + printf("%s: %ld@%d:%d\n", name, token_int(tok), tok->line, tok->column); + break; + case TOK_CONST_FLOAT_32: + case TOK_CONST_FLOAT_64: + printf("%s: %f@%d:%d\n", name, token_float(tok), tok->line, tok->column); + break; + default: + printf("%s@%d:%d\n", name, tok->line, tok->column); + break; + } +} + +#ifdef TOK_TEST + +char *preprocess(char *in) { + char *output_name = malloc(1024); + snprintf(output_name, 1024, "%s.preprocessed", in); + char *command = malloc(2048); + snprintf(command, 2048, "gcc -E -xc %s -o %s", in, output_name); + system(command); 
+ free(command); + return output_name; +} + +// Tokenize ourselves +int main(int argc, char **argv) { + if (argc != 2) { + fprintf(stderr, "Usage: %s <input.c>\n", argv[0]); + return 1; + } + char *input_name = argv[1]; + char *preprocessed = preprocess(input_name); + tokenizer_init(preprocessed); + token_t *tok; + while ((tok = read_token()) != NULL) { + print_token(tok); + token_destroy(tok); + } + tokenizer_destroy(); + remove(preprocessed); + free(preprocessed); + return 0; +} + +#endif diff --git a/projects/cminus/code/tokenizer.h b/projects/cminus/code/tokenizer.h @@ -5,17 +5,18 @@ typedef struct token token_t; typedef enum { // Control Keywords - TOK_CTK_IF, // if - TOK_CTK_ELSE, // else - TOK_CTK_SWITCH, // switch - TOK_CTK_CASE, // case - TOK_CTK_DEFAULT, // default - TOK_CTK_WHILE, // while - TOK_CTK_DO, // do - TOK_CTK_FOR, // for - TOK_CTK_CONTINUE, // continue - TOK_CTK_RETURN, // return - TOK_CTK_GOTO, // goto + TOK_CTK_IF, + TOK_CTK_ELSE, + TOK_CTK_SWITCH, + TOK_CTK_CASE, + TOK_CTK_DEFAULT, + TOK_CTK_WHILE, + TOK_CTK_DO, + TOK_CTK_FOR, + TOK_CTK_CONTINUE, + TOK_CTK_BREAK, + TOK_CTK_RETURN, + TOK_CTK_GOTO, // Type Keywords TOK_TK_VOID, @@ -44,44 +45,43 @@ typedef enum { TOK_MK_SIZEOF, // Operators - TOK_OP_PLUS, // + - TOK_OP_MINUS, // - - TOK_OP_ASTERISK, // * - TOK_OP_SLASH, // / - TOK_OP_PERCENT, // % - TOK_OP_AMPERSAND, // & - TOK_OP_PIPE, // | - TOK_OP_CARET, // ^ - TOK_OP_TILDE, // ~ - TOK_OP_EXCLAMATION, // ! - TOK_OP_EQUALS, // = - TOK_OP_LESS_THAN, // < - TOK_OP_GREATER_THAN, // > - TOK_OP_PLUS_PLUS, // ++ - TOK_OP_MINUS_MINUS, // -- - TOK_OP_EQUALS_EQUALS, // == - TOK_OP_NOT_EQUALS, // != - TOK_OP_LESS_EQUALS, // <= - TOK_OP_GREATER_EQUALS, // >= - TOK_OP_AMP_AMP, // && - TOK_OP_PIPE_PIPE, // || - TOK_OP_ARROW, // -> - TOK_OP_DOT, // . - TOK_OP_COLON, // : - TOK_OP_QUESTION, // ? - TOK_OP_COMMA, // , - TOK_OP_SEMICOLON, // ; - TOK_OP_ELLIPSIS, // ... 
- TOK_OP_ASSIGN_PLUS, // += - TOK_OP_ASSIGN_MINUS, // -= - TOK_OP_ASSIGN_ASTERISK, // *= - TOK_OP_ASSIGN_SLASH, // /= - TOK_OP_ASSIGN_PERCENT, // %= - TOK_OP_ASSIGN_AMPERSAND, // &= - TOK_OP_ASSIGN_PIPE, // |= - TOK_OP_ASSIGN_CARET, // ^= - TOK_OP_ASSIGN_LEFT_SHIFT, // <<= - TOK_OP_ASSIGN_RIGHT_SHIFT, // >>= + TOK_OP_ADD, // + + TOK_OP_SUB, // - + TOK_OP_MUL, // * + TOK_OP_DIV, // / + TOK_OP_MOD, // % + TOK_OP_BIT_AND, // & + TOK_OP_BIT_OR, // | + TOK_OP_BIT_XOR, // ^ + TOK_OP_BIT_NOT, // ~ + TOK_OP_LSHIFT, // << + TOK_OP_RSHIFT, // >> + TOK_OP_NOT, // ! + TOK_OP_ASSIGN, // = + TOK_OP_LT, // < + TOK_OP_GT, // > + TOK_OP_INC, // ++ + TOK_OP_DEC, // -- + TOK_OP_EQ, // == + TOK_OP_NE, // != + TOK_OP_LE, // <= + TOK_OP_GE, // >= + TOK_OP_AND, // && + TOK_OP_OR, // || + TOK_OP_MEMBER_POINTER, // -> + TOK_OP_MEMBER, // . + TOK_OP_COND_DECISION, // : + TOK_OP_COND, // ? + TOK_OP_ASSIGN_ADD, // += + TOK_OP_ASSIGN_SUB, // -= + TOK_OP_ASSIGN_MUL, // *= + TOK_OP_ASSIGN_DIV, // /= + TOK_OP_ASSIGN_MOD, // %= + TOK_OP_ASSIGN_BITAND, // &= + TOK_OP_ASSIGN_BITOR, // |= + TOK_OP_ASSIGN_BITXOR, // ^= + TOK_OP_ASSIGN_LSHIFT, // <<= + TOK_OP_ASSIGN_RSHIFT, // >>= // Separators TOK_SEP_LEFT_PAREN, // ( @@ -92,30 +92,29 @@ typedef enum { TOK_SEP_RIGHT_BRACE, // } TOK_SEP_COMMA, // , TOK_SEP_SEMICOLON, // ; - TOK_SEP_COLON, // : TOK_SEP_DOT, // . TOK_SEP_ELLIPSIS, // ... 
TOK_SEP_HASH, // # // Identifiers TOK_ID, // Any identifier not ending in _t - TOK_TID, // Any identifier ending in _t // Constants - TOK_CONST_INTEGER_U32, // u - TOK_CONST_INTEGER_U64, // ul - TOK_CONST_INTEGER_S32, // (no suffix) - TOK_CONST_INTEGER_S64, // l - TOK_CONST_FLOAT_32, // f - TOK_CONST_FLOAT_64, // (no suffix) - TOK_CONST_CHAR, // 'c' - TOK_CONST_STRING_ASCII, // "string" (width of 8 bits) + TOK_CONST_INTEGER_U32, // u + TOK_CONST_INTEGER_U64, // ul + TOK_CONST_INTEGER_S32, // (no suffix) + TOK_CONST_INTEGER_S64, // l + TOK_CONST_FLOAT_32, // f + TOK_CONST_FLOAT_64, // (no suffix) + TOK_CONST_CHAR, // 'c' + TOK_CONST_STRING_ASCII, // "string" (width of 8 bits) // Special TOK_SPECIAL_EOF, TOK_SPECIAL_ERROR, } c_token_types; + void tokenizer_init(const char *filename); token_t *tokenizer_get(void); void tokenizer_unget(token_t *token); @@ -130,4 +129,4 @@ int token_line(token_t *token); int token_column(token_t *token); void token_destroy(token_t *token); -#endif- \ No newline at end of file +#endif diff --git a/projects/cminus/lexer.html b/projects/cminus/lexer.html @@ -155,47 +155,46 @@ static char buffer[CHUNK_SIZE]; static int buffer_pos = 0; static int buffer_size = 0; static char unget_buffer = '\0'; +static char unget_buffer_stack[8]; +static int unget_buffer_stack_pos = 0; static FILE *file = NULL; </code></pre> </div> - <p>When the program calls <code>input_init</code>, we open the file and read the first chunk into the buffer. 
</p> + <p>When the program calls <code>input_init</code>, we open the file.</p> <div class="code-block"> <span class="file-name">input.c</span> <span class="block-header"> <strong class="block-title"><em><a id=":input-init" href="#:input-init">Input initialization</a></em></strong></span> <pre class="prettyprint"><code class="">void input_init(const char *filename) { - // Open the file and read the first chunk file = fopen(filename, "r"); if (file == NULL) { - fprintf(stderr, "Error: could not open file %s\n", filename); + fprintf(stderr, "Error: Cannot open file %s\n", filename); exit(1); } - buffer_pos = 0; - nextline(); }</code></pre> </div> <p>When the program calls <code>input_getc</code>, we return the next character in the buffer. If the buffer is - exhausted, we call <code>nextline</code>. </p> + exhausted, we call <code>nextline</code>. We also track the line and column. </p> <div class="code-block"> <span class="file-name">input.c</span> <span class="block-header"> <strong class="block-title"><em><a id=":input-getc" href="#:input-getc">Input getc</a></em></strong></span> <pre class="prettyprint"><code class="">int input_getc(void) { - if (unget_buffer != '\0') { - char c = unget_buffer; - unget_buffer = '\0'; - return c; + if (unget_buffer_stack_pos > 0) { + return unget_buffer_stack[--unget_buffer_stack_pos]; } if (buffer_pos == buffer_size) { - nextline(); + buffer_size = fread(buffer, 1, CHUNK_SIZE, file); + buffer_pos = 0; } if (buffer_size == 0) { return EOF; } - return buffer[buffer_pos++]; + char c = buffer[buffer_pos++]; + return c; }</code></pre> </div> <p>When the program calls <code>input_ungetc</code>, we save the character in the <code>unget_buffer</code>. 
</p> @@ -204,9 +203,8 @@ static FILE *file = NULL; <span class="block-header"> <strong class="block-title"><em><a id=":input-ungetc" href="#:input-ungetc">Input ungetc</a></em></strong></span> - <pre class="prettyprint"><code class="">void input_ungetc(int c) { - unget_buffer = c; -}</code></pre> + <pre + class="prettyprint"><code class="">void input_ungetc(int c) { unget_buffer_stack[unget_buffer_stack_pos++] = c; }</code></pre> </div> <p>Since we're not using dynamic memory allocation, cleanup is pretty simple.</p> <div class="code-block"> @@ -257,8 +255,7 @@ const char *token_string(token_t *token); char token_char(token_t *token); int token_line(token_t *token); int token_column(token_t *token); -void token_destroy(token_t *token); -</code></pre> +void token_destroy(token_t *token);</code></pre> </div> <p>I'll also define an opaque struct for tokens and an enum for token types. </p> <div class="code-block"> @@ -267,19 +264,20 @@ void token_destroy(token_t *token); <strong class="block-title"><em><a id=":token-types" href="#:token-types">Token types</a></em></strong></span> <pre class="prettyprint"><code class="">typedef struct token token_t; - typedef enum { +typedef enum { // Control Keywords - TOK_CTK_IF, // if - TOK_CTK_ELSE, // else - TOK_CTK_SWITCH, // switch - TOK_CTK_CASE, // case - TOK_CTK_DEFAULT, // default - TOK_CTK_WHILE, // while - TOK_CTK_DO, // do - TOK_CTK_FOR, // for - TOK_CTK_CONTINUE, // continue - TOK_CTK_RETURN, // return - TOK_CTK_GOTO, // goto + TOK_CTK_IF, + TOK_CTK_ELSE, + TOK_CTK_SWITCH, + TOK_CTK_CASE, + TOK_CTK_DEFAULT, + TOK_CTK_WHILE, + TOK_CTK_DO, + TOK_CTK_FOR, + TOK_CTK_CONTINUE, + TOK_CTK_BREAK, + TOK_CTK_RETURN, + TOK_CTK_GOTO, // Type Keywords TOK_TK_VOID, @@ -308,102 +306,80 @@ void token_destroy(token_t *token); TOK_MK_SIZEOF, // Operators - TOK_OP_PLUS, // + - TOK_OP_MINUS, // - - TOK_OP_ASTERISK, // * - TOK_OP_SLASH, // / - TOK_OP_PERCENT, // % - TOK_OP_AMPERSAND, // & - TOK_OP_PIPE, // | - TOK_OP_CARET, // ^ - 
TOK_OP_TILDE, // ~ - TOK_OP_EXCLAMATION, // ! - TOK_OP_EQUALS, // = - TOK_OP_LESS_THAN, // &lt; - TOK_OP_GREATER_THAN, // &gt; - TOK_OP_PLUS_PLUS, // ++ - TOK_OP_MINUS_MINUS, // -- - TOK_OP_EQUALS_EQUALS, // == - TOK_OP_NOT_EQUALS, // != - TOK_OP_LESS_EQUALS, // &lt;= - TOK_OP_GREATER_EQUALS, // &gt;= - TOK_OP_AMP_AMP, // && - TOK_OP_PIPE_PIPE, // || - TOK_OP_ARROW, // -&gt; - TOK_OP_DOT, // . - TOK_OP_COLON, // : - TOK_OP_QUESTION, // ? - TOK_OP_COMMA, // , - TOK_OP_SEMICOLON, // ; - TOK_OP_ELLIPSIS, // ... - TOK_OP_ASSIGN_PLUS, // += - TOK_OP_ASSIGN_MINUS, // -= - TOK_OP_ASSIGN_ASTERISK, // *= - TOK_OP_ASSIGN_SLASH, // /= - TOK_OP_ASSIGN_PERCENT, // %= - TOK_OP_ASSIGN_AMPERSAND, // &= - TOK_OP_ASSIGN_PIPE, // |= - TOK_OP_ASSIGN_CARET, // ^= - TOK_OP_ASSIGN_LEFT_SHIFT, // &lt;&lt;= - TOK_OP_ASSIGN_RIGHT_SHIFT, // &gt;&gt;= + TOK_OP_ADD, // + + TOK_OP_SUB, // - + TOK_OP_MUL, // * + TOK_OP_DIV, // / + TOK_OP_MOD, // % + TOK_OP_BIT_AND, // & + TOK_OP_BIT_OR, // | + TOK_OP_BIT_XOR, // ^ + TOK_OP_BIT_NOT, // ~ + TOK_OP_LSHIFT, // << + TOK_OP_RSHIFT, // >> + TOK_OP_NOT, // ! + TOK_OP_ASSIGN, // = + TOK_OP_LT, // < + TOK_OP_GT, // > + TOK_OP_INC, // ++ + TOK_OP_DEC, // -- + TOK_OP_EQ, // == + TOK_OP_NE, // != + TOK_OP_LE, // <= + TOK_OP_GE, // >= + TOK_OP_AND, // && + TOK_OP_OR, // || + TOK_OP_MEMBER_POINTER, // -> + TOK_OP_MEMBER, // . + TOK_OP_COND_DECISION, // : + TOK_OP_COND, // ? 
+ TOK_OP_ASSIGN_ADD, // += + TOK_OP_ASSIGN_SUB, // -= + TOK_OP_ASSIGN_MUL, // *= + TOK_OP_ASSIGN_DIV, // /= + TOK_OP_ASSIGN_MOD, // %= + TOK_OP_ASSIGN_BITAND, // &= + TOK_OP_ASSIGN_BITOR, // |= + TOK_OP_ASSIGN_BITXOR, // ^= + TOK_OP_ASSIGN_LSHIFT, // <<= + TOK_OP_ASSIGN_RSHIFT, // >>= // Separators - TOK_SEP_LEFT_PAREN, // ( - TOK_SEP_RIGHT_PAREN, // ) - TOK_SEP_LEFT_BRACKET, // [ + TOK_SEP_LEFT_PAREN, // ( + TOK_SEP_RIGHT_PAREN, // ) + TOK_SEP_LEFT_BRACKET, // [ TOK_SEP_RIGHT_BRACKET, // ] - TOK_SEP_LEFT_BRACE, // { - TOK_SEP_RIGHT_BRACE, // } - TOK_SEP_COMMA, // , - TOK_SEP_SEMICOLON, // ; - TOK_SEP_COLON, // : - TOK_SEP_DOT, // . - TOK_SEP_ELLIPSIS, // ... - TOK_SEP_HASH, // # + TOK_SEP_LEFT_BRACE, // { + TOK_SEP_RIGHT_BRACE, // } + TOK_SEP_COMMA, // , + TOK_SEP_SEMICOLON, // ; + TOK_SEP_DOT, // . + TOK_SEP_ELLIPSIS, // ... + TOK_SEP_HASH, // # // Identifiers - TOK_ID, // Any identifier not ending in _t - TOK_TID, // Any identifier ending in _t + TOK_ID, // Any identifier not ending in _t // Constants - TOK_CONST_INTEGER_U32, // u - TOK_CONST_INTEGER_U64, // ul - TOK_CONST_INTEGER_S32, // (no suffix) - TOK_CONST_INTEGER_S64, // l - TOK_CONST_FLOAT_32, // f - TOK_CONST_FLOAT_64, // (no suffix) - TOK_CONST_CHAR, // 'c' + TOK_CONST_INTEGER_U32, // u + TOK_CONST_INTEGER_U64, // ul + TOK_CONST_INTEGER_S32, // (no suffix) + TOK_CONST_INTEGER_S64, // l + TOK_CONST_FLOAT_32, // f + TOK_CONST_FLOAT_64, // (no suffix) + TOK_CONST_CHAR, // 'c' TOK_CONST_STRING_ASCII, // "string" (width of 8 bits) - TOK_CONST_STRING_UNICODE, // L"string" (width of 32 bits) // Special TOK_SPECIAL_EOF, TOK_SPECIAL_ERROR, -} c_token_types; -</code></pre> +} c_token_types;</code></pre> </div> - <p>One decision of interest here is the separation of identifiers into two categories: <code>TOK_ID</code> and - <code>TOK_TID</code>. 
This helps prevent the need for a lexer hack for typedefs, as something like - <div class="code-block"> - <span class="file-name">N/A</span> - <span class="block-header"> - <strong class="block-title"><em><a id=":vexing-parse" href="#:vexing-parse">The - Vexing Parse</a></em></strong></span> - <pre class="prettyprint"><code class="">typedef int foo; -int main() { - foo foo; - foo = 5; - return foo; -}</code></pre> - </div> - which requires syntactic context to parse correctly is not allowed (since foo is a <code - class="prettyprint">TOK_ID</code> and not a - <code class="prettyprint">TOK_TID</code>, the parser will error when it's in the position of a type). </p> <h4>Tokenizer Design Decisions</h4> <p>A lot of introductory compiler courses focus on the design of deterministic finite automatas for lexing. While that's definitely interesting from a CS perspective, for this project I'll be focusing more on simplicity and readability, and so I'll write a lexer by hand. </p> - <p>I'll implement the token API first, and then write the tokenizer</p> + <p>I'll implement the token API first, and then write the tokenizer. This'll be a 'one-at-a-time' tokenizer, and won't return a list of tokens. </p> <h4>Token Implementation</h4> <p>The token struct is pretty simple. It contains the token information, some metadata about the token, and the optional value of the token. 
</p> @@ -412,30 +388,28 @@ int main() { <span class="block-header"> <strong class="block-title"><em><a id=":token-struct" href="#:token-struct">Token struct</a></em></strong></span> - <pre class="prettyprint"><code class=""> -#define TOK_MAGIC_1 0x544F4B454E544F4B // "TOKENTOK" -#define TOK_MAGIC_2 0x544F4B544F4B454E // "TOKTOKEN" -struct token { - long magic; - int line; - int column; - short kind; - char opt_data[0]; -}; - -typedef struct token token_t; - -struct token_data { - union { - long long i; - double f; - const char *s; - char c; - } data; -}; - -typedef struct token_data token_data_t; -</code></pre> + <pre class="prettyprint"><code class="">#define TOK_MAGIC_1 0x544F4B454E544F4Bul // "TOKENTOK" + #define TOK_MAGIC_2 0x544F4B544F4B454Eul // "TOKTOKEN" + struct token { + long magic; + int line; + int column; + short kind; + long opt_data[0]; + }; + + typedef struct token token_t; + + struct token_data { + union { + long long i; + double f; + const char *s; + char c; + } data; + }; + + typedef struct token_data token_data_t;</code></pre> </div> <p>By keeping the token data off of simple tokens (like keywords and operators), we save 8 bytes per token. This will add up quickly, as the lexer will produce a lot of tokens. 
</p> @@ -445,46 +419,62 @@ typedef struct token_data token_data_t; <span class="block-header"> <strong class="block-title"><em><a id=":token-internals" href="#:token-internals">Token Internals</a></em></strong></span> - <pre class="prettyprint"><code class="">#define token_data(token) ((struct token_data *)((token)-&gt;opt_data)) -static token_t *token_data_create(c_token_types kind, int line, int column) { + <pre class="prettyprint"><code class="">#define token_data(token) ((struct token_data *)((token)->opt_data)) +static token_t *token_data_create(c_token_types kind, int lin, int col, + int len) { token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data)); - token-&gt;magic = TOK_MAGIC_1; - token-&gt;line = line; - token-&gt;column = column; - token-&gt;kind = kind; + if (token == NULL) { + fputs("Out of memory\n", stderr); + exit(1); + } + token->magic = TOK_MAGIC_1; + token->line = lin; + token->column = col; + column += len; + token->kind = kind; return token; } -static token_t *token_create(c_token_types kind, int line, int column) { +static token_t *token_create(c_token_types kind, int lin, int col, int len) { token_t *token = malloc(sizeof(token_t)); - token-&gt;magic = TOK_MAGIC_2; - token-&gt;line = line; - token-&gt;column = column; - token-&gt;kind = kind; + if (token == NULL) { + fputs("Out of memory\n", stderr); + exit(1); + } + token->magic = TOK_MAGIC_2; + token->line = lin; + token->column = col; + column += len; + token->kind = kind; return token; } -static token_t *token_create_int(c_token_types kind, int line, int column, int64_t i) { - token_t *token = token_data_create(kind, line, column); - token_data(token)-&gt;data.i = i; +static token_t *token_create_int(c_token_types kind, int lin, int col, + int64_t i, int len) { + token_t *token = token_data_create(kind, lin, col, len); + token_data(token)->data.i = i; return token; } -static token_t *token_create_float(c_token_types kind, int line, int column, double f) { - token_t *token = 
token_data_create(kind, line, column); - token_data(token)-&gt;data.f = f; +static token_t *token_create_float(c_token_types kind, int lin, int col, + double f, int len) { + token_t *token = token_data_create(kind, lin, col, len); + token_data(token)->data.f = f; return token; } -static token_t *token_create_string(c_token_types kind, int line, int column, const char *s) { - // Not yet +static token_t *token_create_char(c_token_types kind, int lin, int col, char c, + int len) { + token_t *token = token_data_create(kind, lin, col, len); + token_data(token)->data.c = c; + return token; } -static token_t *token_create_char(c_token_types kind, int line, int column, char c) { - token_t *token = token_data_create(kind, line, column); - token_data(token)-&gt;data.c = c; - return token; +static token_t *token_create_string(c_token_types kind, int lin, int col, + const char *s, int len) { + // Not yet } + </code></pre> </div> <p>You might wonder why token_create_string was left out. Before you ask that, try counting up all the repeated @@ -511,13 +501,15 @@ static token_t *token_create_char(c_token_types kind, int line, int column, char <pre class="prettyprint"><code class="">typedef struct hash_table hash_table_t; typedef struct hash_table_entry hash_table_entry_t; typedef int (*hash_table_cmp_fn)(void *key1, void *key2); -typedef unsigned long (*hash_table_hash_fn)(void *key); -hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash); -void hash_table_destroy(hash_table_t *table); +typedef unsigned int (*hash_table_hash_fn)(void *key); +hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, + hash_table_hash_fn hash); + +typedef void (*hash_table_dtor)(void *value, int is_key); +void hash_table_destroy(hash_table_t *table, hash_table_dtor dtor); void *hash_table_get(hash_table_t *table, void *key); int hash_table_put(hash_table_t *table, void *key, void *value, int replace); -void hash_table_remove(hash_table_t *table, void 
*key); -</code></pre> +void hash_table_remove(hash_table_t *table, void *key);</code></pre> </div> <p>Now that we have the hash table interface, we can implement the generic hash table. </p> <div class="code-block"> @@ -537,88 +529,96 @@ struct hash_table_entry { hash_table_entry_t *next; }; -hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash) { +hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, + hash_table_hash_fn hash) { hash_table_t *table = malloc(sizeof(hash_table_t)); if (table == NULL) { return NULL; } - table-&gt;entries = calloc(size, sizeof(hash_table_entry_t *)); - if (table-&gt;entries == NULL) { + table->entries = calloc(size, sizeof(hash_table_entry_t *)); + if (table->entries == NULL) { free(table); return NULL; } - table-&gt;size = size; - table-&gt;cmp = cmp; - table-&gt;hash = hash; + table->size = size; + table->cmp = cmp; + table->hash = hash; return table; } -void hash_table_destroy(hash_table_t *table) { - for (int i = 0; i &lt; table-&gt;size; i++) { - hash_table_entry_t *entry = table-&gt;entries[i]; +void hash_table_destroy(hash_table_t *table, hash_table_dtor dtor) { + for (int i = 0; i < table->size; i++) { + hash_table_entry_t *entry = table->entries[i]; while (entry != NULL) { - hash_table_entry_t *next = entry-&gt;next; + hash_table_entry_t *next = entry->next; + if (dtor != NULL) { + dtor(entry->key, 1); + dtor(entry->value, 0); + } free(entry); entry = next; } } - free(table-&gt;entries); + free(table->entries); free(table); } void *hash_table_get(hash_table_t *table, void *key) { - unsigned long hash = table-&gt;hash(key) % table-&gt;size; - hash_table_entry_t *entry = table-&gt;entries[hash]; + unsigned long hash = table->hash(key) % table->size; + hash_table_entry_t *entry = table->entries[hash]; while (entry != NULL) { - if (table-&gt;cmp(entry-&gt;key, key) == 0) { - return entry-&gt;value; + if (table->cmp(entry->key, key) == 0) { + return entry->value; } - entry = 
entry-&gt;next; + entry = entry->next; } return NULL; } int hash_table_put(hash_table_t *table, void *key, void *value, int replace) { - unsigned long hash = table-&gt;hash(key) % table-&gt;size; - hash_table_entry_t *entry = table-&gt;entries[hash]; + unsigned long hash = table->hash(key) % table->size; + hash_table_entry_t *entry = table->entries[hash]; while (entry != NULL) { - if (table-&gt;cmp(entry-&gt;key, key) == 0) { + if (table->cmp(entry->key, key) == 0) { if (replace) { - entry-&gt;value = value; + entry->value = value; return 0; } else { - return 1; + return 1; } } - entry = entry-&gt;next; + entry = entry->next; } entry = malloc(sizeof(hash_table_entry_t)); - entry-&gt;key = key; - entry-&gt;value = value; - entry-&gt;next = table-&gt;entries[hash]; - table-&gt;entries[hash] = entry; + if (entry == NULL) { + fprintf(stderr, "Error: Out of memory. Could not allocate hash table entry\n"); + exit(1); + } + entry->key = key; + entry->value = value; + entry->next = table->entries[hash]; + table->entries[hash] = entry; return 0; } void hash_table_remove(hash_table_t *table, void *key) { - unsigned long hash = table-&gt;hash(key) % table-&gt;size; - hash_table_entry_t *entry = table-&gt;entries[hash]; + unsigned long hash = table->hash(key) % table->size; + hash_table_entry_t *entry = table->entries[hash]; hash_table_entry_t *prev = NULL; while (entry != NULL) { - if (table-&gt;cmp(entry-&gt;key, key) == 0) { + if (table->cmp(entry->key, key) == 0) { if (prev == NULL) { - table-&gt;entries[hash] = entry-&gt;next; + table->entries[hash] = entry->next; } else { - prev-&gt;next = entry-&gt;next; + prev->next = entry->next; } free(entry); return; } prev = entry; - entry = entry-&gt;next; + entry = entry->next; } -} -</code></pre> +}</code></pre> </div> <p>One implementation detail of note is that hash_table_put returns 1 if the key already exists in the table, and, unless replace is set to 1, will not replace the value. 
</p> @@ -627,28 +627,39 @@ void hash_table_remove(hash_table_t *table, void *key) { <h4>String hashing</h4> <p>Now we can implement the string hash table for the tokenizer.</p> <p>A lot of research has been done on hash functions (See <a href="https://www.strchr.com/hash_functions">here for a - good review of theory and comparison of various functions</a>). For this project, I'll be using the hash - function from the famous "Red Dragon Book" (Aho, Sethi, Ullman, Compilers: Principles, Techniques, and Tools, 1st - edition). as it's optimized for compiler symbols. </p> + good review of theory and comparison of various functions</a>). For this project, I'll be using the ELF hash + function from glibc, which + </p> <div class="code-block"> <span class="file-name">tokenizer.c</span> <span class="block-header"> <strong class="block-title"><em><a id=":string-hash-function" href="#:string-hash-function">String hash function</a></em></strong></span> - <pre class="prettyprint"><code class="">#define PRIME 211 -static unsigned long hash_string(void *key) { - unsigned long hash = 0, g; + <pre class="prettyprint"><code class="">static unsigned int hash_string(void *key) { + unsigned long hash = 0, hi = 0; char *p = key; - while (*p) { - hash = (hash &lt;&lt; 4) + *p++; - if ((g = hash &amp; 0xf0000000) != 0) { - hash ^= g &gt;&gt; 24; - hash ^= g; + hash = *p; + if (hash != 0 && p[1] != 0) { + hash = (hash << 4) + p[1]; + if (p[2] != 0) { + hash = (hash << 4) + p[2]; + if (p[3] != 0) { + hash = (hash << 4) + p[3]; + if (p[4] != 0) { + hash = (hash << 4) + p[4]; + p += 5; + while (*p != 0) { + hash = (hash << 4) + *p++; + hi = hash & 0xf0000000l; + hash ^= hi >> 24; + } + hash &= 0x0fffffffl; + } + } } } return hash; -} -</code></pre> +}</code></pre> </div> <p>We'll also need a comparison function for strings. 
</p> <div class="code-block"> @@ -668,17 +679,18 @@ static unsigned long hash_string(void *key) { <strong class="block-title"><em><a id=":string-creation" href="#:string-creation">String creation</a></em></strong></span> <pre class="prettyprint"><code class="">static hash_table_t *string_table; -static token_t *token_create_string(c_token_types kind, int line, int column, const char *s) { +static token_t *token_create_string(c_token_types kind, int lin, int col, + const char *s, int len) { if (string_table == NULL) { - string_table = hash_table_create(1024, cmp_string, hash_string); + string_table = hash_table_create(2048, cmp_string, hash_string); } - token_t *token = token_data_create(kind, line, column); - char *key = strdup(s); - if (hash_table_put(string_table, key, key, 0)) { - free(key); - key = hash_table_get(string_table, key); + token_t *token = token_data_create(kind, lin, col, len); + char *key = hash_table_get(string_table, (void *)s); + if (key == NULL) { + key = strdup(s); + hash_table_put(string_table, key, key, 1); } - token_data(token)-&gt;data.s = key; + token_data(token)->data.s = key; return token; } </code></pre> @@ -692,57 +704,52 @@ static token_t *token_create_string(c_token_types kind, int line, int column, co <span class="block-header"> <strong class="block-title"><em><a id=":external-token-api" href="#:external-token-api">External token API</a></em></strong></span> - <pre class="prettyprint"><code class=""> -c_token_types token_type(token_t *token) { - assert(token-&gt;magic == TOK_MAGIC_1 || token-&gt;magic == TOK_MAGIC_2); - return token-&gt;kind; + <pre class="prettyprint"><code class="">c_token_types token_type(token_t *token) { + assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2); + return token->kind; } int64_t token_int(token_t *token) { - assert(token-&gt;kind == TOK_CONST_INTEGER_U32 || - token-&gt;kind == TOK_CONST_INTEGER_U64 || - token-&gt;kind == TOK_CONST_INTEGER_S32 || - token-&gt;kind == 
TOK_CONST_INTEGER_S64); - assert(token-&gt;magic == TOK_MAGIC_1); - return token_data(token)-&gt;data.i; + assert(token->kind == TOK_CONST_INTEGER_U32 || + token->kind == TOK_CONST_INTEGER_U64 || + token->kind == TOK_CONST_INTEGER_S32 || + token->kind == TOK_CONST_INTEGER_S64); + assert(token->magic == TOK_MAGIC_1); + return token_data(token)->data.i; } double token_float(token_t *token) { - assert(token-&gt;kind == TOK_CONST_FLOAT_32 || - token-&gt;kind == TOK_CONST_FLOAT_64); - assert(token-&gt;magic == TOK_MAGIC_1); - return token_data(token)-&gt;data.f; + assert(token->kind == TOK_CONST_FLOAT_32 || + token->kind == TOK_CONST_FLOAT_64); + assert(token->magic == TOK_MAGIC_1); + return token_data(token)->data.f; } const char *token_string(token_t *token) { - assert(token-&gt;kind == TOK_CONST_STRING_ASCII); - assert(token-&gt;magic == TOK_MAGIC_1); - return token_data(token)-&gt;data.s; + assert(token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID); + assert(token->magic == TOK_MAGIC_1); + return token_data(token)->data.s; } char token_char(token_t *token) { - assert(token-&gt;kind == TOK_CONST_CHAR); - assert(token-&gt;magic == TOK_MAGIC_1); - return token_data(token)-&gt;data.c; + assert(token->kind == TOK_CONST_CHAR); + assert(token->magic == TOK_MAGIC_1); + return token_data(token)->data.c; } int token_line(token_t *token) { - assert(token-&gt;magic == TOK_MAGIC_1 || token-&gt;magic == TOK_MAGIC_2); - return token-&gt;line; + assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2); + return token->line; } int token_column(token_t *token) { - assert(token-&gt;magic == TOK_MAGIC_1 || token-&gt;magic == TOK_MAGIC_2); - return token-&gt;column; + assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2); + return token->column; } void token_destroy(token_t *token) { - assert(token-&gt;magic == TOK_MAGIC_1 || token-&gt;magic == TOK_MAGIC_2); - if (token-&gt;magic == TOK_MAGIC_1) { - if (token-&gt;kind == TOK_CONST_STRING_ASCII) { - 
free((void *)token_data(token)-&gt;data.s); - } - } + assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2); + // Don't free the string table, it's a global variable free(token); }</code></pre> </div> @@ -755,7 +762,10 @@ void token_destroy(token_t *token) { <span class="block-header"> <strong class="block-title"><em><a id=":tokenizer-implementation" href="#:tokenizer-implementation">Tokenizer implementation</a></em></strong></span> - <pre class="prettyprint"><code class="">token_t *unget_token = NULL; + <pre class="prettyprint"><code class="">int line = 1; +int column = 1; +char file_name[1024]; +token_t *unget_token = NULL; void tokenizer_init(const char *filename) { input_init(filename); @@ -774,7 +784,25 @@ void tokenizer_destroy(void) { hash_table_destroy(string_table); } -}</code></pre> +} + +token_t *read_token(void) { + if (unget_token != NULL) { + token_t *tok = unget_token; + unget_token = NULL; + return tok; + } + // What goes here? + int c = input_getc(); + if (c == EOF) { + return NULL; + } + tok_warn( + "Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c, + line, column); + return read_token(); +} +</code></pre> </div> <p>Now we can implement the meat of the tokenizer. The general structure is as follows: </p> <ol> @@ -797,7 +825,7 @@ void tokenizer_destroy(void) <span class="block-header"> <strong class="block-title"><em><a id=":skip-whitespace" href="#:skip-whitespace">Skip whitespace</a></em></strong></span> - <pre class="prettyprint"><code class="">static void skip_whitespace(void) { + <pre class="prettyprint"><code class="">static token_t *skip_whitespace(void) { int c; while ((c = input_getc()) != EOF) { if (isspace(c)) { @@ -807,6 +835,23 @@ void tokenizer_destroy(void) } else { column++; } + } else if (c == '#') // GCC preprocessor line control directive. 
+ { + char buf[512]; + int i = 0; + while ((c = input_getc()) != EOF && c != '\n') { + buf[i++] = c; + column++; + } + buf[i] = '\0'; + if (sscanf(buf, "%d \"%[^\"]\"", &line, file_name) == 2) { + column = 1; + } else { + tok_error("Invalid #line directive\n"); + } + if (c == EOF) { + return NULL; + } } else if (c == '/') { c = input_getc(); if (c == '/') { @@ -814,7 +859,7 @@ void tokenizer_destroy(void) column++; } if (c == EOF) { - return; + return NULL; } line++; column = 1; @@ -833,17 +878,20 @@ void tokenizer_destroy(void) } } if (c == EOF) { - return; + return NULL; } } else { + if (c == '=') + return token_create(TOK_OP_ASSIGN_DIV, line, column, 2); input_ungetc(c); - return; + return token_create(TOK_OP_DIV, line, column, 1); } } else { input_ungetc(c); - return; + return NULL; } } + return NULL; }</code></pre> </div> <p>Now we'll implement code to recognize keywords and identifiers. The rules for identifiers are that they must @@ -858,95 +906,1190 @@ void tokenizer_destroy(void) char buf[1024]; int i = 0; c = input_getc(); - if (!isalpha(c) &amp;&amp; c != '_') { + if (!isalpha(c) && c != '_') { input_ungetc(c); return NULL; } buf[i++] = c; while ((c = input_getc()) != EOF) { - if (!isalnum(c) &amp;&amp; c != '_') { + if (!isalnum(c) && c != '_') { input_ungetc(c); break; } buf[i++] = c; + if (i >= 1008) { + tok_error("Identifier too long\n"); + exit(1); + } } buf[i] = '\0'; - + column += i; // Check if it's a keyword - c_token_types kind = get_keyword(buf); - - if (kind == TOK_ID) { - // Check if it's a typedeffed identifier - int len = strlen(buf); - if (len &gt; 2 &amp;&amp; buf[len - 2] == '_' &amp;&amp; buf[len - 1] == 't') { - kind = TOK_TID; - } + c_token_types kind = get_keyword(buf, i); + if (kind != TOK_ID) { + return token_create(kind, line, column, i); } - return token_create_string(kind, line, column, buf); + return token_create_string(kind, line, column, buf, i); }</code></pre> </div> + <p> We have to match the following keywords: </p> + 
<ul> + <li>Control Keywords: <code>if</code>, <code>else</code>, <code>switch</code>, <code>case</code>, + <code>default</code>, <code>while</code>, <code>do</code>, <code>for</code>, <code>continue</code>, + <code>return</code>, <code>goto</code> + </li> + <li>Type Keywords: <code>void</code>, <code>char</code>, <code>short</code>, <code>int</code>, <code>long</code>, + <code>float</code>, <code>double</code>, <code>unsigned</code>, <code>struct</code>, <code>union</code>, + <code>enum</code>, <code>typedef</code> + </li> + <li>Storage Class/Specifier Keywords: <code>auto</code>, <code>register</code>, <code>static</code>, + <code>extern</code>, <code>const</code>, <code>volatile</code> + </li> + <li>Misc Keywords: <code>sizeof</code></li> + </ul> <p>For recognizing keywords, I'll use a decision tree. This is a simple and space-efficient way to recognize keywords. Originally I wanted to use a trie, but those end up using a lot of memory for a small number of items. </p> - <p> To generate the decision tree, I'll use a simple C program which works off the following template: </p> - <div class="code-block"> - <span class="file-name">N/A</span> - <span class="block-header"> - <strong class="block-title"><em><a id=":keyword-decision-tree" href="#:keyword-decision-tree">Keyword - decision tree</a></em></strong></span> - <pre class="prettyprint"><code class="">c_token_types get_keyword(const char *buf) { + <p> To build that, we first need to figure out what keywords have common prefixes. 
We can do that by sorting them: + </p> + <div class="code-block"> + <span class="file-name">N/A</span> + <span class="block-header"> + <strong class="block-title"><em><a id=":keyword-sort" href="#:keyword-sort">Keyword + sort</a></em></strong></span> + <pre class="prettyprint"><code class="">auto +break +case +char +const +continue +default +do +double +else +enum +extern +float +for +goto +if +int +long +register +return +short +signed +sizeof +static +struct +switch +typedef +union +unsigned +void +volatile +while</code></pre> + </div> + <p>From this, the organization is pretty clear. </p> + <div class="code-block"> + <span class="file-name">N/A</span> + <span class="block-header"> + <strong class="block-title"><em><a id=":keyword-decision-tree" href="#:keyword-decision-tree">Keyword + decision tree</a></em></strong></span> + <pre class="prettyprint"><code class="">c_token_types get_keyword(const char *buf, int len) { switch (buf[0]) { - case [first letter]: - switch (buf[1]) { - case [second letter]: - switch (buf[2]) { - case [third letter]: - return [keyword]; - default: - return TOK_ID; - } - default: - return TOK_ID; + case 'a': + if (len == 4 && buf[1] == 'u' && buf[2] == 't' && buf[3] == 'o') + return TOK_SCSK_AUTO; + break; + + case 'b': + if (len == 5 && buf[1] == 'r' && buf[2] == 'e' && buf[3] == 'a' && + buf[4] == 'k') + return TOK_CTK_BREAK; + break; + + case 'c': + switch (buf[1]) { + case 'a': + if (len == 4 && buf[2] == 's' && buf[3] == 'e') + return TOK_CTK_CASE; + break; + case 'h': + if (len == 4 && buf[2] == 'a' && buf[3] == 'r') + return TOK_TK_CHAR; + break; + case 'o': + if (len == 5 && buf[2] == 'n' && buf[3] == 's' && buf[4] == 't') + return TOK_SCSK_CONST; + if (len == 8 && buf[2] == 'n' && buf[3] == 't' && buf[4] == 'i' && + buf[5] == 'n' && buf[6] == 'u' && buf[7] == 'e') + return TOK_CTK_CONTINUE; + break; + } + break; + + case 'd': + switch (buf[1]) { + case 'e': + if (len == 7 && buf[2] == 'f' && buf[3] == 'a' && buf[4] == 'u' && + 
buf[5] == 'l' && buf[6] == 't') + return TOK_CTK_DEFAULT; + break; + case 'o': + if (len == 2 && buf[2] == '\0') + return TOK_CTK_DO; + if (len == 6 && buf[2] == 'u' && buf[3] == 'b' && buf[4] == 'l' && + buf[5] == 'e') + return TOK_TK_DOUBLE; + break; + } + break; + + case 'e': + switch (buf[1]) { + case 'l': + if (len == 4 && buf[2] == 's' && buf[3] == 'e') + return TOK_CTK_ELSE; + break; + case 'n': + if (len == 4 && buf[2] == 'u' && buf[3] == 'm') + return TOK_TK_ENUM; + break; + case 'x': + if (len == 6 && buf[2] == 't' && buf[3] == 'e' && buf[4] == 'r' && + buf[5] == 'n') + return TOK_SCSK_EXTERN; + break; + } + break; + + case 'f': + switch (buf[1]) { + case 'l': + if (len == 5 && buf[2] == 'o' && buf[3] == 'a' && buf[4] == 't') + return TOK_TK_FLOAT; + break; + case 'o': + if (len == 3 && buf[2] == 'r') + return TOK_CTK_FOR; + break; + } + break; + + case 'g': + if (len == 4 && buf[1] == 'o' && buf[2] == 't' && buf[3] == 'o') + return TOK_CTK_GOTO; + break; + + case 'i': + switch (buf[1]) { + case 'f': + if (len == 2 && buf[2] == '\0') + return TOK_CTK_IF; + break; + case 'n': + if (len == 3 && buf[2] == 't') + return TOK_TK_INT; + break; + } + break; + + case 'l': + if (len == 4 && buf[1] == 'o' && buf[2] == 'n' && buf[3] == 'g') + return TOK_TK_LONG; + break; + + case 'r': + switch (buf[1]) { + case 'e': + if (len == 8 && buf[2] == 'g' && buf[3] == 'i' && buf[4] == 's' && + buf[5] == 't' && buf[6] == 'e' && buf[7] == 'r') + return TOK_SCSK_REGISTER; + if (len == 6 && buf[2] == 't' && buf[3] == 'u' && buf[4] == 'r' && + buf[5] == 'n') + return TOK_CTK_RETURN; + break; + } + break; + + case 's': + switch (buf[1]) { + case 'h': + if (len == 5 && buf[2] == 'o' && buf[3] == 'r' && buf[4] == 't') + return TOK_TK_SHORT; + break; + case 't': + if (len == 6 && buf[2] == 'a' && buf[3] == 't' && buf[4] == 'i' && + buf[5] == 'c') + return TOK_SCSK_STATIC; + break; + case 'i': + if (len == 6 && buf[2] == 'g' && buf[3] == 'n' && buf[4] == 'e' && + buf[5] == 'd') + 
return TOK_TK_SIGNED; + if (len == 6 && buf[2] == 'z' && buf[3] == 'e' && buf[4] == 'o' && + buf[5] == 'f') + return TOK_MK_SIZEOF; + break; + case 'r': + if (len == 6 && buf[2] == 'u' && buf[3] == 'c' && buf[4] == 't') + return TOK_TK_STRUCT; + break; + case 'w': + if (len == 6 && buf[2] == 'i' && buf[3] == 't' && buf[4] == 'c' && + buf[5] == 'h') + return TOK_CTK_SWITCH; + break; + } + break; + + case 't': + if (len == 7 && buf[1] == 'y' && buf[2] == 'p' && buf[3] == 'e' && + buf[4] == 'd' && buf[5] == 'e' && buf[6] == 'f') + return TOK_TK_TYPEDEF; + break; + + case 'u': + switch (buf[1]) { + case 'n': + if (len == 5 && buf[2] == 'i' && buf[3] == 'o' && buf[4] == 'n') + return TOK_TK_UNION; + if (len == 8 && buf[2] == 's' && buf[3] == 'i' && buf[4] == 'g' && + buf[5] == 'n' && buf[6] == 'e' && buf[7] == 'd') + return TOK_TK_UNSIGNED; + break; + } + break; + + case 'v': + switch (buf[1]) { + case 'o': + if (len == 4 && buf[2] == 'i' && buf[3] == 'd') + return TOK_TK_VOID; + if (len == 8 && buf[2] == 'l' && buf[3] == 'a' && buf[4] == 't' && + buf[5] == 'i' && buf[6] == 'l' && buf[7] == 'e') + return TOK_SCSK_VOLATILE; + break; + } + break; + + case 'w': + if (len == 5 && buf[1] == 'h' && buf[2] == 'i' && buf[3] == 'l' && + buf[4] == 'e') + return TOK_CTK_WHILE; + break; + + default: + return TOK_ID; + } + return TOK_ID; +}</code></pre> + </div> + <p> It's very tedious, but it performs pretty well. You might wonder why I didn't use strcmp, and the answer is that + this approach turned out faster in testing, plus, considering I want this to be self-hosting, I can't rely on + optimizations in the standard library. </p> + <p>We'll use the same approach for recognizing operators. Let's list both out (Table 2-3 in the C + Reference Manual). 
</p> + <div class="code-block"> + <span class="file-name">N/A</span> + <span class="block-header"> + <strong class="block-title"><em><a id=":operator" href="#:operators">Operators</a></em></strong></span> + <pre class="prettyprint"><code class="">! +!= +% +%= +&amp; +&amp;&amp; +&amp;= +( +) +* +*= ++ +++ ++= +, +- +-- +-= +-&gt; +. +... +/ +/= +: +; +&lt; +&lt;&lt; +&lt;&lt;= +&lt;= += +== +&gt; +&gt;= +&gt;&gt; +&gt;&gt;= +? +[ +] +^ +^= +{ +| +|= +|| +} +~</code></pre> + </div> + <p>And the decision tree for operators: </p> + <div class="code-block"> + <span class="file-name">tokenizer.c</span> + <span class="block-header"> + <strong class="block-title"><em><a id=":operator-decision-tree" href="#:operator-decision-tree">Operator + decision tree</a></em></strong></span> + <pre class="prettyprint"><code class="">token_t *read_operator(void) { + int c; + c = input_getc(); + switch (c) { + case '!': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_NE, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_NOT, line, column, 1); + } + case '%': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_MOD, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_MOD, line, column, 1); + } + case '&': { + c = input_getc(); + if (c == '&') + return token_create(TOK_OP_AND, line, column, 2); + if (c == '=') + return token_create(TOK_OP_ASSIGN_BITAND, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_BIT_AND, line, column, 1); + } + case '(': + return token_create(TOK_SEP_LEFT_PAREN, line, column, 1); + case ')': + return token_create(TOK_SEP_RIGHT_PAREN, line, column, 1); + case '*': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_MUL, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_MUL, line, column, 1); + } + case '+': { + c = input_getc(); + if (c == '+') + return token_create(TOK_OP_INC, line, column, 2); + if (c == '=') + return 
token_create(TOK_OP_ASSIGN_ADD, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_ADD, line, column, 2); + } + case ',': + return token_create(TOK_SEP_COMMA, line, column, 1); + case '-': { + c = input_getc(); + if (c == '-') + return token_create(TOK_OP_DEC, line, column, 2); + if (c == '=') + return token_create(TOK_OP_ASSIGN_SUB, line, column, 2); + if (c == '>') + return token_create(TOK_OP_MEMBER_POINTER, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_SUB, line, column, 1); + } + case '.': { + c = input_getc(); + if (c == '.') { + c = input_getc(); + if (c == '.') { + return token_create(TOK_SEP_ELLIPSIS, line, column, 3); + } else { + // Bail out, can't store more than one unget + tok_error("Unexpected character '.' at line %d, column %d\n", line, + column); + exit(1); } - default: - return TOK_ID; + } + return token_create('.', line, column, 1); + } + case '/': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_DIV, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_DIV, line, column, 1); + } + case ':': + return token_create(TOK_OP_COND_DECISION, line, column, 1); + case ';': + return token_create(TOK_SEP_SEMICOLON, line, column, 1); + case '<': { + c = input_getc(); + if (c == '<') { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_LSHIFT, line, column, 3); + input_ungetc(c); + return token_create(TOK_OP_LSHIFT, line, column, 2); + } + if (c == '=') + return token_create(TOK_OP_LE, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_LT, line, column, 1); + } + case '=': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_ASSIGN, line, column, 1); + } + case '>': { + c = input_getc(); + if (c == '>') { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_RSHIFT, line, column, 3); + input_ungetc(c); + return token_create(TOK_OP_RSHIFT, line, 
column, 2); + } + if (c == '=') + return token_create(TOK_OP_GE, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_GT, line, column, 1); + } + case '?': + return token_create(TOK_OP_COND, line, column, 1); + case '[': + return token_create(TOK_SEP_LEFT_BRACKET, line, column, 1); + case ']': + return token_create(TOK_SEP_RIGHT_BRACKET, line, column, 1); + case '^': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_BITXOR, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_BIT_XOR, line, column, 1); + } + case '{': + return token_create(TOK_SEP_LEFT_BRACE, line, column, 1); + case '|': { + c = input_getc(); + if (c == '|') + return token_create(TOK_OP_OR, line, column, 2); + if (c == '=') + return token_create(TOK_OP_ASSIGN_BITOR, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_BIT_OR, line, column, 1); + } + case '}': + return token_create(TOK_SEP_RIGHT_BRACE, line, column, 1); + case '~': + return token_create(TOK_OP_BIT_NOT, line, column, 1); + default: + input_ungetc(c); + return NULL; + } +} +</code></pre> + </div> + <p>Now we can implement the functions for reading numbers. 
C recognizes the following grammar for numbers (if you + haven't seen a formal grammar before, you read '::=' as "can be", '|' as "or", and '[]' as "optional", so an + integer constant can be a decimal constant with an optional suffix, an octal constant with an optional suffix, or + a hexadecimal constant with an optional suffix): </p> + <div + style="color: #333; background-color: #f8f8f8; border: 1px solid #ccc; border-radius: 4px; margin: 20px 0; padding: 20px;"> + <pre class="prettyprint"><code class="">constant ::= integer-constant | floating-constant + + integer-constant ::= decimal-constant [integer-suffix] + | octal-constant [integer-suffix] + | hexadecimal-constant [integer-suffix] + + decimal-constant ::= nonzero-digit [digit-sequence] | '0' + + octal-constant ::= '0' octal-digit-sequence + + hexadecimal-constant ::= '0x' hexadecimal-digit-sequence + | '0X' hexadecimal-digit-sequence + + integer-suffix ::= 'u' | 'U' | 'l' | 'L' + | 'ul' | 'UL' | 'lu' | 'LU' + + floating-constant ::= fractional-constant [exponent-part] [floating-suffix] + | digit-sequence exponent-part [floating-suffix] + + fractional-constant ::= digit-sequence '.' [digit-sequence] + | '.' digit-sequence + + exponent-part ::= 'e' [sign] digit-sequence + | 'E' [sign] digit-sequence + + sign ::= '+' | '-' + + floating-suffix ::= 'f' | 'F' + + digit-sequence ::= digit [digit-sequence] + + nonzero-digit ::= '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' + + octal-digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' + + hexadecimal-digit ::= digit + | 'a' | 'b' | 'c' | 'd' | 'e' | 'f' + | 'A' | 'B' | 'C' | 'D' | 'E' | 'F' + + digit ::= '0' | nonzero-digit +</code></pre> + </div> + <p>Where the following semantics apply: </p> + <ul> + <li>Integer constants are evaluated according to their radix, where <code>decimal-constant</code> is base 10, + <code>octal-constant</code> is base 8, and <code>hexadecimal-constant</code> is base 16. 
+ </li> + <li>Integer suffixes are used to specify the type of the constant. A suffix of <code>u</code> or <code>U</code> + specifies an unsigned integer, and a suffix of <code>l</code> or <code>L</code> specifies a long integer. </li> + <li>Suffixes can be combined, so <code>ul</code> specifies an unsigned long integer. </li> + <li>Floating constants are evaluated as double precision unless a suffix of <code>f</code> or <code>F</code> is + used, in which case they are evaluated as single precision. </li> + </ul> + <p> Looking at these rules, we can see that all valid integer constants start with a digit from 0-9 or a period. + Since periods are also used for struct members, we'll have to handle that as well. </p> + <p> The code for reading numbers is pretty complex, so I'll comment it heavily. </p> + <div class="code-block"> + <span class="file-name">tokenizer.c</span> + <span class="block-header"> + <strong class="block-title"><em><a id=":read-number" href="#:read-number">Read + number</a></em></strong></span> + <pre class="prettyprint"><code class=""> +static token_t *read_number(void) { + int c; + char buf[1024]; + int i = 0; + c = input_getc(); + // If we don't have a digit or decimal point, it's not a number + if (!isdigit(c) && c != '.') { + input_ungetc(c); + return NULL; + } + // Decimal point followed by non-digit is a struct member + if (c == '.') { + char cnext = input_getc(); + if (!isdigit(cnext)) { + input_ungetc(cnext); + return token_create(TOK_OP_MEMBER, line, column, 1); + } + input_ungetc(cnext); + } + int radix = 10; + int is_float = 0; + // Check for hex and octal. 
+ if (c == '0') { + char cnext = input_getc(); + if (cnext == 'x' || cnext == 'X') { + // Hex, discard the 0x + radix = 16; + } else { + // Octal, discard the 0 + input_ungetc(cnext); + radix = 8; + } + } else { + // Decimal, append the first digit + buf[i++] = c; + } + // Read the rest of the number + while ((c = input_getc()) != EOF) { + // Since there can be multiple writes to the buffer, we want to make sure we + // don't overflow by giving a 4 byte pad + if (i > 1020) { + tok_error("Number too long\n"); + return NULL; + } + // Valid digits for the radix: 0-9 for decimal, 0-7 for octal, 0-9 and + // a-f/A-F for hex + if ((radix == 10 && isdigit(c)) || (radix == 16 && isxdigit(c)) || + (radix == 8 && c >= '0' && c <= '7')) { + buf[i++] = c; + // Decimal point and not a float yet, must be a float + } else if (c == '.' && !is_float) { + is_float = 1; + if (radix != 10) { + tok_error("Invalid floating point constant, expected decimal, got %s\n", + radix == 16 ? "hexadecimal" : "octal"); + return NULL; + } + buf[i++] = c; + } + // Exponent on the end of a constant. (By standard this forces it to be a + // float) + else if (c == 'e' || c == 'E') { + buf[i++] = c; + c = input_getc(); + // Sign on the exponent + if (c == '+' || c == '-') { + buf[i++] = c; + c = input_getc(); + } + // Exponent must be a digit, I.E no 1e1.2 + if (!isdigit(c)) { + tok_error("Invalid floating point exponent\n"); + return NULL; + } + buf[i++] = c; + is_float = 1; + } else { + // Reached the end, unget the character so other functions can read it + input_ungetc(c); + break; + } + } + buf[i] = '\0'; + // Check for suffixes + int is_unsigned = 0; + int is_long = 0; + int is_single = 0; + // Loop to get all possible suffixes. Warn when duplicated. 
+ while (1) { + c = input_getc(); + if (c == 'u' || c == 'U') { + if (is_unsigned) { + tok_warn( + "Warning: Duplicate suffix 'u' for integer constant ignored\n"); + } + is_unsigned = 1; + } else if (c == 'l' || c == 'L') { + if (is_long) { + tok_warn( + "Warning: Duplicate suffix 'l' for integer constant ignored\n"); + } + is_long = 1; + } else if (c == 'f' || c == 'F') { + if (is_single) { + tok_warn("Warning: Duplicate suffix 'f' for floating point constant " + "ignored\n"); + } + is_single = 1; + } else { + input_ungetc(c); + break; + } + } + // Resolve invalid suffixes. Doesn't error because you can still compile with + // them. + if (is_single && is_long) { + tok_warn("Warning: Invalid suffixes 'l' and 'f' for floating point " + "constant. Ignoring 'l'\n"); + is_long = 0; + } + if (is_single && is_unsigned) { + tok_warn("Warning: Invalid suffixes 'u' and 'f' for floating point " + "constant. Ignoring 'u'\n"); + is_unsigned = 0; + } + if (is_single && !is_float) { + tok_warn( + "Warning: Invalid suffix 'f' for integer constant. Ignoring 'f'\n"); + is_single = 0; } + // use the strtox functions to convert the string to a number + if (is_float) { + errno = 0; + // Strtod generates a unix-style error when it's given something out of + // range, so we want to get on top of that quickly instead of ignoring it + // That way we can avoid some nasty NAN-propagation in the constant folder. 
+ double f = strtod(buf, NULL); + if (errno == ERANGE) { + tok_error("Floating point constant out of range\n"); + return NULL; + } + // Warn if the constant is out of range for a float, I.E it's too big or too + // small + if (is_single && (f < FLT_MIN || f > FLT_MAX)) { + tok_warn( + "Warning: Floating point constant %f is out of range for float\n", f); + } + // Warn if the constant is too precise for a float + if (is_single && fabs((double)((float)f) - f) >= FLT_EPSILON) { + tok_warn("Warning: Converting double precision floating point constant " + "%f to float loses " + "precision\n", + f); + } + return token_create_float(is_single ? TOK_CONST_FLOAT_32 + : TOK_CONST_FLOAT_64, + line, column, f, i); + } else { + errno = 0; + uint64_t int_ = strtoull(buf, NULL, radix); + // Same as above, but for integers + if (errno == ERANGE) { + tok_error("Integer constant out of range\n"); + return NULL; + } + if (is_unsigned) { + if (is_long) { + return token_create_int(TOK_CONST_INTEGER_U64, line, column, int_, i); + } else { + if (int_ > UINT32_MAX) { + tok_warn( + "Warning: Integer constant %lld is out of range for unsigned " + "int\n", + int_); + } + return token_create_int(TOK_CONST_INTEGER_U32, line, column, int_, i); + } + } else { + if (is_long) { + // If the highest bit is set, that means this will overflow a signed + // long (Due to two's complement) + if (int_ & (1UL << 63)) { + tok_warn( + "Warning: Integer constant %lld is out of range for long long\n", + i); + } + return token_create_int(TOK_CONST_INTEGER_S64, line, column, int_, i); + } else { + if (int_ & (1UL << 31)) { + tok_warn("Warning: Integer constant %lld is out of range for int\n", + int_); + } + return token_create_int(TOK_CONST_INTEGER_S32, line, column, int_, i); + } + } + } + return NULL; }</code></pre> - </div> - <p>For example, if we just fed the program the keyword "if", it would output the following: </p> - <div class="code-block"> - <span class="file-name">N/A</span> - <span 
class="block-header"> - <strong class="block-title"><em><a id=":if-keyword" href="#:if-keyword">If - keyword</a></em></strong></span> - <pre class="prettyprint"><code class="">c_token_types get_keyword(const char *buf) { - switch (buf[0]) { - case 'i': - switch (buf[1]) { - case 'f': - switch (buf[2]) { - case '\0': - return TOK_CTK_IF; - default: - return TOK_ID; - } - default: - return TOK_ID; + </div> + <p>Now let's implement char constants. The rules for those are pretty simple, as I'm not doing multi-byte or wide + characters. </p> + <div + style="color: #333; background-color: #f8f8f8; border: 1px solid #ccc; border-radius: 4px; margin: 20px 0; padding: 20px;"> + <pre class="prettyprint"><code class="">character-constant ::= '\'' c-char-sequence '\'' +c-char-sequence ::= c-char | c-char-sequence c-char +c-char ::= any member of the source character set except the single-quote ', backslash \, or new-line character | escape-sequence + +escape-sequence ::= simple-escape-sequence | octal-escape-sequence | hexadecimal-escape-sequence +simple-escape-sequence ::= '\' ( 'a' | 'b' | 'f' | 'n' | 'r' | 't' | 'v' | '\'' | '"' | '?' | '\') +octal-escape-sequence ::= '\' octal-digit | '\' octal-digit octal-digit | '\' octal-digit octal-digit octal-digit +hexadecimal-escape-sequence ::= '\' 'x' hexadecimal-digit | hexadecimal-escape-sequence hexadecimal-digit + +octal-digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' +hexadecimal-digit ::= digit | 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'A' | 'B' | 'C' | 'D' | 'E' | 'F' +digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'</code></pre> + </div> + <p>We start with the code to read escape sequences (which can be reused for string literals). 
</p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":read-escape-sequence" href="#:read-escape-sequence">Read
+ escape sequence</a></em></strong></span>
+ <pre class="prettyprint"><code class="">static char read_escape_sequence(int *len) {
+ int c = input_getc();
+ *len += 1;
+ switch (c) {
+ case 'a':
+ return '\a';
+ case 'b':
+ return '\b';
+ case 'f':
+ return '\f';
+ case 'n':
+ return '\n';
+ case 'r':
+ return '\r';
+ case 't':
+ return '\t';
+ case 'v':
+ return '\v';
+ case '\'':
+ return '\'';
+ case '"':
+ return '"';
+ case '?':
+ return '?';
+ case '\\':
+ return '\\';
+ case 'x': {
+ c = input_getc();
+ if (!isxdigit(c)) {
+ tok_error("Invalid hexadecimal escape sequence\n");
+ return 0;
+ }
+ int val = 0;
+ while (isxdigit(c)) {
+ *len += 1;
+ val = val * 16 + (isdigit(c) ? c - '0' : tolower(c) - 'a' + 10);
+ c = input_getc();
+ }
+ input_ungetc(c);
+ return (char)val;
+ }
+ default:
+ // Octal escapes: '\0' through '\377'. Only digits 0-7 are valid octal
+ // digits, so '\8' and '\9' are rejected, and escapes that start with
+ // '0' (like '\012') fall through here and read all their digits.
+ if (c < '0' || c > '7') {
+ tok_error("Invalid escape sequence\n");
+ return 0;
+ }
+ int val = 0;
+ while (c >= '0' && c <= '7') {
+ *len += 1;
+ val = val * 8 + c - '0';
+ c = input_getc();
+ }
+ input_ungetc(c);
+ return (char)val;
+ }
+}</code></pre>
+ </div>
+ <p>Now we can implement the code to read character constants. 
</p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":read-char-constant" href="#:read-char-constant">Read
+ character constant</a></em></strong></span>
+ <pre class="prettyprint"><code class="">static token_t *read_char_constant(void) {
+ int c;
+ int len = 0;
+ c = input_getc();
+ if (c != '\'') {
+ input_ungetc(c);
+ return NULL;
+ }
+ len++;
+ c = input_getc();
+ // Count the character itself (for escapes this counts the backslash;
+ // read_escape_sequence counts the rest of the sequence).
+ len++;
+ if (c == '\'') {
+ tok_error("Empty character constant\n");
+ return NULL;
+ }
+ if (c == '\\') {
+ c = read_escape_sequence(&len);
+ }
+ int val = c;
+ c = input_getc();
+ if (c != '\'') {
+ tok_error("Expected closing quote for character constant\n");
+ return NULL;
+ }
+ len++;
+ return token_create_char(TOK_CONST_CHAR, line, column, val, len);
+}</code></pre>
+ </div>
+ <p>Finally, we can implement string literals. </p>
+ <p>The grammar for string literals is pretty simple. </p>
+ <div
+ style="color: #333; background-color: #f8f8f8; border: 1px solid #ccc; border-radius: 4px; margin: 20px 0; padding: 20px;">
+ <pre
+ class="prettyprint"><code class="">string-literal ::= '"' s-char-sequence '"'
+s-char-sequence ::= s-char | s-char-sequence s-char
+s-char ::= any member of the source character set except the double-quote ", backslash \, or new-line character | escape-sequence</code></pre>
+ </div>
+ <p>What that means for us is that we just have to skip over the string until we hit an unescaped double quote. 
</p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":read-string-literal" href="#:read-string-literal">Read
+ string literal</a></em></strong></span>
+ <pre class="prettyprint"><code class="">static token_t *read_string_literal(void) {
+ int c;
+ c = input_getc();
+ if (c != '"') {
+ input_ungetc(c);
+ return NULL;
+ }
+ int i = 0;
+ // Malloc is used for the buf here, the pointer stays function local as string
+ // interning duplicates the string
+ char s_buf[512];
+ char *buf = s_buf;
+ int len = 512;
+ int esc_len = 0; // read_escape_sequence tracks length through this
+ while ((c = input_getc()) != EOF) {
+ if (c == '"') {
+ // Implicit skip of closing quote
+ break;
+ }
+ if (c == '\\') {
+ c = read_escape_sequence(&esc_len);
+ if (c == 0) {
+ return NULL;
+ }
+ }
+ // Grow when nearly full, leaving one byte spare for the NUL terminator
+ if (i + 1 >= len) {
+ if (buf == s_buf) {
+ buf = malloc(1024);
+ if (buf == NULL) {
+ fputs("Out of memory. Could not parse string literal.\n", stderr);
+ exit(1);
+ }
+ memcpy(buf, s_buf, 512);
+ len *= 2;
+ } else {
+ len *= 2;
+ buf = realloc(buf, len);
+ if (buf == NULL) {
+ fputs("Out of memory. Could not parse string literal.\n", stderr);
+ exit(1);
+ }
+ }
+ }
+ buf[i++] = c;
+ }
+ buf[i] = '\0';
+ if (c == EOF) {
+ tok_error("Unterminated string literal\n");
+ if (buf != s_buf) {
+ free(buf);
+ }
+ return NULL;
+ }
+
+ token_t *tok =
+ token_create_string(TOK_CONST_STRING_ASCII, line, column, buf, i);
+ if (buf != s_buf) {
+ free(buf);
+ }
+ return tok;
+}</code></pre>
+ </div>
+ <p>This function uses a smaller buffer on the stack to store the string, and if it grows too large, it allocates a
+ larger buffer on the heap. This helps to avoid unnecessary allocations for small strings. </p>
+ <p>Let's recap. We've implemented code for recognizing keywords, operators, numbers, character constants, and string
+ literals. That's everything we need for the main function. We just call them all in order. 
</p> + <div class="code-block"> + <span class="file-name">tokenizer.c</span> + <span class="block-header"> + <strong class="block-title"><em><a id=":main-loop" href="#:main-loop">Main + loop</a></em></strong></span> + <pre class="prettyprint"><code class="">token_t *read_token(void) { + if (unget_token != NULL) { + token_t *tok = unget_token; + unget_token = NULL; + return tok; + } + token_t *tok = skip_whitespace(); + if (tok != NULL) { + return tok; + } + tok = read_identifier(); + if (tok != NULL) { + return tok; } + tok = read_number(); + if (tok != NULL) { + return tok; + } + tok = read_char_constant(); + if (tok != NULL) { + return tok; + } + tok = read_string_literal(); + if (tok != NULL) { + return tok; + } + tok = read_operator(); + if (tok != NULL) { + return tok; + } + int c = input_getc(); + if (c == EOF) { + return NULL; + } + tok_warn( + "Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c, + line, column); + return read_token(); } </code></pre> - </div> - <p>The program will be placed into the main tokenizer.c file using the HIGH QUALITY TECHNIQUE of including a c file - in another c file. </p> - <p> Time to write the program. </p> - + </div> + <p>With that, we're basically done.</p> + <h3>Error Handling</h3> + <p>One thing we haven't covered yet is error handling. We've used <code>tok_error</code> and <code>tok_warn</code> + to print errors and warnings, but we haven't actually implemented them. Let's do that now. </p> + <div class="code-block"> + <span class="file-name">tokenizer.c</span> + <span class="block-header"> + <strong class="block-title"><em><a id=":error-handling" href="#:error-handling">Error + handling</a></em></strong></span> + <pre class="prettyprint"><code class="">void tok_error(const char *fmt, ...) 
{ + va_list args; + va_start(args, fmt); + fprintf(stderr, "Error in file %s at line %d, column %d: ", file_name, line, + column); + vfprintf(stderr, fmt, args); + va_end(args); +} +void tok_warn(const char *fmt, ...) { + va_list args; + va_start(args, fmt); + fprintf(stderr, "Warning in file %s at line %d, column %d: ", file_name, line, + column); + vfprintf(stderr, fmt, args); + va_end(args); +}</code></pre> + </div> + <h3>Debugging</h3> + <p>It'd be helpful to have a way to print out the tokens we're reading. Let's add a function for that. </p> + <div class="code-block"> + <span class="file-name">tokenizer.c</span> + <span class="block-header"> + <strong class="block-title"><em><a id=":print-token" href="#:print-token">Print + token</a></em></strong></span> + <pre class="prettyprint"><code class="">char *re_escape_string(const char *str) { + int len = strlen(str); + char *buf = malloc(len * 2 + 1); + if (buf == NULL) { + fprintf(stderr, "Out of memory. Cannot escape string\n"); + exit(1); + } + int i = 0; + for (int j = 0; j < len; j++) { + switch (str[j]) { + case '\a': + buf[i++] = '\\'; + buf[i++] = 'a'; + break; + case '\b': + buf[i++] = '\\'; + buf[i++] = 'b'; + break; + case '\f': + buf[i++] = '\\'; + buf[i++] = 'f'; + break; + case '\n': + buf[i++] = '\\'; + buf[i++] = 'n'; + break; + case '\r': + buf[i++] = '\\'; + buf[i++] = 'r'; + break; + case '\t': + buf[i++] = '\\'; + buf[i++] = 't'; + break; + case '\v': + buf[i++] = '\\'; + buf[i++] = 'v'; + break; + case '\\': + buf[i++] = '\\'; + buf[i++] = '\\'; + break; + case '\'': + buf[i++] = '\\'; + buf[i++] = '\''; + break; + case '"': + buf[i++] = '\\'; + buf[i++] = '"'; + break; + default: + buf[i++] = str[j]; + break; + } + } + buf[i] = '\0'; + return buf; +} + +void print_token(token_t *tok) { + if (tok == NULL) { + printf("NULL\n"); + return; + } + const char *name = token_name_from_type(tok->kind); + switch (tok->kind) { + case TOK_ID: + case TOK_CONST_STRING_ASCII: { + char *escaped = 
re_escape_string(token_string(tok)); + printf("%s: \"%s\"@%d:%d\n", name, escaped, tok->line, tok->column); + free(escaped); + break; + } + case TOK_CONST_CHAR: + printf("%s: '%c'@%d:%d\n", name, token_char(tok), tok->line, tok->column); + break; + case TOK_CONST_INTEGER_S32: + case TOK_CONST_INTEGER_U32: + case TOK_CONST_INTEGER_S64: + case TOK_CONST_INTEGER_U64: + printf("%s: %ld@%d:%d\n", name, token_int(tok), tok->line, tok->column); + break; + case TOK_CONST_FLOAT_32: + case TOK_CONST_FLOAT_64: + printf("%s: %f@%d:%d\n", name, token_float(tok), tok->line, tok->column); + break; + default: + printf("%s@%d:%d\n", name, tok->line, tok->column); + break; + } +}</code></pre> + </div> + <p>The name of the token is retrieved using <code>token_name_from_type</code>, which is a simple switch statement. + It's long and boring, check the source code if you're interested. </p> + <h3>Bugs/Errata</h3> + <p> I wrote this code in a single sitting, so there are bound to be bugs. I'll list them here as I find them. The + code you see here is the final version, with all bugs fixed. </p> + <ul> + <li> had <code>buffer_pos == buffer_size - 1</code>, left in from trying to plug some code for + lookahead in, didn't work out, but I forgot to remove it, causes fallthrough to <code>buffer_size == 0</code> + check which if true returns EOF, preventing input initialization. Fixed by changing to + <code>buffer_pos == buffer_size</code>. + </li> + <li>assertion <code>token->kind == TOK_CONST_STRING_ASCII</code> failed in token_string. Forgot + to expand check for identifiers which also use token_string. Fixed by changing to + <code>token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID || token->kind == TOK_TID</code>. + </li> + <li> token_create_string - call to <code>hash_table_get</code> with freed key. 
Fixed by moving +
            the call to free after +
            the call to <code>hash_table_get</code>.</li> +
        <li>ibid - Design of hash table and call to <code>hash_table_get</code> in token_create_string created +
            double free. Fixed by +
            rewriting part of function.</li> +
        <li>Tokenizer missing code to handle GCC preprocessor line directives. Fixed by adding code to handle them.</li> +
        <li>Destructor for string literals missing in tokenizer teardown, added it in.</li> +
        <li> read_number - check <code>int_ > INT32_MAX</code> does not work due to some weird casting. +
            Added explicit cast.</li> +
        <li>read_char_constant - Forgot to handle '\0'. Added.</li> +
        <li>skip_whitespace - When a division operator occurs in code, skip_whitespace assumes it's a comment. Fixed by +
            adding a check for division operator.</li> +
        <li>hash_string - Optimization, not a bug, Dragon Book hash function not very fast due to misprediction. Replaced +
            with ELF hash function.</li> +
        <li>read_identifier - strlen gets called 3 times even though we already get the string len by incrementing an +
            array index. Ew. Used i instead of strlen.</li> +
        <li>read_identifier - stringized version of keywords stored, not needed. Code added to call token_create instead +
            of token_create_string for keywords.</li> +
        <li>Everywhere - Checks added for memory allocation failure.</li> +
        <li>Not a bug. Removed the separate token type for TID. Will try to handle in the parser.</li> +
    </ul> +
    <h3>Conclusion</h3> +
    <p>That's it! The C Minus tokenizer is complete. It's hopefully pretty understandable, and given the testing I've put it through, it should be pretty robust. </p> +
    <p>Next time, we'll start on the parser. </p> +
    <h1>Source code, bibliography</h1> +
    <p>Source code for the tokenizer is available <a href="/projects/cminus/code/tokenizer.c">here</a>, header file is +
        available <a href="/projects/cminus/code/tokenizer.h">here</a>. 
</p> + <p>Source code for the input module is available <a href="/projects/cminus/code/input.c">here</a>, header file is + available <a href="/projects/cminus/code/input.h">here</a>. </p> + <p>Source code for the hash table is available <a href="/projects/cminus/code/hash_table.c">here</a>, header file is + available <a href="/projects/cminus/code/hash_table.h">here</a>. </p> + <p>A lot of the logic for this project is from either the Dragon Book, Engineering a Compiler, or LCC: A Retargetable + Compiler for ANSI C. Grammars are from The C Reference Manual.</p> + <p> I got the idea for using zero-width arrays for optional content (the struct hack) from hacking on some weird virtio drivers (they seem to love it).</p> + <p> Crafting Interpreters by Bob Nystrom inspired me to do this project, so if you see any similarities, there's probably some unintentional influence there.</p> + <p> The code for the ELF hash function is from the glibc source code.</p> + <p> The idea for decision trees came from LCC.</p> </main> - <footer style="text-align: center; padding: 20px;"> + <footer style=" text-align: center; padding: 20px;"> <p>© 2021 Reagan Fischer. If for some reason you want to use my AMAZING code (lol), it's available under the MIT license <a href="/projects/cminus/code/LICENSE.md">here</a>.</p> </footer>