website

Website contents
git clone git://git.reagancfischer.dev/website.git
Log | Files | Refs

commit 80be860aa6798159a32a91aa6a2d5ef46680e4ee
parent c3abcec683384c5aaa5140189f4226cafed68239
Author: Reagan <rfische2@uccs.edu>
Date:   Thu, 22 Aug 2024 01:24:54 -0600

Lexer

Diffstat:
Mblog.html | 1-
Mprojects/cminus/code/hash_table.c | 14+++++++++++---
Mprojects/cminus/code/hash_table.h | 9+++++----
Mprojects/cminus/code/input.c | 17++++++++---------
Mprojects/cminus/code/input.h | 3+--
Mprojects/cminus/code/tokenizer.c | 1299++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mprojects/cminus/code/tokenizer.h | 120+++++++++++++++++++++++++++++++++++++++----------------------------------------
Mprojects/cminus/lexer.html | 1767+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
8 files changed, 2794 insertions(+), 436 deletions(-)

diff --git a/blog.html b/blog.html @@ -39,7 +39,6 @@ </nav> </header> - <img> <img alt="Under Construction" src="images/page-under-construction.gif"> <br> Coming soon... diff --git a/projects/cminus/code/hash_table.c b/projects/cminus/code/hash_table.c @@ -1,4 +1,5 @@ #include "hash_table.h" +#include <stdio.h> #include <stdlib.h> struct hash_table { hash_table_entry_t **entries; @@ -29,11 +30,15 @@ hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, return table; } -void hash_table_destroy(hash_table_t *table) { +void hash_table_destroy(hash_table_t *table, hash_table_dtor dtor) { for (int i = 0; i < table->size; i++) { hash_table_entry_t *entry = table->entries[i]; while (entry != NULL) { hash_table_entry_t *next = entry->next; + if (dtor != NULL) { + dtor(entry->key, 1); + dtor(entry->value, 0); + } free(entry); entry = next; } @@ -69,6 +74,10 @@ int hash_table_put(hash_table_t *table, void *key, void *value, int replace) { entry = entry->next; } entry = malloc(sizeof(hash_table_entry_t)); + if (entry == NULL) { + fprintf(stderr, "Error: Out of memory. 
Could not allocate hash table entry\n"); + exit(1); + } entry->key = key; entry->value = value; entry->next = table->entries[hash]; @@ -120,4 +129,4 @@ int main() { hash_table_destroy(table); return 0; } -#endif- \ No newline at end of file +#endif diff --git a/projects/cminus/code/hash_table.h b/projects/cminus/code/hash_table.h @@ -3,11 +3,13 @@ typedef struct hash_table hash_table_t; typedef struct hash_table_entry hash_table_entry_t; typedef int (*hash_table_cmp_fn)(void *key1, void *key2); -typedef unsigned long (*hash_table_hash_fn)(void *key); +typedef unsigned int (*hash_table_hash_fn)(void *key); hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash); -void hash_table_destroy(hash_table_t *table); + +typedef void (*hash_table_dtor)(void *value, int is_key); +void hash_table_destroy(hash_table_t *table, hash_table_dtor dtor); void *hash_table_get(hash_table_t *table, void *key); int hash_table_put(hash_table_t *table, void *key, void *value, int replace); void hash_table_remove(hash_table_t *table, void *key); -#endif- \ No newline at end of file +#endif diff --git a/projects/cminus/code/input.c b/projects/cminus/code/input.c @@ -5,7 +5,8 @@ static char buffer[CHUNK_SIZE]; static int buffer_pos = 0; static int buffer_size = 0; -static char unget_buffer = '\0'; +static char unget_buffer_stack[8]; +static int unget_buffer_stack_pos = 0; static FILE *file = NULL; @@ -18,10 +19,8 @@ void input_init(const char *filename) { } int input_getc(void) { - if (unget_buffer != '\0') { - char c = unget_buffer; - unget_buffer = '\0'; - return c; + if (unget_buffer_stack_pos > 0) { + return unget_buffer_stack[--unget_buffer_stack_pos]; } if (buffer_pos == buffer_size) { buffer_size = fread(buffer, 1, CHUNK_SIZE, file); @@ -30,10 +29,11 @@ int input_getc(void) { if (buffer_size == 0) { return EOF; } - return buffer[buffer_pos++]; + char c = buffer[buffer_pos++]; + return c; } -void input_ungetc(int c) { unget_buffer = c; } +void 
input_ungetc(int c) { unget_buffer_stack[unget_buffer_stack_pos++] = c; } void input_destroy(void) { fclose(file); } @@ -52,4 +52,4 @@ int main() { input_destroy(); return 0; } -#endif- \ No newline at end of file +#endif diff --git a/projects/cminus/code/input.h b/projects/cminus/code/input.h @@ -4,4 +4,4 @@ void input_init(const char *filename); int input_getc(void); void input_ungetc(int c); void input_destroy(void); -#endif- \ No newline at end of file +#endif diff --git a/projects/cminus/code/tokenizer.c b/projects/cminus/code/tokenizer.c @@ -1,18 +1,25 @@ #include "tokenizer.h" #include "hash_table.h" +#include "input.h" #include <assert.h> +#include <ctype.h> +#include <errno.h> +#include <float.h> +#include <math.h> +#include <stdarg.h> #include <stdint.h> +#include <stdio.h> #include <stdlib.h> #include <string.h> // Token data -#define TOK_MAGIC_1 0x544F4B454E544F4B // "TOKENTOK" -#define TOK_MAGIC_2 0x544F4B544F4B454E // "TOKTOKEN" +#define TOK_MAGIC_1 0x544F4B454E544F4Bul // "TOKENTOK" +#define TOK_MAGIC_2 0x544F4B544F4B454Eul // "TOKTOKEN" struct token { long magic; int line; int column; short kind; - char opt_data[0]; + long opt_data[0]; }; typedef struct token token_t; @@ -29,48 +36,81 @@ struct token_data { typedef struct token_data token_data_t; // Token creation +int line = 1; +int column = 1; +char file_name[1024]; #define token_data(token) ((struct token_data *)((token)->opt_data)) -static token_t *token_data_create(c_token_types kind, int line, int column) { +static token_t *token_data_create(c_token_types kind, int lin, int col, + int len) { token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data)); + if (token == NULL) { + fputs("Out of memory\n", stderr); + exit(1); + } token->magic = TOK_MAGIC_1; - token->line = line; - token->column = column; + token->line = lin; + token->column = col; + column += len; token->kind = kind; return token; } -static token_t *token_create(c_token_types kind, int line, int column) { +static token_t 
*token_create(c_token_types kind, int lin, int col, int len) { token_t *token = malloc(sizeof(token_t)); + if (token == NULL) { + fputs("Out of memory\n", stderr); + exit(1); + } token->magic = TOK_MAGIC_2; - token->line = line; - token->column = column; + token->line = lin; + token->column = col; + column += len; token->kind = kind; return token; } -static token_t *token_create_int(c_token_types kind, int line, int column, - int64_t i) { - token_t *token = token_data_create(kind, line, column); +static token_t *token_create_int(c_token_types kind, int lin, int col, + int64_t i, int len) { + token_t *token = token_data_create(kind, lin, col, len); token_data(token)->data.i = i; return token; } -static token_t *token_create_float(c_token_types kind, int line, int column, - double f) { - token_t *token = token_data_create(kind, line, column); +static token_t *token_create_float(c_token_types kind, int lin, int col, + double f, int len) { + token_t *token = token_data_create(kind, lin, col, len); token_data(token)->data.f = f; return token; } -#define PRIME 211 -static unsigned long hash_string(void *key) { - unsigned long hash = 0, g; +static token_t *token_create_char(c_token_types kind, int lin, int col, char c, + int len) { + token_t *token = token_data_create(kind, lin, col, len); + token_data(token)->data.c = c; + return token; +} + +static unsigned int hash_string(void *key) { + unsigned long hash = 0, hi = 0; char *p = key; - while (*p) { - hash = (hash << 4) + *p++; - if ((g = hash & 0xf0000000) != 0) { - hash ^= g >> 24; - hash ^= g; + hash = *p; + if (hash != 0 && p[1] != 0) { + hash = (hash << 4) + p[1]; + if (p[2] != 0) { + hash = (hash << 4) + p[2]; + if (p[3] != 0) { + hash = (hash << 4) + p[3]; + if (p[4] != 0) { + hash = (hash << 4) + p[4]; + p += 5; + while (*p != 0) { + hash = (hash << 4) + *p++; + hi = hash & 0xf0000000l; + hash ^= hi >> 24; + } + hash &= 0x0fffffffl; + } + } } } return hash; @@ -81,30 +121,22 @@ static int cmp_string(void *key1, 
void *key2) { } static hash_table_t *string_table; -static token_t *token_create_string(c_token_types kind, int line, int column, - const char *s) { +static token_t *token_create_string(c_token_types kind, int lin, int col, + const char *s, int len) { if (string_table == NULL) { - string_table = hash_table_create(1024, cmp_string, hash_string); + string_table = hash_table_create(2048, cmp_string, hash_string); } - token_t *token = token_data_create(kind, line, column); - char *key = strdup(s); - if (hash_table_put(string_table, key, key, 0)) { - free(key); - key = hash_table_get(string_table, key); + token_t *token = token_data_create(kind, lin, col, len); + char *key = hash_table_get(string_table, (void *)s); + if (key == NULL) { + key = strdup(s); + hash_table_put(string_table, key, key, 1); } token_data(token)->data.s = key; return token; } -static token_t *token_create_char(c_token_types kind, int line, int column, - char c) { - token_t *token = token_data_create(kind, line, column); - token_data(token)->data.c = c; - return token; -} - // External token operations - c_token_types token_type(token_t *token) { assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2); return token->kind; @@ -127,7 +159,7 @@ double token_float(token_t *token) { } const char *token_string(token_t *token) { - assert(token->kind == TOK_CONST_STRING_ASCII); + assert(token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID); assert(token->magic == TOK_MAGIC_1); return token_data(token)->data.s; } @@ -150,10 +182,1189 @@ int token_column(token_t *token) { void token_destroy(token_t *token) { assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2); - if (token->magic == TOK_MAGIC_1) { - if (token->kind == TOK_CONST_STRING_ASCII) { - free((void *)token_data(token)->data.s); + // Don't free the string table, it's a global variable + free(token); +} + +token_t *unget_token = NULL; +void tokenizer_init(const char *filename) { input_init(filename); } + +void 
tokenizer_unget(token_t *token) { + assert(unget_token == NULL); + unget_token = token; +} + +void string_table_dtor(void *value, int is_key) { + if (is_key) { + free(value); + } +} + +void tokenizer_destroy(void) { + input_destroy(); + if (string_table != NULL) { + hash_table_destroy(string_table, string_table_dtor); + } +} + +// Error handling +void tok_error(const char *fmt, ...) { + va_list args; + va_start(args, fmt); + fprintf(stderr, "Error in file %s at line %d, column %d: ", file_name, line, + column); + vfprintf(stderr, fmt, args); + va_end(args); +} + +void tok_warn(const char *fmt, ...) { + va_list args; + va_start(args, fmt); + fprintf(stderr, "Warning in file %s at line %d, column %d: ", file_name, line, + column); + vfprintf(stderr, fmt, args); + va_end(args); +} + +// Tokenizer +static token_t *skip_whitespace(void) { + int c; + while ((c = input_getc()) != EOF) { + if (isspace(c)) { + if (c == '\n') { + line++; + column = 1; + } else { + column++; + } + } else if (c == '#') // GCC preprocessor line control directive. 
+ { + char buf[512]; + int i = 0; + while ((c = input_getc()) != EOF && c != '\n') { + buf[i++] = c; + column++; + } + buf[i] = '\0'; + if (sscanf(buf, "%d \"%[^\"]\"", &line, file_name) == 2) { + column = 1; + } else { + tok_error("Invalid #line directive\n"); + } + if (c == EOF) { + return NULL; + } + } else if (c == '/') { + c = input_getc(); + if (c == '/') { + while ((c = input_getc()) != EOF && c != '\n') { + column++; + } + if (c == EOF) { + return NULL; + } + line++; + column = 1; + } else if (c == '*') { + while ((c = input_getc()) != EOF) { + if (c == '*') { + c = input_getc(); + if (c == '/') { + break; + } + } else if (c == '\n') { + line++; + column = 1; + } else { + column++; + } + } + if (c == EOF) { + return NULL; + } + } else { + if (c == '=') + return token_create(TOK_OP_ASSIGN_DIV, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_DIV, line, column, 1); + } + } else { + input_ungetc(c); + return NULL; } } - free(token); + return NULL; +} + +c_token_types get_keyword(const char *buf, int len) { + switch (buf[0]) { + case 'a': + if (len == 4 && buf[1] == 'u' && buf[2] == 't' && buf[3] == 'o') + return TOK_SCSK_AUTO; + break; + + case 'b': + if (len == 5 && buf[1] == 'r' && buf[2] == 'e' && buf[3] == 'a' && + buf[4] == 'k') + return TOK_CTK_BREAK; + break; + + case 'c': + switch (buf[1]) { + case 'a': + if (len == 4 && buf[2] == 's' && buf[3] == 'e') + return TOK_CTK_CASE; + break; + case 'h': + if (len == 4 && buf[2] == 'a' && buf[3] == 'r') + return TOK_TK_CHAR; + break; + case 'o': + if (len == 5 && buf[2] == 'n' && buf[3] == 's' && buf[4] == 't') + return TOK_SCSK_CONST; + if (len == 8 && buf[2] == 'n' && buf[3] == 't' && buf[4] == 'i' && + buf[5] == 'n' && buf[6] == 'u' && buf[7] == 'e') + return TOK_CTK_CONTINUE; + break; + } + break; + + case 'd': + switch (buf[1]) { + case 'e': + if (len == 7 && buf[2] == 'f' && buf[3] == 'a' && buf[4] == 'u' && + buf[5] == 'l' && buf[6] == 't') + return TOK_CTK_DEFAULT; + break; + case 'o': 
+ if (len == 2 && buf[2] == '\0') + return TOK_CTK_DO; + if (len == 6 && buf[2] == 'u' && buf[3] == 'b' && buf[4] == 'l' && + buf[5] == 'e') + return TOK_TK_DOUBLE; + break; + } + break; + + case 'e': + switch (buf[1]) { + case 'l': + if (len == 4 && buf[2] == 's' && buf[3] == 'e') + return TOK_CTK_ELSE; + break; + case 'n': + if (len == 4 && buf[2] == 'u' && buf[3] == 'm') + return TOK_TK_ENUM; + break; + case 'x': + if (len == 6 && buf[2] == 't' && buf[3] == 'e' && buf[4] == 'r' && + buf[5] == 'n') + return TOK_SCSK_EXTERN; + break; + } + break; + + case 'f': + switch (buf[1]) { + case 'l': + if (len == 5 && buf[2] == 'o' && buf[3] == 'a' && buf[4] == 't') + return TOK_TK_FLOAT; + break; + case 'o': + if (len == 3 && buf[2] == 'r') + return TOK_CTK_FOR; + break; + } + break; + + case 'g': + if (len == 4 && buf[1] == 'o' && buf[2] == 't' && buf[3] == 'o') + return TOK_CTK_GOTO; + break; + + case 'i': + switch (buf[1]) { + case 'f': + if (len == 2 && buf[2] == '\0') + return TOK_CTK_IF; + break; + case 'n': + if (len == 3 && buf[2] == 't') + return TOK_TK_INT; + break; + } + break; + + case 'l': + if (len == 4 && buf[1] == 'o' && buf[2] == 'n' && buf[3] == 'g') + return TOK_TK_LONG; + break; + + case 'r': + switch (buf[1]) { + case 'e': + if (len == 8 && buf[2] == 'g' && buf[3] == 'i' && buf[4] == 's' && + buf[5] == 't' && buf[6] == 'e' && buf[7] == 'r') + return TOK_SCSK_REGISTER; + if (len == 6 && buf[2] == 't' && buf[3] == 'u' && buf[4] == 'r' && + buf[5] == 'n') + return TOK_CTK_RETURN; + break; + } + break; + + case 's': + switch (buf[1]) { + case 'h': + if (len == 5 && buf[2] == 'o' && buf[3] == 'r' && buf[4] == 't') + return TOK_TK_SHORT; + break; + case 't': + if (len == 6 && buf[2] == 'a' && buf[3] == 't' && buf[4] == 'i' && + buf[5] == 'c') + return TOK_SCSK_STATIC; + break; + case 'i': + if (len == 6 && buf[2] == 'g' && buf[3] == 'n' && buf[4] == 'e' && + buf[5] == 'd') + return TOK_TK_SIGNED; + if (len == 6 && buf[2] == 'z' && buf[3] == 'e' && buf[4] == 
'o' && + buf[5] == 'f') + return TOK_MK_SIZEOF; + break; + case 'r': + if (len == 6 && buf[2] == 'u' && buf[3] == 'c' && buf[4] == 't') + return TOK_TK_STRUCT; + break; + case 'w': + if (len == 6 && buf[2] == 'i' && buf[3] == 't' && buf[4] == 'c' && + buf[5] == 'h') + return TOK_CTK_SWITCH; + break; + } + break; + + case 't': + if (len == 7 && buf[1] == 'y' && buf[2] == 'p' && buf[3] == 'e' && + buf[4] == 'd' && buf[5] == 'e' && buf[6] == 'f') + return TOK_TK_TYPEDEF; + break; + + case 'u': + switch (buf[1]) { + case 'n': + if (len == 5 && buf[2] == 'i' && buf[3] == 'o' && buf[4] == 'n') + return TOK_TK_UNION; + if (len == 8 && buf[2] == 's' && buf[3] == 'i' && buf[4] == 'g' && + buf[5] == 'n' && buf[6] == 'e' && buf[7] == 'd') + return TOK_TK_UNSIGNED; + break; + } + break; + + case 'v': + switch (buf[1]) { + case 'o': + if (len == 4 && buf[2] == 'i' && buf[3] == 'd') + return TOK_TK_VOID; + if (len == 8 && buf[2] == 'l' && buf[3] == 'a' && buf[4] == 't' && + buf[5] == 'i' && buf[6] == 'l' && buf[7] == 'e') + return TOK_SCSK_VOLATILE; + break; + } + break; + + case 'w': + if (len == 5 && buf[1] == 'h' && buf[2] == 'i' && buf[3] == 'l' && + buf[4] == 'e') + return TOK_CTK_WHILE; + break; + + default: + return TOK_ID; + } + return TOK_ID; +} + +static token_t *read_identifier(void) { + int c; + char buf[1024]; + int i = 0; + c = input_getc(); + if (!isalpha(c) && c != '_') { + input_ungetc(c); + return NULL; + } + buf[i++] = c; + while ((c = input_getc()) != EOF) { + if (!isalnum(c) && c != '_') { + input_ungetc(c); + break; + } + buf[i++] = c; + if (i >= 1008) { + tok_error("Identifier too long\n"); + exit(1); + } + } + buf[i] = '\0'; + column += i; + // Check if it's a keyword + c_token_types kind = get_keyword(buf, i); + if (kind != TOK_ID) { + return token_create(kind, line, column, i); + } + return token_create_string(kind, line, column, buf, i); } + +token_t *read_operator(void) { + int c; + c = input_getc(); + switch (c) { + case '!': { + c = input_getc(); + 
if (c == '=') + return token_create(TOK_OP_NE, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_NOT, line, column, 1); + } + case '%': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_MOD, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_MOD, line, column, 1); + } + case '&': { + c = input_getc(); + if (c == '&') + return token_create(TOK_OP_AND, line, column, 2); + if (c == '=') + return token_create(TOK_OP_ASSIGN_BITAND, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_BIT_AND, line, column, 1); + } + case '(': + return token_create(TOK_SEP_LEFT_PAREN, line, column, 1); + case ')': + return token_create(TOK_SEP_RIGHT_PAREN, line, column, 1); + case '*': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_MUL, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_MUL, line, column, 1); + } + case '+': { + c = input_getc(); + if (c == '+') + return token_create(TOK_OP_INC, line, column, 2); + if (c == '=') + return token_create(TOK_OP_ASSIGN_ADD, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_ADD, line, column, 2); + } + case ',': + return token_create(TOK_SEP_COMMA, line, column, 1); + case '-': { + c = input_getc(); + if (c == '-') + return token_create(TOK_OP_DEC, line, column, 2); + if (c == '=') + return token_create(TOK_OP_ASSIGN_SUB, line, column, 2); + if (c == '>') + return token_create(TOK_OP_MEMBER_POINTER, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_SUB, line, column, 1); + } + case '.': { + c = input_getc(); + if (c == '.') { + c = input_getc(); + if (c == '.') { + return token_create(TOK_SEP_ELLIPSIS, line, column, 3); + } else { + // Bail out, can't store more than one unget + tok_error("Unexpected character '.' 
at line %d, column %d\n", line, + column); + exit(1); + } + } + return token_create('.', line, column, 1); + } + case '/': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_DIV, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_DIV, line, column, 1); + } + case ':': + return token_create(TOK_OP_COND_DECISION, line, column, 1); + case ';': + return token_create(TOK_SEP_SEMICOLON, line, column, 1); + case '<': { + c = input_getc(); + if (c == '<') { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_LSHIFT, line, column, 3); + input_ungetc(c); + return token_create(TOK_OP_LSHIFT, line, column, 2); + } + if (c == '=') + return token_create(TOK_OP_LE, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_LT, line, column, 1); + } + case '=': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_ASSIGN, line, column, 1); + } + case '>': { + c = input_getc(); + if (c == '>') { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_RSHIFT, line, column, 3); + input_ungetc(c); + return token_create(TOK_OP_RSHIFT, line, column, 2); + } + if (c == '=') + return token_create(TOK_OP_GE, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_GT, line, column, 1); + } + case '?': + return token_create(TOK_OP_COND, line, column, 1); + case '[': + return token_create(TOK_SEP_LEFT_BRACKET, line, column, 1); + case ']': + return token_create(TOK_SEP_RIGHT_BRACKET, line, column, 1); + case '^': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_BITXOR, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_BIT_XOR, line, column, 1); + } + case '{': + return token_create(TOK_SEP_LEFT_BRACE, line, column, 1); + case '|': { + c = input_getc(); + if (c == '|') + return token_create(TOK_OP_OR, line, column, 2); + if (c == '=') + return token_create(TOK_OP_ASSIGN_BITOR, 
line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_BIT_OR, line, column, 1); + } + case '}': + return token_create(TOK_SEP_RIGHT_BRACE, line, column, 1); + case '~': + return token_create(TOK_OP_BIT_NOT, line, column, 1); + default: + input_ungetc(c); + return NULL; + } +} + +static token_t *read_number(void) { + int c; + char buf[1024]; + int i = 0; + c = input_getc(); + // If we don't have a digit or decimal point, it's not a number + if (!isdigit(c) && c != '.') { + input_ungetc(c); + return NULL; + } + // Decimal point followed by non-digit is a struct member + if (c == '.') { + char cnext = input_getc(); + if (!isdigit(cnext)) { + input_ungetc(cnext); + return token_create(TOK_OP_MEMBER, line, column, 1); + } + input_ungetc(cnext); + } + int radix = 10; + int is_float = 0; + // Check for hex and octal. + if (c == '0') { + char cnext = input_getc(); + if (cnext == 'x' || cnext == 'X') { + // Hex, discard the 0x + radix = 16; + } else { + // Octal, discard the 0 + input_ungetc(cnext); + radix = 8; + } + } else { + // Decimal, append the first digit + buf[i++] = c; + } + // Read the rest of the number + while ((c = input_getc()) != EOF) { + // Since there can be multiple writes to the buffer, we want to make sure we + // don't overflow by giving a 4 byte pad + if (i > 1020) { + tok_error("Number too long\n"); + return NULL; + } + // Valid digits for the radix: 0-9 for decimal, 0-7 for octal, 0-9 and + // a-f/A-F for hex + if ((radix == 10 && isdigit(c)) || (radix == 16 && isxdigit(c)) || + (radix == 8 && c >= '0' && c <= '7')) { + buf[i++] = c; + // Decimal point and not a float yet, must be a float + } else if (c == '.' && !is_float) { + is_float = 1; + if (radix != 10) { + tok_error("Invalid floating point constant, expected decimal, got %s\n", + radix == 16 ? "hexadecimal" : "octal"); + return NULL; + } + buf[i++] = c; + } + // Exponent on the end of a constant. 
(By standard this forces it to be a + // float) + else if (c == 'e' || c == 'E') { + buf[i++] = c; + c = input_getc(); + // Sign on the exponent + if (c == '+' || c == '-') { + buf[i++] = c; + c = input_getc(); + } + // Exponent must be a digit, I.E no 1e1.2 + if (!isdigit(c)) { + tok_error("Invalid floating point exponent\n"); + return NULL; + } + buf[i++] = c; + is_float = 1; + } else { + // Reached the end, unget the character so other functions can read it + input_ungetc(c); + break; + } + } + buf[i] = '\0'; + // Check for suffixes + int is_unsigned = 0; + int is_long = 0; + int is_single = 0; + // Loop to get all possible suffixes. Warn when duplicated. + while (1) { + c = input_getc(); + if (c == 'u' || c == 'U') { + if (is_unsigned) { + tok_warn( + "Warning: Duplicate suffix 'u' for integer constant ignored\n"); + } + is_unsigned = 1; + } else if (c == 'l' || c == 'L') { + if (is_long) { + tok_warn( + "Warning: Duplicate suffix 'l' for integer constant ignored\n"); + } + is_long = 1; + } else if (c == 'f' || c == 'F') { + if (is_single) { + tok_warn("Warning: Duplicate suffix 'f' for floating point constant " + "ignored\n"); + } + is_single = 1; + } else { + input_ungetc(c); + break; + } + } + // Resolve invalid suffixes. Doesn't error because you can still compile with + // them. + if (is_single && is_long) { + tok_warn("Warning: Invalid suffixes 'l' and 'f' for floating point " + "constant. Ignoring 'l'\n"); + is_long = 0; + } + if (is_single && is_unsigned) { + tok_warn("Warning: Invalid suffixes 'u' and 'f' for floating point " + "constant. Ignoring 'u'\n"); + is_unsigned = 0; + } + if (is_single && !is_float) { + tok_warn( + "Warning: Invalid suffix 'f' for integer constant. 
Ignoring 'f'\n"); + is_single = 0; + } + // use the strtox functions to convert the string to a number + if (is_float) { + errno = 0; + // Strtod generates a unix-style error when it's given something out of + // range, so we want to get on top of that quickly instead of ignoring it + // That way we can avoid some nasty NAN-propagation in the constant folder. + double f = strtod(buf, NULL); + if (errno == ERANGE) { + tok_error("Floating point constant out of range\n"); + return NULL; + } + // Warn if the constant is out of range for a float, I.E it's too big or too + // small + if (is_single && (f < FLT_MIN || f > FLT_MAX)) { + tok_warn( + "Warning: Floating point constant %f is out of range for float\n", f); + } + // Warn if the constant is too precise for a float + if (is_single && fabs((double)((float)f) - f) >= FLT_EPSILON) { + tok_warn("Warning: Converting double precision floating point constant " + "%f to float loses " + "precision\n", + f); + } + return token_create_float(is_single ? 
TOK_CONST_FLOAT_32 + : TOK_CONST_FLOAT_64, + line, column, f, i); + } else { + errno = 0; + uint64_t int_ = strtoull(buf, NULL, radix); + // Same as above, but for integers + if (errno == ERANGE) { + tok_error("Integer constant out of range\n"); + return NULL; + } + if (is_unsigned) { + if (is_long) { + return token_create_int(TOK_CONST_INTEGER_U64, line, column, int_, i); + } else { + if (int_ > UINT32_MAX) { + tok_warn( + "Warning: Integer constant %lld is out of range for unsigned " + "int\n", + int_); + } + return token_create_int(TOK_CONST_INTEGER_U32, line, column, int_, i); + } + } else { + if (is_long) { + // If the highest bit is set, that means this will overflow a signed + // long (Due to two's complement) + if (int_ & (1UL << 63)) { + tok_warn( + "Warning: Integer constant %lld is out of range for long long\n", + i); + } + return token_create_int(TOK_CONST_INTEGER_S64, line, column, int_, i); + } else { + if (int_ & (1UL << 31)) { + tok_warn("Warning: Integer constant %lld is out of range for int\n", + int_); + } + return token_create_int(TOK_CONST_INTEGER_S32, line, column, int_, i); + } + } + } + return NULL; +} + +static char read_escape_sequence(int *len) { + int c = input_getc(); + *len += 1; + switch (c) { + case 'a': + return '\a'; + case 'b': + return '\b'; + case 'f': + return '\f'; + case 'n': + return '\n'; + case 'r': + return '\r'; + case 't': + return '\t'; + case 'v': + return '\v'; + case '\'': + return '\''; + case '"': + return '"'; + case '?': + return '?'; + case '\\': + return '\\'; + case '0': + return '\0'; + case 'x': { + c = input_getc(); + if (!isxdigit(c)) { + tok_error("Invalid hexadecimal escape sequence\n"); + return 0; + } + int val = 0; + while (isxdigit(c)) { + *len += 1; + val = val * 16 + (isdigit(c) ? 
c - '0' : tolower(c) - 'a' + 10); + c = input_getc(); + } + input_ungetc(c); + return (char)val; + } + default: + if (!isdigit(c)) { + tok_error("Invalid escape sequence\n"); + return 0; + } + int val = 0; + while (isdigit(c)) { + *len += 1; + val = val * 8 + c - '0'; + c = input_getc(); + } + input_ungetc(c); + return (char)val; + } +} + +static token_t *read_char_constant(void) { + int c; + int len = 0; + c = input_getc(); + if (c != '\'') { + input_ungetc(c); + return NULL; + } + len++; + c = input_getc(); + if (c == '\'') { + tok_error("Empty character constant\n"); + return NULL; + } + if (c == '\\') { + c = read_escape_sequence(&len); + } + int val = c; + c = input_getc(); + if (c != '\'') { + tok_error("Expected closing quote for character constant\n"); + return NULL; + } + len++; + return token_create_char(TOK_CONST_CHAR, line, column, val, len); +} + +static token_t *read_string_literal(void) { + int c; + c = input_getc(); + if (c != '"') { + input_ungetc(c); + return NULL; + } + int i = 0; + // Malloc is used for the buf here, the pointer stays function local as string + // interning duplicates the string + char s_buf[512]; + char *buf = s_buf; + int len = 512; + int esc_pad = 0; + while ((c = input_getc()) != EOF) { + if (c == '"') { + // Implicit skip of closing quote + break; + } + if (c == '\\') { + c = read_escape_sequence(&esc_pad); + if (c == 0) { + return NULL; + } + } + if (i >= len) { + if (buf == s_buf) { + buf = malloc(1024); + if (buf == NULL) { + fputs("Out of memory. 
Could not parse string literal.\n", stderr); + exit(1); + } + memcpy(buf, s_buf, 512); + len *= 2; + } else { + len *= 2; + buf = realloc(buf, len); + } + } + buf[i++] = c; + } + buf[i] = '\0'; + if (c == EOF) { + tok_error("Unterminated string literal\n"); + if (buf != s_buf) { + free(buf); + } + return NULL; + } + + token_t *tok = token_create_string(TOK_CONST_STRING_ASCII, line, column, buf, + i + esc_pad + 2); + if (buf != s_buf) { + free(buf); + } + return tok; +} + +token_t *read_token(void) { + if (unget_token != NULL) { + token_t *tok = unget_token; + unget_token = NULL; + return tok; + } + token_t *tok = skip_whitespace(); + if (tok != NULL) { + return tok; + } + tok = read_identifier(); + if (tok != NULL) { + return tok; + } + tok = read_number(); + if (tok != NULL) { + return tok; + } + tok = read_char_constant(); + if (tok != NULL) { + return tok; + } + tok = read_string_literal(); + if (tok != NULL) { + return tok; + } + tok = read_operator(); + if (tok != NULL) { + return tok; + } + int c = input_getc(); + if (c == EOF) { + return NULL; + } + tok_warn( + "Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c, + line, column); + return read_token(); +} + +const char *token_name_from_type(c_token_types type) { + switch (type) { + case TOK_CTK_IF: + return "TOK_CTK_IF"; + case TOK_CTK_ELSE: + return "TOK_CTK_ELSE"; + case TOK_CTK_SWITCH: + return "TOK_CTK_SWITCH"; + case TOK_CTK_CASE: + return "TOK_CTK_CASE"; + case TOK_CTK_DEFAULT: + return "TOK_CTK_DEFAULT"; + case TOK_CTK_WHILE: + return "TOK_CTK_WHILE"; + case TOK_CTK_DO: + return "TOK_CTK_DO"; + case TOK_CTK_FOR: + return "TOK_CTK_FOR"; + case TOK_CTK_CONTINUE: + return "TOK_CTK_CONTINUE"; + case TOK_CTK_BREAK: + return "TOK_CTK_BREAK"; + case TOK_CTK_RETURN: + return "TOK_CTK_RETURN"; + case TOK_CTK_GOTO: + return "TOK_CTK_GOTO"; + case TOK_TK_VOID: + return "TOK_TK_VOID"; + case TOK_TK_CHAR: + return "TOK_TK_CHAR"; + case TOK_TK_SHORT: + return "TOK_TK_SHORT"; + case TOK_TK_INT: 
+ return "TOK_TK_INT"; + case TOK_TK_LONG: + return "TOK_TK_LONG"; + case TOK_TK_FLOAT: + return "TOK_TK_FLOAT"; + case TOK_TK_DOUBLE: + return "TOK_TK_DOUBLE"; + case TOK_TK_SIGNED: + return "TOK_TK_SIGNED"; + case TOK_TK_UNSIGNED: + return "TOK_TK_UNSIGNED"; + case TOK_TK_STRUCT: + return "TOK_TK_STRUCT"; + case TOK_TK_UNION: + return "TOK_TK_UNION"; + case TOK_TK_ENUM: + return "TOK_TK_ENUM"; + case TOK_TK_TYPEDEF: + return "TOK_TK_TYPEDEF"; + case TOK_SCSK_AUTO: + return "TOK_SCSK_AUTO"; + case TOK_SCSK_REGISTER: + return "TOK_SCSK_REGISTER"; + case TOK_SCSK_STATIC: + return "TOK_SCSK_STATIC"; + case TOK_SCSK_EXTERN: + return "TOK_SCSK_EXTERN"; + case TOK_SCSK_CONST: + return "TOK_SCSK_CONST"; + case TOK_SCSK_VOLATILE: + return "TOK_SCSK_VOLATILE"; + case TOK_MK_SIZEOF: + return "TOK_MK_SIZEOF"; + case TOK_OP_ADD: + return "TOK_OP_ADD"; + case TOK_OP_SUB: + return "TOK_OP_SUB"; + case TOK_OP_MUL: + return "TOK_OP_MUL"; + case TOK_OP_DIV: + return "TOK_OP_DIV"; + case TOK_OP_MOD: + return "TOK_OP_MOD"; + case TOK_OP_BIT_AND: + return "TOK_OP_BIT_AND"; + case TOK_OP_BIT_OR: + return "TOK_OP_BIT_OR"; + case TOK_OP_BIT_XOR: + return "TOK_OP_BIT_XOR"; + case TOK_OP_BIT_NOT: + return "TOK_OP_BIT_NOT"; + case TOK_OP_LSHIFT: + return "TOK_OP_LSHIFT"; + case TOK_OP_RSHIFT: + return "TOK_OP_RSHIFT"; + case TOK_OP_NOT: + return "TOK_OP_NOT"; + case TOK_OP_ASSIGN: + return "TOK_OP_ASSIGN"; + case TOK_OP_LT: + return "TOK_OP_LT"; + case TOK_OP_GT: + return "TOK_OP_GT"; + case TOK_OP_INC: + return "TOK_OP_INC"; + case TOK_OP_DEC: + return "TOK_OP_DEC"; + case TOK_OP_EQ: + return "TOK_OP_EQ"; + case TOK_OP_NE: + return "TOK_OP_NE"; + case TOK_OP_LE: + return "TOK_OP_LE"; + case TOK_OP_GE: + return "TOK_OP_GE"; + case TOK_OP_AND: + return "TOK_OP_AND"; + case TOK_OP_OR: + return "TOK_OP_OR"; + case TOK_OP_MEMBER_POINTER: + return "TOK_OP_MEMBER_POINTER"; + case TOK_OP_MEMBER: + return "TOK_OP_MEMBER"; + case TOK_OP_COND_DECISION: + return "TOK_OP_COND_DECISION"; + case 
TOK_OP_COND: + return "TOK_OP_COND"; + case TOK_OP_ASSIGN_ADD: + return "TOK_OP_ASSIGN_ADD"; + case TOK_OP_ASSIGN_SUB: + return "TOK_OP_ASSIGN_SUB"; + case TOK_OP_ASSIGN_MUL: + return "TOK_OP_ASSIGN_MUL"; + case TOK_OP_ASSIGN_DIV: + return "TOK_OP_ASSIGN_DIV"; + case TOK_OP_ASSIGN_MOD: + return "TOK_OP_ASSIGN_MOD"; + case TOK_OP_ASSIGN_BITAND: + return "TOK_OP_ASSIGN_BITAND"; + case TOK_OP_ASSIGN_BITOR: + return "TOK_OP_ASSIGN_BITOR"; + case TOK_OP_ASSIGN_BITXOR: + return "TOK_OP_ASSIGN_BITXOR"; + case TOK_OP_ASSIGN_LSHIFT: + return "TOK_OP_ASSIGN_LSHIFT"; + case TOK_OP_ASSIGN_RSHIFT: + return "TOK_OP_ASSIGN_RSHIFT"; + case TOK_SEP_HASH: + return "TOK_SEP_HASH"; + case TOK_ID: + return "TOK_ID"; + case TOK_CONST_INTEGER_U32: + return "TOK_CONST_INTEGER_U32"; + case TOK_CONST_INTEGER_U64: + return "TOK_CONST_INTEGER_U64"; + case TOK_CONST_INTEGER_S32: + return "TOK_CONST_INTEGER_S32"; + case TOK_CONST_INTEGER_S64: + return "TOK_CONST_INTEGER_S64"; + case TOK_CONST_FLOAT_32: + return "TOK_CONST_FLOAT_32"; + case TOK_CONST_FLOAT_64: + return "TOK_CONST_FLOAT_64"; + case TOK_CONST_CHAR: + return "TOK_CONST_CHAR"; + case TOK_CONST_STRING_ASCII: + return "TOK_CONST_STRING_ASCII"; + case TOK_SPECIAL_EOF: + return "TOK_SPECIAL_EOF"; + case TOK_SPECIAL_ERROR: + return "TOK_SPECIAL_ERROR"; + case TOK_SEP_LEFT_PAREN: + return "TOK_SEP_LEFT_PAREN"; + case TOK_SEP_RIGHT_PAREN: + return "TOK_SEP_RIGHT_PAREN"; + case TOK_SEP_LEFT_BRACKET: + return "TOK_SEP_LEFT_BRACKET"; + case TOK_SEP_RIGHT_BRACKET: + return "TOK_SEP_RIGHT_BRACKET"; + case TOK_SEP_LEFT_BRACE: + return "TOK_SEP_LEFT_BRACE"; + case TOK_SEP_RIGHT_BRACE: + return "TOK_SEP_RIGHT_BRACE"; + case TOK_SEP_COMMA: + return "TOK_SEP_COMMA"; + case TOK_SEP_SEMICOLON: + return "TOK_SEP_SEMICOLON"; + case TOK_SEP_DOT: + return "TOK_SEP_DOT"; + case TOK_SEP_ELLIPSIS: + return "TOK_SEP_ELLIPSIS"; + } + return "UNKNOWN"; +} + +char *re_escape_string(const char *str) { + int len = strlen(str); + char *buf = malloc(len * 2 + 1); + 
if (buf == NULL) { + fprintf(stderr, "Out of memory. Cannot escape string\n"); + exit(1); + } + int i = 0; + for (int j = 0; j < len; j++) { + switch (str[j]) { + case '\a': + buf[i++] = '\\'; + buf[i++] = 'a'; + break; + case '\b': + buf[i++] = '\\'; + buf[i++] = 'b'; + break; + case '\f': + buf[i++] = '\\'; + buf[i++] = 'f'; + break; + case '\n': + buf[i++] = '\\'; + buf[i++] = 'n'; + break; + case '\r': + buf[i++] = '\\'; + buf[i++] = 'r'; + break; + case '\t': + buf[i++] = '\\'; + buf[i++] = 't'; + break; + case '\v': + buf[i++] = '\\'; + buf[i++] = 'v'; + break; + case '\\': + buf[i++] = '\\'; + buf[i++] = '\\'; + break; + case '\'': + buf[i++] = '\\'; + buf[i++] = '\''; + break; + case '"': + buf[i++] = '\\'; + buf[i++] = '"'; + break; + default: + buf[i++] = str[j]; + break; + } + } + buf[i] = '\0'; + return buf; +} + +void print_token(token_t *tok) { + if (tok == NULL) { + printf("NULL\n"); + return; + } + const char *name = token_name_from_type(tok->kind); + switch (tok->kind) { + case TOK_ID: + case TOK_CONST_STRING_ASCII: { + char *escaped = re_escape_string(token_string(tok)); + printf("%s: \"%s\"@%d:%d\n", name, escaped, tok->line, tok->column); + free(escaped); + break; + } + case TOK_CONST_CHAR: + printf("%s: '%c'@%d:%d\n", name, token_char(tok), tok->line, tok->column); + break; + case TOK_CONST_INTEGER_S32: + case TOK_CONST_INTEGER_U32: + case TOK_CONST_INTEGER_S64: + case TOK_CONST_INTEGER_U64: + printf("%s: %ld@%d:%d\n", name, token_int(tok), tok->line, tok->column); + break; + case TOK_CONST_FLOAT_32: + case TOK_CONST_FLOAT_64: + printf("%s: %f@%d:%d\n", name, token_float(tok), tok->line, tok->column); + break; + default: + printf("%s@%d:%d\n", name, tok->line, tok->column); + break; + } +} + +#ifdef TOK_TEST + +char *preprocess(char *in) { + char *output_name = malloc(1024); + snprintf(output_name, 1024, "%s.preprocessed", in); + char *command = malloc(2048); + snprintf(command, 2048, "gcc -E -xc %s -o %s", in, output_name); + system(command); 
+ free(command); + return output_name; +} + +// Tokenize ourselves +int main(int argc, char **argv) { + if (argc != 2) { + fprintf(stderr, "Usage: %s <input.c>\n", argv[0]); + return 1; + } + char *input_name = argv[1]; + char *preprocessed = preprocess(input_name); + tokenizer_init(preprocessed); + token_t *tok; + while ((tok = read_token()) != NULL) { + print_token(tok); + token_destroy(tok); + } + tokenizer_destroy(); + remove(preprocessed); + free(preprocessed); + return 0; +} + +#endif diff --git a/projects/cminus/code/tokenizer.h b/projects/cminus/code/tokenizer.h @@ -5,17 +5,18 @@ typedef struct token token_t; typedef enum { // Control Keywords - TOK_CTK_IF, // if - TOK_CTK_ELSE, // else - TOK_CTK_SWITCH, // switch - TOK_CTK_CASE, // case - TOK_CTK_DEFAULT, // default - TOK_CTK_WHILE, // while - TOK_CTK_DO, // do - TOK_CTK_FOR, // for - TOK_CTK_CONTINUE, // continue - TOK_CTK_RETURN, // return - TOK_CTK_GOTO, // goto + TOK_CTK_IF, + TOK_CTK_ELSE, + TOK_CTK_SWITCH, + TOK_CTK_CASE, + TOK_CTK_DEFAULT, + TOK_CTK_WHILE, + TOK_CTK_DO, + TOK_CTK_FOR, + TOK_CTK_CONTINUE, + TOK_CTK_BREAK, + TOK_CTK_RETURN, + TOK_CTK_GOTO, // Type Keywords TOK_TK_VOID, @@ -44,44 +45,43 @@ typedef enum { TOK_MK_SIZEOF, // Operators - TOK_OP_PLUS, // + - TOK_OP_MINUS, // - - TOK_OP_ASTERISK, // * - TOK_OP_SLASH, // / - TOK_OP_PERCENT, // % - TOK_OP_AMPERSAND, // & - TOK_OP_PIPE, // | - TOK_OP_CARET, // ^ - TOK_OP_TILDE, // ~ - TOK_OP_EXCLAMATION, // ! - TOK_OP_EQUALS, // = - TOK_OP_LESS_THAN, // < - TOK_OP_GREATER_THAN, // > - TOK_OP_PLUS_PLUS, // ++ - TOK_OP_MINUS_MINUS, // -- - TOK_OP_EQUALS_EQUALS, // == - TOK_OP_NOT_EQUALS, // != - TOK_OP_LESS_EQUALS, // <= - TOK_OP_GREATER_EQUALS, // >= - TOK_OP_AMP_AMP, // && - TOK_OP_PIPE_PIPE, // || - TOK_OP_ARROW, // -> - TOK_OP_DOT, // . - TOK_OP_COLON, // : - TOK_OP_QUESTION, // ? - TOK_OP_COMMA, // , - TOK_OP_SEMICOLON, // ; - TOK_OP_ELLIPSIS, // ... 
- TOK_OP_ASSIGN_PLUS, // += - TOK_OP_ASSIGN_MINUS, // -= - TOK_OP_ASSIGN_ASTERISK, // *= - TOK_OP_ASSIGN_SLASH, // /= - TOK_OP_ASSIGN_PERCENT, // %= - TOK_OP_ASSIGN_AMPERSAND, // &= - TOK_OP_ASSIGN_PIPE, // |= - TOK_OP_ASSIGN_CARET, // ^= - TOK_OP_ASSIGN_LEFT_SHIFT, // <<= - TOK_OP_ASSIGN_RIGHT_SHIFT, // >>= + TOK_OP_ADD, // + + TOK_OP_SUB, // - + TOK_OP_MUL, // * + TOK_OP_DIV, // / + TOK_OP_MOD, // % + TOK_OP_BIT_AND, // & + TOK_OP_BIT_OR, // | + TOK_OP_BIT_XOR, // ^ + TOK_OP_BIT_NOT, // ~ + TOK_OP_LSHIFT, // << + TOK_OP_RSHIFT, // >> + TOK_OP_NOT, // ! + TOK_OP_ASSIGN, // = + TOK_OP_LT, // < + TOK_OP_GT, // > + TOK_OP_INC, // ++ + TOK_OP_DEC, // -- + TOK_OP_EQ, // == + TOK_OP_NE, // != + TOK_OP_LE, // <= + TOK_OP_GE, // >= + TOK_OP_AND, // && + TOK_OP_OR, // || + TOK_OP_MEMBER_POINTER, // -> + TOK_OP_MEMBER, // . + TOK_OP_COND_DECISION, // : + TOK_OP_COND, // ? + TOK_OP_ASSIGN_ADD, // += + TOK_OP_ASSIGN_SUB, // -= + TOK_OP_ASSIGN_MUL, // *= + TOK_OP_ASSIGN_DIV, // /= + TOK_OP_ASSIGN_MOD, // %= + TOK_OP_ASSIGN_BITAND, // &= + TOK_OP_ASSIGN_BITOR, // |= + TOK_OP_ASSIGN_BITXOR, // ^= + TOK_OP_ASSIGN_LSHIFT, // <<= + TOK_OP_ASSIGN_RSHIFT, // >>= // Separators TOK_SEP_LEFT_PAREN, // ( @@ -92,30 +92,29 @@ typedef enum { TOK_SEP_RIGHT_BRACE, // } TOK_SEP_COMMA, // , TOK_SEP_SEMICOLON, // ; - TOK_SEP_COLON, // : TOK_SEP_DOT, // . TOK_SEP_ELLIPSIS, // ... 
TOK_SEP_HASH, // # // Identifiers TOK_ID, // Any identifier not ending in _t - TOK_TID, // Any identifier ending in _t // Constants - TOK_CONST_INTEGER_U32, // u - TOK_CONST_INTEGER_U64, // ul - TOK_CONST_INTEGER_S32, // (no suffix) - TOK_CONST_INTEGER_S64, // l - TOK_CONST_FLOAT_32, // f - TOK_CONST_FLOAT_64, // (no suffix) - TOK_CONST_CHAR, // 'c' - TOK_CONST_STRING_ASCII, // "string" (width of 8 bits) + TOK_CONST_INTEGER_U32, // u + TOK_CONST_INTEGER_U64, // ul + TOK_CONST_INTEGER_S32, // (no suffix) + TOK_CONST_INTEGER_S64, // l + TOK_CONST_FLOAT_32, // f + TOK_CONST_FLOAT_64, // (no suffix) + TOK_CONST_CHAR, // 'c' + TOK_CONST_STRING_ASCII, // "string" (width of 8 bits) // Special TOK_SPECIAL_EOF, TOK_SPECIAL_ERROR, } c_token_types; + void tokenizer_init(const char *filename); token_t *tokenizer_get(void); void tokenizer_unget(token_t *token); @@ -130,4 +129,4 @@ int token_line(token_t *token); int token_column(token_t *token); void token_destroy(token_t *token); -#endif- \ No newline at end of file +#endif diff --git a/projects/cminus/lexer.html b/projects/cminus/lexer.html @@ -155,47 +155,46 @@ static char buffer[CHUNK_SIZE]; static int buffer_pos = 0; static int buffer_size = 0; static char unget_buffer = '\0'; +static char unget_buffer_stack[8]; +static int unget_buffer_stack_pos = 0; static FILE *file = NULL; </code></pre> </div> - <p>When the program calls <code>input_init</code>, we open the file and read the first chunk into the buffer. 
</p> + <p>When the program calls <code>input_init</code>, we open the file.</p> <div class="code-block"> <span class="file-name">input.c</span> <span class="block-header"> <strong class="block-title"><em><a id=":input-init" href="#:input-init">Input initialization</a></em></strong></span> <pre class="prettyprint"><code class="">void input_init(const char *filename) { - // Open the file and read the first chunk file = fopen(filename, "r"); if (file == NULL) { - fprintf(stderr, "Error: could not open file %s\n", filename); + fprintf(stderr, "Error: Cannot open file %s\n", filename); exit(1); } - buffer_pos = 0; - nextline(); }</code></pre> </div> <p>When the program calls <code>input_getc</code>, we return the next character in the buffer. If the buffer is - exhausted, we call <code>nextline</code>. </p> + exhausted, we call <code>nextline</code>. We also track the line and column. </p> <div class="code-block"> <span class="file-name">input.c</span> <span class="block-header"> <strong class="block-title"><em><a id=":input-getc" href="#:input-getc">Input getc</a></em></strong></span> <pre class="prettyprint"><code class="">int input_getc(void) { - if (unget_buffer != '\0') { - char c = unget_buffer; - unget_buffer = '\0'; - return c; + if (unget_buffer_stack_pos > 0) { + return unget_buffer_stack[--unget_buffer_stack_pos]; } if (buffer_pos == buffer_size) { - nextline(); + buffer_size = fread(buffer, 1, CHUNK_SIZE, file); + buffer_pos = 0; } if (buffer_size == 0) { return EOF; } - return buffer[buffer_pos++]; + char c = buffer[buffer_pos++]; + return c; }</code></pre> </div> <p>When the program calls <code>input_ungetc</code>, we save the character in the <code>unget_buffer</code>. 
</p> @@ -204,9 +203,8 @@ static FILE *file = NULL; <span class="block-header"> <strong class="block-title"><em><a id=":input-ungetc" href="#:input-ungetc">Input ungetc</a></em></strong></span> - <pre class="prettyprint"><code class="">void input_ungetc(int c) { - unget_buffer = c; -}</code></pre> + <pre + class="prettyprint"><code class="">void input_ungetc(int c) { unget_buffer_stack[unget_buffer_stack_pos++] = c; }</code></pre> </div> <p>Since we're not using dynamic memory allocation, cleanup is pretty simple.</p> <div class="code-block"> @@ -257,8 +255,7 @@ const char *token_string(token_t *token); char token_char(token_t *token); int token_line(token_t *token); int token_column(token_t *token); -void token_destroy(token_t *token); -</code></pre> +void token_destroy(token_t *token);</code></pre> </div> <p>I'll also define an opaque struct for tokens and an enum for token types. </p> <div class="code-block"> @@ -267,19 +264,20 @@ void token_destroy(token_t *token); <strong class="block-title"><em><a id=":token-types" href="#:token-types">Token types</a></em></strong></span> <pre class="prettyprint"><code class="">typedef struct token token_t; - typedef enum { +typedef enum { // Control Keywords - TOK_CTK_IF, // if - TOK_CTK_ELSE, // else - TOK_CTK_SWITCH, // switch - TOK_CTK_CASE, // case - TOK_CTK_DEFAULT, // default - TOK_CTK_WHILE, // while - TOK_CTK_DO, // do - TOK_CTK_FOR, // for - TOK_CTK_CONTINUE, // continue - TOK_CTK_RETURN, // return - TOK_CTK_GOTO, // goto + TOK_CTK_IF, + TOK_CTK_ELSE, + TOK_CTK_SWITCH, + TOK_CTK_CASE, + TOK_CTK_DEFAULT, + TOK_CTK_WHILE, + TOK_CTK_DO, + TOK_CTK_FOR, + TOK_CTK_CONTINUE, + TOK_CTK_BREAK, + TOK_CTK_RETURN, + TOK_CTK_GOTO, // Type Keywords TOK_TK_VOID, @@ -308,102 +306,80 @@ void token_destroy(token_t *token); TOK_MK_SIZEOF, // Operators - TOK_OP_PLUS, // + - TOK_OP_MINUS, // - - TOK_OP_ASTERISK, // * - TOK_OP_SLASH, // / - TOK_OP_PERCENT, // % - TOK_OP_AMPERSAND, // & - TOK_OP_PIPE, // | - TOK_OP_CARET, // ^ - 
TOK_OP_TILDE, // ~ - TOK_OP_EXCLAMATION, // ! - TOK_OP_EQUALS, // = - TOK_OP_LESS_THAN, // &lt; - TOK_OP_GREATER_THAN, // &gt; - TOK_OP_PLUS_PLUS, // ++ - TOK_OP_MINUS_MINUS, // -- - TOK_OP_EQUALS_EQUALS, // == - TOK_OP_NOT_EQUALS, // != - TOK_OP_LESS_EQUALS, // &lt;= - TOK_OP_GREATER_EQUALS, // &gt;= - TOK_OP_AMP_AMP, // && - TOK_OP_PIPE_PIPE, // || - TOK_OP_ARROW, // -&gt; - TOK_OP_DOT, // . - TOK_OP_COLON, // : - TOK_OP_QUESTION, // ? - TOK_OP_COMMA, // , - TOK_OP_SEMICOLON, // ; - TOK_OP_ELLIPSIS, // ... - TOK_OP_ASSIGN_PLUS, // += - TOK_OP_ASSIGN_MINUS, // -= - TOK_OP_ASSIGN_ASTERISK, // *= - TOK_OP_ASSIGN_SLASH, // /= - TOK_OP_ASSIGN_PERCENT, // %= - TOK_OP_ASSIGN_AMPERSAND, // &= - TOK_OP_ASSIGN_PIPE, // |= - TOK_OP_ASSIGN_CARET, // ^= - TOK_OP_ASSIGN_LEFT_SHIFT, // &lt;&lt;= - TOK_OP_ASSIGN_RIGHT_SHIFT, // &gt;&gt;= + TOK_OP_ADD, // + + TOK_OP_SUB, // - + TOK_OP_MUL, // * + TOK_OP_DIV, // / + TOK_OP_MOD, // % + TOK_OP_BIT_AND, // & + TOK_OP_BIT_OR, // | + TOK_OP_BIT_XOR, // ^ + TOK_OP_BIT_NOT, // ~ + TOK_OP_LSHIFT, // << + TOK_OP_RSHIFT, // >> + TOK_OP_NOT, // ! + TOK_OP_ASSIGN, // = + TOK_OP_LT, // < + TOK_OP_GT, // > + TOK_OP_INC, // ++ + TOK_OP_DEC, // -- + TOK_OP_EQ, // == + TOK_OP_NE, // != + TOK_OP_LE, // <= + TOK_OP_GE, // >= + TOK_OP_AND, // && + TOK_OP_OR, // || + TOK_OP_MEMBER_POINTER, // -> + TOK_OP_MEMBER, // . + TOK_OP_COND_DECISION, // : + TOK_OP_COND, // ? 
+ TOK_OP_ASSIGN_ADD, // += + TOK_OP_ASSIGN_SUB, // -= + TOK_OP_ASSIGN_MUL, // *= + TOK_OP_ASSIGN_DIV, // /= + TOK_OP_ASSIGN_MOD, // %= + TOK_OP_ASSIGN_BITAND, // &= + TOK_OP_ASSIGN_BITOR, // |= + TOK_OP_ASSIGN_BITXOR, // ^= + TOK_OP_ASSIGN_LSHIFT, // <<= + TOK_OP_ASSIGN_RSHIFT, // >>= // Separators - TOK_SEP_LEFT_PAREN, // ( - TOK_SEP_RIGHT_PAREN, // ) - TOK_SEP_LEFT_BRACKET, // [ + TOK_SEP_LEFT_PAREN, // ( + TOK_SEP_RIGHT_PAREN, // ) + TOK_SEP_LEFT_BRACKET, // [ TOK_SEP_RIGHT_BRACKET, // ] - TOK_SEP_LEFT_BRACE, // { - TOK_SEP_RIGHT_BRACE, // } - TOK_SEP_COMMA, // , - TOK_SEP_SEMICOLON, // ; - TOK_SEP_COLON, // : - TOK_SEP_DOT, // . - TOK_SEP_ELLIPSIS, // ... - TOK_SEP_HASH, // # + TOK_SEP_LEFT_BRACE, // { + TOK_SEP_RIGHT_BRACE, // } + TOK_SEP_COMMA, // , + TOK_SEP_SEMICOLON, // ; + TOK_SEP_DOT, // . + TOK_SEP_ELLIPSIS, // ... + TOK_SEP_HASH, // # // Identifiers - TOK_ID, // Any identifier not ending in _t - TOK_TID, // Any identifier ending in _t + TOK_ID, // Any identifier not ending in _t // Constants - TOK_CONST_INTEGER_U32, // u - TOK_CONST_INTEGER_U64, // ul - TOK_CONST_INTEGER_S32, // (no suffix) - TOK_CONST_INTEGER_S64, // l - TOK_CONST_FLOAT_32, // f - TOK_CONST_FLOAT_64, // (no suffix) - TOK_CONST_CHAR, // 'c' + TOK_CONST_INTEGER_U32, // u + TOK_CONST_INTEGER_U64, // ul + TOK_CONST_INTEGER_S32, // (no suffix) + TOK_CONST_INTEGER_S64, // l + TOK_CONST_FLOAT_32, // f + TOK_CONST_FLOAT_64, // (no suffix) + TOK_CONST_CHAR, // 'c' TOK_CONST_STRING_ASCII, // "string" (width of 8 bits) - TOK_CONST_STRING_UNICODE, // L"string" (width of 32 bits) // Special TOK_SPECIAL_EOF, TOK_SPECIAL_ERROR, -} c_token_types; -</code></pre> +} c_token_types;</code></pre> </div> - <p>One decision of interest here is the separation of identifiers into two categories: <code>TOK_ID</code> and - <code>TOK_TID</code>. 
This helps prevent the need for a lexer hack for typedefs, as something like - <div class="code-block"> - <span class="file-name">N/A</span> - <span class="block-header"> - <strong class="block-title"><em><a id=":vexing-parse" href="#:vexing-parse">The - Vexing Parse</a></em></strong></span> - <pre class="prettyprint"><code class="">typedef int foo; -int main() { - foo foo; - foo = 5; - return foo; -}</code></pre> - </div> - which requires syntactic context to parse correctly is not allowed (since foo is a <code - class="prettyprint">TOK_ID</code> and not a - <code class="prettyprint">TOK_TID</code>, the parser will error when it's in the position of a type). </p> <h4>Tokenizer Design Decisions</h4> <p>A lot of introductory compiler courses focus on the design of deterministic finite automatas for lexing. While that's definitely interesting from a CS perspective, for this project I'll be focusing more on simplicity and readability, and so I'll write a lexer by hand. </p> - <p>I'll implement the token API first, and then write the tokenizer</p> + <p>I'll implement the token API first, and then write the tokenizer. This'll be a 'one-at-a-time' tokenizer, and won't return a list of tokens. </p> <h4>Token Implementation</h4> <p>The token struct is pretty simple. It contains the token information, some metadata about the token, and the optional value of the token. 
</p> @@ -412,30 +388,28 @@ int main() { <span class="block-header"> <strong class="block-title"><em><a id=":token-struct" href="#:token-struct">Token struct</a></em></strong></span> - <pre class="prettyprint"><code class=""> -#define TOK_MAGIC_1 0x544F4B454E544F4B // "TOKENTOK" -#define TOK_MAGIC_2 0x544F4B544F4B454E // "TOKTOKEN" -struct token { - long magic; - int line; - int column; - short kind; - char opt_data[0]; -}; - -typedef struct token token_t; - -struct token_data { - union { - long long i; - double f; - const char *s; - char c; - } data; -}; - -typedef struct token_data token_data_t; -</code></pre> + <pre class="prettyprint"><code class="">#define TOK_MAGIC_1 0x544F4B454E544F4Bul // "TOKENTOK" + #define TOK_MAGIC_2 0x544F4B544F4B454Eul // "TOKTOKEN" + struct token { + long magic; + int line; + int column; + short kind; + long opt_data[0]; + }; + + typedef struct token token_t; + + struct token_data { + union { + long long i; + double f; + const char *s; + char c; + } data; + }; + + typedef struct token_data token_data_t;</code></pre> </div> <p>By keeping the token data off of simple tokens (like keywords and operators), we save 8 bytes per token. This will add up quickly, as the lexer will produce a lot of tokens. 
</p> @@ -445,46 +419,62 @@ typedef struct token_data token_data_t; <span class="block-header"> <strong class="block-title"><em><a id=":token-internals" href="#:token-internals">Token Internals</a></em></strong></span> - <pre class="prettyprint"><code class="">#define token_data(token) ((struct token_data *)((token)-&gt;opt_data)) -static token_t *token_data_create(c_token_types kind, int line, int column) { + <pre class="prettyprint"><code class="">#define token_data(token) ((struct token_data *)((token)->opt_data)) +static token_t *token_data_create(c_token_types kind, int lin, int col, + int len) { token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data)); - token-&gt;magic = TOK_MAGIC_1; - token-&gt;line = line; - token-&gt;column = column; - token-&gt;kind = kind; + if (token == NULL) { + fputs("Out of memory\n", stderr); + exit(1); + } + token->magic = TOK_MAGIC_1; + token->line = lin; + token->column = col; + column += len; + token->kind = kind; return token; } -static token_t *token_create(c_token_types kind, int line, int column) { +static token_t *token_create(c_token_types kind, int lin, int col, int len) { token_t *token = malloc(sizeof(token_t)); - token-&gt;magic = TOK_MAGIC_2; - token-&gt;line = line; - token-&gt;column = column; - token-&gt;kind = kind; + if (token == NULL) { + fputs("Out of memory\n", stderr); + exit(1); + } + token->magic = TOK_MAGIC_2; + token->line = lin; + token->column = col; + column += len; + token->kind = kind; return token; } -static token_t *token_create_int(c_token_types kind, int line, int column, int64_t i) { - token_t *token = token_data_create(kind, line, column); - token_data(token)-&gt;data.i = i; +static token_t *token_create_int(c_token_types kind, int lin, int col, + int64_t i, int len) { + token_t *token = token_data_create(kind, lin, col, len); + token_data(token)->data.i = i; return token; } -static token_t *token_create_float(c_token_types kind, int line, int column, double f) { - token_t *token = 
token_data_create(kind, line, column); - token_data(token)-&gt;data.f = f; +static token_t *token_create_float(c_token_types kind, int lin, int col, + double f, int len) { + token_t *token = token_data_create(kind, lin, col, len); + token_data(token)->data.f = f; return token; } -static token_t *token_create_string(c_token_types kind, int line, int column, const char *s) { - // Not yet +static token_t *token_create_char(c_token_types kind, int lin, int col, char c, + int len) { + token_t *token = token_data_create(kind, lin, col, len); + token_data(token)->data.c = c; + return token; } -static token_t *token_create_char(c_token_types kind, int line, int column, char c) { - token_t *token = token_data_create(kind, line, column); - token_data(token)-&gt;data.c = c; - return token; +static token_t *token_create_string(c_token_types kind, int lin, int col, + const char *s, int len) { + // Not yet } + </code></pre> </div> <p>You might wonder why token_create_string was left out. Before you ask that, try counting up all the repeated @@ -511,13 +501,15 @@ static token_t *token_create_char(c_token_types kind, int line, int column, char <pre class="prettyprint"><code class="">typedef struct hash_table hash_table_t; typedef struct hash_table_entry hash_table_entry_t; typedef int (*hash_table_cmp_fn)(void *key1, void *key2); -typedef unsigned long (*hash_table_hash_fn)(void *key); -hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash); -void hash_table_destroy(hash_table_t *table); +typedef unsigned int (*hash_table_hash_fn)(void *key); +hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, + hash_table_hash_fn hash); + +typedef void (*hash_table_dtor)(void *value, int is_key); +void hash_table_destroy(hash_table_t *table, hash_table_dtor dtor); void *hash_table_get(hash_table_t *table, void *key); int hash_table_put(hash_table_t *table, void *key, void *value, int replace); -void hash_table_remove(hash_table_t *table, void 
*key); -</code></pre> +void hash_table_remove(hash_table_t *table, void *key);</code></pre> </div> <p>Now that we have the hash table interface, we can implement the generic hash table. </p> <div class="code-block"> @@ -537,88 +529,96 @@ struct hash_table_entry { hash_table_entry_t *next; }; -hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash) { +hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, + hash_table_hash_fn hash) { hash_table_t *table = malloc(sizeof(hash_table_t)); if (table == NULL) { return NULL; } - table-&gt;entries = calloc(size, sizeof(hash_table_entry_t *)); - if (table-&gt;entries == NULL) { + table->entries = calloc(size, sizeof(hash_table_entry_t *)); + if (table->entries == NULL) { free(table); return NULL; } - table-&gt;size = size; - table-&gt;cmp = cmp; - table-&gt;hash = hash; + table->size = size; + table->cmp = cmp; + table->hash = hash; return table; } -void hash_table_destroy(hash_table_t *table) { - for (int i = 0; i &lt; table-&gt;size; i++) { - hash_table_entry_t *entry = table-&gt;entries[i]; +void hash_table_destroy(hash_table_t *table, hash_table_dtor dtor) { + for (int i = 0; i < table->size; i++) { + hash_table_entry_t *entry = table->entries[i]; while (entry != NULL) { - hash_table_entry_t *next = entry-&gt;next; + hash_table_entry_t *next = entry->next; + if (dtor != NULL) { + dtor(entry->key, 1); + dtor(entry->value, 0); + } free(entry); entry = next; } } - free(table-&gt;entries); + free(table->entries); free(table); } void *hash_table_get(hash_table_t *table, void *key) { - unsigned long hash = table-&gt;hash(key) % table-&gt;size; - hash_table_entry_t *entry = table-&gt;entries[hash]; + unsigned long hash = table->hash(key) % table->size; + hash_table_entry_t *entry = table->entries[hash]; while (entry != NULL) { - if (table-&gt;cmp(entry-&gt;key, key) == 0) { - return entry-&gt;value; + if (table->cmp(entry->key, key) == 0) { + return entry->value; } - entry = 
entry-&gt;next; + entry = entry->next; } return NULL; } int hash_table_put(hash_table_t *table, void *key, void *value, int replace) { - unsigned long hash = table-&gt;hash(key) % table-&gt;size; - hash_table_entry_t *entry = table-&gt;entries[hash]; + unsigned long hash = table->hash(key) % table->size; + hash_table_entry_t *entry = table->entries[hash]; while (entry != NULL) { - if (table-&gt;cmp(entry-&gt;key, key) == 0) { + if (table->cmp(entry->key, key) == 0) { if (replace) { - entry-&gt;value = value; + entry->value = value; return 0; } else { - return 1; + return 1; } } - entry = entry-&gt;next; + entry = entry->next; } entry = malloc(sizeof(hash_table_entry_t)); - entry-&gt;key = key; - entry-&gt;value = value; - entry-&gt;next = table-&gt;entries[hash]; - table-&gt;entries[hash] = entry; + if (entry == NULL) { + fprintf(stderr, "Error: Out of memory. Could not allocate hash table entry\n"); + exit(1); + } + entry->key = key; + entry->value = value; + entry->next = table->entries[hash]; + table->entries[hash] = entry; return 0; } void hash_table_remove(hash_table_t *table, void *key) { - unsigned long hash = table-&gt;hash(key) % table-&gt;size; - hash_table_entry_t *entry = table-&gt;entries[hash]; + unsigned long hash = table->hash(key) % table->size; + hash_table_entry_t *entry = table->entries[hash]; hash_table_entry_t *prev = NULL; while (entry != NULL) { - if (table-&gt;cmp(entry-&gt;key, key) == 0) { + if (table->cmp(entry->key, key) == 0) { if (prev == NULL) { - table-&gt;entries[hash] = entry-&gt;next; + table->entries[hash] = entry->next; } else { - prev-&gt;next = entry-&gt;next; + prev->next = entry->next; } free(entry); return; } prev = entry; - entry = entry-&gt;next; + entry = entry->next; } -} -</code></pre> +}</code></pre> </div> <p>One implementation detail of note is that hash_table_put returns 1 if the key already exists in the table, and, unless replace is set to 1, will not replace the value. 
</p> @@ -627,28 +627,39 @@ void hash_table_remove(hash_table_t *table, void *key) { <h4>String hashing</h4> <p>Now we can implement the string hash table for the tokenizer.</p> <p>A lot of research has been done on hash functions (See <a href="https://www.strchr.com/hash_functions">here for a - good review of theory and comparison of various functions</a>). For this project, I'll be using the hash - function from the famous "Red Dragon Book" (Aho, Sethi, Ullman, Compilers: Principles, Techniques, and Tools, 1st - edition). as it's optimized for compiler symbols. </p> + good review of theory and comparison of various functions</a>). For this project, I'll be using the ELF hash + function from glibc, which + </p> <div class="code-block"> <span class="file-name">tokenizer.c</span> <span class="block-header"> <strong class="block-title"><em><a id=":string-hash-function" href="#:string-hash-function">String hash function</a></em></strong></span> - <pre class="prettyprint"><code class="">#define PRIME 211 -static unsigned long hash_string(void *key) { - unsigned long hash = 0, g; + <pre class="prettyprint"><code class="">static unsigned int hash_string(void *key) { + unsigned long hash = 0, hi = 0; char *p = key; - while (*p) { - hash = (hash &lt;&lt; 4) + *p++; - if ((g = hash &amp; 0xf0000000) != 0) { - hash ^= g &gt;&gt; 24; - hash ^= g; + hash = *p; + if (hash != 0 && p[1] != 0) { + hash = (hash << 4) + p[1]; + if (p[2] != 0) { + hash = (hash << 4) + p[2]; + if (p[3] != 0) { + hash = (hash << 4) + p[3]; + if (p[4] != 0) { + hash = (hash << 4) + p[4]; + p += 5; + while (*p != 0) { + hash = (hash << 4) + *p++; + hi = hash & 0xf0000000l; + hash ^= hi >> 24; + } + hash &= 0x0fffffffl; + } + } } } return hash; -} -</code></pre> +}</code></pre> </div> <p>We'll also need a comparison function for strings. 
</p> <div class="code-block"> @@ -668,17 +679,18 @@ static unsigned long hash_string(void *key) { <strong class="block-title"><em><a id=":string-creation" href="#:string-creation">String creation</a></em></strong></span> <pre class="prettyprint"><code class="">static hash_table_t *string_table; -static token_t *token_create_string(c_token_types kind, int line, int column, const char *s) { +static token_t *token_create_string(c_token_types kind, int lin, int col, + const char *s, int len) { if (string_table == NULL) { - string_table = hash_table_create(1024, cmp_string, hash_string); + string_table = hash_table_create(2048, cmp_string, hash_string); } - token_t *token = token_data_create(kind, line, column); - char *key = strdup(s); - if (hash_table_put(string_table, key, key, 0)) { - free(key); - key = hash_table_get(string_table, key); + token_t *token = token_data_create(kind, lin, col, len); + char *key = hash_table_get(string_table, (void *)s); + if (key == NULL) { + key = strdup(s); + hash_table_put(string_table, key, key, 1); } - token_data(token)-&gt;data.s = key; + token_data(token)->data.s = key; return token; } </code></pre> @@ -692,57 +704,52 @@ static token_t *token_create_string(c_token_types kind, int line, int column, co <span class="block-header"> <strong class="block-title"><em><a id=":external-token-api" href="#:external-token-api">External token API</a></em></strong></span> - <pre class="prettyprint"><code class=""> -c_token_types token_type(token_t *token) { - assert(token-&gt;magic == TOK_MAGIC_1 || token-&gt;magic == TOK_MAGIC_2); - return token-&gt;kind; + <pre class="prettyprint"><code class="">c_token_types token_type(token_t *token) { + assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2); + return token->kind; } int64_t token_int(token_t *token) { - assert(token-&gt;kind == TOK_CONST_INTEGER_U32 || - token-&gt;kind == TOK_CONST_INTEGER_U64 || - token-&gt;kind == TOK_CONST_INTEGER_S32 || - token-&gt;kind == 
TOK_CONST_INTEGER_S64); - assert(token-&gt;magic == TOK_MAGIC_1); - return token_data(token)-&gt;data.i; + assert(token->kind == TOK_CONST_INTEGER_U32 || + token->kind == TOK_CONST_INTEGER_U64 || + token->kind == TOK_CONST_INTEGER_S32 || + token->kind == TOK_CONST_INTEGER_S64); + assert(token->magic == TOK_MAGIC_1); + return token_data(token)->data.i; } double token_float(token_t *token) { - assert(token-&gt;kind == TOK_CONST_FLOAT_32 || - token-&gt;kind == TOK_CONST_FLOAT_64); - assert(token-&gt;magic == TOK_MAGIC_1); - return token_data(token)-&gt;data.f; + assert(token->kind == TOK_CONST_FLOAT_32 || + token->kind == TOK_CONST_FLOAT_64); + assert(token->magic == TOK_MAGIC_1); + return token_data(token)->data.f; } const char *token_string(token_t *token) { - assert(token-&gt;kind == TOK_CONST_STRING_ASCII); - assert(token-&gt;magic == TOK_MAGIC_1); - return token_data(token)-&gt;data.s; + assert(token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID); + assert(token->magic == TOK_MAGIC_1); + return token_data(token)->data.s; } char token_char(token_t *token) { - assert(token-&gt;kind == TOK_CONST_CHAR); - assert(token-&gt;magic == TOK_MAGIC_1); - return token_data(token)-&gt;data.c; + assert(token->kind == TOK_CONST_CHAR); + assert(token->magic == TOK_MAGIC_1); + return token_data(token)->data.c; } int token_line(token_t *token) { - assert(token-&gt;magic == TOK_MAGIC_1 || token-&gt;magic == TOK_MAGIC_2); - return token-&gt;line; + assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2); + return token->line; } int token_column(token_t *token) { - assert(token-&gt;magic == TOK_MAGIC_1 || token-&gt;magic == TOK_MAGIC_2); - return token-&gt;column; + assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2); + return token->column; } void token_destroy(token_t *token) { - assert(token-&gt;magic == TOK_MAGIC_1 || token-&gt;magic == TOK_MAGIC_2); - if (token-&gt;magic == TOK_MAGIC_1) { - if (token-&gt;kind == TOK_CONST_STRING_ASCII) { - 
free((void *)token_data(token)-&gt;data.s); - } - } + assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2); + // Don't free the string table, it's a global variable free(token); }</code></pre> </div> @@ -755,7 +762,10 @@ void token_destroy(token_t *token) { <span class="block-header"> <strong class="block-title"><em><a id=":tokenizer-implementation" href="#:tokenizer-implementation">Tokenizer implementation</a></em></strong></span> - <pre class="prettyprint"><code class="">token_t *unget_token = NULL; + <pre class="prettyprint"><code class="">int line = 1; +int column = 1; +char file_name[1024]; +token_t *unget_token = NULL; void tokenizer_init(const char *filename) { input_init(filename); @@ -774,7 +784,25 @@ void tokenizer_destroy(void) { hash_table_destroy(string_table); } -}</code></pre> +} + +token_t *read_token(void) { + if (unget_token != NULL) { + token_t *tok = unget_token; + unget_token = NULL; + return tok; + } + // What goes here? + int c = input_getc(); + if (c == EOF) { + return NULL; + } + tok_warn( + "Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c, + line, column); + return read_token(); +} +</code></pre> </div> <p>Now we can implement the meat of the tokenizer. The general structure is as follows: </p> <ol> @@ -797,7 +825,7 @@ void tokenizer_destroy(void) <span class="block-header"> <strong class="block-title"><em><a id=":skip-whitespace" href="#:skip-whitespace">Skip whitespace</a></em></strong></span> - <pre class="prettyprint"><code class="">static void skip_whitespace(void) { + <pre class="prettyprint"><code class="">static token_t *skip_whitespace(void) { int c; while ((c = input_getc()) != EOF) { if (isspace(c)) { @@ -807,6 +835,23 @@ void tokenizer_destroy(void) } else { column++; } + } else if (c == '#') // GCC preprocessor line control directive. 
+ { + char buf[512]; + int i = 0; + while ((c = input_getc()) != EOF && c != '\n') { + buf[i++] = c; + column++; + } + buf[i] = '\0'; + if (sscanf(buf, "%d \"%[^\"]\"", &line, file_name) == 2) { + column = 1; + } else { + tok_error("Invalid #line directive\n"); + } + if (c == EOF) { + return NULL; + } } else if (c == '/') { c = input_getc(); if (c == '/') { @@ -814,7 +859,7 @@ void tokenizer_destroy(void) column++; } if (c == EOF) { - return; + return NULL; } line++; column = 1; @@ -833,17 +878,20 @@ void tokenizer_destroy(void) } } if (c == EOF) { - return; + return NULL; } } else { + if (c == '=') + return token_create(TOK_OP_ASSIGN_DIV, line, column, 2); input_ungetc(c); - return; + return token_create(TOK_OP_DIV, line, column, 1); } } else { input_ungetc(c); - return; + return NULL; } } + return NULL; }</code></pre> </div> <p>Now we'll implement code to recognize keywords and identifiers. The rules for identifiers are that they must @@ -858,95 +906,1190 @@ void tokenizer_destroy(void) char buf[1024]; int i = 0; c = input_getc(); - if (!isalpha(c) &amp;&amp; c != '_') { + if (!isalpha(c) && c != '_') { input_ungetc(c); return NULL; } buf[i++] = c; while ((c = input_getc()) != EOF) { - if (!isalnum(c) &amp;&amp; c != '_') { + if (!isalnum(c) && c != '_') { input_ungetc(c); break; } buf[i++] = c; + if (i >= 1008) { + tok_error("Identifier too long\n"); + exit(1); + } } buf[i] = '\0'; - + column += i; // Check if it's a keyword - c_token_types kind = get_keyword(buf); - - if (kind == TOK_ID) { - // Check if it's a typedeffed identifier - int len = strlen(buf); - if (len &gt; 2 &amp;&amp; buf[len - 2] == '_' &amp;&amp; buf[len - 1] == 't') { - kind = TOK_TID; - } + c_token_types kind = get_keyword(buf, i); + if (kind != TOK_ID) { + return token_create(kind, line, column, i); } - return token_create_string(kind, line, column, buf); + return token_create_string(kind, line, column, buf, i); }</code></pre> </div> + <p> We have to match the following keywords: </p> + 
<ul> + <li>Control Keywords: <code>if</code>, <code>else</code>, <code>switch</code>, <code>case</code>, + <code>default</code>, <code>while</code>, <code>do</code>, <code>for</code>, <code>continue</code>, + <code>return</code>, <code>goto</code> + </li> + <li>Type Keywords: <code>void</code>, <code>char</code>, <code>short</code>, <code>int</code>, <code>long</code>, + <code>float</code>, <code>double</code>, <code>unsigned</code>, <code>struct</code>, <code>union</code>, + <code>enum</code>, <code>typedef</code> + </li> + <li>Storage Class/Specifier Keywords: <code>auto</code>, <code>register</code>, <code>static</code>, + <code>extern</code>, <code>const</code>, <code>volatile</code> + </li> + <li>Misc Keywords: <code>sizeof</code></li> + </ul> <p>For recognizing keywords, I'll use a decision tree. This is a simple and space-efficient way to recognize keywords. Originally I wanted to use a trie, but those end up using a lot of memory for a small number of items. </p> - <p> To generate the decision tree, I'll use a simple C program which works off the following template: </p> - <div class="code-block"> - <span class="file-name">N/A</span> - <span class="block-header"> - <strong class="block-title"><em><a id=":keyword-decision-tree" href="#:keyword-decision-tree">Keyword - decision tree</a></em></strong></span> - <pre class="prettyprint"><code class="">c_token_types get_keyword(const char *buf) { + <p> To build that, we first need to figure out what keywords have common prefixes. 
We can do that by sorting them: + </p> + <div class="code-block"> + <span class="file-name">N/A</span> + <span class="block-header"> + <strong class="block-title"><em><a id=":keyword-sort" href="#:keyword-sort">Keyword + sort</a></em></strong></span> + <pre class="prettyprint"><code class="">auto +break +case +char +const +continue +default +do +double +else +enum +extern +float +for +goto +if +int +long +register +return +short +signed +sizeof +static +struct +switch +typedef +union +unsigned +void +volatile +while</code></pre> + </div> + <p>From this, the organization is pretty clear. </p> + <div class="code-block"> + <span class="file-name">N/A</span> + <span class="block-header"> + <strong class="block-title"><em><a id=":keyword-decision-tree" href="#:keyword-decision-tree">Keyword + decision tree</a></em></strong></span> + <pre class="prettyprint"><code class="">c_token_types get_keyword(const char *buf, int len) { switch (buf[0]) { - case [first letter]: - switch (buf[1]) { - case [second letter]: - switch (buf[2]) { - case [third letter]: - return [keyword]; - default: - return TOK_ID; - } - default: - return TOK_ID; + case 'a': + if (len == 4 && buf[1] == 'u' && buf[2] == 't' && buf[3] == 'o') + return TOK_SCSK_AUTO; + break; + + case 'b': + if (len == 5 && buf[1] == 'r' && buf[2] == 'e' && buf[3] == 'a' && + buf[4] == 'k') + return TOK_CTK_BREAK; + break; + + case 'c': + switch (buf[1]) { + case 'a': + if (len == 4 && buf[2] == 's' && buf[3] == 'e') + return TOK_CTK_CASE; + break; + case 'h': + if (len == 4 && buf[2] == 'a' && buf[3] == 'r') + return TOK_TK_CHAR; + break; + case 'o': + if (len == 5 && buf[2] == 'n' && buf[3] == 's' && buf[4] == 't') + return TOK_SCSK_CONST; + if (len == 8 && buf[2] == 'n' && buf[3] == 't' && buf[4] == 'i' && + buf[5] == 'n' && buf[6] == 'u' && buf[7] == 'e') + return TOK_CTK_CONTINUE; + break; + } + break; + + case 'd': + switch (buf[1]) { + case 'e': + if (len == 7 && buf[2] == 'f' && buf[3] == 'a' && buf[4] == 'u' && + 
buf[5] == 'l' && buf[6] == 't') + return TOK_CTK_DEFAULT; + break; + case 'o': + if (len == 2 && buf[2] == '\0') + return TOK_CTK_DO; + if (len == 6 && buf[2] == 'u' && buf[3] == 'b' && buf[4] == 'l' && + buf[5] == 'e') + return TOK_TK_DOUBLE; + break; + } + break; + + case 'e': + switch (buf[1]) { + case 'l': + if (len == 4 && buf[2] == 's' && buf[3] == 'e') + return TOK_CTK_ELSE; + break; + case 'n': + if (len == 4 && buf[2] == 'u' && buf[3] == 'm') + return TOK_TK_ENUM; + break; + case 'x': + if (len == 6 && buf[2] == 't' && buf[3] == 'e' && buf[4] == 'r' && + buf[5] == 'n') + return TOK_SCSK_EXTERN; + break; + } + break; + + case 'f': + switch (buf[1]) { + case 'l': + if (len == 5 && buf[2] == 'o' && buf[3] == 'a' && buf[4] == 't') + return TOK_TK_FLOAT; + break; + case 'o': + if (len == 3 && buf[2] == 'r') + return TOK_CTK_FOR; + break; + } + break; + + case 'g': + if (len == 4 && buf[1] == 'o' && buf[2] == 't' && buf[3] == 'o') + return TOK_CTK_GOTO; + break; + + case 'i': + switch (buf[1]) { + case 'f': + if (len == 2 && buf[2] == '\0') + return TOK_CTK_IF; + break; + case 'n': + if (len == 3 && buf[2] == 't') + return TOK_TK_INT; + break; + } + break; + + case 'l': + if (len == 4 && buf[1] == 'o' && buf[2] == 'n' && buf[3] == 'g') + return TOK_TK_LONG; + break; + + case 'r': + switch (buf[1]) { + case 'e': + if (len == 8 && buf[2] == 'g' && buf[3] == 'i' && buf[4] == 's' && + buf[5] == 't' && buf[6] == 'e' && buf[7] == 'r') + return TOK_SCSK_REGISTER; + if (len == 6 && buf[2] == 't' && buf[3] == 'u' && buf[4] == 'r' && + buf[5] == 'n') + return TOK_CTK_RETURN; + break; + } + break; + + case 's': + switch (buf[1]) { + case 'h': + if (len == 5 && buf[2] == 'o' && buf[3] == 'r' && buf[4] == 't') + return TOK_TK_SHORT; + break; + case 't': + if (len == 6 && buf[2] == 'a' && buf[3] == 't' && buf[4] == 'i' && + buf[5] == 'c') + return TOK_SCSK_STATIC; + break; + case 'i': + if (len == 6 && buf[2] == 'g' && buf[3] == 'n' && buf[4] == 'e' && + buf[5] == 'd') + 
return TOK_TK_SIGNED; + if (len == 6 && buf[2] == 'z' && buf[3] == 'e' && buf[4] == 'o' && + buf[5] == 'f') + return TOK_MK_SIZEOF; + break; + case 'r': + if (len == 6 && buf[2] == 'u' && buf[3] == 'c' && buf[4] == 't') + return TOK_TK_STRUCT; + break; + case 'w': + if (len == 6 && buf[2] == 'i' && buf[3] == 't' && buf[4] == 'c' && + buf[5] == 'h') + return TOK_CTK_SWITCH; + break; + } + break; + + case 't': + if (len == 7 && buf[1] == 'y' && buf[2] == 'p' && buf[3] == 'e' && + buf[4] == 'd' && buf[5] == 'e' && buf[6] == 'f') + return TOK_TK_TYPEDEF; + break; + + case 'u': + switch (buf[1]) { + case 'n': + if (len == 5 && buf[2] == 'i' && buf[3] == 'o' && buf[4] == 'n') + return TOK_TK_UNION; + if (len == 8 && buf[2] == 's' && buf[3] == 'i' && buf[4] == 'g' && + buf[5] == 'n' && buf[6] == 'e' && buf[7] == 'd') + return TOK_TK_UNSIGNED; + break; + } + break; + + case 'v': + switch (buf[1]) { + case 'o': + if (len == 4 && buf[2] == 'i' && buf[3] == 'd') + return TOK_TK_VOID; + if (len == 8 && buf[2] == 'l' && buf[3] == 'a' && buf[4] == 't' && + buf[5] == 'i' && buf[6] == 'l' && buf[7] == 'e') + return TOK_SCSK_VOLATILE; + break; + } + break; + + case 'w': + if (len == 5 && buf[1] == 'h' && buf[2] == 'i' && buf[3] == 'l' && + buf[4] == 'e') + return TOK_CTK_WHILE; + break; + + default: + return TOK_ID; + } + return TOK_ID; +}</code></pre> + </div> + <p> It's very tedious, but it performs pretty well. You might wonder why I didn't use strcmp, and the answer is that + this approach turned out faster in testing, plus, considering I want this to be self-hosting, I can't rely on + optimizations in the standard library. </p> + <p>We'll use the same approach for recognizing operators. Let's list both out (Table 2-3 in the C + Reference Manual). 
</p> + <div class="code-block"> + <span class="file-name">N/A</span> + <span class="block-header"> + <strong class="block-title"><em><a id=":operator" href="#:operators">Operators</a></em></strong></span> + <pre class="prettyprint"><code class="">! +!= +% +%= +&amp; +&amp;&amp; +&amp;= +( +) +* +*= ++ +++ ++= +, +- +-- +-= +-&gt; +. +... +/ +/= +: +; +&lt; +&lt;&lt; +&lt;&lt;= +&lt;= += +== +&gt; +&gt;= +&gt;&gt; +&gt;&gt;= +? +[ +] +^ +^= +{ +| +|= +|| +} +~</code></pre> + </div> + <p>And the decision tree for operators: </p> + <div class="code-block"> + <span class="file-name">tokenizer.c</span> + <span class="block-header"> + <strong class="block-title"><em><a id=":operator-decision-tree" href="#:operator-decision-tree">Operator + decision tree</a></em></strong></span> + <pre class="prettyprint"><code class="">token_t *read_operator(void) { + int c; + c = input_getc(); + switch (c) { + case '!': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_NE, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_NOT, line, column, 1); + } + case '%': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_MOD, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_MOD, line, column, 1); + } + case '&': { + c = input_getc(); + if (c == '&') + return token_create(TOK_OP_AND, line, column, 2); + if (c == '=') + return token_create(TOK_OP_ASSIGN_BITAND, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_BIT_AND, line, column, 1); + } + case '(': + return token_create(TOK_SEP_LEFT_PAREN, line, column, 1); + case ')': + return token_create(TOK_SEP_RIGHT_PAREN, line, column, 1); + case '*': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_MUL, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_MUL, line, column, 1); + } + case '+': { + c = input_getc(); + if (c == '+') + return token_create(TOK_OP_INC, line, column, 2); + if (c == '=') + return 
token_create(TOK_OP_ASSIGN_ADD, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_ADD, line, column, 2); + } + case ',': + return token_create(TOK_SEP_COMMA, line, column, 1); + case '-': { + c = input_getc(); + if (c == '-') + return token_create(TOK_OP_DEC, line, column, 2); + if (c == '=') + return token_create(TOK_OP_ASSIGN_SUB, line, column, 2); + if (c == '>') + return token_create(TOK_OP_MEMBER_POINTER, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_SUB, line, column, 1); + } + case '.': { + c = input_getc(); + if (c == '.') { + c = input_getc(); + if (c == '.') { + return token_create(TOK_SEP_ELLIPSIS, line, column, 3); + } else { + // Bail out, can't store more than one unget + tok_error("Unexpected character '.' at line %d, column %d\n", line, + column); + exit(1); } - default: - return TOK_ID; + } + return token_create('.', line, column, 1); + } + case '/': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_DIV, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_DIV, line, column, 1); + } + case ':': + return token_create(TOK_OP_COND_DECISION, line, column, 1); + case ';': + return token_create(TOK_SEP_SEMICOLON, line, column, 1); + case '<': { + c = input_getc(); + if (c == '<') { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_LSHIFT, line, column, 3); + input_ungetc(c); + return token_create(TOK_OP_LSHIFT, line, column, 2); + } + if (c == '=') + return token_create(TOK_OP_LE, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_LT, line, column, 1); + } + case '=': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_ASSIGN, line, column, 1); + } + case '>': { + c = input_getc(); + if (c == '>') { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_RSHIFT, line, column, 3); + input_ungetc(c); + return token_create(TOK_OP_RSHIFT, line, 
column, 2); + } + if (c == '=') + return token_create(TOK_OP_GE, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_GT, line, column, 1); + } + case '?': + return token_create(TOK_OP_COND, line, column, 1); + case '[': + return token_create(TOK_SEP_LEFT_BRACKET, line, column, 1); + case ']': + return token_create(TOK_SEP_RIGHT_BRACKET, line, column, 1); + case '^': { + c = input_getc(); + if (c == '=') + return token_create(TOK_OP_ASSIGN_BITXOR, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_BIT_XOR, line, column, 1); + } + case '{': + return token_create(TOK_SEP_LEFT_BRACE, line, column, 1); + case '|': { + c = input_getc(); + if (c == '|') + return token_create(TOK_OP_OR, line, column, 2); + if (c == '=') + return token_create(TOK_OP_ASSIGN_BITOR, line, column, 2); + input_ungetc(c); + return token_create(TOK_OP_BIT_OR, line, column, 1); + } + case '}': + return token_create(TOK_SEP_RIGHT_BRACE, line, column, 1); + case '~': + return token_create(TOK_OP_BIT_NOT, line, column, 1); + default: + input_ungetc(c); + return NULL; + } +} +</code></pre> + </div> + <p>Now we can implement the functions for reading numbers. 
C recognizes the following grammar for numbers (if you + haven't seen a formal grammar before, you read '::=' as "can be", '|' as "or", and '[]' as "optional", so an + integer constant can be a decimal constant with an optional suffix, an octal constant with an optional suffix, or + a hexadecimal constant with an optional suffix): </p> + <div + style="color: #333; background-color: #f8f8f8; border: 1px solid #ccc; border-radius: 4px; margin: 20px 0; padding: 20px;"> + <pre class="prettyprint"><code class="">constant ::= integer-constant | floating-constant + + integer-constant ::= decimal-constant [integer-suffix] + | octal-constant [integer-suffix] + | hexadecimal-constant [integer-suffix] + + decimal-constant ::= nonzero-digit [digit-sequence] | '0' + + octal-constant ::= '0' octal-digit-sequence + + hexadecimal-constant ::= '0x' hexadecimal-digit-sequence + | '0X' hexadecimal-digit-sequence + + integer-suffix ::= 'u' | 'U' | 'l' | 'L' + | 'ul' | 'UL' | 'lu' | 'LU' + + floating-constant ::= fractional-constant [exponent-part] [floating-suffix] + | digit-sequence exponent-part [floating-suffix] + + fractional-constant ::= digit-sequence '.' [digit-sequence] + | '.' digit-sequence + + exponent-part ::= 'e' [sign] digit-sequence + | 'E' [sign] digit-sequence + + sign ::= '+' | '-' + + floating-suffix ::= 'f' | 'F' + + digit-sequence ::= digit [digit-sequence] + + nonzero-digit ::= '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' + + octal-digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' + + hexadecimal-digit ::= digit + | 'a' | 'b' | 'c' | 'd' | 'e' | 'f' + | 'A' | 'B' | 'C' | 'D' | 'E' | 'F' + + digit ::= '0' | nonzero-digit +</code></pre> + </div> + <p>Where the following semantics apply: </p> + <ul> + <li>Integer constants are evaluated according to their radix, where <code>decimal-constant</code> is base 10, + <code>octal-constant</code> is base 8, and <code>hexadecimal-constant</code> is base 16. 
+ </li> + <li>Integer suffixes are used to specify the type of the constant. A suffix of <code>u</code> or <code>U</code> + specifies an unsigned integer, and a suffix of <code>l</code> or <code>L</code> specifies a long integer. </li> + <li>Suffixes can be combined, so <code>ul</code> specifies an unsigned long integer. </li> + <li>Floating constants are evaluated as double precision unless a suffix of <code>f</code> or <code>F</code> is + used, in which case they are evaluated as single precision. </li> + </ul> + <p> Looking at these rules, we can see that all valid integer constants start with a digit from 0-9 or a period. + Since periods are also used for struct members, we'll have to handle that as well. </p> + <p> The code for reading numbers is pretty complex, so I'll comment it heavily. </p> + <div class="code-block"> + <span class="file-name">tokenizer.c</span> + <span class="block-header"> + <strong class="block-title"><em><a id=":read-number" href="#:read-number">Read + number</a></em></strong></span> + <pre class="prettyprint"><code class=""> +static token_t *read_number(void) { + int c; + char buf[1024]; + int i = 0; + c = input_getc(); + // If we don't have a digit or decimal point, it's not a number + if (!isdigit(c) && c != '.') { + input_ungetc(c); + return NULL; + } + // Decimal point followed by non-digit is a struct member + if (c == '.') { + char cnext = input_getc(); + if (!isdigit(cnext)) { + input_ungetc(cnext); + return token_create(TOK_OP_MEMBER, line, column, 1); + } + input_ungetc(cnext); + } + int radix = 10; + int is_float = 0; + // Check for hex and octal. 
+ if (c == '0') { + char cnext = input_getc(); + if (cnext == 'x' || cnext == 'X') { + // Hex, discard the 0x + radix = 16; + } else { + // Octal, discard the 0 + input_ungetc(cnext); + radix = 8; + } + } else { + // Decimal, append the first digit + buf[i++] = c; + } + // Read the rest of the number + while ((c = input_getc()) != EOF) { + // Since there can be multiple writes to the buffer, we want to make sure we + // don't overflow by giving a 4 byte pad + if (i > 1020) { + tok_error("Number too long\n"); + return NULL; + } + // Valid digits for the radix: 0-9 for decimal, 0-7 for octal, 0-9 and + // a-f/A-F for hex + if ((radix == 10 && isdigit(c)) || (radix == 16 && isxdigit(c)) || + (radix == 8 && c >= '0' && c <= '7')) { + buf[i++] = c; + // Decimal point and not a float yet, must be a float + } else if (c == '.' && !is_float) { + is_float = 1; + if (radix != 10) { + tok_error("Invalid floating point constant, expected decimal, got %s\n", + radix == 16 ? "hexadecimal" : "octal"); + return NULL; + } + buf[i++] = c; + } + // Exponent on the end of a constant. (By standard this forces it to be a + // float) + else if (c == 'e' || c == 'E') { + buf[i++] = c; + c = input_getc(); + // Sign on the exponent + if (c == '+' || c == '-') { + buf[i++] = c; + c = input_getc(); + } + // Exponent must be a digit, I.E no 1e1.2 + if (!isdigit(c)) { + tok_error("Invalid floating point exponent\n"); + return NULL; + } + buf[i++] = c; + is_float = 1; + } else { + // Reached the end, unget the character so other functions can read it + input_ungetc(c); + break; + } + } + buf[i] = '\0'; + // Check for suffixes + int is_unsigned = 0; + int is_long = 0; + int is_single = 0; + // Loop to get all possible suffixes. Warn when duplicated. 
+ while (1) { + c = input_getc(); + if (c == 'u' || c == 'U') { + if (is_unsigned) { + tok_warn( + "Warning: Duplicate suffix 'u' for integer constant ignored\n"); + } + is_unsigned = 1; + } else if (c == 'l' || c == 'L') { + if (is_long) { + tok_warn( + "Warning: Duplicate suffix 'l' for integer constant ignored\n"); + } + is_long = 1; + } else if (c == 'f' || c == 'F') { + if (is_single) { + tok_warn("Warning: Duplicate suffix 'f' for floating point constant " + "ignored\n"); + } + is_single = 1; + } else { + input_ungetc(c); + break; + } + } + // Resolve invalid suffixes. Doesn't error because you can still compile with + // them. + if (is_single && is_long) { + tok_warn("Warning: Invalid suffixes 'l' and 'f' for floating point " + "constant. Ignoring 'l'\n"); + is_long = 0; + } + if (is_single && is_unsigned) { + tok_warn("Warning: Invalid suffixes 'u' and 'f' for floating point " + "constant. Ignoring 'u'\n"); + is_unsigned = 0; + } + if (is_single && !is_float) { + tok_warn( + "Warning: Invalid suffix 'f' for integer constant. Ignoring 'f'\n"); + is_single = 0; } + // use the strtox functions to convert the string to a number + if (is_float) { + errno = 0; + // Strtod generates a unix-style error when it's given something out of + // range, so we want to get on top of that quickly instead of ignoring it + // That way we can avoid some nasty NAN-propagation in the constant folder. 
+ double f = strtod(buf, NULL); + if (errno == ERANGE) { + tok_error("Floating point constant out of range\n"); + return NULL; + } + // Warn if the constant is out of range for a float, I.E it's too big or too + // small + if (is_single && (f < FLT_MIN || f > FLT_MAX)) { + tok_warn( + "Warning: Floating point constant %f is out of range for float\n", f); + } + // Warn if the constant is too precise for a float + if (is_single && fabs((double)((float)f) - f) >= FLT_EPSILON) { + tok_warn("Warning: Converting double precision floating point constant " + "%f to float loses " + "precision\n", + f); + } + return token_create_float(is_single ? TOK_CONST_FLOAT_32 + : TOK_CONST_FLOAT_64, + line, column, f, i); + } else { + errno = 0; + uint64_t int_ = strtoull(buf, NULL, radix); + // Same as above, but for integers + if (errno == ERANGE) { + tok_error("Integer constant out of range\n"); + return NULL; + } + if (is_unsigned) { + if (is_long) { + return token_create_int(TOK_CONST_INTEGER_U64, line, column, int_, i); + } else { + if (int_ > UINT32_MAX) { + tok_warn( + "Warning: Integer constant %lld is out of range for unsigned " + "int\n", + int_); + } + return token_create_int(TOK_CONST_INTEGER_U32, line, column, int_, i); + } + } else { + if (is_long) { + // If the highest bit is set, that means this will overflow a signed + // long (Due to two's complement) + if (int_ & (1UL << 63)) { + tok_warn( + "Warning: Integer constant %lld is out of range for long long\n", + i); + } + return token_create_int(TOK_CONST_INTEGER_S64, line, column, int_, i); + } else { + if (int_ & (1UL << 31)) { + tok_warn("Warning: Integer constant %lld is out of range for int\n", + int_); + } + return token_create_int(TOK_CONST_INTEGER_S32, line, column, int_, i); + } + } + } + return NULL; }</code></pre> - </div> - <p>For example, if we just fed the program the keyword "if", it would output the following: </p> - <div class="code-block"> - <span class="file-name">N/A</span> - <span 
class="block-header"> - <strong class="block-title"><em><a id=":if-keyword" href="#:if-keyword">If - keyword</a></em></strong></span> - <pre class="prettyprint"><code class="">c_token_types get_keyword(const char *buf) { - switch (buf[0]) { - case 'i': - switch (buf[1]) { - case 'f': - switch (buf[2]) { - case '\0': - return TOK_CTK_IF; - default: - return TOK_ID; - } - default: - return TOK_ID; + </div> + <p>Now let's implement char constants. The rules for those are pretty simple, as I'm not doing multi-byte or wide + characters. </p> + <div + style="color: #333; background-color: #f8f8f8; border: 1px solid #ccc; border-radius: 4px; margin: 20px 0; padding: 20px;"> + <pre class="prettyprint"><code class="">character-constant ::= '\'' c-char-sequence '\'' +c-char-sequence ::= c-char | c-char-sequence c-char +c-char ::= any member of the source character set except the single-quote ', backslash \, or new-line character | escape-sequence + +escape-sequence ::= simple-escape-sequence | octal-escape-sequence | hexadecimal-escape-sequence +simple-escape-sequence ::= '\' ( 'a' | 'b' | 'f' | 'n' | 'r' | 't' | 'v' | '\'' | '"' | '?' | '\') +octal-escape-sequence ::= '\' octal-digit | '\' octal-digit octal-digit | '\' octal-digit octal-digit octal-digit +hexadecimal-escape-sequence ::= '\' 'x' hexadecimal-digit | hexadecimal-escape-sequence hexadecimal-digit + +octal-digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' +hexadecimal-digit ::= digit | 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'A' | 'B' | 'C' | 'D' | 'E' | 'F' +digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'</code></pre> + </div> + <p>We start with the code to read escape sequences (which can be reused for string literals). 
</p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":read-escape-sequence" href="#:read-escape-sequence">Read
+ escape sequence</a></em></strong></span>
+ <pre class="prettyprint"><code class="">static char read_escape_sequence(int *len) {
+ int c = input_getc();
+ *len += 1;
+ switch (c) {
+ case 'a':
+ return '\a';
+ case 'b':
+ return '\b';
+ case 'f':
+ return '\f';
+ case 'n':
+ return '\n';
+ case 'r':
+ return '\r';
+ case 't':
+ return '\t';
+ case 'v':
+ return '\v';
+ case '\'':
+ return '\'';
+ case '"':
+ return '"';
+ case '?':
+ return '?';
+ case '\\':
+ return '\\';
+ case 'x': {
+ c = input_getc();
+ if (!isxdigit(c)) {
+ tok_error("Invalid hexadecimal escape sequence\n");
+ return 0;
+ }
+ int val = 0;
+ while (isxdigit(c)) {
+ *len += 1;
+ val = val * 16 + (isdigit(c) ? c - '0' : tolower(c) - 'a' + 10);
+ c = input_getc();
+ }
+ input_ungetc(c);
+ return (char)val;
+ }
+ default:
+ // Octal escapes: '\0' through '\377'. Only digits 0-7 are valid octal
+ // digits, so '\8' and '\9' are rejected, and escapes that start with
+ // '0' (like '\012') fall through here and read all their digits.
+ if (c < '0' || c > '7') {
+ tok_error("Invalid escape sequence\n");
+ return 0;
+ }
+ int val = 0;
+ while (c >= '0' && c <= '7') {
+ *len += 1;
+ val = val * 8 + c - '0';
+ c = input_getc();
+ }
+ input_ungetc(c);
+ return (char)val;
+ }
+}</code></pre>
+ </div>
+ <p>Now we can implement the code to read character constants. 
</p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":read-char-constant" href="#:read-char-constant">Read
+ character constant</a></em></strong></span>
+ <pre class="prettyprint"><code class="">static token_t *read_char_constant(void) {
+ int c;
+ int len = 0;
+ c = input_getc();
+ if (c != '\'') {
+ input_ungetc(c);
+ return NULL;
+ }
+ len++;
+ c = input_getc();
+ // Count the character itself (for escapes this counts the backslash;
+ // read_escape_sequence counts the rest of the sequence).
+ len++;
+ if (c == '\'') {
+ tok_error("Empty character constant\n");
+ return NULL;
+ }
+ if (c == '\\') {
+ c = read_escape_sequence(&len);
+ }
+ int val = c;
+ c = input_getc();
+ if (c != '\'') {
+ tok_error("Expected closing quote for character constant\n");
+ return NULL;
+ }
+ len++;
+ return token_create_char(TOK_CONST_CHAR, line, column, val, len);
+}</code></pre>
+ </div>
+ <p>Finally, we can implement string literals. </p>
+ <p>The grammar for string literals is pretty simple. </p>
+ <div
+ style="color: #333; background-color: #f8f8f8; border: 1px solid #ccc; border-radius: 4px; margin: 20px 0; padding: 20px;">
+ <pre
+ class="prettyprint"><code class="">string-literal ::= '"' s-char-sequence '"'
+s-char-sequence ::= s-char | s-char-sequence s-char
+s-char ::= any member of the source character set except the double-quote ", backslash \, or new-line character | escape-sequence</code></pre>
+ </div>
+ <p>What that means for us is that we just have to skip over the string until we hit an unescaped double quote. 
</p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":read-string-literal" href="#:read-string-literal">Read
+ string literal</a></em></strong></span>
+ <pre class="prettyprint"><code class="">static token_t *read_string_literal(void) {
+ int c;
+ c = input_getc();
+ if (c != '"') {
+ input_ungetc(c);
+ return NULL;
+ }
+ int i = 0;
+ // Malloc is used for the buf here, the pointer stays function local as string
+ // interning duplicates the string
+ char s_buf[512];
+ char *buf = s_buf;
+ int len = 512;
+ int esc_len = 0; // read_escape_sequence tracks length through this
+ while ((c = input_getc()) != EOF) {
+ if (c == '"') {
+ // Implicit skip of closing quote
+ break;
+ }
+ if (c == '\\') {
+ c = read_escape_sequence(&esc_len);
+ if (c == 0) {
+ return NULL;
+ }
+ }
+ // Grow when nearly full, leaving one byte spare for the NUL terminator
+ if (i + 1 >= len) {
+ if (buf == s_buf) {
+ buf = malloc(1024);
+ if (buf == NULL) {
+ fputs("Out of memory. Could not parse string literal.\n", stderr);
+ exit(1);
+ }
+ memcpy(buf, s_buf, 512);
+ len *= 2;
+ } else {
+ len *= 2;
+ buf = realloc(buf, len);
+ if (buf == NULL) {
+ fputs("Out of memory. Could not parse string literal.\n", stderr);
+ exit(1);
+ }
+ }
+ }
+ buf[i++] = c;
+ }
+ buf[i] = '\0';
+ if (c == EOF) {
+ tok_error("Unterminated string literal\n");
+ if (buf != s_buf) {
+ free(buf);
+ }
+ return NULL;
+ }
+
+ token_t *tok =
+ token_create_string(TOK_CONST_STRING_ASCII, line, column, buf, i);
+ if (buf != s_buf) {
+ free(buf);
+ }
+ return tok;
+}</code></pre>
+ </div>
+ <p>This function uses a smaller buffer on the stack to store the string, and if it grows too large, it allocates a
+ larger buffer on the heap. This helps to avoid unnecessary allocations for small strings. </p>
+ <p>Let's recap. We've implemented code for recognizing keywords, operators, numbers, character constants, and string
+ literals. That's everything we need for the main function. We just call them all in order. 
</p> + <div class="code-block"> + <span class="file-name">tokenizer.c</span> + <span class="block-header"> + <strong class="block-title"><em><a id=":main-loop" href="#:main-loop">Main + loop</a></em></strong></span> + <pre class="prettyprint"><code class="">token_t *read_token(void) { + if (unget_token != NULL) { + token_t *tok = unget_token; + unget_token = NULL; + return tok; + } + token_t *tok = skip_whitespace(); + if (tok != NULL) { + return tok; + } + tok = read_identifier(); + if (tok != NULL) { + return tok; } + tok = read_number(); + if (tok != NULL) { + return tok; + } + tok = read_char_constant(); + if (tok != NULL) { + return tok; + } + tok = read_string_literal(); + if (tok != NULL) { + return tok; + } + tok = read_operator(); + if (tok != NULL) { + return tok; + } + int c = input_getc(); + if (c == EOF) { + return NULL; + } + tok_warn( + "Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c, + line, column); + return read_token(); } </code></pre> - </div> - <p>The program will be placed into the main tokenizer.c file using the HIGH QUALITY TECHNIQUE of including a c file - in another c file. </p> - <p> Time to write the program. </p> - + </div> + <p>With that, we're basically done.</p> + <h3>Error Handling</h3> + <p>One thing we haven't covered yet is error handling. We've used <code>tok_error</code> and <code>tok_warn</code> + to print errors and warnings, but we haven't actually implemented them. Let's do that now. </p> + <div class="code-block"> + <span class="file-name">tokenizer.c</span> + <span class="block-header"> + <strong class="block-title"><em><a id=":error-handling" href="#:error-handling">Error + handling</a></em></strong></span> + <pre class="prettyprint"><code class="">void tok_error(const char *fmt, ...) 
{ + va_list args; + va_start(args, fmt); + fprintf(stderr, "Error in file %s at line %d, column %d: ", file_name, line, + column); + vfprintf(stderr, fmt, args); + va_end(args); +} +void tok_warn(const char *fmt, ...) { + va_list args; + va_start(args, fmt); + fprintf(stderr, "Warning in file %s at line %d, column %d: ", file_name, line, + column); + vfprintf(stderr, fmt, args); + va_end(args); +}</code></pre> + </div> + <h3>Debugging</h3> + <p>It'd be helpful to have a way to print out the tokens we're reading. Let's add a function for that. </p> + <div class="code-block"> + <span class="file-name">tokenizer.c</span> + <span class="block-header"> + <strong class="block-title"><em><a id=":print-token" href="#:print-token">Print + token</a></em></strong></span> + <pre class="prettyprint"><code class="">char *re_escape_string(const char *str) { + int len = strlen(str); + char *buf = malloc(len * 2 + 1); + if (buf == NULL) { + fprintf(stderr, "Out of memory. Cannot escape string\n"); + exit(1); + } + int i = 0; + for (int j = 0; j < len; j++) { + switch (str[j]) { + case '\a': + buf[i++] = '\\'; + buf[i++] = 'a'; + break; + case '\b': + buf[i++] = '\\'; + buf[i++] = 'b'; + break; + case '\f': + buf[i++] = '\\'; + buf[i++] = 'f'; + break; + case '\n': + buf[i++] = '\\'; + buf[i++] = 'n'; + break; + case '\r': + buf[i++] = '\\'; + buf[i++] = 'r'; + break; + case '\t': + buf[i++] = '\\'; + buf[i++] = 't'; + break; + case '\v': + buf[i++] = '\\'; + buf[i++] = 'v'; + break; + case '\\': + buf[i++] = '\\'; + buf[i++] = '\\'; + break; + case '\'': + buf[i++] = '\\'; + buf[i++] = '\''; + break; + case '"': + buf[i++] = '\\'; + buf[i++] = '"'; + break; + default: + buf[i++] = str[j]; + break; + } + } + buf[i] = '\0'; + return buf; +} + +void print_token(token_t *tok) { + if (tok == NULL) { + printf("NULL\n"); + return; + } + const char *name = token_name_from_type(tok->kind); + switch (tok->kind) { + case TOK_ID: + case TOK_CONST_STRING_ASCII: { + char *escaped = 
re_escape_string(token_string(tok)); + printf("%s: \"%s\"@%d:%d\n", name, escaped, tok->line, tok->column); + free(escaped); + break; + } + case TOK_CONST_CHAR: + printf("%s: '%c'@%d:%d\n", name, token_char(tok), tok->line, tok->column); + break; + case TOK_CONST_INTEGER_S32: + case TOK_CONST_INTEGER_U32: + case TOK_CONST_INTEGER_S64: + case TOK_CONST_INTEGER_U64: + printf("%s: %ld@%d:%d\n", name, token_int(tok), tok->line, tok->column); + break; + case TOK_CONST_FLOAT_32: + case TOK_CONST_FLOAT_64: + printf("%s: %f@%d:%d\n", name, token_float(tok), tok->line, tok->column); + break; + default: + printf("%s@%d:%d\n", name, tok->line, tok->column); + break; + } +}</code></pre> + </div> + <p>The name of the token is retrieved using <code>token_name_from_type</code>, which is a simple switch statement. + It's long and boring, check the source code if you're interested. </p> + <h3>Bugs/Errata</h3> + <p> I wrote this code in a single sitting, so there are bound to be bugs. I'll list them here as I find them. The + code you see here is the final version, with all bugs fixed. </p> + <ul> + <li> had <code>buffer_pos == buffer_size - 1</code>, left in from trying to plug some code for + lookahead in, didn't work out, but I forgot to remove it, causes fallthrough to <code>buffer_size == 0</code> + check which if true returns EOF, preventing input initialization. Fixed by changing to + <code>buffer_pos == buffer_size</code>. + </li> + <li>assertion <code>token->kind == TOK_CONST_STRING_ASCII</code> failed in token_string. Forgot + to expand check for identifiers which also use token_string. Fixed by changing to + <code>token->kind == TOK_CONST_STRING_ASCII || token->kind == TOK_ID || token->kind == TOK_TID</code>. + </li> + <li> token_create_string - call to <code>hash_table_get</code> with freed key. 
Fixed by moving +
            the call to free after +
            the call to <code>hash_table_get</code>.</li> +
        <li>ibid - Design of hash table and call to <code>hash_table_get</code> in token_create_string created +
            double free. Fixed by +
            rewriting part of function.</li> +
        <li>Tokenizer missing code to handle GCC preprocessor line directives. Fixed by adding code to handle them.</li> +
        <li>Destructor for string literals missing in tokenizer teardown, added it in.</li> +
        <li> read_number - check <code>int_ > INT32_MAX</code> does not work due to some weird casting. +
            Added explicit cast.</li> +
        <li>read_char_constant - Forgot to handle '\0'. Added.</li> +
        <li>skip_whitespace - When a division operator occurs in code, skip_whitespace assumes it's a comment. Fixed by +
            adding a check for division operator.</li> +
        <li>hash_string - Optimization, not a bug, Dragon Book hash function not very fast due to misprediction. Replaced +
            with ELF hash function.</li> +
        <li>read_identifier - strlen gets called 3 times even though we already get the string len by incrementing an +
            array index. Ew. Used i instead of strlen.</li> +
        <li>read_identifier - stringized version of keywords stored, not needed. Code added to call token_create instead +
            of token_create_string for keywords.</li> +
        <li>Everywhere - Checks added for memory allocation failure.</li> +
        <li>Not a bug. Removed the separate token type for TID. Will try to handle in the parser.</li> +
    </ul> +
    <h3>Conclusion</h3> +
    <p>That's it! The C Minus tokenizer is complete. It's hopefully pretty understandable, and given the testing I've put it through, it should be pretty robust. </p> +
    <p>Next time, we'll start on the parser. </p> +
    <h1>Source code, bibliography</h1> +
    <p>Source code for the tokenizer is available <a href="/projects/cminus/code/tokenizer.c">here</a>, header file is +
        available <a href="/projects/cminus/code/tokenizer.h">here</a>. 
</p> + <p>Source code for the input module is available <a href="/projects/cminus/code/input.c">here</a>, header file is + available <a href="/projects/cminus/code/input.h">here</a>. </p> + <p>Source code for the hash table is available <a href="/projects/cminus/code/hash_table.c">here</a>, header file is + available <a href="/projects/cminus/code/hash_table.h">here</a>. </p> + <p>A lot of the logic for this project is from either the Dragon Book, Engineering a Compiler, or LCC: A Retargetable + Compiler for ANSI C. Grammars are from The C Reference Manual.</p> + <p> I got the idea for using zero-width arrays for optional content (the struct hack) from hacking on some weird virtio drivers (they seem to love it).</p> + <p> Crafting Interpreters by Bob Nystrom inspired me to do this project, so if you see any similarities, there's probably some unintentional influence there.</p> + <p> The code for the ELF hash function is from the glibc source code.</p> + <p> The idea for decision trees came from LCC.</p> </main> - <footer style="text-align: center; padding: 20px;"> + <footer style=" text-align: center; padding: 20px;"> <p>© 2021 Reagan Fischer. If for some reason you want to use my AMAZING code (lol), it's available under the MIT license <a href="/projects/cminus/code/LICENSE.md">here</a>.</p> </footer>