commit c3abcec683384c5aaa5140189f4226cafed68239
parent 7d06a346d64a47ff7645340ba6a41a34ee65b7db
Author: Reagan <rfische2@uccs.edu>
Date: Wed, 21 Aug 2024 15:51:10 -0600
Blog
Diffstat:
15 files changed, 998 insertions(+), 83 deletions(-)
diff --git a/about.html b/about.html
@@ -45,7 +45,7 @@
<footer>
<hr>
- <p style="text-align: center; font-size: 0.75em;">© Copyright 2023. Made with <a
+ <p style="text-align: center; font-size: 0.75em;">© Copyright 2024. Made with <a
href="https://newcss.net/">love and NewCSS</a> by Reagan Fischer.</p>
<hr>
</footer>
diff --git a/blog.html b/blog.html
@@ -47,7 +47,7 @@
<footer>
<hr>
- <p style="text-align: center; font-size: 0.75em;">© Copyright 2023. Made with <a
+ <p style="text-align: center; font-size: 0.75em;">© Copyright 2024. Made with <a
href="https://newcss.net/">love and NewCSS</a> by Reagan Fischer.</p>
<hr>
</footer>
diff --git a/blog/libc_22oct23.html b/blog/libc_22oct23.html
@@ -32,7 +32,7 @@
<footer>
<hr>
- <p style="text-align: center; font-size: 0.75em;">© Copyright 2023. Made with <a
+ <p style="text-align: center; font-size: 0.75em;">© Copyright 2024. Made with <a
href="https://newcss.net/">love and NewCSS</a> by Reagan Fischer.</p>
<hr>
</footer>
diff --git a/cat/index.html b/cat/index.html
@@ -8,7 +8,8 @@
<link rel="stylesheet" href="/inter.css">
<link rel="stylesheet" href="/new.css">
<link rel="icon" type="image/x-icon" href="/images/win95.png">
- <link rel="apple-touch-icon" href="/images/win95.png"> <style>
+ <link rel="apple-touch-icon" href="/images/win95.png">
+ <style>
.blue-box {
background-color: #007BFF;
padding: 20px;
@@ -50,48 +51,49 @@
height: 100%;
object-fit: cover;
}
- /* Dark Mode Styles */
- @media (prefers-color-scheme: dark) {
- body {
- background-color: #121212;
- color: #FFFFFF;
- }
-
- header {
- background-color: #1f1f1f;
- color: #FFFFFF;
- }
-
- nav a {
- color: #BBDEFB;
- }
- nav a:hover {
- color: #90CAF9;
- }
-
- .blue-box {
- background-color: #0057C0;
- box-shadow: 0 5px 10px rgba(255, 255, 255, 0.1);
- }
+ /* Dark Mode Styles */
+ @media (prefers-color-scheme: dark) {
+ body {
+ background-color: #121212;
+ color: #FFFFFF;
+ }
- .cat-image-frame {
- border-color: #333;
- }
+ header {
+ background-color: #1f1f1f;
+ color: #FFFFFF;
+ }
- footer {
- background-color: #1f1f1f;
- color: #FFFFFF;
- }
+ nav a {
+ color: #BBDEFB;
+ }
- footer a {
- color: #BBDEFB;
- }
+ nav a:hover {
+ color: #90CAF9;
+ }
+
+ .blue-box {
+ background-color: #0057C0;
+ box-shadow: 0 5px 10px rgba(255, 255, 255, 0.1);
+ }
+
+ .cat-image-frame {
+ border-color: #333;
+ }
- footer a:hover {
- color: #90CAF9;
+ footer {
+ background-color: #1f1f1f;
+ color: #FFFFFF;
+ }
+
+ footer a {
+ color: #BBDEFB;
+ }
+
+ footer a:hover {
+ color: #90CAF9;
+ }
}
- }
</style>
</head>
@@ -103,8 +105,8 @@
<a href="../projects.html">Purrjects</a> /
<a href="../blog.html">Catablog</a> /
<a href="../resume.pdf">Resumeow</a> /
- <a href="../contact.html">Contact Meow</a>
- </nav>
+ <a href="../contact.html">Contact Meow</a>
+ </nav>
</header>
<main>
@@ -117,7 +119,9 @@
</main>
<footer>
- <p>© Copyright 2023. Made with <a href="https://newcss.net/">love and NewCSS</a> by Reagan Fischer. Cat images sourced from <a href="https://www.kaggle.com/datasets/crawford/cat-dataset">this Kaggle Dataset</a></p>
+ <p>© Copyright 2024. Made with <a href="https://newcss.net/">love and NewCSS</a> by Reagan Fischer. Cat
+ images sourced from <a href="https://www.kaggle.com/datasets/crawford/cat-dataset">this Kaggle Dataset</a>
+ </p>
</footer>
<script>
@@ -138,19 +142,19 @@
}
displayRandomCat();
</script>
- <script defer>
- var _paq = window._paq = window._paq || [];
- /* tracker methods like "setCustomDimension" should be called before "trackPageView" */
- _paq.push(['trackPageView']);
- _paq.push(['enableLinkTracking']);
- (function() {
- var u="//matomo.thespringstechguy.com/";
- _paq.push(['setTrackerUrl', u+'matomo.php']);
- _paq.push(['setSiteId', '1']);
- var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
- g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
- })();
- </script>
+ <script defer>
+ var _paq = window._paq = window._paq || [];
+ /* tracker methods like "setCustomDimension" should be called before "trackPageView" */
+ _paq.push(['trackPageView']);
+ _paq.push(['enableLinkTracking']);
+ (function () {
+ var u = "//matomo.thespringstechguy.com/";
+ _paq.push(['setTrackerUrl', u + 'matomo.php']);
+ _paq.push(['setSiteId', '1']);
+ var d = document, g = d.createElement('script'), s = d.getElementsByTagName('script')[0];
+ g.async = true; g.src = u + 'matomo.js'; s.parentNode.insertBefore(g, s);
+ })();
+ </script>
</body>
-</html>
+</html>+
\ No newline at end of file
diff --git a/contact.html b/contact.html
@@ -34,7 +34,7 @@
<footer>
<hr>
- <p style="text-align: center; font-size: 0.75em;">© Copyright 2023. Made with <a
+ <p style="text-align: center; font-size: 0.75em;">© Copyright 2024. Made with <a
href="https://newcss.net/">love and NewCSS</a> by Reagan Fischer.</p>
<hr>
</footer>
diff --git a/images/netscapenow.gif b/images/netscapenow.gif
Binary files differ.
diff --git a/index.html b/index.html
@@ -87,7 +87,7 @@
</main>
<footer>
<hr>
- <p style="text-align: center; font-size: 0.75em;">© Copyright 2023. Made with <a
+ <p style="text-align: center; font-size: 0.75em;">© Copyright 2024. Made with <a
href="https://newcss.net/">love and NewCSS</a> by Reagan Fischer.</p>
<hr>
</footer>
diff --git a/projects.html b/projects.html
@@ -156,7 +156,7 @@
<footer>
<hr />
<p style="text-align: center; font-size: 0.75em">
- © Copyright 2023. Made with
+ © Copyright 2024. Made with
<a href="https://newcss.net/">love and NewCSS</a> by Reagan Fischer.
</p>
<hr />
diff --git a/projects/cminus/code/hash_table.c b/projects/cminus/code/hash_table.c
@@ -0,0 +1,123 @@
+#include "hash_table.h"
+#include <stdlib.h>
+struct hash_table {
+ hash_table_entry_t **entries;
+ int size;
+ hash_table_cmp_fn cmp;
+ hash_table_hash_fn hash;
+};
+struct hash_table_entry {
+ void *key;
+ void *value;
+ hash_table_entry_t *next;
+};
+
+hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp,
+ hash_table_hash_fn hash) {
+ hash_table_t *table = malloc(sizeof(hash_table_t));
+ if (table == NULL) {
+ return NULL;
+ }
+ table->entries = calloc(size, sizeof(hash_table_entry_t *));
+ if (table->entries == NULL) {
+ free(table);
+ return NULL;
+ }
+ table->size = size;
+ table->cmp = cmp;
+ table->hash = hash;
+ return table;
+}
+
+void hash_table_destroy(hash_table_t *table) {
+ for (int i = 0; i < table->size; i++) {
+ hash_table_entry_t *entry = table->entries[i];
+ while (entry != NULL) {
+ hash_table_entry_t *next = entry->next;
+ free(entry);
+ entry = next;
+ }
+ }
+ free(table->entries);
+ free(table);
+}
+
+void *hash_table_get(hash_table_t *table, void *key) {
+ unsigned long hash = table->hash(key) % table->size;
+ hash_table_entry_t *entry = table->entries[hash];
+ while (entry != NULL) {
+ if (table->cmp(entry->key, key) == 0) {
+ return entry->value;
+ }
+ entry = entry->next;
+ }
+ return NULL;
+}
+
+int hash_table_put(hash_table_t *table, void *key, void *value, int replace) {
+ unsigned long hash = table->hash(key) % table->size;
+ hash_table_entry_t *entry = table->entries[hash];
+ while (entry != NULL) {
+ if (table->cmp(entry->key, key) == 0) {
+ if (replace) {
+ entry->value = value;
+ return 0;
+ } else {
+ return 1;
+ }
+ }
+ entry = entry->next;
+ }
+ entry = malloc(sizeof(hash_table_entry_t));
+ entry->key = key;
+ entry->value = value;
+ entry->next = table->entries[hash];
+ table->entries[hash] = entry;
+ return 0;
+}
+
+void hash_table_remove(hash_table_t *table, void *key) {
+ unsigned long hash = table->hash(key) % table->size;
+ hash_table_entry_t *entry = table->entries[hash];
+ hash_table_entry_t *prev = NULL;
+ while (entry != NULL) {
+ if (table->cmp(entry->key, key) == 0) {
+ if (prev == NULL) {
+ table->entries[hash] = entry->next;
+ } else {
+ prev->next = entry->next;
+ }
+ free(entry);
+ return;
+ }
+ prev = entry;
+ entry = entry->next;
+ }
+}
+
+#ifdef TEST_HASH_TABLE
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+int string_cmp(void *key1, void *key2) { return strcmp(key1, key2); }
+unsigned long string_hash(void *key) {
+ unsigned long hash = 5381;
+ char *str = key;
+ while (*str != '\0') {
+ hash = ((hash << 5) + hash) + *str;
+ str++;
+ }
+ return hash;
+}
+
+int main() {
+ hash_table_t *table = hash_table_create(16, string_cmp, string_hash);
+ assert(hash_table_put(table, "foo", "bar", 0) == 0);
+ assert(hash_table_put(table, "foo", "baz", 1) == 0);
+ assert(strcmp(hash_table_get(table, "foo"), "baz") == 0);
+ hash_table_remove(table, "foo");
+ assert(hash_table_get(table, "foo") == NULL);
+ hash_table_destroy(table);
+ return 0;
+}
+#endif+
\ No newline at end of file
diff --git a/projects/cminus/code/hash_table.h b/projects/cminus/code/hash_table.h
@@ -0,0 +1,13 @@
+#ifndef HASH_TABLE_H
+#define HASH_TABLE_H
+typedef struct hash_table hash_table_t;
+typedef struct hash_table_entry hash_table_entry_t;
+typedef int (*hash_table_cmp_fn)(void *key1, void *key2);
+typedef unsigned long (*hash_table_hash_fn)(void *key);
+hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp,
+ hash_table_hash_fn hash);
+void hash_table_destroy(hash_table_t *table);
+void *hash_table_get(hash_table_t *table, void *key);
+int hash_table_put(hash_table_t *table, void *key, void *value, int replace);
+void hash_table_remove(hash_table_t *table, void *key);
+#endif+
\ No newline at end of file
diff --git a/projects/cminus/code/input.c b/projects/cminus/code/input.c
@@ -33,11 +33,23 @@ int input_getc(void) {
return buffer[buffer_pos++];
}
-void input_ungetc(int c) {
- unget_buffer = c;
-}
+void input_ungetc(int c) { unget_buffer = c; }
+
+void input_destroy(void) { fclose(file); }
-void input_destroy(void) {
+#ifdef TEST_INPUT
+#include <assert.h>
+int main() {
+ FILE *file = fopen("input.c", "rb");
+ input_init("input.c");
+ int c1, c2;
+ while ((c1 = fgetc(file)) != EOF) {
+ c2 = input_getc();
+ assert(c1 == c2);
+ }
+ assert(input_getc() == EOF);
fclose(file);
+ input_destroy();
+ return 0;
}
-
+#endif+
\ No newline at end of file
diff --git a/projects/cminus/code/tokenizer.c b/projects/cminus/code/tokenizer.c
@@ -0,0 +1,159 @@
+#include "tokenizer.h"
+#include "hash_table.h"
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+// Token data
+#define TOK_MAGIC_1 0x544F4B454E544F4B // "TOKENTOK"
+#define TOK_MAGIC_2 0x544F4B544F4B454E // "TOKTOKEN"
+struct token {
+ long magic;
+ int line;
+ int column;
+ short kind;
+ char opt_data[0];
+};
+
+typedef struct token token_t;
+
+struct token_data {
+ union {
+ long long i;
+ double f;
+ const char *s;
+ char c;
+ } data;
+};
+
+typedef struct token_data token_data_t;
+
+// Token creation
+#define token_data(token) ((struct token_data *)((token)->opt_data))
+static token_t *token_data_create(c_token_types kind, int line, int column) {
+ token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data));
+ token->magic = TOK_MAGIC_1;
+ token->line = line;
+ token->column = column;
+ token->kind = kind;
+ return token;
+}
+
+static token_t *token_create(c_token_types kind, int line, int column) {
+ token_t *token = malloc(sizeof(token_t));
+ token->magic = TOK_MAGIC_2;
+ token->line = line;
+ token->column = column;
+ token->kind = kind;
+ return token;
+}
+
+static token_t *token_create_int(c_token_types kind, int line, int column,
+ int64_t i) {
+ token_t *token = token_data_create(kind, line, column);
+ token_data(token)->data.i = i;
+ return token;
+}
+
+static token_t *token_create_float(c_token_types kind, int line, int column,
+ double f) {
+ token_t *token = token_data_create(kind, line, column);
+ token_data(token)->data.f = f;
+ return token;
+}
+
+#define PRIME 211
+static unsigned long hash_string(void *key) {
+ unsigned long hash = 0, g;
+ char *p = key;
+ while (*p) {
+ hash = (hash << 4) + *p++;
+ if ((g = hash & 0xf0000000) != 0) {
+ hash ^= g >> 24;
+ hash ^= g;
+ }
+ }
+ return hash;
+}
+
+static int cmp_string(void *key1, void *key2) {
+ return strcmp((char *)key1, (char *)key2);
+}
+
+static hash_table_t *string_table;
+static token_t *token_create_string(c_token_types kind, int line, int column,
+ const char *s) {
+ if (string_table == NULL) {
+ string_table = hash_table_create(1024, cmp_string, hash_string);
+ }
+ token_t *token = token_data_create(kind, line, column);
+ char *key = strdup(s);
+ if (hash_table_put(string_table, key, key, 0)) {
+ free(key);
+ key = hash_table_get(string_table, key);
+ }
+ token_data(token)->data.s = key;
+ return token;
+}
+
+static token_t *token_create_char(c_token_types kind, int line, int column,
+ char c) {
+ token_t *token = token_data_create(kind, line, column);
+ token_data(token)->data.c = c;
+ return token;
+}
+
+// External token operations
+
+c_token_types token_type(token_t *token) {
+ assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
+ return token->kind;
+}
+
+int64_t token_int(token_t *token) {
+ assert(token->kind == TOK_CONST_INTEGER_U32 ||
+ token->kind == TOK_CONST_INTEGER_U64 ||
+ token->kind == TOK_CONST_INTEGER_S32 ||
+ token->kind == TOK_CONST_INTEGER_S64);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.i;
+}
+
+double token_float(token_t *token) {
+ assert(token->kind == TOK_CONST_FLOAT_32 ||
+ token->kind == TOK_CONST_FLOAT_64);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.f;
+}
+
+const char *token_string(token_t *token) {
+ assert(token->kind == TOK_CONST_STRING_ASCII);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.s;
+}
+
+char token_char(token_t *token) {
+ assert(token->kind == TOK_CONST_CHAR);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.c;
+}
+
+int token_line(token_t *token) {
+ assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
+ return token->line;
+}
+
+int token_column(token_t *token) {
+ assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
+ return token->column;
+}
+
+void token_destroy(token_t *token) {
+ assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
+ if (token->magic == TOK_MAGIC_1) {
+ if (token->kind == TOK_CONST_STRING_ASCII) {
+ free((void *)token_data(token)->data.s);
+ }
+ }
+ free(token);
+}
diff --git a/projects/cminus/code/tokenizer.h b/projects/cminus/code/tokenizer.h
@@ -0,0 +1,133 @@
+#ifndef TOKENIZER_H
+#define TOKENIZER_H
+#include <stdint.h>
+
+typedef struct token token_t;
+typedef enum {
+ // Control Keywords
+ TOK_CTK_IF, // if
+ TOK_CTK_ELSE, // else
+ TOK_CTK_SWITCH, // switch
+ TOK_CTK_CASE, // case
+ TOK_CTK_DEFAULT, // default
+ TOK_CTK_WHILE, // while
+ TOK_CTK_DO, // do
+ TOK_CTK_FOR, // for
+ TOK_CTK_CONTINUE, // continue
+ TOK_CTK_RETURN, // return
+ TOK_CTK_GOTO, // goto
+
+ // Type Keywords
+ TOK_TK_VOID,
+ TOK_TK_CHAR,
+ TOK_TK_SHORT,
+ TOK_TK_INT,
+ TOK_TK_LONG,
+ TOK_TK_FLOAT,
+ TOK_TK_DOUBLE,
+ TOK_TK_SIGNED,
+ TOK_TK_UNSIGNED,
+ TOK_TK_STRUCT,
+ TOK_TK_UNION,
+ TOK_TK_ENUM,
+ TOK_TK_TYPEDEF,
+
+ // Storage Class/Specifier Keywords
+ TOK_SCSK_AUTO,
+ TOK_SCSK_REGISTER,
+ TOK_SCSK_STATIC,
+ TOK_SCSK_EXTERN,
+ TOK_SCSK_CONST,
+ TOK_SCSK_VOLATILE,
+
+ // Misc Keywords
+ TOK_MK_SIZEOF,
+
+ // Operators
+ TOK_OP_PLUS, // +
+ TOK_OP_MINUS, // -
+ TOK_OP_ASTERISK, // *
+ TOK_OP_SLASH, // /
+ TOK_OP_PERCENT, // %
+ TOK_OP_AMPERSAND, // &
+ TOK_OP_PIPE, // |
+ TOK_OP_CARET, // ^
+ TOK_OP_TILDE, // ~
+ TOK_OP_EXCLAMATION, // !
+ TOK_OP_EQUALS, // =
+ TOK_OP_LESS_THAN, // <
+ TOK_OP_GREATER_THAN, // >
+ TOK_OP_PLUS_PLUS, // ++
+ TOK_OP_MINUS_MINUS, // --
+ TOK_OP_EQUALS_EQUALS, // ==
+ TOK_OP_NOT_EQUALS, // !=
+ TOK_OP_LESS_EQUALS, // <=
+ TOK_OP_GREATER_EQUALS, // >=
+ TOK_OP_AMP_AMP, // &&
+ TOK_OP_PIPE_PIPE, // ||
+ TOK_OP_ARROW, // ->
+ TOK_OP_DOT, // .
+ TOK_OP_COLON, // :
+ TOK_OP_QUESTION, // ?
+ TOK_OP_COMMA, // ,
+ TOK_OP_SEMICOLON, // ;
+ TOK_OP_ELLIPSIS, // ...
+ TOK_OP_ASSIGN_PLUS, // +=
+ TOK_OP_ASSIGN_MINUS, // -=
+ TOK_OP_ASSIGN_ASTERISK, // *=
+ TOK_OP_ASSIGN_SLASH, // /=
+ TOK_OP_ASSIGN_PERCENT, // %=
+ TOK_OP_ASSIGN_AMPERSAND, // &=
+ TOK_OP_ASSIGN_PIPE, // |=
+ TOK_OP_ASSIGN_CARET, // ^=
+ TOK_OP_ASSIGN_LEFT_SHIFT, // <<=
+ TOK_OP_ASSIGN_RIGHT_SHIFT, // >>=
+
+ // Separators
+ TOK_SEP_LEFT_PAREN, // (
+ TOK_SEP_RIGHT_PAREN, // )
+ TOK_SEP_LEFT_BRACKET, // [
+ TOK_SEP_RIGHT_BRACKET, // ]
+ TOK_SEP_LEFT_BRACE, // {
+ TOK_SEP_RIGHT_BRACE, // }
+ TOK_SEP_COMMA, // ,
+ TOK_SEP_SEMICOLON, // ;
+ TOK_SEP_COLON, // :
+ TOK_SEP_DOT, // .
+ TOK_SEP_ELLIPSIS, // ...
+ TOK_SEP_HASH, // #
+
+ // Identifiers
+ TOK_ID, // Any identifier not ending in _t
+ TOK_TID, // Any identifier ending in _t
+
+ // Constants
+ TOK_CONST_INTEGER_U32, // u
+ TOK_CONST_INTEGER_U64, // ul
+ TOK_CONST_INTEGER_S32, // (no suffix)
+ TOK_CONST_INTEGER_S64, // l
+ TOK_CONST_FLOAT_32, // f
+ TOK_CONST_FLOAT_64, // (no suffix)
+ TOK_CONST_CHAR, // 'c'
+ TOK_CONST_STRING_ASCII, // "string" (width of 8 bits)
+
+ // Special
+ TOK_SPECIAL_EOF,
+ TOK_SPECIAL_ERROR,
+} c_token_types;
+
+void tokenizer_init(const char *filename);
+token_t *tokenizer_get(void);
+void tokenizer_unget(token_t *token);
+void tokenizer_destroy(void);
+
+c_token_types token_type(token_t *token);
+int64_t token_int(token_t *token);
+double token_float(token_t *token);
+const char *token_string(token_t *token);
+char token_char(token_t *token);
+int token_line(token_t *token);
+int token_column(token_t *token);
+void token_destroy(token_t *token);
+
+#endif+
\ No newline at end of file
diff --git a/projects/cminus/lexer.html b/projects/cminus/lexer.html
@@ -20,6 +20,13 @@
<img src="/images/line.gif" alt="decorative line">
</div>
</header>
+ <noscript>
+ <div style="background-color: #ffcccc; padding: 10px; text-align: center;">
+ <p><img src="/images/netscapenow.gif" alt="Netscape badge" width="88" height="31">
+ This site might look pretty old, but it still uses JS for syntax highlighting. If you'd like to see code
examples that don't look monochrome, please enable JS. Or don't. I'm not your dad.</p>
+ </div>
+ </noscript>
<main style="padding: 20px;">
<h2>General Project Structure</h2>
<p>Since this is the first article, I'll outline the project structure for the C- interpreter.</p>
@@ -80,6 +87,9 @@
data structures which make the algorithms as simple as possible. </p>
<p>For example, instead of writing 50-100 lines of code to hold every keyword in the language, I can just use a
simple hash table. </p>
+ <h4>Misc</h4>
+ <p>The code on the blog is not the full code. I'm leaving out stuff like header guards and includes for brevity. The
+ full code is available on the <a href="https://git.reagancfischer.dev/cminus.git">git repo</a>.</p>
<h2>The Lexer</h2>
<p>A lexical analyzer reads source code and produces tokens, which are the smallest unit of meaning in a language.
For example, in the C programming language, the tokens are things like keywords (if, else, while, etc.),
@@ -244,9 +254,10 @@ c_token_types token_type(token_t *token);
int64_t token_int(token_t *token);
double token_float(token_t *token);
const char *token_string(token_t *token);
+char token_char(token_t *token);
int token_line(token_t *token);
int token_column(token_t *token);
-const char *token_file(token_t *token);
+void token_destroy(token_t *token);
</code></pre>
</div>
<p>I'll also define an opaque struct for tokens and an enum for token types. </p>
@@ -403,7 +414,7 @@ int main() {
struct</a></em></strong></span>
<pre class="prettyprint"><code class="">
#define TOK_MAGIC_1 0x544F4B454E544F4B // "TOKENTOK"
-#define TOK_MAGIC_2 0x544F4B544F4B454E // "TOKTOKENT"
+#define TOK_MAGIC_2 0x544F4B544F4B454E // "TOKTOKEN"
struct token {
long magic;
int line;
@@ -411,6 +422,9 @@ struct token {
short kind;
char opt_data[0];
};
+
+typedef struct token token_t;
+
struct token_data {
union {
long long i;
@@ -419,16 +433,18 @@ struct token_data {
char c;
} data;
};
+
+typedef struct token_data token_data_t;
</code></pre>
</div>
<p>By keeping the token data off of simple tokens (like keywords and operators), we save 8 bytes per token. This
will add up quickly, as the lexer will produce a lot of tokens. </p>
- <p>Now that we have the token struct, we can implement some internal utilities for working with tokens </p>
+ <p>Now that we have the token struct, we can implement some internal utilities for working with tokens. </p>
<div class="code-block">
<span class="file-name">tokenizer.c</span>
<span class="block-header">
- <strong class="block-title"><em><a id=":token-api" href="#:token-api">Token
- API</a></em></strong></span>
+ <strong class="block-title"><em><a id=":token-internals" href="#:token-internals">Token
+ Internals</a></em></strong></span>
<pre class="prettyprint"><code class="">#define token_data(token) ((struct token_data *)((token)->opt_data))
static token_t *token_data_create(c_token_types kind, int line, int column) {
token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data));
@@ -449,13 +465,13 @@ static token_t *token_create(c_token_types kind, int line, int column) {
}
static token_t *token_create_int(c_token_types kind, int line, int column, int64_t i) {
- token_t *token = token_create(kind, line, column);
+ token_t *token = token_data_create(kind, line, column);
token_data(token)->data.i = i;
return token;
}
static token_t *token_create_float(c_token_types kind, int line, int column, double f) {
- token_t *token = token_create(kind, line, column);
+ token_t *token = token_data_create(kind, line, column);
token_data(token)->data.f = f;
return token;
}
@@ -465,18 +481,469 @@ static token_t *token_create_string(c_token_types kind, int line, int column, co
}
static token_t *token_create_char(c_token_types kind, int line, int column, char c) {
- token_t *token = token_create(kind, line, column);
+ token_t *token = token_data_create(kind, line, column);
+ token_data(token)->data.c = c;
+ return token;
+}
+</code></pre>
+ </div>
+ <p>You might wonder why token_create_string was left out. Before you ask that, try counting up all the repeated
+ strings in the code so far. Look at printf, for example. Let's say it's called 20 separate times throughout the
+ code. That's <math>20 * (strlen("printf") + 1) = 140</math> bytes for the strings alone. That's a lot of duplicated
+ memory.</p>
+ <p>If you've got some CS under your belt, you're probably screaming "hash table" at your screen right now. And
+ you're
+ right. I'll be implementing a basic hash table local to the lexer to keep track of strings in memory.</p>
+ <h4>Hash tables</h4>
+ <p>If you're not familiar with hash tables, the basic idea is that you have a function that takes a key and returns
+ a number. You then use that number to index into an array. The function calculates a (hopefully) unique value for
+ each input, which means every value has its own unique array index. this means you can get an O(1) lookup for any
+ value in the table. However, we don't have enough memory and hash functions aren't good enough for this ideal
+ case, so we'll get collisions. I'll be using a linked hash table, which means that each array entry is a linked
+ list consisting of the data and the next item in the chain. </p>
+ <p>I'm sure I'll want to use a hash table again in the future, and not necessarily for strings or with strcmp as the
+ key. So I'll make the hash table generic and a separate module. </p>
+ <div class="code-block">
+ <span class="file-name">hash_table.h</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":hash-table-interface" href="#:hash-table-interface">Hash
+ table interface</a></em></strong></span>
+ <pre class="prettyprint"><code class="">typedef struct hash_table hash_table_t;
+typedef struct hash_table_entry hash_table_entry_t;
+typedef int (*hash_table_cmp_fn)(void *key1, void *key2);
+typedef unsigned long (*hash_table_hash_fn)(void *key);
+hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash);
+void hash_table_destroy(hash_table_t *table);
+void *hash_table_get(hash_table_t *table, void *key);
+int hash_table_put(hash_table_t *table, void *key, void *value, int replace);
+void hash_table_remove(hash_table_t *table, void *key);
+</code></pre>
+ </div>
+ <p>Now that we have the hash table interface, we can implement the generic hash table. </p>
+ <div class="code-block">
+ <span class="file-name">hash_table.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":string-hash-table" href="#:string-hash-table">String
+ hash table</a></em></strong></span>
+ <pre class="prettyprint"><code class="">struct hash_table {
+ hash_table_entry_t **entries;
+ int size;
+ hash_table_cmp_fn cmp;
+ hash_table_hash_fn hash;
+};
+struct hash_table_entry {
+ void *key;
+ void *value;
+ hash_table_entry_t *next;
+};
+
+hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash) {
+ hash_table_t *table = malloc(sizeof(hash_table_t));
+ if (table == NULL) {
+ return NULL;
+ }
+ table->entries = calloc(size, sizeof(hash_table_entry_t *));
+ if (table->entries == NULL) {
+ free(table);
+ return NULL;
+ }
+ table->size = size;
+ table->cmp = cmp;
+ table->hash = hash;
+ return table;
+}
+
+void hash_table_destroy(hash_table_t *table) {
+ for (int i = 0; i < table->size; i++) {
+ hash_table_entry_t *entry = table->entries[i];
+ while (entry != NULL) {
+ hash_table_entry_t *next = entry->next;
+ free(entry);
+ entry = next;
+ }
+ }
+ free(table->entries);
+ free(table);
+}
+
+void *hash_table_get(hash_table_t *table, void *key) {
+ unsigned long hash = table->hash(key) % table->size;
+ hash_table_entry_t *entry = table->entries[hash];
+ while (entry != NULL) {
+ if (table->cmp(entry->key, key) == 0) {
+ return entry->value;
+ }
+ entry = entry->next;
+ }
+ return NULL;
+}
+
+int hash_table_put(hash_table_t *table, void *key, void *value, int replace) {
+ unsigned long hash = table->hash(key) % table->size;
+ hash_table_entry_t *entry = table->entries[hash];
+ while (entry != NULL) {
+ if (table->cmp(entry->key, key) == 0) {
+ if (replace) {
+ entry->value = value;
+ return 0;
+ } else {
+ return 1;
+ }
+ }
+ entry = entry->next;
+ }
+ entry = malloc(sizeof(hash_table_entry_t));
+ entry->key = key;
+ entry->value = value;
+ entry->next = table->entries[hash];
+ table->entries[hash] = entry;
+ return 0;
+}
+
+void hash_table_remove(hash_table_t *table, void *key) {
+ unsigned long hash = table->hash(key) % table->size;
+ hash_table_entry_t *entry = table->entries[hash];
+ hash_table_entry_t *prev = NULL;
+ while (entry != NULL) {
+ if (table->cmp(entry->key, key) == 0) {
+ if (prev == NULL) {
+ table->entries[hash] = entry->next;
+ } else {
+ prev->next = entry->next;
+ }
+ free(entry);
+ return;
+ }
+ prev = entry;
+ entry = entry->next;
+ }
+}
+</code></pre>
+ </div>
+ <p>One implementation detail of note is that hash_table_put returns 1 if the key already exists in the table, and,
+ unless replace is set to 1, will not replace the value. </p>
+ <p>This is useful as our token_create_string function can just check if the string already exists in the table and
+ free the memory if it does. </p>
+ <h4>String hashing</h4>
+ <p>Now we can implement the string hash table for the tokenizer.</p>
+ <p>A lot of research has been done on hash functions (See <a href="https://www.strchr.com/hash_functions">here for a
+ good review of theory and comparison of various functions</a>). For this project, I'll be using the hash
+ function from the famous "Red Dragon Book" (Aho, Sethi, Ullman, Compilers: Principles, Techniques, and Tools, 1st
+ edition), as it's optimized for compiler symbols. </p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":string-hash-function" href="#:string-hash-function">String
+ hash function</a></em></strong></span>
+ <pre class="prettyprint"><code class="">#define PRIME 211
+static unsigned long hash_string(void *key) {
+ unsigned long hash = 0, g;
+ char *p = key;
+ while (*p) {
+ hash = (hash << 4) + *p++;
+ if ((g = hash & 0xf0000000) != 0) {
+ hash ^= g >> 24;
+ hash ^= g;
+ }
+ }
+ return hash;
+}
+</code></pre>
+ </div>
+ <p>We'll also need a comparison function for strings. </p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":string-compare-function" href="#:string-compare-function">String
+ compare function</a></em></strong></span>
+ <pre class="prettyprint"><code class="">static int cmp_string(void *key1, void *key2) {
+ return strcmp((char *)key1, (char *)key2);
+}
+</code></pre>
+ </div>
+ <p>Now that we have the hash table, we can implement the string creation function. </p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":string-creation" href="#:string-creation">String
+ creation</a></em></strong></span>
+ <pre class="prettyprint"><code class="">static hash_table_t *string_table;
+static token_t *token_create_string(c_token_types kind, int line, int column, const char *s) {
+ if (string_table == NULL) {
+ string_table = hash_table_create(1024, cmp_string, hash_string);
+ }
+ token_t *token = token_data_create(kind, line, column);
+ char *key = strdup(s);
+ if (hash_table_put(string_table, key, key, 0)) {
+ free(key);
+ key = hash_table_get(string_table, key);
+ }
+ token_data(token)->data.s = key;
+ return token;
+}
+</code></pre>
+ </div>
+
+ <h4>External Token API</h4>
+ <p>Sticking with the theme of keeping data structures out of headers, the parser will have to access tokens using
+ functions defined in the tokenizer. We'll implement those now. </p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":external-token-api" href="#:external-token-api">External
+ token API</a></em></strong></span>
+ <pre class="prettyprint"><code class="">
+c_token_types token_type(token_t *token) {
+ assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
+ return token->kind;
+}
+
+int64_t token_int(token_t *token) {
+ assert(token->kind == TOK_CONST_INTEGER_U32 ||
+ token->kind == TOK_CONST_INTEGER_U64 ||
+ token->kind == TOK_CONST_INTEGER_S32 ||
+ token->kind == TOK_CONST_INTEGER_S64);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.i;
+}
+
+double token_float(token_t *token) {
+ assert(token->kind == TOK_CONST_FLOAT_32 ||
+ token->kind == TOK_CONST_FLOAT_64);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.f;
+}
+
+const char *token_string(token_t *token) {
+ assert(token->kind == TOK_CONST_STRING_ASCII);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.s;
+}
+
+char token_char(token_t *token) {
+ assert(token->kind == TOK_CONST_CHAR);
+ assert(token->magic == TOK_MAGIC_1);
+ return token_data(token)->data.c;
+}
+
+int token_line(token_t *token) {
+ assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
+ return token->line;
+}
+
+int token_column(token_t *token) {
+ assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
+ return token->column;
+}
+
+void token_destroy(token_t *token) {
+ assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2);
+ if (token->magic == TOK_MAGIC_1) {
+ if (token->kind == TOK_CONST_STRING_ASCII) {
+ free((void *)token_data(token)->data.s);
+ }
+ }
+ free(token);
+}</code></pre>
+ </div>
+ <p>You might notice that this code uses a lot of asserts. I plan to do this for all functions which access internal
+ data structures to help reduce bugs. </p>
+ <h4>Tokenizer Implementation</h4>
+ <p>I'll implement the less interesting functions first. </p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":tokenizer-implementation" href="#:tokenizer-implementation">Tokenizer
+ implementation</a></em></strong></span>
+ <pre class="prettyprint"><code class="">token_t *unget_token = NULL;
+// Open the input stream for the given source file.
+void tokenizer_init(const char *filename)
+{
+  input_init(filename);
+}
+
+// Push one token of lookahead back; the single slot must be empty.
+void tokenizer_unget(token_t *token)
+{
+  assert(unget_token == NULL);
+  unget_token = token;
+}
+
+// Tear down the tokenizer: close the input, free any token still
+// parked in the unget slot (previously leaked), and release the string
+// table. Pointers are reset so destroy/init cycles are safe.
+void tokenizer_destroy(void)
+{
+  input_destroy();
+  if (unget_token != NULL)
+  {
+    token_destroy(unget_token);
+    unget_token = NULL;
+  }
+  if (string_table != NULL)
+  {
+    hash_table_destroy(string_table);
+    string_table = NULL;
+  }
+}</code></pre>
+ </div>
+ <p>Now we can implement the meat of the tokenizer. The general structure is as follows: </p>
+ <ol>
+ <li>Read a character from the input stream. </li>
+ <li>If whitespace, skip it. </li>
+ <li>If a digit or decimal point, read a number. </li>
+ <li>If a letter, read an identifier or keyword. </li>
+ <li>If a double quote, read a string. </li>
+ <li>If a single quote, read a char.</li>
+ <li>If not any of the above, read an operator or separator. </li>
+ <li>Return the token if it matches any of the above, otherwise return an error or EOF token. </li>
+ </ol>
+ <p>Each of the functions for reading a token will return a token or <code>NULL</code> if it's not the right type.
+ </p>
+ <p>Let's first write a function to skip whitespace and comments. The rule in C is that whitespace is ignored except
+ when it separates tokens, any character sequence starting with <code>//</code> is a comment until the end of the
+ line, and any character sequence starting with <code>/*</code> is a comment until the next <code>*/</code>. </p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":skip-whitespace" href="#:skip-whitespace">Skip
+ whitespace</a></em></strong></span>
+ <pre class="prettyprint"><code class="">// Consume whitespace, // comments, and /* */ comments, keeping the
+// global line/column counters up to date. Stops with the next
+// significant character pushed back onto the input.
+static void skip_whitespace(void) {
+  int c;
+  while ((c = input_getc()) != EOF) {
+    if (isspace(c)) {
+      if (c == '\n') {
+        line++;
+        column = 1;
+      } else {
+        column++;
+      }
+    } else if (c == '/') {
+      c = input_getc();
+      if (c == '/') {
+        // Line comment: discard through end of line.
+        while ((c = input_getc()) != EOF && c != '\n') {
+          column++;
+        }
+        if (c == EOF) {
+          return;
+        }
+        line++;
+        column = 1;
+      } else if (c == '*') {
+        // Block comment: discard until the closing */.
+        while ((c = input_getc()) != EOF) {
+          if (c == '*') {
+            c = input_getc();
+            if (c == '/') {
+              break;
+            }
+            // BUG FIX: push the character back so a '*' immediately
+            // followed by another '*' is re-examined; otherwise a
+            // comment ending in "**/" is never closed.
+            if (c != EOF) {
+              input_ungetc(c);
+            }
+          } else if (c == '\n') {
+            line++;
+            column = 1;
+          } else {
+            column++;
+          }
+        }
+        if (c == EOF) {
+          return;  // unterminated block comment; caller sees EOF
+        }
+      } else {
+        // Not a comment after all. NOTE(review): the leading '/' has
+        // already been consumed here; restoring it as well would need
+        // two characters of pushback -- confirm input_ungetc capacity.
+        if (c != EOF) {
+          input_ungetc(c);
+        }
+        return;
+      }
+    } else {
+      input_ungetc(c);
+      return;
+    }
+  }
+}</code></pre>
+ </div>
+ <p>Now we'll implement code to recognize keywords and identifiers. The rules for identifiers are that they must
+ start with a letter or underscore and can contain letters, digits, and underscores. </p>
+ <div class="code-block">
+ <span class="file-name">tokenizer.c</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":read-identifier" href="#:read-identifier">Read
+ identifier</a></em></strong></span>
+ <pre class="prettyprint"><code class="">// Read an identifier or keyword token. Returns NULL (with the
+// offending character pushed back) when the next character cannot
+// start an identifier.
+static token_t *read_identifier(void) {
+  int c;
+  char buf[1024];
+  size_t i = 0;
+  c = input_getc();
+  if (!isalpha(c) && c != '_') {
+    if (c != EOF) {
+      input_ungetc(c);
+    }
+    return NULL;
+  }
+  buf[i++] = (char)c;
+  while ((c = input_getc()) != EOF) {
+    if (!isalnum(c) && c != '_') {
+      input_ungetc(c);
+      break;
+    }
+    // BUG FIX: bound the write so identifiers of 1023 or more
+    // characters are truncated instead of overflowing the stack buffer.
+    if (sizeof(buf) - 1 > i) {
+      buf[i++] = (char)c;
+    }
+  }
+  buf[i] = '\0';
+
+  // Check if it's a keyword
+  c_token_types kind = get_keyword(buf);
+
+  if (kind == TOK_ID) {
+    // Heuristic: identifiers ending in "_t" are typedef'd type names.
+    size_t len = i;  // equal to strlen(buf)
+    if (len > 2 && buf[len - 2] == '_' && buf[len - 1] == 't') {
+      kind = TOK_TID;
+    }
+  }
+  return token_create_string(kind, line, column, buf);
+}</code></pre>
+ </div>
+ <p>For recognizing keywords, I'll use a decision tree. This is a simple and space-efficient way to recognize
+ keywords. Originally I wanted to use a trie, but those end up using a lot of memory for a small number of
+ items. </p>
+ <p> To generate the decision tree, I'll use a simple C program which works off the following template: </p>
+ <div class="code-block">
+ <span class="file-name">N/A</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":keyword-decision-tree" href="#:keyword-decision-tree">Keyword
+ decision tree</a></em></strong></span>
+ <pre class="prettyprint"><code class="">c_token_types get_keyword(const char *buf) {
+ switch (buf[0]) {
+ case [first letter]:
+ switch (buf[1]) {
+ case [second letter]:
+ switch (buf[2]) {
+ case [third letter]:
+ return [keyword];
+ default:
+ return TOK_ID;
+ }
+ default:
+ return TOK_ID;
+ }
+ default:
+ return TOK_ID;
+ }
+}</code></pre>
+ </div>
+ <p>For example, if we just fed the program the keyword "if", it would output the following: </p>
+ <div class="code-block">
+ <span class="file-name">N/A</span>
+ <span class="block-header">
+ <strong class="block-title"><em><a id=":if-keyword" href="#:if-keyword">If
+ keyword</a></em></strong></span>
+ <pre class="prettyprint"><code class="">c_token_types get_keyword(const char *buf) {
+ switch (buf[0]) {
+ case 'i':
+ switch (buf[1]) {
+ case 'f':
+ switch (buf[2]) {
+ case '\0':
+ return TOK_CTK_IF;
+ default:
+ return TOK_ID;
+ }
+ default:
+ return TOK_ID;
+ }
+ default:
+ return TOK_ID;
+ }
}
</code></pre>
-</div>
-<p>Now for the actual API</p>
-<div class="code-block">
- <span class="file-name">tokenizer.c</span>
- <span class="block-header">
- <strong class="block-title"><em><a id=":token-api" href="#:token-api">Token
- API</a></em></strong></span>
- <pre class="prettyprint"><code class="">void tokenizer_init(const char *filename) {
-
+ </div>
+ <p>The program will be placed into the main tokenizer.c file using the HIGH QUALITY TECHNIQUE of including a c file
+ in another c file. </p>
+ <p> Time to write the program. </p>
+
+
</main>
<footer style="text-align: center; padding: 20px;">
diff --git a/test b/test
@@ -1 +0,0 @@
-a