tokenizer.c - website - Website contents

tokenizer.c (28357B)
      1 /* tokenizer.c */
      2 #include <ctype.h>
      3 #include <errno.h>
      4 #include <float.h>
      5 #include <math.h>
      6 #include <stdarg.h>
      7 #include <stdint.h>
      8 #include <stdio.h>
      9 #include <stdlib.h>
     10 #include <string.h>
     11 #include "tokenizer.h"
     12 #include "token.h"
     13 #include "input.h"
     14 #include "util.h"
     15 token_t *left_stack[8];
     16 int left_stack_pos = 0;
     17 hash_table_t *typedef_table = NULL;
     18 /* Utility Functions */
     19 void init_tokenizer(const char *filename) {
     20   input_init(filename);
     21   typedef_table = hash_table_create(16, cmp_string, hash_string, dtor_string);
     22 }
     23 
     24 void destroy_tokenizer(void) {
     25   input_destroy();
     26 }
     27 
     28 void reject_token(token_t *token) {
     29   left_stack[left_stack_pos++] = token;
     30 }
     31 
     32 token_t *peek_token(void) {
     33   if (left_stack_pos > 0) {
     34     return left_stack[left_stack_pos - 1];
     35   }
     36     token_t *token = next_token();
     37     reject_token(token);
     38     return token;
     39 }
     40 
     41 /* Stringify Type */
     42 const char *stringify_type(c_token_types type) {
     43   switch (type) {
     44   case TOK_IF:
     45     return "if";
     46   case TOK_ELSE:
     47     return "else";
     48   case TOK_SWITCH:
     49     return "switch";
     50   case TOK_CASE:
     51     return "case";
     52   case TOK_DEFAULT:
     53     return "default";
     54   case TOK_WHILE:
     55     return "while";
     56   case TOK_DO:
     57     return "do";
     58   case TOK_FOR:
     59     return "for";
     60   case TOK_CONTINUE:
     61     return "continue";
     62   case TOK_BREAK:
     63     return "break";
     64   case TOK_RETURN:
     65     return "return";
     66   case TOK_GOTO:
     67     return "goto";
     68   case TOK_VOID:
     69     return "void";
     70   case TOK_CHAR:
     71     return "char";
     72   case TOK_SHORT:
     73     return "short";
     74   case TOK_INT:
     75     return "int";
     76   case TOK_LONG:
     77     return "long";
     78   case TOK_FLOAT:
     79     return "float";
     80   case TOK_DOUBLE:
     81     return "double";
     82   case TOK_SIGNED:
     83     return "signed";
     84   case TOK_UNSIGNED:
     85     return "unsigned";
     86   case TOK_STRUCT:
     87     return "struct";
     88   case TOK_UNION:
     89     return "union";
     90   case TOK_ENUM:
     91     return "enum";
     92   case TOK_TYPEDEF:
     93     return "typedef";
     94   case TOK_AUTO:
     95     return "auto";
     96   case TOK_REGISTER:
     97     return "register";
     98   case TOK_STATIC:
     99     return "static";
    100   case TOK_EXTERN:
    101     return "extern";
    102   case TOK_CONST:
    103     return "const";
    104   case TOK_VOLATILE:
    105     return "volatile";
    106   case TOK_SIZEOF:
    107     return "sizeof";
    108   case TOK_ADD:
    109     return "+";
    110   case TOK_SUB:
    111     return "-";
    112   case TOK_MUL:
    113     return "*";
    114   case TOK_DIV:
    115     return "/";
    116   case TOK_MOD:
    117     return "%";
    118   case TOK_BIT_AND:
    119     return "&";
    120   case TOK_BIT_OR:
    121     return "|";
    122   case TOK_BIT_XOR:
    123     return "^";
    124   case TOK_BIT_NOT:
    125     return "~";
    126   case TOK_LSHIFT:
    127     return "<<";
    128   case TOK_RSHIFT:
    129     return ">>";
    130   case TOK_NOT:
    131     return "!";
    132   case TOK_ASSIGN:
    133     return "=";
    134   case TOK_LT:
    135     return "<";
    136   case TOK_GT:
    137     return ">";
    138   case TOK_INC:
    139     return "++";
    140   case TOK_DEC:
    141     return "--";
    142   case TOK_EQ:
    143     return "==";
    144   case TOK_NE:
    145     return "!=";
    146   case TOK_LE:
    147     return "<=";
    148   case TOK_GE:
    149     return ">=";
    150   case TOK_AND:
    151     return "&&";
    152   case TOK_OR:
    153     return "||";
    154   case TOK_MEMBER_POINTER:
    155     return "->";
    156   case TOK_MEMBER:
    157     return ".";
    158   case TOK_COND_DECISION:
    159     return ":";
    160   case TOK_COND:
    161     return "?";
    162   case TOK_ASSIGN_ADD:
    163     return "+=";
    164   case TOK_ASSIGN_SUB:
    165     return "-=";
    166   case TOK_ASSIGN_MUL:
    167     return "*=";
    168   case TOK_ASSIGN_DIV:
    169     return "/=";
    170   case TOK_ASSIGN_MOD:
    171     return "%=";
    172   case TOK_ASSIGN_BITAND:
    173     return "&=";
    174   case TOK_ASSIGN_BITOR:
    175     return "|=";
    176   case TOK_ASSIGN_BITXOR:
    177     return "^=";
    178   case TOK_ASSIGN_LSHIFT:
    179     return "<<=";
    180   case TOK_ASSIGN_RSHIFT:
    181     return ">>=";
    182   case TOK_HASH:
    183     return "#";
    184   case TOK_ID:
    185     return "identifier";
    186   case TOK_TYPEDEF_NAME:
    187     return "typedef name";
    188   case TOK_INTEGER_U32:
    189   case TOK_INTEGER_U64:
    190   case TOK_INTEGER_S32:
    191   case TOK_INTEGER_S64:
    192     return "integer constant";
    193   case TOK_FLOAT_32:
    194   case TOK_FLOAT_64:
    195     return "floating constant";
    196   case TOK_CHAR_CONST:
    197     return "character constant";
    198   case TOK_STRING_ASCII:
    199     return "string constant";
    200   case TOK_EOF:
    201     return "EOF";
    202   case TOK_ERROR:
    203     return "error";
    204   case TOK_LEFT_PAREN:
    205     return "(";
    206   case TOK_RIGHT_PAREN:
    207     return ")";
    208   case TOK_LEFT_BRACKET:
    209     return "[";
    210   case TOK_RIGHT_BRACKET:
    211     return "]";
    212   case TOK_LEFT_BRACE:
    213     return "{";
    214   case TOK_RIGHT_BRACE:
    215     return "}";
    216   case TOK_COMMA:
    217     return ",";
    218   case TOK_SEMICOLON:
    219     return ";";
    220   case TOK_DOT:
    221     return ".";
    222   case TOK_ELLIPSIS:
    223     return "...";
    224   }
    225   return "UNKNOWN";
    226 }
    227 
    228 
    229 void consume(c_token_types kind) {
    230   token_t *token = next_token();
    231   if (token_type(token) != kind) {
    232     fprintf(stderr, "Error: Expected token of type \"%s\", got \"%s\"\n", stringify_type(kind), stringify_type(token_type(token)));
    233     exit(1);
    234   }
    235   token_destroy(token);
    236 }
    237 
    238 void consume_alt(c_token_types *kinds, int n) {
    239   token_t *token = next_token();
    240   for (int i = 0; i < n; i++) {
    241     if (token_type(token) == kinds[i]) {
    242       token_destroy(token);
    243       return;
    244     }
    245   }
    246   fprintf(stderr, "Error: Expected one of the following tokens: ");
    247   for (int i = 0; i < n; i++) {
    248     fprintf(stderr, "\"%s\" ", stringify_type(kinds[i]));
    249   }
    250   fprintf(stderr, "got \"%s\"\n", stringify_type(token_type(token)));
    251   exit(1);
    252 }
    253 
    254 /* Tokenization Function */
    255 char file_name[1024];
    256 /* Warning/Error Functions */
    257 void tok_error(const char *fmt, ...) {
    258   va_list args;
    259   va_start(args, fmt);
    260   fprintf(stderr, "Error in file %s at line %d, column %d: ", file_name, line,
    261           column);
    262   vfprintf(stderr, fmt, args);
    263   va_end(args);
    264 }
    265 
    266 void tok_warn(const char *fmt, ...) {
    267   va_list args;
    268   va_start(args, fmt);
    269   fprintf(stderr, "Warning in file %s at line %d, column %d: ", file_name, line,
    270           column);
    271   vfprintf(stderr, fmt, args);
    272   va_end(args);
    273 }
    274 
    275 /* Skip Whitespace */
    276 static token_t *skip_whitespace(void) {
    277   int c;
    278   while ((c = input_getc()) != EOF) {
    279     if (isspace(c)) { // Whitespace
    280       if (c == '\n') {
    281         line++;
    282         column = 1;
    283       } else {
    284         column++;
    285       }
    286     } else if (c == '#') // GCC preprocessor line control directive.
    287     {
    288       char buf[512];
    289       int i = 0;
    290       while ((c = input_getc()) != EOF && c != '\n') {
    291         buf[i++] = c;
    292         column++;
    293       }
    294       buf[i] = '\0';
    295       if (sscanf(buf, "%d \"%[^\"]\"", &line, file_name) == 2) {
    296         column = 1;
    297       } else {
    298         tok_error("Invalid #line directive\n");
    299       }
    300       if (c == EOF) {
    301         return NULL;
    302       }
    303     } else if (c == '/') { // Comment
    304       c = input_getc();
    305       if (c == '/') {
    306         while ((c = input_getc()) != EOF && c != '\n') {
    307           column++;
    308         }
    309         if (c == EOF) {
    310           return NULL;
    311         }
    312         line++;
    313         column = 1;
    314       } else if (c == '*') { // Multiline comment
    315         while ((c = input_getc()) != EOF) {
    316           if (c == '*') {
    317             c = input_getc();
    318             if (c == '/') {
    319               break;
    320             }
    321           } else if (c == '\n') {
    322             line++;
    323             column = 1;
    324           } else {
    325             column++;
    326           }
    327         }
    328         if (c == EOF) {
    329           return NULL;
    330         }
    331       } else { // Handled here to simplify the code.
    332         if (c == '=')
    333           return token_create(TOK_ASSIGN_DIV, line, column, 2);
    334         input_ungetc(c);
    335         return token_create(TOK_DIV, line, column, 1);
    336       }
    337     } else {
    338       input_ungetc(c);
    339       return NULL;
    340     }
    341   }
    342   return NULL;
    343 }
    344 
    345 /* Tokenize Identifier */
    346 /* Get Keyword */
    347 c_token_types get_keyword(const char *buf, int len) {
    348   switch (buf[0]) {
    349   case 'a':
    350     if (len == 4 && buf[1] == 'u' && buf[2] == 't' && buf[3] == 'o')
    351       return TOK_AUTO;
    352     break;
    353 
    354   case 'b':
    355     if (len == 5 && buf[1] == 'r' && buf[2] == 'e' && buf[3] == 'a' &&
    356         buf[4] == 'k')
    357       return TOK_BREAK;
    358     break;
    359 
    360   case 'c':
    361     switch (buf[1]) {
    362     case 'a':
    363       if (len == 4 && buf[2] == 's' && buf[3] == 'e')
    364         return TOK_CASE;
    365       break;
    366     case 'h':
    367       if (len == 4 && buf[2] == 'a' && buf[3] == 'r')
    368         return TOK_CHAR;
    369       break;
    370     case 'o':
    371       if (len == 5 && buf[2] == 'n' && buf[3] == 's' && buf[4] == 't')
    372         return TOK_CONST;
    373       if (len == 8 && buf[2] == 'n' && buf[3] == 't' && buf[4] == 'i' &&
    374           buf[5] == 'n' && buf[6] == 'u' && buf[7] == 'e')
    375         return TOK_CONTINUE;
    376       break;
    377     }
    378     break;
    379 
    380   case 'd':
    381     switch (buf[1]) {
    382     case 'e':
    383       if (len == 7 && buf[2] == 'f' && buf[3] == 'a' && buf[4] == 'u' &&
    384           buf[5] == 'l' && buf[6] == 't')
    385         return TOK_DEFAULT;
    386       break;
    387     case 'o':
    388       if (len == 2 && buf[2] == '\0')
    389         return TOK_DO;
    390       if (len == 6 && buf[2] == 'u' && buf[3] == 'b' && buf[4] == 'l' &&
    391           buf[5] == 'e')
    392         return TOK_DOUBLE;
    393       break;
    394     }
    395     break;
    396 
    397   case 'e':
    398     switch (buf[1]) {
    399     case 'l':
    400       if (len == 4 && buf[2] == 's' && buf[3] == 'e')
    401         return TOK_ELSE;
    402       break;
    403     case 'n':
    404       if (len == 4 && buf[2] == 'u' && buf[3] == 'm')
    405         return TOK_ENUM;
    406       break;
    407     case 'x':
    408       if (len == 6 && buf[2] == 't' && buf[3] == 'e' && buf[4] == 'r' &&
    409           buf[5] == 'n')
    410         return TOK_EXTERN;
    411       break;
    412     }
    413     break;
    414 
    415   case 'f':
    416     switch (buf[1]) {
    417     case 'l':
    418       if (len == 5 && buf[2] == 'o' && buf[3] == 'a' && buf[4] == 't')
    419         return TOK_FLOAT;
    420       break;
    421     case 'o':
    422       if (len == 3 && buf[2] == 'r')
    423         return TOK_FOR;
    424       break;
    425     }
    426     break;
    427 
    428   case 'g':
    429     if (len == 4 && buf[1] == 'o' && buf[2] == 't' && buf[3] == 'o')
    430       return TOK_GOTO;
    431     break;
    432 
    433   case 'i':
    434     switch (buf[1]) {
    435     case 'f':
    436       if (len == 2 && buf[2] == '\0')
    437         return TOK_IF;
    438       break;
    439     case 'n':
    440       if (len == 3 && buf[2] == 't')
    441         return TOK_INT;
    442       break;
    443     }
    444     break;
    445 
    446   case 'l':
    447     if (len == 4 && buf[1] == 'o' && buf[2] == 'n' && buf[3] == 'g')
    448       return TOK_LONG;
    449     break;
    450 
    451   case 'r':
    452     switch (buf[1]) {
    453     case 'e':
    454       if (len == 8 && buf[2] == 'g' && buf[3] == 'i' && buf[4] == 's' &&
    455           buf[5] == 't' && buf[6] == 'e' && buf[7] == 'r')
    456         return TOK_REGISTER;
    457       if (len == 6 && buf[2] == 't' && buf[3] == 'u' && buf[4] == 'r' &&
    458           buf[5] == 'n')
    459         return TOK_RETURN;
    460       break;
    461     }
    462     break;
    463 
    464   case 's':
    465     switch (buf[1]) {
    466     case 'h':
    467       if (len == 5 && buf[2] == 'o' && buf[3] == 'r' && buf[4] == 't')
    468         return TOK_SHORT;
    469       break;
    470     case 't':
    471       if (len == 6 && buf[2] == 'a' && buf[3] == 't' && buf[4] == 'i' &&
    472           buf[5] == 'c')
    473         return TOK_STATIC;
    474       break;
    475     case 'i':
    476       if (len == 6 && buf[2] == 'g' && buf[3] == 'n' && buf[4] == 'e' &&
    477           buf[5] == 'd')
    478         return TOK_SIGNED;
    479       if (len == 6 && buf[2] == 'z' && buf[3] == 'e' && buf[4] == 'o' &&
    480           buf[5] == 'f')
    481         return TOK_SIZEOF;
    482       break;
    483     case 'r':
    484       if (len == 6 && buf[2] == 'u' && buf[3] == 'c' && buf[4] == 't')
    485         return TOK_STRUCT;
    486       break;
    487     case 'w':
    488       if (len == 6 && buf[2] == 'i' && buf[3] == 't' && buf[4] == 'c' &&
    489           buf[5] == 'h')
    490         return TOK_SWITCH;
    491       break;
    492     }
    493     break;
    494 
    495   case 't':
    496     if (len == 7 && buf[1] == 'y' && buf[2] == 'p' && buf[3] == 'e' &&
    497         buf[4] == 'd' && buf[5] == 'e' && buf[6] == 'f')
    498       return TOK_TYPEDEF;
    499     break;
    500 
    501   case 'u':
    502     switch (buf[1]) {
    503     case 'n':
    504       if (len == 5 && buf[2] == 'i' && buf[3] == 'o' && buf[4] == 'n')
    505         return TOK_UNION;
    506       if (len == 8 && buf[2] == 's' && buf[3] == 'i' && buf[4] == 'g' &&
    507           buf[5] == 'n' && buf[6] == 'e' && buf[7] == 'd')
    508         return TOK_UNSIGNED;
    509       break;
    510     }
    511     break;
    512 
    513   case 'v':
    514     switch (buf[1]) {
    515     case 'o':
    516       if (len == 4 && buf[2] == 'i' && buf[3] == 'd')
    517         return TOK_VOID;
    518       if (len == 8 && buf[2] == 'l' && buf[3] == 'a' && buf[4] == 't' &&
    519           buf[5] == 'i' && buf[6] == 'l' && buf[7] == 'e')
    520         return TOK_VOLATILE;
    521       break;
    522     }
    523     break;
    524 
    525   case 'w':
    526     if (len == 5 && buf[1] == 'h' && buf[2] == 'i' && buf[3] == 'l' &&
    527         buf[4] == 'e')
    528       return TOK_WHILE;
    529     break;
    530 
    531   default:
    532     return TOK_ID;
    533   }
    534   return TOK_ID;
    535 }
    536 
    537 static token_t *read_identifier(void) {
    538   int c;
    539   char buf[1024];
    540   int i = 0;
    541   c = input_getc();
    542   if (!isalpha(c) && c != '_') {
    543     input_ungetc(c);
    544     return NULL;
    545   }
    546   buf[i++] = c;
    547   while ((c = input_getc()) != EOF) {
    548     if (!isalnum(c) && c != '_') {
    549       input_ungetc(c);
    550       break;
    551     }
    552     buf[i++] = c;
    553     if (i >= 1008) {
    554       tok_error("Identifier too long\n");
    555       exit(1);
    556     }
    557   }
    558   buf[i] = '\0';
    559   column += i;
    560   // Check if it's a keyword
    561   c_token_types kind = get_keyword(buf, i);
    562   if (kind != TOK_ID) {
    563     return token_create(kind, line, column, i);
    564   }
    565   // Check if it's a typedef
    566   if (hash_table_get(typedef_table, buf) != NULL) {
    567     return token_create(TOK_TYPEDEF_NAME, line, column, i);
    568   }
    569   return token_create_string(kind, line, column, buf, i);
    570 }
    571 
    572 /* Tokenize Number */
    573 static token_t *read_number(void) {
    574   int c;
    575   char buf[1024];
    576   int i = 0;
    577   c = input_getc();
    578   /* Check for valid prefix */
    579    // If we don't have a digit or decimal point, it's not a number
    580     if (!isdigit(c) && c != '.') {
    581       input_ungetc(c);
    582       return NULL;
    583     }
    584     // Decimal point followed by non-digit is a struct member
    585     if (c == '.') {
    586       char cnext = input_getc();
    587       if (!isdigit(cnext)) {
    588         input_ungetc(cnext);
    589         return token_create(TOK_MEMBER, line, column, 1);
    590       }
    591       input_ungetc(cnext);
    592     }
    593 
    594   int radix = 10;
    595   /* Process Radix */
    596     // Check for hex and octal.
    597     if (c == '0') {
    598       char cnext = input_getc();
    599       if (cnext == 'x' || cnext == 'X') {
    600         // Hex, discard the 0x
    601         radix = 16;
    602       } else {
    603         // Octal, discard the 0
    604         input_ungetc(cnext);
    605         radix = 8;
    606       }
    607     } else {
    608       // Decimal, append the first digit
    609       buf[i++] = c;
    610     }
    611 
    612   int is_float = 0;
    613   /* Read Number Loop */
    614      while ((c = input_getc()) != EOF) {
    615       // Since there can be multiple writes to the buffer, we want to make sure we
    616       // don't overflow by giving a 4 byte pad
    617       if (i > 1020) {
    618         tok_error("Number too long\n");
    619         return NULL;
    620       }
    621       // Valid digits for the radix: 0-9 for decimal, 0-7 for octal, 0-9 and
    622       // a-f/A-F for hex
    623       if ((radix == 10 && isdigit(c)) || (radix == 16 && isxdigit(c)) ||
    624           (radix == 8 && c >= '0' && c <= '7')) {
    625         buf[i++] = c;
    626         // Decimal point and not a float yet, must be a float
    627       } else if (c == '.' && !is_float) {
    628         is_float = 1;
    629         if (radix != 10) {
    630           tok_error("Invalid floating point constant, expected decimal, got %s\n",
    631                     radix == 16 ? "hexadecimal" : "octal");
    632           return NULL;
    633         }
    634         buf[i++] = c;
    635       }
    636       // Exponent on the end of a constant. (By standard this forces it to be a
    637       // float)
    638       else if (c == 'e' || c == 'E') {
    639         buf[i++] = c;
    640         c = input_getc();
    641         // Sign on the exponent
    642         if (c == '+' || c == '-') {
    643           buf[i++] = c;
    644           c = input_getc();
    645         }
    646         // Exponent must be a digit, I.E no 1e1.2
    647         if (!isdigit(c)) {
    648           tok_error("Invalid floating point exponent\n");
    649           return NULL;
    650         }
    651         buf[i++] = c;
    652         is_float = 1;
    653       } else {
    654         // Reached the end, unget the character so other functions can read it
    655         input_ungetc(c);
    656         break;
    657       }
    658     }
    659 
    660   buf[i] = '\0';
    661   /* Process Suffixes */
    662     int is_unsigned = 0;
    663     int is_long = 0;
    664     int is_single = 0;
    665     while (1) {
    666       c = input_getc();
    667       if (c == 'u' || c == 'U') {
    668         if (is_unsigned) {
    669           tok_warn(
    670               "Warning: Duplicate suffix 'u' for integer constant ignored\n");
    671         }
    672         is_unsigned = 1;
    673       } else if (c == 'l' || c == 'L') {
    674         if (is_long) {
    675           tok_warn(
    676               "Warning: Duplicate suffix 'l' for integer constant ignored\n");
    677         }
    678         is_long = 1;
    679       } else if (c == 'f' || c == 'F') {
    680         if (is_single) {
    681           tok_warn("Warning: Duplicate suffix 'f' for floating point constant "
    682                    "ignored\n");
    683         }
    684         is_single = 1;
    685       } else {
    686         input_ungetc(c);
    687         break;
    688       }
    689     }
    690 
    691   /* Check for conflicting suffixes */
    692       if (is_single && is_long) {
    693       tok_warn("Warning: Invalid suffixes 'l' and 'f' for floating point "
    694                "constant. Ignoring 'l'\n");
    695       is_long = 0;
    696     }
    697     if (is_single && is_unsigned) {
    698       tok_warn("Warning: Invalid suffixes 'u' and 'f' for floating point "
    699                "constant. Ignoring 'u'\n");
    700       is_unsigned = 0;
    701     }
    702     if (is_single && !is_float) {
    703       tok_warn(
    704           "Warning: Invalid suffix 'f' for integer constant. Ignoring 'f'\n");
    705       is_single = 0;
    706     }
    707 
    708   if (is_float) {
    709     /* Convert to float */
    710         errno = 0;
    711         // Strtod generates a unix-style error when it's given something out of
    712         // range, so we want to get on top of that quickly instead of ignoring it
    713         // That way we can avoid some nasty NAN-propagation in the constant folder.
    714         double f = strtod(buf, NULL);
    715         if (errno == ERANGE) {
    716           tok_error("Floating point constant out of range\n");
    717           return NULL;
    718         }
    719         // Warn if the constant is out of range for a float, I.E it's too big or too
    720         // small
    721         if (is_single && (f < FLT_MIN || f > FLT_MAX)) {
    722           tok_warn(
    723               "Warning: Floating point constant %f is out of range for float\n", f);
    724         }
    725         // Warn if the constant is too precise for a float
    726         if (is_single && fabs((double)((float)f) - f) >= FLT_EPSILON) {
    727           tok_warn("Warning: Converting double precision floating point constant "
    728                    "%f to float loses "
    729                    "precision\n",
    730                    f);
    731         }
    732         return token_create_float(is_single ? TOK_FLOAT_32
    733                                             : TOK_FLOAT_64,
    734                                   line, column, f, i);
    735 
    736   } else {
    737     /* Convert to integer */
    738         errno = 0;
    739         uint64_t int_ = strtoull(buf, NULL, radix);
    740         // Same as above, but for integers
    741         if (errno == ERANGE) {
    742           tok_error("Integer constant out of range\n");
    743           return NULL;
    744         }
    745         if (is_unsigned) {
    746           if (is_long) {
    747             return token_create_int(TOK_INTEGER_U64, line, column, int_, i);
    748           } else {
    749             if (int_ > UINT32_MAX) {
    750               tok_warn(
    751                   "Warning: Integer constant %lld is out of range for unsigned "
    752                   "int\n",
    753                   int_);
    754             }
    755             return token_create_int(TOK_INTEGER_U32, line, column, int_, i);
    756           }
    757         } else {
    758           if (is_long) {
    759             // If the highest bit is set, that means this will overflow a signed
    760             // long (Due to two's complement)
    761             if (int_ & (1UL << 63)) {
    762               tok_warn(
    763                   "Warning: Integer constant %lld is out of range for long long\n",
    764                   i);
    765             }
    766             return token_create_int(TOK_INTEGER_S64, line, column, int_, i);
    767           } else {
    768             if (int_ & (1UL << 31)) {
    769               tok_warn("Warning: Integer constant %lld is out of range for int\n",
    770                        int_);
    771             }
    772             return token_create_int(TOK_INTEGER_S32, line, column, int_, i);
    773           }
    774         }
    775 
    776   }
    777   return NULL;
    778 }
    779 
    780 /* Tokenize String */
    781 /* Read Escape Sequence */
    782 static char read_escape_sequence(int *len) {
    783   int c = input_getc();
    784   *len += 1;
    785   switch (c) {
    786   case 'a':
    787     return '\a';
    788   case 'b':
    789     return '\b';
    790   case 'f':
    791     return '\f';
    792   case 'n':
    793     return '\n';
    794   case 'r':
    795     return '\r';
    796   case 't':
    797     return '\t';
    798   case 'v':
    799     return '\v';
    800   case '\'':
    801     return '\'';
    802   case '"':
    803     return '"';
    804   case '?':
    805     return '?';
    806   case '\\':
    807     return '\\';
    808   case '0':
    809     return '\0';
    810   case 'x': {
    811     c = input_getc();
    812     if (!isxdigit(c)) {
    813       tok_error("Invalid hexadecimal escape sequence\n");
    814       return 0;
    815     }
    816     int val = 0;
    817     while (isxdigit(c)) {
    818       *len += 1;
    819       val = val * 16 + (isdigit(c) ? c - '0' : tolower(c) - 'a' + 10);
    820       c = input_getc();
    821     }
    822     input_ungetc(c);
    823     return (char)val;
    824   }
    825   default:
    826     if (!isdigit(c)) {
    827       tok_error("Invalid escape sequence\n");
    828       return 0;
    829     }
    830     int val = 0;
    831     while (isdigit(c)) {
    832       *len += 1;
    833       val = val * 8 + c - '0';
    834       c = input_getc();
    835     }
    836     input_ungetc(c);
    837     return (char)val;
    838   }
    839 }
    840 
    841 static token_t *read_string_literal(void) {
    842   int c;
    843   c = input_getc();
    844   if (c != '"') {
    845     input_ungetc(c);
    846     return NULL;
    847   }
    848   int i = 0;
    849   char s_buf[512];
    850   char *buf = s_buf;
    851   int len = 512;
    852   int esc_pad = 0;
    853   while ((c = input_getc()) != EOF) {
    854     if (c == '"') {
    855       // Implicit skip of closing quote
    856       break;
    857     }
    858     if (c == '\\') {
    859       c = read_escape_sequence(&esc_pad);
    860       if (c == 0) {
    861         return NULL;
    862       }
    863     }
    864     if (i >= len) {
    865       if (buf == s_buf) {
    866         buf = malloc(1024);
    867         if (buf == NULL) {
    868           fputs("Out of memory. Could not parse string literal.\n", stderr);
    869           exit(1);
    870         }
    871         memcpy(buf, s_buf, 512);
    872         len *= 2;
    873       } else {
    874         len *= 2;
    875         buf = realloc(buf, len);
    876       }
    877     }
    878     buf[i++] = c;
    879   }
    880   buf[i] = '\0';
    881   if (c == EOF) {
    882     tok_error("Unterminated string literal\n");
    883     if (buf != s_buf) {
    884       free(buf);
    885     }
    886     return NULL;
    887   }
    888 
    889   token_t *tok = token_create_string(TOK_STRING_ASCII, line, column, buf,
    890                                      i + esc_pad + 2);
    891   if (buf != s_buf) {
    892     free(buf);
    893   }
    894   return tok;
    895 }
    896 
    897 /* Tokenize Character */
    898 static token_t *read_char_constant(void) {
    899   int c;
    900   int len = 0;
    901   c = input_getc();
    902   if (c != '\'') {
    903     input_ungetc(c);
    904     return NULL;
    905   }
    906   len++;
    907   c = input_getc();
    908   if (c == '\'') {
    909     tok_error("Empty character constant\n");
    910     return NULL;
    911   }
    912   if (c == '\\') {
    913     c = read_escape_sequence(&len);
    914   }
    915   int val = c;
    916   c = input_getc();
    917   if (c != '\'') {
    918     tok_error("Expected closing quote for character constant\n");
    919     return NULL;
    920   }
    921   len++;
    922   return token_create_char(TOK_CHAR_CONST, line, column, val, len);
    923 }
    924 
    925 /* Tokenize Operator */
    926 
    927 token_t *read_operator(void) {
    928   int c;
    929   c = input_getc();
    930   switch (c) {
    931   case '!': {
    932     c = input_getc();
    933     if (c == '=')
    934       return token_create(TOK_NE, line, column, 2);
    935     input_ungetc(c);
    936     return token_create(TOK_NOT, line, column, 1);
    937   }
    938   case '%': {
    939     c = input_getc();
    940     if (c == '=')
    941       return token_create(TOK_ASSIGN_MOD, line, column, 2);
    942     input_ungetc(c);
    943     return token_create(TOK_MOD, line, column, 1);
    944   }
    945   case '&': {
    946     c = input_getc();
    947     if (c == '&')
    948       return token_create(TOK_AND, line, column, 2);
    949     if (c == '=')
    950       return token_create(TOK_ASSIGN_BITAND, line, column, 2);
    951     input_ungetc(c);
    952     return token_create(TOK_BIT_AND, line, column, 1);
    953   }
    954   case '(':
    955     return token_create(TOK_LEFT_PAREN, line, column, 1);
    956   case ')':
    957     return token_create(TOK_RIGHT_PAREN, line, column, 1);
    958   case '*': {
    959     c = input_getc();
    960     if (c == '=')
    961       return token_create(TOK_ASSIGN_MUL, line, column, 2);
    962     input_ungetc(c);
    963     return token_create(TOK_MUL, line, column, 1);
    964   }
    965   case '+': {
    966     c = input_getc();
    967     if (c == '+')
    968       return token_create(TOK_INC, line, column, 2);
    969     if (c == '=')
    970       return token_create(TOK_ASSIGN_ADD, line, column, 2);
    971     input_ungetc(c);
    972     return token_create(TOK_ADD, line, column, 2);
    973   }
    974   case ',':
    975     return token_create(TOK_COMMA, line, column, 1);
    976   case '-': {
    977     c = input_getc();
    978     if (c == '-')
    979       return token_create(TOK_DEC, line, column, 2);
    980     if (c == '=')
    981       return token_create(TOK_ASSIGN_SUB, line, column, 2);
    982     if (c == '>')
    983       return token_create(TOK_MEMBER_POINTER, line, column, 2);
    984     input_ungetc(c);
    985     return token_create(TOK_SUB, line, column, 1);
    986   }
    987   case '.': {
    988     c = input_getc();
    989     if (c == '.') {
    990       c = input_getc();
    991       if (c == '.') {
    992         return token_create(TOK_ELLIPSIS, line, column, 3);
    993       } else {
    994         // Bail out, can't store more than one unget
    995         tok_error("Unexpected character '.' at line %d, column %d\n", line,
    996                   column);
    997         exit(1);
    998       }
    999     }
   1000     return token_create('.', line, column, 1);
   1001   }
   1002   case '/': {
   1003     c = input_getc();
   1004     if (c == '=')
   1005       return token_create(TOK_ASSIGN_DIV, line, column, 2);
   1006     input_ungetc(c);
   1007     return token_create(TOK_DIV, line, column, 1);
   1008   }
   1009   case ':':
   1010     return token_create(TOK_COND_DECISION, line, column, 1);
   1011   case ';':
   1012     return token_create(TOK_SEMICOLON, line, column, 1);
   1013   case '<': {
   1014     c = input_getc();
   1015     if (c == '<') {
   1016       c = input_getc();
   1017       if (c == '=')
   1018         return token_create(TOK_ASSIGN_LSHIFT, line, column, 3);
   1019       input_ungetc(c);
   1020       return token_create(TOK_LSHIFT, line, column, 2);
   1021     }
   1022     if (c == '=')
   1023       return token_create(TOK_LE, line, column, 2);
   1024     input_ungetc(c);
   1025     return token_create(TOK_LT, line, column, 1);
   1026   }
   1027   case '=': {
   1028     c = input_getc();
   1029     if (c == '=')
   1030       return token_create(TOK_ASSIGN, line, column, 2);
   1031     input_ungetc(c);
   1032     return token_create(TOK_ASSIGN, line, column, 1);
   1033   }
   1034   case '>': {
   1035     c = input_getc();
   1036     if (c == '>') {
   1037       c = input_getc();
   1038       if (c == '=')
   1039         return token_create(TOK_ASSIGN_RSHIFT, line, column, 3);
   1040       input_ungetc(c);
   1041       return token_create(TOK_RSHIFT, line, column, 2);
   1042     }
   1043     if (c == '=')
   1044       return token_create(TOK_GE, line, column, 2);
   1045     input_ungetc(c);
   1046     return token_create(TOK_GT, line, column, 1);
   1047   }
   1048   case '?':
   1049     return token_create(TOK_COND, line, column, 1);
   1050   case '[':
   1051     return token_create(TOK_LEFT_BRACKET, line, column, 1);
   1052   case ']':
   1053     return token_create(TOK_RIGHT_BRACKET, line, column, 1);
   1054   case '^': {
   1055     c = input_getc();
   1056     if (c == '=')
   1057       return token_create(TOK_ASSIGN_BITXOR, line, column, 2);
   1058     input_ungetc(c);
   1059     return token_create(TOK_BIT_XOR, line, column, 1);
   1060   }
   1061   case '{':
   1062     return token_create(TOK_LEFT_BRACE, line, column, 1);
   1063   case '|': {
   1064     c = input_getc();
   1065     if (c == '|')
   1066       return token_create(TOK_OR, line, column, 2);
   1067     if (c == '=')
   1068       return token_create(TOK_ASSIGN_BITOR, line, column, 2);
   1069     input_ungetc(c);
   1070     return token_create(TOK_BIT_OR, line, column, 1);
   1071   }
   1072   case '}':
   1073     return token_create(TOK_RIGHT_BRACE, line, column, 1);
   1074   case '~':
   1075     return token_create(TOK_BIT_NOT, line, column, 1);
   1076   default:
   1077     input_ungetc(c);
   1078     return NULL;
   1079   }
   1080 
   1081   return NULL;
   1082 }
   1083 
   1084 token_t *next_token(void) {
   1085   if (left_stack_pos > 0) {
   1086     return left_stack[--left_stack_pos];
   1087   }
   1088   token_t *tok = skip_whitespace();
   1089   if (tok != NULL) {
   1090     return tok;
   1091   }
   1092   tok = read_identifier();
   1093   if (tok != NULL) {
   1094     return tok;
   1095   }
   1096   tok = read_number();
   1097   if (tok != NULL) {
   1098     return tok;
   1099   }
   1100   tok = read_char_constant();
   1101   if (tok != NULL) {
   1102     return tok;
   1103   }
   1104   tok = read_string_literal();
   1105   if (tok != NULL) {
   1106     return tok;
   1107   }
   1108   tok = read_operator();
   1109   if (tok != NULL) {
   1110     return tok;
   1111   }
   1112   int c = input_getc();
   1113   if (c == EOF) {
   1114     return NULL;
   1115   }
   1116   tok_warn(
   1117       "Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c,
   1118       line, column);
   1119   return next_token();
   1120 }
   1121 
   1122 #ifdef TEST_TOKENIZER
   1123 /* Run Test */
   1124 char *preprocess(char *in) {
   1125   char *output_name = malloc(1024);
   1126   snprintf(output_name, 1024, "%s.preprocessed", in);
   1127   char *command = malloc(2048);
   1128   snprintf(command, 2048, "gcc -E -xc %s -o %s", in, output_name);
   1129   system(command);
   1130   free(command);
   1131   return output_name;
   1132 }
   1133 
   1134 // Tokenize the input file
   1135 int main(int argc, char **argv) {
   1136   if (argc != 2) {
   1137     fprintf(stderr, "Usage: %s <input.c>\n", argv[0]);
   1138     return 1;
   1139   }
   1140   char *input_name = argv[1];
   1141   char *preprocessed = preprocess(input_name);
   1142   init_tokenizer(preprocessed);
   1143   token_t *tok;
   1144   while ((tok = next_token()) != NULL) {
   1145     print_token(tok);
   1146     token_destroy(tok);
   1147   }
   1148   destroy_tokenizer();
   1149   remove(preprocessed);
   1150   free(preprocessed);
   1151   hash_table_destroy(string_table);
   1152   return 0;
   1153 }
   1154 
   1155 #endif
   1156 
   1157
	website Website contents
	git clone git://git.reagancfischer.dev/website.git
	Log \| Files \| Refs