tokenizer.c (28357B)
1 /* tokenizer.c */ 2 #include <ctype.h> 3 #include <errno.h> 4 #include <float.h> 5 #include <math.h> 6 #include <stdarg.h> 7 #include <stdint.h> 8 #include <stdio.h> 9 #include <stdlib.h> 10 #include <string.h> 11 #include "tokenizer.h" 12 #include "token.h" 13 #include "input.h" 14 #include "util.h" 15 token_t *left_stack[8]; 16 int left_stack_pos = 0; 17 hash_table_t *typedef_table = NULL; 18 /* Utility Functions */ 19 void init_tokenizer(const char *filename) { 20 input_init(filename); 21 typedef_table = hash_table_create(16, cmp_string, hash_string, dtor_string); 22 } 23 24 void destroy_tokenizer(void) { 25 input_destroy(); 26 } 27 28 void reject_token(token_t *token) { 29 left_stack[left_stack_pos++] = token; 30 } 31 32 token_t *peek_token(void) { 33 if (left_stack_pos > 0) { 34 return left_stack[left_stack_pos - 1]; 35 } 36 token_t *token = next_token(); 37 reject_token(token); 38 return token; 39 } 40 41 /* Stringify Type */ 42 const char *stringify_type(c_token_types type) { 43 switch (type) { 44 case TOK_IF: 45 return "if"; 46 case TOK_ELSE: 47 return "else"; 48 case TOK_SWITCH: 49 return "switch"; 50 case TOK_CASE: 51 return "case"; 52 case TOK_DEFAULT: 53 return "default"; 54 case TOK_WHILE: 55 return "while"; 56 case TOK_DO: 57 return "do"; 58 case TOK_FOR: 59 return "for"; 60 case TOK_CONTINUE: 61 return "continue"; 62 case TOK_BREAK: 63 return "break"; 64 case TOK_RETURN: 65 return "return"; 66 case TOK_GOTO: 67 return "goto"; 68 case TOK_VOID: 69 return "void"; 70 case TOK_CHAR: 71 return "char"; 72 case TOK_SHORT: 73 return "short"; 74 case TOK_INT: 75 return "int"; 76 case TOK_LONG: 77 return "long"; 78 case TOK_FLOAT: 79 return "float"; 80 case TOK_DOUBLE: 81 return "double"; 82 case TOK_SIGNED: 83 return "signed"; 84 case TOK_UNSIGNED: 85 return "unsigned"; 86 case TOK_STRUCT: 87 return "struct"; 88 case TOK_UNION: 89 return "union"; 90 case TOK_ENUM: 91 return "enum"; 92 case TOK_TYPEDEF: 93 return "typedef"; 94 case TOK_AUTO: 95 return "auto"; 96 case TOK_REGISTER: 97 return "register"; 98 case TOK_STATIC: 99 return "static"; 100 case TOK_EXTERN: 101 return "extern"; 102 case TOK_CONST: 103 return "const"; 104 case TOK_VOLATILE: 105 return "volatile"; 106 case TOK_SIZEOF: 107 return "sizeof"; 108 case TOK_ADD: 109 return "+"; 110 case TOK_SUB: 111 return "-"; 112 case TOK_MUL: 113 return "*"; 114 case TOK_DIV: 115 return "/"; 116 case TOK_MOD: 117 return "%"; 118 case TOK_BIT_AND: 119 return "&"; 120 case TOK_BIT_OR: 121 return "|"; 122 case TOK_BIT_XOR: 123 return "^"; 124 case TOK_BIT_NOT: 125 return "~"; 126 case TOK_LSHIFT: 127 return "<<"; 128 case TOK_RSHIFT: 129 return ">>"; 130 case TOK_NOT: 131 return "!"; 132 case TOK_ASSIGN: 133 return "="; 134 case TOK_LT: 135 return "<"; 136 case TOK_GT: 137 return ">"; 138 case TOK_INC: 139 return "++"; 140 case TOK_DEC: 141 return "--"; 142 case TOK_EQ: 143 return "=="; 144 case TOK_NE: 145 return "!="; 146 case TOK_LE: 147 return "<="; 148 case TOK_GE: 149 return ">="; 150 case TOK_AND: 151 return "&&"; 152 case TOK_OR: 153 return "||"; 154 case TOK_MEMBER_POINTER: 155 return "->"; 156 case TOK_MEMBER: 157 return "."; 158 case TOK_COND_DECISION: 159 return ":"; 160 case TOK_COND: 161 return "?"; 162 case TOK_ASSIGN_ADD: 163 return "+="; 164 case TOK_ASSIGN_SUB: 165 return "-="; 166 case TOK_ASSIGN_MUL: 167 return "*="; 168 case TOK_ASSIGN_DIV: 169 return "/="; 170 case TOK_ASSIGN_MOD: 171 return "%="; 172 case TOK_ASSIGN_BITAND: 173 return "&="; 174 case TOK_ASSIGN_BITOR: 175 return "|="; 176 case TOK_ASSIGN_BITXOR: 177 return "^="; 178 case TOK_ASSIGN_LSHIFT: 179 return "<<="; 180 case TOK_ASSIGN_RSHIFT: 181 return ">>="; 182 case TOK_HASH: 183 return "#"; 184 case TOK_ID: 185 return "identifier"; 186 case TOK_TYPEDEF_NAME: 187 return "typedef name"; 188 case TOK_INTEGER_U32: 189 case TOK_INTEGER_U64: 190 case TOK_INTEGER_S32: 191 case TOK_INTEGER_S64: 192 return "integer constant"; 193 case TOK_FLOAT_32: 194 case TOK_FLOAT_64: 195 return "floating constant"; 196 case TOK_CHAR_CONST: 197 return "character constant"; 198 case TOK_STRING_ASCII: 199 return "string constant"; 200 case TOK_EOF: 201 return "EOF"; 202 case TOK_ERROR: 203 return "error"; 204 case TOK_LEFT_PAREN: 205 return "("; 206 case TOK_RIGHT_PAREN: 207 return ")"; 208 case TOK_LEFT_BRACKET: 209 return "["; 210 case TOK_RIGHT_BRACKET: 211 return "]"; 212 case TOK_LEFT_BRACE: 213 return "{"; 214 case TOK_RIGHT_BRACE: 215 return "}"; 216 case TOK_COMMA: 217 return ","; 218 case TOK_SEMICOLON: 219 return ";"; 220 case TOK_DOT: 221 return "."; 222 case TOK_ELLIPSIS: 223 return "..."; 224 } 225 return "UNKNOWN"; 226 } 227 228 229 void consume(c_token_types kind) { 230 token_t *token = next_token(); 231 if (token_type(token) != kind) { 232 fprintf(stderr, "Error: Expected token of type \"%s\", got \"%s\"\n", stringify_type(kind), stringify_type(token_type(token))); 233 exit(1); 234 } 235 token_destroy(token); 236 } 237 238 void consume_alt(c_token_types *kinds, int n) { 239 token_t *token = next_token(); 240 for (int i = 0; i < n; i++) { 241 if (token_type(token) == kinds[i]) { 242 token_destroy(token); 243 return; 244 } 245 } 246 fprintf(stderr, "Error: Expected one of the following tokens: "); 247 for (int i = 0; i < n; i++) { 248 fprintf(stderr, "\"%s\" ", stringify_type(kinds[i])); 249 } 250 fprintf(stderr, "got \"%s\"\n", stringify_type(token_type(token))); 251 exit(1); 252 } 253 254 /* Tokenization Function */ 255 char file_name[1024]; 256 /* Warning/Error Functions */ 257 void tok_error(const char *fmt, ...) { 258 va_list args; 259 va_start(args, fmt); 260 fprintf(stderr, "Error in file %s at line %d, column %d: ", file_name, line, 261 column); 262 vfprintf(stderr, fmt, args); 263 va_end(args); 264 } 265 266 void tok_warn(const char *fmt, ...) { 267 va_list args; 268 va_start(args, fmt); 269 fprintf(stderr, "Warning in file %s at line %d, column %d: ", file_name, line, 270 column); 271 vfprintf(stderr, fmt, args); 272 va_end(args); 273 } 274 275 /* Skip Whitespace */ 276 static token_t *skip_whitespace(void) { 277 int c; 278 while ((c = input_getc()) != EOF) { 279 if (isspace(c)) { // Whitespace 280 if (c == '\n') { 281 line++; 282 column = 1; 283 } else { 284 column++; 285 } 286 } else if (c == '#') // GCC preprocessor line control directive. 287 { 288 char buf[512]; 289 int i = 0; 290 while ((c = input_getc()) != EOF && c != '\n') { 291 buf[i++] = c; 292 column++; 293 } 294 buf[i] = '\0'; 295 if (sscanf(buf, "%d \"%[^\"]\"", &line, file_name) == 2) { 296 column = 1; 297 } else { 298 tok_error("Invalid #line directive\n"); 299 } 300 if (c == EOF) { 301 return NULL; 302 } 303 } else if (c == '/') { // Comment 304 c = input_getc(); 305 if (c == '/') { 306 while ((c = input_getc()) != EOF && c != '\n') { 307 column++; 308 } 309 if (c == EOF) { 310 return NULL; 311 } 312 line++; 313 column = 1; 314 } else if (c == '*') { // Multiline comment 315 while ((c = input_getc()) != EOF) { 316 if (c == '*') { 317 c = input_getc(); 318 if (c == '/') { 319 break; 320 } 321 } else if (c == '\n') { 322 line++; 323 column = 1; 324 } else { 325 column++; 326 } 327 } 328 if (c == EOF) { 329 return NULL; 330 } 331 } else { // Handled here to simplify the code. 332 if (c == '=') 333 return token_create(TOK_ASSIGN_DIV, line, column, 2); 334 input_ungetc(c); 335 return token_create(TOK_DIV, line, column, 1); 336 } 337 } else { 338 input_ungetc(c); 339 return NULL; 340 } 341 } 342 return NULL; 343 } 344 345 /* Tokenize Identifier */ 346 /* Get Keyword */ 347 c_token_types get_keyword(const char *buf, int len) { 348 switch (buf[0]) { 349 case 'a': 350 if (len == 4 && buf[1] == 'u' && buf[2] == 't' && buf[3] == 'o') 351 return TOK_AUTO; 352 break; 353 354 case 'b': 355 if (len == 5 && buf[1] == 'r' && buf[2] == 'e' && buf[3] == 'a' && 356 buf[4] == 'k') 357 return TOK_BREAK; 358 break; 359 360 case 'c': 361 switch (buf[1]) { 362 case 'a': 363 if (len == 4 && buf[2] == 's' && buf[3] == 'e') 364 return TOK_CASE; 365 break; 366 case 'h': 367 if (len == 4 && buf[2] == 'a' && buf[3] == 'r') 368 return TOK_CHAR; 369 break; 370 case 'o': 371 if (len == 5 && buf[2] == 'n' && buf[3] == 's' && buf[4] == 't') 372 return TOK_CONST; 373 if (len == 8 && buf[2] == 'n' && buf[3] == 't' && buf[4] == 'i' && 374 buf[5] == 'n' && buf[6] == 'u' && buf[7] == 'e') 375 return TOK_CONTINUE; 376 break; 377 } 378 break; 379 380 case 'd': 381 switch (buf[1]) { 382 case 'e': 383 if (len == 7 && buf[2] == 'f' && buf[3] == 'a' && buf[4] == 'u' && 384 buf[5] == 'l' && buf[6] == 't') 385 return TOK_DEFAULT; 386 break; 387 case 'o': 388 if (len == 2 && buf[2] == '\0') 389 return TOK_DO; 390 if (len == 6 && buf[2] == 'u' && buf[3] == 'b' && buf[4] == 'l' && 391 buf[5] == 'e') 392 return TOK_DOUBLE; 393 break; 394 } 395 break; 396 397 case 'e': 398 switch (buf[1]) { 399 case 'l': 400 if (len == 4 && buf[2] == 's' && buf[3] == 'e') 401 return TOK_ELSE; 402 break; 403 case 'n': 404 if (len == 4 && buf[2] == 'u' && buf[3] == 'm') 405 return TOK_ENUM; 406 break; 407 case 'x': 408 if (len == 6 && buf[2] == 't' && buf[3] == 'e' && buf[4] == 'r' && 409 buf[5] == 'n') 410 return TOK_EXTERN; 411 break; 412 } 413 break; 414 415 case 'f': 416 switch (buf[1]) { 417 case 'l': 418 if (len == 5 && buf[2] == 'o' && buf[3] == 'a' && buf[4] == 't') 419 return TOK_FLOAT; 420 break; 421 case 'o': 422 if (len == 3 && buf[2] == 'r') 423 return TOK_FOR; 424 break; 425 } 426 break; 427 428 case 'g': 429 if (len == 4 && buf[1] == 'o' && buf[2] == 't' && buf[3] == 'o') 430 return TOK_GOTO; 431 break; 432 433 case 'i': 434 switch (buf[1]) { 435 case 'f': 436 if (len == 2 && buf[2] == '\0') 437 return TOK_IF; 438 break; 439 case 'n': 440 if (len == 3 && buf[2] == 't') 441 return TOK_INT; 442 break; 443 } 444 break; 445 446 case 'l': 447 if (len == 4 && buf[1] == 'o' && buf[2] == 'n' && buf[3] == 'g') 448 return TOK_LONG; 449 break; 450 451 case 'r': 452 switch (buf[1]) { 453 case 'e': 454 if (len == 8 && buf[2] == 'g' && buf[3] == 'i' && buf[4] == 's' && 455 buf[5] == 't' && buf[6] == 'e' && buf[7] == 'r') 456 return TOK_REGISTER; 457 if (len == 6 && buf[2] == 't' && buf[3] == 'u' && buf[4] == 'r' && 458 buf[5] == 'n') 459 return TOK_RETURN; 460 break; 461 } 462 break; 463 464 case 's': 465 switch (buf[1]) { 466 case 'h': 467 if (len == 5 && buf[2] == 'o' && buf[3] == 'r' && buf[4] == 't') 468 return TOK_SHORT; 469 break; 470 case 't': 471 if (len == 6 && buf[2] == 'a' && buf[3] == 't' && buf[4] == 'i' && 472 buf[5] == 'c') 473 return TOK_STATIC; 474 break; 475 case 'i': 476 if (len == 6 && buf[2] == 'g' && buf[3] == 'n' && buf[4] == 'e' && 477 buf[5] == 'd') 478 return TOK_SIGNED; 479 if (len == 6 && buf[2] == 'z' && buf[3] == 'e' && buf[4] == 'o' && 480 buf[5] == 'f') 481 return TOK_SIZEOF; 482 break; 483 case 'r': 484 if (len == 6 && buf[2] == 'u' && buf[3] == 'c' && buf[4] == 't') 485 return TOK_STRUCT; 486 break; 487 case 'w': 488 if (len == 6 && buf[2] == 'i' && buf[3] == 't' && buf[4] == 'c' && 489 buf[5] == 'h') 490 return TOK_SWITCH; 491 break; 492 } 493 break; 494 495 case 't': 496 if (len == 7 && buf[1] == 'y' && buf[2] == 'p' && buf[3] == 'e' && 497 buf[4] == 'd' && buf[5] == 'e' && buf[6] == 'f') 498 return TOK_TYPEDEF; 499 break; 500 501 case 'u': 502 switch (buf[1]) { 503 case 'n': 504 if (len == 5 && buf[2] == 'i' && buf[3] == 'o' && buf[4] == 'n') 505 return TOK_UNION; 506 if (len == 8 && buf[2] == 's' && buf[3] == 'i' && buf[4] == 'g' && 507 buf[5] == 'n' && buf[6] == 'e' && buf[7] == 'd') 508 return TOK_UNSIGNED; 509 break; 510 } 511 break; 512 513 case 'v': 514 switch (buf[1]) { 515 case 'o': 516 if (len == 4 && buf[2] == 'i' && buf[3] == 'd') 517 return TOK_VOID; 518 if (len == 8 && buf[2] == 'l' && buf[3] == 'a' && buf[4] == 't' && 519 buf[5] == 'i' && buf[6] == 'l' && buf[7] == 'e') 520 return TOK_VOLATILE; 521 break; 522 } 523 break; 524 525 case 'w': 526 if (len == 5 && buf[1] == 'h' && buf[2] == 'i' && buf[3] == 'l' && 527 buf[4] == 'e') 528 return TOK_WHILE; 529 break; 530 531 default: 532 return TOK_ID; 533 } 534 return TOK_ID; 535 } 536 537 static token_t *read_identifier(void) { 538 int c; 539 char buf[1024]; 540 int i = 0; 541 c = input_getc(); 542 if (!isalpha(c) && c != '_') { 543 input_ungetc(c); 544 return NULL; 545 } 546 buf[i++] = c; 547 while ((c = input_getc()) != EOF) { 548 if (!isalnum(c) && c != '_') { 549 input_ungetc(c); 550 break; 551 } 552 buf[i++] = c; 553 if (i >= 1008) { 554 tok_error("Identifier too long\n"); 555 exit(1); 556 } 557 } 558 buf[i] = '\0'; 559 column += i; 560 // Check if it's a keyword 561 c_token_types kind = get_keyword(buf, i); 562 if (kind != TOK_ID) { 563 return token_create(kind, line, column, i); 564 } 565 // Check if it's a typedef 566 if (hash_table_get(typedef_table, buf) != NULL) { 567 return token_create(TOK_TYPEDEF_NAME, line, column, i); 568 } 569 return token_create_string(kind, line, column, buf, i); 570 } 571 572 /* Tokenize Number */ 573 static token_t *read_number(void) { 574 int c; 575 char buf[1024]; 576 int i = 0; 577 c = input_getc(); 578 /* Check for valid prefix */ 579 // If we don't have a digit or decimal point, it's not a number 580 if (!isdigit(c) && c != '.') { 581 input_ungetc(c); 582 return NULL; 583 } 584 // Decimal point followed by non-digit is a struct member 585 if (c == '.') { 586 char cnext = input_getc(); 587 if (!isdigit(cnext)) { 588 input_ungetc(cnext); 589 return token_create(TOK_MEMBER, line, column, 1); 590 } 591 input_ungetc(cnext); 592 } 593 594 int radix = 10; 595 /* Process Radix */ 596 // Check for hex and octal. 597 if (c == '0') { 598 char cnext = input_getc(); 599 if (cnext == 'x' || cnext == 'X') { 600 // Hex, discard the 0x 601 radix = 16; 602 } else { 603 // Octal, discard the 0 604 input_ungetc(cnext); 605 radix = 8; 606 } 607 } else { 608 // Decimal, append the first digit 609 buf[i++] = c; 610 } 611 612 int is_float = 0; 613 /* Read Number Loop */ 614 while ((c = input_getc()) != EOF) { 615 // Since there can be multiple writes to the buffer, we want to make sure we 616 // don't overflow by giving a 4 byte pad 617 if (i > 1020) { 618 tok_error("Number too long\n"); 619 return NULL; 620 } 621 // Valid digits for the radix: 0-9 for decimal, 0-7 for octal, 0-9 and 622 // a-f/A-F for hex 623 if ((radix == 10 && isdigit(c)) || (radix == 16 && isxdigit(c)) || 624 (radix == 8 && c >= '0' && c <= '7')) { 625 buf[i++] = c; 626 // Decimal point and not a float yet, must be a float 627 } else if (c == '.' && !is_float) { 628 is_float = 1; 629 if (radix != 10) { 630 tok_error("Invalid floating point constant, expected decimal, got %s\n", 631 radix == 16 ? "hexadecimal" : "octal"); 632 return NULL; 633 } 634 buf[i++] = c; 635 } 636 // Exponent on the end of a constant. (By standard this forces it to be a 637 // float) 638 else if (c == 'e' || c == 'E') { 639 buf[i++] = c; 640 c = input_getc(); 641 // Sign on the exponent 642 if (c == '+' || c == '-') { 643 buf[i++] = c; 644 c = input_getc(); 645 } 646 // Exponent must be a digit, I.E no 1e1.2 647 if (!isdigit(c)) { 648 tok_error("Invalid floating point exponent\n"); 649 return NULL; 650 } 651 buf[i++] = c; 652 is_float = 1; 653 } else { 654 // Reached the end, unget the character so other functions can read it 655 input_ungetc(c); 656 break; 657 } 658 } 659 660 buf[i] = '\0'; 661 /* Process Suffixes */ 662 int is_unsigned = 0; 663 int is_long = 0; 664 int is_single = 0; 665 while (1) { 666 c = input_getc(); 667 if (c == 'u' || c == 'U') { 668 if (is_unsigned) { 669 tok_warn( 670 "Warning: Duplicate suffix 'u' for integer constant ignored\n"); 671 } 672 is_unsigned = 1; 673 } else if (c == 'l' || c == 'L') { 674 if (is_long) { 675 tok_warn( 676 "Warning: Duplicate suffix 'l' for integer constant ignored\n"); 677 } 678 is_long = 1; 679 } else if (c == 'f' || c == 'F') { 680 if (is_single) { 681 tok_warn("Warning: Duplicate suffix 'f' for floating point constant " 682 "ignored\n"); 683 } 684 is_single = 1; 685 } else { 686 input_ungetc(c); 687 break; 688 } 689 } 690 691 /* Check for conflicting suffixes */ 692 if (is_single && is_long) { 693 tok_warn("Warning: Invalid suffixes 'l' and 'f' for floating point " 694 "constant. Ignoring 'l'\n"); 695 is_long = 0; 696 } 697 if (is_single && is_unsigned) { 698 tok_warn("Warning: Invalid suffixes 'u' and 'f' for floating point " 699 "constant. Ignoring 'u'\n"); 700 is_unsigned = 0; 701 } 702 if (is_single && !is_float) { 703 tok_warn( 704 "Warning: Invalid suffix 'f' for integer constant. Ignoring 'f'\n"); 705 is_single = 0; 706 } 707 708 if (is_float) { 709 /* Convert to float */ 710 errno = 0; 711 // Strtod generates a unix-style error when it's given something out of 712 // range, so we want to get on top of that quickly instead of ignoring it 713 // That way we can avoid some nasty NAN-propagation in the constant folder. 714 double f = strtod(buf, NULL); 715 if (errno == ERANGE) { 716 tok_error("Floating point constant out of range\n"); 717 return NULL; 718 } 719 // Warn if the constant is out of range for a float, I.E it's too big or too 720 // small 721 if (is_single && (f < FLT_MIN || f > FLT_MAX)) { 722 tok_warn( 723 "Warning: Floating point constant %f is out of range for float\n", f); 724 } 725 // Warn if the constant is too precise for a float 726 if (is_single && fabs((double)((float)f) - f) >= FLT_EPSILON) { 727 tok_warn("Warning: Converting double precision floating point constant " 728 "%f to float loses " 729 "precision\n", 730 f); 731 } 732 return token_create_float(is_single ? TOK_FLOAT_32 733 : TOK_FLOAT_64, 734 line, column, f, i); 735 736 } else { 737 /* Convert to integer */ 738 errno = 0; 739 uint64_t int_ = strtoull(buf, NULL, radix); 740 // Same as above, but for integers 741 if (errno == ERANGE) { 742 tok_error("Integer constant out of range\n"); 743 return NULL; 744 } 745 if (is_unsigned) { 746 if (is_long) { 747 return token_create_int(TOK_INTEGER_U64, line, column, int_, i); 748 } else { 749 if (int_ > UINT32_MAX) { 750 tok_warn( 751 "Warning: Integer constant %lld is out of range for unsigned " 752 "int\n", 753 int_); 754 } 755 return token_create_int(TOK_INTEGER_U32, line, column, int_, i); 756 } 757 } else { 758 if (is_long) { 759 // If the highest bit is set, that means this will overflow a signed 760 // long (Due to two's complement) 761 if (int_ & (1UL << 63)) { 762 tok_warn( 763 "Warning: Integer constant %lld is out of range for long long\n", 764 i); 765 } 766 return token_create_int(TOK_INTEGER_S64, line, column, int_, i); 767 } else { 768 if (int_ & (1UL << 31)) { 769 tok_warn("Warning: Integer constant %lld is out of range for int\n", 770 int_); 771 } 772 return token_create_int(TOK_INTEGER_S32, line, column, int_, i); 773 } 774 } 775 776 } 777 return NULL; 778 } 779 780 /* Tokenize String */ 781 /* Read Escape Sequence */ 782 static char read_escape_sequence(int *len) { 783 int c = input_getc(); 784 *len += 1; 785 switch (c) { 786 case 'a': 787 return '\a'; 788 case 'b': 789 return '\b'; 790 case 'f': 791 return '\f'; 792 case 'n': 793 return '\n'; 794 case 'r': 795 return '\r'; 796 case 't': 797 return '\t'; 798 case 'v': 799 return '\v'; 800 case '\'': 801 return '\''; 802 case '"': 803 return '"'; 804 case '?': 805 return '?'; 806 case '\\': 807 return '\\'; 808 case '0': 809 return '\0'; 810 case 'x': { 811 c = input_getc(); 812 if (!isxdigit(c)) { 813 tok_error("Invalid hexadecimal escape sequence\n"); 814 return 0; 815 } 816 int val = 0; 817 while (isxdigit(c)) { 818 *len += 1; 819 val = val * 16 + (isdigit(c) ? c - '0' : tolower(c) - 'a' + 10); 820 c = input_getc(); 821 } 822 input_ungetc(c); 823 return (char)val; 824 } 825 default: 826 if (!isdigit(c)) { 827 tok_error("Invalid escape sequence\n"); 828 return 0; 829 } 830 int val = 0; 831 while (isdigit(c)) { 832 *len += 1; 833 val = val * 8 + c - '0'; 834 c = input_getc(); 835 } 836 input_ungetc(c); 837 return (char)val; 838 } 839 } 840 841 static token_t *read_string_literal(void) { 842 int c; 843 c = input_getc(); 844 if (c != '"') { 845 input_ungetc(c); 846 return NULL; 847 } 848 int i = 0; 849 char s_buf[512]; 850 char *buf = s_buf; 851 int len = 512; 852 int esc_pad = 0; 853 while ((c = input_getc()) != EOF) { 854 if (c == '"') { 855 // Implicit skip of closing quote 856 break; 857 } 858 if (c == '\\') { 859 c = read_escape_sequence(&esc_pad); 860 if (c == 0) { 861 return NULL; 862 } 863 } 864 if (i >= len) { 865 if (buf == s_buf) { 866 buf = malloc(1024); 867 if (buf == NULL) { 868 fputs("Out of memory. Could not parse string literal.\n", stderr); 869 exit(1); 870 } 871 memcpy(buf, s_buf, 512); 872 len *= 2; 873 } else { 874 len *= 2; 875 buf = realloc(buf, len); 876 } 877 } 878 buf[i++] = c; 879 } 880 buf[i] = '\0'; 881 if (c == EOF) { 882 tok_error("Unterminated string literal\n"); 883 if (buf != s_buf) { 884 free(buf); 885 } 886 return NULL; 887 } 888 889 token_t *tok = token_create_string(TOK_STRING_ASCII, line, column, buf, 890 i + esc_pad + 2); 891 if (buf != s_buf) { 892 free(buf); 893 } 894 return tok; 895 } 896 897 /* Tokenize Character */ 898 static token_t *read_char_constant(void) { 899 int c; 900 int len = 0; 901 c = input_getc(); 902 if (c != '\'') { 903 input_ungetc(c); 904 return NULL; 905 } 906 len++; 907 c = input_getc(); 908 if (c == '\'') { 909 tok_error("Empty character constant\n"); 910 return NULL; 911 } 912 if (c == '\\') { 913 c = read_escape_sequence(&len); 914 } 915 int val = c; 916 c = input_getc(); 917 if (c != '\'') { 918 tok_error("Expected closing quote for character constant\n"); 919 return NULL; 920 } 921 len++; 922 return token_create_char(TOK_CHAR_CONST, line, column, val, len); 923 } 924 925 /* Tokenize Operator */ 926 927 token_t *read_operator(void) { 928 int c; 929 c = input_getc(); 930 switch (c) { 931 case '!': { 932 c = input_getc(); 933 if (c == '=') 934 return token_create(TOK_NE, line, column, 2); 935 input_ungetc(c); 936 return token_create(TOK_NOT, line, column, 1); 937 } 938 case '%': { 939 c = input_getc(); 940 if (c == '=') 941 return token_create(TOK_ASSIGN_MOD, line, column, 2); 942 input_ungetc(c); 943 return token_create(TOK_MOD, line, column, 1); 944 } 945 case '&': { 946 c = input_getc(); 947 if (c == '&') 948 return token_create(TOK_AND, line, column, 2); 949 if (c == '=') 950 return token_create(TOK_ASSIGN_BITAND, line, column, 2); 951 input_ungetc(c); 952 return token_create(TOK_BIT_AND, line, column, 1); 953 } 954 case '(': 955 return token_create(TOK_LEFT_PAREN, line, column, 1); 956 case ')': 957 return token_create(TOK_RIGHT_PAREN, line, column, 1); 958 case '*': { 959 c = input_getc(); 960 if (c == '=') 961 return token_create(TOK_ASSIGN_MUL, line, column, 2); 962 input_ungetc(c); 963 return token_create(TOK_MUL, line, column, 1); 964 } 965 case '+': { 966 c = input_getc(); 967 if (c == '+') 968 return token_create(TOK_INC, line, column, 2); 969 if (c == '=') 970 return token_create(TOK_ASSIGN_ADD, line, column, 2); 971 input_ungetc(c); 972 return token_create(TOK_ADD, line, column, 2); 973 } 974 case ',': 975 return token_create(TOK_COMMA, line, column, 1); 976 case '-': { 977 c = input_getc(); 978 if (c == '-') 979 return token_create(TOK_DEC, line, column, 2); 980 if (c == '=') 981 return token_create(TOK_ASSIGN_SUB, line, column, 2); 982 if (c == '>') 983 return token_create(TOK_MEMBER_POINTER, line, column, 2); 984 input_ungetc(c); 985 return token_create(TOK_SUB, line, column, 1); 986 } 987 case '.': { 988 c = input_getc(); 989 if (c == '.') { 990 c = input_getc(); 991 if (c == '.') { 992 return token_create(TOK_ELLIPSIS, line, column, 3); 993 } else { 994 // Bail out, can't store more than one unget 995 tok_error("Unexpected character '.' at line %d, column %d\n", line, 996 column); 997 exit(1); 998 } 999 } 1000 return token_create('.', line, column, 1); 1001 } 1002 case '/': { 1003 c = input_getc(); 1004 if (c == '=') 1005 return token_create(TOK_ASSIGN_DIV, line, column, 2); 1006 input_ungetc(c); 1007 return token_create(TOK_DIV, line, column, 1); 1008 } 1009 case ':': 1010 return token_create(TOK_COND_DECISION, line, column, 1); 1011 case ';': 1012 return token_create(TOK_SEMICOLON, line, column, 1); 1013 case '<': { 1014 c = input_getc(); 1015 if (c == '<') { 1016 c = input_getc(); 1017 if (c == '=') 1018 return token_create(TOK_ASSIGN_LSHIFT, line, column, 3); 1019 input_ungetc(c); 1020 return token_create(TOK_LSHIFT, line, column, 2); 1021 } 1022 if (c == '=') 1023 return token_create(TOK_LE, line, column, 2); 1024 input_ungetc(c); 1025 return token_create(TOK_LT, line, column, 1); 1026 } 1027 case '=': { 1028 c = input_getc(); 1029 if (c == '=') 1030 return token_create(TOK_ASSIGN, line, column, 2); 1031 input_ungetc(c); 1032 return token_create(TOK_ASSIGN, line, column, 1); 1033 } 1034 case '>': { 1035 c = input_getc(); 1036 if (c == '>') { 1037 c = input_getc(); 1038 if (c == '=') 1039 return token_create(TOK_ASSIGN_RSHIFT, line, column, 3); 1040 input_ungetc(c); 1041 return token_create(TOK_RSHIFT, line, column, 2); 1042 } 1043 if (c == '=') 1044 return token_create(TOK_GE, line, column, 2); 1045 input_ungetc(c); 1046 return token_create(TOK_GT, line, column, 1); 1047 } 1048 case '?': 1049 return token_create(TOK_COND, line, column, 1); 1050 case '[': 1051 return token_create(TOK_LEFT_BRACKET, line, column, 1); 1052 case ']': 1053 return token_create(TOK_RIGHT_BRACKET, line, column, 1); 1054 case '^': { 1055 c = input_getc(); 1056 if (c == '=') 1057 return token_create(TOK_ASSIGN_BITXOR, line, column, 2); 1058 input_ungetc(c); 1059 return token_create(TOK_BIT_XOR, line, column, 1); 1060 } 1061 case '{': 1062 return token_create(TOK_LEFT_BRACE, line, column, 1); 1063 case '|': { 1064 c = input_getc(); 1065 if (c == '|') 1066 return token_create(TOK_OR, line, column, 2); 1067 if (c == '=') 1068 return token_create(TOK_ASSIGN_BITOR, line, column, 2); 1069 input_ungetc(c); 1070 return token_create(TOK_BIT_OR, line, column, 1); 1071 } 1072 case '}': 1073 return token_create(TOK_RIGHT_BRACE, line, column, 1); 1074 case '~': 1075 return token_create(TOK_BIT_NOT, line, column, 1); 1076 default: 1077 input_ungetc(c); 1078 return NULL; 1079 } 1080 1081 return NULL; 1082 } 1083 1084 token_t *next_token(void) { 1085 if (left_stack_pos > 0) { 1086 return left_stack[--left_stack_pos]; 1087 } 1088 token_t *tok = skip_whitespace(); 1089 if (tok != NULL) { 1090 return tok; 1091 } 1092 tok = read_identifier(); 1093 if (tok != NULL) { 1094 return tok; 1095 } 1096 tok = read_number(); 1097 if (tok != NULL) { 1098 return tok; 1099 } 1100 tok = read_char_constant(); 1101 if (tok != NULL) { 1102 return tok; 1103 } 1104 tok = read_string_literal(); 1105 if (tok != NULL) { 1106 return tok; 1107 } 1108 tok = read_operator(); 1109 if (tok != NULL) { 1110 return tok; 1111 } 1112 int c = input_getc(); 1113 if (c == EOF) { 1114 return NULL; 1115 } 1116 tok_warn( 1117 "Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c, 1118 line, column); 1119 return next_token(); 1120 } 1121 1122 #ifdef TEST_TOKENIZER 1123 /* Run Test */ 1124 char *preprocess(char *in) { 1125 char *output_name = malloc(1024); 1126 snprintf(output_name, 1024, "%s.preprocessed", in); 1127 char *command = malloc(2048); 1128 snprintf(command, 2048, "gcc -E -xc %s -o %s", in, output_name); 1129 system(command); 1130 free(command); 1131 return output_name; 1132 } 1133 1134 // Tokenize the input file 1135 int main(int argc, char **argv) { 1136 if (argc != 2) { 1137 fprintf(stderr, "Usage: %s <input.c>\n", argv[0]); 1138 return 1; 1139 } 1140 char *input_name = argv[1]; 1141 char *preprocessed = preprocess(input_name); 1142 init_tokenizer(preprocessed); 1143 token_t *tok; 1144 while ((tok = next_token()) != NULL) { 1145 print_token(tok); 1146 token_destroy(tok); 1147 } 1148 destroy_tokenizer(); 1149 remove(preprocessed); 1150 free(preprocessed); 1151 hash_table_destroy(string_table); 1152 return 0; 1153 } 1154 1155 #endif 1156 1157