lexer.html - website - Website contents

lexer.html (118682B)
      1 <!DOCTYPE html>
      2 <html>
      3 <head>
      4 <meta charset="utf-8">
      5 <title>Lexer</title>
      6 <script>
      7 !function(){var q=null;window.PR_SHOULD_USE_CONTINUATION=!0;
      8 (function(){function R(a){function d(e){var b=e.charCodeAt(0);if(b!==92)return b;var a=e.charAt(1);return(b=r[a])?b:"0"<=a&&a<="7"?parseInt(e.substring(1),8):a==="u"||a==="x"?parseInt(e.substring(2),16):e.charCodeAt(1)}function g(e){if(e<32)return(e<16?"\\x0":"\\x")+e.toString(16);e=String.fromCharCode(e);return e==="\\"||e==="-"||e==="]"||e==="^"?"\\"+e:e}function b(e){var b=e.substring(1,e.length-1).match(/\\u[\dA-Fa-f]{4}|\\x[\dA-Fa-f]{2}|\\[0-3][0-7]{0,2}|\\[0-7]{1,2}|\\[\S\s]|[^\\]/g),e=[],a=
      9 b[0]==="^",c=["["];a&&c.push("^");for(var a=a?1:0,f=b.length;a<f;++a){var h=b[a];if(/\\[bdsw]/i.test(h))c.push(h);else{var h=d(h),l;a+2<f&&"-"===b[a+1]?(l=d(b[a+2]),a+=2):l=h;e.push([h,l]);l<65||h>122||(l<65||h>90||e.push([Math.max(65,h)|32,Math.min(l,90)|32]),l<97||h>122||e.push([Math.max(97,h)&-33,Math.min(l,122)&-33]))}}e.sort(function(e,a){return e[0]-a[0]||a[1]-e[1]});b=[];f=[];for(a=0;a<e.length;++a)h=e[a],h[0]<=f[1]+1?f[1]=Math.max(f[1],h[1]):b.push(f=h);for(a=0;a<b.length;++a)h=b[a],c.push(g(h[0])),
     10 h[1]>h[0]&&(h[1]+1>h[0]&&c.push("-"),c.push(g(h[1])));c.push("]");return c.join("")}function s(e){for(var a=e.source.match(/\[(?:[^\\\] ]|\\[\S\s])*]|\\u[\dA-Fa-f]{4}|\\x[\dA-Fa-f]{2}|\\\d+|\\[^\dux]|\(\?[!:=]|[()^]|[^()[\\^]+/g),c=a.length,d=[],f=0,h=0;f<c;++f){var l=a[f];l==="("?++h:"\\"===l.charAt(0)&&(l=+l.substring(1))&&(l<=h?d[l]=-1:a[f]=g(l))}for(f=1;f<d.length;++f)-1===d[f]&&(d[f]=++x);for(h=f=0;f<c;++f)l=a[f],l==="("?(++h,d[h]||(a[f]="(?:")):"\\"===l.charAt(0)&&(l=+l.substring(1))&&l<=h&&
     11 (a[f]="\\"+d[l]);for(f=0;f<c;++f)"^"===a[f]&&"^"!==a[f+1]&&(a[f]="");if(e.ignoreCase&&m)for(f=0;f<c;++f)l=a[f],e=l.charAt(0),l.length>=2&&e==="["?a[f]=b(l):e!=="\\"&&(a[f]=l.replace(/[A-Za-z]/g,function(a){a=a.charCodeAt(0);return"["+String.fromCharCode(a&-33,a|32)+"]"}));return a.join("")}for(var x=0,m=!1,j=!1,k=0,c=a.length;k<c;++k){var i=a[k];if(i.ignoreCase)j=!0;else if(/[a-z]/i.test(i.source.replace(/\\u[\da-f]{4}|\\x[\da-f]{2}|\\[^UXux]/gi,""))){m=!0;j=!1;break}}for(var r={b:8,t:9,n:10,v:11,
     12 f:12,r:13},n=[],k=0,c=a.length;k<c;++k){i=a[k];if(i.global||i.multiline)throw Error(""+i);n.push("(?:"+s(i)+")")}return RegExp(n.join("|"),j?"gi":"g")}function S(a,d){function g(a){var c=a.nodeType;if(c==1){if(!b.test(a.className)){for(c=a.firstChild;c;c=c.nextSibling)g(c);c=a.nodeName.toLowerCase();if("br"===c||"li"===c)s[j]="\n",m[j<<1]=x++,m[j++<<1|1]=a}}else if(c==3||c==4)c=a.nodeValue,c.length&&(c=d?c.replace(/\r\n?/g,"\n"):c.replace(/[\t\n\r ]+/g," "),s[j]=c,m[j<<1]=x,x+=c.length,m[j++<<1|1]=
     13 a)}var b=/(?:^|\s)nocode(?:\s|$)/,s=[],x=0,m=[],j=0;g(a);return{a:s.join("").replace(/\n$/,""),d:m}}function H(a,d,g,b){d&&(a={a:d,e:a},g(a),b.push.apply(b,a.g))}function T(a){for(var d=void 0,g=a.firstChild;g;g=g.nextSibling)var b=g.nodeType,d=b===1?d?a:g:b===3?U.test(g.nodeValue)?a:d:d;return d===a?void 0:d}function D(a,d){function g(a){for(var j=a.e,k=[j,"pln"],c=0,i=a.a.match(s)||[],r={},n=0,e=i.length;n<e;++n){var z=i[n],w=r[z],t=void 0,f;if(typeof w==="string")f=!1;else{var h=b[z.charAt(0)];
     14 if(h)t=z.match(h[1]),w=h[0];else{for(f=0;f<x;++f)if(h=d[f],t=z.match(h[1])){w=h[0];break}t||(w="pln")}if((f=w.length>=5&&"lang-"===w.substring(0,5))&&!(t&&typeof t[1]==="string"))f=!1,w="src";f||(r[z]=w)}h=c;c+=z.length;if(f){f=t[1];var l=z.indexOf(f),B=l+f.length;t[2]&&(B=z.length-t[2].length,l=B-f.length);w=w.substring(5);H(j+h,z.substring(0,l),g,k);H(j+h+l,f,I(w,f),k);H(j+h+B,z.substring(B),g,k)}else k.push(j+h,w)}a.g=k}var b={},s;(function(){for(var g=a.concat(d),j=[],k={},c=0,i=g.length;c<i;++c){var r=
     15 g[c],n=r[3];if(n)for(var e=n.length;--e>=0;)b[n.charAt(e)]=r;r=r[1];n=""+r;k.hasOwnProperty(n)||(j.push(r),k[n]=q)}j.push(/[\S\s]/);s=R(j)})();var x=d.length;return g}function v(a){var d=[],g=[];a.tripleQuotedStrings?d.push(["str",/^(?:'''(?:[^'\\]|\\[\S\s]|''?(?=[^']))*(?:'''|$)|"""(?:[^"\\]|\\[\S\s]|""?(?=[^"]))*(?:"""|$)|'(?:[^'\\]|\\[\S\s])*(?:'|$)|"(?:[^"\\]|\\[\S\s])*(?:"|$))/,q,"'\""]):a.multiLineStrings?d.push(["str",/^(?:'(?:[^'\\]|\\[\S\s])*(?:'|$)|"(?:[^"\\]|\\[\S\s])*(?:"|$)|`(?:[^\\`]|\\[\S\s])*(?:`|$))/,
     16 q,"'\"`"]):d.push(["str",/^(?:'(?:[^\n\r'\\]|\\.)*(?:'|$)|"(?:[^\n\r"\\]|\\.)*(?:"|$))/,q,"\"'"]);a.verbatimStrings&&g.push(["str",/^@"(?:[^"]|"")*(?:"|$)/,q]);var b=a.hashComments;b&&(a.cStyleComments?(b>1?d.push(["com",/^#(?:##(?:[^#]|#(?!##))*(?:###|$)|.*)/,q,"#"]):d.push(["com",/^#(?:(?:define|e(?:l|nd)if|else|error|ifn?def|include|line|pragma|undef|warning)\b|[^\n\r]*)/,q,"#"]),g.push(["str",/^<(?:(?:(?:\.\.\/)*|\/?)(?:[\w-]+(?:\/[\w-]+)+)?[\w-]+\.h(?:h|pp|\+\+)?|[a-z]\w*)>/,q])):d.push(["com",
     17 /^#[^\n\r]*/,q,"#"]));a.cStyleComments&&(g.push(["com",/^\/\/[^\n\r]*/,q]),g.push(["com",/^\/\*[\S\s]*?(?:\*\/|$)/,q]));if(b=a.regexLiterals){var s=(b=b>1?"":"\n\r")?".":"[\\S\\s]";g.push(["lang-regex",RegExp("^(?:^^\\.?|[+-]|[!=]=?=?|\\#|%=?|&&?=?|\\(|\\*=?|[+\\-]=|->|\\/=?|::?|<<?=?|>>?>?=?|,|;|\\?|@|\\[|~|{|\\^\\^?=?|\\|\\|?=?|break|case|continue|delete|do|else|finally|instanceof|return|throw|try|typeof)\\s*("+("/(?=[^/*"+b+"])(?:[^/\\x5B\\x5C"+b+"]|\\x5C"+s+"|\\x5B(?:[^\\x5C\\x5D"+b+"]|\\x5C"+
     18 s+")*(?:\\x5D|$))+/")+")")])}(b=a.types)&&g.push(["typ",b]);b=(""+a.keywords).replace(/^ | $/g,"");b.length&&g.push(["kwd",RegExp("^(?:"+b.replace(/[\s,]+/g,"|")+")\\b"),q]);d.push(["pln",/^\s+/,q," \r\n\t\u00a0"]);b="^.[^\\s\\w.$@'\"`/\\\\]*";a.regexLiterals&&(b+="(?!s*/)");g.push(["lit",/^@[$_a-z][\w$@]*/i,q],["typ",/^(?:[@_]?[A-Z]+[a-z][\w$@]*|\w+_t\b)/,q],["pln",/^[$_a-z][\w$@]*/i,q],["lit",/^(?:0x[\da-f]+|(?:\d(?:_\d+)*\d*(?:\.\d*)?|\.\d\+)(?:e[+-]?\d+)?)[a-z]*/i,q,"0123456789"],["pln",/^\\[\S\s]?/,
     19 q],["pun",RegExp(b),q]);return D(d,g)}function J(a,d,g){function b(a){var c=a.nodeType;if(c==1&&!x.test(a.className))if("br"===a.nodeName)s(a),a.parentNode&&a.parentNode.removeChild(a);else for(a=a.firstChild;a;a=a.nextSibling)b(a);else if((c==3||c==4)&&g){var d=a.nodeValue,i=d.match(m);if(i)c=d.substring(0,i.index),a.nodeValue=c,(d=d.substring(i.index+i[0].length))&&a.parentNode.insertBefore(j.createTextNode(d),a.nextSibling),s(a),c||a.parentNode.removeChild(a)}}function s(a){function b(a,c){var d=
     20 c?a.cloneNode(!1):a,e=a.parentNode;if(e){var e=b(e,1),g=a.nextSibling;e.appendChild(d);for(var i=g;i;i=g)g=i.nextSibling,e.appendChild(i)}return d}for(;!a.nextSibling;)if(a=a.parentNode,!a)return;for(var a=b(a.nextSibling,0),d;(d=a.parentNode)&&d.nodeType===1;)a=d;c.push(a)}for(var x=/(?:^|\s)nocode(?:\s|$)/,m=/\r\n?|\n/,j=a.ownerDocument,k=j.createElement("li");a.firstChild;)k.appendChild(a.firstChild);for(var c=[k],i=0;i<c.length;++i)b(c[i]);d===(d|0)&&c[0].setAttribute("value",d);var r=j.createElement("ol");
     21 r.className="linenums";for(var d=Math.max(0,d-1|0)||0,i=0,n=c.length;i<n;++i)k=c[i],k.className="L"+(i+d)%10,k.firstChild||k.appendChild(j.createTextNode("\u00a0")),r.appendChild(k);a.appendChild(r)}function p(a,d){for(var g=d.length;--g>=0;){var b=d[g];F.hasOwnProperty(b)?E.console&&console.warn("cannot override language handler %s",b):F[b]=a}}function I(a,d){if(!a||!F.hasOwnProperty(a))a=/^\s*</.test(d)?"default-markup":"default-code";return F[a]}function K(a){var d=a.h;try{var g=S(a.c,a.i),b=g.a;
     22 a.a=b;a.d=g.d;a.e=0;I(d,b)(a);var s=/\bMSIE\s(\d+)/.exec(navigator.userAgent),s=s&&+s[1]<=8,d=/\n/g,x=a.a,m=x.length,g=0,j=a.d,k=j.length,b=0,c=a.g,i=c.length,r=0;c[i]=m;var n,e;for(e=n=0;e<i;)c[e]!==c[e+2]?(c[n++]=c[e++],c[n++]=c[e++]):e+=2;i=n;for(e=n=0;e<i;){for(var p=c[e],w=c[e+1],t=e+2;t+2<=i&&c[t+1]===w;)t+=2;c[n++]=p;c[n++]=w;e=t}c.length=n;var f=a.c,h;if(f)h=f.style.display,f.style.display="none";try{for(;b<k;){var l=j[b+2]||m,B=c[r+2]||m,t=Math.min(l,B),A=j[b+1],G;if(A.nodeType!==1&&(G=x.substring(g,
     23 t))){s&&(G=G.replace(d,"\r"));A.nodeValue=G;var L=A.ownerDocument,o=L.createElement("span");o.className=c[r+1];var v=A.parentNode;v.replaceChild(o,A);o.appendChild(A);g<l&&(j[b+1]=A=L.createTextNode(x.substring(t,l)),v.insertBefore(A,o.nextSibling))}g=t;g>=l&&(b+=2);g>=B&&(r+=2)}}finally{if(f)f.style.display=h}}catch(u){E.console&&console.log(u&&u.stack||u)}}var E=window,y=["break,continue,do,else,for,if,return,while"],C=[[y,"auto,case,char,const,default,double,enum,extern,float,goto,inline,int,long,register,short,signed,sizeof,static,struct,switch,typedef,union,unsigned,void,volatile"],
     24 "catch,class,delete,false,import,new,operator,private,protected,public,this,throw,true,try,typeof"],M=[C,"alignof,align_union,asm,axiom,bool,concept,concept_map,const_cast,constexpr,decltype,delegate,dynamic_cast,explicit,export,friend,generic,late_check,mutable,namespace,nullptr,property,reinterpret_cast,static_assert,static_cast,template,typeid,typename,using,virtual,where"],V=[C,"abstract,assert,boolean,byte,extends,final,finally,implements,import,instanceof,interface,null,native,package,strictfp,super,synchronized,throws,transient"],
     25 N=[C,"abstract,as,base,bool,by,byte,checked,decimal,delegate,descending,dynamic,event,finally,fixed,foreach,from,group,implicit,in,interface,internal,into,is,let,lock,null,object,out,override,orderby,params,partial,readonly,ref,sbyte,sealed,stackalloc,string,select,uint,ulong,unchecked,unsafe,ushort,var,virtual,where"],C=[C,"debugger,eval,export,function,get,null,set,undefined,var,with,Infinity,NaN"],O=[y,"and,as,assert,class,def,del,elif,except,exec,finally,from,global,import,in,is,lambda,nonlocal,not,or,pass,print,raise,try,with,yield,False,True,None"],
     26 P=[y,"alias,and,begin,case,class,def,defined,elsif,end,ensure,false,in,module,next,nil,not,or,redo,rescue,retry,self,super,then,true,undef,unless,until,when,yield,BEGIN,END"],W=[y,"as,assert,const,copy,drop,enum,extern,fail,false,fn,impl,let,log,loop,match,mod,move,mut,priv,pub,pure,ref,self,static,struct,true,trait,type,unsafe,use"],y=[y,"case,done,elif,esac,eval,fi,function,in,local,set,then,until"],Q=/^(DIR|FILE|vector|(de|priority_)?queue|list|stack|(const_)?iterator|(multi)?(set|map)|bitset|u?(int|float)\d*)\b/,
     27 U=/\S/,X=v({keywords:[M,N,C,"caller,delete,die,do,dump,elsif,eval,exit,foreach,for,goto,if,import,last,local,my,next,no,our,print,package,redo,require,sub,undef,unless,until,use,wantarray,while,BEGIN,END",O,P,y],hashComments:!0,cStyleComments:!0,multiLineStrings:!0,regexLiterals:!0}),F={};p(X,["default-code"]);p(D([],[["pln",/^[^<?]+/],["dec",/^<!\w[^>]*(?:>|$)/],["com",/^<\!--[\S\s]*?(?:--\>|$)/],["lang-",/^<\?([\S\s]+?)(?:\?>|$)/],["lang-",/^<%([\S\s]+?)(?:%>|$)/],["pun",/^(?:<[%?]|[%?]>)/],["lang-",
     28 /^<xmp\b[^>]*>([\S\s]+?)<\/xmp\b[^>]*>/i],["lang-js",/^<script\b[^>]*>([\S\s]*?)(<\/script\b[^>]*>)/i],["lang-css",/^<style\b[^>]*>([\S\s]*?)(<\/style\b[^>]*>)/i],["lang-in.tag",/^(<\/?[a-z][^<>]*>)/i] ]),["default-markup","htm","html","mxml","xhtml","xml","xsl"]);p(D([["pln",/^\s+/,q," \t\r\n"],["atv",/^(?:"[^"]*"?|'[^']*'?)/,q,"\"'"] ],[["tag",/^^<\/?[a-z](?:[\w-.:]*\w)?|\/?>$/i],["atn",/^(?!style[\s=]|on)[a-z](?:[\w:-]*\w)?/i],["lang-uq.val",/^=\s*([^\s"'>]*(?:[^\s"'/>]|\/(?=\s)))/],["pun",/^[/<->]+/],
     29 ["lang-js",/^on\w+\s*=\s*"([^"]+)"/i],["lang-js",/^on\w+\s*=\s*'([^']+)'/i],["lang-js",/^on\w+\s*=\s*([^\s"'>]+)/i],["lang-css",/^style\s*=\s*"([^"]+)"/i],["lang-css",/^style\s*=\s*'([^']+)'/i],["lang-css",/^style\s*=\s*([^\s"'>]+)/i] ]),["in.tag"]);p(D([],[["atv",/^[\S\s]+/] ]),["uq.val"]);p(v({keywords:M,hashComments:!0,cStyleComments:!0,types:Q}),["c","cc","cpp","cxx","cyc","m"]);p(v({keywords:"null,true,false"}),["json"]);p(v({keywords:N,hashComments:!0,cStyleComments:!0,verbatimStrings:!0,types:Q}),
     30 ["cs"]);p(v({keywords:V,cStyleComments:!0}),["java"]);p(v({keywords:y,hashComments:!0,multiLineStrings:!0}),["bash","bsh","csh","sh"]);p(v({keywords:O,hashComments:!0,multiLineStrings:!0,tripleQuotedStrings:!0}),["cv","py","python"]);p(v({keywords:"caller,delete,die,do,dump,elsif,eval,exit,foreach,for,goto,if,import,last,local,my,next,no,our,print,package,redo,require,sub,undef,unless,until,use,wantarray,while,BEGIN,END",hashComments:!0,multiLineStrings:!0,regexLiterals:2}),["perl","pl","pm"]);p(v({keywords:P,
     31 hashComments:!0,multiLineStrings:!0,regexLiterals:!0}),["rb","ruby"]);p(v({keywords:C,cStyleComments:!0,regexLiterals:!0}),["javascript","js"]);p(v({keywords:"all,and,by,catch,class,else,extends,false,finally,for,if,in,is,isnt,loop,new,no,not,null,of,off,on,or,return,super,then,throw,true,try,unless,until,when,while,yes",hashComments:3,cStyleComments:!0,multilineStrings:!0,tripleQuotedStrings:!0,regexLiterals:!0}),["coffee"]);p(v({keywords:W,cStyleComments:!0,multilineStrings:!0}),["rc","rs","rust"]);
     32 p(D([],[["str",/^[\S\s]+/] ]),["regex"]);var Y=E.PR={createSimpleLexer:D,registerLangHandler:p,sourceDecorator:v,PR_ATTRIB_NAME:"atn",PR_ATTRIB_VALUE:"atv",PR_COMMENT:"com",PR_DECLARATION:"dec",PR_KEYWORD:"kwd",PR_LITERAL:"lit",PR_NOCODE:"nocode",PR_PLAIN:"pln",PR_PUNCTUATION:"pun",PR_SOURCE:"src",PR_STRING:"str",PR_TAG:"tag",PR_TYPE:"typ",prettyPrintOne:E.prettyPrintOne=function(a,d,g){var b=document.createElement("div");b.innerHTML="<pre>"+a+"</pre>";b=b.firstChild;g&&J(b,g,!0);K({h:d,j:g,c:b,i:1});
     33 return b.innerHTML},prettyPrint:E.prettyPrint=function(a,d){function g(){for(var b=E.PR_SHOULD_USE_CONTINUATION?c.now()+250:Infinity;i<p.length&&c.now()<b;i++){for(var d=p[i],j=h,k=d;k=k.previousSibling;){var m=k.nodeType,o=(m===7||m===8)&&k.nodeValue;if(o?!/^\??prettify\b/.test(o):m!==3||/\S/.test(k.nodeValue))break;if(o){j={};o.replace(/\b(\w+)=([\w%+\-.:]+)/g,function(a,b,c){j[b]=c});break}}k=d.className;if((j!==h||e.test(k))&&!v.test(k)){m=!1;for(o=d.parentNode;o;o=o.parentNode)if(f.test(o.tagName)&&
     34 o.className&&e.test(o.className)){m=!0;break}if(!m){d.className+=" prettyprinted";m=j.lang;if(!m){var m=k.match(n),y;if(!m&&(y=T(d))&&t.test(y.tagName))m=y.className.match(n);m&&(m=m[1])}if(w.test(d.tagName))o=1;else var o=d.currentStyle,u=s.defaultView,o=(o=o?o.whiteSpace:u&&u.getComputedStyle?u.getComputedStyle(d,q).getPropertyValue("white-space"):0)&&"pre"===o.substring(0,3);u=j.linenums;if(!(u=u==="true"||+u))u=(u=k.match(/\blinenums\b(?::(\d+))?/))?u[1]&&u[1].length?+u[1]:!0:!1;u&&J(d,u,o);r=
     35 {h:m,c:d,j:u,i:o};K(r)}}}i<p.length?setTimeout(g,250):"function"===typeof a&&a()}for(var b=d||document.body,s=b.ownerDocument||document,b=[b.getElementsByTagName("pre"),b.getElementsByTagName("code"),b.getElementsByTagName("xmp")],p=[],m=0;m<b.length;++m)for(var j=0,k=b[m].length;j<k;++j)p.push(b[m][j]);var b=q,c=Date;c.now||(c={now:function(){return+new Date}});var i=0,r,n=/\blang(?:uage)?-([\w.]+)(?!\S)/,e=/\bprettyprint\b/,v=/\bprettyprinted\b/,w=/pre|xmp/i,t=/^code$/i,f=/^(?:pre|code|xmp)$/i,
     36 h={};g()}};typeof define==="function"&&define.amd&&define("google-code-prettify",[],function(){return Y})})();}()
     37 </script>
     38 <style>
     39 .pln{color:#1b181b}.str{color:#918b3b}.kwd{color:#7b59c0}.com{color:#9e8f9e}.typ{color:#516aec}.lit{color:#a65926}.clo,.opn,.pun{color:#1b181b}.tag{color:#ca402b}.atn{color:#a65926}.atv{color:#159393}.dec{color:#a65926}.var{color:#ca402b}.fun{color:#516aec}pre.prettyprint{background:#f7f3f7;color:#ab9bab;font-family:Menlo,Consolas,"Bitstream Vera Sans Mono","DejaVu Sans Mono",Monaco,monospace;font-size:12px;line-height:1.5;border:1px solid #d8cad8;padding:10px}ol.linenums{margin-top:0;margin-bottom:0}
     40 body{min-width:200px;max-width:850px;margin:0 auto;padding:30px;}.chapter-nav{font-size: 10pt;}a:link,a:visited{color:#00f}.codeblock_name,code,pre.prettyprint{font-family:Monaco,"Lucida Console",monospace}body{font-size:14pt}.codeblock_name,.math,.seealso,code{font-size:10pt}.codeblock{page-break-inside:avoid;padding-bottom:15px}.math{text-indent:0}pre.prettyprint{font-size:10pt;padding:10px;border-radius:10px;border:none;white-space:pre-wrap}.codeblock_name{margin-top:1.25em;display:block}a:link{text-decoration:none}a:link:not(.lit):hover{color:#00f;text-decoration:underline}a:link:active{color:red}h4{padding-right:1.25em}h4.noheading{margin-bottom:0}h1{text-align:center}code{padding:2px}pre{-moz-tab-size:4;-o-tab-size:4;tab-size:4}p:not(.notp){margin:0;text-indent:2em}.two-col{list-style-type:none}.two-col li:before{content:'-';padding:5px;margin-right:5px;color:orange;background-color:#fff;display:inline-block}@media print{body{font-size:10pt}pre.prettyprint{font-size:8pt}.seealso{font-size:9pt}.codeblock_name,.math,code{font-size:8pt}.math{text-indent:0}}
     41 /* code blocks (Style from jmeiners.com/lc3-vm, CC BY-NC-SA 4.0, used with attribution) */
     42 
     43 /* Quotes and Block Quotes */
     44 blockquote {
     45     margin: 1.5em 10px;
     46     padding: 0.5em 10px;
     47     border-left: 5px solid #ccc;
     48     color: #666;
     49     background-color: #f9f9f9;
     50     font-style: italic;
     51 }
     52 
     53 blockquote p {
     54     margin: 0;
     55     font-size: 1.2em;
     56 }
     57 
     58 q {
     59     quotes: "“" "”" "‘" "’";
     60     font-style: italic;
     61 }
     62 
     63 q::before {
     64     content: open-quote;
     65 }
     66 
     67 q::after {
     68     content: close-quote;
     69 }
     70 
     71 /*! Color themes for Google Code Prettify | MIT License | github.com/jmblog/color-themes-for-google-code-prettify */
     72 .prettyprint {
     73     background: #f5f7ff;
     74     font-family: Menlo, "Bitstream Vera Sans Mono", "DejaVu Sans Mono", Monaco, Consolas, monospace;
     75     border: 0 !important;
     76 }
     77 
     78 .pln {
     79     color: #202746;
     80 }
     81 
     82 /* Specify class=linenums on a pre to get line numbering */
     83 ol.linenums {
     84     margin-top: 0;
     85     margin-bottom: 0;
     86     color: #202746;
     87 }
     88 
     89 li.L0,
     90 li.L1,
     91 li.L2,
     92 li.L3,
     93 li.L4,
     94 li.L5,
     95 li.L6,
     96 li.L7,
     97 li.L8,
     98 li.L9 {
     99     padding-left: 1em;
    100     background-color: #f5f7ff;
    101     list-style-type: decimal;
    102 }
    103 
    104 @media screen {
    105 
    106     /* string content */
    107 
    108     .str {
    109         color: #ac9739;
    110     }
    111 
    112     /* keyword */
    113 
    114     .kwd {
    115         color: #6679cc;
    116     }
    117 
    118     /* comment */
    119 
    120     .com {
    121         color: #202746;
    122     }
    123 
    124     /* type name */
    125 
    126     .typ {
    127         color: #3d8fd1;
    128     }
    129 
    130     /* literal value */
    131 
    132     .lit {
    133         color: #c76b29;
    134     }
    135 
    136     /* punctuation */
    137 
    138     .pun {
    139         color: #202746;
    140     }
    141 
    142     /* lisp open bracket */
    143 
    144     .opn {
    145         color: #202746;
    146     }
    147 
    148     /* lisp close bracket */
    149 
    150     .clo {
    151         color: #202746;
    152     }
    153 
    154     /* markup tag name */
    155 
    156     .tag {
    157         color: #c94922;
    158     }
    159 
    160     /* markup attribute name */
    161 
    162     .atn {
    163         color: #c76b29;
    164     }
    165 
    166     /* markup attribute value */
    167 
    168     .atv {
    169         color: #22a2c9;
    170     }
    171 
    172     /* declaration */
    173 
    174     .dec {
    175         color: #c76b29;
    176     }
    177 
    178     /* variable name */
    179 
    180     .var {
    181         color: #c94922;
    182     }
    183 
    184     /* function name */
    185 
    186     .fun {
    187         color: #3d8fd1;
    188     }
    189 }</style>
    190 </head>
    191 <body onload="prettyPrint()">
    192 <section>
    193 <h1>Lexer</h1>
    194 <a name="1:1"><div class="section"><h4>1. General Project Structure</h4></a>
    195 <p>Since this is the first article, I'll outline the project structure for the C- compiler.
    196 </p>
    197 <p>The project has a series of pretty typical stages:
    198 </p>
    199 <ol>
    200 <li>The lexer. This takes a file as input and emits a series of tokens (Its input is already preprocessed, I outsource that to "gcc -E").
    201 </li>
    202 <li>The parser. This takes the tokens and builds an abstract syntax tree (AST).
    203 </li>
    204 <li>The type checker/semantic analyzer. This is used to ensure that the types of variables and functions are correct.
    205 </li>
    206 <li>The code generator. This takes the AST and generates an intermediate representation (IR).
    207 </li>
    208 <li>The optimizer. This takes the IR and optimizes it. This'll be broken up into a few stages.
    209 </li>
    210 <li>The lowerer. This takes the IR and lowers it to a simpler IR.
    211 </li>
    212 <li>The register allocator. This takes the IR, which has instructions in an infinite number of registers, and assigns them to a finite number of registers.
    213 </li>
    214 <li>The code emitter. This takes the IR and emits RISC-V assembly.
    215 </li>
    216 </ol>
    217 <p>As far as possible, I'd like to keep each of these stages separate. One benefit of this is that it simplifies memory management greatly. I plan to use an arena allocator for each stage, and by making sure the only thing on the actual heap is the output of the stage, and all temporary data is stored in the arena, I can free all the memory used by a stage by simply freeing the arena.
    218 </p>
    219 
    220 </div>
    221 <a name="1:2"><div class="section"><h4>2. Some Rules</h4></a>
    222 <p>Here are some rules (more like guidelines) that I plan to follow for this project; they're mostly just to keep things simple and consistent.
    223 </p>
    224 <ol>
    225 <li><p>PROGRAM LIKE IT'S 1999
    226 </p>
    227 </li>
    228 </ol>
    229 <blockquote><p> 640 KB ought to be enough for anybody. - Bill Gates
    230 </p>
    231 </blockquote>
    232 <p>Maybe not that little, But I'm going to try to keep the project as simple as possible, 640 KB probably won't be enough, but I'll still aim for less than 10 MB of memory usage.
    233 </p>
    234 <p>This places a lot of constraints on the project, but I think it's a good exercise in minimalism.
    235 </p>
    236 <p>Some consequences of this are that I'll have to use memory-wise algorithms, be very careful about program structure, and avoid some of the bigger libraries (which will help with making this project self-hosting in the future).
    237 </p>
    238 <ol>
    239 <li><p>PROGRAM IN C++--
    240 </p>
    241 </li>
    242 </ol>
    243 <p>I'm not a big fan of C++, but its class system helps prevent a lot of ugly bugs. To that end, I'm going to try and keep data structures out of header files, and only expose functions that operate on those data structures, to create a sort of approximation of a class. This has a few benefits:
    244 </p>
    245 <ul>
    246 <li>Quicker compilation. A change to a data structure will only require one file to be recompiled, rather than every file that includes the header.
    247 </li>
    248 <li>Less chance of bugs. If a function is the only way to interact with a data structure, then it's much harder to misuse that data structure.
    249 </li>
    250 <li>Run time type checking. I can include some sort of tag in the first field of every data structure to ensure that the correct functions are being called.
    251 </li>
    252 </ul>
    253 <ol>
    254 <li><p>DON'T GET FANCY
    255 </p>
    256 </li>
    257 </ol>
    258 <p>My goal here isn't to write the fastest interpreter in the world, or the most complete. I just want to make something that works and can be understood by someone else.
    259 </p>
    260 <p>That means I'm going to avoid a lot of the tricks that are used in production interpreters, and focus more on simplicity and readability.
    261 </p>
    262 <ol>
    263 <li><p>DESIGN FOR DEBUGGING
    264 </p>
    265 </li>
    266 </ol>
    267 <p>This code is going to be peppered with asserts and contain mechanisms to print out the state of the program at any point.
    268 </p>
    269 <p>This might be painful, but it'll make debugging a lot simpler and let users look under the hood.
    270 </p>
    271 <ol>
    272 <li><p>SMART DATA, STUPID CODE
    273 </p>
    274 </li>
    275 </ol>
    276 <p>A lot of times, the right data structure can replace 50-100 lines of procedural code. I'm going to try and design data structures which make the algorithms as simple as possible.
    277 </p>
    278 <p>For example, instead of writing 50-100 lines of code to hold every keyword in the language, I can just use a simple hash table.
    279 </p>
    280 
    281 </div>
    282 <a name="1:3"><div class="section"><h4>3. Misc</h4></a>
    283 <p>THIS IS A LITERATE PROGRAM! Go to <a href="https://reagancfischer.dev/projects/cminus/code/lexer.lit">this link</a> to see the file that generated this HTML.
    284 </p>
    285 
    286 </div>
    287 <a name="1:4"><div class="section"><h4>4. The Lexer</h4></a>
    288 <p>A lexical analyzer reads source code and produces tokens, which are the smallest units of meaning in a language. These tokens are then used by the parser to build an abstract syntax tree (AST), which represents the structure of the program.
    289 </p>
    290 <p>For example, in the C programming language, tokens include keywords (if, else, while, etc.), identifiers (variable names), numbers, and punctuation (braces, semicolons, etc.).
    291 </p>
    292 <p>Given a string like <code>int main() { return 0; }</code>, the lexer would produce a series of tokens like <code>INT</code>, <code>IDENTIFIER(main)</code>, <code>LPAREN</code>, <code>RPAREN</code>, <code>LBRACE</code>, <code>RETURN</code>, <code>INTCONSTANT(0)</code>, <code>SEMICOLON</code>, <code>RBRACE</code>.
    293 </p>
    294 
    295 </div>
    296 <a name="1:5"><div class="section"><h4>5. Design</h4></a>
    297 <p>I'll break the lexer into several modules:
    298 </p>
    299 <ul>
    300 <li><code>token.c</code> will contain the token data structure and functions to create and destroy tokens.
    301 </li>
    302 <li><code>input.c</code> will contain the input data structure and functions to read from the input file.
    303 </li>
    304 <li><code>tokenizer.c</code> will contain the main lexer logic.
    305 </li>
    306 </ul>
    307 
    308 </div>
    309 <a name="1:6"><div class="section"><h4>6. Token Interface</h4></a>
    310 <p>We'll need several components to represent a token:
    311 </p>
    312 <ul>
    313 <li>The type of token, which will be an enum with values like <code>TOK_IF</code> or <code>TOK_INTEGER_U32</code>.
    314 </li>
    315 <li>The value of the token. Some tokens, like keywords, don't have a value. Others, like identifiers or constants, do.
    316 </li>
    317 <li>The line and column of the token, used for error messages.
    318 </li>
    319 </ul>
    320 <p>To implement a class system in C, we'll hide the token implementation details behind an opaque pointer. We'll use a forward declaration of the token type in the header file and define the token type in the implementation file.
    321 </p>
    322 
    323 </div>
    324 <a name="1:7"><div class="section"><h4 class="noheading">7. </h4></a>
    325 
    326 <div class="codeblock">
    327 <span class="codeblock_name">{Opaque Token Type <a href="lexer.html#1:7">7</a>}</span>
    328 <pre class="prettyprint lang-c">
    329 typedef struct token token_t;
    330 </pre>
    331 
    332 
    333 <p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:41">41</a></p>
    334 </div>
    335 </div>
    336 <a name="1:8"><div class="section"><h4 class="noheading">8. </h4></a>
    337 <p>We'll need functions to create and destroy tokens.
    338 </p>
    339 
    340 <div class="codeblock">
    341 <span class="codeblock_name">{Token Creation and Destruction Interface <a href="lexer.html#1:8">8</a>}</span>
    342 <pre class="prettyprint lang-c">
    343 token_t *token_create(c_token_types kind, int lin, int col, int len);
    344 token_t *token_create_int(c_token_types kind, int lin, int col, int64_t i, int len);
    345 token_t *token_create_float(c_token_types kind, int lin, int col, double f, int len);
    346 token_t *token_create_char(c_token_types kind, int lin, int col, char c, int len);
    347 token_t *token_create_string(c_token_types kind, int lin, int col, const char *s, int len);
    348 void token_destroy(token_t *token);
    349 </pre>
    350 
    351 
    352 <p class="seealso">Used in section <a href="lexer.html#1:41">41</a></p>
    353 </div>
    354 </div>
    355 <a name="1:9"><div class="section"><h4 class="noheading">9. </h4></a>
    356 <p>We'll also need functions to access the token data.
    357 </p>
    358 
    359 <div class="codeblock">
    360 <span class="codeblock_name">{Token Interface <a href="lexer.html#1:9">9</a>}</span>
    361 <pre class="prettyprint lang-c">
    362 c_token_types token_type(token_t *token);
    363 int64_t token_int(token_t *token);
    364 double token_float(token_t *token);
    365 const char *token_string(token_t *token);
    366 char token_char(token_t *token);
    367 int token_line(token_t *token);
    368 int token_column(token_t *token);
    369 void print_token(token_t *tok);
    370 </pre>
    371 
    372 
    373 <p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:41">41</a></p>
    374 </div>
    375 </div>
    376 <a name="1:10"><div class="section"><h4 class="noheading">10. </h4></a>
    377 <p>We'll need types to represent the different kinds of tokens.
    378 </p>
    379 
    380 <div class="codeblock">
    381 <span class="codeblock_name">{Token Types <a href="lexer.html#1:10">10</a>}</span>
    382 <pre class="prettyprint lang-c">
    383 typedef enum {
    384   // Control Keywords
    385   TOK_IF,
    386   TOK_ELSE,
    387   TOK_SWITCH,
    388   TOK_CASE,
    389   TOK_DEFAULT,
    390   TOK_WHILE,
    391   TOK_DO,
    392   TOK_FOR,
    393   TOK_CONTINUE,
    394   TOK_BREAK,
    395   TOK_RETURN,
    396   TOK_GOTO,
    397 
    398   // Type Keywords
    399   TOK_VOID,
    400   TOK_CHAR,
    401   TOK_SHORT,
    402   TOK_INT,
    403   TOK_LONG,
    404   TOK_FLOAT,
    405   TOK_DOUBLE,
    406   TOK_SIGNED,
    407   TOK_UNSIGNED,
    408   TOK_STRUCT,
    409   TOK_UNION,
    410   TOK_ENUM,
    411   TOK_TYPEDEF,
    412 
    413   // Storage Class/Specifier Keywords
    414   TOK_AUTO,
    415   TOK_REGISTER,
    416   TOK_STATIC,
    417   TOK_EXTERN,
    418   TOK_CONST,
    419   TOK_VOLATILE,
    420 
    421   // Misc Keywords
    422   TOK_SIZEOF,
    423 
    424   // Operators
    425   TOK_ADD,            // +
    426   TOK_SUB,            // -
    427   TOK_MUL,            // *
    428   TOK_DIV,            // /
    429   TOK_MOD,            // %
    430   TOK_BIT_AND,        // &amp;
    431   TOK_BIT_OR,         // |
    432   TOK_BIT_XOR,        // ^
    433   TOK_BIT_NOT,        // ~
    434   TOK_LSHIFT,         // &lt;&lt;
    435   TOK_RSHIFT,         // &gt;&gt;
    436   TOK_NOT,            // !
    437   TOK_ASSIGN,         // =
    438   TOK_LT,             // &lt;
    439   TOK_GT,             // &gt;
    440   TOK_INC,            // ++
    441   TOK_DEC,            // --
    442   TOK_EQ,             // ==
    443   TOK_NE,             // !=
    444   TOK_LE,             // &lt;=
    445   TOK_GE,             // &gt;=
    446   TOK_AND,            // &amp;&amp;
    447   TOK_OR,             // ||
    448   TOK_MEMBER_POINTER, // -&gt;
    449   TOK_MEMBER,         // .
    450   TOK_COND_DECISION,  // :
    451   TOK_COND,           // ?
    452   TOK_ASSIGN_ADD,     // +=
    453   TOK_ASSIGN_SUB,     // -=
    454   TOK_ASSIGN_MUL,     // *=
    455   TOK_ASSIGN_DIV,     // /=
    456   TOK_ASSIGN_MOD,     // %=
    457   TOK_ASSIGN_BITAND,  // &amp;=
    458   TOK_ASSIGN_BITOR,   // |=
    459   TOK_ASSIGN_BITXOR,  // ^=
    460   TOK_ASSIGN_LSHIFT,  // &lt;&lt;=
    461   TOK_ASSIGN_RSHIFT,  // &gt;&gt;=
    462 
    463   // Separators
    464   TOK_LEFT_PAREN,    // (
    465   TOK_RIGHT_PAREN,   // )
    466   TOK_LEFT_BRACKET,  // [
    467   TOK_RIGHT_BRACKET, // ]
    468   TOK_LEFT_BRACE,    // {
    469   TOK_RIGHT_BRACE,   // }
    470   TOK_COMMA,         // ,
    471   TOK_SEMICOLON,     // ;
    472   TOK_DOT,           // .
    473   TOK_ELLIPSIS,      // ...
    474   TOK_HASH,          // #
    475 
    476   // Identifiers
    477   TOK_ID,
    478   TOK_TYPEDEF_NAME,
    479 
    480   // Constants
    481   TOK_INTEGER_U32,  // u
    482   TOK_INTEGER_U64,  // ul
    483   TOK_INTEGER_S32,  // (no suffix)
    484   TOK_INTEGER_S64,  // l
    485   TOK_FLOAT_32,     // f
    486   TOK_FLOAT_64,     // (no suffix)
    487   TOK_CHAR_CONST,         // 'c'
    488   TOK_STRING_ASCII, // "string" (width of 8 bits)
    489 
    490   // Special
    491   TOK_EOF,
    492   TOK_ERROR,
    493 } c_token_types;
    494 </pre>
    495 
    496 
    497 <p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:41">41</a></p>
    498 </div>
    499 </div>
    500 <a name="1:11"><div class="section"><h4 class="noheading">11. </h4></a>
    501 <p>We bring this all together in <code>token.h</code>. Line and column are exposed as global variables because <code>skip_whitespace</code> will need to update them.
    502 </p>
    503 
    504 <div class="codeblock">
    505 <span class="codeblock_name">{<strong>token.h</strong> <a href="lexer.html#1:11">11</a>}</span>
    506 <pre class="prettyprint lang-c">
    507 #ifndef TOKEN_H
    508 #define TOKEN_H
    509 #include &lt;stdint.h&gt; // For int64_t
    510 <span class="nocode pln">{Token Types, <a href="lexer.html#1:10">10</a>}</span>
    511 <span class="nocode pln">{Opaque Token Type, <a href="lexer.html#1:7">7</a>}</span>
    512 <span class="nocode pln">{Token Creation and Destruction, <a href="lexer.html#1:18">18</a>}</span>
    513 <span class="nocode pln">{Token Interface, <a href="lexer.html#1:9">9</a>}</span>
    514 extern int column;
    515 extern int line;
    516 #endif
    517 </pre>
    518 
    519 <p class="seealso">Redefined in section <a href="lexer.html#1:41">41</a></p>
    520 
    521 </div>
    522 </div>
    523 <a name="1:12"><div class="section"><h4>12. Token Implementation</h4></a>
    524 <p>Now that we have the interface, we can implement the token data structure. We'll need:
    525 </p>
    526 <ul>
    527 <li>The token type.
    528 </li>
    529 <li>A way to store extra data.
    530 </li>
    531 <li>Implementations of the functions defined in the interface.
    532 </li>
    533 </ul>
    534 <p>To verify the token isn't corrupt, we'll use a tag. <code>TOK_MAGIC_1</code> represents a token with optional data, and <code>TOK_MAGIC_2</code> represents a token without optional data.
    535 </p>
    536 <p>A zero-length array is used in the token data structure. This GCC extension allows us to allocate memory for the token data structure and the token data in one allocation.
    537 </p>
    538 
    539 <div class="codeblock">
    540 <span class="codeblock_name">{Token Data Structure <a href="lexer.html#1:12">12</a>}</span>
    541 <pre class="prettyprint lang-c">
    542 #define TOK_MAGIC_1 0x544F4B454E544F4Bul // "TOKENTOK"
    543 #define TOK_MAGIC_2 0x544F4B544F4B454Eul // "TOKTOKEN"
    544 
    545 struct token {
    546   long magic;
    547   int line;
    548   int column;
    549   short kind;
    550   long opt_data[0];
    551 };
    552 
    553 typedef struct token token_t;
    554 
    555 struct token_data {
    556   union {
    557     int64_t i;
    558     double f;
    559     const char *s;
    560     char c;
    561   } data;
    562 };
    563 
    564 typedef struct token_data token_data_t;
    565 int column = 1;
    566 int line = 1;
    567 </pre>
    568 
    569 
    570 <p class="seealso">Used in section <a href="lexer.html#1:42">42</a></p>
    571 </div>
    572 </div>
    573 <a name="1:13"><div class="section"><h4 class="noheading">13. </h4></a>
    574 <p>We'll implement an interface for accessing the token data and a macro for accessing optional data.
    575 </p>
    576 
    577 <div class="codeblock">
    578 <span class="codeblock_name">{Token Data Access <a href="lexer.html#1:13">13</a>}</span>
    579 <pre class="prettyprint lang-c">
    580 #define token_data(token) ((struct token_data *)((token)-&gt;opt_data))
    581 
    582 c_token_types token_type(token_t *token) {
    583   assert(token-&gt;magic == TOK_MAGIC_1 || token-&gt;magic == TOK_MAGIC_2);
    584   return token-&gt;kind;
    585 }
    586 
    587 int64_t token_int(token_t *token) {
    588   assert(token-&gt;kind == TOK_INTEGER_U32 || token-&gt;kind == TOK_INTEGER_U64 || token-&gt;kind == TOK_INTEGER_S32 || token-&gt;kind == TOK_INTEGER_S64);
    589   assert(token-&gt;magic == TOK_MAGIC_1);
    590   return token_data(token)-&gt;data.i;
    591 }
    592 
    593 double token_float(token_t *token) {
    594   assert(token-&gt;kind == TOK_FLOAT_32 || token-&gt;kind == TOK_FLOAT_64);
    595   assert(token-&gt;magic == TOK_MAGIC_1);
    596   return token_data(token)-&gt;data.f;
    597 }
    598 
    599 const char *token_string(token_t *token) {
    600   assert(token-&gt;kind == TOK_STRING_ASCII || token-&gt;kind == TOK_ID || token-&gt;kind == TOK_TYPEDEF_NAME);
    601   assert(token-&gt;magic == TOK_MAGIC_1);
    602   return token_data(token)-&gt;data.s;
    603 }
    604 
    605 char token_char(token_t *token) {
    606   assert(token-&gt;kind == TOK_CHAR_CONST);
    607   assert(token-&gt;magic == TOK_MAGIC_1);
    608   return token_data(token)-&gt;data.c;
    609 }
    610 
    611 int token_line(token_t *token) {
    612   assert(token-&gt;magic == TOK_MAGIC_1 || token-&gt;magic == TOK_MAGIC_2);
    613   return token-&gt;line;
    614 }
    615 
    616 int token_column(token_t *token) {
    617   assert(token-&gt;magic == TOK_MAGIC_1 || token-&gt;magic == TOK_MAGIC_2);
    618   return token-&gt;column;
    619 }
    620 </pre>
    621 
    622 
    623 <p class="seealso">Used in section <a href="lexer.html#1:42">42</a></p>
    624 </div>
    625 </div>
    626 <a name="1:14"><div class="section"><h4 class="noheading">14. </h4></a>
    627 <p>For debugging, we'll add a function to print the token type.
    628 </p>
    629 
    630 <div class="codeblock">
    631 <span class="codeblock_name">{Token Debugging <a href="lexer.html#1:14">14</a>}</span>
    632 <pre class="prettyprint lang-c">
    633 <span class="nocode pln">{Token Type Enum to String, <a href="lexer.html#1:15">15</a>}</span>
    634 <span class="nocode pln">{Unescape String, <a href="lexer.html#1:16">16</a>}</span>
    635 <span class="nocode pln">{Print Token, <a href="lexer.html#1:17">17</a>}</span>
    636 </pre>
    637 
    638 
    639 <p class="seealso">Used in section <a href="lexer.html#1:42">42</a></p>
    640 </div>
    641 </div>
    642 <a name="1:15"><div class="section"><h4 class="noheading">15. </h4></a>
    643 <p>This function returns a string with the token type name.
    644 </p>
    645 
    646 <div class="codeblock">
    647 <span class="codeblock_name">{Token Type Enum to String <a href="lexer.html#1:15">15</a>}</span>
    648 <pre class="prettyprint lang-c">
    649 const char *token_name_from_type(c_token_types type) {
    650   switch (type) {
    651   case TOK_IF:
    652     return "TOK_IF";
    653   case TOK_ELSE:
    654     return "TOK_ELSE";
    655   case TOK_SWITCH:
    656     return "TOK_SWITCH";
    657   case TOK_CASE:
    658     return "TOK_CASE";
    659   case TOK_DEFAULT:
    660     return "TOK_DEFAULT";
    661   case TOK_WHILE:
    662     return "TOK_WHILE";
    663   case TOK_DO:
    664     return "TOK_DO";
    665   case TOK_FOR:
    666     return "TOK_FOR";
    667   case TOK_CONTINUE:
    668     return "TOK_CONTINUE";
    669   case TOK_BREAK:
    670     return "TOK_BREAK";
    671   case TOK_RETURN:
    672     return "TOK_RETURN";
    673   case TOK_GOTO:
    674     return "TOK_GOTO";
    675   case TOK_VOID:
    676     return "TOK_VOID";
    677   case TOK_CHAR:
    678     return "TOK_CHAR";
    679   case TOK_SHORT:
    680     return "TOK_SHORT";
    681   case TOK_INT:
    682     return "TOK_INT";
    683   case TOK_LONG:
    684     return "TOK_LONG";
    685   case TOK_FLOAT:
    686     return "TOK_FLOAT";
    687   case TOK_DOUBLE:
    688     return "TOK_DOUBLE";
    689   case TOK_SIGNED:
    690     return "TOK_SIGNED";
    691   case TOK_UNSIGNED:
    692     return "TOK_UNSIGNED";
    693   case TOK_STRUCT:
    694     return "TOK_STRUCT";
    695   case TOK_UNION:
    696     return "TOK_UNION";
    697   case TOK_ENUM:
    698     return "TOK_ENUM";
    699   case TOK_TYPEDEF:
    700     return "TOK_TYPEDEF";
    701   case TOK_AUTO:
    702     return "TOK_AUTO";
    703   case TOK_REGISTER:
    704     return "TOK_REGISTER";
    705   case TOK_STATIC:
    706     return "TOK_STATIC";
    707   case TOK_EXTERN:
    708     return "TOK_EXTERN";
    709   case TOK_CONST:
    710     return "TOK_CONST";
    711   case TOK_VOLATILE:
    712     return "TOK_VOLATILE";
    713   case TOK_SIZEOF:
    714     return "TOK_SIZEOF";
    715   case TOK_ADD:
    716     return "TOK_ADD";
    717   case TOK_SUB:
    718     return "TOK_SUB";
    719   case TOK_MUL:
    720     return "TOK_MUL";
    721   case TOK_DIV:
    722     return "TOK_DIV";
    723   case TOK_MOD:
    724     return "TOK_MOD";
    725   case TOK_BIT_AND:
    726     return "TOK_BIT_AND";
    727   case TOK_BIT_OR:
    728     return "TOK_BIT_OR";
    729   case TOK_BIT_XOR:
    730     return "TOK_BIT_XOR";
    731   case TOK_BIT_NOT:
    732     return "TOK_BIT_NOT";
    733   case TOK_LSHIFT:
    734     return "TOK_LSHIFT";
    735   case TOK_RSHIFT:
    736     return "TOK_RSHIFT";
    737   case TOK_NOT:
    738     return "TOK_NOT";
    739   case TOK_ASSIGN:
    740     return "TOK_ASSIGN";
    741   case TOK_LT:
    742     return "TOK_LT";
    743   case TOK_GT:
    744     return "TOK_GT";
    745   case TOK_INC:
    746     return "TOK_INC";
    747   case TOK_DEC:
    748     return "TOK_DEC";
    749   case TOK_EQ:
    750     return "TOK_EQ";
    751   case TOK_NE:
    752     return "TOK_NE";
    753   case TOK_LE:
    754     return "TOK_LE";
    755   case TOK_GE:
    756     return "TOK_GE";
    757   case TOK_AND:
    758     return "TOK_AND";
    759   case TOK_OR:
    760     return "TOK_OR";
    761   case TOK_MEMBER_POINTER:
    762     return "TOK_MEMBER_POINTER";
    763   case TOK_MEMBER:
    764     return "TOK_MEMBER";
    765   case TOK_COND_DECISION:
    766     return "TOK_COND_DECISION";
    767   case TOK_COND:
    768     return "TOK_COND";
    769   case TOK_ASSIGN_ADD:
    770     return "TOK_ASSIGN_ADD";
    771   case TOK_ASSIGN_SUB:
    772     return "TOK_ASSIGN_SUB";
    773   case TOK_ASSIGN_MUL:
    774     return "TOK_ASSIGN_MUL";
    775   case TOK_ASSIGN_DIV:
    776     return "TOK_ASSIGN_DIV";
    777   case TOK_ASSIGN_MOD:
    778     return "TOK_ASSIGN_MOD";
    779   case TOK_ASSIGN_BITAND:
    780     return "TOK_ASSIGN_BITAND";
    781   case TOK_ASSIGN_BITOR:
    782     return "TOK_ASSIGN_BITOR";
    783   case TOK_ASSIGN_BITXOR:
    784     return "TOK_ASSIGN_BITXOR";
    785   case TOK_ASSIGN_LSHIFT:
    786     return "TOK_ASSIGN_LSHIFT";
    787   case TOK_ASSIGN_RSHIFT:
    788     return "TOK_ASSIGN_RSHIFT";
    789   case TOK_HASH:
    790     return "TOK_HASH";
    791   case TOK_ID:
    792     return "TOK_ID";
    793   case TOK_TYPEDEF_NAME:
    794     return "TOK_TYPEDEF_NAME";
    795   case TOK_INTEGER_U32:
    796     return "TOK_INTEGER_U32";
    797   case TOK_INTEGER_U64:
    798     return "TOK_INTEGER_U64";
    799   case TOK_INTEGER_S32:
    800     return "TOK_INTEGER_S32";
    801   case TOK_INTEGER_S64:
    802     return "TOK_INTEGER_S64";
    803   case TOK_FLOAT_32:
    804     return "TOK_FLOAT_32";
    805   case TOK_FLOAT_64:
    806     return "TOK_FLOAT_64";
    807   case TOK_CHAR_CONST:
    808     return "TOK_CHAR_CONST";
    809   case TOK_STRING_ASCII:
    810     return "TOK_STRING_ASCII";
    811   case TOK_EOF:
    812     return "TOK_EOF";
    813   case TOK_ERROR:
    814     return "TOK_ERROR";
    815   case TOK_LEFT_PAREN:
    816     return "TOK_LEFT_PAREN";
    817   case TOK_RIGHT_PAREN:
    818     return "TOK_RIGHT_PAREN";
    819   case TOK_LEFT_BRACKET:
    820     return "TOK_LEFT_BRACKET";
    821   case TOK_RIGHT_BRACKET:
    822     return "TOK_RIGHT_BRACKET";
    823   case TOK_LEFT_BRACE:
    824     return "TOK_LEFT_BRACE";
    825   case TOK_RIGHT_BRACE:
    826     return "TOK_RIGHT_BRACE";
    827   case TOK_COMMA:
    828     return "TOK_COMMA";
    829   case TOK_SEMICOLON:
    830     return "TOK_SEMICOLON";
    831   case TOK_DOT:
    832     return "TOK_DOT";
    833   case TOK_ELLIPSIS:
    834     return "TOK_ELLIPSIS";
    835   }
    836   return "UNKNOWN";
    837 }
    838 </pre>
    839 
    840 
    841 <p class="seealso">Used in section <a href="lexer.html#1:14">14</a></p>
    842 </div>
    843 </div>
    844 <a name="1:16"><div class="section"><h4 class="noheading">16. </h4></a>
    845 <p>This function adds escape characters to a string for printing.
    846 </p>
    847 
    848 <div class="codeblock">
    849 <span class="codeblock_name">{Unescape String <a href="lexer.html#1:16">16</a>}</span>
    850 <pre class="prettyprint lang-c">
    851 #define clamp(x, min, max) ((x) &lt; (min) ? (min) : (x) &gt; (max) ? (max) : (x))
    852 char *re_escape_string(const char *str) {
    853   int len = strlen(str);
    854   char *buf = malloc(len * 2 + 1);
    855   if (!buf) {
    856     fprintf(stderr, "Out of memory. Cannot escape string\n");
    857     exit(1);
    858   }
    859   int i = 0;
    860   for (int j = 0; j &lt; len; j++) {
    861     switch (str[j]) {
    862     case '\a': buf[i++] = '\\'; buf[i++] = 'a'; break;
    863     case '\b': buf[i++] = '\\'; buf[i++] = 'b'; break;
    864     case '\f': buf[i++] = '\\'; buf[i++] = 'f'; break;
    865     case '\n': buf[i++] = '\\'; buf[i++] = 'n'; break;
    866     case '\r': buf[i++] = '\\'; buf[i++] = 'r'; break;
    867     case '\t': buf[i++] = '\\'; buf[i++] = 't'; break;
    868     case '\v': buf[i++] = '\\'; buf[i++] = 'v'; break;
    869     case '\\': buf[i++] = '\\'; buf[i++] = '\\'; break;
    870     case '\'': buf[i++] = '\\'; buf[i++] = '\''; break;
    871     case '"': buf[i++] = '\\'; buf[i++] = '"'; break;
    872     default: {
    873       if (isprint(str[j])) {
    874         buf[i++] = str[j];
    875       } else {
    876         buf[i++] = '\\';
    877         buf[i++] = 'x';
    878         buf[i++] = "0123456789abcdef"[clamp(str[j] &gt;&gt; 4, 0, 0xf)];
    879         buf[i++] = "0123456789abcdef"[clamp(str[j] &amp; 0xf, 0, 0xf)];
    880       }
    881     }
    882     }
    883   }
    884   buf[i] = '\0';
    885   return buf;
    886 }
    887 </pre>
    888 
    889 
    890 <p class="seealso">Used in section <a href="lexer.html#1:14">14</a></p>
    891 </div>
    892 </div>
    893 <a name="1:17"><div class="section"><h4 class="noheading">17. </h4></a>
    894 <p>This function prints the token type and value.
    895 </p>
    896 
    897 <div class="codeblock">
    898 <span class="codeblock_name">{Print Token <a href="lexer.html#1:17">17</a>}</span>
    899 <pre class="prettyprint lang-c">
    900 void print_token(token_t *tok) {
    901   if (!tok) {
    902     printf("NULL\n");
    903     return;
    904   }
    905   const char *name = token_name_from_type(tok-&gt;kind);
    906   switch (tok-&gt;kind) {
    907   case TOK_ID:
    908   case TOK_STRING_ASCII: {
    909     char *escaped = re_escape_string(token_string(tok));
    910     printf("%s: \"%s\"@%d:%d\n", name, escaped, tok-&gt;line, tok-&gt;column);
    911     free(escaped);
    912     break;
    913   }
    914   case TOK_TYPEDEF_NAME: {
    915     char *escaped = re_escape_string(token_string(tok));
    916     printf("%s: %s@%d:%d\n", name, escaped, tok-&gt;line, tok-&gt;column);
    917     free(escaped);
    918     break;
    919   }
    920   case TOK_CHAR_CONST:
    921     printf("%s: '%c'@%d:%d\n", name, token_char(tok), tok-&gt;line, tok-&gt;column);
    922     break;
    923   case TOK_INTEGER_S32:
    924   case TOK_INTEGER_U32:
    925   case TOK_INTEGER_S64:
    926   case TOK_INTEGER_U64:
    927     printf("%s: %ld@%d:%d\n", name, token_int(tok), tok-&gt;line, tok-&gt;column);
    928     break;
    929   case TOK_FLOAT_32:
    930   case TOK_FLOAT_64:
    931     printf("%s: %f@%d:%d\n", name, token_float(tok), tok-&gt;line, tok-&gt;column);
    932     break;
    933   default:
    934     printf("%s@%d:%d\n", name, tok-&gt;line, tok-&gt;column);
    935     break;
    936   }
    937 }
    938 </pre>
    939 
    940 
    941 <p class="seealso">Used in section <a href="lexer.html#1:14">14</a></p>
    942 </div>
    943 </div>
    944 <a name="1:18"><div class="section"><h4 class="noheading">18. </h4></a>
    945 <p>Now we can implement functions to create and destroy tokens. We'll start with the easy ones.
    946 </p>
    947 
    948 <div class="codeblock">
    949 <span class="codeblock_name">{Token Creation and Destruction <a href="lexer.html#1:18">18</a>}</span>
    950 <pre class="prettyprint lang-c">
    951 token_t *token_data_create(c_token_types kind, int lin, int col, int len) {
    952   token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data));
    953   if (!token) {
    954     fputs("Out of memory\n", stderr);
    955     exit(1);
    956   }
    957   token-&gt;magic = TOK_MAGIC_1;
    958   token-&gt;line = lin;
    959   token-&gt;column = col;
    960   column += len;
    961   token-&gt;kind = kind;
    962   return token;
    963 }
    964 
    965 token_t *token_create(c_token_types kind, int lin, int col, int len) {
    966   token_t *token = malloc(sizeof(token_t));
    967   if (!token) {
    968     fputs("Out of memory\n", stderr);
    969     exit(1);
    970   }
    971   token-&gt;magic = TOK_MAGIC_2;
    972   token-&gt;line = lin;
    973   token-&gt;column = col;
    974   column += len;
    975   token-&gt;kind = kind;
    976   return token;
    977 }
    978 
    979 token_t *token_create_int(c_token_types kind, int lin, int col, int64_t i, int len) {
    980   token_t *token = token_data_create(kind, lin, col, len);
    981   token_data(token)-&gt;data.i = i;
    982   return token;
    983 }
    984 
    985 token_t *token_create_float(c_token_types kind, int lin, int col, double f, int len) {
    986   token_t *token = token_data_create(kind, lin, col, len);
    987   token_data(token)-&gt;data.f = f;
    988   return token;
    989 }
    990 
    991 token_t *token_create_char(c_token_types kind, int lin, int col, char c, int len) {
    992   token_t *token = token_data_create(kind, lin, col, len);
    993   token_data(token)-&gt;data.c = c;
    994   return token;
    995 }
    996 
    997 void token_destroy(token_t *token) {
    998   if (token-&gt;magic == TOK_MAGIC_1 || token-&gt;magic == TOK_MAGIC_2) {
    999     free(token);
   1000   } else {
   1001     fputs("Corrupt token\n", stderr);
   1002     exit(1);
   1003   }
   1004 }
   1005 </pre>
   1006 
   1007 
   1008 <p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:42">42</a></p>
   1009 </div>
   1010 </div>
   1011 <a name="1:19"><div class="section"><h4 class="noheading">19. </h4></a>
   1012 <p><code>token_create_string</code> can be implemented either the easy way or the right way. Let's try the easy way.
   1013 </p>
   1014 
   1015 <div class="codeblock">
   1016 <span class="codeblock_name">{Token Create String <a href="lexer.html#1:19">19</a>}</span>
   1017 <pre class="prettyprint lang-c">
   1018 token_t *token_create_string(c_token_types kind, int lin, int col, const char *s, int len) {
   1019   token_t *token = token_create(kind, lin, col, len);
   1020   token_data(token)-&gt;data.s = strdup(s);
   1021   return token;
   1022 }
   1023 </pre>
   1024 
   1025 <p class="seealso">Redefined in section <a href="lexer.html#1:40">40</a></p>
   1026 <p class="seealso">Used in section <a href="lexer.html#1:42">42</a></p>
   1027 </div>
   1028 </div>
   1029 <a name="1:20"><div class="section"><h4 class="noheading">20. </h4></a>
   1030 <p>There's an issue with this approach. <code>token_create_string</code> will be called for every identifier and every string in a program. Imagine a large program, say a shell, with a bunch of user input and output. That program will likely have 20-40 calls to <code>fprintf</code>, <code>fscanf</code>, <code>strchr</code>, <code>strtok</code>, each. We create a new string for each of those calls. That's a lot of duplicates, and can quickly add up to a lot of memory usage.
   1031 </p>
   1032 <p>To fix this, we use a hash table to store the strings. We'll define a hash table in <code>hash_table.h</code> and <code>hash_table.c</code>.
   1033 </p>
   1034 
   1035 </div>
   1036 <a name="1:21"><div class="section"><h4>21. Hash Table</h4></a>
   1037 <p>A hash table is a data structure that maps keys to values. It's commonly used for implementing symbol tables to store variables and functions. To create a generic hash table, we'll need:
   1038 </p>
   1039 <ol>
   1040 <li>A hash function to convert keys into array indices
   1041 </li>
   1042 <li>A comparison function to check if two keys are equal
   1043 </li>
   1044 <li>An opaque type to represent the hash table
   1045 </li>
   1046 <li>A destructor function to clean up keys and values when removing entries
   1047 </li>
   1048 </ol>
   1049 <p>Let's start with the interface:
   1050 </p>
   1051 
   1052 </div>
   1053 <a name="1:22"><div class="section"><h4 class="noheading">22. </h4></a>
   1054 
   1055 <div class="codeblock">
   1056 <span class="codeblock_name">{Hash Table Opaque Types <a href="lexer.html#1:22">22</a>}</span>
   1057 <pre class="prettyprint lang-c">
   1058 typedef struct hash_table hash_table_t;
   1059 typedef int (*hash_table_cmp_fn)(const void *key1, const void *key2);
   1060 typedef unsigned int (*hash_table_hash_fn)(const void *key);
   1061 typedef void (*hash_table_dtor)(void *data, int is_key);
   1062 </pre>
   1063 
   1064 
   1065 <p class="seealso">Used in section <a href="lexer.html#1:25">25</a></p>
   1066 </div>
   1067 </div>
   1068 <a name="1:23"><div class="section"><h4 class="noheading">23. </h4></a>
   1069 
   1070 <div class="codeblock">
   1071 <span class="codeblock_name">{Hash Table Creation and Destruction <a href="lexer.html#1:23">23</a>}</span>
   1072 <pre class="prettyprint lang-c">
   1073 hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash, hash_table_dtor dtor);
   1074 void hash_table_destroy(hash_table_t *table);
   1075 </pre>
   1076 
   1077 
   1078 <p class="seealso">Used in section <a href="lexer.html#1:25">25</a></p>
   1079 </div>
   1080 </div>
   1081 <a name="1:24"><div class="section"><h4 class="noheading">24. </h4></a>
   1082 
   1083 <div class="codeblock">
   1084 <span class="codeblock_name">{Hash Table Access <a href="lexer.html#1:24">24</a>}</span>
   1085 <pre class="prettyprint lang-c">
   1086 void *hash_table_get(const hash_table_t *table, const void *key);
   1087 void hash_table_put(hash_table_t *table, void *key, void *value);
   1088 void hash_table_remove(hash_table_t *table, const void *key);
   1089 </pre>
   1090 
   1091 
   1092 <p class="seealso">Used in section <a href="lexer.html#1:25">25</a></p>
   1093 </div>
   1094 </div>
   1095 <a name="1:25"><div class="section"><h4 class="noheading">25. </h4></a>
   1096 
   1097 <div class="codeblock">
   1098 <span class="codeblock_name">{<strong>hash_table.h</strong> <a href="lexer.html#1:25">25</a>}</span>
   1099 <pre class="prettyprint lang-c">
   1100 #ifndef HASH_TABLE_H
   1101 #define HASH_TABLE_H
   1102 
   1103 <span class="nocode pln">{Hash Table Opaque Types, <a href="lexer.html#1:22">22</a>}</span>
   1104 <span class="nocode pln">{Hash Table Creation and Destruction, <a href="lexer.html#1:23">23</a>}</span>
   1105 <span class="nocode pln">{Hash Table Access, <a href="lexer.html#1:24">24</a>}</span>
   1106 
   1107 #endif /* HASH_TABLE_H */
   1108 </pre>
   1109 
   1110 
   1111 
   1112 </div>
   1113 </div>
   1114 <a name="1:26"><div class="section"><h4 class="noheading">26. </h4></a>
   1115 <p>Now let's implement the hash table:
   1116 </p>
   1117 
   1118 <div class="codeblock">
   1119 <span class="codeblock_name">{<strong>hash_table.c</strong> <a href="lexer.html#1:26">26</a>}</span>
   1120 <pre class="prettyprint lang-c">
   1121 #include &lt;stdlib.h&gt;
   1122 #include &lt;string.h&gt;
   1123 #include &lt;stdio.h&gt;
   1124 #include "hash_table.h"
   1125 
   1126 <span class="nocode pln">{Hash Table Data Structure, <a href="lexer.html#1:27">27</a>}</span>
   1127 <span class="nocode pln">{Hash Table Entry Data Structure, <a href="lexer.html#1:28">28</a>}</span>
   1128 
   1129 hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash, hash_table_dtor dtor) {
   1130 <span class="nocode pln">    {Allocate and Initialize Hash Table, <a href="lexer.html#1:29">29</a>}</span>
   1131     return table;
   1132 }
   1133 
   1134 void hash_table_destroy(hash_table_t *table) {
   1135     if (!table) return;
   1136 <span class="nocode pln">    {Destroy Entries, <a href="lexer.html#1:30">30</a>}</span>
   1137     free(table-&gt;entries);
   1138     free(table);
   1139 }
   1140 
   1141 void *hash_table_get(const hash_table_t *table, const void *key) {
   1142     if (!table || !key) return NULL;
   1143 <span class="nocode pln">    {Get Entry By Hash, <a href="lexer.html#1:31">31</a>}</span>
   1144 <span class="nocode pln">    {Loop Through Entries and Return Value if Match, <a href="lexer.html#1:35">35</a>}</span>
   1145     return NULL;
   1146 }
   1147 
   1148 void hash_table_put(hash_table_t *table, void *key, void *value) {
   1149     if (!table || !key) return;
   1150 <span class="nocode pln">    {Get Entry By Hash, <a href="lexer.html#1:31">31</a>}</span>
   1151 <span class="nocode pln">    {Loop Through Entries and Replace Value if Key Matches, <a href="lexer.html#1:32">32</a>}</span>
   1152 <span class="nocode pln">    {Allocate New Entry if No Match, <a href="lexer.html#1:33">33</a>}</span>
   1153 }
   1154 
   1155 void hash_table_remove(hash_table_t *table, const void *key) {
   1156     if (!table || !key) return;
   1157 <span class="nocode pln">    {Get Entry By Hash, <a href="lexer.html#1:31">31</a>}</span>
   1158 <span class="nocode pln">    {Loop Through Entries and Remove Entry if Key Matches, <a href="lexer.html#1:34">34</a>}</span>
   1159 }
   1160 
   1161 #ifdef TEST_HASH_TABLE
   1162 #include &lt;assert.h&gt;
   1163 
   1164 static int string_cmp(const void *key1, const void *key2) {
   1165     return strcmp((const char *)key1, (const char *)key2);
   1166 }
   1167 
   1168 static unsigned int string_hash(const void *key) {
   1169     unsigned int hash = 5381;
   1170     const unsigned char *str = key;
   1171     int c;
   1172 
   1173     while ((c = *str++))
   1174     {
   1175         hash = ((hash &lt;&lt; 5) + hash) + c;
   1176     }
   1177 
   1178     return hash;
   1179 }
   1180 
   1181 int main(void) {
   1182     hash_table_t *table = hash_table_create(16, string_cmp, string_hash, NULL);
   1183     assert(table != NULL);
   1184 
   1185     hash_table_put(table, "foo", "bar");
   1186     hash_table_put(table, "foo", "baz");
   1187     assert(strcmp((const char *)hash_table_get(table, "foo"), "baz") == 0);
   1188 
   1189     hash_table_remove(table, "foo");
   1190     assert(hash_table_get(table, "foo") == NULL);
   1191 
   1192     hash_table_destroy(table);
   1193     printf("All tests passed!\n");
   1194     return 0;
   1195 }
   1196 #endif
   1197 </pre>
   1198 
   1199 
   1200 
   1201 </div>
   1202 </div>
   1203 <a name="1:27"><div class="section"><h4 class="noheading">27. </h4></a>
   1204 <p>The hash table data structure contains an array of entry pointers, the size of the array, and function pointers for comparison, hashing, and destruction.
   1205 </p>
   1206 
   1207 <div class="codeblock">
   1208 <span class="codeblock_name">{Hash Table Data Structure <a href="lexer.html#1:27">27</a>}</span>
   1209 <pre class="prettyprint lang-c">
   1210 struct hash_table {
   1211     struct hash_table_entry **entries;
   1212     int size;
   1213     hash_table_cmp_fn cmp;
   1214     hash_table_hash_fn hash;
   1215     hash_table_dtor dtor;
   1216 };
   1217 </pre>
   1218 
   1219 
   1220 <p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p>
   1221 </div>
   1222 </div>
   1223 <a name="1:28"><div class="section"><h4 class="noheading">28. </h4></a>
   1224 <p>Each entry in the hash table contains a key, a value, and a pointer to the next entry in the chain for collision resolution via chaining.
   1225 </p>
   1226 
   1227 <div class="codeblock">
   1228 <span class="codeblock_name">{Hash Table Entry Data Structure <a href="lexer.html#1:28">28</a>}</span>
   1229 <pre class="prettyprint lang-c">
   1230 struct hash_table_entry {
   1231     void *key;
   1232     void *value;
   1233     struct hash_table_entry *next;
   1234 };
   1235 </pre>
   1236 
   1237 
   1238 <p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p>
   1239 </div>
   1240 </div>
   1241 <a name="1:29"><div class="section"><h4 class="noheading">29. </h4></a>
   1242 <p>To allocate a hash table, we allocate memory for the table structure and its entries, initialize the entries to NULL, and set the function pointers.
   1243 </p>
   1244 
   1245 <div class="codeblock">
   1246 <span class="codeblock_name">{Allocate and Initialize Hash Table <a href="lexer.html#1:29">29</a>}</span>
   1247 <pre class="prettyprint lang-c">
   1248 hash_table_t *table = malloc(sizeof(struct hash_table));
   1249 if (!table) {
   1250     fprintf(stderr, "Error: Out of memory, could not allocate hash table\n");
   1251     exit(EXIT_FAILURE);
   1252 }
   1253 table-&gt;entries = calloc(size, sizeof(struct hash_table_entry *));
   1254 if (!table-&gt;entries) {
   1255     fprintf(stderr, "Error: Out of memory, could not allocate hash table entries\n");
   1256     free(table);
   1257     exit(EXIT_FAILURE);
   1258 }
   1259 table-&gt;size = size;
   1260 table-&gt;cmp = cmp;
   1261 table-&gt;hash = hash;
   1262 table-&gt;dtor = dtor;
   1263 </pre>
   1264 
   1265 
   1266 <p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p>
   1267 </div>
   1268 </div>
   1269 <a name="1:30"><div class="section"><h4 class="noheading">30. </h4></a>
   1270 <p>To destroy the entries in a hash table,  we iterate through all entries, free the keys and values using the destructor if provided, and free the entry itself.
   1271 </p>
   1272 
   1273 <div class="codeblock">
   1274 <span class="codeblock_name">{Destroy Entries <a href="lexer.html#1:30">30</a>}</span>
   1275 <pre class="prettyprint lang-c">
   1276 for (int i = 0; i &lt; table-&gt;size; i++) {
   1277     struct hash_table_entry *entry = table-&gt;entries[i];
   1278     while (entry) {
   1279         struct hash_table_entry *next = entry-&gt;next;
   1280         if (table-&gt;dtor) {
   1281             table-&gt;dtor(entry-&gt;key, 1);
   1282             table-&gt;dtor(entry-&gt;value, 0);
   1283         }
   1284         free(entry);
   1285         entry = next;
   1286     }
   1287 }
   1288 </pre>
   1289 
   1290 
   1291 <p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p>
   1292 </div>
   1293 </div>
   1294 <a name="1:31"><div class="section"><h4 class="noheading">31. </h4></a>
   1295 <p>To retrieve an entry's hash bucket, we apply the hash function to the key and take the modulus of the result with the table size.
   1296 </p>
   1297 
   1298 <div class="codeblock">
   1299 <span class="codeblock_name">{Get Entry By Hash <a href="lexer.html#1:31">31</a>}</span>
   1300 <pre class="prettyprint lang-c">
   1301 unsigned int hash = table-&gt;hash(key) % table-&gt;size;
   1302 struct hash_table_entry *entry = table-&gt;entries[hash];
   1303 </pre>
   1304 
   1305 
   1306 <p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p>
   1307 </div>
   1308 </div>
   1309 <a name="1:32"><div class="section"><h4 class="noheading">32. </h4></a>
   1310 <p>When putting a new entry in the table, we first check if the key already exists. If it does, we update the value; otherwise, we create a new entry.
   1311 </p>
   1312 
   1313 <div class="codeblock">
   1314 <span class="codeblock_name">{Loop Through Entries and Replace Value if Key Matches <a href="lexer.html#1:32">32</a>}</span>
   1315 <pre class="prettyprint lang-c">
   1316 while (entry) {
   1317     if (table-&gt;cmp(entry-&gt;key, key) == 0) {
   1318         if (table-&gt;dtor) table-&gt;dtor(entry-&gt;value, 0);
   1319         entry-&gt;value = value;
   1320         return;
   1321     }
   1322     entry = entry-&gt;next;
   1323 }
   1324 </pre>
   1325 
   1326 
   1327 <p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p>
   1328 </div>
   1329 </div>
   1330 <a name="1:33"><div class="section"><h4 class="noheading">33. </h4></a>
   1331 <p>If no matching key is found, we create a new entry and insert it at the beginning of the hash bucket. 
   1332 </p>
   1333 <p>This can possibly improve performance due to a property called temporal locality. When we access an entry, we're likely to access it again soon. Since the entry is at the beginning of the list, it's likely to be accessed again soon.
   1334 </p>
   1335 
   1336 <div class="codeblock">
   1337 <span class="codeblock_name">{Allocate New Entry if No Match <a href="lexer.html#1:33">33</a>}</span>
   1338 <pre class="prettyprint lang-c">
   1339 struct hash_table_entry *new_entry = malloc(sizeof(struct hash_table_entry));
   1340 if (!new_entry) {
   1341     fprintf(stderr, "Error: Out of memory, could not allocate hash table entry\n");
   1342     exit(EXIT_FAILURE);
   1343 }
   1344 new_entry-&gt;key = key;
   1345 new_entry-&gt;value = value;
   1346 new_entry-&gt;next = table-&gt;entries[hash];
   1347 table-&gt;entries[hash] = new_entry;
   1348 </pre>
   1349 
   1350 
   1351 <p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p>
   1352 </div>
   1353 </div>
   1354 <a name="1:34"><div class="section"><h4 class="noheading">34. </h4></a>
   1355 <p>To remove an entry, we find its bucket, update the linked list to bypass it, then free the entry and its contents.
   1356 </p>
   1357 
   1358 <div class="codeblock">
   1359 <span class="codeblock_name">{Loop Through Entries and Remove Entry if Key Matches <a href="lexer.html#1:34">34</a>}</span>
   1360 <pre class="prettyprint lang-c">
   1361 struct hash_table_entry *prev = NULL;
   1362 while (entry) {
   1363     if (table-&gt;cmp(entry-&gt;key, key) == 0) {
   1364         if (prev)
   1365             prev-&gt;next = entry-&gt;next;
   1366         else
   1367             table-&gt;entries[hash] = entry-&gt;next;
   1368         
   1369         if (table-&gt;dtor) {
   1370             table-&gt;dtor(entry-&gt;key, 1);
   1371             table-&gt;dtor(entry-&gt;value, 0);
   1372         }
   1373         free(entry);
   1374         return;
   1375     }
   1376     prev = entry;
   1377     entry = entry-&gt;next;
   1378 }
   1379 </pre>
   1380 
   1381 
   1382 <p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p>
   1383 </div>
   1384 </div>
   1385 <a name="1:35"><div class="section"><h4 class="noheading">35. </h4></a>
   1386 <p>To retrieve a value from a given bucket, we just walk the list and return the value if a matching key is found.
   1387 </p>
   1388 
   1389 <div class="codeblock">
   1390 <span class="codeblock_name">{Loop Through Entries and Return Value if Match <a href="lexer.html#1:35">35</a>}</span>
   1391 <pre class="prettyprint lang-c">
   1392 while (entry) {
   1393     if (table-&gt;cmp(entry-&gt;key, key) == 0) {
   1394         return entry-&gt;value;
   1395     }
   1396     entry = entry-&gt;next;
   1397 }
   1398 </pre>
   1399 
   1400 
   1401 <p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p>
   1402 </div>
   1403 </div>
   1404 <a name="1:36"><div class="section"><h4 class="noheading">36. </h4></a>
   1405 <p>We're now almost ready to implement <code>token_create_string</code> the right way. First, we'll need a good hash function. 
   1406 </p>
   1407 <p>Hash functions are a very interesting topic and there's a lot of good research on them. The hash function we use should be fast, have a low collision rate, and be able to handle strings of any length.
   1408 </p>
   1409 <p>We can't just sum the characters in a string, because that would mean that "stop" and "pots" would have the same hash. Multiplying has the same problem. If we take each to the power of its position in the string, we get a better distribution, but it's still awful.
   1410 </p>
   1411 <p>Using a simple python program, I brute-forced all possible 4-character strings and ran our power-hash function on them, the result showed that for 456976 possible strings, only 3760 were unique, which is terrible.
   1412 </p>
   1413 <p>Instead of trying to come up with a new hash function, we can use one that's been well-tested and is known to work well.
   1414 </p>
   1415 <p>The first time I wrote this, I used the hash function from the 'Red Dragon Book' (Compilers: Principles, Techniques, and Tools). 
   1416 </p>
   1417 
   1418 <div class="codeblock">
   1419 <span class="codeblock_name">{Hash Function <a href="lexer.html#1:36">36</a>}</span>
   1420 <pre class="prettyprint lang-c">
   1421 unsigned long hash_string(void *key) {
   1422   unsigned long hash = 0, g;
   1423   char *p = key;
   1424   while (*p) {
   1425     hash = (hash \&lt;\&lt; 4) + *p++;
   1426     if ((g = hash \&amp; 0xf0000000) != 0) {
   1427       hash ^= g \&gt;\&gt; 24;
   1428       hash ^= g;
   1429     }
   1430   }
   1431   return hash;
   1432 }
   1433 </pre>
   1434 
   1435 
   1436 <p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p>
   1437 </div>
   1438 <p>This is a bit slow on modern processors because it's not very cache-friendly. We can do better. Let's use its child, ELFHash, from libc.
   1439 </p>
   1440 <p>As you can see in the code below, this function avoids extra operations and should be much faster. 
   1441 </p>
   1442 
   1443 <div class="codeblock">
   1444 <span class="codeblock_name">{Hash Function <a href="lexer.html#1:36">36</a>} :=</span>
   1445 <pre class="prettyprint lang-c">
   1446 unsigned int hash_string(const void *key) {
   1447   unsigned long hash = 0, hi = 0;
   1448   const char *p = key;
   1449   hash = *p;
   1450   if (hash != 0 &amp;&amp; p[1] != 0) {
   1451     hash = (hash &lt;&lt; 4) + p[1];
   1452     if (p[2] != 0) {
   1453       hash = (hash &lt;&lt; 4) + p[2];
   1454       if (p[3] != 0) {
   1455         hash = (hash &lt;&lt; 4) + p[3];
   1456         if (p[4] != 0) {
   1457           hash = (hash &lt;&lt; 4) + p[4];
   1458           p += 5;
   1459           while (*p != 0) {
   1460             hash = (hash &lt;&lt; 4) + *p++;
   1461             hi = hash &amp; 0xf0000000l;
   1462             hash ^= hi &gt;&gt; 24;
   1463           }
   1464           hash &amp;= 0x0fffffffl;
   1465         }
   1466       }
   1467     }
   1468   }
   1469   return hash;
   1470 }
   1471 </pre>
   1472 
   1473 
   1474 <p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p>
   1475 </div>
   1476 </div>
   1477 <a name="1:37"><div class="section"><h4 class="noheading">37. </h4></a>
   1478 <p>We also need a comparison function for strings.
   1479 </p>
   1480 
   1481 <div class="codeblock">
   1482 <span class="codeblock_name">{String Comparison <a href="lexer.html#1:37">37</a>}</span>
   1483 <pre class="prettyprint lang-c">
   1484 int cmp_string(const void *key1, const void *key2) {
   1485   return strcmp((char *)key1, (char *)key2);
   1486 }
   1487 </pre>
   1488 
   1489 
   1490 <p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p>
   1491 </div>
   1492 </div>
   1493 <a name="1:38"><div class="section"><h4 class="noheading">38. </h4></a>
   1494 <p>Finally, we'll need a destructor for entries.
   1495 </p>
   1496 
   1497 <div class="codeblock">
   1498 <span class="codeblock_name">{String Destructor <a href="lexer.html#1:38">38</a>}</span>
   1499 <pre class="prettyprint lang-c">
   1500 void dtor_string(void *value, int is_key) {
   1501   if (is_key) {
   1502     free(value); // Since the key and value are the same, we only need to free once.
   1503   }
   1504 }
   1505 </pre>
   1506 
   1507 
   1508 <p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p>
   1509 </div>
   1510 </div>
   1511 <a name="1:39"><div class="section"><h4 class="noheading">39. </h4></a>
   1512 <p>These functions go in util.c
   1513 </p>
   1514 
   1515 <div class="codeblock">
   1516 <span class="codeblock_name">{<strong>util.c</strong> <a href="lexer.html#1:39">39</a>}</span>
   1517 <pre class="prettyprint lang-c">
   1518 #include &lt;string.h&gt;
   1519 #include &lt;stdlib.h&gt;
   1520 #include "hash_table.h"
   1521 <span class="nocode pln">{String Comparison, <a href="lexer.html#1:37">37</a>}</span>
   1522 <span class="nocode pln">{Hash Function, <a href="lexer.html#1:36">36</a>}</span>
   1523 <span class="nocode pln">{String Destructor, <a href="lexer.html#1:38">38</a>}</span>
   1524 </pre>
   1525 
   1526 
   1527 
   1528 </div>
   1529 
   1530 <div class="codeblock">
   1531 <span class="codeblock_name">{<strong>util.h</strong> <a href="lexer.html#1:39">39</a>}</span>
   1532 <pre class="prettyprint lang-c">
   1533 #ifndef UTIL_H
   1534 #define UTIL_H
   1535 #include "hash_table.h"
   1536 int cmp_string(const void *key1, const void *key2);
   1537 unsigned int hash_string(const void *key);
   1538 void dtor_string(void *value, int is_key);
   1539 #endif
   1540 </pre>
   1541 
   1542 
   1543 
   1544 </div>
   1545 </div>
   1546 <a name="1:40"><div class="section"><h4 class="noheading">40. </h4></a>
   1547 <p>Now we can implement <code>token_create_string</code> the right way.
   1548 </p>
   1549 <p>You might notice that we're using the same key and value. This way of using a hash table is normally called a set. We're using it to store strings, but we could use it to store anything we want to deduplicate.
   1550 </p>
   1551 
   1552 <div class="codeblock">
   1553 <span class="codeblock_name">{Token Create String <a href="lexer.html#1:19">19</a>} :=</span>
   1554 <pre class="prettyprint lang-c">
   1555 hash_table_t *string_table;
   1556 token_t *token_create_string(c_token_types kind, int lin, int col,
   1557                                     const char *s, int len) {
   1558   if (string_table == NULL) {
   1559     string_table = hash_table_create(2048, cmp_string, hash_string, dtor_string);
   1560   }
   1561   token_t *token = token_data_create(kind, lin, col, len);
   1562   char *key = hash_table_get(string_table, (void *)s);
   1563   if (key == NULL) {
   1564     key = strdup(s);
   1565     hash_table_put(string_table, key, key);
   1566   }
   1567   token_data(token)-&gt;data.s = key;
   1568   return token;
   1569 }
   1570 </pre>
   1571 
   1572 
   1573 <p class="seealso">Used in section <a href="lexer.html#1:42">42</a></p>
   1574 </div>
   1575 </div>
   1576 <a name="1:41"><div class="section"><h4 class="noheading">41. </h4></a>
   1577 <p>We'll add an external declaration for <code>string_table</code> in <code>token.h</code> so other programs can take advantage of it.
   1578 </p>
   1579 
   1580 <div class="codeblock">
   1581 <span class="codeblock_name">{<strong>token.h</strong> <a href="lexer.html#1:11">11</a>} :=</span>
   1582 <pre class="prettyprint lang-c">
   1583 #ifndef TOKEN_H
   1584 #define TOKEN_H
   1585 #include &lt;stdint.h&gt; // We use this for int64_t
   1586 #include "hash_table.h" // We need this for the string table
   1587 <span class="nocode pln">{Token Types, <a href="lexer.html#1:10">10</a>}</span>
   1588 <span class="nocode pln">{Opaque Token Type, <a href="lexer.html#1:7">7</a>}</span>
   1589 <span class="nocode pln">{Token Creation and Destruction Interface, <a href="lexer.html#1:8">8</a>}</span>
   1590 <span class="nocode pln">{Token Interface, <a href="lexer.html#1:9">9</a>}</span>
   1591 extern hash_table_t *string_table;
   1592 extern int column;
   1593 extern int line;
   1594 #endif
   1595 </pre>
   1596 
   1597 
   1598 
   1599 </div>
   1600 </div>
   1601 <a name="1:42"><div class="section"><h4 class="noheading">42. </h4></a>
   1602 <p>Finally, we implement the token data structure in <code>token.c</code>.
   1603 </p>
   1604 
   1605 <div class="codeblock">
   1606 <span class="codeblock_name">{<strong>token.c</strong> <a href="lexer.html#1:42">42</a>}</span>
   1607 <pre class="prettyprint lang-c">
   1608 #include &lt;stdlib.h&gt;
   1609 #include &lt;string.h&gt;
   1610 #include &lt;stdio.h&gt;
   1611 #include &lt;assert.h&gt;
   1612 #include &lt;ctype.h&gt;
   1613 #include "token.h"
   1614 #include "hash_table.h"
   1615 #include "util.h"
   1616 <span class="nocode pln">{Token Data Structure, <a href="lexer.html#1:12">12</a>}</span>
   1617 <span class="nocode pln">{Token Data Access, <a href="lexer.html#1:13">13</a>}</span>
   1618 <span class="nocode pln">{Token Creation and Destruction, <a href="lexer.html#1:18">18</a>}</span>
   1619 <span class="nocode pln">{Token Create String, <a href="lexer.html#1:19">19</a>}</span>
   1620 <span class="nocode pln">{Token Debugging, <a href="lexer.html#1:14">14</a>}</span>
   1621 </pre>
   1622 
   1623 
   1624 
   1625 </div>
   1626 </div>
   1627 <a name="1:43"><div class="section"><h4>43. Input</h4></a>
   1628 <p>Input will provide a simple interface for reading characters from a file. The stream itself is deliberately hidden from the tokenizer, so that the tokenizer doesn't have to worry about buffering or anything like that.
   1629 </p>
   1630 
   1631 </div>
   1632 <a name="1:44"><div class="section"><h4 class="noheading">44. </h4></a>
   1633 
   1634 <div class="codeblock">
   1635 <span class="codeblock_name">{Input Interface <a href="lexer.html#1:44">44</a>}</span>
   1636 <pre class="prettyprint lang-c">
   1637 void input_init(const char *filename);
   1638 int input_getc(void);
   1639 void input_ungetc(int c);
   1640 void input_destroy(void);
   1641 </pre>
   1642 
   1643 
   1644 <p class="seealso">Used in section <a href="lexer.html#1:53">53</a></p>
   1645 </div>
   1646 <p>When the program wants to start reading a file, it calls <code>input_init</code> with the filename. It can then call <code>input_getc</code> to get the next character in the file. If there's no more input, <code>input_getc</code> will return <code>EOF</code>.
   1647 </p>
   1648 <p>There's also an <code>input_ungetc</code> function, which allows the program to put a character back into the stream. I'll only allow one character to be put back, but that should be enough for the tokenizer.
   1649 </p>
   1650 <p>Finally, when the program is done reading the file, it should call <code>input_destroy</code> to clean up.
   1651 </p>
   1652 
   1653 </div>
   1654 <a name="1:45"><div class="section"><h4>45. Input Design Decisions</h4></a>
   1655 <p>Per rule 1, we're trying to keep memory usage low. That means that instead of reading the entire file into memory, we'll need to read it in chunks. There are a couple of choices for how to do this:
   1656 </p>
   1657 <ol>
   1658 <li><p><strong>Read a line at a time</strong>: This is a more natural approach, but it has two drawbacks. First, it requires a large buffer to store the line (C normally specifies <code>BUFSIZ</code> as 8192 bytes). Second, if the line is longer than <code>BUFSIZ</code>, we'll have to read the line in chunks anyway.
   1659 </p>
   1660 </li>
   1661 <li><p><strong>Choose some arbitrary buffer size and read that many bytes at a time</strong>: This is the approach I'm going to take. It's a little less natural, but it's more memory efficient.
   1662 </p>
   1663 </li>
   1664 </ol>
   1665 <p>Input will read chunks of 128 bytes at a time, reusing the same static buffer. This limitation is not visible to the tokenizer, which will only see the <code>input_getc</code> interface.
   1666 </p>
   1667 <p>When the buffer is exhausted, <code>input_getc</code> will call <code>nextline</code>, which will read the next chunk of the file.
   1668 </p>
   1669 
   1670 </div>
   1671 <a name="1:46"><div class="section"><h4>46. Input Implementation</h4></a>
   1672 <p>The implementation of the input module is pretty straightforward. We have the following data structures and defines as globals:
   1673 </p>
   1674 
   1675 </div>
   1676 <a name="1:47"><div class="section"><h4 class="noheading">47. </h4></a>
   1677 
   1678 <div class="codeblock">
   1679 <span class="codeblock_name">{Input Data <a href="lexer.html#1:47">47</a>}</span>
   1680 <pre class="prettyprint lang-c">
   1681 #define CHUNK_SIZE 128
   1682 static char buffer[CHUNK_SIZE];
   1683 static int buffer_pos = 0;
   1684 static int buffer_size = 0;
   1685 static char unget_buffer_stack[8];
   1686 static int unget_buffer_stack_pos = 0;
   1687 
   1688 static FILE *file = NULL;
   1689 </pre>
   1690 
   1691 
   1692 <p class="seealso">Used in section <a href="lexer.html#1:52">52</a></p>
   1693 </div>
   1694 <p>When the program calls <code>input_init</code>, we open the file.
   1695 </p>
   1696 
   1697 </div>
   1698 <a name="1:48"><div class="section"><h4 class="noheading">48. </h4></a>
   1699 
   1700 <div class="codeblock">
   1701 <span class="codeblock_name">{Input Initialization <a href="lexer.html#1:48">48</a>}</span>
   1702 <pre class="prettyprint lang-c">
   1703 void input_init(const char *filename) {
   1704   file = fopen(filename, "r");
   1705   if (file == NULL) {
   1706     fprintf(stderr, "Error: Cannot open file %s\n", filename);
   1707     exit(1);
   1708   }
   1709 }
   1710 </pre>
   1711 
   1712 
   1713 <p class="seealso">Used in section <a href="lexer.html#1:52">52</a></p>
   1714 </div>
   1715 <p>When the program calls <code>input_getc</code>, we return the next character in the buffer. If the buffer is exhausted, we call <code>nextline</code>. We also track the line and column.
   1716 </p>
   1717 
   1718 </div>
   1719 <a name="1:49"><div class="section"><h4 class="noheading">49. </h4></a>
   1720 
   1721 <div class="codeblock">
   1722 <span class="codeblock_name">{Input Get Character <a href="lexer.html#1:49">49</a>}</span>
   1723 <pre class="prettyprint lang-c">
   1724 int input_getc(void) {
   1725   if (unget_buffer_stack_pos &gt; 0) {
   1726     return unget_buffer_stack[--unget_buffer_stack_pos];
   1727   }
   1728   if (buffer_pos == buffer_size) {
   1729     buffer_size = fread(buffer, 1, CHUNK_SIZE, file);
   1730     buffer_pos = 0;
   1731   }
   1732   if (buffer_size == 0) {
   1733     return EOF;
   1734   }
   1735   char c = buffer[buffer_pos++];
   1736   return c;
   1737 }
   1738 </pre>
   1739 
   1740 
   1741 <p class="seealso">Used in section <a href="lexer.html#1:52">52</a></p>
   1742 </div>
   1743 <p>When the program calls <code>input_ungetc</code>, we save the character in the <code>unget_buffer</code>.
   1744 </p>
   1745 
   1746 </div>
   1747 <a name="1:50"><div class="section"><h4 class="noheading">50. </h4></a>
   1748 
   1749 <div class="codeblock">
   1750 <span class="codeblock_name">{Input Unget Character <a href="lexer.html#1:50">50</a>}</span>
   1751 <pre class="prettyprint lang-c">
   1752 void input_ungetc(int c) {
   1753   unget_buffer_stack[unget_buffer_stack_pos++] = c;
   1754 }
   1755 </pre>
   1756 
   1757 
   1758 <p class="seealso">Used in section <a href="lexer.html#1:52">52</a></p>
   1759 </div>
   1760 <p>Since we're not using dynamic memory allocation, cleanup is pretty simple.
   1761 </p>
   1762 
   1763 </div>
   1764 <a name="1:51"><div class="section"><h4 class="noheading">51. </h4></a>
   1765 
   1766 <div class="codeblock">
   1767 <span class="codeblock_name">{Input Destroy <a href="lexer.html#1:51">51</a>}</span>
   1768 <pre class="prettyprint lang-c">
   1769 void input_destroy(void) {
   1770   fclose(file);
   1771 }
   1772 </pre>
   1773 
   1774 
   1775 <p class="seealso">Used in section <a href="lexer.html#1:52">52</a></p>
   1776 </div>
   1777 </div>
   1778 <a name="1:52"><div class="section"><h4 class="noheading">52. </h4></a>
   1779 <p>We put the whole thing together in <code>input.c</code>.
   1780 </p>
   1781 
   1782 <div class="codeblock">
   1783 <span class="codeblock_name">{<strong>input.c</strong> <a href="lexer.html#1:52">52</a>}</span>
   1784 <pre class="prettyprint lang-c">
   1785 #include &lt;stdio.h&gt;
   1786 #include &lt;stdlib.h&gt;
   1787 #include "input.h"
   1788 <span class="nocode pln">{Input Data, <a href="lexer.html#1:47">47</a>}</span>
   1789 <span class="nocode pln">{Input Initialization, <a href="lexer.html#1:48">48</a>}</span>
   1790 <span class="nocode pln">{Input Get Character, <a href="lexer.html#1:49">49</a>}</span>
   1791 <span class="nocode pln">{Input Unget Character, <a href="lexer.html#1:50">50</a>}</span>
   1792 <span class="nocode pln">{Input Destroy, <a href="lexer.html#1:51">51</a>}</span>
   1793 </pre>
   1794 
   1795 
   1796 
   1797 </div>
   1798 </div>
   1799 <a name="1:53"><div class="section"><h4 class="noheading">53. </h4></a>
   1800 <p>We'll need an external declaration for <code>file</code> in <code>input.h</code> so other programs can take advantage of it.
   1801 </p>
   1802 
   1803 <div class="codeblock">
   1804 <span class="codeblock_name">{<strong>input.h</strong> <a href="lexer.html#1:53">53</a>}</span>
   1805 <pre class="prettyprint lang-c">
   1806 #ifndef INPUT_H
   1807 #define INPUT_H
   1808 <span class="nocode pln">{Input Interface, <a href="lexer.html#1:44">44</a>}</span>
   1809 #endif
   1810 </pre>
   1811 
   1812 
   1813 
   1814 </div>
   1815 </div>
   1816 <a name="1:54"><div class="section"><h4 class="noheading">54. </h4></a>
   1817 <p>We'll implement the lexer interface in <code>tokenizer.h</code>
   1818 </p>
   1819 
   1820 <div class="codeblock">
   1821 <span class="codeblock_name">{<strong>tokenizer.h</strong> <a href="lexer.html#1:54">54</a>}</span>
   1822 <pre class="prettyprint lang-c">
   1823 #ifndef TOKENIZER_H
   1824 #define TOKENIZER_H
   1825 #include "token.h"
   1826 #include "input.h"
   1827 <span class="nocode pln">{Tokenization Interface, <a href="lexer.html#1:55">55</a>}</span>
   1828 #endif
   1829 </pre>
   1830 
   1831 
   1832 
   1833 </div>
   1834 </div>
   1835 <a name="1:55"><div class="section"><h4 class="noheading">55. </h4></a>
   1836 <p>The tokenization interface will have a couple of functions. <code>next_token</code> will return the next token in the input stream, <code>init_tokenizer</code> will initialize the tokenizer, and <code>destroy_tokenizer</code> will clean up.
   1837 </p>
   1838 <p>We'll also have some helper functions for lookahead and matching. 
   1839 </p>
   1840 <p>'peek_token' will return the next token without consuming it (it technically does advance the input stream, but it saves the token so it can be reused). 
   1841 </p>
   1842 <p>'consume' will consume the next token if it matches a given kind. If it doesn't match, it will print an error message and exit.
   1843 </p>
   1844 
   1845 <div class="codeblock">
   1846 <span class="codeblock_name">{Tokenization Interface <a href="lexer.html#1:55">55</a>}</span>
   1847 <pre class="prettyprint lang-c">
   1848 void init_tokenizer(const char *filename);
   1849 void destroy_tokenizer(void);
   1850 token_t *next_token(void);
   1851 void reject_token(token_t *token);
   1852 token_t *peek_token(void);
   1853 void consume(c_token_types kind);
   1854 void consume_alt(c_token_types *kinds, int n);
   1855 </pre>
   1856 
   1857 
   1858 <p class="seealso">Used in section <a href="lexer.html#1:54">54</a></p>
   1859 </div>
   1860 </div>
   1861 <a name="1:56"><div class="section"><h4 class="noheading">56. </h4></a>
   1862 <p>Now we can finally implement the tokenizer. A stack for storing tokens for lookahead is defined, as is a hash table, modified by the parser, for typedefs.
   1863 </p>
   1864 
   1865 <div class="codeblock">
   1866 <span class="codeblock_name">{<strong>tokenizer.c</strong> <a href="lexer.html#1:56">56</a>}</span>
   1867 <pre class="prettyprint lang-c">
   1868 #include &lt;ctype.h&gt;
   1869 #include &lt;errno.h&gt;
   1870 #include &lt;float.h&gt;
   1871 #include &lt;math.h&gt;
   1872 #include &lt;stdarg.h&gt;
   1873 #include &lt;stdint.h&gt;
   1874 #include &lt;stdio.h&gt;
   1875 #include &lt;stdlib.h&gt;
   1876 #include &lt;string.h&gt;
   1877 #include "tokenizer.h"
   1878 #include "token.h"
   1879 #include "input.h"
   1880 #include "util.h"
   1881 token_t *left_stack[8];
   1882 int left_stack_pos = 0;
   1883 hash_table_t *typedef_table = NULL;
   1884 <span class="nocode pln">{Utility Functions, <a href="lexer.html#1:57">57</a>}</span>
   1885 <span class="nocode pln">{Tokenization Function, <a href="lexer.html#1:59">59</a>}</span>
   1886 </pre>
   1887 
   1888 
   1889 
   1890 </div>
   1891 </div>
   1892 <a name="1:57"><div class="section"><h4 class="noheading">57. </h4></a>
   1893 <p>Utility functions are everything that doesn't directly tokenize the input.
   1894 </p>
   1895 
   1896 <div class="codeblock">
   1897 <span class="codeblock_name">{Utility Functions <a href="lexer.html#1:57">57</a>}</span>
   1898 <pre class="prettyprint lang-c">
   1899 void init_tokenizer(const char *filename) {
   1900   input_init(filename);
   1901   typedef_table = hash_table_create(16, cmp_string, hash_string, dtor_string);
   1902 }
   1903 
   1904 void destroy_tokenizer(void) {
   1905   input_destroy();
   1906 }
   1907 
   1908 void reject_token(token_t *token) {
   1909   left_stack[left_stack_pos++] = token;
   1910 }
   1911 
   1912 token_t *peek_token(void) {
   1913   if (left_stack_pos &gt; 0) {
   1914     return left_stack[left_stack_pos - 1];
   1915   }
   1916     token_t *token = next_token();
   1917     reject_token(token);
   1918     return token;
   1919 }
   1920 
   1921 <span class="nocode pln">{Stringify Type, <a href="lexer.html#1:58">58</a>}</span>
   1922 
   1923 void consume(c_token_types kind) {
   1924   token_t *token = next_token();
   1925   if (token_type(token) != kind) {
   1926     fprintf(stderr, "Error: Expected token of type \"%s\", got \"%s\"\n", stringify_type(kind), stringify_type(token_type(token)));
   1927     exit(1);
   1928   }
   1929   token_destroy(token);
   1930 }
   1931 
   1932 void consume_alt(c_token_types *kinds, int n) {
   1933   token_t *token = next_token();
   1934   for (int i = 0; i &lt; n; i++) {
   1935     if (token_type(token) == kinds[i]) {
   1936       token_destroy(token);
   1937       return;
   1938     }
   1939   }
   1940   fprintf(stderr, "Error: Expected one of the following tokens: ");
   1941   for (int i = 0; i &lt; n; i++) {
   1942     fprintf(stderr, "\"%s\" ", stringify_type(kinds[i]));
   1943   }
   1944   fprintf(stderr, "got \"%s\"\n", stringify_type(token_type(token)));
   1945   exit(1);
   1946 }
   1947 </pre>
   1948 
   1949 
   1950 <p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p>
   1951 </div>
   1952 </div>
   1953 <a name="1:58"><div class="section"><h4 class="noheading">58. </h4></a>
   1954 <p>We'll need a helper function to convert token types to strings. It's pretty simple, just tedious.
   1955 </p>
   1956 
   1957 <div class="codeblock">
   1958 <span class="codeblock_name">{Stringify Type <a href="lexer.html#1:58">58</a>}</span>
   1959 <pre class="prettyprint lang-c">
   1960 const char *stringify_type(c_token_types type) {
   1961   switch (type) {
   1962   case TOK_IF:
   1963     return "if";
   1964   case TOK_ELSE:
   1965     return "else";
   1966   case TOK_SWITCH:
   1967     return "switch";
   1968   case TOK_CASE:
   1969     return "case";
   1970   case TOK_DEFAULT:
   1971     return "default";
   1972   case TOK_WHILE:
   1973     return "while";
   1974   case TOK_DO:
   1975     return "do";
   1976   case TOK_FOR:
   1977     return "for";
   1978   case TOK_CONTINUE:
   1979     return "continue";
   1980   case TOK_BREAK:
   1981     return "break";
   1982   case TOK_RETURN:
   1983     return "return";
   1984   case TOK_GOTO:
   1985     return "goto";
   1986   case TOK_VOID:
   1987     return "void";
   1988   case TOK_CHAR:
   1989     return "char";
   1990   case TOK_SHORT:
   1991     return "short";
   1992   case TOK_INT:
   1993     return "int";
   1994   case TOK_LONG:
   1995     return "long";
   1996   case TOK_FLOAT:
   1997     return "float";
   1998   case TOK_DOUBLE:
   1999     return "double";
   2000   case TOK_SIGNED:
   2001     return "signed";
   2002   case TOK_UNSIGNED:
   2003     return "unsigned";
   2004   case TOK_STRUCT:
   2005     return "struct";
   2006   case TOK_UNION:
   2007     return "union";
   2008   case TOK_ENUM:
   2009     return "enum";
   2010   case TOK_TYPEDEF:
   2011     return "typedef";
   2012   case TOK_AUTO:
   2013     return "auto";
   2014   case TOK_REGISTER:
   2015     return "register";
   2016   case TOK_STATIC:
   2017     return "static";
   2018   case TOK_EXTERN:
   2019     return "extern";
   2020   case TOK_CONST:
   2021     return "const";
   2022   case TOK_VOLATILE:
   2023     return "volatile";
   2024   case TOK_SIZEOF:
   2025     return "sizeof";
   2026   case TOK_ADD:
   2027     return "+";
   2028   case TOK_SUB:
   2029     return "-";
   2030   case TOK_MUL:
   2031     return "*";
   2032   case TOK_DIV:
   2033     return "/";
   2034   case TOK_MOD:
   2035     return "%";
   2036   case TOK_BIT_AND:
   2037     return "&amp;";
   2038   case TOK_BIT_OR:
   2039     return "|";
   2040   case TOK_BIT_XOR:
   2041     return "^";
   2042   case TOK_BIT_NOT:
   2043     return "~";
   2044   case TOK_LSHIFT:
   2045     return "&lt;&lt;";
   2046   case TOK_RSHIFT:
   2047     return "&gt;&gt;";
   2048   case TOK_NOT:
   2049     return "!";
   2050   case TOK_ASSIGN:
   2051     return "=";
   2052   case TOK_LT:
   2053     return "&lt;";
   2054   case TOK_GT:
   2055     return "&gt;";
   2056   case TOK_INC:
   2057     return "++";
   2058   case TOK_DEC:
   2059     return "--";
   2060   case TOK_EQ:
   2061     return "==";
   2062   case TOK_NE:
   2063     return "!=";
   2064   case TOK_LE:
   2065     return "&lt;=";
   2066   case TOK_GE:
   2067     return "&gt;=";
   2068   case TOK_AND:
   2069     return "&amp;&amp;";
   2070   case TOK_OR:
   2071     return "||";
   2072   case TOK_MEMBER_POINTER:
   2073     return "-&gt;";
   2074   case TOK_MEMBER:
   2075     return ".";
   2076   case TOK_COND_DECISION:
   2077     return ":";
   2078   case TOK_COND:
   2079     return "?";
   2080   case TOK_ASSIGN_ADD:
   2081     return "+=";
   2082   case TOK_ASSIGN_SUB:
   2083     return "-=";
   2084   case TOK_ASSIGN_MUL:
   2085     return "*=";
   2086   case TOK_ASSIGN_DIV:
   2087     return "/=";
   2088   case TOK_ASSIGN_MOD:
   2089     return "%=";
   2090   case TOK_ASSIGN_BITAND:
   2091     return "&amp;=";
   2092   case TOK_ASSIGN_BITOR:
   2093     return "|=";
   2094   case TOK_ASSIGN_BITXOR:
   2095     return "^=";
   2096   case TOK_ASSIGN_LSHIFT:
   2097     return "&lt;&lt;=";
   2098   case TOK_ASSIGN_RSHIFT:
   2099     return "&gt;&gt;=";
   2100   case TOK_HASH:
   2101     return "#";
   2102   case TOK_ID:
   2103     return "identifier";
   2104   case TOK_TYPEDEF_NAME:
   2105     return "typedef name";
   2106   case TOK_INTEGER_U32:
   2107   case TOK_INTEGER_U64:
   2108   case TOK_INTEGER_S32:
   2109   case TOK_INTEGER_S64:
   2110     return "integer constant";
   2111   case TOK_FLOAT_32:
   2112   case TOK_FLOAT_64:
   2113     return "floating constant";
   2114   case TOK_CHAR_CONST:
   2115     return "character constant";
   2116   case TOK_STRING_ASCII:
   2117     return "string constant";
   2118   case TOK_EOF:
   2119     return "EOF";
   2120   case TOK_ERROR:
   2121     return "error";
   2122   case TOK_LEFT_PAREN:
   2123     return "(";
   2124   case TOK_RIGHT_PAREN:
   2125     return ")";
   2126   case TOK_LEFT_BRACKET:
   2127     return "[";
   2128   case TOK_RIGHT_BRACKET:
   2129     return "]";
   2130   case TOK_LEFT_BRACE:
   2131     return "{";
   2132   case TOK_RIGHT_BRACE:
   2133     return "}";
   2134   case TOK_COMMA:
   2135     return ",";
   2136   case TOK_SEMICOLON:
   2137     return ";";
   2138   case TOK_DOT:
   2139     return ".";
   2140   case TOK_ELLIPSIS:
   2141     return "...";
   2142   }
   2143   return "UNKNOWN";
   2144 }
   2145 </pre>
   2146 
   2147 
   2148 <p class="seealso">Used in section <a href="lexer.html#1:57">57</a></p>
   2149 </div>
   2150 </div>
   2151 <a name="1:59"><div class="section"><h4 class="noheading">59. </h4></a>
   2152 <p>Now we can implement the tokenization function. The pattern is pretty simple: we call each of the tokenization functions in turn until we find a match. If we don't find a match, we print an error message and exit.
   2153 You might wonder why skip_whitespace can return a token. This makes handling the divide operator easier as comments also start with a slash.
   2154 </p>
   2155 
   2156 <div class="codeblock">
   2157 <span class="codeblock_name">{Tokenization Function <a href="lexer.html#1:59">59</a>}</span>
   2158 <pre class="prettyprint lang-c">
   2159 char file_name[1024];
   2160 <span class="nocode pln">{Warning/Error Functions, <a href="lexer.html#1:60">60</a>}</span>
   2161 <span class="nocode pln">{Skip Whitespace, <a href="lexer.html#1:61">61</a>}</span>
   2162 <span class="nocode pln">{Tokenize Identifier, <a href="lexer.html#1:62">62</a>}</span>
   2163 <span class="nocode pln">{Tokenize Number, <a href="lexer.html#1:65">65</a>}</span>
   2164 <span class="nocode pln">{Tokenize String, <a href="lexer.html#1:74">74</a>}</span>
   2165 <span class="nocode pln">{Tokenize Character, <a href="lexer.html#1:73">73</a>}</span>
   2166 <span class="nocode pln">{Tokenize Operator, <a href="lexer.html#1:64">64</a>}</span>
   2167 token_t *next_token(void) {
   2168   if (left_stack_pos &gt; 0) {
   2169     return left_stack[--left_stack_pos];
   2170   }
   2171   token_t *tok = skip_whitespace();
   2172   if (tok != NULL) {
   2173     return tok;
   2174   }
   2175   tok = read_identifier();
   2176   if (tok != NULL) {
   2177     return tok;
   2178   }
   2179   tok = read_number();
   2180   if (tok != NULL) {
   2181     return tok;
   2182   }
   2183   tok = read_char_constant();
   2184   if (tok != NULL) {
   2185     return tok;
   2186   }
   2187   tok = read_string_literal();
   2188   if (tok != NULL) {
   2189     return tok;
   2190   }
   2191   tok = read_operator();
   2192   if (tok != NULL) {
   2193     return tok;
   2194   }
   2195   int c = input_getc();
   2196   if (c == EOF) {
   2197     return NULL;
   2198   }
   2199   tok_warn(
   2200       "Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c,
   2201       line, column);
   2202   return next_token();
   2203 }
   2204 
   2205 #ifdef TEST_TOKENIZER
   2206 <span class="nocode pln">{Run Test, <a href="lexer.html#1:76">76</a>}</span>
   2207 #endif
   2208 </pre>
   2209 
   2210 
   2211 <p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p>
   2212 </div>
   2213 </div>
   2214 <a name="1:60"><div class="section"><h4 class="noheading">60. </h4></a>
   2215 <p>We'll need a couple of helper functions to skip whitespace and print warnings/errors.
   2216 </p>
   2217 
   2218 <div class="codeblock">
   2219 <span class="codeblock_name">{Warning/Error Functions <a href="lexer.html#1:60">60</a>}</span>
   2220 <pre class="prettyprint lang-c">
   2221 void tok_error(const char *fmt, ...) {
   2222   va_list args;
   2223   va_start(args, fmt);
   2224   fprintf(stderr, "Error in file %s at line %d, column %d: ", file_name, line,
   2225           column);
   2226   vfprintf(stderr, fmt, args);
   2227   va_end(args);
   2228 }
   2229 
   2230 void tok_warn(const char *fmt, ...) {
   2231   va_list args;
   2232   va_start(args, fmt);
   2233   fprintf(stderr, "Warning in file %s at line %d, column %d: ", file_name, line,
   2234           column);
   2235   vfprintf(stderr, fmt, args);
   2236   va_end(args);
   2237 }
   2238 </pre>
   2239 
   2240 
   2241 <p class="seealso">Used in section <a href="lexer.html#1:59">59</a></p>
   2242 </div>
   2243 </div>
   2244 <a name="1:61"><div class="section"><h4 class="noheading">61. </h4></a>
   2245 <p>The <code>skip_whitespace</code> function is pretty simple. It just skips over any comments, whitespace, and line directives.
   2246 </p>
   2247 
   2248 <div class="codeblock">
   2249 <span class="codeblock_name">{Skip Whitespace <a href="lexer.html#1:61">61</a>}</span>
   2250 <pre class="prettyprint lang-c">
   2251 static token_t *skip_whitespace(void) {
   2252   int c;
   2253   while ((c = input_getc()) != EOF) {
   2254     if (isspace(c)) { // Whitespace
   2255       if (c == '\n') {
   2256         line++;
   2257         column = 1;
   2258       } else {
   2259         column++;
   2260       }
   2261     } else if (c == '#') // GCC preprocessor line control directive.
   2262     {
   2263       char buf[512];
   2264       int i = 0;
   2265       while ((c = input_getc()) != EOF &amp;&amp; c != '\n') {
   2266         buf[i++] = c;
   2267         column++;
   2268       }
   2269       buf[i] = '\0';
   2270       if (sscanf(buf, "%d \"%[^\"]\"", &amp;line, file_name) == 2) {
   2271         column = 1;
   2272       } else {
   2273         tok_error("Invalid #line directive\n");
   2274       }
   2275       if (c == EOF) {
   2276         return NULL;
   2277       }
   2278     } else if (c == '/') { // Comment
   2279       c = input_getc();
   2280       if (c == '/') {
   2281         while ((c = input_getc()) != EOF &amp;&amp; c != '\n') {
   2282           column++;
   2283         }
   2284         if (c == EOF) {
   2285           return NULL;
   2286         }
   2287         line++;
   2288         column = 1;
   2289       } else if (c == '*') { // Multiline comment
   2290         while ((c = input_getc()) != EOF) {
   2291           if (c == '*') {
   2292             c = input_getc();
   2293             if (c == '/') {
   2294               break;
   2295             }
   2296           } else if (c == '\n') {
   2297             line++;
   2298             column = 1;
   2299           } else {
   2300             column++;
   2301           }
   2302         }
   2303         if (c == EOF) {
   2304           return NULL;
   2305         }
   2306       } else { // Handled here to simplify the code.
   2307         if (c == '=')
   2308           return token_create(TOK_ASSIGN_DIV, line, column, 2);
   2309         input_ungetc(c);
   2310         return token_create(TOK_DIV, line, column, 1);
   2311       }
   2312     } else {
   2313       input_ungetc(c);
   2314       return NULL;
   2315     }
   2316   }
   2317   return NULL;
   2318 }
   2319 </pre>
   2320 
   2321 
   2322 <p class="seealso">Used in section <a href="lexer.html#1:59">59</a></p>
   2323 </div>
   2324 </div>
   2325 <a name="1:62"><div class="section"><h4 class="noheading">62. </h4></a>
   2326 <p>The <code>read_identifier</code> function reads an identifier from the input stream. C identifiers can contain letters, digits, and underscores, but they can't start with a digit.
   2327 </p>
   2328 
   2329 <div class="codeblock">
   2330 <span class="codeblock_name">{Tokenize Identifier <a href="lexer.html#1:62">62</a>}</span>
   2331 <pre class="prettyprint lang-c">
   2332 <span class="nocode pln">{Get Keyword, <a href="lexer.html#1:63">63</a>}</span>
   2333 static token_t *read_identifier(void) {
   2334   int c;
   2335   char buf[1024];
   2336   int i = 0;
   2337   c = input_getc();
   2338   if (!isalpha(c) &amp;&amp; c != '_') {
   2339     input_ungetc(c);
   2340     return NULL;
   2341   }
   2342   buf[i++] = c;
   2343   while ((c = input_getc()) != EOF) {
   2344     if (!isalnum(c) &amp;&amp; c != '_') {
   2345       input_ungetc(c);
   2346       break;
   2347     }
   2348     buf[i++] = c;
   2349     if (i &gt;= 1008) {
   2350       tok_error("Identifier too long\n");
   2351       exit(1);
   2352     }
   2353   }
   2354   buf[i] = '\0';
   2355   column += i;
   2356   // Check if it's a keyword
   2357   c_token_types kind = get_keyword(buf, i);
   2358   if (kind != TOK_ID) {
   2359     return token_create(kind, line, column, i);
   2360   }
   2361   // Check if it's a typedef
   2362   if (hash_table_get(typedef_table, buf) != NULL) {
   2363     return token_create(TOK_TYPEDEF_NAME, line, column, i);
   2364   }
   2365   return token_create_string(kind, line, column, buf, i);
   2366 }
   2367 </pre>
   2368 
   2369 
   2370 <p class="seealso">Used in section <a href="lexer.html#1:59">59</a></p>
   2371 </div>
   2372 </div>
   2373 <a name="1:63"><div class="section"><h4 class="noheading">63. </h4></a>
   2374 <p>The <code>get_keyword</code> function is a simple decision tree for identifying keywords. The code is pretty tedious, but it works.
   2375 </p>
   2376 
   2377 <div class="codeblock">
   2378 <span class="codeblock_name">{Get Keyword <a href="lexer.html#1:63">63</a>}</span>
   2379 <pre class="prettyprint lang-c">
   2380 c_token_types get_keyword(const char *buf, int len) {
   2381   switch (buf[0]) {
   2382   case 'a':
   2383     if (len == 4 &amp;&amp; buf[1] == 'u' &amp;&amp; buf[2] == 't' &amp;&amp; buf[3] == 'o')
   2384       return TOK_AUTO;
   2385     break;
   2386 
   2387   case 'b':
   2388     if (len == 5 &amp;&amp; buf[1] == 'r' &amp;&amp; buf[2] == 'e' &amp;&amp; buf[3] == 'a' &amp;&amp;
   2389         buf[4] == 'k')
   2390       return TOK_BREAK;
   2391     break;
   2392 
   2393   case 'c':
   2394     switch (buf[1]) {
   2395     case 'a':
   2396       if (len == 4 &amp;&amp; buf[2] == 's' &amp;&amp; buf[3] == 'e')
   2397         return TOK_CASE;
   2398       break;
   2399     case 'h':
   2400       if (len == 4 &amp;&amp; buf[2] == 'a' &amp;&amp; buf[3] == 'r')
   2401         return TOK_CHAR;
   2402       break;
   2403     case 'o':
   2404       if (len == 5 &amp;&amp; buf[2] == 'n' &amp;&amp; buf[3] == 's' &amp;&amp; buf[4] == 't')
   2405         return TOK_CONST;
   2406       if (len == 8 &amp;&amp; buf[2] == 'n' &amp;&amp; buf[3] == 't' &amp;&amp; buf[4] == 'i' &amp;&amp;
   2407           buf[5] == 'n' &amp;&amp; buf[6] == 'u' &amp;&amp; buf[7] == 'e')
   2408         return TOK_CONTINUE;
   2409       break;
   2410     }
   2411     break;
   2412 
   2413   case 'd':
   2414     switch (buf[1]) {
   2415     case 'e':
   2416       if (len == 7 &amp;&amp; buf[2] == 'f' &amp;&amp; buf[3] == 'a' &amp;&amp; buf[4] == 'u' &amp;&amp;
   2417           buf[5] == 'l' &amp;&amp; buf[6] == 't')
   2418         return TOK_DEFAULT;
   2419       break;
   2420     case 'o':
   2421       if (len == 2 &amp;&amp; buf[2] == '\0')
   2422         return TOK_DO;
   2423       if (len == 6 &amp;&amp; buf[2] == 'u' &amp;&amp; buf[3] == 'b' &amp;&amp; buf[4] == 'l' &amp;&amp;
   2424           buf[5] == 'e')
   2425         return TOK_DOUBLE;
   2426       break;
   2427     }
   2428     break;
   2429 
   2430   case 'e':
   2431     switch (buf[1]) {
   2432     case 'l':
   2433       if (len == 4 &amp;&amp; buf[2] == 's' &amp;&amp; buf[3] == 'e')
   2434         return TOK_ELSE;
   2435       break;
   2436     case 'n':
   2437       if (len == 4 &amp;&amp; buf[2] == 'u' &amp;&amp; buf[3] == 'm')
   2438         return TOK_ENUM;
   2439       break;
   2440     case 'x':
   2441       if (len == 6 &amp;&amp; buf[2] == 't' &amp;&amp; buf[3] == 'e' &amp;&amp; buf[4] == 'r' &amp;&amp;
   2442           buf[5] == 'n')
   2443         return TOK_EXTERN;
   2444       break;
   2445     }
   2446     break;
   2447 
   2448   case 'f':
   2449     switch (buf[1]) {
   2450     case 'l':
   2451       if (len == 5 &amp;&amp; buf[2] == 'o' &amp;&amp; buf[3] == 'a' &amp;&amp; buf[4] == 't')
   2452         return TOK_FLOAT;
   2453       break;
   2454     case 'o':
   2455       if (len == 3 &amp;&amp; buf[2] == 'r')
   2456         return TOK_FOR;
   2457       break;
   2458     }
   2459     break;
   2460 
   2461   case 'g':
   2462     if (len == 4 &amp;&amp; buf[1] == 'o' &amp;&amp; buf[2] == 't' &amp;&amp; buf[3] == 'o')
   2463       return TOK_GOTO;
   2464     break;
   2465 
   2466   case 'i':
   2467     switch (buf[1]) {
   2468     case 'f':
   2469       if (len == 2 &amp;&amp; buf[2] == '\0')
   2470         return TOK_IF;
   2471       break;
   2472     case 'n':
   2473       if (len == 3 &amp;&amp; buf[2] == 't')
   2474         return TOK_INT;
   2475       break;
   2476     }
   2477     break;
   2478 
   2479   case 'l':
   2480     if (len == 4 &amp;&amp; buf[1] == 'o' &amp;&amp; buf[2] == 'n' &amp;&amp; buf[3] == 'g')
   2481       return TOK_LONG;
   2482     break;
   2483 
   2484   case 'r':
   2485     switch (buf[1]) {
   2486     case 'e':
   2487       if (len == 8 &amp;&amp; buf[2] == 'g' &amp;&amp; buf[3] == 'i' &amp;&amp; buf[4] == 's' &amp;&amp;
   2488           buf[5] == 't' &amp;&amp; buf[6] == 'e' &amp;&amp; buf[7] == 'r')
   2489         return TOK_REGISTER;
   2490       if (len == 6 &amp;&amp; buf[2] == 't' &amp;&amp; buf[3] == 'u' &amp;&amp; buf[4] == 'r' &amp;&amp;
   2491           buf[5] == 'n')
   2492         return TOK_RETURN;
   2493       break;
   2494     }
   2495     break;
   2496 
   2497   case 's':
   2498     switch (buf[1]) {
   2499     case 'h':
   2500       if (len == 5 &amp;&amp; buf[2] == 'o' &amp;&amp; buf[3] == 'r' &amp;&amp; buf[4] == 't')
   2501         return TOK_SHORT;
   2502       break;
   2503     case 't':
   2504       if (len == 6 &amp;&amp; buf[2] == 'a' &amp;&amp; buf[3] == 't' &amp;&amp; buf[4] == 'i' &amp;&amp;
   2505           buf[5] == 'c')
   2506         return TOK_STATIC;
   2507       break;
   2508     case 'i':
   2509       if (len == 6 &amp;&amp; buf[2] == 'g' &amp;&amp; buf[3] == 'n' &amp;&amp; buf[4] == 'e' &amp;&amp;
   2510           buf[5] == 'd')
   2511         return TOK_SIGNED;
   2512       if (len == 6 &amp;&amp; buf[2] == 'z' &amp;&amp; buf[3] == 'e' &amp;&amp; buf[4] == 'o' &amp;&amp;
   2513           buf[5] == 'f')
   2514         return TOK_SIZEOF;
   2515       break;
   2516     case 'r':
   2517       if (len == 6 &amp;&amp; buf[2] == 'u' &amp;&amp; buf[3] == 'c' &amp;&amp; buf[4] == 't')
   2518         return TOK_STRUCT;
   2519       break;
   2520     case 'w':
   2521       if (len == 6 &amp;&amp; buf[2] == 'i' &amp;&amp; buf[3] == 't' &amp;&amp; buf[4] == 'c' &amp;&amp;
   2522           buf[5] == 'h')
   2523         return TOK_SWITCH;
   2524       break;
   2525     }
   2526     break;
   2527 
   2528   case 't':
   2529     if (len == 7 &amp;&amp; buf[1] == 'y' &amp;&amp; buf[2] == 'p' &amp;&amp; buf[3] == 'e' &amp;&amp;
   2530         buf[4] == 'd' &amp;&amp; buf[5] == 'e' &amp;&amp; buf[6] == 'f')
   2531       return TOK_TYPEDEF;
   2532     break;
   2533 
   2534   case 'u':
   2535     switch (buf[1]) {
   2536     case 'n':
   2537       if (len == 5 &amp;&amp; buf[2] == 'i' &amp;&amp; buf[3] == 'o' &amp;&amp; buf[4] == 'n')
   2538         return TOK_UNION;
   2539       if (len == 8 &amp;&amp; buf[2] == 's' &amp;&amp; buf[3] == 'i' &amp;&amp; buf[4] == 'g' &amp;&amp;
   2540           buf[5] == 'n' &amp;&amp; buf[6] == 'e' &amp;&amp; buf[7] == 'd')
   2541         return TOK_UNSIGNED;
   2542       break;
   2543     }
   2544     break;
   2545 
   2546   case 'v':
   2547     switch (buf[1]) {
   2548     case 'o':
   2549       if (len == 4 &amp;&amp; buf[2] == 'i' &amp;&amp; buf[3] == 'd')
   2550         return TOK_VOID;
   2551       if (len == 8 &amp;&amp; buf[2] == 'l' &amp;&amp; buf[3] == 'a' &amp;&amp; buf[4] == 't' &amp;&amp;
   2552           buf[5] == 'i' &amp;&amp; buf[6] == 'l' &amp;&amp; buf[7] == 'e')
   2553         return TOK_VOLATILE;
   2554       break;
   2555     }
   2556     break;
   2557 
   2558   case 'w':
   2559     if (len == 5 &amp;&amp; buf[1] == 'h' &amp;&amp; buf[2] == 'i' &amp;&amp; buf[3] == 'l' &amp;&amp;
   2560         buf[4] == 'e')
   2561       return TOK_WHILE;
   2562     break;
   2563 
   2564   default:
   2565     return TOK_ID;
   2566   }
   2567   return TOK_ID;
   2568 }
   2569 </pre>
   2570 
   2571 
   2572 <p class="seealso">Used in section <a href="lexer.html#1:62">62</a></p>
   2573 </div>
   2574 </div>
   2575 <a name="1:64"><div class="section"><h4 class="noheading">64. </h4></a>
   2576 <p>The <code>read_operator</code> function works similarly to the <code>read_identifier</code> function. It uses a decision tree to identify operators. 
   2577 </p>
   2578 
   2579 <div class="codeblock">
   2580 <span class="codeblock_name">{Tokenize Operator <a href="lexer.html#1:64">64</a>}</span>
   2581 <pre class="prettyprint lang-c">
   2582 
   2583 token_t *read_operator(void) {
   2584   int c;
   2585   c = input_getc();
   2586   switch (c) {
   2587   case '!': {
   2588     c = input_getc();
   2589     if (c == '=')
   2590       return token_create(TOK_NE, line, column, 2);
   2591     input_ungetc(c);
   2592     return token_create(TOK_NOT, line, column, 1);
   2593   }
   2594   case '%': {
   2595     c = input_getc();
   2596     if (c == '=')
   2597       return token_create(TOK_ASSIGN_MOD, line, column, 2);
   2598     input_ungetc(c);
   2599     return token_create(TOK_MOD, line, column, 1);
   2600   }
   2601   case '&amp;': {
   2602     c = input_getc();
   2603     if (c == '&amp;')
   2604       return token_create(TOK_AND, line, column, 2);
   2605     if (c == '=')
   2606       return token_create(TOK_ASSIGN_BITAND, line, column, 2);
   2607     input_ungetc(c);
   2608     return token_create(TOK_BIT_AND, line, column, 1);
   2609   }
   2610   case '(':
   2611     return token_create(TOK_LEFT_PAREN, line, column, 1);
   2612   case ')':
   2613     return token_create(TOK_RIGHT_PAREN, line, column, 1);
   2614   case '*': {
   2615     c = input_getc();
   2616     if (c == '=')
   2617       return token_create(TOK_ASSIGN_MUL, line, column, 2);
   2618     input_ungetc(c);
   2619     return token_create(TOK_MUL, line, column, 1);
   2620   }
   2621   case '+': {
   2622     c = input_getc();
   2623     if (c == '+')
   2624       return token_create(TOK_INC, line, column, 2);
   2625     if (c == '=')
   2626       return token_create(TOK_ASSIGN_ADD, line, column, 2);
   2627     input_ungetc(c);
   2628     return token_create(TOK_ADD, line, column, 2);
   2629   }
   2630   case ',':
   2631     return token_create(TOK_COMMA, line, column, 1);
   2632   case '-': {
   2633     c = input_getc();
   2634     if (c == '-')
   2635       return token_create(TOK_DEC, line, column, 2);
   2636     if (c == '=')
   2637       return token_create(TOK_ASSIGN_SUB, line, column, 2);
   2638     if (c == '&gt;')
   2639       return token_create(TOK_MEMBER_POINTER, line, column, 2);
   2640     input_ungetc(c);
   2641     return token_create(TOK_SUB, line, column, 1);
   2642   }
   2643   case '.': {
   2644     c = input_getc();
   2645     if (c == '.') {
   2646       c = input_getc();
   2647       if (c == '.') {
   2648         return token_create(TOK_ELLIPSIS, line, column, 3);
   2649       } else {
   2650         // Bail out, can't store more than one unget
   2651         tok_error("Unexpected character '.' at line %d, column %d\n", line,
   2652                   column);
   2653         exit(1);
   2654       }
   2655     }
   2656     return token_create('.', line, column, 1);
   2657   }
   2658   case '/': {
   2659     c = input_getc();
   2660     if (c == '=')
   2661       return token_create(TOK_ASSIGN_DIV, line, column, 2);
   2662     input_ungetc(c);
   2663     return token_create(TOK_DIV, line, column, 1);
   2664   }
   2665   case ':':
   2666     return token_create(TOK_COND_DECISION, line, column, 1);
   2667   case ';':
   2668     return token_create(TOK_SEMICOLON, line, column, 1);
   2669   case '&lt;': {
   2670     c = input_getc();
   2671     if (c == '&lt;') {
   2672       c = input_getc();
   2673       if (c == '=')
   2674         return token_create(TOK_ASSIGN_LSHIFT, line, column, 3);
   2675       input_ungetc(c);
   2676       return token_create(TOK_LSHIFT, line, column, 2);
   2677     }
   2678     if (c == '=')
   2679       return token_create(TOK_LE, line, column, 2);
   2680     input_ungetc(c);
   2681     return token_create(TOK_LT, line, column, 1);
   2682   }
   2683   case '=': {
   2684     c = input_getc();
   2685     if (c == '=')
   2686       return token_create(TOK_ASSIGN, line, column, 2);
   2687     input_ungetc(c);
   2688     return token_create(TOK_ASSIGN, line, column, 1);
   2689   }
   2690   case '&gt;': {
   2691     c = input_getc();
   2692     if (c == '&gt;') {
   2693       c = input_getc();
   2694       if (c == '=')
   2695         return token_create(TOK_ASSIGN_RSHIFT, line, column, 3);
   2696       input_ungetc(c);
   2697       return token_create(TOK_RSHIFT, line, column, 2);
   2698     }
   2699     if (c == '=')
   2700       return token_create(TOK_GE, line, column, 2);
   2701     input_ungetc(c);
   2702     return token_create(TOK_GT, line, column, 1);
   2703   }
   2704   case '?':
   2705     return token_create(TOK_COND, line, column, 1);
   2706   case '[':
   2707     return token_create(TOK_LEFT_BRACKET, line, column, 1);
   2708   case ']':
   2709     return token_create(TOK_RIGHT_BRACKET, line, column, 1);
   2710   case '^': {
   2711     c = input_getc();
   2712     if (c == '=')
   2713       return token_create(TOK_ASSIGN_BITXOR, line, column, 2);
   2714     input_ungetc(c);
   2715     return token_create(TOK_BIT_XOR, line, column, 1);
   2716   }
   2717   case '{':
   2718     return token_create(TOK_LEFT_BRACE, line, column, 1);
   2719   case '|': {
   2720     c = input_getc();
   2721     if (c == '|')
   2722       return token_create(TOK_OR, line, column, 2);
   2723     if (c == '=')
   2724       return token_create(TOK_ASSIGN_BITOR, line, column, 2);
   2725     input_ungetc(c);
   2726     return token_create(TOK_BIT_OR, line, column, 1);
   2727   }
   2728   case '}':
   2729     return token_create(TOK_RIGHT_BRACE, line, column, 1);
   2730   case '~':
   2731     return token_create(TOK_BIT_NOT, line, column, 1);
   2732   default:
   2733     input_ungetc(c);
   2734     return NULL;
   2735   }
   2736 
   2737   return NULL;
   2738 }
   2739 </pre>
   2740 
   2741 
   2742 <p class="seealso">Used in section <a href="lexer.html#1:59">59</a></p>
   2743 </div>
   2744 </div>
   2745 <a name="1:65"><div class="section"><h4 class="noheading">65. </h4></a>
   2746 <p>The <code>read_number</code> function reads a number from the input stream. It can be an integer or a floating-point number.
   2747 </p>
   2748 <p>I've broken it up a bit to make it easier to read. 
   2749 </p>
   2750 
   2751 <div class="codeblock">
   2752 <span class="codeblock_name">{Tokenize Number <a href="lexer.html#1:65">65</a>}</span>
   2753 <pre class="prettyprint lang-c">
   2754 static token_t *read_number(void) {
   2755   int c;
   2756   char buf[1024];
   2757   int i = 0;
   2758   c = input_getc();
   2759 <span class="nocode pln">  {Check for valid prefix, <a href="lexer.html#1:66">66</a>}</span>
   2760   int radix = 10;
   2761 <span class="nocode pln">  {Process Radix, <a href="lexer.html#1:67">67</a>}</span>
   2762   int is_float = 0;
   2763 <span class="nocode pln">  {Read Number Loop, <a href="lexer.html#1:68">68</a>}</span>
   2764   buf[i] = '\0';
   2765 <span class="nocode pln">  {Process Suffixes, <a href="lexer.html#1:69">69</a>}</span>
   2766 <span class="nocode pln">  {Check for conflicting suffixes, <a href="lexer.html#1:70">70</a>}</span>
   2767   if (is_float) {
   2768 <span class="nocode pln">    {Convert to float, <a href="lexer.html#1:71">71</a>}</span>
   2769   } else {
   2770 <span class="nocode pln">    {Convert to integer, <a href="lexer.html#1:72">72</a>}</span>
   2771   }
   2772   return NULL;
   2773 }
   2774 </pre>
   2775 
   2776 
   2777 <p class="seealso">Used in section <a href="lexer.html#1:59">59</a></p>
   2778 </div>
   2779 </div>
   2780 <a name="1:66"><div class="section"><h4 class="noheading">66. </h4></a>
   2781 <p>To determine if a character is a valid prefix for a number, we need to check if it's a digit or a period followed by a digit
   2782 </p>
   2783 
   2784 <div class="codeblock">
   2785 <span class="codeblock_name">{Check for valid prefix <a href="lexer.html#1:66">66</a>}</span>
   2786 <pre class="prettyprint lang-c">
   2787  // If we don't have a digit or decimal point, it's not a number
   2788   if (!isdigit(c) &amp;&amp; c != '.') {
   2789     input_ungetc(c);
   2790     return NULL;
   2791   }
   2792   // Decimal point followed by non-digit is a struct member
   2793   if (c == '.') {
   2794     char cnext = input_getc();
   2795     if (!isdigit(cnext)) {
   2796       input_ungetc(cnext);
   2797       return token_create(TOK_MEMBER, line, column, 1);
   2798     }
   2799     input_ungetc(cnext);
   2800   }
   2801 </pre>
   2802 
   2803 
   2804 <p class="seealso">Used in section <a href="lexer.html#1:65">65</a></p>
   2805 </div>
   2806 </div>
   2807 <a name="1:67"><div class="section"><h4 class="noheading">67. </h4></a>
   2808 <p>A C constant starting with a zero is either an octal or hexadecimal constant. We need to check the next character to determine which one it is.
   2809 </p>
   2810 
   2811 <div class="codeblock">
   2812 <span class="codeblock_name">{Process Radix <a href="lexer.html#1:67">67</a>}</span>
   2813 <pre class="prettyprint lang-c">
   2814   // Check for hex and octal.
   2815   if (c == '0') {
   2816     char cnext = input_getc();
   2817     if (cnext == 'x' || cnext == 'X') {
   2818       // Hex, discard the 0x
   2819       radix = 16;
   2820     } else {
   2821       // Octal, discard the 0
   2822       input_ungetc(cnext);
   2823       radix = 8;
   2824     }
   2825   } else {
   2826     // Decimal, append the first digit
   2827     buf[i++] = c;
   2828   }
   2829 </pre>
   2830 
   2831 
   2832 <p class="seealso">Used in section <a href="lexer.html#1:65">65</a></p>
   2833 </div>
   2834 </div>
   2835 <a name="1:68"><div class="section"><h4 class="noheading">68. </h4></a>
   2836 
   2837 <div class="codeblock">
   2838 <span class="codeblock_name">{Read Number Loop <a href="lexer.html#1:68">68</a>}</span>
   2839 <pre class="prettyprint lang-c">
   2840    while ((c = input_getc()) != EOF) {
   2841     // Since there can be multiple writes to the buffer, we want to make sure we
   2842     // don't overflow by giving a 4 byte pad
   2843     if (i &gt; 1020) {
   2844       tok_error("Number too long\n");
   2845       return NULL;
   2846     }
   2847     // Valid digits for the radix: 0-9 for decimal, 0-7 for octal, 0-9 and
   2848     // a-f/A-F for hex
   2849     if ((radix == 10 &amp;&amp; isdigit(c)) || (radix == 16 &amp;&amp; isxdigit(c)) ||
   2850         (radix == 8 &amp;&amp; c &gt;= '0' &amp;&amp; c &lt;= '7')) {
   2851       buf[i++] = c;
   2852       // Decimal point and not a float yet, must be a float
   2853     } else if (c == '.' &amp;&amp; !is_float) {
   2854       is_float = 1;
   2855       if (radix != 10) {
   2856         tok_error("Invalid floating point constant, expected decimal, got %s\n",
   2857                   radix == 16 ? "hexadecimal" : "octal");
   2858         return NULL;
   2859       }
   2860       buf[i++] = c;
   2861     }
   2862     // Exponent on the end of a constant. (By standard this forces it to be a
   2863     // float)
   2864     else if (c == 'e' || c == 'E') {
   2865       buf[i++] = c;
   2866       c = input_getc();
   2867       // Sign on the exponent
   2868       if (c == '+' || c == '-') {
   2869         buf[i++] = c;
   2870         c = input_getc();
   2871       }
   2872       // Exponent must be a digit, I.E no 1e1.2
   2873       if (!isdigit(c)) {
   2874         tok_error("Invalid floating point exponent\n");
   2875         return NULL;
   2876       }
   2877       buf[i++] = c;
   2878       is_float = 1;
   2879     } else {
   2880       // Reached the end, unget the character so other functions can read it
   2881       input_ungetc(c);
   2882       break;
   2883     }
   2884   }
   2885 </pre>
   2886 
   2887 
   2888 <p class="seealso">Used in section <a href="lexer.html#1:65">65</a></p>
   2889 </div>
   2890 </div>
   2891 <a name="1:69"><div class="section"><h4 class="noheading">69. </h4></a>
   2892 <p>C constants can have suffixes to indicate their type. We need to check for these suffixes and set the appropriate flags.
   2893 </p>
   2894 
   2895 <div class="codeblock">
   2896 <span class="codeblock_name">{Process Suffixes <a href="lexer.html#1:69">69</a>}</span>
   2897 <pre class="prettyprint lang-c">
   2898   int is_unsigned = 0;
   2899   int is_long = 0;
   2900   int is_single = 0;
   2901   while (1) {
   2902     c = input_getc();
   2903     if (c == 'u' || c == 'U') {
   2904       if (is_unsigned) {
   2905         tok_warn(
   2906             "Warning: Duplicate suffix 'u' for integer constant ignored\n");
   2907       }
   2908       is_unsigned = 1;
   2909     } else if (c == 'l' || c == 'L') {
   2910       if (is_long) {
   2911         tok_warn(
   2912             "Warning: Duplicate suffix 'l' for integer constant ignored\n");
   2913       }
   2914       is_long = 1;
   2915     } else if (c == 'f' || c == 'F') {
   2916       if (is_single) {
   2917         tok_warn("Warning: Duplicate suffix 'f' for floating point constant "
   2918                  "ignored\n");
   2919       }
   2920       is_single = 1;
   2921     } else {
   2922       input_ungetc(c);
   2923       break;
   2924     }
   2925   }
   2926 </pre>
   2927 
   2928 
   2929 <p class="seealso">Used in section <a href="lexer.html#1:65">65</a></p>
   2930 </div>
   2931 </div>
   2932 <a name="1:70"><div class="section"><h4 class="noheading">70. </h4></a>
   2933 <p>If we find conflicting suffixes, we print a warning and ignore the suffixes.
   2934 </p>
   2935 
   2936 <div class="codeblock">
   2937 <span class="codeblock_name">{Check for conflicting suffixes <a href="lexer.html#1:70">70</a>}</span>
   2938 <pre class="prettyprint lang-c">
   2939     if (is_single &amp;&amp; is_long) {
   2940     tok_warn("Warning: Invalid suffixes 'l' and 'f' for floating point "
   2941              "constant. Ignoring 'l'\n");
   2942     is_long = 0;
   2943   }
   2944   if (is_single &amp;&amp; is_unsigned) {
   2945     tok_warn("Warning: Invalid suffixes 'u' and 'f' for floating point "
   2946              "constant. Ignoring 'u'\n");
   2947     is_unsigned = 0;
   2948   }
   2949   if (is_single &amp;&amp; !is_float) {
   2950     tok_warn(
   2951         "Warning: Invalid suffix 'f' for integer constant. Ignoring 'f'\n");
   2952     is_single = 0;
   2953   }
   2954 </pre>
   2955 
   2956 
   2957 <p class="seealso">Used in section <a href="lexer.html#1:65">65</a></p>
   2958 </div>
   2959 </div>
   2960 <a name="1:71"><div class="section"><h4 class="noheading">71. </h4></a>
   2961 <p>If the string contains a float, we pass it to strtod. We need to make sure that the number is in range for the given type and check for errors from strtod
   2962 </p>
   2963 
   2964 <div class="codeblock">
   2965 <span class="codeblock_name">{Convert to float <a href="lexer.html#1:71">71</a>}</span>
   2966 <pre class="prettyprint lang-c">
   2967     errno = 0;
   2968     // Strtod generates a unix-style error when it's given something out of
   2969     // range, so we want to get on top of that quickly instead of ignoring it
   2970     // That way we can avoid some nasty NAN-propagation in the constant folder.
   2971     double f = strtod(buf, NULL);
   2972     if (errno == ERANGE) {
   2973       tok_error("Floating point constant out of range\n");
   2974       return NULL;
   2975     }
   2976     // Warn if the constant is out of range for a float, I.E it's too big or too
   2977     // small
   2978     if (is_single &amp;&amp; (f &lt; FLT_MIN || f &gt; FLT_MAX)) {
   2979       tok_warn(
   2980           "Warning: Floating point constant %f is out of range for float\n", f);
   2981     }
   2982     // Warn if the constant is too precise for a float
   2983     if (is_single &amp;&amp; fabs((double)((float)f) - f) &gt;= FLT_EPSILON) {
   2984       tok_warn("Warning: Converting double precision floating point constant "
   2985                "%f to float loses "
   2986                "precision\n",
   2987                f);
   2988     }
   2989     return token_create_float(is_single ? TOK_FLOAT_32
   2990                                         : TOK_FLOAT_64,
   2991                               line, column, f, i);
   2992 </pre>
   2993 
   2994 
   2995 <p class="seealso">Used in section <a href="lexer.html#1:65">65</a></p>
   2996 </div>
   2997 </div>
   2998 <a name="1:72"><div class="section"><h4 class="noheading">72. </h4></a>
   2999 <p>If the string contains a number, we pass it to stroll. We need to make sure that the number is in range for the given type and check for errors from strtoll
   3000 </p>
   3001 
   3002 <div class="codeblock">
   3003 <span class="codeblock_name">{Convert to integer <a href="lexer.html#1:72">72</a>}</span>
   3004 <pre class="prettyprint lang-c">
   3005     errno = 0;
   3006     uint64_t int_ = strtoull(buf, NULL, radix);
   3007     // Same as above, but for integers
   3008     if (errno == ERANGE) {
   3009       tok_error("Integer constant out of range\n");
   3010       return NULL;
   3011     }
   3012     if (is_unsigned) {
   3013       if (is_long) {
   3014         return token_create_int(TOK_INTEGER_U64, line, column, int_, i);
   3015       } else {
   3016         if (int_ &gt; UINT32_MAX) {
   3017           tok_warn(
   3018               "Warning: Integer constant %lld is out of range for unsigned "
   3019               "int\n",
   3020               int_);
   3021         }
   3022         return token_create_int(TOK_INTEGER_U32, line, column, int_, i);
   3023       }
   3024     } else {
   3025       if (is_long) {
   3026         // If the highest bit is set, that means this will overflow a signed
   3027         // long (Due to two's complement)
   3028         if (int_ &amp; (1UL &lt;&lt; 63)) {
   3029           tok_warn(
   3030               "Warning: Integer constant %lld is out of range for long long\n",
   3031               i);
   3032         }
   3033         return token_create_int(TOK_INTEGER_S64, line, column, int_, i);
   3034       } else {
   3035         if (int_ &amp; (1UL &lt;&lt; 31)) {
   3036           tok_warn("Warning: Integer constant %lld is out of range for int\n",
   3037                    int_);
   3038         }
   3039         return token_create_int(TOK_INTEGER_S32, line, column, int_, i);
   3040       }
   3041     }
   3042 </pre>
   3043 
   3044 
   3045 <p class="seealso">Used in section <a href="lexer.html#1:65">65</a></p>
   3046 </div>
   3047 </div>
   3048 <a name="1:73"><div class="section"><h4 class="noheading">73. </h4></a>
   3049 <p>The <code>read_char_constant</code> function reads a character constant from the input stream. It can be a single character or a multi-character escape sequence.
   3050 </p>
   3051 
   3052 <div class="codeblock">
   3053 <span class="codeblock_name">{Tokenize Character <a href="lexer.html#1:73">73</a>}</span>
   3054 <pre class="prettyprint lang-c">
   3055 static token_t *read_char_constant(void) {
   3056   int c;
   3057   int len = 0;
   3058   c = input_getc();
   3059   if (c != '\'') {
   3060     input_ungetc(c);
   3061     return NULL;
   3062   }
   3063   len++;
   3064   c = input_getc();
   3065   if (c == '\'') {
   3066     tok_error("Empty character constant\n");
   3067     return NULL;
   3068   }
   3069   if (c == '\\') {
   3070     c = read_escape_sequence(&amp;len);
   3071   }
   3072   int val = c;
   3073   c = input_getc();
   3074   if (c != '\'') {
   3075     tok_error("Expected closing quote for character constant\n");
   3076     return NULL;
   3077   }
   3078   len++;
   3079   return token_create_char(TOK_CHAR_CONST, line, column, val, len);
   3080 }
   3081 </pre>
   3082 
   3083 
   3084 <p class="seealso">Used in section <a href="lexer.html#1:59">59</a></p>
   3085 </div>
   3086 </div>
   3087 <a name="1:74"><div class="section"><h4 class="noheading">74. </h4></a>
   3088 <p>The <code>read_string_literal</code> function reads a string literal from the input stream.
   3089 </p>
   3090 <p>For this function, an automatic-lifetime buffer is used to store the string it becomes too large. At that point, a heap-allocated buffer is used.
   3091 This way we can avoid unnecessary heap allocations for small strings.
   3092 </p>
   3093 
   3094 <div class="codeblock">
   3095 <span class="codeblock_name">{Tokenize String <a href="lexer.html#1:74">74</a>}</span>
   3096 <pre class="prettyprint lang-c">
   3097 <span class="nocode pln">{Read Escape Sequence, <a href="lexer.html#1:75">75</a>}</span>
   3098 static token_t *read_string_literal(void) {
   3099   int c;
   3100   c = input_getc();
   3101   if (c != '"') {
   3102     input_ungetc(c);
   3103     return NULL;
   3104   }
   3105   int i = 0;
   3106   char s_buf[512];
   3107   char *buf = s_buf;
   3108   int len = 512;
   3109   int esc_pad = 0;
   3110   while ((c = input_getc()) != EOF) {
   3111     if (c == '"') {
   3112       // Implicit skip of closing quote
   3113       break;
   3114     }
   3115     if (c == '\\') {
   3116       c = read_escape_sequence(&amp;esc_pad);
   3117       if (c == 0) {
   3118         return NULL;
   3119       }
   3120     }
   3121     if (i &gt;= len) {
   3122       if (buf == s_buf) {
   3123         buf = malloc(1024);
   3124         if (buf == NULL) {
   3125           fputs("Out of memory. Could not parse string literal.\n", stderr);
   3126           exit(1);
   3127         }
   3128         memcpy(buf, s_buf, 512);
   3129         len *= 2;
   3130       } else {
   3131         len *= 2;
   3132         buf = realloc(buf, len);
   3133       }
   3134     }
   3135     buf[i++] = c;
   3136   }
   3137   buf[i] = '\0';
   3138   if (c == EOF) {
   3139     tok_error("Unterminated string literal\n");
   3140     if (buf != s_buf) {
   3141       free(buf);
   3142     }
   3143     return NULL;
   3144   }
   3145 
   3146   token_t *tok = token_create_string(TOK_STRING_ASCII, line, column, buf,
   3147                                      i + esc_pad + 2);
   3148   if (buf != s_buf) {
   3149     free(buf);
   3150   }
   3151   return tok;
   3152 }
   3153 </pre>
   3154 
   3155 
   3156 <p class="seealso">Used in section <a href="lexer.html#1:59">59</a></p>
   3157 </div>
   3158 </div>
   3159 <a name="1:75"><div class="section"><h4 class="noheading">75. </h4></a>
   3160 <p>Escape sequences in C can either be single characters or octal/hexadecimal values. We need to handle both cases.
   3161 </p>
   3162 
   3163 <div class="codeblock">
   3164 <span class="codeblock_name">{Read Escape Sequence <a href="lexer.html#1:75">75</a>}</span>
   3165 <pre class="prettyprint lang-c">
   3166 static char read_escape_sequence(int *len) {
   3167   int c = input_getc();
   3168   *len += 1;
   3169   switch (c) {
   3170   case 'a':
   3171     return '\a';
   3172   case 'b':
   3173     return '\b';
   3174   case 'f':
   3175     return '\f';
   3176   case 'n':
   3177     return '\n';
   3178   case 'r':
   3179     return '\r';
   3180   case 't':
   3181     return '\t';
   3182   case 'v':
   3183     return '\v';
   3184   case '\'':
   3185     return '\'';
   3186   case '"':
   3187     return '"';
   3188   case '?':
   3189     return '?';
   3190   case '\\':
   3191     return '\\';
   3192   case '0':
   3193     return '\0';
   3194   case 'x': {
   3195     c = input_getc();
   3196     if (!isxdigit(c)) {
   3197       tok_error("Invalid hexadecimal escape sequence\n");
   3198       return 0;
   3199     }
   3200     int val = 0;
   3201     while (isxdigit(c)) {
   3202       *len += 1;
   3203       val = val * 16 + (isdigit(c) ? c - '0' : tolower(c) - 'a' + 10);
   3204       c = input_getc();
   3205     }
   3206     input_ungetc(c);
   3207     return (char)val;
   3208   }
   3209   default:
   3210     if (!isdigit(c)) {
   3211       tok_error("Invalid escape sequence\n");
   3212       return 0;
   3213     }
   3214     int val = 0;
   3215     while (isdigit(c)) {
   3216       *len += 1;
   3217       val = val * 8 + c - '0';
   3218       c = input_getc();
   3219     }
   3220     input_ungetc(c);
   3221     return (char)val;
   3222   }
   3223 }
   3224 </pre>
   3225 
   3226 
   3227 <p class="seealso">Used in section <a href="lexer.html#1:74">74</a></p>
   3228 </div>
   3229 </div>
   3230 <a name="1:76"><div class="section"><h4 class="noheading">76. </h4></a>
   3231 <p>Finally, I'll add some code for running the tokenizer as its own program. This way we can test it out.
   3232 </p>
   3233 
   3234 <div class="codeblock">
   3235 <span class="codeblock_name">{Run Test <a href="lexer.html#1:76">76</a>}</span>
   3236 <pre class="prettyprint lang-c">
   3237 char *preprocess(char *in) {
   3238   char *output_name = malloc(1024);
   3239   snprintf(output_name, 1024, "%s.preprocessed", in);
   3240   char *command = malloc(2048);
   3241   snprintf(command, 2048, "gcc -E -xc %s -o %s", in, output_name);
   3242   system(command);
   3243   free(command);
   3244   return output_name;
   3245 }
   3246 
   3247 // Tokenize the input file
   3248 int main(int argc, char **argv) {
   3249   if (argc != 2) {
   3250     fprintf(stderr, "Usage: %s &lt;input.c&gt;\n", argv[0]);
   3251     return 1;
   3252   }
   3253   char *input_name = argv[1];
   3254   char *preprocessed = preprocess(input_name);
   3255   init_tokenizer(preprocessed);
   3256   token_t *tok;
   3257   while ((tok = next_token()) != NULL) {
   3258     print_token(tok);
   3259     token_destroy(tok);
   3260   }
   3261   destroy_tokenizer();
   3262   remove(preprocessed);
   3263   free(preprocessed);
   3264   hash_table_destroy(string_table);
   3265   return 0;
   3266 }
   3267 </pre>
   3268 
   3269 
   3270 <p class="seealso">Used in section <a href="lexer.html#1:59">59</a></p>
   3271 </div>
   3272 <h3> Bugs/Errata</h3>
   3273 <p>I wrote this code in a single sitting, so there are bound to be bugs. I'll list them here as I find them. The code you see here is the final version, with all bugs fixed.
   3274 </p>
   3275 <ul>
   3276 <li>had <code>buffer_pos == buffer_size - 1</code>, left in from trying to plug some code for lookahead in, didn't work out, but I forgot to remove it, causes fallthrough to <code>buffer_size == 0</code> check which if true returns EOF, preventing input initialization. Fixed by changing to <code>buffer_pos == buffer_size</code>.
   3277 </li>
   3278 <li>assertion <code>token-&gt;kind == TOK_STRING_ASCII</code> failed in token_string. Forgot to expand check for identifiers which also use token_string. Fixed by changing to <code>token-&gt;kind == TOK_STRING_ASCII || token-&gt;kind == TOK_ID || token-&gt;kind == TOK_TID</code>.
   3279 </li>
   3280 <li>token_create_string - call to <code>hash_table_get</code> with freed key. Fixed by moving the call to free after the call to <code>hash_table_get</code>.
   3281 </li>
   3282 <li>ibid - Design of hash table and call to <code>hash_table_get</code> in token_create_string created double free. Fixed by rewriting part of function.
   3283 </li>
   3284 <li>Tokenizer missing code to handle GCC preprocessor line directives. Fixed by adding code to handle them.
   3285 </li>
   3286 <li>Destructor for string literals missing in tokenizer teardown, added it in.
   3287 </li>
   3288 <li>read_number - check <code>int_ &gt; INT32_MAX</code> does not work due to some weird casting. Added explicit cast.
   3289 </li>
   3290 <li>read_char_constant - Forgot to handle '\\0'. Added.
   3291 </li>
   3292 <li>skip_whitespace - When a division operator occurs in code, skip_whitespace assumes it's a comment. Fixed by adding a check for division operator.
   3293 </li>
   3294 <li>hash_string - Optimization, not a bug, Dragon Book hash function not very fast due to misprediction. Replaced with ELF hash function.
   3295 </li>
   3296 <li>read_identifier - strlen gets called 3 times even though we already get the string len by incrementing an array index. Ew. Used i instead of strlen.
   3297 </li>
   3298 <li>read_identifier - stringized version of keywords stored, not needed. Code added to call token_create instead of token_create_string for keywords.
   3299 </li>
   3300 <li>Everywhere - Checks added for memory allocation failure.
   3301 </li>
   3302 <li>Not a bug. Removed the seperate token type for TID. Will try to handle in the parser.
   3303 </li>
   3304 </ul>
   3305 <h3> Conclusion</h3>
   3306 <p>That's it! The C Minus tokenizer is complete. It's hopefully pretty understandable, and given the testing I've put it through, it should be pretty robust.
   3307 </p>
   3308 <p>Next time, we'll start on the parser.
   3309 </p>
   3310 <h1> Source code, biblography</h1>
   3311 <p>Source code for the tokenizer is available <a href="/projects/cminus/code/tokenizer.c">here</a>, header file is available <a href="/projects/cminus/code/tokenizer.h">here</a>.
   3312 </p>
   3313 <p>Source code for the input module is available <a href="/projects/cminus/code/input.c">here</a>, header file is available <a href="/projects/cminus/code/input.h">here</a>.
   3314 </p>
   3315 <p>Source code for the hash table is available <a href="/projects/cminus/code/hash_table.c">here</a>, header file is available <a href="/projects/cminus/code/hash_table.h">here</a>.
   3316 </p>
   3317 <p>Source code for the token module is available <a href="/projects/cminus/code/token.c">here</a>, header file is available <a href="/projects/cminus/code/token.h">here</a>.
   3318 </p>
   3319 <p>A lot of the logic for this project is from either the Dragon Book, Engineering a Compiler, or LCC: A Retargetable Compiler for ANSI C. Grammars are from The C Reference Manual.
   3320 </p>
   3321 <p>I got the idea for using zero-width arrays for optional content (the struct hack) from hacking on some weird virtio drivers (they seem to love it).
   3322 </p>
   3323 <p>Crafting Interpreters by Bob Nystrom inspired me to do this project, so if you see any similarities, there's probably some unintentional influence there.
   3324 </p>
   3325 <p>The code for the ELF hash function is from the glibc source code.
   3326 </p>
   3327 <p>The idea for decision trees came from LCC.
   3328 </p>
   3329 <p>Literate programming rendered using <a href="https://github.com/zyedidia/Literate">literate</a>.
   3330 </p>
   3331 <p>  <footer style=" text-align: center; padding: 20px;">
   3332     <p>© 2024 Reagan Fischer. If for some reason you want to use my AMAZING code (lol), it's available under the MIT
   3333       license <a href="/projects/cminus/code/LICENSE.md">here</a>.</p>
   3334   </footer>
   3335 </p>
   3336 
   3337 </div>
   3338 </body>
	website Website contents
	git clone git://git.reagancfischer.dev/website.git
	Log \| Files \| Refs