lexer.html (118682B)
1 <!DOCTYPE html> 2 <html> 3 <head> 4 <meta charset="utf-8"> 5 <title>Lexer</title> 6 <script> 7 !function(){var q=null;window.PR_SHOULD_USE_CONTINUATION=!0; 8 (function(){function R(a){function d(e){var b=e.charCodeAt(0);if(b!==92)return b;var a=e.charAt(1);return(b=r[a])?b:"0"<=a&&a<="7"?parseInt(e.substring(1),8):a==="u"||a==="x"?parseInt(e.substring(2),16):e.charCodeAt(1)}function g(e){if(e<32)return(e<16?"\\x0":"\\x")+e.toString(16);e=String.fromCharCode(e);return e==="\\"||e==="-"||e==="]"||e==="^"?"\\"+e:e}function b(e){var b=e.substring(1,e.length-1).match(/\\u[\dA-Fa-f]{4}|\\x[\dA-Fa-f]{2}|\\[0-3][0-7]{0,2}|\\[0-7]{1,2}|\\[\S\s]|[^\\]/g),e=[],a= 9 b[0]==="^",c=["["];a&&c.push("^");for(var a=a?1:0,f=b.length;a<f;++a){var h=b[a];if(/\\[bdsw]/i.test(h))c.push(h);else{var h=d(h),l;a+2<f&&"-"===b[a+1]?(l=d(b[a+2]),a+=2):l=h;e.push([h,l]);l<65||h>122||(l<65||h>90||e.push([Math.max(65,h)|32,Math.min(l,90)|32]),l<97||h>122||e.push([Math.max(97,h)&-33,Math.min(l,122)&-33]))}}e.sort(function(e,a){return e[0]-a[0]||a[1]-e[1]});b=[];f=[];for(a=0;a<e.length;++a)h=e[a],h[0]<=f[1]+1?f[1]=Math.max(f[1],h[1]):b.push(f=h);for(a=0;a<b.length;++a)h=b[a],c.push(g(h[0])), 10 h[1]>h[0]&&(h[1]+1>h[0]&&c.push("-"),c.push(g(h[1])));c.push("]");return c.join("")}function s(e){for(var a=e.source.match(/\[(?:[^\\\] ]|\\[\S\s])*]|\\u[\dA-Fa-f]{4}|\\x[\dA-Fa-f]{2}|\\\d+|\\[^\dux]|\(\?[!:=]|[()^]|[^()[\\^]+/g),c=a.length,d=[],f=0,h=0;f<c;++f){var l=a[f];l==="("?++h:"\\"===l.charAt(0)&&(l=+l.substring(1))&&(l<=h?d[l]=-1:a[f]=g(l))}for(f=1;f<d.length;++f)-1===d[f]&&(d[f]=++x);for(h=f=0;f<c;++f)l=a[f],l==="("?(++h,d[h]||(a[f]="(?:")):"\\"===l.charAt(0)&&(l=+l.substring(1))&&l<=h&& 11 (a[f]="\\"+d[l]);for(f=0;f<c;++f)"^"===a[f]&&"^"!==a[f+1]&&(a[f]="");if(e.ignoreCase&&m)for(f=0;f<c;++f)l=a[f],e=l.charAt(0),l.length>=2&&e==="["?a[f]=b(l):e!=="\\"&&(a[f]=l.replace(/[A-Za-z]/g,function(a){a=a.charCodeAt(0);return"["+String.fromCharCode(a&-33,a|32)+"]"}));return a.join("")}for(var x=0,m=!1,j=!1,k=0,c=a.length;k<c;++k){var i=a[k];if(i.ignoreCase)j=!0;else if(/[a-z]/i.test(i.source.replace(/\\u[\da-f]{4}|\\x[\da-f]{2}|\\[^UXux]/gi,""))){m=!0;j=!1;break}}for(var r={b:8,t:9,n:10,v:11, 12 f:12,r:13},n=[],k=0,c=a.length;k<c;++k){i=a[k];if(i.global||i.multiline)throw Error(""+i);n.push("(?:"+s(i)+")")}return RegExp(n.join("|"),j?"gi":"g")}function S(a,d){function g(a){var c=a.nodeType;if(c==1){if(!b.test(a.className)){for(c=a.firstChild;c;c=c.nextSibling)g(c);c=a.nodeName.toLowerCase();if("br"===c||"li"===c)s[j]="\n",m[j<<1]=x++,m[j++<<1|1]=a}}else if(c==3||c==4)c=a.nodeValue,c.length&&(c=d?c.replace(/\r\n?/g,"\n"):c.replace(/[\t\n\r ]+/g," "),s[j]=c,m[j<<1]=x,x+=c.length,m[j++<<1|1]= 13 a)}var b=/(?:^|\s)nocode(?:\s|$)/,s=[],x=0,m=[],j=0;g(a);return{a:s.join("").replace(/\n$/,""),d:m}}function H(a,d,g,b){d&&(a={a:d,e:a},g(a),b.push.apply(b,a.g))}function T(a){for(var d=void 0,g=a.firstChild;g;g=g.nextSibling)var b=g.nodeType,d=b===1?d?a:g:b===3?U.test(g.nodeValue)?a:d:d;return d===a?void 0:d}function D(a,d){function g(a){for(var j=a.e,k=[j,"pln"],c=0,i=a.a.match(s)||[],r={},n=0,e=i.length;n<e;++n){var z=i[n],w=r[z],t=void 0,f;if(typeof w==="string")f=!1;else{var h=b[z.charAt(0)]; 14 if(h)t=z.match(h[1]),w=h[0];else{for(f=0;f<x;++f)if(h=d[f],t=z.match(h[1])){w=h[0];break}t||(w="pln")}if((f=w.length>=5&&"lang-"===w.substring(0,5))&&!(t&&typeof t[1]==="string"))f=!1,w="src";f||(r[z]=w)}h=c;c+=z.length;if(f){f=t[1];var l=z.indexOf(f),B=l+f.length;t[2]&&(B=z.length-t[2].length,l=B-f.length);w=w.substring(5);H(j+h,z.substring(0,l),g,k);H(j+h+l,f,I(w,f),k);H(j+h+B,z.substring(B),g,k)}else k.push(j+h,w)}a.g=k}var b={},s;(function(){for(var g=a.concat(d),j=[],k={},c=0,i=g.length;c<i;++c){var r= 15 g[c],n=r[3];if(n)for(var e=n.length;--e>=0;)b[n.charAt(e)]=r;r=r[1];n=""+r;k.hasOwnProperty(n)||(j.push(r),k[n]=q)}j.push(/[\S\s]/);s=R(j)})();var x=d.length;return g}function v(a){var d=[],g=[];a.tripleQuotedStrings?d.push(["str",/^(?:'''(?:[^'\\]|\\[\S\s]|''?(?=[^']))*(?:'''|$)|"""(?:[^"\\]|\\[\S\s]|""?(?=[^"]))*(?:"""|$)|'(?:[^'\\]|\\[\S\s])*(?:'|$)|"(?:[^"\\]|\\[\S\s])*(?:"|$))/,q,"'\""]):a.multiLineStrings?d.push(["str",/^(?:'(?:[^'\\]|\\[\S\s])*(?:'|$)|"(?:[^"\\]|\\[\S\s])*(?:"|$)|`(?:[^\\`]|\\[\S\s])*(?:`|$))/, 16 q,"'\"`"]):d.push(["str",/^(?:'(?:[^\n\r'\\]|\\.)*(?:'|$)|"(?:[^\n\r"\\]|\\.)*(?:"|$))/,q,"\"'"]);a.verbatimStrings&&g.push(["str",/^@"(?:[^"]|"")*(?:"|$)/,q]);var b=a.hashComments;b&&(a.cStyleComments?(b>1?d.push(["com",/^#(?:##(?:[^#]|#(?!##))*(?:###|$)|.*)/,q,"#"]):d.push(["com",/^#(?:(?:define|e(?:l|nd)if|else|error|ifn?def|include|line|pragma|undef|warning)\b|[^\n\r]*)/,q,"#"]),g.push(["str",/^<(?:(?:(?:\.\.\/)*|\/?)(?:[\w-]+(?:\/[\w-]+)+)?[\w-]+\.h(?:h|pp|\+\+)?|[a-z]\w*)>/,q])):d.push(["com", 17 /^#[^\n\r]*/,q,"#"]));a.cStyleComments&&(g.push(["com",/^\/\/[^\n\r]*/,q]),g.push(["com",/^\/\*[\S\s]*?(?:\*\/|$)/,q]));if(b=a.regexLiterals){var s=(b=b>1?"":"\n\r")?".":"[\\S\\s]";g.push(["lang-regex",RegExp("^(?:^^\\.?|[+-]|[!=]=?=?|\\#|%=?|&&?=?|\\(|\\*=?|[+\\-]=|->|\\/=?|::?|<<?=?|>>?>?=?|,|;|\\?|@|\\[|~|{|\\^\\^?=?|\\|\\|?=?|break|case|continue|delete|do|else|finally|instanceof|return|throw|try|typeof)\\s*("+("/(?=[^/*"+b+"])(?:[^/\\x5B\\x5C"+b+"]|\\x5C"+s+"|\\x5B(?:[^\\x5C\\x5D"+b+"]|\\x5C"+ 18 s+")*(?:\\x5D|$))+/")+")")])}(b=a.types)&&g.push(["typ",b]);b=(""+a.keywords).replace(/^ | $/g,"");b.length&&g.push(["kwd",RegExp("^(?:"+b.replace(/[\s,]+/g,"|")+")\\b"),q]);d.push(["pln",/^\s+/,q," \r\n\t\u00a0"]);b="^.[^\\s\\w.$@'\"`/\\\\]*";a.regexLiterals&&(b+="(?!s*/)");g.push(["lit",/^@[$_a-z][\w$@]*/i,q],["typ",/^(?:[@_]?[A-Z]+[a-z][\w$@]*|\w+_t\b)/,q],["pln",/^[$_a-z][\w$@]*/i,q],["lit",/^(?:0x[\da-f]+|(?:\d(?:_\d+)*\d*(?:\.\d*)?|\.\d\+)(?:e[+-]?\d+)?)[a-z]*/i,q,"0123456789"],["pln",/^\\[\S\s]?/, 19 q],["pun",RegExp(b),q]);return D(d,g)}function J(a,d,g){function b(a){var c=a.nodeType;if(c==1&&!x.test(a.className))if("br"===a.nodeName)s(a),a.parentNode&&a.parentNode.removeChild(a);else for(a=a.firstChild;a;a=a.nextSibling)b(a);else if((c==3||c==4)&&g){var d=a.nodeValue,i=d.match(m);if(i)c=d.substring(0,i.index),a.nodeValue=c,(d=d.substring(i.index+i[0].length))&&a.parentNode.insertBefore(j.createTextNode(d),a.nextSibling),s(a),c||a.parentNode.removeChild(a)}}function s(a){function b(a,c){var d= 20 c?a.cloneNode(!1):a,e=a.parentNode;if(e){var e=b(e,1),g=a.nextSibling;e.appendChild(d);for(var i=g;i;i=g)g=i.nextSibling,e.appendChild(i)}return d}for(;!a.nextSibling;)if(a=a.parentNode,!a)return;for(var a=b(a.nextSibling,0),d;(d=a.parentNode)&&d.nodeType===1;)a=d;c.push(a)}for(var x=/(?:^|\s)nocode(?:\s|$)/,m=/\r\n?|\n/,j=a.ownerDocument,k=j.createElement("li");a.firstChild;)k.appendChild(a.firstChild);for(var c=[k],i=0;i<c.length;++i)b(c[i]);d===(d|0)&&c[0].setAttribute("value",d);var r=j.createElement("ol"); 21 r.className="linenums";for(var d=Math.max(0,d-1|0)||0,i=0,n=c.length;i<n;++i)k=c[i],k.className="L"+(i+d)%10,k.firstChild||k.appendChild(j.createTextNode("\u00a0")),r.appendChild(k);a.appendChild(r)}function p(a,d){for(var g=d.length;--g>=0;){var b=d[g];F.hasOwnProperty(b)?E.console&&console.warn("cannot override language handler %s",b):F[b]=a}}function I(a,d){if(!a||!F.hasOwnProperty(a))a=/^\s*</.test(d)?"default-markup":"default-code";return F[a]}function K(a){var d=a.h;try{var g=S(a.c,a.i),b=g.a; 22 a.a=b;a.d=g.d;a.e=0;I(d,b)(a);var s=/\bMSIE\s(\d+)/.exec(navigator.userAgent),s=s&&+s[1]<=8,d=/\n/g,x=a.a,m=x.length,g=0,j=a.d,k=j.length,b=0,c=a.g,i=c.length,r=0;c[i]=m;var n,e;for(e=n=0;e<i;)c[e]!==c[e+2]?(c[n++]=c[e++],c[n++]=c[e++]):e+=2;i=n;for(e=n=0;e<i;){for(var p=c[e],w=c[e+1],t=e+2;t+2<=i&&c[t+1]===w;)t+=2;c[n++]=p;c[n++]=w;e=t}c.length=n;var f=a.c,h;if(f)h=f.style.display,f.style.display="none";try{for(;b<k;){var l=j[b+2]||m,B=c[r+2]||m,t=Math.min(l,B),A=j[b+1],G;if(A.nodeType!==1&&(G=x.substring(g, 23 t))){s&&(G=G.replace(d,"\r"));A.nodeValue=G;var L=A.ownerDocument,o=L.createElement("span");o.className=c[r+1];var v=A.parentNode;v.replaceChild(o,A);o.appendChild(A);g<l&&(j[b+1]=A=L.createTextNode(x.substring(t,l)),v.insertBefore(A,o.nextSibling))}g=t;g>=l&&(b+=2);g>=B&&(r+=2)}}finally{if(f)f.style.display=h}}catch(u){E.console&&console.log(u&&u.stack||u)}}var E=window,y=["break,continue,do,else,for,if,return,while"],C=[[y,"auto,case,char,const,default,double,enum,extern,float,goto,inline,int,long,register,short,signed,sizeof,static,struct,switch,typedef,union,unsigned,void,volatile"], 24 "catch,class,delete,false,import,new,operator,private,protected,public,this,throw,true,try,typeof"],M=[C,"alignof,align_union,asm,axiom,bool,concept,concept_map,const_cast,constexpr,decltype,delegate,dynamic_cast,explicit,export,friend,generic,late_check,mutable,namespace,nullptr,property,reinterpret_cast,static_assert,static_cast,template,typeid,typename,using,virtual,where"],V=[C,"abstract,assert,boolean,byte,extends,final,finally,implements,import,instanceof,interface,null,native,package,strictfp,super,synchronized,throws,transient"], 25 N=[C,"abstract,as,base,bool,by,byte,checked,decimal,delegate,descending,dynamic,event,finally,fixed,foreach,from,group,implicit,in,interface,internal,into,is,let,lock,null,object,out,override,orderby,params,partial,readonly,ref,sbyte,sealed,stackalloc,string,select,uint,ulong,unchecked,unsafe,ushort,var,virtual,where"],C=[C,"debugger,eval,export,function,get,null,set,undefined,var,with,Infinity,NaN"],O=[y,"and,as,assert,class,def,del,elif,except,exec,finally,from,global,import,in,is,lambda,nonlocal,not,or,pass,print,raise,try,with,yield,False,True,None"], 26 P=[y,"alias,and,begin,case,class,def,defined,elsif,end,ensure,false,in,module,next,nil,not,or,redo,rescue,retry,self,super,then,true,undef,unless,until,when,yield,BEGIN,END"],W=[y,"as,assert,const,copy,drop,enum,extern,fail,false,fn,impl,let,log,loop,match,mod,move,mut,priv,pub,pure,ref,self,static,struct,true,trait,type,unsafe,use"],y=[y,"case,done,elif,esac,eval,fi,function,in,local,set,then,until"],Q=/^(DIR|FILE|vector|(de|priority_)?queue|list|stack|(const_)?iterator|(multi)?(set|map)|bitset|u?(int|float)\d*)\b/, 27 U=/\S/,X=v({keywords:[M,N,C,"caller,delete,die,do,dump,elsif,eval,exit,foreach,for,goto,if,import,last,local,my,next,no,our,print,package,redo,require,sub,undef,unless,until,use,wantarray,while,BEGIN,END",O,P,y],hashComments:!0,cStyleComments:!0,multiLineStrings:!0,regexLiterals:!0}),F={};p(X,["default-code"]);p(D([],[["pln",/^[^<?]+/],["dec",/^<!\w[^>]*(?:>|$)/],["com",/^<\!--[\S\s]*?(?:--\>|$)/],["lang-",/^<\?([\S\s]+?)(?:\?>|$)/],["lang-",/^<%([\S\s]+?)(?:%>|$)/],["pun",/^(?:<[%?]|[%?]>)/],["lang-", 28 /^<xmp\b[^>]*>([\S\s]+?)<\/xmp\b[^>]*>/i],["lang-js",/^<script\b[^>]*>([\S\s]*?)(<\/script\b[^>]*>)/i],["lang-css",/^<style\b[^>]*>([\S\s]*?)(<\/style\b[^>]*>)/i],["lang-in.tag",/^(<\/?[a-z][^<>]*>)/i] ]),["default-markup","htm","html","mxml","xhtml","xml","xsl"]);p(D([["pln",/^\s+/,q," \t\r\n"],["atv",/^(?:"[^"]*"?|'[^']*'?)/,q,"\"'"] ],[["tag",/^^<\/?[a-z](?:[\w-.:]*\w)?|\/?>$/i],["atn",/^(?!style[\s=]|on)[a-z](?:[\w:-]*\w)?/i],["lang-uq.val",/^=\s*([^\s"'>]*(?:[^\s"'/>]|\/(?=\s)))/],["pun",/^[/<->]+/], 29 ["lang-js",/^on\w+\s*=\s*"([^"]+)"/i],["lang-js",/^on\w+\s*=\s*'([^']+)'/i],["lang-js",/^on\w+\s*=\s*([^\s"'>]+)/i],["lang-css",/^style\s*=\s*"([^"]+)"/i],["lang-css",/^style\s*=\s*'([^']+)'/i],["lang-css",/^style\s*=\s*([^\s"'>]+)/i] ]),["in.tag"]);p(D([],[["atv",/^[\S\s]+/] ]),["uq.val"]);p(v({keywords:M,hashComments:!0,cStyleComments:!0,types:Q}),["c","cc","cpp","cxx","cyc","m"]);p(v({keywords:"null,true,false"}),["json"]);p(v({keywords:N,hashComments:!0,cStyleComments:!0,verbatimStrings:!0,types:Q}), 30 ["cs"]);p(v({keywords:V,cStyleComments:!0}),["java"]);p(v({keywords:y,hashComments:!0,multiLineStrings:!0}),["bash","bsh","csh","sh"]);p(v({keywords:O,hashComments:!0,multiLineStrings:!0,tripleQuotedStrings:!0}),["cv","py","python"]);p(v({keywords:"caller,delete,die,do,dump,elsif,eval,exit,foreach,for,goto,if,import,last,local,my,next,no,our,print,package,redo,require,sub,undef,unless,until,use,wantarray,while,BEGIN,END",hashComments:!0,multiLineStrings:!0,regexLiterals:2}),["perl","pl","pm"]);p(v({keywords:P, 31 hashComments:!0,multiLineStrings:!0,regexLiterals:!0}),["rb","ruby"]);p(v({keywords:C,cStyleComments:!0,regexLiterals:!0}),["javascript","js"]);p(v({keywords:"all,and,by,catch,class,else,extends,false,finally,for,if,in,is,isnt,loop,new,no,not,null,of,off,on,or,return,super,then,throw,true,try,unless,until,when,while,yes",hashComments:3,cStyleComments:!0,multilineStrings:!0,tripleQuotedStrings:!0,regexLiterals:!0}),["coffee"]);p(v({keywords:W,cStyleComments:!0,multilineStrings:!0}),["rc","rs","rust"]); 32 p(D([],[["str",/^[\S\s]+/] ]),["regex"]);var Y=E.PR={createSimpleLexer:D,registerLangHandler:p,sourceDecorator:v,PR_ATTRIB_NAME:"atn",PR_ATTRIB_VALUE:"atv",PR_COMMENT:"com",PR_DECLARATION:"dec",PR_KEYWORD:"kwd",PR_LITERAL:"lit",PR_NOCODE:"nocode",PR_PLAIN:"pln",PR_PUNCTUATION:"pun",PR_SOURCE:"src",PR_STRING:"str",PR_TAG:"tag",PR_TYPE:"typ",prettyPrintOne:E.prettyPrintOne=function(a,d,g){var b=document.createElement("div");b.innerHTML="<pre>"+a+"</pre>";b=b.firstChild;g&&J(b,g,!0);K({h:d,j:g,c:b,i:1}); 33 return b.innerHTML},prettyPrint:E.prettyPrint=function(a,d){function g(){for(var b=E.PR_SHOULD_USE_CONTINUATION?c.now()+250:Infinity;i<p.length&&c.now()<b;i++){for(var d=p[i],j=h,k=d;k=k.previousSibling;){var m=k.nodeType,o=(m===7||m===8)&&k.nodeValue;if(o?!/^\??prettify\b/.test(o):m!==3||/\S/.test(k.nodeValue))break;if(o){j={};o.replace(/\b(\w+)=([\w%+\-.:]+)/g,function(a,b,c){j[b]=c});break}}k=d.className;if((j!==h||e.test(k))&&!v.test(k)){m=!1;for(o=d.parentNode;o;o=o.parentNode)if(f.test(o.tagName)&& 34 o.className&&e.test(o.className)){m=!0;break}if(!m){d.className+=" prettyprinted";m=j.lang;if(!m){var m=k.match(n),y;if(!m&&(y=T(d))&&t.test(y.tagName))m=y.className.match(n);m&&(m=m[1])}if(w.test(d.tagName))o=1;else var o=d.currentStyle,u=s.defaultView,o=(o=o?o.whiteSpace:u&&u.getComputedStyle?u.getComputedStyle(d,q).getPropertyValue("white-space"):0)&&"pre"===o.substring(0,3);u=j.linenums;if(!(u=u==="true"||+u))u=(u=k.match(/\blinenums\b(?::(\d+))?/))?u[1]&&u[1].length?+u[1]:!0:!1;u&&J(d,u,o);r= 35 {h:m,c:d,j:u,i:o};K(r)}}}i<p.length?setTimeout(g,250):"function"===typeof a&&a()}for(var b=d||document.body,s=b.ownerDocument||document,b=[b.getElementsByTagName("pre"),b.getElementsByTagName("code"),b.getElementsByTagName("xmp")],p=[],m=0;m<b.length;++m)for(var j=0,k=b[m].length;j<k;++j)p.push(b[m][j]);var b=q,c=Date;c.now||(c={now:function(){return+new Date}});var i=0,r,n=/\blang(?:uage)?-([\w.]+)(?!\S)/,e=/\bprettyprint\b/,v=/\bprettyprinted\b/,w=/pre|xmp/i,t=/^code$/i,f=/^(?:pre|code|xmp)$/i, 36 h={};g()}};typeof define==="function"&&define.amd&&define("google-code-prettify",[],function(){return Y})})();}() 37 </script> 38 <style> 39 .pln{color:#1b181b}.str{color:#918b3b}.kwd{color:#7b59c0}.com{color:#9e8f9e}.typ{color:#516aec}.lit{color:#a65926}.clo,.opn,.pun{color:#1b181b}.tag{color:#ca402b}.atn{color:#a65926}.atv{color:#159393}.dec{color:#a65926}.var{color:#ca402b}.fun{color:#516aec}pre.prettyprint{background:#f7f3f7;color:#ab9bab;font-family:Menlo,Consolas,"Bitstream Vera Sans Mono","DejaVu Sans Mono",Monaco,monospace;font-size:12px;line-height:1.5;border:1px solid #d8cad8;padding:10px}ol.linenums{margin-top:0;margin-bottom:0} 40 body{min-width:200px;max-width:850px;margin:0 auto;padding:30px;}.chapter-nav{font-size: 10pt;}a:link,a:visited{color:#00f}.codeblock_name,code,pre.prettyprint{font-family:Monaco,"Lucida Console",monospace}body{font-size:14pt}.codeblock_name,.math,.seealso,code{font-size:10pt}.codeblock{page-break-inside:avoid;padding-bottom:15px}.math{text-indent:0}pre.prettyprint{font-size:10pt;padding:10px;border-radius:10px;border:none;white-space:pre-wrap}.codeblock_name{margin-top:1.25em;display:block}a:link{text-decoration:none}a:link:not(.lit):hover{color:#00f;text-decoration:underline}a:link:active{color:red}h4{padding-right:1.25em}h4.noheading{margin-bottom:0}h1{text-align:center}code{padding:2px}pre{-moz-tab-size:4;-o-tab-size:4;tab-size:4}p:not(.notp){margin:0;text-indent:2em}.two-col{list-style-type:none}.two-col li:before{content:'-';padding:5px;margin-right:5px;color:orange;background-color:#fff;display:inline-block}@media print{body{font-size:10pt}pre.prettyprint{font-size:8pt}.seealso{font-size:9pt}.codeblock_name,.math,code{font-size:8pt}.math{text-indent:0}} 41 /* code blocks (Style from jmeiners.com/lc3-vm, CC BY-NC-SA 4.0, used with attribution) */ 42 43 /* Quotes and Block Quotes */ 44 blockquote { 45 margin: 1.5em 10px; 46 padding: 0.5em 10px; 47 border-left: 5px solid #ccc; 48 color: #666; 49 background-color: #f9f9f9; 50 font-style: italic; 51 } 52 53 blockquote p { 54 margin: 0; 55 font-size: 1.2em; 56 } 57 58 q { 59 quotes: "“" "”" "‘" "’"; 60 font-style: italic; 61 } 62 63 q::before { 64 content: open-quote; 65 } 66 67 q::after { 68 content: close-quote; 69 } 70 71 /*! Color themes for Google Code Prettify | MIT License | github.com/jmblog/color-themes-for-google-code-prettify */ 72 .prettyprint { 73 background: #f5f7ff; 74 font-family: Menlo, "Bitstream Vera Sans Mono", "DejaVu Sans Mono", Monaco, Consolas, monospace; 75 border: 0 !important; 76 } 77 78 .pln { 79 color: #202746; 80 } 81 82 /* Specify class=linenums on a pre to get line numbering */ 83 ol.linenums { 84 margin-top: 0; 85 margin-bottom: 0; 86 color: #202746; 87 } 88 89 li.L0, 90 li.L1, 91 li.L2, 92 li.L3, 93 li.L4, 94 li.L5, 95 li.L6, 96 li.L7, 97 li.L8, 98 li.L9 { 99 padding-left: 1em; 100 background-color: #f5f7ff; 101 list-style-type: decimal; 102 } 103 104 @media screen { 105 106 /* string content */ 107 108 .str { 109 color: #ac9739; 110 } 111 112 /* keyword */ 113 114 .kwd { 115 color: #6679cc; 116 } 117 118 /* comment */ 119 120 .com { 121 color: #202746; 122 } 123 124 /* type name */ 125 126 .typ { 127 color: #3d8fd1; 128 } 129 130 /* literal value */ 131 132 .lit { 133 color: #c76b29; 134 } 135 136 /* punctuation */ 137 138 .pun { 139 color: #202746; 140 } 141 142 /* lisp open bracket */ 143 144 .opn { 145 color: #202746; 146 } 147 148 /* lisp close bracket */ 149 150 .clo { 151 color: #202746; 152 } 153 154 /* markup tag name */ 155 156 .tag { 157 color: #c94922; 158 } 159 160 /* markup attribute name */ 161 162 .atn { 163 color: #c76b29; 164 } 165 166 /* markup attribute value */ 167 168 .atv { 169 color: #22a2c9; 170 } 171 172 /* declaration */ 173 174 .dec { 175 color: #c76b29; 176 } 177 178 /* variable name */ 179 180 .var { 181 color: #c94922; 182 } 183 184 /* function name */ 185 186 .fun { 187 color: #3d8fd1; 188 } 189 }</style> 190 </head> 191 <body onload="prettyPrint()"> 192 <section> 193 <h1>Lexer</h1> 194 <a name="1:1"><div class="section"><h4>1. General Project Structure</h4></a> 195 <p>Since this is the first article, I'll outline the project structure for the C- compiler. 196 </p> 197 <p>The project has a series of pretty typical stages: 198 </p> 199 <ol> 200 <li>The lexer. This takes a file as input and emits a series of tokens (Its input is already preprocessed, I outsource that to "gcc -E"). 201 </li> 202 <li>The parser. This takes the tokens and builds an abstract syntax tree (AST). 203 </li> 204 <li>The type checker/semantic analyzer. This is used to ensure that the types of variables and functions are correct. 205 </li> 206 <li>The code generator. This takes the AST and generates an intermediate representation (IR). 207 </li> 208 <li>The optimizer. This takes the IR and optimizes it. This'll be broken up into a few stages. 209 </li> 210 <li>The lowerer. This takes the IR and lowers it to a simpler IR. 211 </li> 212 <li>The register allocator. This takes the IR, which has instructions in an infinite number of registers, and assigns them to a finite number of registers. 213 </li> 214 <li>The code emitter. This takes the IR and emits RISC-V assembly. 215 </li> 216 </ol> 217 <p>As far as possible, I'd like to keep each of these stages separate. One benefit of this is that it simplifies memory management greatly. I plan to use an arena allocator for each stage, and by making sure the only thing on the actual heap is the output of the stage, and all temporary data is stored in the arena, I can free all the memory used by a stage by simply freeing the arena. 218 </p> 219 220 </div> 221 <a name="1:2"><div class="section"><h4>2. Some Rules</h4></a> 222 <p>Here are some rules (more like guidelines) that I plan to follow for this project; they're mostly just to keep things simple and consistent. 223 </p> 224 <ol> 225 <li><p>PROGRAM LIKE IT'S 1999 226 </p> 227 </li> 228 </ol> 229 <blockquote><p> 640 KB ought to be enough for anybody. - Bill Gates 230 </p> 231 </blockquote> 232 <p>Maybe not that little, But I'm going to try to keep the project as simple as possible, 640 KB probably won't be enough, but I'll still aim for less than 10 MB of memory usage. 233 </p> 234 <p>This places a lot of constraints on the project, but I think it's a good exercise in minimalism. 235 </p> 236 <p>Some consequences of this are that I'll have to use memory-wise algorithms, be very careful about program structure, and avoid some of the bigger libraries (which will help with making this project self-hosting in the future). 237 </p> 238 <ol> 239 <li><p>PROGRAM IN C++-- 240 </p> 241 </li> 242 </ol> 243 <p>I'm not a big fan of C++, but its class system helps prevent a lot of ugly bugs. To that end, I'm going to try and keep data structures out of header files, and only expose functions that operate on those data structures, to create a sort of approximation of a class. This has a few benefits: 244 </p> 245 <ul> 246 <li>Quicker compilation. A change to a data structure will only require one file to be recompiled, rather than every file that includes the header. 247 </li> 248 <li>Less chance of bugs. If a function is the only way to interact with a data structure, then it's much harder to misuse that data structure. 249 </li> 250 <li>Run time type checking. I can include some sort of tag in the first field of every data structure to ensure that the correct functions are being called. 251 </li> 252 </ul> 253 <ol> 254 <li><p>DON'T GET FANCY 255 </p> 256 </li> 257 </ol> 258 <p>My goal here isn't to write the fastest interpreter in the world, or the most complete. I just want to make something that works and can be understood by someone else. 259 </p> 260 <p>That means I'm going to avoid a lot of the tricks that are used in production interpreters, and focus more on simplicity and readability. 261 </p> 262 <ol> 263 <li><p>DESIGN FOR DEBUGGING 264 </p> 265 </li> 266 </ol> 267 <p>This code is going to be peppered with asserts and contain mechanisms to print out the state of the program at any point. 268 </p> 269 <p>This might be painful, but it'll make debugging a lot simpler and let users look under the hood. 270 </p> 271 <ol> 272 <li><p>SMART DATA, STUPID CODE 273 </p> 274 </li> 275 </ol> 276 <p>A lot of times, the right data structure can replace 50-100 lines of procedural code. I'm going to try and design data structures which make the algorithms as simple as possible. 277 </p> 278 <p>For example, instead of writing 50-100 lines of code to hold every keyword in the language, I can just use a simple hash table. 279 </p> 280 281 </div> 282 <a name="1:3"><div class="section"><h4>3. Misc</h4></a> 283 <p>THIS IS A LITERATE PROGRAM! Go to <a href="https://reagancfischer.dev/projects/cminus/code/lexer.lit">this link</a> to see the file that generated this HTML. 284 </p> 285 286 </div> 287 <a name="1:4"><div class="section"><h4>4. The Lexer</h4></a> 288 <p>A lexical analyzer reads source code and produces tokens, which are the smallest units of meaning in a language. These tokens are then used by the parser to build an abstract syntax tree (AST), which represents the structure of the program. 289 </p> 290 <p>For example, in the C programming language, tokens include keywords (if, else, while, etc.), identifiers (variable names), numbers, and punctuation (braces, semicolons, etc.). 291 </p> 292 <p>Given a string like <code>int main() { return 0; }</code>, the lexer would produce a series of tokens like <code>INT</code>, <code>IDENTIFIER(main)</code>, <code>LPAREN</code>, <code>RPAREN</code>, <code>LBRACE</code>, <code>RETURN</code>, <code>INTCONSTANT(0)</code>, <code>SEMICOLON</code>, <code>RBRACE</code>. 293 </p> 294 295 </div> 296 <a name="1:5"><div class="section"><h4>5. Design</h4></a> 297 <p>I'll break the lexer into several modules: 298 </p> 299 <ul> 300 <li><code>token.c</code> will contain the token data structure and functions to create and destroy tokens. 301 </li> 302 <li><code>input.c</code> will contain the input data structure and functions to read from the input file. 303 </li> 304 <li><code>tokenizer.c</code> will contain the main lexer logic. 305 </li> 306 </ul> 307 308 </div> 309 <a name="1:6"><div class="section"><h4>6. Token Interface</h4></a> 310 <p>We'll need several components to represent a token: 311 </p> 312 <ul> 313 <li>The type of token, which will be an enum with values like <code>TOK_IF</code> or <code>TOK_INTEGER_U32</code>. 314 </li> 315 <li>The value of the token. Some tokens, like keywords, don't have a value. Others, like identifiers or constants, do. 316 </li> 317 <li>The line and column of the token, used for error messages. 318 </li> 319 </ul> 320 <p>To implement a class system in C, we'll hide the token implementation details behind an opaque pointer. We'll use a forward declaration of the token type in the header file and define the token type in the implementation file. 321 </p> 322 323 </div> 324 <a name="1:7"><div class="section"><h4 class="noheading">7. </h4></a> 325 326 <div class="codeblock"> 327 <span class="codeblock_name">{Opaque Token Type <a href="lexer.html#1:7">7</a>}</span> 328 <pre class="prettyprint lang-c"> 329 typedef struct token token_t; 330 </pre> 331 332 333 <p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:41">41</a></p> 334 </div> 335 </div> 336 <a name="1:8"><div class="section"><h4 class="noheading">8. </h4></a> 337 <p>We'll need functions to create and destroy tokens. 338 </p> 339 340 <div class="codeblock"> 341 <span class="codeblock_name">{Token Creation and Destruction Interface <a href="lexer.html#1:8">8</a>}</span> 342 <pre class="prettyprint lang-c"> 343 token_t *token_create(c_token_types kind, int lin, int col, int len); 344 token_t *token_create_int(c_token_types kind, int lin, int col, int64_t i, int len); 345 token_t *token_create_float(c_token_types kind, int lin, int col, double f, int len); 346 token_t *token_create_char(c_token_types kind, int lin, int col, char c, int len); 347 token_t *token_create_string(c_token_types kind, int lin, int col, const char *s, int len); 348 void token_destroy(token_t *token); 349 </pre> 350 351 352 <p class="seealso">Used in section <a href="lexer.html#1:41">41</a></p> 353 </div> 354 </div> 355 <a name="1:9"><div class="section"><h4 class="noheading">9. </h4></a> 356 <p>We'll also need functions to access the token data. 357 </p> 358 359 <div class="codeblock"> 360 <span class="codeblock_name">{Token Interface <a href="lexer.html#1:9">9</a>}</span> 361 <pre class="prettyprint lang-c"> 362 c_token_types token_type(token_t *token); 363 int64_t token_int(token_t *token); 364 double token_float(token_t *token); 365 const char *token_string(token_t *token); 366 char token_char(token_t *token); 367 int token_line(token_t *token); 368 int token_column(token_t *token); 369 void print_token(token_t *tok); 370 </pre> 371 372 373 <p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:41">41</a></p> 374 </div> 375 </div> 376 <a name="1:10"><div class="section"><h4 class="noheading">10. </h4></a> 377 <p>We'll need types to represent the different kinds of tokens. 378 </p> 379 380 <div class="codeblock"> 381 <span class="codeblock_name">{Token Types <a href="lexer.html#1:10">10</a>}</span> 382 <pre class="prettyprint lang-c"> 383 typedef enum { 384 // Control Keywords 385 TOK_IF, 386 TOK_ELSE, 387 TOK_SWITCH, 388 TOK_CASE, 389 TOK_DEFAULT, 390 TOK_WHILE, 391 TOK_DO, 392 TOK_FOR, 393 TOK_CONTINUE, 394 TOK_BREAK, 395 TOK_RETURN, 396 TOK_GOTO, 397 398 // Type Keywords 399 TOK_VOID, 400 TOK_CHAR, 401 TOK_SHORT, 402 TOK_INT, 403 TOK_LONG, 404 TOK_FLOAT, 405 TOK_DOUBLE, 406 TOK_SIGNED, 407 TOK_UNSIGNED, 408 TOK_STRUCT, 409 TOK_UNION, 410 TOK_ENUM, 411 TOK_TYPEDEF, 412 413 // Storage Class/Specifier Keywords 414 TOK_AUTO, 415 TOK_REGISTER, 416 TOK_STATIC, 417 TOK_EXTERN, 418 TOK_CONST, 419 TOK_VOLATILE, 420 421 // Misc Keywords 422 TOK_SIZEOF, 423 424 // Operators 425 TOK_ADD, // + 426 TOK_SUB, // - 427 TOK_MUL, // * 428 TOK_DIV, // / 429 TOK_MOD, // % 430 TOK_BIT_AND, // & 431 TOK_BIT_OR, // | 432 TOK_BIT_XOR, // ^ 433 TOK_BIT_NOT, // ~ 434 TOK_LSHIFT, // << 435 TOK_RSHIFT, // >> 436 TOK_NOT, // ! 437 TOK_ASSIGN, // = 438 TOK_LT, // < 439 TOK_GT, // > 440 TOK_INC, // ++ 441 TOK_DEC, // -- 442 TOK_EQ, // == 443 TOK_NE, // != 444 TOK_LE, // <= 445 TOK_GE, // >= 446 TOK_AND, // && 447 TOK_OR, // || 448 TOK_MEMBER_POINTER, // -> 449 TOK_MEMBER, // . 450 TOK_COND_DECISION, // : 451 TOK_COND, // ? 452 TOK_ASSIGN_ADD, // += 453 TOK_ASSIGN_SUB, // -= 454 TOK_ASSIGN_MUL, // *= 455 TOK_ASSIGN_DIV, // /= 456 TOK_ASSIGN_MOD, // %= 457 TOK_ASSIGN_BITAND, // &= 458 TOK_ASSIGN_BITOR, // |= 459 TOK_ASSIGN_BITXOR, // ^= 460 TOK_ASSIGN_LSHIFT, // <<= 461 TOK_ASSIGN_RSHIFT, // >>= 462 463 // Separators 464 TOK_LEFT_PAREN, // ( 465 TOK_RIGHT_PAREN, // ) 466 TOK_LEFT_BRACKET, // [ 467 TOK_RIGHT_BRACKET, // ] 468 TOK_LEFT_BRACE, // { 469 TOK_RIGHT_BRACE, // } 470 TOK_COMMA, // , 471 TOK_SEMICOLON, // ; 472 TOK_DOT, // . 473 TOK_ELLIPSIS, // ... 474 TOK_HASH, // # 475 476 // Identifiers 477 TOK_ID, 478 TOK_TYPEDEF_NAME, 479 480 // Constants 481 TOK_INTEGER_U32, // u 482 TOK_INTEGER_U64, // ul 483 TOK_INTEGER_S32, // (no suffix) 484 TOK_INTEGER_S64, // l 485 TOK_FLOAT_32, // f 486 TOK_FLOAT_64, // (no suffix) 487 TOK_CHAR_CONST, // 'c' 488 TOK_STRING_ASCII, // "string" (width of 8 bits) 489 490 // Special 491 TOK_EOF, 492 TOK_ERROR, 493 } c_token_types; 494 </pre> 495 496 497 <p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:41">41</a></p> 498 </div> 499 </div> 500 <a name="1:11"><div class="section"><h4 class="noheading">11. </h4></a> 501 <p>We bring this all together in <code>token.h</code>. Line and column are exposed as global variables because <code>skip_whitespace</code> will need to update them. 502 </p> 503 504 <div class="codeblock"> 505 <span class="codeblock_name">{<strong>token.h</strong> <a href="lexer.html#1:11">11</a>}</span> 506 <pre class="prettyprint lang-c"> 507 #ifndef TOKEN_H 508 #define TOKEN_H 509 #include <stdint.h> // For int64_t 510 <span class="nocode pln">{Token Types, <a href="lexer.html#1:10">10</a>}</span> 511 <span class="nocode pln">{Opaque Token Type, <a href="lexer.html#1:7">7</a>}</span> 512 <span class="nocode pln">{Token Creation and Destruction, <a href="lexer.html#1:18">18</a>}</span> 513 <span class="nocode pln">{Token Interface, <a href="lexer.html#1:9">9</a>}</span> 514 extern int column; 515 extern int line; 516 #endif 517 </pre> 518 519 <p class="seealso">Redefined in section <a href="lexer.html#1:41">41</a></p> 520 521 </div> 522 </div> 523 <a name="1:12"><div class="section"><h4>12. Token Implementation</h4></a> 524 <p>Now that we have the interface, we can implement the token data structure. We'll need: 525 </p> 526 <ul> 527 <li>The token type. 528 </li> 529 <li>A way to store extra data. 530 </li> 531 <li>Implementations of the functions defined in the interface. 532 </li> 533 </ul> 534 <p>To verify the token isn't corrupt, we'll use a tag. <code>TOK_MAGIC_1</code> represents a token with optional data, and <code>TOK_MAGIC_2</code> represents a token without optional data. 535 </p> 536 <p>A zero-length array is used in the token data structure. This GCC extension allows us to allocate memory for the token data structure and the token data in one allocation. 537 </p> 538 539 <div class="codeblock"> 540 <span class="codeblock_name">{Token Data Structure <a href="lexer.html#1:12">12</a>}</span> 541 <pre class="prettyprint lang-c"> 542 #define TOK_MAGIC_1 0x544F4B454E544F4Bul // "TOKENTOK" 543 #define TOK_MAGIC_2 0x544F4B544F4B454Eul // "TOKTOKEN" 544 545 struct token { 546 long magic; 547 int line; 548 int column; 549 short kind; 550 long opt_data[0]; 551 }; 552 553 typedef struct token token_t; 554 555 struct token_data { 556 union { 557 int64_t i; 558 double f; 559 const char *s; 560 char c; 561 } data; 562 }; 563 564 typedef struct token_data token_data_t; 565 int column = 1; 566 int line = 1; 567 </pre> 568 569 570 <p class="seealso">Used in section <a href="lexer.html#1:42">42</a></p> 571 </div> 572 </div> 573 <a name="1:13"><div class="section"><h4 class="noheading">13. </h4></a> 574 <p>We'll implement an interface for accessing the token data and a macro for accessing optional data. 575 </p> 576 577 <div class="codeblock"> 578 <span class="codeblock_name">{Token Data Access <a href="lexer.html#1:13">13</a>}</span> 579 <pre class="prettyprint lang-c"> 580 #define token_data(token) ((struct token_data *)((token)->opt_data)) 581 582 c_token_types token_type(token_t *token) { 583 assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2); 584 return token->kind; 585 } 586 587 int64_t token_int(token_t *token) { 588 assert(token->kind == TOK_INTEGER_U32 || token->kind == TOK_INTEGER_U64 || token->kind == TOK_INTEGER_S32 || token->kind == TOK_INTEGER_S64); 589 assert(token->magic == TOK_MAGIC_1); 590 return token_data(token)->data.i; 591 } 592 593 double token_float(token_t *token) { 594 assert(token->kind == TOK_FLOAT_32 || token->kind == TOK_FLOAT_64); 595 assert(token->magic == TOK_MAGIC_1); 596 return token_data(token)->data.f; 597 } 598 599 const char *token_string(token_t *token) { 600 assert(token->kind == TOK_STRING_ASCII || token->kind == TOK_ID || token->kind == TOK_TYPEDEF_NAME); 601 assert(token->magic == TOK_MAGIC_1); 602 return token_data(token)->data.s; 603 } 604 605 char token_char(token_t *token) { 606 assert(token->kind == TOK_CHAR_CONST); 607 assert(token->magic == TOK_MAGIC_1); 608 return token_data(token)->data.c; 609 } 610 611 int token_line(token_t *token) { 612 assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2); 613 return token->line; 614 } 615 616 int token_column(token_t *token) { 617 assert(token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2); 618 return token->column; 619 } 620 </pre> 621 622 623 <p class="seealso">Used in section <a href="lexer.html#1:42">42</a></p> 624 </div> 625 </div> 626 <a name="1:14"><div class="section"><h4 class="noheading">14. </h4></a> 627 <p>For debugging, we'll add a function to print the token type. 628 </p> 629 630 <div class="codeblock"> 631 <span class="codeblock_name">{Token Debugging <a href="lexer.html#1:14">14</a>}</span> 632 <pre class="prettyprint lang-c"> 633 <span class="nocode pln">{Token Type Enum to String, <a href="lexer.html#1:15">15</a>}</span> 634 <span class="nocode pln">{Unescape String, <a href="lexer.html#1:16">16</a>}</span> 635 <span class="nocode pln">{Print Token, <a href="lexer.html#1:17">17</a>}</span> 636 </pre> 637 638 639 <p class="seealso">Used in section <a href="lexer.html#1:42">42</a></p> 640 </div> 641 </div> 642 <a name="1:15"><div class="section"><h4 class="noheading">15. </h4></a> 643 <p>This function returns a string with the token type name. 644 </p> 645 646 <div class="codeblock"> 647 <span class="codeblock_name">{Token Type Enum to String <a href="lexer.html#1:15">15</a>}</span> 648 <pre class="prettyprint lang-c"> 649 const char *token_name_from_type(c_token_types type) { 650 switch (type) { 651 case TOK_IF: 652 return "TOK_IF"; 653 case TOK_ELSE: 654 return "TOK_ELSE"; 655 case TOK_SWITCH: 656 return "TOK_SWITCH"; 657 case TOK_CASE: 658 return "TOK_CASE"; 659 case TOK_DEFAULT: 660 return "TOK_DEFAULT"; 661 case TOK_WHILE: 662 return "TOK_WHILE"; 663 case TOK_DO: 664 return "TOK_DO"; 665 case TOK_FOR: 666 return "TOK_FOR"; 667 case TOK_CONTINUE: 668 return "TOK_CONTINUE"; 669 case TOK_BREAK: 670 return "TOK_BREAK"; 671 case TOK_RETURN: 672 return "TOK_RETURN"; 673 case TOK_GOTO: 674 return "TOK_GOTO"; 675 case TOK_VOID: 676 return "TOK_VOID"; 677 case TOK_CHAR: 678 return "TOK_CHAR"; 679 case TOK_SHORT: 680 return "TOK_SHORT"; 681 case TOK_INT: 682 return "TOK_INT"; 683 case TOK_LONG: 684 return "TOK_LONG"; 685 case TOK_FLOAT: 686 return "TOK_FLOAT"; 687 case TOK_DOUBLE: 688 return "TOK_DOUBLE"; 689 case TOK_SIGNED: 690 return "TOK_SIGNED"; 691 case TOK_UNSIGNED: 692 return "TOK_UNSIGNED"; 693 case TOK_STRUCT: 694 return "TOK_STRUCT"; 695 case TOK_UNION: 696 return "TOK_UNION"; 697 case TOK_ENUM: 698 return "TOK_ENUM"; 699 case TOK_TYPEDEF: 700 return "TOK_TYPEDEF"; 701 case TOK_AUTO: 702 return "TOK_AUTO"; 703 case TOK_REGISTER: 704 return "TOK_REGISTER"; 705 case TOK_STATIC: 706 return "TOK_STATIC"; 707 case TOK_EXTERN: 708 return "TOK_EXTERN"; 709 case TOK_CONST: 710 return "TOK_CONST"; 711 case TOK_VOLATILE: 712 return "TOK_VOLATILE"; 713 case TOK_SIZEOF: 714 return "TOK_SIZEOF"; 715 case TOK_ADD: 716 return "TOK_ADD"; 717 case TOK_SUB: 718 return "TOK_SUB"; 719 case TOK_MUL: 720 return "TOK_MUL"; 721 case TOK_DIV: 722 return "TOK_DIV"; 723 case TOK_MOD: 724 return "TOK_MOD"; 725 case TOK_BIT_AND: 726 return "TOK_BIT_AND"; 727 case TOK_BIT_OR: 728 return "TOK_BIT_OR"; 729 case TOK_BIT_XOR: 730 return "TOK_BIT_XOR"; 731 case TOK_BIT_NOT: 732 return "TOK_BIT_NOT"; 733 case TOK_LSHIFT: 734 return "TOK_LSHIFT"; 735 case TOK_RSHIFT: 736 return "TOK_RSHIFT"; 737 case TOK_NOT: 738 return "TOK_NOT"; 739 case TOK_ASSIGN: 740 return "TOK_ASSIGN"; 741 case TOK_LT: 742 return "TOK_LT"; 743 case TOK_GT: 744 return "TOK_GT"; 745 case TOK_INC: 746 return "TOK_INC"; 747 case TOK_DEC: 748 return "TOK_DEC"; 749 case TOK_EQ: 750 return "TOK_EQ"; 751 case TOK_NE: 752 return "TOK_NE"; 753 case TOK_LE: 754 return "TOK_LE"; 755 case TOK_GE: 756 return "TOK_GE"; 757 case TOK_AND: 758 return "TOK_AND"; 759 case TOK_OR: 760 return "TOK_OR"; 761 case TOK_MEMBER_POINTER: 762 return "TOK_MEMBER_POINTER"; 763 case TOK_MEMBER: 764 return "TOK_MEMBER"; 765 case TOK_COND_DECISION: 766 return "TOK_COND_DECISION"; 767 case TOK_COND: 768 return "TOK_COND"; 769 case TOK_ASSIGN_ADD: 770 return "TOK_ASSIGN_ADD"; 771 case TOK_ASSIGN_SUB: 772 return "TOK_ASSIGN_SUB"; 773 case TOK_ASSIGN_MUL: 774 return "TOK_ASSIGN_MUL"; 775 case TOK_ASSIGN_DIV: 776 return "TOK_ASSIGN_DIV"; 777 case TOK_ASSIGN_MOD: 778 return "TOK_ASSIGN_MOD"; 779 case TOK_ASSIGN_BITAND: 780 return "TOK_ASSIGN_BITAND"; 781 case TOK_ASSIGN_BITOR: 782 return "TOK_ASSIGN_BITOR"; 783 case TOK_ASSIGN_BITXOR: 784 return "TOK_ASSIGN_BITXOR"; 785 case TOK_ASSIGN_LSHIFT: 786 return "TOK_ASSIGN_LSHIFT"; 787 case TOK_ASSIGN_RSHIFT: 788 return "TOK_ASSIGN_RSHIFT"; 789 case TOK_HASH: 790 return "TOK_HASH"; 791 case TOK_ID: 792 return "TOK_ID"; 793 case TOK_TYPEDEF_NAME: 794 return "TOK_TYPEDEF_NAME"; 795 case TOK_INTEGER_U32: 796 return "TOK_INTEGER_U32"; 797 case TOK_INTEGER_U64: 798 return "TOK_INTEGER_U64"; 799 case TOK_INTEGER_S32: 800 return "TOK_INTEGER_S32"; 801 case TOK_INTEGER_S64: 802 return "TOK_INTEGER_S64"; 803 case TOK_FLOAT_32: 804 return "TOK_FLOAT_32"; 805 case TOK_FLOAT_64: 806 return "TOK_FLOAT_64"; 807 case TOK_CHAR_CONST: 808 return "TOK_CHAR_CONST"; 809 case TOK_STRING_ASCII: 810 return "TOK_STRING_ASCII"; 811 case TOK_EOF: 812 return "TOK_EOF"; 813 case TOK_ERROR: 814 return "TOK_ERROR"; 815 case TOK_LEFT_PAREN: 816 return "TOK_LEFT_PAREN"; 817 case TOK_RIGHT_PAREN: 818 return "TOK_RIGHT_PAREN"; 819 case TOK_LEFT_BRACKET: 820 return "TOK_LEFT_BRACKET"; 821 case TOK_RIGHT_BRACKET: 822 return "TOK_RIGHT_BRACKET"; 823 case TOK_LEFT_BRACE: 824 return "TOK_LEFT_BRACE"; 825 case TOK_RIGHT_BRACE: 826 return "TOK_RIGHT_BRACE"; 827 case TOK_COMMA: 828 return "TOK_COMMA"; 829 case TOK_SEMICOLON: 830 return "TOK_SEMICOLON"; 831 case TOK_DOT: 832 return "TOK_DOT"; 833 case TOK_ELLIPSIS: 834 return "TOK_ELLIPSIS"; 835 } 836 return "UNKNOWN"; 837 } 838 </pre> 839 840 841 <p class="seealso">Used in section <a href="lexer.html#1:14">14</a></p> 842 </div> 843 </div> 844 <a name="1:16"><div class="section"><h4 class="noheading">16. </h4></a> 845 <p>This function adds escape characters to a string for printing. 846 </p> 847 848 <div class="codeblock"> 849 <span class="codeblock_name">{Unescape String <a href="lexer.html#1:16">16</a>}</span> 850 <pre class="prettyprint lang-c"> 851 #define clamp(x, min, max) ((x) < (min) ? (min) : (x) > (max) ? (max) : (x)) 852 char *re_escape_string(const char *str) { 853 int len = strlen(str); 854 char *buf = malloc(len * 2 + 1); 855 if (!buf) { 856 fprintf(stderr, "Out of memory. Cannot escape string\n"); 857 exit(1); 858 } 859 int i = 0; 860 for (int j = 0; j < len; j++) { 861 switch (str[j]) { 862 case '\a': buf[i++] = '\\'; buf[i++] = 'a'; break; 863 case '\b': buf[i++] = '\\'; buf[i++] = 'b'; break; 864 case '\f': buf[i++] = '\\'; buf[i++] = 'f'; break; 865 case '\n': buf[i++] = '\\'; buf[i++] = 'n'; break; 866 case '\r': buf[i++] = '\\'; buf[i++] = 'r'; break; 867 case '\t': buf[i++] = '\\'; buf[i++] = 't'; break; 868 case '\v': buf[i++] = '\\'; buf[i++] = 'v'; break; 869 case '\\': buf[i++] = '\\'; buf[i++] = '\\'; break; 870 case '\'': buf[i++] = '\\'; buf[i++] = '\''; break; 871 case '"': buf[i++] = '\\'; buf[i++] = '"'; break; 872 default: { 873 if (isprint(str[j])) { 874 buf[i++] = str[j]; 875 } else { 876 buf[i++] = '\\'; 877 buf[i++] = 'x'; 878 buf[i++] = "0123456789abcdef"[clamp(str[j] >> 4, 0, 0xf)]; 879 buf[i++] = "0123456789abcdef"[clamp(str[j] & 0xf, 0, 0xf)]; 880 } 881 } 882 } 883 } 884 buf[i] = '\0'; 885 return buf; 886 } 887 </pre> 888 889 890 <p class="seealso">Used in section <a href="lexer.html#1:14">14</a></p> 891 </div> 892 </div> 893 <a name="1:17"><div class="section"><h4 class="noheading">17. </h4></a> 894 <p>This function prints the token type and value. 895 </p> 896 897 <div class="codeblock"> 898 <span class="codeblock_name">{Print Token <a href="lexer.html#1:17">17</a>}</span> 899 <pre class="prettyprint lang-c"> 900 void print_token(token_t *tok) { 901 if (!tok) { 902 printf("NULL\n"); 903 return; 904 } 905 const char *name = token_name_from_type(tok->kind); 906 switch (tok->kind) { 907 case TOK_ID: 908 case TOK_STRING_ASCII: { 909 char *escaped = re_escape_string(token_string(tok)); 910 printf("%s: \"%s\"@%d:%d\n", name, escaped, tok->line, tok->column); 911 free(escaped); 912 break; 913 } 914 case TOK_TYPEDEF_NAME: { 915 char *escaped = re_escape_string(token_string(tok)); 916 printf("%s: %s@%d:%d\n", name, escaped, tok->line, tok->column); 917 free(escaped); 918 break; 919 } 920 case TOK_CHAR_CONST: 921 printf("%s: '%c'@%d:%d\n", name, token_char(tok), tok->line, tok->column); 922 break; 923 case TOK_INTEGER_S32: 924 case TOK_INTEGER_U32: 925 case TOK_INTEGER_S64: 926 case TOK_INTEGER_U64: 927 printf("%s: %ld@%d:%d\n", name, token_int(tok), tok->line, tok->column); 928 break; 929 case TOK_FLOAT_32: 930 case TOK_FLOAT_64: 931 printf("%s: %f@%d:%d\n", name, token_float(tok), tok->line, tok->column); 932 break; 933 default: 934 printf("%s@%d:%d\n", name, tok->line, tok->column); 935 break; 936 } 937 } 938 </pre> 939 940 941 <p class="seealso">Used in section <a href="lexer.html#1:14">14</a></p> 942 </div> 943 </div> 944 <a name="1:18"><div class="section"><h4 class="noheading">18. </h4></a> 945 <p>Now we can implement functions to create and destroy tokens. We'll start with the easy ones. 946 </p> 947 948 <div class="codeblock"> 949 <span class="codeblock_name">{Token Creation and Destruction <a href="lexer.html#1:18">18</a>}</span> 950 <pre class="prettyprint lang-c"> 951 token_t *token_data_create(c_token_types kind, int lin, int col, int len) { 952 token_t *token = malloc(sizeof(token_t) + sizeof(struct token_data)); 953 if (!token) { 954 fputs("Out of memory\n", stderr); 955 exit(1); 956 } 957 token->magic = TOK_MAGIC_1; 958 token->line = lin; 959 token->column = col; 960 column += len; 961 token->kind = kind; 962 return token; 963 } 964 965 token_t *token_create(c_token_types kind, int lin, int col, int len) { 966 token_t *token = malloc(sizeof(token_t)); 967 if (!token) { 968 fputs("Out of memory\n", stderr); 969 exit(1); 970 } 971 token->magic = TOK_MAGIC_2; 972 token->line = lin; 973 token->column = col; 974 column += len; 975 token->kind = kind; 976 return token; 977 } 978 979 token_t *token_create_int(c_token_types kind, int lin, int col, int64_t i, int len) { 980 token_t *token = token_data_create(kind, lin, col, len); 981 token_data(token)->data.i = i; 982 return token; 983 } 984 985 token_t *token_create_float(c_token_types kind, int lin, int col, double f, int len) { 986 token_t *token = token_data_create(kind, lin, col, len); 987 token_data(token)->data.f = f; 988 return token; 989 } 990 991 token_t *token_create_char(c_token_types kind, int lin, int col, char c, int len) { 992 token_t *token = token_data_create(kind, lin, col, len); 993 token_data(token)->data.c = c; 994 return token; 995 } 996 997 void token_destroy(token_t *token) { 998 if (token->magic == TOK_MAGIC_1 || token->magic == TOK_MAGIC_2) { 999 free(token); 1000 } else { 1001 fputs("Corrupt token\n", stderr); 1002 exit(1); 1003 } 1004 } 1005 </pre> 1006 1007 1008 <p class="seealso">Used in sections <a href="lexer.html#1:11">11</a> and <a href="lexer.html#1:42">42</a></p> 1009 </div> 1010 </div> 1011 <a name="1:19"><div class="section"><h4 class="noheading">19. </h4></a> 1012 <p><code>token_create_string</code> can be implemented either the easy way or the right way. Let's try the easy way. 1013 </p> 1014 1015 <div class="codeblock"> 1016 <span class="codeblock_name">{Token Create String <a href="lexer.html#1:19">19</a>}</span> 1017 <pre class="prettyprint lang-c"> 1018 token_t *token_create_string(c_token_types kind, int lin, int col, const char *s, int len) { 1019 token_t *token = token_create(kind, lin, col, len); 1020 token_data(token)->data.s = strdup(s); 1021 return token; 1022 } 1023 </pre> 1024 1025 <p class="seealso">Redefined in section <a href="lexer.html#1:40">40</a></p> 1026 <p class="seealso">Used in section <a href="lexer.html#1:42">42</a></p> 1027 </div> 1028 </div> 1029 <a name="1:20"><div class="section"><h4 class="noheading">20. </h4></a> 1030 <p>There's an issue with this approach. <code>token_create_string</code> will be called for every identifier and every string in a program. Imagine a large program, say a shell, with a bunch of user input and output. That program will likely have 20-40 calls to <code>fprintf</code>, <code>fscanf</code>, <code>strchr</code>, <code>strtok</code>, each. We create a new string for each of those calls. That's a lot of duplicates, and can quickly add up to a lot of memory usage. 1031 </p> 1032 <p>To fix this, we use a hash table to store the strings. We'll define a hash table in <code>hash_table.h</code> and <code>hash_table.c</code>. 1033 </p> 1034 1035 </div> 1036 <a name="1:21"><div class="section"><h4>21. Hash Table</h4></a> 1037 <p>A hash table is a data structure that maps keys to values. It's commonly used for implementing symbol tables to store variables and functions. To create a generic hash table, we'll need: 1038 </p> 1039 <ol> 1040 <li>A hash function to convert keys into array indices 1041 </li> 1042 <li>A comparison function to check if two keys are equal 1043 </li> 1044 <li>An opaque type to represent the hash table 1045 </li> 1046 <li>A destructor function to clean up keys and values when removing entries 1047 </li> 1048 </ol> 1049 <p>Let's start with the interface: 1050 </p> 1051 1052 </div> 1053 <a name="1:22"><div class="section"><h4 class="noheading">22. </h4></a> 1054 1055 <div class="codeblock"> 1056 <span class="codeblock_name">{Hash Table Opaque Types <a href="lexer.html#1:22">22</a>}</span> 1057 <pre class="prettyprint lang-c"> 1058 typedef struct hash_table hash_table_t; 1059 typedef int (*hash_table_cmp_fn)(const void *key1, const void *key2); 1060 typedef unsigned int (*hash_table_hash_fn)(const void *key); 1061 typedef void (*hash_table_dtor)(void *data, int is_key); 1062 </pre> 1063 1064 1065 <p class="seealso">Used in section <a href="lexer.html#1:25">25</a></p> 1066 </div> 1067 </div> 1068 <a name="1:23"><div class="section"><h4 class="noheading">23. </h4></a> 1069 1070 <div class="codeblock"> 1071 <span class="codeblock_name">{Hash Table Creation and Destruction <a href="lexer.html#1:23">23</a>}</span> 1072 <pre class="prettyprint lang-c"> 1073 hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash, hash_table_dtor dtor); 1074 void hash_table_destroy(hash_table_t *table); 1075 </pre> 1076 1077 1078 <p class="seealso">Used in section <a href="lexer.html#1:25">25</a></p> 1079 </div> 1080 </div> 1081 <a name="1:24"><div class="section"><h4 class="noheading">24. </h4></a> 1082 1083 <div class="codeblock"> 1084 <span class="codeblock_name">{Hash Table Access <a href="lexer.html#1:24">24</a>}</span> 1085 <pre class="prettyprint lang-c"> 1086 void *hash_table_get(const hash_table_t *table, const void *key); 1087 void hash_table_put(hash_table_t *table, void *key, void *value); 1088 void hash_table_remove(hash_table_t *table, const void *key); 1089 </pre> 1090 1091 1092 <p class="seealso">Used in section <a href="lexer.html#1:25">25</a></p> 1093 </div> 1094 </div> 1095 <a name="1:25"><div class="section"><h4 class="noheading">25. </h4></a> 1096 1097 <div class="codeblock"> 1098 <span class="codeblock_name">{<strong>hash_table.h</strong> <a href="lexer.html#1:25">25</a>}</span> 1099 <pre class="prettyprint lang-c"> 1100 #ifndef HASH_TABLE_H 1101 #define HASH_TABLE_H 1102 1103 <span class="nocode pln">{Hash Table Opaque Types, <a href="lexer.html#1:22">22</a>}</span> 1104 <span class="nocode pln">{Hash Table Creation and Destruction, <a href="lexer.html#1:23">23</a>}</span> 1105 <span class="nocode pln">{Hash Table Access, <a href="lexer.html#1:24">24</a>}</span> 1106 1107 #endif /* HASH_TABLE_H */ 1108 </pre> 1109 1110 1111 1112 </div> 1113 </div> 1114 <a name="1:26"><div class="section"><h4 class="noheading">26. </h4></a> 1115 <p>Now let's implement the hash table: 1116 </p> 1117 1118 <div class="codeblock"> 1119 <span class="codeblock_name">{<strong>hash_table.c</strong> <a href="lexer.html#1:26">26</a>}</span> 1120 <pre class="prettyprint lang-c"> 1121 #include <stdlib.h> 1122 #include <string.h> 1123 #include <stdio.h> 1124 #include "hash_table.h" 1125 1126 <span class="nocode pln">{Hash Table Data Structure, <a href="lexer.html#1:27">27</a>}</span> 1127 <span class="nocode pln">{Hash Table Entry Data Structure, <a href="lexer.html#1:28">28</a>}</span> 1128 1129 hash_table_t *hash_table_create(int size, hash_table_cmp_fn cmp, hash_table_hash_fn hash, hash_table_dtor dtor) { 1130 <span class="nocode pln"> {Allocate and Initialize Hash Table, <a href="lexer.html#1:29">29</a>}</span> 1131 return table; 1132 } 1133 1134 void hash_table_destroy(hash_table_t *table) { 1135 if (!table) return; 1136 <span class="nocode pln"> {Destroy Entries, <a href="lexer.html#1:30">30</a>}</span> 1137 free(table->entries); 1138 free(table); 1139 } 1140 1141 void *hash_table_get(const hash_table_t *table, const void *key) { 1142 if (!table || !key) return NULL; 1143 <span class="nocode pln"> {Get Entry By Hash, <a href="lexer.html#1:31">31</a>}</span> 1144 <span class="nocode pln"> {Loop Through Entries and Return Value if Match, <a href="lexer.html#1:35">35</a>}</span> 1145 return NULL; 1146 } 1147 1148 void hash_table_put(hash_table_t *table, void *key, void *value) { 1149 if (!table || !key) return; 1150 <span class="nocode pln"> {Get Entry By Hash, <a href="lexer.html#1:31">31</a>}</span> 1151 <span class="nocode pln"> {Loop Through Entries and Replace Value if Key Matches, <a href="lexer.html#1:32">32</a>}</span> 1152 <span class="nocode pln"> {Allocate New Entry if No Match, <a href="lexer.html#1:33">33</a>}</span> 1153 } 1154 1155 void hash_table_remove(hash_table_t *table, const void *key) { 1156 if (!table || !key) return; 1157 <span class="nocode pln"> {Get Entry By Hash, <a href="lexer.html#1:31">31</a>}</span> 1158 <span class="nocode pln"> {Loop Through Entries and Remove Entry if Key Matches, <a href="lexer.html#1:34">34</a>}</span> 1159 } 1160 1161 #ifdef TEST_HASH_TABLE 1162 #include <assert.h> 1163 1164 static int string_cmp(const void *key1, const void *key2) { 1165 return strcmp((const char *)key1, (const char *)key2); 1166 } 1167 1168 static unsigned int string_hash(const void *key) { 1169 unsigned int hash = 5381; 1170 const unsigned char *str = key; 1171 int c; 1172 1173 while ((c = *str++)) 1174 { 1175 hash = ((hash << 5) + hash) + c; 1176 } 1177 1178 return hash; 1179 } 1180 1181 int main(void) { 1182 hash_table_t *table = hash_table_create(16, string_cmp, string_hash, NULL); 1183 assert(table != NULL); 1184 1185 hash_table_put(table, "foo", "bar"); 1186 hash_table_put(table, "foo", "baz"); 1187 assert(strcmp((const char *)hash_table_get(table, "foo"), "baz") == 0); 1188 1189 hash_table_remove(table, "foo"); 1190 assert(hash_table_get(table, "foo") == NULL); 1191 1192 hash_table_destroy(table); 1193 printf("All tests passed!\n"); 1194 return 0; 1195 } 1196 #endif 1197 </pre> 1198 1199 1200 1201 </div> 1202 </div> 1203 <a name="1:27"><div class="section"><h4 class="noheading">27. </h4></a> 1204 <p>The hash table data structure contains an array of entry pointers, the size of the array, and function pointers for comparison, hashing, and destruction. 1205 </p> 1206 1207 <div class="codeblock"> 1208 <span class="codeblock_name">{Hash Table Data Structure <a href="lexer.html#1:27">27</a>}</span> 1209 <pre class="prettyprint lang-c"> 1210 struct hash_table { 1211 struct hash_table_entry **entries; 1212 int size; 1213 hash_table_cmp_fn cmp; 1214 hash_table_hash_fn hash; 1215 hash_table_dtor dtor; 1216 }; 1217 </pre> 1218 1219 1220 <p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p> 1221 </div> 1222 </div> 1223 <a name="1:28"><div class="section"><h4 class="noheading">28. </h4></a> 1224 <p>Each entry in the hash table contains a key, a value, and a pointer to the next entry in the chain for collision resolution via chaining. 1225 </p> 1226 1227 <div class="codeblock"> 1228 <span class="codeblock_name">{Hash Table Entry Data Structure <a href="lexer.html#1:28">28</a>}</span> 1229 <pre class="prettyprint lang-c"> 1230 struct hash_table_entry { 1231 void *key; 1232 void *value; 1233 struct hash_table_entry *next; 1234 }; 1235 </pre> 1236 1237 1238 <p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p> 1239 </div> 1240 </div> 1241 <a name="1:29"><div class="section"><h4 class="noheading">29. </h4></a> 1242 <p>To allocate a hash table, we allocate memory for the table structure and its entries, initialize the entries to NULL, and set the function pointers. 1243 </p> 1244 1245 <div class="codeblock"> 1246 <span class="codeblock_name">{Allocate and Initialize Hash Table <a href="lexer.html#1:29">29</a>}</span> 1247 <pre class="prettyprint lang-c"> 1248 hash_table_t *table = malloc(sizeof(struct hash_table)); 1249 if (!table) { 1250 fprintf(stderr, "Error: Out of memory, could not allocate hash table\n"); 1251 exit(EXIT_FAILURE); 1252 } 1253 table->entries = calloc(size, sizeof(struct hash_table_entry *)); 1254 if (!table->entries) { 1255 fprintf(stderr, "Error: Out of memory, could not allocate hash table entries\n"); 1256 free(table); 1257 exit(EXIT_FAILURE); 1258 } 1259 table->size = size; 1260 table->cmp = cmp; 1261 table->hash = hash; 1262 table->dtor = dtor; 1263 </pre> 1264 1265 1266 <p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p> 1267 </div> 1268 </div> 1269 <a name="1:30"><div class="section"><h4 class="noheading">30. </h4></a> 1270 <p>To destroy the entries in a hash table, we iterate through all entries, free the keys and values using the destructor if provided, and free the entry itself. 1271 </p> 1272 1273 <div class="codeblock"> 1274 <span class="codeblock_name">{Destroy Entries <a href="lexer.html#1:30">30</a>}</span> 1275 <pre class="prettyprint lang-c"> 1276 for (int i = 0; i < table->size; i++) { 1277 struct hash_table_entry *entry = table->entries[i]; 1278 while (entry) { 1279 struct hash_table_entry *next = entry->next; 1280 if (table->dtor) { 1281 table->dtor(entry->key, 1); 1282 table->dtor(entry->value, 0); 1283 } 1284 free(entry); 1285 entry = next; 1286 } 1287 } 1288 </pre> 1289 1290 1291 <p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p> 1292 </div> 1293 </div> 1294 <a name="1:31"><div class="section"><h4 class="noheading">31. </h4></a> 1295 <p>To retrieve an entry's hash bucket, we apply the hash function to the key and take the modulus of the result with the table size. 1296 </p> 1297 1298 <div class="codeblock"> 1299 <span class="codeblock_name">{Get Entry By Hash <a href="lexer.html#1:31">31</a>}</span> 1300 <pre class="prettyprint lang-c"> 1301 unsigned int hash = table->hash(key) % table->size; 1302 struct hash_table_entry *entry = table->entries[hash]; 1303 </pre> 1304 1305 1306 <p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p> 1307 </div> 1308 </div> 1309 <a name="1:32"><div class="section"><h4 class="noheading">32. </h4></a> 1310 <p>When putting a new entry in the table, we first check if the key already exists. If it does, we update the value; otherwise, we create a new entry. 1311 </p> 1312 1313 <div class="codeblock"> 1314 <span class="codeblock_name">{Loop Through Entries and Replace Value if Key Matches <a href="lexer.html#1:32">32</a>}</span> 1315 <pre class="prettyprint lang-c"> 1316 while (entry) { 1317 if (table->cmp(entry->key, key) == 0) { 1318 if (table->dtor) table->dtor(entry->value, 0); 1319 entry->value = value; 1320 return; 1321 } 1322 entry = entry->next; 1323 } 1324 </pre> 1325 1326 1327 <p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p> 1328 </div> 1329 </div> 1330 <a name="1:33"><div class="section"><h4 class="noheading">33. </h4></a> 1331 <p>If no matching key is found, we create a new entry and insert it at the beginning of the hash bucket. 1332 </p> 1333 <p>This can possibly improve performance due to a property called temporal locality. When we access an entry, we're likely to access it again soon. Since the entry is at the beginning of the list, it's likely to be accessed again soon. 1334 </p> 1335 1336 <div class="codeblock"> 1337 <span class="codeblock_name">{Allocate New Entry if No Match <a href="lexer.html#1:33">33</a>}</span> 1338 <pre class="prettyprint lang-c"> 1339 struct hash_table_entry *new_entry = malloc(sizeof(struct hash_table_entry)); 1340 if (!new_entry) { 1341 fprintf(stderr, "Error: Out of memory, could not allocate hash table entry\n"); 1342 exit(EXIT_FAILURE); 1343 } 1344 new_entry->key = key; 1345 new_entry->value = value; 1346 new_entry->next = table->entries[hash]; 1347 table->entries[hash] = new_entry; 1348 </pre> 1349 1350 1351 <p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p> 1352 </div> 1353 </div> 1354 <a name="1:34"><div class="section"><h4 class="noheading">34. </h4></a> 1355 <p>To remove an entry, we find its bucket, update the linked list to bypass it, then free the entry and its contents. 1356 </p> 1357 1358 <div class="codeblock"> 1359 <span class="codeblock_name">{Loop Through Entries and Remove Entry if Key Matches <a href="lexer.html#1:34">34</a>}</span> 1360 <pre class="prettyprint lang-c"> 1361 struct hash_table_entry *prev = NULL; 1362 while (entry) { 1363 if (table->cmp(entry->key, key) == 0) { 1364 if (prev) 1365 prev->next = entry->next; 1366 else 1367 table->entries[hash] = entry->next; 1368 1369 if (table->dtor) { 1370 table->dtor(entry->key, 1); 1371 table->dtor(entry->value, 0); 1372 } 1373 free(entry); 1374 return; 1375 } 1376 prev = entry; 1377 entry = entry->next; 1378 } 1379 </pre> 1380 1381 1382 <p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p> 1383 </div> 1384 </div> 1385 <a name="1:35"><div class="section"><h4 class="noheading">35. </h4></a> 1386 <p>To retrieve a value from a given bucket, we just walk the list and return the value if a matching key is found. 1387 </p> 1388 1389 <div class="codeblock"> 1390 <span class="codeblock_name">{Loop Through Entries and Return Value if Match <a href="lexer.html#1:35">35</a>}</span> 1391 <pre class="prettyprint lang-c"> 1392 while (entry) { 1393 if (table->cmp(entry->key, key) == 0) { 1394 return entry->value; 1395 } 1396 entry = entry->next; 1397 } 1398 </pre> 1399 1400 1401 <p class="seealso">Used in section <a href="lexer.html#1:26">26</a></p> 1402 </div> 1403 </div> 1404 <a name="1:36"><div class="section"><h4 class="noheading">36. </h4></a> 1405 <p>We're now almost ready to implement <code>token_create_string</code> the right way. First, we'll need a good hash function. 1406 </p> 1407 <p>Hash functions are a very interesting topic and there's a lot of good research on them. The hash function we use should be fast, have a low collision rate, and be able to handle strings of any length. 1408 </p> 1409 <p>We can't just sum the characters in a string, because that would mean that "stop" and "pots" would have the same hash. Multiplying has the same problem. If we take each to the power of its position in the string, we get a better distribution, but it's still awful. 1410 </p> 1411 <p>Using a simple python program, I brute-forced all possible 4-character strings and ran our power-hash function on them, the result showed that for 456976 possible strings, only 3760 were unique, which is terrible. 1412 </p> 1413 <p>Instead of trying to come up with a new hash function, we can use one that's been well-tested and is known to work well. 1414 </p> 1415 <p>The first time I wrote this, I used the hash function from the 'Red Dragon Book' (Compilers: Principles, Techniques, and Tools). 1416 </p> 1417 1418 <div class="codeblock"> 1419 <span class="codeblock_name">{Hash Function <a href="lexer.html#1:36">36</a>}</span> 1420 <pre class="prettyprint lang-c"> 1421 unsigned long hash_string(void *key) { 1422 unsigned long hash = 0, g; 1423 char *p = key; 1424 while (*p) { 1425 hash = (hash \<\< 4) + *p++; 1426 if ((g = hash \& 0xf0000000) != 0) { 1427 hash ^= g \>\> 24; 1428 hash ^= g; 1429 } 1430 } 1431 return hash; 1432 } 1433 </pre> 1434 1435 1436 <p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p> 1437 </div> 1438 <p>This is a bit slow on modern processors because it's not very cache-friendly. We can do better. Let's use its child, ELFHash, from libc. 1439 </p> 1440 <p>As you can see in the code below, this function avoids extra operations and should be much faster. 1441 </p> 1442 1443 <div class="codeblock"> 1444 <span class="codeblock_name">{Hash Function <a href="lexer.html#1:36">36</a>} :=</span> 1445 <pre class="prettyprint lang-c"> 1446 unsigned int hash_string(const void *key) { 1447 unsigned long hash = 0, hi = 0; 1448 const char *p = key; 1449 hash = *p; 1450 if (hash != 0 && p[1] != 0) { 1451 hash = (hash << 4) + p[1]; 1452 if (p[2] != 0) { 1453 hash = (hash << 4) + p[2]; 1454 if (p[3] != 0) { 1455 hash = (hash << 4) + p[3]; 1456 if (p[4] != 0) { 1457 hash = (hash << 4) + p[4]; 1458 p += 5; 1459 while (*p != 0) { 1460 hash = (hash << 4) + *p++; 1461 hi = hash & 0xf0000000l; 1462 hash ^= hi >> 24; 1463 } 1464 hash &= 0x0fffffffl; 1465 } 1466 } 1467 } 1468 } 1469 return hash; 1470 } 1471 </pre> 1472 1473 1474 <p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p> 1475 </div> 1476 </div> 1477 <a name="1:37"><div class="section"><h4 class="noheading">37. </h4></a> 1478 <p>We also need a comparison function for strings. 1479 </p> 1480 1481 <div class="codeblock"> 1482 <span class="codeblock_name">{String Comparison <a href="lexer.html#1:37">37</a>}</span> 1483 <pre class="prettyprint lang-c"> 1484 int cmp_string(const void *key1, const void *key2) { 1485 return strcmp((char *)key1, (char *)key2); 1486 } 1487 </pre> 1488 1489 1490 <p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p> 1491 </div> 1492 </div> 1493 <a name="1:38"><div class="section"><h4 class="noheading">38. </h4></a> 1494 <p>Finally, we'll need a destructor for entries. 1495 </p> 1496 1497 <div class="codeblock"> 1498 <span class="codeblock_name">{String Destructor <a href="lexer.html#1:38">38</a>}</span> 1499 <pre class="prettyprint lang-c"> 1500 void dtor_string(void *value, int is_key) { 1501 if (is_key) { 1502 free(value); // Since the key and value are the same, we only need to free once. 1503 } 1504 } 1505 </pre> 1506 1507 1508 <p class="seealso">Used in section <a href="lexer.html#1:39">39</a></p> 1509 </div> 1510 </div> 1511 <a name="1:39"><div class="section"><h4 class="noheading">39. </h4></a> 1512 <p>These functions go in util.c 1513 </p> 1514 1515 <div class="codeblock"> 1516 <span class="codeblock_name">{<strong>util.c</strong> <a href="lexer.html#1:39">39</a>}</span> 1517 <pre class="prettyprint lang-c"> 1518 #include <string.h> 1519 #include <stdlib.h> 1520 #include "hash_table.h" 1521 <span class="nocode pln">{String Comparison, <a href="lexer.html#1:37">37</a>}</span> 1522 <span class="nocode pln">{Hash Function, <a href="lexer.html#1:36">36</a>}</span> 1523 <span class="nocode pln">{String Destructor, <a href="lexer.html#1:38">38</a>}</span> 1524 </pre> 1525 1526 1527 1528 </div> 1529 1530 <div class="codeblock"> 1531 <span class="codeblock_name">{<strong>util.h</strong> <a href="lexer.html#1:39">39</a>}</span> 1532 <pre class="prettyprint lang-c"> 1533 #ifndef UTIL_H 1534 #define UTIL_H 1535 #include "hash_table.h" 1536 int cmp_string(const void *key1, const void *key2); 1537 unsigned int hash_string(const void *key); 1538 void dtor_string(void *value, int is_key); 1539 #endif 1540 </pre> 1541 1542 1543 1544 </div> 1545 </div> 1546 <a name="1:40"><div class="section"><h4 class="noheading">40. </h4></a> 1547 <p>Now we can implement <code>token_create_string</code> the right way. 1548 </p> 1549 <p>You might notice that we're using the same key and value. This way of using a hash table is normally called a set. We're using it to store strings, but we could use it to store anything we want to deduplicate. 1550 </p> 1551 1552 <div class="codeblock"> 1553 <span class="codeblock_name">{Token Create String <a href="lexer.html#1:19">19</a>} :=</span> 1554 <pre class="prettyprint lang-c"> 1555 hash_table_t *string_table; 1556 token_t *token_create_string(c_token_types kind, int lin, int col, 1557 const char *s, int len) { 1558 if (string_table == NULL) { 1559 string_table = hash_table_create(2048, cmp_string, hash_string, dtor_string); 1560 } 1561 token_t *token = token_data_create(kind, lin, col, len); 1562 char *key = hash_table_get(string_table, (void *)s); 1563 if (key == NULL) { 1564 key = strdup(s); 1565 hash_table_put(string_table, key, key); 1566 } 1567 token_data(token)->data.s = key; 1568 return token; 1569 } 1570 </pre> 1571 1572 1573 <p class="seealso">Used in section <a href="lexer.html#1:42">42</a></p> 1574 </div> 1575 </div> 1576 <a name="1:41"><div class="section"><h4 class="noheading">41. </h4></a> 1577 <p>We'll add an external declaration for <code>string_table</code> in <code>token.h</code> so other programs can take advantage of it. 1578 </p> 1579 1580 <div class="codeblock"> 1581 <span class="codeblock_name">{<strong>token.h</strong> <a href="lexer.html#1:11">11</a>} :=</span> 1582 <pre class="prettyprint lang-c"> 1583 #ifndef TOKEN_H 1584 #define TOKEN_H 1585 #include <stdint.h> // We use this for int64_t 1586 #include "hash_table.h" // We need this for the string table 1587 <span class="nocode pln">{Token Types, <a href="lexer.html#1:10">10</a>}</span> 1588 <span class="nocode pln">{Opaque Token Type, <a href="lexer.html#1:7">7</a>}</span> 1589 <span class="nocode pln">{Token Creation and Destruction Interface, <a href="lexer.html#1:8">8</a>}</span> 1590 <span class="nocode pln">{Token Interface, <a href="lexer.html#1:9">9</a>}</span> 1591 extern hash_table_t *string_table; 1592 extern int column; 1593 extern int line; 1594 #endif 1595 </pre> 1596 1597 1598 1599 </div> 1600 </div> 1601 <a name="1:42"><div class="section"><h4 class="noheading">42. </h4></a> 1602 <p>Finally, we implement the token data structure in <code>token.c</code>. 1603 </p> 1604 1605 <div class="codeblock"> 1606 <span class="codeblock_name">{<strong>token.c</strong> <a href="lexer.html#1:42">42</a>}</span> 1607 <pre class="prettyprint lang-c"> 1608 #include <stdlib.h> 1609 #include <string.h> 1610 #include <stdio.h> 1611 #include <assert.h> 1612 #include <ctype.h> 1613 #include "token.h" 1614 #include "hash_table.h" 1615 #include "util.h" 1616 <span class="nocode pln">{Token Data Structure, <a href="lexer.html#1:12">12</a>}</span> 1617 <span class="nocode pln">{Token Data Access, <a href="lexer.html#1:13">13</a>}</span> 1618 <span class="nocode pln">{Token Creation and Destruction, <a href="lexer.html#1:18">18</a>}</span> 1619 <span class="nocode pln">{Token Create String, <a href="lexer.html#1:19">19</a>}</span> 1620 <span class="nocode pln">{Token Debugging, <a href="lexer.html#1:14">14</a>}</span> 1621 </pre> 1622 1623 1624 1625 </div> 1626 </div> 1627 <a name="1:43"><div class="section"><h4>43. Input</h4></a> 1628 <p>Input will provide a simple interface for reading characters from a file. The stream itself is deliberately hidden from the tokenizer, so that the tokenizer doesn't have to worry about buffering or anything like that. 1629 </p> 1630 1631 </div> 1632 <a name="1:44"><div class="section"><h4 class="noheading">44. </h4></a> 1633 1634 <div class="codeblock"> 1635 <span class="codeblock_name">{Input Interface <a href="lexer.html#1:44">44</a>}</span> 1636 <pre class="prettyprint lang-c"> 1637 void input_init(const char *filename); 1638 int input_getc(void); 1639 void input_ungetc(int c); 1640 void input_destroy(void); 1641 </pre> 1642 1643 1644 <p class="seealso">Used in section <a href="lexer.html#1:53">53</a></p> 1645 </div> 1646 <p>When the program wants to start reading a file, it calls <code>input_init</code> with the filename. It can then call <code>input_getc</code> to get the next character in the file. If there's no more input, <code>input_getc</code> will return <code>EOF</code>. 1647 </p> 1648 <p>There's also an <code>input_ungetc</code> function, which allows the program to put a character back into the stream. I'll only allow one character to be put back, but that should be enough for the tokenizer. 1649 </p> 1650 <p>Finally, when the program is done reading the file, it should call <code>input_destroy</code> to clean up. 1651 </p> 1652 1653 </div> 1654 <a name="1:45"><div class="section"><h4>45. Input Design Decisions</h4></a> 1655 <p>Per rule 1, we're trying to keep memory usage low. That means that instead of reading the entire file into memory, we'll need to read it in chunks. There are a couple of choices for how to do this: 1656 </p> 1657 <ol> 1658 <li><p><strong>Read a line at a time</strong>: This is a more natural approach, but it has two drawbacks. First, it requires a large buffer to store the line (C normally specifies <code>BUFSIZ</code> as 8192 bytes). Second, if the line is longer than <code>BUFSIZ</code>, we'll have to read the line in chunks anyway. 1659 </p> 1660 </li> 1661 <li><p><strong>Choose some arbitrary buffer size and read that many bytes at a time</strong>: This is the approach I'm going to take. It's a little less natural, but it's more memory efficient. 1662 </p> 1663 </li> 1664 </ol> 1665 <p>Input will read chunks of 128 bytes at a time, reusing the same static buffer. This limitation is not visible to the tokenizer, which will only see the <code>input_getc</code> interface. 1666 </p> 1667 <p>When the buffer is exhausted, <code>input_getc</code> will call <code>nextline</code>, which will read the next chunk of the file. 1668 </p> 1669 1670 </div> 1671 <a name="1:46"><div class="section"><h4>46. Input Implementation</h4></a> 1672 <p>The implementation of the input module is pretty straightforward. We have the following data structures and defines as globals: 1673 </p> 1674 1675 </div> 1676 <a name="1:47"><div class="section"><h4 class="noheading">47. </h4></a> 1677 1678 <div class="codeblock"> 1679 <span class="codeblock_name">{Input Data <a href="lexer.html#1:47">47</a>}</span> 1680 <pre class="prettyprint lang-c"> 1681 #define CHUNK_SIZE 128 1682 static char buffer[CHUNK_SIZE]; 1683 static int buffer_pos = 0; 1684 static int buffer_size = 0; 1685 static char unget_buffer_stack[8]; 1686 static int unget_buffer_stack_pos = 0; 1687 1688 static FILE *file = NULL; 1689 </pre> 1690 1691 1692 <p class="seealso">Used in section <a href="lexer.html#1:52">52</a></p> 1693 </div> 1694 <p>When the program calls <code>input_init</code>, we open the file. 1695 </p> 1696 1697 </div> 1698 <a name="1:48"><div class="section"><h4 class="noheading">48. </h4></a> 1699 1700 <div class="codeblock"> 1701 <span class="codeblock_name">{Input Initialization <a href="lexer.html#1:48">48</a>}</span> 1702 <pre class="prettyprint lang-c"> 1703 void input_init(const char *filename) { 1704 file = fopen(filename, "r"); 1705 if (file == NULL) { 1706 fprintf(stderr, "Error: Cannot open file %s\n", filename); 1707 exit(1); 1708 } 1709 } 1710 </pre> 1711 1712 1713 <p class="seealso">Used in section <a href="lexer.html#1:52">52</a></p> 1714 </div> 1715 <p>When the program calls <code>input_getc</code>, we return the next character in the buffer. If the buffer is exhausted, we call <code>nextline</code>. We also track the line and column. 1716 </p> 1717 1718 </div> 1719 <a name="1:49"><div class="section"><h4 class="noheading">49. </h4></a> 1720 1721 <div class="codeblock"> 1722 <span class="codeblock_name">{Input Get Character <a href="lexer.html#1:49">49</a>}</span> 1723 <pre class="prettyprint lang-c"> 1724 int input_getc(void) { 1725 if (unget_buffer_stack_pos > 0) { 1726 return unget_buffer_stack[--unget_buffer_stack_pos]; 1727 } 1728 if (buffer_pos == buffer_size) { 1729 buffer_size = fread(buffer, 1, CHUNK_SIZE, file); 1730 buffer_pos = 0; 1731 } 1732 if (buffer_size == 0) { 1733 return EOF; 1734 } 1735 char c = buffer[buffer_pos++]; 1736 return c; 1737 } 1738 </pre> 1739 1740 1741 <p class="seealso">Used in section <a href="lexer.html#1:52">52</a></p> 1742 </div> 1743 <p>When the program calls <code>input_ungetc</code>, we save the character in the <code>unget_buffer</code>. 1744 </p> 1745 1746 </div> 1747 <a name="1:50"><div class="section"><h4 class="noheading">50. </h4></a> 1748 1749 <div class="codeblock"> 1750 <span class="codeblock_name">{Input Unget Character <a href="lexer.html#1:50">50</a>}</span> 1751 <pre class="prettyprint lang-c"> 1752 void input_ungetc(int c) { 1753 unget_buffer_stack[unget_buffer_stack_pos++] = c; 1754 } 1755 </pre> 1756 1757 1758 <p class="seealso">Used in section <a href="lexer.html#1:52">52</a></p> 1759 </div> 1760 <p>Since we're not using dynamic memory allocation, cleanup is pretty simple. 1761 </p> 1762 1763 </div> 1764 <a name="1:51"><div class="section"><h4 class="noheading">51. </h4></a> 1765 1766 <div class="codeblock"> 1767 <span class="codeblock_name">{Input Destroy <a href="lexer.html#1:51">51</a>}</span> 1768 <pre class="prettyprint lang-c"> 1769 void input_destroy(void) { 1770 fclose(file); 1771 } 1772 </pre> 1773 1774 1775 <p class="seealso">Used in section <a href="lexer.html#1:52">52</a></p> 1776 </div> 1777 </div> 1778 <a name="1:52"><div class="section"><h4 class="noheading">52. </h4></a> 1779 <p>We put the whole thing together in <code>input.c</code>. 1780 </p> 1781 1782 <div class="codeblock"> 1783 <span class="codeblock_name">{<strong>input.c</strong> <a href="lexer.html#1:52">52</a>}</span> 1784 <pre class="prettyprint lang-c"> 1785 #include <stdio.h> 1786 #include <stdlib.h> 1787 #include "input.h" 1788 <span class="nocode pln">{Input Data, <a href="lexer.html#1:47">47</a>}</span> 1789 <span class="nocode pln">{Input Initialization, <a href="lexer.html#1:48">48</a>}</span> 1790 <span class="nocode pln">{Input Get Character, <a href="lexer.html#1:49">49</a>}</span> 1791 <span class="nocode pln">{Input Unget Character, <a href="lexer.html#1:50">50</a>}</span> 1792 <span class="nocode pln">{Input Destroy, <a href="lexer.html#1:51">51</a>}</span> 1793 </pre> 1794 1795 1796 1797 </div> 1798 </div> 1799 <a name="1:53"><div class="section"><h4 class="noheading">53. </h4></a> 1800 <p>We'll need an external declaration for <code>file</code> in <code>input.h</code> so other programs can take advantage of it. 1801 </p> 1802 1803 <div class="codeblock"> 1804 <span class="codeblock_name">{<strong>input.h</strong> <a href="lexer.html#1:53">53</a>}</span> 1805 <pre class="prettyprint lang-c"> 1806 #ifndef INPUT_H 1807 #define INPUT_H 1808 <span class="nocode pln">{Input Interface, <a href="lexer.html#1:44">44</a>}</span> 1809 #endif 1810 </pre> 1811 1812 1813 1814 </div> 1815 </div> 1816 <a name="1:54"><div class="section"><h4 class="noheading">54. </h4></a> 1817 <p>We'll implement the lexer interface in <code>tokenizer.h</code> 1818 </p> 1819 1820 <div class="codeblock"> 1821 <span class="codeblock_name">{<strong>tokenizer.h</strong> <a href="lexer.html#1:54">54</a>}</span> 1822 <pre class="prettyprint lang-c"> 1823 #ifndef TOKENIZER_H 1824 #define TOKENIZER_H 1825 #include "token.h" 1826 #include "input.h" 1827 <span class="nocode pln">{Tokenization Interface, <a href="lexer.html#1:55">55</a>}</span> 1828 #endif 1829 </pre> 1830 1831 1832 1833 </div> 1834 </div> 1835 <a name="1:55"><div class="section"><h4 class="noheading">55. </h4></a> 1836 <p>The tokenization interface will have a couple of functions. <code>next_token</code> will return the next token in the input stream, <code>init_tokenizer</code> will initialize the tokenizer, and <code>destroy_tokenizer</code> will clean up. 1837 </p> 1838 <p>We'll also have some helper functions for lookahead and matching. 1839 </p> 1840 <p>'peek_token' will return the next token without consuming it (it technically does advance the input stream, but it saves the token so it can be reused). 1841 </p> 1842 <p>'consume' will consume the next token if it matches a given kind. If it doesn't match, it will print an error message and exit. 1843 </p> 1844 1845 <div class="codeblock"> 1846 <span class="codeblock_name">{Tokenization Interface <a href="lexer.html#1:55">55</a>}</span> 1847 <pre class="prettyprint lang-c"> 1848 void init_tokenizer(const char *filename); 1849 void destroy_tokenizer(void); 1850 token_t *next_token(void); 1851 void reject_token(token_t *token); 1852 token_t *peek_token(void); 1853 void consume(c_token_types kind); 1854 void consume_alt(c_token_types *kinds, int n); 1855 </pre> 1856 1857 1858 <p class="seealso">Used in section <a href="lexer.html#1:54">54</a></p> 1859 </div> 1860 </div> 1861 <a name="1:56"><div class="section"><h4 class="noheading">56. </h4></a> 1862 <p>Now we can finally implement the tokenizer. A stack for storing tokens for lookahead is defined, as is a hash table, modified by the parser, for typedefs. 1863 </p> 1864 1865 <div class="codeblock"> 1866 <span class="codeblock_name">{<strong>tokenizer.c</strong> <a href="lexer.html#1:56">56</a>}</span> 1867 <pre class="prettyprint lang-c"> 1868 #include <ctype.h> 1869 #include <errno.h> 1870 #include <float.h> 1871 #include <math.h> 1872 #include <stdarg.h> 1873 #include <stdint.h> 1874 #include <stdio.h> 1875 #include <stdlib.h> 1876 #include <string.h> 1877 #include "tokenizer.h" 1878 #include "token.h" 1879 #include "input.h" 1880 #include "util.h" 1881 token_t *left_stack[8]; 1882 int left_stack_pos = 0; 1883 hash_table_t *typedef_table = NULL; 1884 <span class="nocode pln">{Utility Functions, <a href="lexer.html#1:57">57</a>}</span> 1885 <span class="nocode pln">{Tokenization Function, <a href="lexer.html#1:59">59</a>}</span> 1886 </pre> 1887 1888 1889 1890 </div> 1891 </div> 1892 <a name="1:57"><div class="section"><h4 class="noheading">57. </h4></a> 1893 <p>Utility functions are everything that doesn't directly tokenize the input. 1894 </p> 1895 1896 <div class="codeblock"> 1897 <span class="codeblock_name">{Utility Functions <a href="lexer.html#1:57">57</a>}</span> 1898 <pre class="prettyprint lang-c"> 1899 void init_tokenizer(const char *filename) { 1900 input_init(filename); 1901 typedef_table = hash_table_create(16, cmp_string, hash_string, dtor_string); 1902 } 1903 1904 void destroy_tokenizer(void) { 1905 input_destroy(); 1906 } 1907 1908 void reject_token(token_t *token) { 1909 left_stack[left_stack_pos++] = token; 1910 } 1911 1912 token_t *peek_token(void) { 1913 if (left_stack_pos > 0) { 1914 return left_stack[left_stack_pos - 1]; 1915 } 1916 token_t *token = next_token(); 1917 reject_token(token); 1918 return token; 1919 } 1920 1921 <span class="nocode pln">{Stringify Type, <a href="lexer.html#1:58">58</a>}</span> 1922 1923 void consume(c_token_types kind) { 1924 token_t *token = next_token(); 1925 if (token_type(token) != kind) { 1926 fprintf(stderr, "Error: Expected token of type \"%s\", got \"%s\"\n", stringify_type(kind), stringify_type(token_type(token))); 1927 exit(1); 1928 } 1929 token_destroy(token); 1930 } 1931 1932 void consume_alt(c_token_types *kinds, int n) { 1933 token_t *token = next_token(); 1934 for (int i = 0; i < n; i++) { 1935 if (token_type(token) == kinds[i]) { 1936 token_destroy(token); 1937 return; 1938 } 1939 } 1940 fprintf(stderr, "Error: Expected one of the following tokens: "); 1941 for (int i = 0; i < n; i++) { 1942 fprintf(stderr, "\"%s\" ", stringify_type(kinds[i])); 1943 } 1944 fprintf(stderr, "got \"%s\"\n", stringify_type(token_type(token))); 1945 exit(1); 1946 } 1947 </pre> 1948 1949 1950 <p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p> 1951 </div> 1952 </div> 1953 <a name="1:58"><div class="section"><h4 class="noheading">58. </h4></a> 1954 <p>We'll need a helper function to convert token types to strings. It's pretty simple, just tedious. 1955 </p> 1956 1957 <div class="codeblock"> 1958 <span class="codeblock_name">{Stringify Type <a href="lexer.html#1:58">58</a>}</span> 1959 <pre class="prettyprint lang-c"> 1960 const char *stringify_type(c_token_types type) { 1961 switch (type) { 1962 case TOK_IF: 1963 return "if"; 1964 case TOK_ELSE: 1965 return "else"; 1966 case TOK_SWITCH: 1967 return "switch"; 1968 case TOK_CASE: 1969 return "case"; 1970 case TOK_DEFAULT: 1971 return "default"; 1972 case TOK_WHILE: 1973 return "while"; 1974 case TOK_DO: 1975 return "do"; 1976 case TOK_FOR: 1977 return "for"; 1978 case TOK_CONTINUE: 1979 return "continue"; 1980 case TOK_BREAK: 1981 return "break"; 1982 case TOK_RETURN: 1983 return "return"; 1984 case TOK_GOTO: 1985 return "goto"; 1986 case TOK_VOID: 1987 return "void"; 1988 case TOK_CHAR: 1989 return "char"; 1990 case TOK_SHORT: 1991 return "short"; 1992 case TOK_INT: 1993 return "int"; 1994 case TOK_LONG: 1995 return "long"; 1996 case TOK_FLOAT: 1997 return "float"; 1998 case TOK_DOUBLE: 1999 return "double"; 2000 case TOK_SIGNED: 2001 return "signed"; 2002 case TOK_UNSIGNED: 2003 return "unsigned"; 2004 case TOK_STRUCT: 2005 return "struct"; 2006 case TOK_UNION: 2007 return "union"; 2008 case TOK_ENUM: 2009 return "enum"; 2010 case TOK_TYPEDEF: 2011 return "typedef"; 2012 case TOK_AUTO: 2013 return "auto"; 2014 case TOK_REGISTER: 2015 return "register"; 2016 case TOK_STATIC: 2017 return "static"; 2018 case TOK_EXTERN: 2019 return "extern"; 2020 case TOK_CONST: 2021 return "const"; 2022 case TOK_VOLATILE: 2023 return "volatile"; 2024 case TOK_SIZEOF: 2025 return "sizeof"; 2026 case TOK_ADD: 2027 return "+"; 2028 case TOK_SUB: 2029 return "-"; 2030 case TOK_MUL: 2031 return "*"; 2032 case TOK_DIV: 2033 return "/"; 2034 case TOK_MOD: 2035 return "%"; 2036 case TOK_BIT_AND: 2037 return "&"; 2038 case TOK_BIT_OR: 2039 return "|"; 2040 case TOK_BIT_XOR: 2041 return "^"; 2042 case TOK_BIT_NOT: 2043 return "~"; 2044 case TOK_LSHIFT: 2045 return "<<"; 2046 case TOK_RSHIFT: 2047 return ">>"; 2048 case TOK_NOT: 2049 return "!"; 2050 case TOK_ASSIGN: 2051 return "="; 2052 case TOK_LT: 2053 return "<"; 2054 case TOK_GT: 2055 return ">"; 2056 case TOK_INC: 2057 return "++"; 2058 case TOK_DEC: 2059 return "--"; 2060 case TOK_EQ: 2061 return "=="; 2062 case TOK_NE: 2063 return "!="; 2064 case TOK_LE: 2065 return "<="; 2066 case TOK_GE: 2067 return ">="; 2068 case TOK_AND: 2069 return "&&"; 2070 case TOK_OR: 2071 return "||"; 2072 case TOK_MEMBER_POINTER: 2073 return "->"; 2074 case TOK_MEMBER: 2075 return "."; 2076 case TOK_COND_DECISION: 2077 return ":"; 2078 case TOK_COND: 2079 return "?"; 2080 case TOK_ASSIGN_ADD: 2081 return "+="; 2082 case TOK_ASSIGN_SUB: 2083 return "-="; 2084 case TOK_ASSIGN_MUL: 2085 return "*="; 2086 case TOK_ASSIGN_DIV: 2087 return "/="; 2088 case TOK_ASSIGN_MOD: 2089 return "%="; 2090 case TOK_ASSIGN_BITAND: 2091 return "&="; 2092 case TOK_ASSIGN_BITOR: 2093 return "|="; 2094 case TOK_ASSIGN_BITXOR: 2095 return "^="; 2096 case TOK_ASSIGN_LSHIFT: 2097 return "<<="; 2098 case TOK_ASSIGN_RSHIFT: 2099 return ">>="; 2100 case TOK_HASH: 2101 return "#"; 2102 case TOK_ID: 2103 return "identifier"; 2104 case TOK_TYPEDEF_NAME: 2105 return "typedef name"; 2106 case TOK_INTEGER_U32: 2107 case TOK_INTEGER_U64: 2108 case TOK_INTEGER_S32: 2109 case TOK_INTEGER_S64: 2110 return "integer constant"; 2111 case TOK_FLOAT_32: 2112 case TOK_FLOAT_64: 2113 return "floating constant"; 2114 case TOK_CHAR_CONST: 2115 return "character constant"; 2116 case TOK_STRING_ASCII: 2117 return "string constant"; 2118 case TOK_EOF: 2119 return "EOF"; 2120 case TOK_ERROR: 2121 return "error"; 2122 case TOK_LEFT_PAREN: 2123 return "("; 2124 case TOK_RIGHT_PAREN: 2125 return ")"; 2126 case TOK_LEFT_BRACKET: 2127 return "["; 2128 case TOK_RIGHT_BRACKET: 2129 return "]"; 2130 case TOK_LEFT_BRACE: 2131 return "{"; 2132 case TOK_RIGHT_BRACE: 2133 return "}"; 2134 case TOK_COMMA: 2135 return ","; 2136 case TOK_SEMICOLON: 2137 return ";"; 2138 case TOK_DOT: 2139 return "."; 2140 case TOK_ELLIPSIS: 2141 return "..."; 2142 } 2143 return "UNKNOWN"; 2144 } 2145 </pre> 2146 2147 2148 <p class="seealso">Used in section <a href="lexer.html#1:57">57</a></p> 2149 </div> 2150 </div> 2151 <a name="1:59"><div class="section"><h4 class="noheading">59. </h4></a> 2152 <p>Now we can implement the tokenization function. The pattern is pretty simple: we call each of the tokenization functions in turn until we find a match. If we don't find a match, we print an error message and exit. 2153 You might wonder why skip_whitespace can return a token. This makes handling the divide operator easier as comments also start with a slash. 2154 </p> 2155 2156 <div class="codeblock"> 2157 <span class="codeblock_name">{Tokenization Function <a href="lexer.html#1:59">59</a>}</span> 2158 <pre class="prettyprint lang-c"> 2159 char file_name[1024]; 2160 <span class="nocode pln">{Warning/Error Functions, <a href="lexer.html#1:60">60</a>}</span> 2161 <span class="nocode pln">{Skip Whitespace, <a href="lexer.html#1:61">61</a>}</span> 2162 <span class="nocode pln">{Tokenize Identifier, <a href="lexer.html#1:62">62</a>}</span> 2163 <span class="nocode pln">{Tokenize Number, <a href="lexer.html#1:65">65</a>}</span> 2164 <span class="nocode pln">{Tokenize String, <a href="lexer.html#1:74">74</a>}</span> 2165 <span class="nocode pln">{Tokenize Character, <a href="lexer.html#1:73">73</a>}</span> 2166 <span class="nocode pln">{Tokenize Operator, <a href="lexer.html#1:64">64</a>}</span> 2167 token_t *next_token(void) { 2168 if (left_stack_pos > 0) { 2169 return left_stack[--left_stack_pos]; 2170 } 2171 token_t *tok = skip_whitespace(); 2172 if (tok != NULL) { 2173 return tok; 2174 } 2175 tok = read_identifier(); 2176 if (tok != NULL) { 2177 return tok; 2178 } 2179 tok = read_number(); 2180 if (tok != NULL) { 2181 return tok; 2182 } 2183 tok = read_char_constant(); 2184 if (tok != NULL) { 2185 return tok; 2186 } 2187 tok = read_string_literal(); 2188 if (tok != NULL) { 2189 return tok; 2190 } 2191 tok = read_operator(); 2192 if (tok != NULL) { 2193 return tok; 2194 } 2195 int c = input_getc(); 2196 if (c == EOF) { 2197 return NULL; 2198 } 2199 tok_warn( 2200 "Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c, 2201 line, column); 2202 return next_token(); 2203 } 2204 2205 #ifdef TEST_TOKENIZER 2206 <span class="nocode pln">{Run Test, <a href="lexer.html#1:76">76</a>}</span> 2207 #endif 2208 </pre> 2209 2210 2211 <p class="seealso">Used in section <a href="lexer.html#1:56">56</a></p> 2212 </div> 2213 </div> 2214 <a name="1:60"><div class="section"><h4 class="noheading">60. </h4></a> 2215 <p>We'll need a couple of helper functions to skip whitespace and print warnings/errors. 2216 </p> 2217 2218 <div class="codeblock"> 2219 <span class="codeblock_name">{Warning/Error Functions <a href="lexer.html#1:60">60</a>}</span> 2220 <pre class="prettyprint lang-c"> 2221 void tok_error(const char *fmt, ...) { 2222 va_list args; 2223 va_start(args, fmt); 2224 fprintf(stderr, "Error in file %s at line %d, column %d: ", file_name, line, 2225 column); 2226 vfprintf(stderr, fmt, args); 2227 va_end(args); 2228 } 2229 2230 void tok_warn(const char *fmt, ...) { 2231 va_list args; 2232 va_start(args, fmt); 2233 fprintf(stderr, "Warning in file %s at line %d, column %d: ", file_name, line, 2234 column); 2235 vfprintf(stderr, fmt, args); 2236 va_end(args); 2237 } 2238 </pre> 2239 2240 2241 <p class="seealso">Used in section <a href="lexer.html#1:59">59</a></p> 2242 </div> 2243 </div> 2244 <a name="1:61"><div class="section"><h4 class="noheading">61. </h4></a> 2245 <p>The <code>skip_whitespace</code> function is pretty simple. It just skips over any comments, whitespace, and line directives. 2246 </p> 2247 2248 <div class="codeblock"> 2249 <span class="codeblock_name">{Skip Whitespace <a href="lexer.html#1:61">61</a>}</span> 2250 <pre class="prettyprint lang-c"> 2251 static token_t *skip_whitespace(void) { 2252 int c; 2253 while ((c = input_getc()) != EOF) { 2254 if (isspace(c)) { // Whitespace 2255 if (c == '\n') { 2256 line++; 2257 column = 1; 2258 } else { 2259 column++; 2260 } 2261 } else if (c == '#') // GCC preprocessor line control directive. 2262 { 2263 char buf[512]; 2264 int i = 0; 2265 while ((c = input_getc()) != EOF && c != '\n') { 2266 buf[i++] = c; 2267 column++; 2268 } 2269 buf[i] = '\0'; 2270 if (sscanf(buf, "%d \"%[^\"]\"", &line, file_name) == 2) { 2271 column = 1; 2272 } else { 2273 tok_error("Invalid #line directive\n"); 2274 } 2275 if (c == EOF) { 2276 return NULL; 2277 } 2278 } else if (c == '/') { // Comment 2279 c = input_getc(); 2280 if (c == '/') { 2281 while ((c = input_getc()) != EOF && c != '\n') { 2282 column++; 2283 } 2284 if (c == EOF) { 2285 return NULL; 2286 } 2287 line++; 2288 column = 1; 2289 } else if (c == '*') { // Multiline comment 2290 while ((c = input_getc()) != EOF) { 2291 if (c == '*') { 2292 c = input_getc(); 2293 if (c == '/') { 2294 break; 2295 } 2296 } else if (c == '\n') { 2297 line++; 2298 column = 1; 2299 } else { 2300 column++; 2301 } 2302 } 2303 if (c == EOF) { 2304 return NULL; 2305 } 2306 } else { // Handled here to simplify the code. 2307 if (c == '=') 2308 return token_create(TOK_ASSIGN_DIV, line, column, 2); 2309 input_ungetc(c); 2310 return token_create(TOK_DIV, line, column, 1); 2311 } 2312 } else { 2313 input_ungetc(c); 2314 return NULL; 2315 } 2316 } 2317 return NULL; 2318 } 2319 </pre> 2320 2321 2322 <p class="seealso">Used in section <a href="lexer.html#1:59">59</a></p> 2323 </div> 2324 </div> 2325 <a name="1:62"><div class="section"><h4 class="noheading">62. </h4></a> 2326 <p>The <code>read_identifier</code> function reads an identifier from the input stream. C identifiers can contain letters, digits, and underscores, but they can't start with a digit. 2327 </p> 2328 2329 <div class="codeblock"> 2330 <span class="codeblock_name">{Tokenize Identifier <a href="lexer.html#1:62">62</a>}</span> 2331 <pre class="prettyprint lang-c"> 2332 <span class="nocode pln">{Get Keyword, <a href="lexer.html#1:63">63</a>}</span> 2333 static token_t *read_identifier(void) { 2334 int c; 2335 char buf[1024]; 2336 int i = 0; 2337 c = input_getc(); 2338 if (!isalpha(c) && c != '_') { 2339 input_ungetc(c); 2340 return NULL; 2341 } 2342 buf[i++] = c; 2343 while ((c = input_getc()) != EOF) { 2344 if (!isalnum(c) && c != '_') { 2345 input_ungetc(c); 2346 break; 2347 } 2348 buf[i++] = c; 2349 if (i >= 1008) { 2350 tok_error("Identifier too long\n"); 2351 exit(1); 2352 } 2353 } 2354 buf[i] = '\0'; 2355 column += i; 2356 // Check if it's a keyword 2357 c_token_types kind = get_keyword(buf, i); 2358 if (kind != TOK_ID) { 2359 return token_create(kind, line, column, i); 2360 } 2361 // Check if it's a typedef 2362 if (hash_table_get(typedef_table, buf) != NULL) { 2363 return token_create(TOK_TYPEDEF_NAME, line, column, i); 2364 } 2365 return token_create_string(kind, line, column, buf, i); 2366 } 2367 </pre> 2368 2369 2370 <p class="seealso">Used in section <a href="lexer.html#1:59">59</a></p> 2371 </div> 2372 </div> 2373 <a name="1:63"><div class="section"><h4 class="noheading">63. </h4></a> 2374 <p>The <code>get_keyword</code> function is a simple decision tree for identifying keywords. The code is pretty tedious, but it works. 2375 </p> 2376 2377 <div class="codeblock"> 2378 <span class="codeblock_name">{Get Keyword <a href="lexer.html#1:63">63</a>}</span> 2379 <pre class="prettyprint lang-c"> 2380 c_token_types get_keyword(const char *buf, int len) { 2381 switch (buf[0]) { 2382 case 'a': 2383 if (len == 4 && buf[1] == 'u' && buf[2] == 't' && buf[3] == 'o') 2384 return TOK_AUTO; 2385 break; 2386 2387 case 'b': 2388 if (len == 5 && buf[1] == 'r' && buf[2] == 'e' && buf[3] == 'a' && 2389 buf[4] == 'k') 2390 return TOK_BREAK; 2391 break; 2392 2393 case 'c': 2394 switch (buf[1]) { 2395 case 'a': 2396 if (len == 4 && buf[2] == 's' && buf[3] == 'e') 2397 return TOK_CASE; 2398 break; 2399 case 'h': 2400 if (len == 4 && buf[2] == 'a' && buf[3] == 'r') 2401 return TOK_CHAR; 2402 break; 2403 case 'o': 2404 if (len == 5 && buf[2] == 'n' && buf[3] == 's' && buf[4] == 't') 2405 return TOK_CONST; 2406 if (len == 8 && buf[2] == 'n' && buf[3] == 't' && buf[4] == 'i' && 2407 buf[5] == 'n' && buf[6] == 'u' && buf[7] == 'e') 2408 return TOK_CONTINUE; 2409 break; 2410 } 2411 break; 2412 2413 case 'd': 2414 switch (buf[1]) { 2415 case 'e': 2416 if (len == 7 && buf[2] == 'f' && buf[3] == 'a' && buf[4] == 'u' && 2417 buf[5] == 'l' && buf[6] == 't') 2418 return TOK_DEFAULT; 2419 break; 2420 case 'o': 2421 if (len == 2 && buf[2] == '\0') 2422 return TOK_DO; 2423 if (len == 6 && buf[2] == 'u' && buf[3] == 'b' && buf[4] == 'l' && 2424 buf[5] == 'e') 2425 return TOK_DOUBLE; 2426 break; 2427 } 2428 break; 2429 2430 case 'e': 2431 switch (buf[1]) { 2432 case 'l': 2433 if (len == 4 && buf[2] == 's' && buf[3] == 'e') 2434 return TOK_ELSE; 2435 break; 2436 case 'n': 2437 if (len == 4 && buf[2] == 'u' && buf[3] == 'm') 2438 return TOK_ENUM; 2439 break; 2440 case 'x': 2441 if (len == 6 && buf[2] == 't' && buf[3] == 'e' && buf[4] == 'r' && 2442 buf[5] == 'n') 2443 return TOK_EXTERN; 2444 break; 2445 } 2446 break; 2447 2448 case 'f': 2449 switch (buf[1]) { 2450 case 'l': 2451 if (len == 5 && buf[2] == 'o' && buf[3] == 'a' && buf[4] == 't') 2452 return TOK_FLOAT; 2453 break; 2454 case 'o': 2455 if (len == 3 && buf[2] == 'r') 2456 return TOK_FOR; 2457 break; 2458 } 2459 break; 2460 2461 case 'g': 2462 if (len == 4 && buf[1] == 'o' && buf[2] == 't' && buf[3] == 'o') 2463 return TOK_GOTO; 2464 break; 2465 2466 case 'i': 2467 switch (buf[1]) { 2468 case 'f': 2469 if (len == 2 && buf[2] == '\0') 2470 return TOK_IF; 2471 break; 2472 case 'n': 2473 if (len == 3 && buf[2] == 't') 2474 return TOK_INT; 2475 break; 2476 } 2477 break; 2478 2479 case 'l': 2480 if (len == 4 && buf[1] == 'o' && buf[2] == 'n' && buf[3] == 'g') 2481 return TOK_LONG; 2482 break; 2483 2484 case 'r': 2485 switch (buf[1]) { 2486 case 'e': 2487 if (len == 8 && buf[2] == 'g' && buf[3] == 'i' && buf[4] == 's' && 2488 buf[5] == 't' && buf[6] == 'e' && buf[7] == 'r') 2489 return TOK_REGISTER; 2490 if (len == 6 && buf[2] == 't' && buf[3] == 'u' && buf[4] == 'r' && 2491 buf[5] == 'n') 2492 return TOK_RETURN; 2493 break; 2494 } 2495 break; 2496 2497 case 's': 2498 switch (buf[1]) { 2499 case 'h': 2500 if (len == 5 && buf[2] == 'o' && buf[3] == 'r' && buf[4] == 't') 2501 return TOK_SHORT; 2502 break; 2503 case 't': 2504 if (len == 6 && buf[2] == 'a' && buf[3] == 't' && buf[4] == 'i' && 2505 buf[5] == 'c') 2506 return TOK_STATIC; 2507 break; 2508 case 'i': 2509 if (len == 6 && buf[2] == 'g' && buf[3] == 'n' && buf[4] == 'e' && 2510 buf[5] == 'd') 2511 return TOK_SIGNED; 2512 if (len == 6 && buf[2] == 'z' && buf[3] == 'e' && buf[4] == 'o' && 2513 buf[5] == 'f') 2514 return TOK_SIZEOF; 2515 break; 2516 case 'r': 2517 if (len == 6 && buf[2] == 'u' && buf[3] == 'c' && buf[4] == 't') 2518 return TOK_STRUCT; 2519 break; 2520 case 'w': 2521 if (len == 6 && buf[2] == 'i' && buf[3] == 't' && buf[4] == 'c' && 2522 buf[5] == 'h') 2523 return TOK_SWITCH; 2524 break; 2525 } 2526 break; 2527 2528 case 't': 2529 if (len == 7 && buf[1] == 'y' && buf[2] == 'p' && buf[3] == 'e' && 2530 buf[4] == 'd' && buf[5] == 'e' && buf[6] == 'f') 2531 return TOK_TYPEDEF; 2532 break; 2533 2534 case 'u': 2535 switch (buf[1]) { 2536 case 'n': 2537 if (len == 5 && buf[2] == 'i' && buf[3] == 'o' && buf[4] == 'n') 2538 return TOK_UNION; 2539 if (len == 8 && buf[2] == 's' && buf[3] == 'i' && buf[4] == 'g' && 2540 buf[5] == 'n' && buf[6] == 'e' && buf[7] == 'd') 2541 return TOK_UNSIGNED; 2542 break; 2543 } 2544 break; 2545 2546 case 'v': 2547 switch (buf[1]) { 2548 case 'o': 2549 if (len == 4 && buf[2] == 'i' && buf[3] == 'd') 2550 return TOK_VOID; 2551 if (len == 8 && buf[2] == 'l' && buf[3] == 'a' && buf[4] == 't' && 2552 buf[5] == 'i' && buf[6] == 'l' && buf[7] == 'e') 2553 return TOK_VOLATILE; 2554 break; 2555 } 2556 break; 2557 2558 case 'w': 2559 if (len == 5 && buf[1] == 'h' && buf[2] == 'i' && buf[3] == 'l' && 2560 buf[4] == 'e') 2561 return TOK_WHILE; 2562 break; 2563 2564 default: 2565 return TOK_ID; 2566 } 2567 return TOK_ID; 2568 } 2569 </pre> 2570 2571 2572 <p class="seealso">Used in section <a href="lexer.html#1:62">62</a></p> 2573 </div> 2574 </div> 2575 <a name="1:64"><div class="section"><h4 class="noheading">64. </h4></a> 2576 <p>The <code>read_operator</code> function works similarly to the <code>read_identifier</code> function. It uses a decision tree to identify operators. 2577 </p> 2578 2579 <div class="codeblock"> 2580 <span class="codeblock_name">{Tokenize Operator <a href="lexer.html#1:64">64</a>}</span> 2581 <pre class="prettyprint lang-c"> 2582 2583 token_t *read_operator(void) { 2584 int c; 2585 c = input_getc(); 2586 switch (c) { 2587 case '!': { 2588 c = input_getc(); 2589 if (c == '=') 2590 return token_create(TOK_NE, line, column, 2); 2591 input_ungetc(c); 2592 return token_create(TOK_NOT, line, column, 1); 2593 } 2594 case '%': { 2595 c = input_getc(); 2596 if (c == '=') 2597 return token_create(TOK_ASSIGN_MOD, line, column, 2); 2598 input_ungetc(c); 2599 return token_create(TOK_MOD, line, column, 1); 2600 } 2601 case '&': { 2602 c = input_getc(); 2603 if (c == '&') 2604 return token_create(TOK_AND, line, column, 2); 2605 if (c == '=') 2606 return token_create(TOK_ASSIGN_BITAND, line, column, 2); 2607 input_ungetc(c); 2608 return token_create(TOK_BIT_AND, line, column, 1); 2609 } 2610 case '(': 2611 return token_create(TOK_LEFT_PAREN, line, column, 1); 2612 case ')': 2613 return token_create(TOK_RIGHT_PAREN, line, column, 1); 2614 case '*': { 2615 c = input_getc(); 2616 if (c == '=') 2617 return token_create(TOK_ASSIGN_MUL, line, column, 2); 2618 input_ungetc(c); 2619 return token_create(TOK_MUL, line, column, 1); 2620 } 2621 case '+': { 2622 c = input_getc(); 2623 if (c == '+') 2624 return token_create(TOK_INC, line, column, 2); 2625 if (c == '=') 2626 return token_create(TOK_ASSIGN_ADD, line, column, 2); 2627 input_ungetc(c); 2628 return token_create(TOK_ADD, line, column, 2); 2629 } 2630 case ',': 2631 return token_create(TOK_COMMA, line, column, 1); 2632 case '-': { 2633 c = input_getc(); 2634 if (c == '-') 2635 return token_create(TOK_DEC, line, column, 2); 2636 if (c == '=') 2637 return token_create(TOK_ASSIGN_SUB, line, column, 2); 2638 if (c == '>') 2639 return token_create(TOK_MEMBER_POINTER, line, column, 2); 2640 input_ungetc(c); 2641 return token_create(TOK_SUB, line, column, 1); 2642 } 2643 case '.': { 2644 c = input_getc(); 2645 if (c == '.') { 2646 c = input_getc(); 2647 if (c == '.') { 2648 return token_create(TOK_ELLIPSIS, line, column, 3); 2649 } else { 2650 // Bail out, can't store more than one unget 2651 tok_error("Unexpected character '.' at line %d, column %d\n", line, 2652 column); 2653 exit(1); 2654 } 2655 } 2656 return token_create('.', line, column, 1); 2657 } 2658 case '/': { 2659 c = input_getc(); 2660 if (c == '=') 2661 return token_create(TOK_ASSIGN_DIV, line, column, 2); 2662 input_ungetc(c); 2663 return token_create(TOK_DIV, line, column, 1); 2664 } 2665 case ':': 2666 return token_create(TOK_COND_DECISION, line, column, 1); 2667 case ';': 2668 return token_create(TOK_SEMICOLON, line, column, 1); 2669 case '<': { 2670 c = input_getc(); 2671 if (c == '<') { 2672 c = input_getc(); 2673 if (c == '=') 2674 return token_create(TOK_ASSIGN_LSHIFT, line, column, 3); 2675 input_ungetc(c); 2676 return token_create(TOK_LSHIFT, line, column, 2); 2677 } 2678 if (c == '=') 2679 return token_create(TOK_LE, line, column, 2); 2680 input_ungetc(c); 2681 return token_create(TOK_LT, line, column, 1); 2682 } 2683 case '=': { 2684 c = input_getc(); 2685 if (c == '=') 2686 return token_create(TOK_ASSIGN, line, column, 2); 2687 input_ungetc(c); 2688 return token_create(TOK_ASSIGN, line, column, 1); 2689 } 2690 case '>': { 2691 c = input_getc(); 2692 if (c == '>') { 2693 c = input_getc(); 2694 if (c == '=') 2695 return token_create(TOK_ASSIGN_RSHIFT, line, column, 3); 2696 input_ungetc(c); 2697 return token_create(TOK_RSHIFT, line, column, 2); 2698 } 2699 if (c == '=') 2700 return token_create(TOK_GE, line, column, 2); 2701 input_ungetc(c); 2702 return token_create(TOK_GT, line, column, 1); 2703 } 2704 case '?': 2705 return token_create(TOK_COND, line, column, 1); 2706 case '[': 2707 return token_create(TOK_LEFT_BRACKET, line, column, 1); 2708 case ']': 2709 return token_create(TOK_RIGHT_BRACKET, line, column, 1); 2710 case '^': { 2711 c = input_getc(); 2712 if (c == '=') 2713 return token_create(TOK_ASSIGN_BITXOR, line, column, 2); 2714 input_ungetc(c); 2715 return token_create(TOK_BIT_XOR, line, column, 1); 2716 } 2717 case '{': 2718 return token_create(TOK_LEFT_BRACE, line, column, 1); 2719 case '|': { 2720 c = input_getc(); 2721 if (c == '|') 2722 return token_create(TOK_OR, line, column, 2); 2723 if (c == '=') 2724 return token_create(TOK_ASSIGN_BITOR, line, column, 2); 2725 input_ungetc(c); 2726 return token_create(TOK_BIT_OR, line, column, 1); 2727 } 2728 case '}': 2729 return token_create(TOK_RIGHT_BRACE, line, column, 1); 2730 case '~': 2731 return token_create(TOK_BIT_NOT, line, column, 1); 2732 default: 2733 input_ungetc(c); 2734 return NULL; 2735 } 2736 2737 return NULL; 2738 } 2739 </pre> 2740 2741 2742 <p class="seealso">Used in section <a href="lexer.html#1:59">59</a></p> 2743 </div> 2744 </div> 2745 <a name="1:65"><div class="section"><h4 class="noheading">65. </h4></a> 2746 <p>The <code>read_number</code> function reads a number from the input stream. It can be an integer or a floating-point number. 2747 </p> 2748 <p>I've broken it up a bit to make it easier to read. 2749 </p> 2750 2751 <div class="codeblock"> 2752 <span class="codeblock_name">{Tokenize Number <a href="lexer.html#1:65">65</a>}</span> 2753 <pre class="prettyprint lang-c"> 2754 static token_t *read_number(void) { 2755 int c; 2756 char buf[1024]; 2757 int i = 0; 2758 c = input_getc(); 2759 <span class="nocode pln"> {Check for valid prefix, <a href="lexer.html#1:66">66</a>}</span> 2760 int radix = 10; 2761 <span class="nocode pln"> {Process Radix, <a href="lexer.html#1:67">67</a>}</span> 2762 int is_float = 0; 2763 <span class="nocode pln"> {Read Number Loop, <a href="lexer.html#1:68">68</a>}</span> 2764 buf[i] = '\0'; 2765 <span class="nocode pln"> {Process Suffixes, <a href="lexer.html#1:69">69</a>}</span> 2766 <span class="nocode pln"> {Check for conflicting suffixes, <a href="lexer.html#1:70">70</a>}</span> 2767 if (is_float) { 2768 <span class="nocode pln"> {Convert to float, <a href="lexer.html#1:71">71</a>}</span> 2769 } else { 2770 <span class="nocode pln"> {Convert to integer, <a href="lexer.html#1:72">72</a>}</span> 2771 } 2772 return NULL; 2773 } 2774 </pre> 2775 2776 2777 <p class="seealso">Used in section <a href="lexer.html#1:59">59</a></p> 2778 </div> 2779 </div> 2780 <a name="1:66"><div class="section"><h4 class="noheading">66. </h4></a> 2781 <p>To determine if a character is a valid prefix for a number, we need to check if it's a digit or a period followed by a digit 2782 </p> 2783 2784 <div class="codeblock"> 2785 <span class="codeblock_name">{Check for valid prefix <a href="lexer.html#1:66">66</a>}</span> 2786 <pre class="prettyprint lang-c"> 2787 // If we don't have a digit or decimal point, it's not a number 2788 if (!isdigit(c) && c != '.') { 2789 input_ungetc(c); 2790 return NULL; 2791 } 2792 // Decimal point followed by non-digit is a struct member 2793 if (c == '.') { 2794 char cnext = input_getc(); 2795 if (!isdigit(cnext)) { 2796 input_ungetc(cnext); 2797 return token_create(TOK_MEMBER, line, column, 1); 2798 } 2799 input_ungetc(cnext); 2800 } 2801 </pre> 2802 2803 2804 <p class="seealso">Used in section <a href="lexer.html#1:65">65</a></p> 2805 </div> 2806 </div> 2807 <a name="1:67"><div class="section"><h4 class="noheading">67. </h4></a> 2808 <p>A C constant starting with a zero is either an octal or hexadecimal constant. We need to check the next character to determine which one it is. 2809 </p> 2810 2811 <div class="codeblock"> 2812 <span class="codeblock_name">{Process Radix <a href="lexer.html#1:67">67</a>}</span> 2813 <pre class="prettyprint lang-c"> 2814 // Check for hex and octal. 2815 if (c == '0') { 2816 char cnext = input_getc(); 2817 if (cnext == 'x' || cnext == 'X') { 2818 // Hex, discard the 0x 2819 radix = 16; 2820 } else { 2821 // Octal, discard the 0 2822 input_ungetc(cnext); 2823 radix = 8; 2824 } 2825 } else { 2826 // Decimal, append the first digit 2827 buf[i++] = c; 2828 } 2829 </pre> 2830 2831 2832 <p class="seealso">Used in section <a href="lexer.html#1:65">65</a></p> 2833 </div> 2834 </div> 2835 <a name="1:68"><div class="section"><h4 class="noheading">68. </h4></a> 2836 2837 <div class="codeblock"> 2838 <span class="codeblock_name">{Read Number Loop <a href="lexer.html#1:68">68</a>}</span> 2839 <pre class="prettyprint lang-c"> 2840 while ((c = input_getc()) != EOF) { 2841 // Since there can be multiple writes to the buffer, we want to make sure we 2842 // don't overflow by giving a 4 byte pad 2843 if (i > 1020) { 2844 tok_error("Number too long\n"); 2845 return NULL; 2846 } 2847 // Valid digits for the radix: 0-9 for decimal, 0-7 for octal, 0-9 and 2848 // a-f/A-F for hex 2849 if ((radix == 10 && isdigit(c)) || (radix == 16 && isxdigit(c)) || 2850 (radix == 8 && c >= '0' && c <= '7')) { 2851 buf[i++] = c; 2852 // Decimal point and not a float yet, must be a float 2853 } else if (c == '.' && !is_float) { 2854 is_float = 1; 2855 if (radix != 10) { 2856 tok_error("Invalid floating point constant, expected decimal, got %s\n", 2857 radix == 16 ? "hexadecimal" : "octal"); 2858 return NULL; 2859 } 2860 buf[i++] = c; 2861 } 2862 // Exponent on the end of a constant. (By standard this forces it to be a 2863 // float) 2864 else if (c == 'e' || c == 'E') { 2865 buf[i++] = c; 2866 c = input_getc(); 2867 // Sign on the exponent 2868 if (c == '+' || c == '-') { 2869 buf[i++] = c; 2870 c = input_getc(); 2871 } 2872 // Exponent must be a digit, I.E no 1e1.2 2873 if (!isdigit(c)) { 2874 tok_error("Invalid floating point exponent\n"); 2875 return NULL; 2876 } 2877 buf[i++] = c; 2878 is_float = 1; 2879 } else { 2880 // Reached the end, unget the character so other functions can read it 2881 input_ungetc(c); 2882 break; 2883 } 2884 } 2885 </pre> 2886 2887 2888 <p class="seealso">Used in section <a href="lexer.html#1:65">65</a></p> 2889 </div> 2890 </div> 2891 <a name="1:69"><div class="section"><h4 class="noheading">69. </h4></a> 2892 <p>C constants can have suffixes to indicate their type. We need to check for these suffixes and set the appropriate flags. 2893 </p> 2894 2895 <div class="codeblock"> 2896 <span class="codeblock_name">{Process Suffixes <a href="lexer.html#1:69">69</a>}</span> 2897 <pre class="prettyprint lang-c"> 2898 int is_unsigned = 0; 2899 int is_long = 0; 2900 int is_single = 0; 2901 while (1) { 2902 c = input_getc(); 2903 if (c == 'u' || c == 'U') { 2904 if (is_unsigned) { 2905 tok_warn( 2906 "Warning: Duplicate suffix 'u' for integer constant ignored\n"); 2907 } 2908 is_unsigned = 1; 2909 } else if (c == 'l' || c == 'L') { 2910 if (is_long) { 2911 tok_warn( 2912 "Warning: Duplicate suffix 'l' for integer constant ignored\n"); 2913 } 2914 is_long = 1; 2915 } else if (c == 'f' || c == 'F') { 2916 if (is_single) { 2917 tok_warn("Warning: Duplicate suffix 'f' for floating point constant " 2918 "ignored\n"); 2919 } 2920 is_single = 1; 2921 } else { 2922 input_ungetc(c); 2923 break; 2924 } 2925 } 2926 </pre> 2927 2928 2929 <p class="seealso">Used in section <a href="lexer.html#1:65">65</a></p> 2930 </div> 2931 </div> 2932 <a name="1:70"><div class="section"><h4 class="noheading">70. </h4></a> 2933 <p>If we find conflicting suffixes, we print a warning and ignore the suffixes. 2934 </p> 2935 2936 <div class="codeblock"> 2937 <span class="codeblock_name">{Check for conflicting suffixes <a href="lexer.html#1:70">70</a>}</span> 2938 <pre class="prettyprint lang-c"> 2939 if (is_single && is_long) { 2940 tok_warn("Warning: Invalid suffixes 'l' and 'f' for floating point " 2941 "constant. Ignoring 'l'\n"); 2942 is_long = 0; 2943 } 2944 if (is_single && is_unsigned) { 2945 tok_warn("Warning: Invalid suffixes 'u' and 'f' for floating point " 2946 "constant. Ignoring 'u'\n"); 2947 is_unsigned = 0; 2948 } 2949 if (is_single && !is_float) { 2950 tok_warn( 2951 "Warning: Invalid suffix 'f' for integer constant. Ignoring 'f'\n"); 2952 is_single = 0; 2953 } 2954 </pre> 2955 2956 2957 <p class="seealso">Used in section <a href="lexer.html#1:65">65</a></p> 2958 </div> 2959 </div> 2960 <a name="1:71"><div class="section"><h4 class="noheading">71. </h4></a> 2961 <p>If the string contains a float, we pass it to strtod. We need to make sure that the number is in range for the given type and check for errors from strtod 2962 </p> 2963 2964 <div class="codeblock"> 2965 <span class="codeblock_name">{Convert to float <a href="lexer.html#1:71">71</a>}</span> 2966 <pre class="prettyprint lang-c"> 2967 errno = 0; 2968 // Strtod generates a unix-style error when it's given something out of 2969 // range, so we want to get on top of that quickly instead of ignoring it 2970 // That way we can avoid some nasty NAN-propagation in the constant folder. 2971 double f = strtod(buf, NULL); 2972 if (errno == ERANGE) { 2973 tok_error("Floating point constant out of range\n"); 2974 return NULL; 2975 } 2976 // Warn if the constant is out of range for a float, I.E it's too big or too 2977 // small 2978 if (is_single && (f < FLT_MIN || f > FLT_MAX)) { 2979 tok_warn( 2980 "Warning: Floating point constant %f is out of range for float\n", f); 2981 } 2982 // Warn if the constant is too precise for a float 2983 if (is_single && fabs((double)((float)f) - f) >= FLT_EPSILON) { 2984 tok_warn("Warning: Converting double precision floating point constant " 2985 "%f to float loses " 2986 "precision\n", 2987 f); 2988 } 2989 return token_create_float(is_single ? TOK_FLOAT_32 2990 : TOK_FLOAT_64, 2991 line, column, f, i); 2992 </pre> 2993 2994 2995 <p class="seealso">Used in section <a href="lexer.html#1:65">65</a></p> 2996 </div> 2997 </div> 2998 <a name="1:72"><div class="section"><h4 class="noheading">72. </h4></a> 2999 <p>If the string contains a number, we pass it to stroll. We need to make sure that the number is in range for the given type and check for errors from strtoll 3000 </p> 3001 3002 <div class="codeblock"> 3003 <span class="codeblock_name">{Convert to integer <a href="lexer.html#1:72">72</a>}</span> 3004 <pre class="prettyprint lang-c"> 3005 errno = 0; 3006 uint64_t int_ = strtoull(buf, NULL, radix); 3007 // Same as above, but for integers 3008 if (errno == ERANGE) { 3009 tok_error("Integer constant out of range\n"); 3010 return NULL; 3011 } 3012 if (is_unsigned) { 3013 if (is_long) { 3014 return token_create_int(TOK_INTEGER_U64, line, column, int_, i); 3015 } else { 3016 if (int_ > UINT32_MAX) { 3017 tok_warn( 3018 "Warning: Integer constant %lld is out of range for unsigned " 3019 "int\n", 3020 int_); 3021 } 3022 return token_create_int(TOK_INTEGER_U32, line, column, int_, i); 3023 } 3024 } else { 3025 if (is_long) { 3026 // If the highest bit is set, that means this will overflow a signed 3027 // long (Due to two's complement) 3028 if (int_ & (1UL << 63)) { 3029 tok_warn( 3030 "Warning: Integer constant %lld is out of range for long long\n", 3031 i); 3032 } 3033 return token_create_int(TOK_INTEGER_S64, line, column, int_, i); 3034 } else { 3035 if (int_ & (1UL << 31)) { 3036 tok_warn("Warning: Integer constant %lld is out of range for int\n", 3037 int_); 3038 } 3039 return token_create_int(TOK_INTEGER_S32, line, column, int_, i); 3040 } 3041 } 3042 </pre> 3043 3044 3045 <p class="seealso">Used in section <a href="lexer.html#1:65">65</a></p> 3046 </div> 3047 </div> 3048 <a name="1:73"><div class="section"><h4 class="noheading">73. </h4></a> 3049 <p>The <code>read_char_constant</code> function reads a character constant from the input stream. It can be a single character or a multi-character escape sequence. 3050 </p> 3051 3052 <div class="codeblock"> 3053 <span class="codeblock_name">{Tokenize Character <a href="lexer.html#1:73">73</a>}</span> 3054 <pre class="prettyprint lang-c"> 3055 static token_t *read_char_constant(void) { 3056 int c; 3057 int len = 0; 3058 c = input_getc(); 3059 if (c != '\'') { 3060 input_ungetc(c); 3061 return NULL; 3062 } 3063 len++; 3064 c = input_getc(); 3065 if (c == '\'') { 3066 tok_error("Empty character constant\n"); 3067 return NULL; 3068 } 3069 if (c == '\\') { 3070 c = read_escape_sequence(&len); 3071 } 3072 int val = c; 3073 c = input_getc(); 3074 if (c != '\'') { 3075 tok_error("Expected closing quote for character constant\n"); 3076 return NULL; 3077 } 3078 len++; 3079 return token_create_char(TOK_CHAR_CONST, line, column, val, len); 3080 } 3081 </pre> 3082 3083 3084 <p class="seealso">Used in section <a href="lexer.html#1:59">59</a></p> 3085 </div> 3086 </div> 3087 <a name="1:74"><div class="section"><h4 class="noheading">74. </h4></a> 3088 <p>The <code>read_string_literal</code> function reads a string literal from the input stream. 3089 </p> 3090 <p>For this function, an automatic-lifetime buffer is used to store the string it becomes too large. At that point, a heap-allocated buffer is used. 3091 This way we can avoid unnecessary heap allocations for small strings. 3092 </p> 3093 3094 <div class="codeblock"> 3095 <span class="codeblock_name">{Tokenize String <a href="lexer.html#1:74">74</a>}</span> 3096 <pre class="prettyprint lang-c"> 3097 <span class="nocode pln">{Read Escape Sequence, <a href="lexer.html#1:75">75</a>}</span> 3098 static token_t *read_string_literal(void) { 3099 int c; 3100 c = input_getc(); 3101 if (c != '"') { 3102 input_ungetc(c); 3103 return NULL; 3104 } 3105 int i = 0; 3106 char s_buf[512]; 3107 char *buf = s_buf; 3108 int len = 512; 3109 int esc_pad = 0; 3110 while ((c = input_getc()) != EOF) { 3111 if (c == '"') { 3112 // Implicit skip of closing quote 3113 break; 3114 } 3115 if (c == '\\') { 3116 c = read_escape_sequence(&esc_pad); 3117 if (c == 0) { 3118 return NULL; 3119 } 3120 } 3121 if (i >= len) { 3122 if (buf == s_buf) { 3123 buf = malloc(1024); 3124 if (buf == NULL) { 3125 fputs("Out of memory. Could not parse string literal.\n", stderr); 3126 exit(1); 3127 } 3128 memcpy(buf, s_buf, 512); 3129 len *= 2; 3130 } else { 3131 len *= 2; 3132 buf = realloc(buf, len); 3133 } 3134 } 3135 buf[i++] = c; 3136 } 3137 buf[i] = '\0'; 3138 if (c == EOF) { 3139 tok_error("Unterminated string literal\n"); 3140 if (buf != s_buf) { 3141 free(buf); 3142 } 3143 return NULL; 3144 } 3145 3146 token_t *tok = token_create_string(TOK_STRING_ASCII, line, column, buf, 3147 i + esc_pad + 2); 3148 if (buf != s_buf) { 3149 free(buf); 3150 } 3151 return tok; 3152 } 3153 </pre> 3154 3155 3156 <p class="seealso">Used in section <a href="lexer.html#1:59">59</a></p> 3157 </div> 3158 </div> 3159 <a name="1:75"><div class="section"><h4 class="noheading">75. </h4></a> 3160 <p>Escape sequences in C can either be single characters or octal/hexadecimal values. We need to handle both cases. 3161 </p> 3162 3163 <div class="codeblock"> 3164 <span class="codeblock_name">{Read Escape Sequence <a href="lexer.html#1:75">75</a>}</span> 3165 <pre class="prettyprint lang-c"> 3166 static char read_escape_sequence(int *len) { 3167 int c = input_getc(); 3168 *len += 1; 3169 switch (c) { 3170 case 'a': 3171 return '\a'; 3172 case 'b': 3173 return '\b'; 3174 case 'f': 3175 return '\f'; 3176 case 'n': 3177 return '\n'; 3178 case 'r': 3179 return '\r'; 3180 case 't': 3181 return '\t'; 3182 case 'v': 3183 return '\v'; 3184 case '\'': 3185 return '\''; 3186 case '"': 3187 return '"'; 3188 case '?': 3189 return '?'; 3190 case '\\': 3191 return '\\'; 3192 case '0': 3193 return '\0'; 3194 case 'x': { 3195 c = input_getc(); 3196 if (!isxdigit(c)) { 3197 tok_error("Invalid hexadecimal escape sequence\n"); 3198 return 0; 3199 } 3200 int val = 0; 3201 while (isxdigit(c)) { 3202 *len += 1; 3203 val = val * 16 + (isdigit(c) ? c - '0' : tolower(c) - 'a' + 10); 3204 c = input_getc(); 3205 } 3206 input_ungetc(c); 3207 return (char)val; 3208 } 3209 default: 3210 if (!isdigit(c)) { 3211 tok_error("Invalid escape sequence\n"); 3212 return 0; 3213 } 3214 int val = 0; 3215 while (isdigit(c)) { 3216 *len += 1; 3217 val = val * 8 + c - '0'; 3218 c = input_getc(); 3219 } 3220 input_ungetc(c); 3221 return (char)val; 3222 } 3223 } 3224 </pre> 3225 3226 3227 <p class="seealso">Used in section <a href="lexer.html#1:74">74</a></p> 3228 </div> 3229 </div> 3230 <a name="1:76"><div class="section"><h4 class="noheading">76. </h4></a> 3231 <p>Finally, I'll add some code for running the tokenizer as its own program. This way we can test it out. 3232 </p> 3233 3234 <div class="codeblock"> 3235 <span class="codeblock_name">{Run Test <a href="lexer.html#1:76">76</a>}</span> 3236 <pre class="prettyprint lang-c"> 3237 char *preprocess(char *in) { 3238 char *output_name = malloc(1024); 3239 snprintf(output_name, 1024, "%s.preprocessed", in); 3240 char *command = malloc(2048); 3241 snprintf(command, 2048, "gcc -E -xc %s -o %s", in, output_name); 3242 system(command); 3243 free(command); 3244 return output_name; 3245 } 3246 3247 // Tokenize the input file 3248 int main(int argc, char **argv) { 3249 if (argc != 2) { 3250 fprintf(stderr, "Usage: %s <input.c>\n", argv[0]); 3251 return 1; 3252 } 3253 char *input_name = argv[1]; 3254 char *preprocessed = preprocess(input_name); 3255 init_tokenizer(preprocessed); 3256 token_t *tok; 3257 while ((tok = next_token()) != NULL) { 3258 print_token(tok); 3259 token_destroy(tok); 3260 } 3261 destroy_tokenizer(); 3262 remove(preprocessed); 3263 free(preprocessed); 3264 hash_table_destroy(string_table); 3265 return 0; 3266 } 3267 </pre> 3268 3269 3270 <p class="seealso">Used in section <a href="lexer.html#1:59">59</a></p> 3271 </div> 3272 <h3> Bugs/Errata</h3> 3273 <p>I wrote this code in a single sitting, so there are bound to be bugs. I'll list them here as I find them. The code you see here is the final version, with all bugs fixed. 3274 </p> 3275 <ul> 3276 <li>had <code>buffer_pos == buffer_size - 1</code>, left in from trying to plug some code for lookahead in, didn't work out, but I forgot to remove it, causes fallthrough to <code>buffer_size == 0</code> check which if true returns EOF, preventing input initialization. Fixed by changing to <code>buffer_pos == buffer_size</code>. 3277 </li> 3278 <li>assertion <code>token->kind == TOK_STRING_ASCII</code> failed in token_string. Forgot to expand check for identifiers which also use token_string. Fixed by changing to <code>token->kind == TOK_STRING_ASCII || token->kind == TOK_ID || token->kind == TOK_TID</code>. 3279 </li> 3280 <li>token_create_string - call to <code>hash_table_get</code> with freed key. Fixed by moving the call to free after the call to <code>hash_table_get</code>. 3281 </li> 3282 <li>ibid - Design of hash table and call to <code>hash_table_get</code> in token_create_string created double free. Fixed by rewriting part of function. 3283 </li> 3284 <li>Tokenizer missing code to handle GCC preprocessor line directives. Fixed by adding code to handle them. 3285 </li> 3286 <li>Destructor for string literals missing in tokenizer teardown, added it in. 3287 </li> 3288 <li>read_number - check <code>int_ > INT32_MAX</code> does not work due to some weird casting. Added explicit cast. 3289 </li> 3290 <li>read_char_constant - Forgot to handle '\\0'. Added. 3291 </li> 3292 <li>skip_whitespace - When a division operator occurs in code, skip_whitespace assumes it's a comment. Fixed by adding a check for division operator. 3293 </li> 3294 <li>hash_string - Optimization, not a bug, Dragon Book hash function not very fast due to misprediction. Replaced with ELF hash function. 3295 </li> 3296 <li>read_identifier - strlen gets called 3 times even though we already get the string len by incrementing an array index. Ew. Used i instead of strlen. 3297 </li> 3298 <li>read_identifier - stringized version of keywords stored, not needed. Code added to call token_create instead of token_create_string for keywords. 3299 </li> 3300 <li>Everywhere - Checks added for memory allocation failure. 3301 </li> 3302 <li>Not a bug. Removed the seperate token type for TID. Will try to handle in the parser. 3303 </li> 3304 </ul> 3305 <h3> Conclusion</h3> 3306 <p>That's it! The C Minus tokenizer is complete. It's hopefully pretty understandable, and given the testing I've put it through, it should be pretty robust. 3307 </p> 3308 <p>Next time, we'll start on the parser. 3309 </p> 3310 <h1> Source code, biblography</h1> 3311 <p>Source code for the tokenizer is available <a href="/projects/cminus/code/tokenizer.c">here</a>, header file is available <a href="/projects/cminus/code/tokenizer.h">here</a>. 3312 </p> 3313 <p>Source code for the input module is available <a href="/projects/cminus/code/input.c">here</a>, header file is available <a href="/projects/cminus/code/input.h">here</a>. 3314 </p> 3315 <p>Source code for the hash table is available <a href="/projects/cminus/code/hash_table.c">here</a>, header file is available <a href="/projects/cminus/code/hash_table.h">here</a>. 3316 </p> 3317 <p>Source code for the token module is available <a href="/projects/cminus/code/token.c">here</a>, header file is available <a href="/projects/cminus/code/token.h">here</a>. 3318 </p> 3319 <p>A lot of the logic for this project is from either the Dragon Book, Engineering a Compiler, or LCC: A Retargetable Compiler for ANSI C. Grammars are from The C Reference Manual. 3320 </p> 3321 <p>I got the idea for using zero-width arrays for optional content (the struct hack) from hacking on some weird virtio drivers (they seem to love it). 3322 </p> 3323 <p>Crafting Interpreters by Bob Nystrom inspired me to do this project, so if you see any similarities, there's probably some unintentional influence there. 3324 </p> 3325 <p>The code for the ELF hash function is from the glibc source code. 3326 </p> 3327 <p>The idea for decision trees came from LCC. 3328 </p> 3329 <p>Literate programming rendered using <a href="https://github.com/zyedidia/Literate">literate</a>. 3330 </p> 3331 <p> <footer style=" text-align: center; padding: 20px;"> 3332 <p>© 2024 Reagan Fischer. If for some reason you want to use my AMAZING code (lol), it's available under the MIT 3333 license <a href="/projects/cminus/code/LICENSE.md">here</a>.</p> 3334 </footer> 3335 </p> 3336 3337 </div> 3338 </body>