參加ThoughtWorks University的這一個來月沒啥事情,閑了就寫寫compiler玩。發現Lexer部分比較基礎也比較常用,有很多相似的東西,每次都要重寫一遍也太麻煩了。下面是我按著JLS(Java Language Specification)寫的一個common java-like lexer,對于大多數接近java語法的語言估計是夠用了。BTW:這個Lexer定義是TDD出來的,以通過測試為要務,可能可讀性不太強。
1.WhiteSpace
// Whitespace: one or more spaces, tabs, form feeds or line terminators.
// The whole run is matched as a single token and then discarded.
WhiteSpace
    :   (   ' '                             // ASCII SP
        |   '\t'                            // ASCII HT
        |   '\f'                            // ASCII FF
        |   LineTerminator { newline(); }   // keep the lexer's line counter in step
        )+
        { $setType(Token.SKIP); }
    ;
// One line terminator: LF, CR, or CR immediately followed by LF.
//
// CR and CR LF are folded into a single alternative with a greedy optional LF.
// Every caller invokes newline() once per LineTerminator, so a CR LF pair must
// be consumed by ONE match; listing '\r' ahead of "\r\n" with ambiguity
// warnings disabled let CR LF be lexed as two terminators (two newline() calls).
protected LineTerminator
    :   '\n'                                                    // ASCII LF
    |   '\r'                                                    // ASCII CR ...
        ( options { warnWhenFollowAmbig = false; } : '\n' )?    // ... optionally followed by ASCII LF
    ;
2.Comments
// A comment of either flavour; the matched text is discarded.
Comment
    :   (   SingleLineComment
        |   MultiLineComment
        )
        { $setType(Token.SKIP); }
    ;
// "//" up to the end of the line. The terminator is optional so that a
// comment on the last line of the input (no trailing newline) still matches.
protected SingleLineComment
    :   "//"
        ( ~( '\n' | '\r' ) )*
        ( LineTerminator { newline(); } )?
    ;
// "/*" ... "*/", possibly spanning several lines.
//
// A '*' inside the body is accepted as long as it is not the start of the
// closing "*/" (checked with a one-character peek), so comments such as
// "/* a * b */" or "/** doc */" are recognised. Previously every '*' was
// excluded from the body, which made those comments a lexical error.
protected MultiLineComment
    :   "/*"
        (   // The '*' alternative overlaps with the loop exit ("*/"); the
            // semantic predicate resolves it, so the warning is just noise.
            options { generateAmbigWarnings = false; }
        :   { LA(2) != '/' }? '*'
        |   LineTerminator { newline(); }
        |   ~( '*' | '\n' | '\r' )
        )*
        "*/"
    ;
3.Escape Sequences
// A backslash escape inside a string or character literal.
// The leading backslash is dropped from the token text ('!') and the rest is
// replaced by the character the escape denotes. Unicode escapes are delegated
// to UnicodeEscape.
protected EscapeSequence
    :   '\\'!
        (   'n'  { $setText("\n"); }
        |   'r'  { $setText("\r"); }
        |   't'  { $setText("\t"); }
        |   'b'  { $setText("\b"); }
        |   'f'  { $setText("\f"); }
        |   '"'
        |   '\''
        |   '\\'
        // octal escape, up to three digits when the first digit is 0..3
        |   '0'..'3'
            (   options { warnWhenFollowAmbig = false; }
            :   '0'..'7'
                ( options { warnWhenFollowAmbig = false; } : '0'..'7' )?
            )?
            { char octalValue = (char) Integer.parseInt($getText, 8); $setText(octalValue); }
        // octal escape, at most two digits when the first digit is 4..7
        |   '4'..'7'
            ( options { warnWhenFollowAmbig = false; } : '0'..'7' )?
            { char octalValue = (char) Integer.parseInt($getText, 8); $setText(octalValue); }
        )
    |   ( "\\u" ) => UnicodeEscape
    ;
// A Unicode escape: backslash, one or more 'u', then exactly four hex digits.
// The backslash and the 'u' run are removed from the text, so that $getText
// holds only the hex digits when they are converted to a single char.
protected UnicodeEscape
    :   '\\'!
        ( 'u' )+ { $setText(""); }
        HexDigit HexDigit HexDigit HexDigit
        { char codeUnit = (char) Integer.parseInt($getText, 16); $setText(codeUnit); }
    ;
// One hexadecimal digit, either case.
protected HexDigit
    :   '0'..'9'
    |   'a'..'f'
    |   'A'..'F'
    ;
27?
這個東西比較麻煩,種類很多:有像\t \n \r這樣的escape,也有\uuu1234這樣的unicode escape,還有octal escape。說實話,octal escape還是這次寫compiler的時候新發現的,以前還真不知道有這么個東西,也從來沒用過...汗啊...octal escape是對于0到255(即\0到\377)之間的數,可以用\012這樣的八進制數表示,這個東西沒想明白有什么用。反正JLS上寫了,就按這個來吧。
4. String & Character Literal
// A double-quoted string. The surrounding quotes are stripped from the token
// text ('!'); escapes are decoded by EscapeSequence.
StringLiteral
    :   '"'!
        (   EscapeSequence
        |   ~'"'
        )*
        '"'!
    ;
// A single-quoted character. The surrounding quotes are stripped from the
// token text ('!'); escapes are decoded by EscapeSequence.
//
// The unescaped alternative excludes the single quote, the delimiter of THIS
// literal. It previously excluded the double quote (copied from
// StringLiteral), which rejected '"' and wrongly accepted a bare quote
// between the delimiters.
CharacterLiteral
    :   '\''!
        (   EscapeSequence
        |   ~'\''
        )?
        '\''!
    ;
5. NumericLiteral
// Entry rule for everything that may start with a sign, a digit or a dot:
// the ".end" / ".max" / other dot-directives, and the numeric literals.
// Each sub-rule reports the concrete token type, which becomes the type of
// the token produced here. A leading '+' is dropped from the text; a leading
// '-' is kept.
NumericLiteral
    options { testLiterals = true; }
    { int resolved = 0; }
    :   (   ( ".end" )       => resolved = EndOfDirective
        |   ( ".max" )       => resolved = MaxDirective
        |   ( '.' 'a'..'z' ) => resolved = Directives
        |   ( '+'! | '-' )?
            (   resolved = IntegerLiteral
            |   resolved = HexIntegerLiteral
            |   resolved = DoubleLiteral
            )
        )
        { $setType(resolved); }
    ;
11
// Decimal and octal integers, plus floating-point numbers that begin with an
// integer part (for example 0.5 or 10E+5).
//
// The token type starts as DecimalIntegerLiteral and is refined while
// matching: octal digits after a leading 0, a long suffix, a fraction or
// exponent, and finally a float suffix each adjust it. The final type is
// handed back to the caller through the return value. Suffix characters and
// the leading 0 of an octal number are dropped from the text ('!').
protected IntegerLiteral
    returns [int type = 0]
    { $setType(DecimalIntegerLiteral); }
    :   (   '0'
        |   '0'! ( '0'..'7' { $setType(OctalIntegerLiteral); } )+
        |   '1'..'9' ( '0'..'9' )*
        )
        (   (   LongTypeSuffix!
                {
                    if (_ttype == OctalIntegerLiteral) {
                        $setType(OctalLongLiteral);
                    } else {
                        $setType(DecimalLongLiteral);
                    }
                }
            )?
        |   // only a decimal integer part may continue as a floating-point number
            { _ttype == DecimalIntegerLiteral }?
            ( FloatingPointPart | ExponentPart ) { $setType(DoubleLiteral); }
            ( DoubleTypeSuffix! | FloatTypeSuffix! { $setType(FloatLiteral); } )?
        )
        { type = _ttype; }
    ;
// Hexadecimal integers: 0x / 0X followed by one or more hex digits and an
// optional long suffix. The prefix and the suffix are dropped from the text.
// NOTE(review): without a suffix the returned type is whatever _ttype holds on
// entry - presumably this rule's own token type, as in ANTLR 2 generated
// lexers; confirm against the generated code.
protected HexIntegerLiteral
    returns [int type = 0]
    :   (   '0'! ( 'x'! | 'X'! )
            ( HexDigit )+
            ( LongTypeSuffix! { $setType(HexLongLiteral); } )?
        )
        { type = _ttype; }
    ;
// Floating-point numbers that start directly with the decimal point (".5").
// A float suffix switches the type to FloatLiteral; suffix characters are
// dropped from the text. The resulting type is returned to the caller.
protected DoubleLiteral
    returns [int type = 0]
    :   (   FloatingPointPart
            (   DoubleTypeSuffix!
            |   FloatTypeSuffix! { $setType(FloatLiteral); }
            )?
        )
        { type = _ttype; }
    ;
// The fractional tail of a number: '.', at least one digit, optional exponent.
protected FloatingPointPart
    :   '.'
        ( '0'..'9' )+
        ( ExponentPart )?
    ;
// Exponent: 'e' or 'E', an optional sign, then at least one digit.
protected ExponentPart
    :   ( 'E' | 'e' )
        ( '+' | '-' )?
        ( '0'..'9' )+
    ;
// Type suffix letters, accepted in either case.
protected LongTypeSuffix
    :   'l' | 'L'
    ;
protected DoubleTypeSuffix
    :   'd' | 'D'
    ;
protected FloatTypeSuffix
    :   'f' | 'F'
    ;
這個是最復雜的一部分...
Unit Test比較長,節選吧
??1?public?void?testShouldIgnoreWhiteSpaces()?throws?Exception?{
??2???assertRecognized(OctaneTokenTypes.EOF,?"?");
??3???assertRecognized(OctaneTokenTypes.EOF,?"\t");
??4???assertRecognized(OctaneTokenTypes.EOF,?"\f");
??5?}
??6?
??7?public?void?testShouldIgnoreLineTerminators()?throws?Exception?{
??8???assertRecognized(OctaneTokenTypes.EOF,?"\r");
??9???assertRecognized(OctaneTokenTypes.EOF,?"\n");
?10???assertRecognized(OctaneTokenTypes.EOF,?"\r\n");
?11?}
?12?
?13?public?void?testShouldIgnoreSingleLineComment()?throws?Exception?{
?14???assertRecognized(OctaneTokenTypes.EOF,?"//?comments?1234?&*^$\n");
?15?}
?16?
?17?public?void?testShouldIgnoreMultiLineComment()?throws?Exception?{
?18???assertRecognized(OctaneLexer.EOF,?"/*?comment?line?1\ncomment?line?2\n*/");
?19?}
?20?
?21?public?void?testShouldIncreaseLineNumberIfLineTerminatorsGiven()?throws?Exception?{
?22???assertEquals(2,?createLexer("\r").nextToken().getLine());
?23???assertEquals(2,?createLexer("\n").nextToken().getLine());
?24???assertEquals(2,?createLexer("\r\n").nextToken().getLine());
?25?}
?26?
?27?public?void?testShouldRecognizeBasicEscapeInCharacterLiteral()?throws?Exception?{
?28???assertRecognized(OctaneTokenTypes.CharacterLiteral,?"\n",?"'\\n'");
?29???assertRecognized(OctaneTokenTypes.CharacterLiteral,?"\r",?"'\\r'");
?30???assertRecognized(OctaneTokenTypes.CharacterLiteral,?"\t",?"'\\t'");
?31???assertRecognized(OctaneTokenTypes.CharacterLiteral,?"\b",?"'\\b'");
?32???assertRecognized(OctaneTokenTypes.CharacterLiteral,?"\f",?"'\\f'");
?33???assertRecognized(OctaneTokenTypes.CharacterLiteral,?"\"",?"'\\\"'");
?34???assertRecognized(OctaneTokenTypes.CharacterLiteral,?"\\",?"'\\\\'");
?35???assertRecognized(OctaneTokenTypes.CharacterLiteral,?"\'",?"'\\\''");
?36?}
?37?
?38?public?void?testShouldRecognizeBasicEscapeInStringLiteral()?throws?Exception?{
?39???assertRecognized(OctaneTokenTypes.StringLiteral,?"\n",?"\"\\n\"");
?40???assertRecognized(OctaneTokenTypes.StringLiteral,?"\r",?"\"\\r\"");
?41???assertRecognized(OctaneTokenTypes.StringLiteral,?"\t",?"\"\\t\"");
?42???assertRecognized(OctaneTokenTypes.StringLiteral,?"\b",?"\"\\b\"");
?43???assertRecognized(OctaneTokenTypes.StringLiteral,?"\f",?"\"\\f\"");
?44???assertRecognized(OctaneTokenTypes.StringLiteral,?"\"",?"\"\\\"\"");
?45???assertRecognized(OctaneTokenTypes.StringLiteral,?"\\",?"\"\\\\\"");
?46???assertRecognized(OctaneTokenTypes.StringLiteral,?"\'",?"\"\\\'\"");
?47?}
?48?
?49?public?void?testShouldRecognizeOctalEscapeInCharacterLiteral()?throws?Exception?{
?50???assertRecognized(OctaneTokenTypes.CharacterLiteral,?"\077",?"'\\077'");
?51???assertRecognized(OctaneTokenTypes.CharacterLiteral,?"\77",?"'\\77'");
?52???assertRecognized(OctaneTokenTypes.CharacterLiteral,?"\37",?"'\\37'");
?53???assertRecognized(OctaneTokenTypes.CharacterLiteral,?"\7",?"'\\7'");
?54?}
?55?
?56?public?void?testShouldRecognizeOctalEscapeInStringLiteral()?throws?Exception?{
?57???assertRecognized(OctaneTokenTypes.StringLiteral,?"\077",?"\"\\077\"");
?58???assertRecognized(OctaneTokenTypes.StringLiteral,?"\77",?"\"\\77\"");
?59???assertRecognized(OctaneTokenTypes.StringLiteral,?"\37",?"\"\\37\"");
?60???assertRecognized(OctaneTokenTypes.StringLiteral,?"\7",?"\"\\7\"");
?61?}
?62?
?63?public?void?testShouldRecognizeUnicodeEscapeInCharacterLiteral()?throws?Exception?{
?64???assertRecognized(OctaneTokenTypes.CharacterLiteral,?"\u1234",?"'\\u1234'");
?65???assertRecognized(OctaneTokenTypes.CharacterLiteral,?"\uu1234","'\\uu1234\'");
?66?}
?67?
?68?public?void?testShouldRecognizeUnicodeEscapeInStringLiteral()?throws?Exception?{
?69???assertRecognized(OctaneTokenTypes.StringLiteral,?"\u1234",?"\"\\u1234\"");
?70???assertRecognized(OctaneTokenTypes.StringLiteral,?"\uu1234",?"\"\\uu1234\"");
?71?}
?72?
?73?public?void?testShouldRecognizeUnicodeInStringLiteral()?throws?Exception?{
?74???assertRecognized(OctaneTokenTypes.StringLiteral,?"\"這是一行中文\"");
?75?}
?76?
?77?public?void?testShouldRecognizeDecimalIntegerLiteral()?throws?Exception?{
?78???assertRecognized(OctaneTokenTypes.DecimalIntegerLiteral,?"0",?"0");
?79???assertRecognized(OctaneTokenTypes.DecimalIntegerLiteral,?"-123",?"-123");
?80?}
?81?
?82?public?void?testShouldRecognizeDecimalLongLiteral()?throws?Exception?{
?83???assertRecognized(OctaneTokenTypes.DecimalLongLiteral,?"0",?"0l");
?84???assertRecognized(OctaneTokenTypes.DecimalLongLiteral,?"-123",?"-123L");
?85?}
?86?
?87?public?void?testShouldRecognizeHexIntegerLiteral()?throws?Exception?{
?88???assertRecognized(OctaneTokenTypes.HexIntegerLiteral,?"1A3B",?"+0x1A3B");
?89???assertRecognized(OctaneTokenTypes.HexIntegerLiteral,?"-1A3B",?"-0x1A3B");
?90?}
?91?
?92?public?void?testShouldRecognizeHexLongLiteral()?throws?Exception?{
?93???assertRecognized(OctaneTokenTypes.HexLongLiteral,?"1A3B",?"+0x1A3BL");
?94???assertRecognized(OctaneTokenTypes.HexLongLiteral,?"-1A3F",?"-0x1A3Fl");
?95?}
?96?
?97?public?void?testShouldRecognizeOctalIntegerLiteral()?throws?Exception?{
?98???assertRecognized(OctaneTokenTypes.OctalIntegerLiteral,?"123",?"+0123");
?99???assertRecognized(OctaneTokenTypes.OctalIntegerLiteral,?"-123",?"-0123");
100?}
101?
102?public?void?testShouldRecognizeOctalLongLiteral()?throws?Exception?{
103???assertRecognized(OctaneTokenTypes.OctalLongLiteral,?"1237",?"+01237L");
104???assertRecognized(OctaneTokenTypes.OctalLongLiteral,?"-1237",?"-01237l");
105?}
106?
107?public?void?testShouldRecognizeDoubleLiteral()?throws?Exception?{
108???assertRecognized(OctaneTokenTypes.DoubleLiteral,?"0.5",?"+0.5");
109???assertRecognized(OctaneTokenTypes.DoubleLiteral,?"-.5",?"-.5");
110???assertRecognized(OctaneTokenTypes.DoubleLiteral,?"0.5",?"+0.5D");
111???assertRecognized(OctaneTokenTypes.DoubleLiteral,?"-.5",?"-.5d");
112?}
113?
114?public?void?testShouldRecognizeDoubleLiteralInExponentialForm()?throws?Exception?{
115???assertRecognized(OctaneTokenTypes.DoubleLiteral,?"0.5e+10",?"+0.5e+10");
116???assertRecognized(OctaneTokenTypes.DoubleLiteral,?"-.5E-10",?"-.5E-10");
117???assertRecognized(OctaneTokenTypes.DoubleLiteral,?"0.5E+5",?"+0.5E+5D");
118???assertRecognized(OctaneTokenTypes.DoubleLiteral,?"-.5E-5",?"-.5E-5d");
119???assertRecognized(OctaneTokenTypes.DoubleLiteral,?"10E+5",?"+10E+5d");
120???assertRecognized(OctaneTokenTypes.DoubleLiteral,?"-10e-5",?"-10e-5D");
121?}
122?
123?public?void?testShouldRecognizeFloatLiteral()?throws?Exception?{
124???assertRecognized(OctaneTokenTypes.FloatLiteral,?"0.5",?"+0.5F");
125???assertRecognized(OctaneTokenTypes.FloatLiteral,?"-.5",?"-.5f");
126???assertRecognized(OctaneTokenTypes.FloatLiteral,?"10E+5",?"+10E+5f");
127???assertRecognized(OctaneTokenTypes.FloatLiteral,?"-10e-5",?"-10e-5F");
128?}
129?
130?public?void?testShouldRecognizeFloatLiteralInExponentialForm()?throws?Exception?{
131???assertRecognized(OctaneTokenTypes.FloatLiteral,?"0.5E+5",?"+0.5E+5F");
132???assertRecognized(OctaneTokenTypes.FloatLiteral,?"-.5e-5",?"-.5e-5f");
133?}
134?
135?protected?void?assertRecognized(int?tokenType,?String?sourceString)?throws?Exception?{
136???assertRecognized(tokenType,?null,?sourceString);
137?
138?}
139?
140?protected?void?assertRecognized(int?tokenType,?String?exceptedText,?String?sourceString)?throws?Exception?{
141???assertRecognized(new?int[]?{?tokenType?},?exceptedText?==?null???null?:?new?String[]?{?exceptedText?},?sourceString);
142?}
143?
144?protected?void?assertRecognized(int[]?tokenTypes,?String[]?exceptedText,?String?sourceString)?throws?TokenStreamException?{
145???TokenStream?lexer?=?createLexer(sourceString);
146???for?(int?i?=?0;?i?<?tokenTypes.length;?i++)?{
147?????Token?token?=?lexer.nextToken();
148?????assertEquals(tokenTypes[i],?token.getType());
149?????if?(exceptedText?!=?null)?assertEquals(exceptedText[i],?token.getText());
150???}
151???assertEquals(OctaneTokenTypes.EOF,?lexer.nextToken().getType());
152?}