/*******************************
 ****** Cormac Redmond *********
 ****** credmond85 /at/ gmail.com
 *******************************/

/********************************
 ****** SECTION 1 - OPTIONS ******
 ********************************/

options {
  JAVA_UNICODE_ESCAPE = true;
}

/**********************************
 ****** SECTION 2 - USER CODE ******
 **********************************/

PARSER_BEGIN(TigerTokeniser)

/**
 * Command-line driver that tokenises Tiger source and prints one line per
 * token in the form {@code KIND:  (image) }.
 *
 * Input is read from standard input when no argument is given, or from the
 * file named by the single argument.
 */
public class TigerTokeniser {

  /**
   * Entry point.
   *
   * @param args empty to read standard input, or exactly one file name
   */
  public static void main(String args[]) {
    java.io.InputStream input;

    if (args.length == 0) {
      System.out.println("Reading from standard input ...");
      input = System.in;
    } else if (args.length == 1) {
      System.out.println("Reading from file " + args[0] + " . . .");
      try {
        input = new java.io.FileInputStream(args[0]);
      } catch (java.io.FileNotFoundException e) {
        System.out.println("File " + args[0] + " not found.");
        return;
      }
    } else {
      System.out.println("Tiger Tokeniser: Usage is one of:");
      System.out.println(" java TigerTokeniser < inputfile");
      System.out.println("OR");
      System.out.println(" java TigerTokeniser inputfile");
      return;
    }

    try {
      // The generated parser is static (JavaCC default), so constructing it
      // initialises the shared token manager used by getNextToken() below.
      new TigerTokeniser(input);

      String[] tokenDesc = buildTokenDescriptions();

      // Get every token found
      for (Token t = getNextToken(); t.kind != EOF; t = getNextToken()) {
        // Print out type of token found
        System.out.print(tokenDesc[t.kind] + ": ");
        // Print out token's value
        System.out.print(" (" + t.image + ") " + "\n");
      }

      // EOF inside a comment is returned as a plain EOF token by the token
      // manager, so the only trace of it is a non-zero nesting counter.
      if (TigerTokeniserTokenManager.commentNesting != 0) {
        System.err.println("Warning: unterminated comment at end of input.");
      }
    } catch (TokenMgrError e) {
      // Raised for input the lexer cannot match, e.g. an unterminated string.
      System.err.println("Lexical error: " + e.getMessage());
    } finally {
      // Only close streams this method opened itself.
      if (input != System.in) {
        try {
          input.close();
        } catch (java.io.IOException ignored) {
          // Nothing useful can be done if closing a read-only stream fails.
        }
      }
    }
  }

  /**
   * Builds a table mapping each token kind to an English description.
   * This is nicer than doing many 'if' statements for each token when
   * printing them.
   *
   * The table is sized from the generated tokenImage array so it always has
   * a slot for every token kind. Kinds that are never emitted as tokens
   * (LETTER, DIGIT, IN_COMMENT) are intentionally left without an entry.
   *
   * @return array indexed by token kind
   */
  private static String[] buildTokenDescriptions() {
    String[] tokenDesc = new String[tokenImage.length];

    // Reserved words
    tokenDesc[ARRAY] = "ARRAY";
    tokenDesc[BREAK] = "BREAK";
    tokenDesc[DO] = "DO";
    tokenDesc[ELSE] = "ELSE";
    tokenDesc[END] = "END";
    tokenDesc[FOR] = "FOR";
    tokenDesc[FUNCTION] = "FUNCTION";
    tokenDesc[IF] = "IF";
    tokenDesc[IN] = "IN";
    tokenDesc[LET] = "LET";
    tokenDesc[NIL] = "NIL";
    tokenDesc[OF] = "OF";
    tokenDesc[THEN] = "THEN";
    tokenDesc[TO] = "TO";
    tokenDesc[TYPE] = "TYPE";
    tokenDesc[VAR] = "VAR";
    tokenDesc[WHILE] = "WHILE";

    // Operators
    tokenDesc[PLUS_SIGN] = "PLUS_SIGN";
    tokenDesc[MINUS_SIGN] = "MINUS_SIGN";
    tokenDesc[MULT_SIGN] = "MULT_SIGN";
    tokenDesc[DIV] = "DIV";
    tokenDesc[AND] = "AND";
    tokenDesc[OR] = "OR";
    tokenDesc[EQUALS] = "EQUALS";
    tokenDesc[NEQ] = "NEQ";
    tokenDesc[LT] = "LT";
    tokenDesc[LTE] = "LTE";
    tokenDesc[GT] = "GT";
    tokenDesc[GTE] = "GTE";
    tokenDesc[ASSIGN] = "ASSIGN";

    // Punctuation
    tokenDesc[SEMIC] = "SEMIC";
    tokenDesc[COMMA] = "COMMA";
    tokenDesc[COLON] = "COLON";
    tokenDesc[DOT] = "DOT";
    tokenDesc[LBR] = "LBR";
    tokenDesc[RBR] = "RBR";
    tokenDesc[LSQB] = "LSQB";
    tokenDesc[RSQB] = "RSQB";
    tokenDesc[LCURLB] = "LCURLB";
    tokenDesc[RCURLB] = "RCURLB";

    // Identifiers, literals and the catch-all
    tokenDesc[IDENTIFIER] = "IDENTIFIER";
    tokenDesc[INTEGER] = "INTEGER";
    tokenDesc[STRING] = "STRING";
    tokenDesc[OTHER] = "OTHER";

    return tokenDesc;
  }
}

PARSER_END(TigerTokeniser)

/*******************
 ****** TOKENS ******
 *******************/

TOKEN_MGR_DECLS :
{
  /* Depth of the comment currently being skipped; 0 outside any comment. */
  static int commentNesting = 0;
}

SKIP : /* Skip white space */
{
  " "
| "\t"
| "\n"
| "\r"
| "\f"
}

SKIP : /* Skip comments: an opening delimiter enters the IN_COMMENT state */
{
  "/*" { commentNesting++; } : IN_COMMENT
}

<IN_COMMENT> SKIP : /* Comments nest; return to DEFAULT only when the outermost one closes */
{
  "/*" { commentNesting++; }
| "*/" { commentNesting--;
         if (commentNesting == 0)
           SwitchTo(DEFAULT);
       }
| <~[]>
}

TOKEN : /* Reserved Words */
{
  < ARRAY : "array" >
| < BREAK : "break" >
| < DO : "do" >
| < ELSE : "else" >
| < END : "end" >
| < FOR : "for" >
| < FUNCTION : "function" >
| < IF : "if" >
| < IN : "in" >
| < LET : "let" >
| < NIL : "nil" >
| < OF : "of" >
| < THEN : "then" >
| < TO : "to" >
| < TYPE : "type" >
| < VAR : "var" >
| < WHILE : "while" >
}

TOKEN : /* Operators */
{
  < PLUS_SIGN : "+" >
| < MINUS_SIGN : "-" >
| < MULT_SIGN : "*" >
| < DIV : "/" >
| < AND : "&" >
| < OR : "|" >
| < EQUALS : "=" >
| < NEQ : "<>" >
| < LT : "<" >
| < LTE : "<=" >
| < GT : ">" >
| < GTE : ">=" >
| < ASSIGN : ":=" >
}

TOKEN : /* Punctuation */
{
  < SEMIC : ";" >
| < COMMA : "," >
| < COLON : ":" >
| < DOT : "." >
| < LBR : "(" >
| < RBR : ")" >
| < LSQB : "[" >
| < RSQB : "]" >
| < LCURLB : "{" >
| < RCURLB : "}" >
}

TOKEN : /* Identifiers and integers */
{
  < IDENTIFIER : <LETTER> (<LETTER> | <DIGIT> | "_")* >
| < INTEGER : (<DIGIT>)+ >
| < #LETTER : ["A"-"Z", "a"-"z"] >
| < #DIGIT : ["0"-"9"] >
}

MORE : /* Strings: the opening quote starts accumulating the token image */
{
  "\"" : WITHIN_STRING
}

<WITHIN_STRING> TOKEN : /* The closing quote completes the STRING token */
{
  /* Assigned the altered 'image' to matchedToken.image */
  < STRING : "\"" > { matchedToken.image = image.toString(); } : DEFAULT
}

<WITHIN_STRING> MORE :
{
  /* Any ordinary character */
  < ~["\\","\""] >
  /* Escapes: \n \t \\ \" , \ddd (three decimal digits) and \^c control characters */
| < "\\" (["n", "t", "\\", "\""] | <DIGIT> <DIGIT> <DIGIT> | "^" ["A"-"Z", "a"-"z"]) >
  /* When we encounter whitespace between two \'s, remove it, including the \'s */
| < "\\" ([" ","\t","\n","\r"])+ "\\" >
    { image.delete(image.length() - lengthOfMatch, image.length()); } : WITHIN_STRING
}

TOKEN : /* Anything else that we don't recognise */
{
  < OTHER : ~[] >
}