blob: 2b0a056df80e2d4b247b4532e43265e804206327 [file] [log] [blame]
The Android Open Source Projectf8057102009-03-15 16:47:16 -07001package java_cup;
2
3import java.util.Hashtable;
4
5import java_cup.runtime.str_token;
6import java_cup.runtime.token;
7
8/** This class implements a small scanner (aka lexical analyzer or lexer) for
9 * the JavaCup specification. This scanner reads characters from standard
10 * input (System.in) and returns integers corresponding to the terminal
11 * number of the next token. Once end of input is reached the EOF token is
12 * returned on every subsequent call.<p>
13 * Tokens currently returned include: <pre>
14 * Symbol Constant Returned Symbol Constant Returned
15 * ------ ----------------- ------ -----------------
16 * "package" PACKAGE "import" IMPORT
17 * "code" CODE "action" ACTION
18 * "parser" PARSER "terminal" TERMINAL
19 * "non" NON "init" INIT
20 * "scan" SCAN "with" WITH
21 * "start" START ; SEMI
22 * , COMMA * STAR
23 * . DOT : COLON
24 * ::= COLON_COLON_EQUALS | BAR
25 * identifier ID {:...:} CODE_STRING
26 * "debug" DEBUG
27 * </pre>
28 * All symbol constants are defined in sym.java which is generated by
29 * JavaCup from parser.cup.<p>
30 *
31 * In addition to the scanner proper (called first via init() then with
32 * next_token() to get each token) this class provides simple error and
33 * warning routines and keeps a count of errors and warnings that is
34 * publicly accessible.<p>
35 *
36 * This class is "static" (i.e., it has only static members and methods).
37 *
38 * @version last updated: 11/25/95
39 * @author Scott Hudson
40 */
41public class lexer {
42
43 /*-----------------------------------------------------------*/
44 /*--- Constructor(s) ----------------------------------------*/
45 /*-----------------------------------------------------------*/
46
47 /** The only constructor is private, so no instances can be created. */
48 private lexer() { }
49
50 /*-----------------------------------------------------------*/
51 /*--- Static (Class) Variables ------------------------------*/
52 /*-----------------------------------------------------------*/
53
54 /** First character of lookahead. */
55 protected static int next_char;
56
57 /** Second character of lookahead. */
58 protected static int next_char2;
59
60 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
61
62 /** EOF constant. */
63 protected static final int EOF_CHAR = -1;
64
65 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
66
67 /** Table of keywords. Keywords are initially treated as identifiers.
68 * Just before they are returned we look them up in this table to see if
69 * they match one of the keywords. The string of the name is the key here,
70 * which indexes Integer objects holding the symbol number.
71 */
72 protected static Hashtable keywords = new Hashtable(23);
73
74 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
75
76 /** Table of single character symbols. For ease of implementation, we
77 * store all unambiguous single character tokens in this table of Integer
78 * objects keyed by Integer objects with the numerical value of the
79 * appropriate char (currently Character objects have a bug which precludes
80 * their use in tables).
81 */
82 protected static Hashtable char_symbols = new Hashtable(11);
83
84 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
85
86 /** Current line number for use in error messages. */
87 protected static int current_line = 1;
88
89 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
90
91 /** Character position in current line. */
92 protected static int current_position = 1;
93
94 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
95
96 /** Count of total errors detected so far. */
97 public static int error_count = 0;
98
99 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
100
101 /** Count of warnings issued so far */
102 public static int warning_count = 0;
103
104 /*-----------------------------------------------------------*/
105 /*--- Static Methods ----------------------------------------*/
106 /*-----------------------------------------------------------*/
107
108 /** Initialize the scanner. This sets up the keywords and char_symbols
109 * tables and reads the first two characters of lookahead.
110 */
111 public static void init() throws java.io.IOException
112 {
113 /* set up the keyword table */
114 keywords.put("package", new Integer(sym.PACKAGE));
115 keywords.put("import", new Integer(sym.IMPORT));
116 keywords.put("code", new Integer(sym.CODE));
117 keywords.put("action", new Integer(sym.ACTION));
118 keywords.put("parser", new Integer(sym.PARSER));
119 keywords.put("terminal", new Integer(sym.TERMINAL));
120 keywords.put("non", new Integer(sym.NON));
121 keywords.put("init", new Integer(sym.INIT));
122 keywords.put("scan", new Integer(sym.SCAN));
123 keywords.put("with", new Integer(sym.WITH));
124 keywords.put("start", new Integer(sym.START));
125 keywords.put("debug", new Integer(sym.DEBUG));
126
127 /* set up the table of single character symbols */
128 char_symbols.put(new Integer(';'), new Integer(sym.SEMI));
129 char_symbols.put(new Integer(','), new Integer(sym.COMMA));
130 char_symbols.put(new Integer('*'), new Integer(sym.STAR));
131 char_symbols.put(new Integer('.'), new Integer(sym.DOT));
132 char_symbols.put(new Integer('|'), new Integer(sym.BAR));
133
134 /* read two characters of lookahead */
135 next_char = System.in.read();
136 if (next_char == EOF_CHAR)
137 next_char2 = EOF_CHAR;
138 else
139 next_char2 = System.in.read();
140 }
141
142 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
143
144 /** Advance the scanner one character in the input stream. This moves
145 * next_char2 to next_char and then reads a new next_char2.
146 */
147 protected static void advance() throws java.io.IOException
148 {
149 int old_char;
150
151 old_char = next_char;
152 next_char = next_char2;
153 if (next_char == EOF_CHAR)
154 next_char2 = EOF_CHAR;
155 else
156 next_char2 = System.in.read();
157
158 /* count this */
159 current_position++;
160 if (old_char == '\n')
161 {
162 current_line++;
163 current_position = 1;
164 }
165 }
166
167 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
168
169 /** Emit an error message. The message will be marked with both the
170 * current line number and the position in the line. Error messages
171 * are printed on standard error (System.err).
172 * @param message the message to print.
173 */
174 public static void emit_error(String message)
175 {
176 System.err.println("Error at " + current_line + "(" + current_position +
177 "): " + message);
178 error_count++;
179 }
180
181 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
182
183 /** Emit a warning message. The message will be marked with both the
184 * current line number and the position in the line. Messages are
185 * printed on standard error (System.err).
186 * @param message the message to print.
187 */
188 public static void emit_warn(String message)
189 {
190 System.err.println("Warning at " + current_line + "(" + current_position +
191 "): " + message);
192 warning_count++;
193 }
194
195 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
196
197 /** Determine if a character is ok to start an id.
198 * @param ch the character in question.
199 */
200 protected static boolean id_start_char(int ch)
201 {
202 return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
203 (ch == '_');
204
205 // later need to deal with non-8-bit chars here
206 }
207
208 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
209
210 /** Determine if a character is ok for the middle of an id.
211 * @param ch the character in question.
212 */
213 protected static boolean id_char(int ch)
214 {
215 return id_start_char(ch) || (ch >= '0' && ch <= '9');
216 }
217
218 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
219
220 /** Try to look up a single character symbol, returns -1 for not found.
221 * @param ch the character in question.
222 */
223 protected static int find_single_char(int ch)
224 {
225 Integer result;
226
227 result = (Integer)char_symbols.get(new Integer((char)ch));
228 if (result == null)
229 return -1;
230 else
231 return result.intValue();
232 }
233
234 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
235
236 /** Handle swallowing up a comment. Both old style C and new style C++
237 * comments are handled.
238 */
239 protected static void swallow_comment() throws java.io.IOException
240 {
241 /* next_char == '/' at this point */
242
243 /* is it a traditional comment */
244 if (next_char2 == '*')
245 {
246 /* swallow the opener */
247 advance(); advance();
248
249 /* swallow the comment until end of comment or EOF */
250 for (;;)
251 {
252 /* if its EOF we have an error */
253 if (next_char == EOF_CHAR)
254 {
255 emit_error("Specification file ends inside a comment");
256 return;
257 }
258
259 /* if we can see the closer we are done */
260 if (next_char == '*' && next_char2 == '/')
261 {
262 advance();
263 advance();
264 return;
265 }
266
267 /* otherwise swallow char and move on */
268 advance();
269 }
270 }
271
272 /* is its a new style comment */
273 if (next_char2 == '/')
274 {
275 /* swallow the opener */
276 advance(); advance();
277
278 /* swallow to '\n', '\f', or EOF */
279 while (next_char != '\n' && next_char != '\f' && next_char!=EOF_CHAR)
280 advance();
281
282 return;
283
284 }
285
286 /* shouldn't get here, but... if we get here we have an error */
287 emit_error("Malformed comment in specification -- ignored");
288 advance();
289 }
290
291 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
292
293 /** Swallow up a code string. Code strings begin with "{:" and include
294 all characters up to the first occurrence of ":}" (there is no way to
295 include ":}" inside a code string). The routine returns an str_token
296 object suitable for return by the scanner.
297 */
298 protected static token do_code_string() throws java.io.IOException
299 {
300 StringBuffer result = new StringBuffer();
301
302 /* at this point we have lookahead of "{:" -- swallow that */
303 advance(); advance();
304
305 /* save chars until we see ":}" */
306 while (!(next_char == ':' && next_char2 == '}'))
307 {
308 /* if we have run off the end issue a message and break out of loop */
309 if (next_char == EOF_CHAR)
310 {
311 emit_error("Specification file ends inside a code string");
312 break;
313 }
314
315 /* otherwise record the char and move on */
316 result.append(new Character((char)next_char));
317 advance();
318 }
319
320 /* advance past the closer and build a return token */
321 advance(); advance();
322 return new str_token(sym.CODE_STRING, result.toString());
323 }
324
325 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
326
327 /** Process an identifier. Identifiers begin with a letter, underscore,
328 * or dollar sign, which is followed by zero or more letters, numbers,
329 * underscores or dollar signs. This routine returns an str_token suitable
330 * for return by the scanner.
331 */
332 protected static token do_id() throws java.io.IOException
333 {
334 StringBuffer result = new StringBuffer();
335 String result_str;
336 Integer keyword_num;
337 char buffer[] = new char[1];
338
339 /* next_char holds first character of id */
340 buffer[0] = (char)next_char;
341 result.append(buffer,0,1);
342 advance();
343
344 /* collect up characters while they fit in id */
345 while(id_char(next_char))
346 {
347 buffer[0] = (char)next_char;
348 result.append(buffer,0,1);
349 advance();
350 }
351
352 /* extract a string and try to look it up as a keyword */
353 result_str = result.toString();
354 keyword_num = (Integer)keywords.get(result_str);
355
356 /* if we found something, return that keyword */
357 if (keyword_num != null)
358 return new token(keyword_num.intValue());
359
360 /* otherwise build and return an id token with an attached string */
361 return new str_token(sym.ID, result_str);
362 }
363
364 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
365
366 /** Return one token. This is the main external interface to the scanner.
367 * It consumes sufficient characters to determine the next input token
368 * and returns it. To help with debugging, this routine actually calls
369 * real_next_token() which does the work. If you need to debug the
370 * parser, this can be changed to call debug_next_token() which prints
371 * a debugging message before returning the token.
372 */
373 public static token next_token() throws java.io.IOException
374 {
375 return real_next_token();
376 }
377
378 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
379
380 /** Debugging version of next_token(). This routine calls the real scanning
381 * routine, prints a message on System.out indicating what the token is,
382 * then returns it.
383 */
384 public static token debug_next_token() throws java.io.IOException
385 {
386 token result = real_next_token();
387 System.out.println("# next_token() => " + result.sym);
388 return result;
389 }
390
391 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
392
393 /** The actual routine to return one token. This is normally called from
394 * next_token(), but for debugging purposes can be called indirectly from
395 * debug_next_token().
396 */
397 protected static token real_next_token() throws java.io.IOException
398 {
399 int sym_num;
400
401 for (;;)
402 {
403 /* look for white space */
404 if (next_char == ' ' || next_char == '\t' || next_char == '\n' ||
405 next_char == '\f' || next_char == '\r')
406 {
407 /* advance past it and try the next character */
408 advance();
409 continue;
410 }
411
412 /* look for a single character symbol */
413 sym_num = find_single_char(next_char);
414 if (sym_num != -1)
415 {
416 /* found one -- advance past it and return a token for it */
417 advance();
418 return new token(sym_num);
419 }
420
421 /* look for : or ::= */
422 if (next_char == ':')
423 {
424 /* if we don't have a second ':' return COLON */
425 if (next_char2 != ':')
426 {
427 advance();
428 return new token(sym.COLON);
429 }
430
431 /* move forward and look for the '=' */
432 advance();
433 if (next_char2 == '=')
434 {
435 advance(); advance();
436 return new token(sym.COLON_COLON_EQUALS);
437 }
438 else
439 {
440 /* return just the colon (already consumed) */
441 return new token(sym.COLON);
442 }
443 }
444
445 /* look for a comment */
446 if (next_char == '/' && (next_char2 == '*' || next_char2 == '/'))
447 {
448 /* swallow then continue the scan */
449 swallow_comment();
450 continue;
451 }
452
453 /* look for start of code string */
454 if (next_char == '{' && next_char2 == ':')
455 return do_code_string();
456
457 /* look for an id or keyword */
458 if (id_start_char(next_char)) return do_id();
459
460 /* look for EOF */
461 if (next_char == EOF_CHAR) return new token(sym.EOF);
462
463 /* if we get here, we have an unrecognized character */
464 emit_warn("Unrecognized character '" +
465 new Character((char)next_char) + "'(" + next_char +
466 ") -- ignored");
467
468 /* advance past it */
469 advance();
470 }
471 }
472
473 /*-----------------------------------------------------------*/
474};
475