#-----------------------------------------------------------------------------
# ply: lex.py
#
# Author: David M. Beazley (dave@dabeaz.com)
#
# Copyright (C) 2001-2006, David M. Beazley
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# See the file COPYING for a complete copy of the LGPL.
#-----------------------------------------------------------------------------

__version__ = "2.2"

import re, sys, types

from . import util


# Regular expression used to match valid token names
_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')

# Available instance types. This is used when lexers are defined by a class.
# In Python 3, InstanceType and ObjectType are no more; they've passed on,
# ceased to be, they are ex-classes, along with old-style classes.

try:
    _INSTANCETYPE = (types.InstanceType, types.ObjectType)
except AttributeError:
    _INSTANCETYPE = object

# Exception raised when an invalid token is encountered and no default error
# handler is defined.
class LexError(Exception):
    def __init__(self,message,s):
        self.args = (message,)
        self.text = s

# Token class
class LexToken(object):
    def __str__(self):
        return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos)
    def __repr__(self):
        return str(self)
    def skip(self,n):
        self.lexer.skip(n)

# -----------------------------------------------------------------------------
# Lexer class
#
# This class encapsulates all of the methods and data associated with a lexer.
#
#    input() - Store a new string in the lexer
#    token() - Get the next token
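#
# A minimal usage sketch (the token rules and input string are hypothetical;
# the lexer itself is normally constructed by the lex() function defined below):
#
#     lexer = lex.lex()          # Build lexer from the calling module's rules
#     lexer.input("3 + 4 * 10")
#     while 1:
#         tok = lexer.token()    # Returns None when the input is exhausted
#         if not tok: break
#         print(tok.type, tok.value, tok.lineno, tok.lexpos)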
# -----------------------------------------------------------------------------

class Lexer:
    def __init__(self):
        self.lexre = None             # Master regular expression. This is a list of
                                      # tuples (re,findex) where re is a compiled
                                      # regular expression and findex is a list
                                      # mapping regex group numbers to rules
        self.lexretext = None         # Current regular expression strings
        self.lexstatere = {}          # Dictionary mapping lexer states to master regexs
        self.lexstateretext = {}      # Dictionary mapping lexer states to regex strings
        self.lexstate = "INITIAL"     # Current lexer state
        self.lexstatestack = []       # Stack of lexer states
        self.lexstateinfo = None      # State information
        self.lexstateignore = {}      # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}      # Dictionary of error functions for each state
        self.lexreflags = 0           # Optional re compile flags
        self.lexdata = None           # Actual input data (as a string)
        self.lexpos = 0               # Current position in input text
        self.lexlen = 0               # Length of the input text
        self.lexerrorf = None         # Error rule (if any)
        self.lextokens = None         # List of valid tokens
        self.lexignore = ""           # Ignored characters
        self.lexliterals = ""         # Literal characters that can be passed through
        self.lexmodule = None         # Module
        self.lineno = 1               # Current line number
        self.lexdebug = 0             # Debugging mode
        self.lexoptimize = 0          # Optimized mode

    def clone(self,object=None):
        c = Lexer()
        c.lexstatere = self.lexstatere
        c.lexstateinfo = self.lexstateinfo
        c.lexstateretext = self.lexstateretext
        c.lexstate = self.lexstate
        c.lexstatestack = self.lexstatestack
        c.lexstateignore = self.lexstateignore
        c.lexstateerrorf = self.lexstateerrorf
        c.lexreflags = self.lexreflags
        c.lexdata = self.lexdata
        c.lexpos = self.lexpos
        c.lexlen = self.lexlen
        c.lextokens = self.lextokens
        c.lexdebug = self.lexdebug
        c.lineno = self.lineno
        c.lexoptimize = self.lexoptimize
        c.lexliterals = self.lexliterals
        c.lexmodule = self.lexmodule

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object. In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.

        if object:
            newtab = { }
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        if not f or not f[0]:
                            newfindex.append(f)
                            continue
                        newfindex.append((getattr(object,f[0].__name__),f[1]))
                    newre.append((cre,newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = { }
            for key, ef in self.lexstateerrorf.items():
                c.lexstateerrorf[key] = getattr(object,ef.__name__)
            c.lexmodule = object

        # Set up other attributes
        c.begin(c.lexstate)
        return c

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self,tabfile):
        tf = open(tabfile+".py","w")
        tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile,__version__))
        tf.write("_lextokens = %s\n" % repr(self.lextokens))
        tf.write("_lexreflags = %s\n" % repr(self.lexreflags))
        tf.write("_lexliterals = %s\n" % repr(self.lexliterals))
        tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))

        tabre = { }
        for key, lre in self.lexstatere.items():
            titem = []
            for i in range(len(lre)):
                titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[i][1])))
            tabre[key] = titem

        tf.write("_lexstatere = %s\n" % repr(tabre))
        tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))

        taberr = { }
        for key, ef in self.lexstateerrorf.items():
            if ef:
                taberr[key] = ef.__name__
            else:
                taberr[key] = None
        tf.write("_lexstateerrorf = %s\n" % repr(taberr))
        tf.close()

    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self,tabfile,fdict):
        exec("import %s" % tabfile)
        lextab = sys.modules[tabfile]   # exec() can't bind a function-local name in Python 3
        self.lextokens = lextab._lextokens
        self.lexreflags = lextab._lexreflags
        self.lexliterals = lextab._lexliterals
        self.lexstateinfo = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere = { }
        self.lexstateretext = { }
        for key,lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for i in range(len(lre)):
                titem.append((re.compile(lre[i][0],lextab._lexreflags),_names_to_funcs(lre[i][1],fdict)))
                txtitem.append(lre[i][0])
            self.lexstatere[key] = titem
            self.lexstateretext[key] = txtitem
        self.lexstateerrorf = { }
        for key,ef in lextab._lexstateerrorf.items():
            self.lexstateerrorf[key] = fdict[ef]
        self.begin('INITIAL')

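    # ------------------------------------------------------------
    # A sketch of how the two table-file methods above are driven
    # (an assumption based on lex() below; "mylextab" is a
    # hypothetical module name):
    #
    #     lexer = lex.lex(optimize=1,lextab="mylextab")
    #
    # The first run validates the rules and writes mylextab.py via
    # writetab(); later runs import it via readtab() and skip rule
    # validation entirely.
    # ------------------------------------------------------------
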
    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self,s):
        if not (isinstance(s,util.bytes_type) or isinstance(s, util.string_type)):
            raise ValueError("Expected a string")
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self,state):
        if state not in self.lexstatere:
            raise ValueError("Undefined state")
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state,"")
        self.lexerrorf = self.lexstateerrorf.get(state,None)
        self.lexstate = state

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self,state):
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        self.begin(self.lexstatestack.pop())

    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        return self.lexstate

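    # ------------------------------------------------------------
    # A sketch of how rules typically drive the state machinery
    # above (hypothetical rule and state names; the 'comment' state
    # must be declared in the module's 'states' list):
    #
    #     states = (('comment','exclusive'),)
    #
    #     def t_COMMENTSTART(t):
    #         r'/\*'
    #         t.lexer.push_state('comment')
    #
    #     def t_comment_END(t):
    #         r'\*/'
    #         t.lexer.pop_state()
    # ------------------------------------------------------------
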
    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self,n):
        self.lexpos += n

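    # ------------------------------------------------------------
    # The usual caller of skip() is a t_error rule that discards one
    # bad character and resumes scanning (a hypothetical sketch; see
    # the error handling in token() below):
    #
    #     def t_error(t):
    #         print("Illegal character '%s'" % t.value[0])
    #         t.lexer.skip(1)
    # ------------------------------------------------------------
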
    # ------------------------------------------------------------
    # token() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible. Don't make changes unless you really know what
    # you are doing
    # ------------------------------------------------------------
    def token(self):
        # Make local copies of frequently referenced attributes
        lexpos = self.lexpos
        lexlen = self.lexlen
        lexignore = self.lexignore
        lexdata = self.lexdata

        while lexpos < lexlen:
            # Short-circuit scanning of whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre,lexindexfunc in self.lexre:
                m = lexre.match(lexdata,lexpos)
                if not m: continue

                # Set last match in lexer so that rules can access it if they want
                self.lexmatch = m

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos
                tok.lexer = self

                lexpos = m.end()
                i = m.lastindex
                func,tok.type = lexindexfunc[i]
                self.lexpos = lexpos

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type: return tok
                    break

                # If func is not callable, it's an ignored token defined as a string
                if not callable(func):
                    break

                # If token is processed by a function, call it
                newtok = func(tok)

                # Every rule function must return a token; if it returns nothing,
                # we just move on to the next token
                if not newtok:
                    lexpos = self.lexpos        # This is here in case user has updated lexpos.
                    break

                # Verify type of the token. If not in the token map, raise an error
                if not self.lexoptimize:
                    if newtok.type not in self.lextokens:
                        raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func.__code__.co_filename, func.__code__.co_firstlineno,
                            func.__name__, newtok.type),lexdata[lexpos:])

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.lexer = self
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = "error"
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok: continue
                    return newtok

                self.lexpos = lexpos
                raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:])

        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError("No input string given with input()")
        return None

# -----------------------------------------------------------------------------
# _validate_file()
#
# This checks to see if there are duplicate t_rulename() functions or strings
# in the lexer input file. This is done using a simple regular expression
# match on each line in the file.
# -----------------------------------------------------------------------------

def _validate_file(filename):
    import os.path
    base,ext = os.path.splitext(filename)
    if ext != '.py': return 1          # No idea what the file is. Return OK

    try:
        f = open(filename)
        lines = f.readlines()
        f.close()
    except IOError:
        return 1                       # Oh well

    fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
    sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')
    counthash = { }
    linen = 1
    noerror = 1
    for l in lines:
        m = fre.match(l)
        if not m:
            m = sre.match(l)
        if m:
            name = m.group(1)
            prev = counthash.get(name)
            if not prev:
                counthash[name] = linen
            else:
                print("%s:%d: Rule %s redefined. Previously defined on line %d" % (filename,linen,name,prev))
                noerror = 0
        linen += 1
    return noerror

# -----------------------------------------------------------------------------
# _funcs_to_names()
#
# Given a list of regular expression functions, this converts it to a list
# suitable for output to a table file
# -----------------------------------------------------------------------------

def _funcs_to_names(funclist):
    result = []
    for f in funclist:
        if f and f[0]:
            result.append((f[0].__name__,f[1]))
        else:
            result.append(f)
    return result

# -----------------------------------------------------------------------------
# _names_to_funcs()
#
# Given a list of regular expression function names, this converts it back to
# functions.
# -----------------------------------------------------------------------------

def _names_to_funcs(namelist,fdict):
    result = []
    for n in namelist:
        if n and n[0]:
            result.append((fdict[n[0]],n[1]))
        else:
            result.append(n)
    return result

# -----------------------------------------------------------------------------
# _form_master_re()
#
# This function takes a list of all of the regex components and attempts to
# form the master regular expression. Given limitations in the Python re
# module, it may be necessary to break the master regex into separate expressions.
# -----------------------------------------------------------------------------

def _form_master_re(relist,reflags,ldict):
    if not relist: return []
    regex = "|".join(relist)
    try:
        lexre = re.compile(regex,re.VERBOSE | reflags)

        # Build the index to function map for the matching engine
        lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1)
        for f,i in lexre.groupindex.items():
            handle = ldict.get(f,None)
            if type(handle) in (types.FunctionType, types.MethodType):
                lexindexfunc[i] = (handle,handle.__name__[2:])
            elif handle is not None:
                # Rule was specified as a string; there is no action function,
                # so just record the token name (None for ignored tokens)
                if f.find("ignore_") > 0:
                    lexindexfunc[i] = (None,None)
                else:
                    lexindexfunc[i] = (None, f[2:])

        return [(lexre,lexindexfunc)],[regex]
    except Exception as e:
        m = int(len(relist)/2)
        if m == 0: m = 1
        llist, lre = _form_master_re(relist[:m],reflags,ldict)
        rlist, rre = _form_master_re(relist[m:],reflags,ldict)
        return llist+rlist, lre+rre

# -----------------------------------------------------------------------------
# def _statetoken(s,names)
#
# Given a declaration name s of the form "t_" and a dictionary whose keys are
# state names, this function returns a tuple (states,tokenname) where states
# is a tuple of state names and tokenname is the name of the token. For example,
# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
# -----------------------------------------------------------------------------

def _statetoken(s,names):
    nonstate = 1
    parts = s.split("_")
    for i in range(1,len(parts)):
        if parts[i] not in names and parts[i] != 'ANY': break
    if i > 1:
        states = tuple(parts[1:i])
    else:
        states = ('INITIAL',)

    if 'ANY' in states:
        states = tuple(names.keys())

    tokenname = "_".join(parts[i:])
    return (states,tokenname)

# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
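#
# A minimal sketch of the module-level definitions this function consumes
# (the token names, rules, and import path are hypothetical; adjust the
# import to wherever this package is vendored):
#
#     import ply.lex as lex
#
#     tokens = ('NUMBER','PLUS')
#
#     t_PLUS = r'\+'
#     t_ignore = ' \t'
#
#     def t_NUMBER(t):
#         r'\d+'
#         t.value = int(t.value)
#         return t
#
#     def t_error(t):
#         t.lexer.skip(1)
#
#     lexer = lex.lex()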
# -----------------------------------------------------------------------------
def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0):
    global lexer
    ldict = None
    stateinfo = { 'INITIAL' : 'inclusive'}
    error = 0
    files = { }
    lexobj = Lexer()
    lexobj.lexdebug = debug
    lexobj.lexoptimize = optimize
    global token,input

    if nowarn: warn = 0
    else: warn = 1

    if object: module = object

    if module:
        # User supplied a module object.
        if isinstance(module, types.ModuleType):
            ldict = module.__dict__
        elif isinstance(module, _INSTANCETYPE):
            _items = [(k,getattr(module,k)) for k in dir(module)]
            ldict = { }
            for (i,v) in _items:
                ldict[i] = v
        else:
            raise ValueError("Expected a module or instance")
        lexobj.lexmodule = module

    else:
        # No module given. We might be able to get information from the caller.
        try:
            raise RuntimeError
        except RuntimeError:
            e,b,t = sys.exc_info()
            f = t.tb_frame
            f = f.f_back                    # Walk out to our calling function
            ldict = f.f_globals             # Grab its globals dictionary

    if optimize and lextab:
        try:
            lexobj.readtab(lextab,ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

        except ImportError:
            pass

    # Get the tokens, states, and literals variables (if any)
    if (module and isinstance(module,_INSTANCETYPE)):
        tokens = getattr(module,"tokens",None)
        states = getattr(module,"states",None)
        literals = getattr(module,"literals","")
    else:
        tokens = ldict.get("tokens",None)
        states = ldict.get("states",None)
        literals = ldict.get("literals","")

    if not tokens:
        raise SyntaxError("lex: module does not define 'tokens'")
    if not (isinstance(tokens,list) or isinstance(tokens,tuple)):
        raise SyntaxError("lex: tokens must be a list or tuple.")

    # Build a dictionary of valid token names
    lexobj.lextokens = { }
    if not optimize:
        for n in tokens:
            if not _is_identifier.match(n):
                print("lex: Bad token name '%s'" % n)
                error = 1
            if warn and n in lexobj.lextokens:
                print("lex: Warning. Token '%s' multiply defined." % n)
            lexobj.lextokens[n] = None
    else:
        for n in tokens: lexobj.lextokens[n] = None

    if debug:
        print("lex: tokens = '%s'" % list(lexobj.lextokens.keys()))

    try:
        for c in literals:
            if not (isinstance(c,util.bytes_type) or isinstance(c, util.string_type)) or len(c) > 1:
                print("lex: Invalid literal %s. Must be a single character" % repr(c))
                error = 1
                continue

    except TypeError:
        print("lex: Invalid literals specification. literals must be a sequence of characters.")
        error = 1

    lexobj.lexliterals = literals

    # Build statemap
    if states:
        if not (isinstance(states,tuple) or isinstance(states,list)):
            print("lex: states must be defined as a tuple or list.")
            error = 1
        else:
            for s in states:
                if not isinstance(s,tuple) or len(s) != 2:
                    print("lex: invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')" % repr(s))
                    error = 1
                    continue
                name, statetype = s
                if isinstance(name, util.string_type):
                    original_name = name
                    name = util.encode_input(name)
                if not isinstance(name,util.bytes_type) or len(original_name) != len(name):
                    print("lex: state name %s must be a byte string" % repr(original_name))
                    error = 1
                    continue
                if not (statetype == 'inclusive' or statetype == 'exclusive'):
                    print("lex: state type for state %s must be 'inclusive' or 'exclusive'" % name)
                    error = 1
                    continue
                if name in stateinfo:
                    print("lex: state '%s' already defined." % name)
                    error = 1
                    continue
                stateinfo[name] = statetype

    # Get a list of symbols with the t_ prefix
    tsymbols = [f for f in ldict.keys() if f[:2] == 't_' ]

    # Now build up a list of functions and a list of strings

    funcsym = { }        # Symbols defined as functions
    strsym = { }         # Symbols defined as strings
    toknames = { }       # Mapping of symbols to token names

    for s in stateinfo.keys():
        funcsym[s] = []
        strsym[s] = []

    ignore = { }         # Ignore strings by state
    errorf = { }         # Error functions by state

    if len(tsymbols) == 0:
        raise SyntaxError("lex: no rules of the form t_rulename are defined.")

    for f in tsymbols:
        t = ldict[f]
        states, tokname = _statetoken(f,stateinfo)
        toknames[f] = tokname

        if callable(t):
            for s in states: funcsym[s].append((f,t))
        elif (isinstance(t, util.bytes_type) or isinstance(t,util.string_type)):
            for s in states: strsym[s].append((f,t))
        else:
            print("lex: %s not defined as a function or string" % f)
            error = 1

    # Sort the functions by line number
    for f in funcsym.values():
        f.sort(key=lambda x: x[1].__code__.co_firstlineno)

    # Sort the strings by regular expression length
    for s in strsym.values():
        s.sort(key=lambda x: len(x[1]))

    regexs = { }

    # Build the master regular expressions
    for state in stateinfo.keys():
        regex_list = []

        # Add rules defined by functions first
        for fname, f in funcsym[state]:
            line = f.__code__.co_firstlineno
            file = f.__code__.co_filename
            files[file] = None
            tokname = toknames[fname]

            ismethod = isinstance(f, types.MethodType)

            if not optimize:
                nargs = f.__code__.co_argcount
                if ismethod:
                    reqargs = 2
                else:
                    reqargs = 1
                if nargs > reqargs:
                    print("%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__))
                    error = 1
                    continue

                if nargs < reqargs:
                    print("%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__))
                    error = 1
                    continue

                if tokname == 'ignore':
                    print("%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__))
                    error = 1
                    continue

            if tokname == 'error':
                errorf[state] = f
                continue

            if f.__doc__:
                if not optimize:
                    try:
                        c = re.compile("(?P<%s>%s)" % (f.__name__,f.__doc__), re.VERBOSE | reflags)
                        if c.match(""):
                            print("%s:%d: Regular expression for rule '%s' matches empty string." % (file,line,f.__name__))
                            error = 1
                            continue
                    except re.error as e:
                        print("%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e))
                        if '#' in f.__doc__:
                            print("%s:%d: Make sure '#' in rule '%s' is escaped with '\\#'." % (file,line, f.__name__))
                        error = 1
                        continue

                if debug:
                    print("lex: Adding rule %s -> '%s' (state '%s')" % (f.__name__,f.__doc__, state))

                # Okay. The regular expression seemed okay. Let's append it to the master regular
                # expression we're building

                regex_list.append("(?P<%s>%s)" % (f.__name__,f.__doc__))
            else:
                print("%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__))

        # Now add all of the simple rules
        for name,r in strsym[state]:
            tokname = toknames[name]

            if tokname == 'ignore':
                ignore[state] = r
                continue

            if not optimize:
                if tokname == 'error':
                    raise SyntaxError("lex: Rule '%s' must be defined as a function" % name)

                if tokname not in lexobj.lextokens and tokname.find("ignore_") < 0:
                    print("lex: Rule '%s' defined for an unspecified token %s." % (name,tokname))
                    error = 1
                    continue
                try:
                    c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | reflags)
                    if (c.match("")):
                        print("lex: Regular expression for rule '%s' matches empty string." % name)
                        error = 1
                        continue
                except re.error as e:
                    print("lex: Invalid regular expression for rule '%s'. %s" % (name,e))
                    if '#' in r:
                        print("lex: Make sure '#' in rule '%s' is escaped with '\\#'." % name)

                    error = 1
                    continue
                if debug:
                    print("lex: Adding rule %s -> '%s' (state '%s')" % (name,r,state))

            regex_list.append("(?P<%s>%s)" % (name,r))

        if not regex_list:
            print("lex: No rules defined for state '%s'" % state)
            error = 1

        regexs[state] = regex_list


    if not optimize:
        for f in files.keys():
            if not _validate_file(f):
                error = 1

    if error:
        raise SyntaxError("lex: Unable to build lexer.")

    # From this point forward, we're reasonably confident that we can build the lexer.
    # No more errors will be generated, but there might be some warning messages.

    # Build the master regular expressions

    for state in regexs.keys():
        lexre, re_text = _form_master_re(regexs[state],reflags,ldict)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        if debug:
            for i in range(len(re_text)):
                print("lex: state '%s'. regex[%d] = '%s'" % (state, i, re_text[i]))

    # For inclusive states, we need to add the INITIAL state
    for state,type in stateinfo.items():
        if state != "INITIAL" and type == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere["INITIAL"]
    lexobj.lexretext = lexobj.lexstateretext["INITIAL"]

    # Set up ignore variables
    lexobj.lexstateignore = ignore
    lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","")

    # Set up error functions
    lexobj.lexstateerrorf = errorf
    lexobj.lexerrorf = errorf.get("INITIAL",None)
    if warn and not lexobj.lexerrorf:
        print("lex: Warning. no t_error rule is defined.")

    # Check state information for ignore and error rules
    for s,stype in stateinfo.items():
        if stype == 'exclusive':
            if warn and s not in errorf:
                print("lex: Warning. no error rule is defined for exclusive state '%s'" % s)
            if warn and s not in ignore and lexobj.lexignore:
                print("lex: Warning. no ignore rule is defined for exclusive state '%s'" % s)
        elif stype == 'inclusive':
            if s not in errorf:
                errorf[s] = errorf.get("INITIAL",None)
            if s not in ignore:
                ignore[s] = ignore.get("INITIAL","")


    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        lexobj.writetab(lextab)

    return lexobj

# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------

def runmain(lexer=None,data=None):
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            print("Reading from standard input (type EOF to end):")
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while 1:
        tok = _token()
        if not tok: break
        print("(%s,%r,%d,%d)" % (tok.type, tok.value, tok.lineno,tok.lexpos))

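# A typical invocation sketch (hypothetical): placed at the bottom of a
# module that defines token rules, so the file can be run directly on an
# input file named in sys.argv[1]:
#
#     if __name__ == '__main__':
#         lex.lex()
#         lex.runmain()
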

# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator can be used to attach a regular expression to a rule function
# when the pattern needs to be built programmatically rather than written as a
# literal docstring.
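#
# A minimal sketch (hypothetical pattern and rule name):
#
#     identifier = r'[a-zA-Z_][a-zA-Z_0-9]*'
#
#     @TOKEN(identifier)
#     def t_ID(t):
#         return t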
# -----------------------------------------------------------------------------

def TOKEN(r):
    def set_doc(f):
        f.__doc__ = r
        return f
    return set_doc

# Alternative spelling of the TOKEN decorator
Token = TOKEN