Jeremy Hylton | a505812 | 2000-02-14 14:14:29 +0000 | [diff] [blame] | 1 | """Assembler for Python bytecode |
| 2 | |
| 3 | The new module is used to create the code object. The following |
| 4 | attribute definitions are included from the reference manual: |
| 5 | |
| 6 | co_name gives the function name |
| 7 | co_argcount is the number of positional arguments (including |
| 8 | arguments with default values) |
| 9 | co_nlocals is the number of local variables used by the function |
| 10 | (including arguments) |
| 11 | co_varnames is a tuple containing the names of the local variables |
| 12 | (starting with the argument names) |
| 13 | co_code is a string representing the sequence of bytecode instructions |
| 14 | co_consts is a tuple containing the literals used by the bytecode |
| 15 | co_names is a tuple containing the names used by the bytecode |
| 16 | co_filename is the filename from which the code was compiled |
| 17 | co_firstlineno is the first line number of the function |
| 18 | co_lnotab is a string encoding the mapping from byte code offsets |
| 19 | to line numbers. see LineAddrTable below. |
| 20 | co_stacksize is the required stack size (including local variables) |
| 21 | co_flags is an integer encoding a number of flags for the |
| 22 | interpreter. There are four flags: |
| 23 | CO_OPTIMIZED -- uses load fast |
| 24 | CO_NEWLOCALS -- everything? |
| 25 | CO_VARARGS -- use *args |
| 26 | CO_VARKEYWORDS -- uses **args |
| 27 | |
| 28 | If a code object represents a function, the first item in co_consts is |
| 29 | the documentation string of the function, or None if undefined. |
| 30 | """ |
| 31 | |
| 32 | import sys |
| 33 | import dis |
| 34 | import new |
| 35 | import string |
| 36 | |
| 37 | import misc |
| 38 | |
| 39 | # flags for code objects |
| 40 | CO_OPTIMIZED = 0x0001 |
| 41 | CO_NEWLOCALS = 0x0002 |
| 42 | CO_VARARGS = 0x0004 |
| 43 | CO_VARKEYWORDS = 0x0008 |
| 44 | |
| 45 | class PyAssembler: |
| 46 | """Creates Python code objects |
| 47 | """ |
| 48 | |
| 49 | # XXX this class needs to major refactoring |
| 50 | |
| 51 | def __init__(self, args=(), name='?', filename='<?>', |
| 52 | docstring=None): |
| 53 | # XXX why is the default value for flags 3? |
| 54 | self.insts = [] |
| 55 | # used by makeCodeObject |
| 56 | self.argcount = len(args) |
| 57 | self.code = '' |
| 58 | self.consts = [docstring] |
| 59 | self.filename = filename |
| 60 | self.flags = CO_NEWLOCALS |
| 61 | self.name = name |
| 62 | self.names = [] |
| 63 | self.varnames = list(args) or [] |
| 64 | # lnotab support |
| 65 | self.firstlineno = 0 |
| 66 | self.lastlineno = 0 |
| 67 | self.last_addr = 0 |
| 68 | self.lnotab = '' |
| 69 | |
| 70 | def __repr__(self): |
| 71 | return "<bytecode: %d instrs>" % len(self.insts) |
| 72 | |
| 73 | def setFlags(self, val): |
| 74 | """XXX for module's function""" |
| 75 | self.flags = val |
| 76 | |
| 77 | def setOptimized(self): |
| 78 | self.flags = self.flags | CO_OPTIMIZED |
| 79 | |
| 80 | def setVarArgs(self): |
| 81 | self.flags = self.flags | CO_VARARGS |
| 82 | |
| 83 | def setKWArgs(self): |
| 84 | self.flags = self.flags | CO_VARKEYWORDS |
| 85 | |
| 86 | def getCurInst(self): |
| 87 | return len(self.insts) |
| 88 | |
| 89 | def getNextInst(self): |
| 90 | return len(self.insts) + 1 |
| 91 | |
| 92 | def dump(self, io=sys.stdout): |
| 93 | i = 0 |
| 94 | for inst in self.insts: |
| 95 | if inst[0] == 'SET_LINENO': |
| 96 | io.write("\n") |
| 97 | io.write(" %3d " % i) |
| 98 | if len(inst) == 1: |
| 99 | io.write("%s\n" % inst) |
| 100 | else: |
| 101 | io.write("%-15.15s\t%s\n" % inst) |
| 102 | i = i + 1 |
| 103 | |
| 104 | def makeCodeObject(self): |
| 105 | """Make a Python code object |
| 106 | |
| 107 | This creates a Python code object using the new module. This |
| 108 | seems simpler than reverse-engineering the way marshal dumps |
| 109 | code objects into .pyc files. One of the key difficulties is |
| 110 | figuring out how to layout references to code objects that |
| 111 | appear on the VM stack; e.g. |
| 112 | 3 SET_LINENO 1 |
| 113 | 6 LOAD_CONST 0 (<code object fact at 8115878 [...] |
| 114 | 9 MAKE_FUNCTION 0 |
| 115 | 12 STORE_NAME 0 (fact) |
| 116 | """ |
| 117 | |
| 118 | self._findOffsets() |
| 119 | lnotab = LineAddrTable() |
| 120 | for t in self.insts: |
| 121 | opname = t[0] |
| 122 | if len(t) == 1: |
| 123 | lnotab.addCode(chr(self.opnum[opname])) |
| 124 | elif len(t) == 2: |
| 125 | oparg = self._convertArg(opname, t[1]) |
| 126 | if opname == 'SET_LINENO': |
| 127 | lnotab.nextLine(oparg) |
| 128 | try: |
| 129 | hi, lo = divmod(oparg, 256) |
| 130 | except TypeError: |
| 131 | raise TypeError, "untranslated arg: %s, %s" % (opname, oparg) |
| 132 | lnotab.addCode(chr(self.opnum[opname]) + chr(lo) + |
| 133 | chr(hi)) |
| 134 | # why is a module a special case? |
| 135 | if self.flags == 0: |
| 136 | nlocals = 0 |
| 137 | else: |
| 138 | nlocals = len(self.varnames) |
| 139 | # XXX danger! can't pass through here twice |
| 140 | if self.flags & CO_VARKEYWORDS: |
| 141 | self.argcount = self.argcount - 1 |
| 142 | stacksize = findDepth(self.insts) |
Jeremy Hylton | 410e840 | 2000-02-15 21:59:50 +0000 | [diff] [blame^] | 143 | try: |
| 144 | co = new.code(self.argcount, nlocals, stacksize, |
| 145 | self.flags, lnotab.getCode(), self._getConsts(), |
| 146 | tuple(self.names), tuple(self.varnames), |
| 147 | self.filename, self.name, self.firstlineno, |
| 148 | lnotab.getTable()) |
| 149 | except SystemError, err: |
| 150 | print err |
| 151 | print repr(self.argcount) |
| 152 | print repr(nlocals) |
| 153 | print repr(stacksize) |
| 154 | print repr(self.flags) |
| 155 | print repr(lnotab.getCode()) |
| 156 | print repr(self._getConsts()) |
| 157 | print repr(self.names) |
| 158 | print repr(self.varnames) |
| 159 | print repr(self.filename) |
| 160 | print repr(self.name) |
| 161 | print repr(self.firstlineno) |
| 162 | print repr(lnotab.getTable()) |
| 163 | raise |
Jeremy Hylton | a505812 | 2000-02-14 14:14:29 +0000 | [diff] [blame] | 164 | return co |
| 165 | |
| 166 | def _getConsts(self): |
| 167 | """Return a tuple for the const slot of a code object |
| 168 | |
| 169 | Converts PythonVMCode objects to code objects |
| 170 | """ |
| 171 | l = [] |
| 172 | for elt in self.consts: |
| 173 | # XXX might be clearer to just as isinstance(CodeGen) |
| 174 | if hasattr(elt, 'asConst'): |
| 175 | l.append(elt.asConst()) |
| 176 | else: |
| 177 | l.append(elt) |
| 178 | return tuple(l) |
| 179 | |
| 180 | def _findOffsets(self): |
| 181 | """Find offsets for use in resolving StackRefs""" |
| 182 | self.offsets = [] |
| 183 | cur = 0 |
| 184 | for t in self.insts: |
| 185 | self.offsets.append(cur) |
| 186 | l = len(t) |
| 187 | if l == 1: |
| 188 | cur = cur + 1 |
| 189 | elif l == 2: |
| 190 | cur = cur + 3 |
| 191 | arg = t[1] |
| 192 | # XXX this is a total hack: for a reference used |
| 193 | # multiple times, we create a list of offsets and |
| 194 | # expect that we when we pass through the code again |
| 195 | # to actually generate the offsets, we'll pass in the |
| 196 | # same order. |
| 197 | if isinstance(arg, StackRef): |
| 198 | try: |
| 199 | arg.__offset.append(cur) |
| 200 | except AttributeError: |
| 201 | arg.__offset = [cur] |
| 202 | |
| 203 | def _convertArg(self, op, arg): |
| 204 | """Convert the string representation of an arg to a number |
| 205 | |
| 206 | The specific handling depends on the opcode. |
| 207 | |
| 208 | XXX This first implementation isn't going to be very |
| 209 | efficient. |
| 210 | """ |
| 211 | if op == 'SET_LINENO': |
| 212 | return arg |
| 213 | if op == 'LOAD_CONST': |
| 214 | return self._lookupName(arg, self.consts) |
| 215 | if op in self.localOps: |
| 216 | # make sure it's in self.names, but use the bytecode offset |
| 217 | self._lookupName(arg, self.names) |
| 218 | return self._lookupName(arg, self.varnames) |
| 219 | if op in self.globalOps: |
| 220 | return self._lookupName(arg, self.names) |
| 221 | if op in self.nameOps: |
| 222 | return self._lookupName(arg, self.names) |
| 223 | if op == 'COMPARE_OP': |
| 224 | return self.cmp_op.index(arg) |
| 225 | if self.hasjrel.has_elt(op): |
| 226 | offset = arg.__offset[0] |
| 227 | del arg.__offset[0] |
| 228 | return self.offsets[arg.resolve()] - offset |
| 229 | if self.hasjabs.has_elt(op): |
| 230 | return self.offsets[arg.resolve()] |
| 231 | return arg |
| 232 | |
| 233 | nameOps = ('STORE_NAME', 'IMPORT_NAME', 'IMPORT_FROM', |
| 234 | 'STORE_ATTR', 'LOAD_ATTR', 'LOAD_NAME', 'DELETE_NAME') |
| 235 | localOps = ('LOAD_FAST', 'STORE_FAST', 'DELETE_FAST') |
| 236 | globalOps = ('LOAD_GLOBAL', 'STORE_GLOBAL', 'DELETE_GLOBAL') |
| 237 | |
| 238 | def _lookupName(self, name, list, list2=None): |
| 239 | """Return index of name in list, appending if necessary |
| 240 | |
| 241 | Yicky hack: Second list can be used for lookup of local names |
| 242 | where the name needs to be added to varnames and names. |
| 243 | """ |
| 244 | if name in list: |
| 245 | return list.index(name) |
| 246 | else: |
| 247 | end = len(list) |
| 248 | list.append(name) |
| 249 | if list2 is not None: |
| 250 | list2.append(name) |
| 251 | return end |
| 252 | |
| 253 | # Convert some stuff from the dis module for local use |
| 254 | |
| 255 | cmp_op = list(dis.cmp_op) |
| 256 | hasjrel = misc.Set() |
| 257 | for i in dis.hasjrel: |
| 258 | hasjrel.add(dis.opname[i]) |
| 259 | hasjabs = misc.Set() |
| 260 | for i in dis.hasjabs: |
| 261 | hasjabs.add(dis.opname[i]) |
| 262 | |
| 263 | opnum = {} |
| 264 | for num in range(len(dis.opname)): |
| 265 | opnum[dis.opname[num]] = num |
| 266 | |
| 267 | # this version of emit + arbitrary hooks might work, but it's damn |
| 268 | # messy. |
| 269 | |
| 270 | def emit(self, *args): |
| 271 | self._emitDispatch(args[0], args[1:]) |
| 272 | self.insts.append(args) |
| 273 | |
| 274 | def _emitDispatch(self, type, args): |
| 275 | for func in self._emit_hooks.get(type, []): |
| 276 | func(self, args) |
| 277 | |
| 278 | _emit_hooks = {} |
| 279 | |
| 280 | class LineAddrTable: |
| 281 | """lnotab |
| 282 | |
| 283 | This class builds the lnotab, which is undocumented but described |
| 284 | by com_set_lineno in compile.c. Here's an attempt at explanation: |
| 285 | |
| 286 | For each SET_LINENO instruction after the first one, two bytes are |
| 287 | added to lnotab. (In some cases, multiple two-byte entries are |
| 288 | added.) The first byte is the distance in bytes between the |
| 289 | instruction for the last SET_LINENO and the current SET_LINENO. |
| 290 | The second byte is offset in line numbers. If either offset is |
| 291 | greater than 255, multiple two-byte entries are added -- one entry |
| 292 | for each factor of 255. |
| 293 | """ |
| 294 | |
| 295 | def __init__(self): |
| 296 | self.code = [] |
| 297 | self.codeOffset = 0 |
| 298 | self.firstline = 0 |
| 299 | self.lastline = 0 |
| 300 | self.lastoff = 0 |
| 301 | self.lnotab = [] |
| 302 | |
| 303 | def addCode(self, code): |
| 304 | self.code.append(code) |
| 305 | self.codeOffset = self.codeOffset + len(code) |
| 306 | |
| 307 | def nextLine(self, lineno): |
| 308 | if self.firstline == 0: |
| 309 | self.firstline = lineno |
| 310 | self.lastline = lineno |
| 311 | else: |
| 312 | # compute deltas |
| 313 | addr = self.codeOffset - self.lastoff |
| 314 | line = lineno - self.lastline |
| 315 | while addr > 0 or line > 0: |
| 316 | # write the values in 1-byte chunks that sum |
| 317 | # to desired value |
| 318 | trunc_addr = addr |
| 319 | trunc_line = line |
| 320 | if trunc_addr > 255: |
| 321 | trunc_addr = 255 |
| 322 | if trunc_line > 255: |
| 323 | trunc_line = 255 |
| 324 | self.lnotab.append(trunc_addr) |
| 325 | self.lnotab.append(trunc_line) |
| 326 | addr = addr - trunc_addr |
| 327 | line = line - trunc_line |
| 328 | self.lastline = lineno |
| 329 | self.lastoff = self.codeOffset |
| 330 | |
| 331 | def getCode(self): |
| 332 | return string.join(self.code, '') |
| 333 | |
| 334 | def getTable(self): |
| 335 | return string.join(map(chr, self.lnotab), '') |
| 336 | |
| 337 | class StackRef: |
| 338 | """Manage stack locations for jumps, loops, etc.""" |
| 339 | count = 0 |
| 340 | |
| 341 | def __init__(self, id=None, val=None): |
| 342 | if id is None: |
| 343 | id = StackRef.count |
| 344 | StackRef.count = StackRef.count + 1 |
| 345 | self.id = id |
| 346 | self.val = val |
| 347 | |
| 348 | def __repr__(self): |
| 349 | if self.val: |
| 350 | return "StackRef(val=%d)" % self.val |
| 351 | else: |
| 352 | return "StackRef(id=%d)" % self.id |
| 353 | |
| 354 | def bind(self, inst): |
| 355 | self.val = inst |
| 356 | |
| 357 | def resolve(self): |
| 358 | if self.val is None: |
| 359 | print "UNRESOLVE REF", self |
| 360 | return 0 |
| 361 | return self.val |
| 362 | |
| 363 | class StackDepthTracker: |
| 364 | # XXX need to keep track of stack depth on jumps |
| 365 | |
| 366 | def findDepth(self, insts): |
| 367 | depth = 0 |
| 368 | maxDepth = 0 |
| 369 | for i in insts: |
| 370 | opname = i[0] |
| 371 | delta = self.effect.get(opname, 0) |
| 372 | if delta > 1: |
| 373 | depth = depth + delta |
| 374 | elif delta < 0: |
| 375 | if depth > maxDepth: |
| 376 | maxDepth = depth |
| 377 | depth = depth + delta |
| 378 | else: |
| 379 | if depth > maxDepth: |
| 380 | maxDepth = depth |
| 381 | # now check patterns |
| 382 | for pat, delta in self.patterns: |
| 383 | if opname[:len(pat)] == pat: |
| 384 | depth = depth + delta |
| 385 | break |
| 386 | # if we still haven't found a match |
| 387 | if delta == 0: |
| 388 | meth = getattr(self, opname) |
| 389 | depth = depth + meth(i[1]) |
| 390 | if depth < 0: |
| 391 | depth = 0 |
| 392 | return maxDepth |
| 393 | |
| 394 | effect = { |
| 395 | 'POP_TOP': -1, |
| 396 | 'DUP_TOP': 1, |
| 397 | 'SLICE+1': -1, |
| 398 | 'SLICE+2': -1, |
| 399 | 'SLICE+3': -2, |
| 400 | 'STORE_SLICE+0': -1, |
| 401 | 'STORE_SLICE+1': -2, |
| 402 | 'STORE_SLICE+2': -2, |
| 403 | 'STORE_SLICE+3': -3, |
| 404 | 'DELETE_SLICE+0': -1, |
| 405 | 'DELETE_SLICE+1': -2, |
| 406 | 'DELETE_SLICE+2': -2, |
| 407 | 'DELETE_SLICE+3': -3, |
| 408 | 'STORE_SUBSCR': -3, |
| 409 | 'DELETE_SUBSCR': -2, |
| 410 | # PRINT_EXPR? |
| 411 | 'PRINT_ITEM': -1, |
| 412 | 'LOAD_LOCALS': 1, |
| 413 | 'RETURN_VALUE': -1, |
| 414 | 'EXEC_STMT': -2, |
| 415 | 'BUILD_CLASS': -2, |
| 416 | 'STORE_NAME': -1, |
| 417 | 'STORE_ATTR': -2, |
| 418 | 'DELETE_ATTR': -1, |
| 419 | 'STORE_GLOBAL': -1, |
| 420 | 'BUILD_MAP': 1, |
| 421 | 'COMPARE_OP': -1, |
| 422 | 'STORE_FAST': -1, |
| 423 | } |
| 424 | # use pattern match |
| 425 | patterns = [ |
| 426 | ('BINARY_', -1), |
| 427 | ('LOAD_', 1), |
| 428 | ('IMPORT_', 1), |
| 429 | ] |
| 430 | # special cases |
| 431 | |
| 432 | #: UNPACK_TUPLE, UNPACK_LIST, BUILD_TUPLE, |
| 433 | # BUILD_LIST, CALL_FUNCTION, MAKE_FUNCTION, BUILD_SLICE |
| 434 | def UNPACK_TUPLE(self, count): |
| 435 | return count |
| 436 | def UNPACK_LIST(self, count): |
| 437 | return count |
| 438 | def BUILD_TUPLE(self, count): |
| 439 | return -count |
| 440 | def BUILD_LIST(self, count): |
| 441 | return -count |
| 442 | def CALL_FUNCTION(self, argc): |
| 443 | hi, lo = divmod(argc, 256) |
| 444 | return lo + hi * 2 |
| 445 | def MAKE_FUNCTION(self, argc): |
| 446 | return -argc |
| 447 | def BUILD_SLICE(self, argc): |
| 448 | if argc == 2: |
| 449 | return -1 |
| 450 | elif argc == 3: |
| 451 | return -2 |
| 452 | |
| 453 | findDepth = StackDepthTracker().findDepth |