"""Code parsing for Coverage."""
| 2 | |
| 3 | import glob, opcode, os, re, sys, token, tokenize |
| 4 | |
| 5 | from coverage.backward import set, sorted, StringIO # pylint: disable=W0622 |
| 6 | from coverage.backward import open_source |
| 7 | from coverage.bytecode import ByteCodes, CodeObjects |
| 8 | from coverage.misc import nice_pair, expensive, join_regex |
| 9 | from coverage.misc import CoverageException, NoSource, NotPython |
| 10 | |
| 11 | |
class CodeParser(object):
    """Parse code to find executable lines, excluded lines, etc."""

    def __init__(self, text=None, filename=None, exclude=None):
        """
        Source can be provided as `text`, the text itself, or `filename`, from
        which the text will be read. Excluded lines are those that match
        `exclude`, a regex.

        """
        assert text or filename, "CodeParser needs either text or filename"
        self.filename = filename or "<code>"
        self.text = text
        if not self.text:
            # No text given: read it from the file, letting open_source
            # handle source-encoding concerns.
            try:
                sourcef = open_source(self.filename)
                try:
                    self.text = sourcef.read()
                finally:
                    sourcef.close()
            except IOError:
                # sys.exc_info() instead of "except ... as" keeps this file
                # parseable by very old Pythons.
                _, err, _ = sys.exc_info()
                raise NoSource(
                    "No source for code: %r: %s" % (self.filename, err)
                    )

        self.exclude = exclude

        # When True, _raw_parse prints every token it sees (debugging aid).
        self.show_tokens = False

        # The text lines of the parsed code.
        self.lines = self.text.split('\n')

        # The line numbers of excluded lines of code.
        self.excluded = set()

        # The line numbers of docstring lines.
        self.docstrings = set()

        # The line numbers of class definitions.
        self.classdefs = set()

        # A dict mapping line numbers to (lo,hi) for multi-line statements.
        self.multiline = {}

        # The line numbers that start statements.
        self.statement_starts = set()

        # Lazily-created ByteParser
        self._byte_parser = None

    def _get_byte_parser(self):
        """Create a ByteParser on demand."""
        if not self._byte_parser:
            self._byte_parser = \
                ByteParser(text=self.text, filename=self.filename)
        return self._byte_parser
    byte_parser = property(_get_byte_parser)

    def lines_matching(self, *regexes):
        """Find the lines matching one of a list of regexes.

        Returns a set of line numbers, the lines that contain a match for one
        of the regexes in `regexes`. The entire line needn't match, just a
        part of it.

        """
        regex_c = re.compile(join_regex(regexes))
        matches = set()
        # enumerate is 0-based, reported line numbers are 1-based.
        for i, ltext in enumerate(self.lines):
            if regex_c.search(ltext):
                matches.add(i+1)
        return matches

    def _raw_parse(self):
        """Parse the source to find the interesting facts about its lines.

        A handful of member fields are updated: `excluded`, `docstrings`,
        `classdefs`, `multiline`, and `statement_starts`.

        """
        # Find lines which match an exclusion pattern.
        if self.exclude:
            self.excluded = self.lines_matching(self.exclude)

        # Tokenize, to find excluded suites, to find docstrings, and to find
        # multi-line statements.
        indent = 0                      # Current indentation depth.
        exclude_indent = 0              # Indent level where exclusion began.
        excluding = False               # Currently inside an excluded suite?
        prev_toktype = token.INDENT     # The previous token's type.
        first_line = None               # First line of current statement.
        empty = True                    # No real code seen yet?

        tokgen = tokenize.generate_tokens(StringIO(self.text).readline)
        for toktype, ttext, (slineno, _), (elineno, _), ltext in tokgen:
            if self.show_tokens:                # pragma: no cover
                print("%10s %5s %-20r %r" % (
                    tokenize.tok_name.get(toktype, toktype),
                    nice_pair((slineno, elineno)), ttext, ltext
                    ))
            if toktype == token.INDENT:
                indent += 1
            elif toktype == token.DEDENT:
                indent -= 1
            elif toktype == token.NAME and ttext == 'class':
                # Class definitions look like branches in the byte code, so
                # we need to exclude them. The simplest way is to note the
                # lines with the 'class' keyword.
                self.classdefs.add(slineno)
            elif toktype == token.OP and ttext == ':':
                if not excluding and elineno in self.excluded:
                    # Start excluding a suite. We trigger off of the colon
                    # token so that the #pragma comment will be recognized on
                    # the same line as the colon.
                    exclude_indent = indent
                    excluding = True
            elif toktype == token.STRING and prev_toktype == token.INDENT:
                # Strings that are first on an indented line are docstrings.
                # (a trick from trace.py in the stdlib.) This works for
                # 99.9999% of cases. For the rest (!) see:
                # http://stackoverflow.com/questions/1769332/x/1769794#1769794
                for i in range(slineno, elineno+1):
                    self.docstrings.add(i)
            elif toktype == token.NEWLINE:
                if first_line is not None and elineno != first_line:
                    # We're at the end of a line, and we've ended on a
                    # different line than the first line of the statement,
                    # so record a multi-line range.
                    rng = (first_line, elineno)
                    for l in range(first_line, elineno+1):
                        self.multiline[l] = rng
                first_line = None

            if ttext.strip() and toktype != tokenize.COMMENT:
                # A non-whitespace token.
                empty = False
                if first_line is None:
                    # The token is not whitespace, and is the first in a
                    # statement.
                    first_line = slineno
                    # Check whether to end an excluded suite.
                    if excluding and indent <= exclude_indent:
                        excluding = False
                    if excluding:
                        self.excluded.add(elineno)

            prev_toktype = toktype

        # Find the starts of the executable statements.
        if not empty:
            self.statement_starts.update(self.byte_parser._find_statements())

    def first_line(self, line):
        """Return the first line number of the statement including `line`."""
        rng = self.multiline.get(line)
        if rng:
            first_line = rng[0]
        else:
            first_line = line
        return first_line

    def first_lines(self, lines, ignore=None):
        """Map the line numbers in `lines` to the correct first line of the
        statement.

        Skip any line mentioned in `ignore`.

        Returns a sorted list of the first lines.

        """
        ignore = ignore or []
        lset = set()
        for l in lines:
            if l in ignore:
                continue
            new_l = self.first_line(l)
            if new_l not in ignore:
                lset.add(new_l)
        return sorted(lset)

    def parse_source(self):
        """Parse source text to find executable lines, excluded lines, etc.

        Return values are 1) a sorted list of executable line numbers, and
        2) a sorted list of excluded line numbers.

        Reported line numbers are normalized to the first line of multi-line
        statements.

        """
        self._raw_parse()

        excluded_lines = self.first_lines(self.excluded)
        # Docstring lines are not executable; drop them along with the
        # excluded lines.
        ignore = excluded_lines + list(self.docstrings)
        lines = self.first_lines(self.statement_starts, ignore)

        return lines, excluded_lines

    def arcs(self):
        """Get information about the arcs available in the code.

        Returns a sorted list of line number pairs. Line numbers have been
        normalized to the first line of multiline statements.

        """
        all_arcs = []
        for l1, l2 in self.byte_parser._all_arcs():
            fl1 = self.first_line(l1)
            fl2 = self.first_line(l2)
            if fl1 != fl2:
                all_arcs.append((fl1, fl2))
        return sorted(all_arcs)
    arcs = expensive(arcs)

    def exit_counts(self):
        """Get a mapping from line numbers to count of exits from that line.

        Excluded lines are excluded.

        """
        excluded_lines = self.first_lines(self.excluded)
        exit_counts = {}
        for l1, l2 in self.arcs():
            if l1 < 0:
                # Don't ever report -1 as a line number
                continue
            if l1 in excluded_lines:
                # Don't report excluded lines as line numbers.
                continue
            if l2 in excluded_lines:
                # Arcs to excluded lines shouldn't count.
                continue
            if l1 not in exit_counts:
                exit_counts[l1] = 0
            exit_counts[l1] += 1

        # Class definitions have one extra exit, so remove one for each:
        for l in self.classdefs:
            # Ensure key is there: classdefs can include excluded lines.
            if l in exit_counts:
                exit_counts[l] -= 1

        return exit_counts
    exit_counts = expensive(exit_counts)
| 256 | |
| 257 | |
| 258 | ## Opcodes that guide the ByteParser. |
| 259 | |
| 260 | def _opcode(name): |
| 261 | """Return the opcode by name from the opcode module.""" |
| 262 | return opcode.opmap[name] |
| 263 | |
| 264 | def _opcode_set(*names): |
| 265 | """Return a set of opcodes by the names in `names`.""" |
| 266 | s = set() |
| 267 | for name in names: |
| 268 | try: |
| 269 | s.add(_opcode(name)) |
| 270 | except KeyError: |
| 271 | pass |
| 272 | return s |
| 273 | |
# Opcodes that leave the code object.
OPS_CODE_END = _opcode_set('RETURN_VALUE')

# Opcodes that unconditionally end the code chunk.
OPS_CHUNK_END = _opcode_set(
    'JUMP_ABSOLUTE', 'JUMP_FORWARD', 'RETURN_VALUE', 'RAISE_VARARGS',
    'BREAK_LOOP', 'CONTINUE_LOOP',
    )

# Opcodes that unconditionally begin a new code chunk.  By starting new chunks
# with unconditional jump instructions, we neatly deal with jumps to jumps
# properly.
OPS_CHUNK_BEGIN = _opcode_set('JUMP_ABSOLUTE', 'JUMP_FORWARD')

# Opcodes that push a block on the block stack.  Pushed blocks are the
# implicit jump targets for break/finally handling below.
OPS_PUSH_BLOCK = _opcode_set(
    'SETUP_LOOP', 'SETUP_EXCEPT', 'SETUP_FINALLY', 'SETUP_WITH'
    )

# Block types for exception handling.
OPS_EXCEPT_BLOCKS = _opcode_set('SETUP_EXCEPT', 'SETUP_FINALLY')

# Opcodes that pop a block from the block stack.
OPS_POP_BLOCK = _opcode_set('POP_BLOCK')

# Opcodes that have a jump destination, but aren't really a jump.
OPS_NO_JUMP = _opcode_set('SETUP_EXCEPT', 'SETUP_FINALLY')

# Individual opcodes we need below.
OP_BREAK_LOOP = _opcode('BREAK_LOOP')
OP_END_FINALLY = _opcode('END_FINALLY')
OP_COMPARE_OP = _opcode('COMPARE_OP')
# COMPARE_OP argument meaning "exception match" — used to spot except clauses.
COMPARE_EXCEPTION = 10  # just have to get this const from the code.
OP_LOAD_CONST = _opcode('LOAD_CONST')
OP_RETURN_VALUE = _opcode('RETURN_VALUE')
| 309 | |
| 310 | |
class ByteParser(object):
    """Parse byte codes to understand the structure of code."""

    def __init__(self, code=None, text=None, filename=None):
        # A code object may be supplied directly; otherwise it is compiled
        # from `text`, which in turn may be read from `filename`.
        if code:
            self.code = code
        else:
            if not text:
                assert filename, "If no code or text, need a filename"
                sourcef = open_source(filename)
                try:
                    text = sourcef.read()
                finally:
                    sourcef.close()

            try:
                # Python 2.3 and 2.4 don't like partial last lines, so be sure
                # the text ends nicely for them.
                self.code = compile(text + '\n', filename, "exec")
            except SyntaxError:
                _, synerr, _ = sys.exc_info()
                raise NotPython(
                    "Couldn't parse '%s' as Python source: '%s' at line %d" %
                    (filename, synerr.msg, synerr.lineno)
                    )

        # Alternative Python implementations don't always provide all the
        # attributes on code objects that we need to do the analysis.
        for attr in ['co_lnotab', 'co_firstlineno', 'co_consts', 'co_code']:
            if not hasattr(self.code, attr):
                raise CoverageException(
                    "This implementation of Python doesn't support code "
                    "analysis.\n"
                    "Run coverage.py under CPython for this command."
                    )

    def child_parsers(self):
        """Iterate over all the code objects nested within this one.

        The iteration includes `self` as its first value.

        """
        return map(lambda c: ByteParser(code=c), CodeObjects(self.code))

    # Getting numbers from the lnotab value changed in Py3.0.
    if sys.version_info >= (3, 0):
        def _lnotab_increments(self, lnotab):
            """Return a list of ints from the lnotab bytes in 3.x"""
            return list(lnotab)
    else:
        def _lnotab_increments(self, lnotab):
            """Return a list of ints from the lnotab string in 2.x"""
            return [ord(c) for c in lnotab]

    def _bytes_lines(self):
        """Map byte offsets to line numbers in `code`.

        Uses co_lnotab described in Python/compile.c to map byte offsets to
        line numbers. Returns a list: [(b0, l0), (b1, l1), ...]

        """
        # Adapted from dis.py in the standard library.
        # co_lnotab alternates byte increments and line increments.
        byte_increments = self._lnotab_increments(self.code.co_lnotab[0::2])
        line_increments = self._lnotab_increments(self.code.co_lnotab[1::2])

        bytes_lines = []
        last_line_num = None
        line_num = self.code.co_firstlineno
        byte_num = 0
        for byte_incr, line_incr in zip(byte_increments, line_increments):
            if byte_incr:
                # A new byte offset: emit the pending pair if the line moved.
                if line_num != last_line_num:
                    bytes_lines.append((byte_num, line_num))
                    last_line_num = line_num
                byte_num += byte_incr
            line_num += line_incr
        if line_num != last_line_num:
            bytes_lines.append((byte_num, line_num))
        return bytes_lines

    def _find_statements(self):
        """Find the statements in `self.code`.

        Return a set of line numbers that start statements. Recurses into all
        code objects reachable from `self.code`.

        """
        stmts = set()
        for bp in self.child_parsers():
            # Get all of the lineno information from this code.
            for _, l in bp._bytes_lines():
                stmts.add(l)
        return stmts

    def _disassemble(self):     # pragma: no cover
        """Disassemble code, for ad-hoc experimenting."""

        import dis

        for bp in self.child_parsers():
            print("\n%s: " % bp.code)
            dis.dis(bp.code)
            print("Bytes lines: %r" % bp._bytes_lines())

        print("")

    def _split_into_chunks(self):
        """Split the code object into a list of `Chunk` objects.

        Each chunk is only entered at its first instruction, though there can
        be many exits from a chunk.

        Returns a list of `Chunk` objects.

        """

        # The list of chunks so far, and the one we're working on.
        chunks = []
        chunk = None
        bytes_lines_map = dict(self._bytes_lines())

        # The block stack: loops and try blocks get pushed here for the
        # implicit jumps that can occur.
        # Each entry is a tuple: (block type, destination)
        block_stack = []

        # Some op codes are followed by branches that should be ignored. This
        # is a count of how many ignores are left.
        ignore_branch = 0

        # We have to handle the last two bytecodes specially.
        ult = penult = None

        for bc in ByteCodes(self.code.co_code):
            # Maybe have to start a new chunk
            if bc.offset in bytes_lines_map:
                # Start a new chunk for each source line number.
                if chunk:
                    chunk.exits.add(bc.offset)
                chunk = Chunk(bc.offset, bytes_lines_map[bc.offset])
                chunks.append(chunk)
            elif bc.op in OPS_CHUNK_BEGIN:
                # Jumps deserve their own unnumbered chunk. This fixes
                # problems with jumps to jumps getting confused.
                if chunk:
                    chunk.exits.add(bc.offset)
                chunk = Chunk(bc.offset)
                chunks.append(chunk)

            if not chunk:
                # The previous opcode ended its chunk; start a fresh one here.
                chunk = Chunk(bc.offset)
                chunks.append(chunk)

            # Look at the opcode
            if bc.jump_to >= 0 and bc.op not in OPS_NO_JUMP:
                if ignore_branch:
                    # Someone earlier wanted us to ignore this branch.
                    ignore_branch -= 1
                else:
                    # The opcode has a jump, it's an exit for this chunk.
                    chunk.exits.add(bc.jump_to)

            if bc.op in OPS_CODE_END:
                # The opcode can exit the code object.  Exits out of the code
                # object are recorded as the negated first line number.
                chunk.exits.add(-self.code.co_firstlineno)
            if bc.op in OPS_PUSH_BLOCK:
                # The opcode adds a block to the block_stack.
                block_stack.append((bc.op, bc.jump_to))
            if bc.op in OPS_POP_BLOCK:
                # The opcode pops a block from the block stack.
                block_stack.pop()
            if bc.op in OPS_CHUNK_END:
                # This opcode forces the end of the chunk.
                if bc.op == OP_BREAK_LOOP:
                    # A break is implicit: jump where the top of the
                    # block_stack points.
                    chunk.exits.add(block_stack[-1][1])
                chunk = None
            if bc.op == OP_END_FINALLY:
                if block_stack:
                    # A break that goes through a finally will jump to whatever
                    # block is on top of the stack.
                    chunk.exits.add(block_stack[-1][1])
                # For the finally clause we need to find the closest exception
                # block, and use its jump target as an exit.
                for iblock in range(len(block_stack)-1, -1, -1):
                    if block_stack[iblock][0] in OPS_EXCEPT_BLOCKS:
                        chunk.exits.add(block_stack[iblock][1])
                        break
            if bc.op == OP_COMPARE_OP and bc.arg == COMPARE_EXCEPTION:
                # This is an except clause. We want to overlook the next
                # branch, so that except's don't count as branches.
                ignore_branch += 1

            penult = ult
            ult = bc

        if chunks:
            # The last two bytecodes could be a dummy "return None" that
            # shouldn't be counted as real code. Every Python code object seems
            # to end with a return, and a "return None" is inserted if there
            # isn't an explicit return in the source.
            if ult and penult:
                if penult.op == OP_LOAD_CONST and ult.op == OP_RETURN_VALUE:
                    if self.code.co_consts[penult.arg] is None:
                        # This is "return None", but is it dummy? A real line
                        # would be a last chunk all by itself.
                        if chunks[-1].byte != penult.offset:
                            ex = -self.code.co_firstlineno
                            # Split the last chunk
                            last_chunk = chunks[-1]
                            last_chunk.exits.remove(ex)
                            last_chunk.exits.add(penult.offset)
                            chunk = Chunk(penult.offset)
                            chunk.exits.add(ex)
                            chunks.append(chunk)

            # Give all the chunks a length.
            chunks[-1].length = bc.next_offset - chunks[-1].byte
            for i in range(len(chunks)-1):
                chunks[i].length = chunks[i+1].byte - chunks[i].byte

        return chunks

    def _arcs(self):
        """Find the executable arcs in the code.

        Returns a set of pairs, (from,to). From and to are integer line
        numbers. If from is < 0, then the arc is an entrance into the code
        object. If to is < 0, the arc is an exit from the code object.

        """
        chunks = self._split_into_chunks()

        # A map from byte offsets to chunks jumped into.
        byte_chunks = dict([(c.byte, c) for c in chunks])

        # Build a map from byte offsets to actual lines reached.
        byte_lines = {}
        bytes_to_add = set([c.byte for c in chunks])

        while bytes_to_add:
            byte_to_add = bytes_to_add.pop()
            if byte_to_add in byte_lines or byte_to_add < 0:
                continue

            # Which lines does this chunk lead to?
            bytes_considered = set()
            bytes_to_consider = [byte_to_add]
            lines = set()

            while bytes_to_consider:
                byte = bytes_to_consider.pop()
                bytes_considered.add(byte)

                # Find chunk for byte
                try:
                    ch = byte_chunks[byte]
                except KeyError:
                    # The byte is inside some chunk; find it by linear scan
                    # and cache the answer for next time.
                    for ch in chunks:
                        if ch.byte <= byte < ch.byte+ch.length:
                            break
                    else:
                        # No chunk for this byte!
                        raise Exception("Couldn't find chunk @ %d" % byte)
                    byte_chunks[byte] = ch

                if ch.line:
                    # A numbered chunk: we've reached an actual line.
                    lines.add(ch.line)
                else:
                    # An unnumbered chunk: follow its exits transitively.
                    for ex in ch.exits:
                        if ex < 0:
                            lines.add(ex)
                        elif ex not in bytes_considered:
                            bytes_to_consider.append(ex)

                bytes_to_add.update(ch.exits)

            byte_lines[byte_to_add] = lines

        # Figure out for each chunk where the exits go.
        arcs = set()
        for chunk in chunks:
            if chunk.line:
                for ex in chunk.exits:
                    if ex < 0:
                        exit_lines = [ex]
                    else:
                        exit_lines = byte_lines[ex]
                    for exit_line in exit_lines:
                        if chunk.line != exit_line:
                            arcs.add((chunk.line, exit_line))
        # Byte offset 0 is the entry point of the code object.
        for line in byte_lines[0]:
            arcs.add((-1, line))

        return arcs

    def _all_chunks(self):
        """Returns a list of `Chunk` objects for this code and its children.

        See `_split_into_chunks` for details.

        """
        chunks = []
        for bp in self.child_parsers():
            chunks.extend(bp._split_into_chunks())

        return chunks

    def _all_arcs(self):
        """Get the set of all arcs in this code object and its children.

        See `_arcs` for details.

        """
        arcs = set()
        for bp in self.child_parsers():
            arcs.update(bp._arcs())

        return arcs
| 631 | |
| 632 | |
class Chunk(object):
    """A sequence of bytecodes with a single entrance.

    To analyze byte code, we have to divide it into chunks, sequences of byte
    codes such that each basic block has only one entrance, the first
    instruction in the block.

    This is almost the CS concept of `basic block`_, except that we're willing
    to have many exits from a chunk, and "basic block" is a more cumbersome
    term.

    .. _basic block: http://en.wikipedia.org/wiki/Basic_block

    An exit < 0 means the chunk can leave the code (return).  The exit is
    the negative of the starting line number of the code block.

    """
    def __init__(self, byte, line=0):
        # Byte offset of the chunk's first (and only entry) instruction.
        self.byte = byte
        # Source line for the chunk, or 0 for an unnumbered (jump) chunk.
        self.line = line
        # Length in bytes; filled in after all chunks are collected.
        self.length = 0
        # Byte offsets (or negative line numbers) this chunk can exit to.
        self.exits = set()

    def __repr__(self):
        details = (self.byte, self.length, self.line, list(self.exits))
        return "<%d+%d @%d %r>" % details
| 660 | |
| 661 | |
class AdHocMain(object):        # pragma: no cover
    """An ad-hoc main for code parsing experiments."""

    def main(self, args):
        """A main function for trying the code from the command line.

        `args` is the argument list to parse (typically sys.argv[1:]).

        """

        from optparse import OptionParser

        parser = OptionParser()
        parser.add_option(
            "-c", action="store_true", dest="chunks",
            help="Show basic block chunks"
            )
        parser.add_option(
            "-d", action="store_true", dest="dis",
            help="Disassemble"
            )
        parser.add_option(
            "-R", action="store_true", dest="recursive",
            help="Recurse to find source files"
            )
        parser.add_option(
            "-s", action="store_true", dest="source",
            help="Show analyzed source"
            )
        parser.add_option(
            "-t", action="store_true", dest="tokens",
            help="Show tokens"
            )

        # Parse the `args` we were handed, rather than letting optparse
        # silently fall back to sys.argv[1:], so this method can be driven
        # programmatically as well as from the command line.
        options, args = parser.parse_args(args)
        if options.recursive:
            if args:
                root = args[0]
            else:
                root = "."
            for root, _, _ in os.walk(root):
                for f in glob.glob(root + "/*.py"):
                    self.adhoc_one_file(options, f)
        else:
            self.adhoc_one_file(options, args[0])

    def adhoc_one_file(self, options, filename):
        """Process just one file, per the flags in `options`."""

        if options.dis or options.chunks:
            try:
                bp = ByteParser(filename=filename)
            except CoverageException:
                _, err, _ = sys.exc_info()
                print("%s" % (err,))
                return

            if options.dis:
                print("Main code:")
                bp._disassemble()

            if options.chunks:
                chunks = bp._all_chunks()
                if options.recursive:
                    print("%6d: %s" % (len(chunks), filename))
                else:
                    print("Chunks: %r" % chunks)
                    arcs = bp._all_arcs()
                    print("Arcs: %r" % sorted(arcs))

        if options.source or options.tokens:
            cp = CodeParser(filename=filename, exclude=r"no\s*cover")
            cp.show_tokens = options.tokens
            cp._raw_parse()

            if options.source:
                if options.chunks:
                    arc_width, arc_chars = self.arc_ascii_art(arcs)
                else:
                    arc_width, arc_chars = 0, {}

                exit_counts = cp.exit_counts()

                # Print each source line with markers for what the parsers
                # found on it:  '-' statement start, exit count, '"' docstring,
                # 'C' class def, 'x' excluded, plus arc ascii-art.
                for i, ltext in enumerate(cp.lines):
                    lineno = i+1
                    m0 = m1 = m2 = m3 = a = ' '
                    if lineno in cp.statement_starts:
                        m0 = '-'
                    exits = exit_counts.get(lineno, 0)
                    if exits > 1:
                        m1 = str(exits)
                    if lineno in cp.docstrings:
                        m2 = '"'
                    if lineno in cp.classdefs:
                        m2 = 'C'
                    if lineno in cp.excluded:
                        m3 = 'x'
                    a = arc_chars.get(lineno, '').ljust(arc_width)
                    print("%4d %s%s%s%s%s %s" %
                        (lineno, m0, m1, m2, m3, a, ltext)
                        )

    def arc_ascii_art(self, arcs):
        """Draw arcs as ascii art.

        `arcs` is a collection of (from, to) line-number pairs; negative
        numbers mean entering or leaving the code.

        Returns a width of characters needed to draw all the arcs, and a
        dictionary mapping line numbers to ascii strings to draw for that line.

        """
        arc_chars = {}
        for lfrom, lto in sorted(arcs):
            if lfrom < 0:
                # An entrance into the code: mark the target line.
                arc_chars[lto] = arc_chars.get(lto, '') + 'v'
            elif lto < 0:
                # An exit from the code: mark the source line.
                arc_chars[lfrom] = arc_chars.get(lfrom, '') + '^'
            else:
                if lfrom == lto - 1:
                    # Don't show obvious arcs.
                    continue
                if lfrom < lto:
                    l1, l2 = lfrom, lto
                else:
                    l1, l2 = lto, lfrom
                # Draw the vertical run in the first column that is free on
                # every line the arc spans.
                w = max([len(arc_chars.get(l, '')) for l in range(l1, l2+1)])
                for l in range(l1, l2+1):
                    if l == lfrom:
                        ch = '<'
                    elif l == lto:
                        ch = '>'
                    else:
                        ch = '|'
                    arc_chars[l] = arc_chars.get(l, '').ljust(w) + ch

        if arc_chars:
            arc_width = max([len(a) for a in arc_chars.values()])
        else:
            arc_width = 0

        return arc_width, arc_chars
| 798 | |
if __name__ == '__main__':
    # Allow running this module directly for ad-hoc parsing experiments.
    AdHocMain().main(sys.argv[1:])