Benjamin Peterson | f37eb3a | 2010-10-14 23:00:04 +0000 | [diff] [blame] | 1 | """A bottom-up tree matching algorithm implementation meant to speed |
| 2 | up 2to3's matching process. After the tree patterns are reduced to |
| 3 | their rarest linear path, a linear Aho-Corasick automaton is |
| 4 | created. The linear automaton traverses the linear paths from the |
| 5 | leaves to the root of the AST and returns a set of nodes for further |
| 6 | matching. This reduces significantly the number of candidate nodes.""" |
| 7 | |
| 8 | __author__ = "George Boutsioukis <gboutsioukis@gmail.com>" |
| 9 | |
| 10 | import logging |
| 11 | import itertools |
| 12 | from collections import defaultdict |
| 13 | |
| 14 | from . import pytree |
| 15 | from .btm_utils import reduce_tree |
| 16 | |
| 17 | class BMNode(object): |
| 18 | """Class for a node of the Aho-Corasick automaton used in matching""" |
| 19 | count = itertools.count() |
| 20 | def __init__(self): |
| 21 | self.transition_table = {} |
| 22 | self.fixers = [] |
| 23 | self.id = next(BMNode.count) |
| 24 | self.content = '' |
| 25 | |
| 26 | class BottomMatcher(object): |
| 27 | """The main matcher class. After instantiating the patterns should |
| 28 | be added using the add_fixer method""" |
| 29 | |
| 30 | def __init__(self): |
| 31 | self.match = set() |
| 32 | self.root = BMNode() |
| 33 | self.nodes = [self.root] |
| 34 | self.fixers = [] |
| 35 | self.logger = logging.getLogger("RefactoringTool") |
| 36 | |
| 37 | def add_fixer(self, fixer): |
| 38 | """Reduces a fixer's pattern tree to a linear path and adds it |
| 39 | to the matcher(a common Aho-Corasick automaton). The fixer is |
| 40 | appended on the matching states and called when they are |
| 41 | reached""" |
| 42 | self.fixers.append(fixer) |
| 43 | tree = reduce_tree(fixer.pattern_tree) |
| 44 | linear = tree.get_linear_subpattern() |
| 45 | match_nodes = self.add(linear, start=self.root) |
| 46 | for match_node in match_nodes: |
| 47 | match_node.fixers.append(fixer) |
| 48 | |
| 49 | def add(self, pattern, start): |
| 50 | "Recursively adds a linear pattern to the AC automaton" |
| 51 | #print("adding pattern", pattern, "to", start) |
| 52 | if not pattern: |
| 53 | #print("empty pattern") |
| 54 | return [start] |
| 55 | if isinstance(pattern[0], tuple): |
| 56 | #alternatives |
| 57 | #print("alternatives") |
| 58 | match_nodes = [] |
| 59 | for alternative in pattern[0]: |
| 60 | #add all alternatives, and add the rest of the pattern |
| 61 | #to each end node |
| 62 | end_nodes = self.add(alternative, start=start) |
| 63 | for end in end_nodes: |
| 64 | match_nodes.extend(self.add(pattern[1:], end)) |
| 65 | return match_nodes |
| 66 | else: |
| 67 | #single token |
| 68 | #not last |
| 69 | if pattern[0] not in start.transition_table: |
| 70 | #transition did not exist, create new |
| 71 | next_node = BMNode() |
| 72 | start.transition_table[pattern[0]] = next_node |
| 73 | else: |
| 74 | #transition exists already, follow |
| 75 | next_node = start.transition_table[pattern[0]] |
| 76 | |
| 77 | if pattern[1:]: |
| 78 | end_nodes = self.add(pattern[1:], start=next_node) |
| 79 | else: |
| 80 | end_nodes = [next_node] |
| 81 | return end_nodes |
| 82 | |
| 83 | def run(self, leaves): |
| 84 | """The main interface with the bottom matcher. The tree is |
| 85 | traversed from the bottom using the constructed |
| 86 | automaton. Nodes are only checked once as the tree is |
| 87 | retraversed. When the automaton fails, we give it one more |
| 88 | shot(in case the above tree matches as a whole with the |
| 89 | rejected leaf), then we break for the next leaf. There is the |
| 90 | special case of multiple arguments(see code comments) where we |
| 91 | recheck the nodes |
| 92 | |
| 93 | Args: |
| 94 | The leaves of the AST tree to be matched |
| 95 | |
| 96 | Returns: |
| 97 | A dictionary of node matches with fixers as the keys |
| 98 | """ |
| 99 | current_ac_node = self.root |
| 100 | results = defaultdict(list) |
| 101 | for leaf in leaves: |
| 102 | current_ast_node = leaf |
| 103 | while current_ast_node: |
| 104 | current_ast_node.was_checked = True |
| 105 | for child in current_ast_node.children: |
| 106 | # multiple statements, recheck |
| 107 | if isinstance(child, pytree.Leaf) and child.value == ";": |
| 108 | current_ast_node.was_checked = False |
| 109 | break |
| 110 | if current_ast_node.type == 1: |
| 111 | #name |
| 112 | node_token = current_ast_node.value |
| 113 | else: |
| 114 | node_token = current_ast_node.type |
| 115 | |
| 116 | if node_token in current_ac_node.transition_table: |
| 117 | #token matches |
| 118 | current_ac_node = current_ac_node.transition_table[node_token] |
| 119 | for fixer in current_ac_node.fixers: |
| 120 | if not fixer in results: |
| 121 | results[fixer] = [] |
| 122 | results[fixer].append(current_ast_node) |
| 123 | |
| 124 | else: |
| 125 | #matching failed, reset automaton |
| 126 | current_ac_node = self.root |
| 127 | if (current_ast_node.parent is not None |
| 128 | and current_ast_node.parent.was_checked): |
| 129 | #the rest of the tree upwards has been checked, next leaf |
| 130 | break |
| 131 | |
| 132 | #recheck the rejected node once from the root |
| 133 | if node_token in current_ac_node.transition_table: |
| 134 | #token matches |
| 135 | current_ac_node = current_ac_node.transition_table[node_token] |
| 136 | for fixer in current_ac_node.fixers: |
| 137 | if not fixer in results.keys(): |
| 138 | results[fixer] = [] |
| 139 | results[fixer].append(current_ast_node) |
| 140 | |
| 141 | current_ast_node = current_ast_node.parent |
| 142 | return results |
| 143 | |
| 144 | def print_ac(self): |
| 145 | "Prints a graphviz diagram of the BM automaton(for debugging)" |
| 146 | print("digraph g{") |
| 147 | def print_node(node): |
| 148 | for subnode_key in node.transition_table.keys(): |
| 149 | subnode = node.transition_table[subnode_key] |
| 150 | print("%d -> %d [label=%s] //%s" % |
| 151 | (node.id, subnode.id, type_repr(subnode_key), str(subnode.fixers))) |
| 152 | if subnode_key == 1: |
| 153 | print(subnode.content) |
| 154 | print_node(subnode) |
| 155 | print_node(self.root) |
| 156 | print("}") |
| 157 | |
| 158 | # taken from pytree.py for debugging; only used by print_ac |
| 159 | _type_reprs = {} |
| 160 | def type_repr(type_num): |
| 161 | global _type_reprs |
| 162 | if not _type_reprs: |
| 163 | from .pygram import python_symbols |
| 164 | # printing tokens is possible but not as useful |
| 165 | # from .pgen2 import token // token.__dict__.items(): |
| 166 | for name, val in python_symbols.__dict__.items(): |
| 167 | if type(val) == int: _type_reprs[val] = name |
| 168 | return _type_reprs.setdefault(type_num, type_num) |