Benjamin Peterson | f37eb3a | 2010-10-14 23:00:04 +0000 | [diff] [blame] | 1 | "Utility functions used by the btm_matcher module" |
| 2 | |
| 3 | from . import pytree |
| 4 | from .pgen2 import grammar, token |
| 5 | from .pygram import pattern_symbols, python_symbols |
| 6 | |
| 7 | syms = pattern_symbols |
| 8 | pysyms = python_symbols |
| 9 | tokens = grammar.opmap |
| 10 | token_labels = token |
| 11 | |
| 12 | TYPE_ANY = -1 |
| 13 | TYPE_ALTERNATIVES = -2 |
| 14 | TYPE_GROUP = -3 |
| 15 | |
| 16 | class MinNode(object): |
| 17 | """This class serves as an intermediate representation of the |
| 18 | pattern tree during the conversion to sets of leaf-to-root |
| 19 | subpatterns""" |
| 20 | |
| 21 | def __init__(self, type=None, name=None): |
| 22 | self.type = type |
| 23 | self.name = name |
| 24 | self.children = [] |
| 25 | self.leaf = False |
| 26 | self.parent = None |
| 27 | self.alternatives = [] |
| 28 | self.group = [] |
| 29 | |
| 30 | def __repr__(self): |
| 31 | return str(self.type) + ' ' + str(self.name) |
| 32 | |
| 33 | def leaf_to_root(self): |
| 34 | """Internal method. Returns a characteristic path of the |
| 35 | pattern tree. This method must be run for all leaves until the |
| 36 | linear subpatterns are merged into a single""" |
| 37 | node = self |
| 38 | subp = [] |
| 39 | while node: |
| 40 | if node.type == TYPE_ALTERNATIVES: |
| 41 | node.alternatives.append(subp) |
| 42 | if len(node.alternatives) == len(node.children): |
| 43 | #last alternative |
| 44 | subp = [tuple(node.alternatives)] |
| 45 | node.alternatives = [] |
| 46 | node = node.parent |
| 47 | continue |
| 48 | else: |
| 49 | node = node.parent |
| 50 | subp = None |
| 51 | break |
| 52 | |
| 53 | if node.type == TYPE_GROUP: |
| 54 | node.group.append(subp) |
| 55 | #probably should check the number of leaves |
| 56 | if len(node.group) == len(node.children): |
| 57 | subp = get_characteristic_subpattern(node.group) |
| 58 | node.group = [] |
| 59 | node = node.parent |
| 60 | continue |
| 61 | else: |
| 62 | node = node.parent |
| 63 | subp = None |
| 64 | break |
| 65 | |
| 66 | if node.type == token_labels.NAME and node.name: |
| 67 | #in case of type=name, use the name instead |
| 68 | subp.append(node.name) |
| 69 | else: |
| 70 | subp.append(node.type) |
| 71 | |
| 72 | node = node.parent |
| 73 | return subp |
| 74 | |
| 75 | def get_linear_subpattern(self): |
| 76 | """Drives the leaf_to_root method. The reason that |
| 77 | leaf_to_root must be run multiple times is because we need to |
| 78 | reject 'group' matches; for example the alternative form |
| 79 | (a | b c) creates a group [b c] that needs to be matched. Since |
| 80 | matching multiple linear patterns overcomes the automaton's |
| 81 | capabilities, leaf_to_root merges each group into a single |
| 82 | choice based on 'characteristic'ity, |
| 83 | |
| 84 | i.e. (a|b c) -> (a|b) if b more characteristic than c |
| 85 | |
| 86 | Returns: The most 'characteristic'(as defined by |
| 87 | get_characteristic_subpattern) path for the compiled pattern |
| 88 | tree. |
| 89 | """ |
| 90 | |
| 91 | for l in self.leaves(): |
| 92 | subp = l.leaf_to_root() |
| 93 | if subp: |
| 94 | return subp |
| 95 | |
| 96 | def leaves(self): |
| 97 | "Generator that returns the leaves of the tree" |
| 98 | for child in self.children: |
Andrew Svetlov | 7d14015 | 2012-10-06 17:11:45 +0300 | [diff] [blame] | 99 | yield from child.leaves() |
Benjamin Peterson | f37eb3a | 2010-10-14 23:00:04 +0000 | [diff] [blame] | 100 | if not self.children: |
| 101 | yield self |
| 102 | |
| 103 | def reduce_tree(node, parent=None): |
| 104 | """ |
| 105 | Internal function. Reduces a compiled pattern tree to an |
| 106 | intermediate representation suitable for feeding the |
| 107 | automaton. This also trims off any optional pattern elements(like |
| 108 | [a], a*). |
| 109 | """ |
| 110 | |
| 111 | new_node = None |
| 112 | #switch on the node type |
| 113 | if node.type == syms.Matcher: |
| 114 | #skip |
| 115 | node = node.children[0] |
| 116 | |
| 117 | if node.type == syms.Alternatives : |
| 118 | #2 cases |
| 119 | if len(node.children) <= 2: |
| 120 | #just a single 'Alternative', skip this node |
| 121 | new_node = reduce_tree(node.children[0], parent) |
| 122 | else: |
| 123 | #real alternatives |
| 124 | new_node = MinNode(type=TYPE_ALTERNATIVES) |
| 125 | #skip odd children('|' tokens) |
| 126 | for child in node.children: |
| 127 | if node.children.index(child)%2: |
| 128 | continue |
| 129 | reduced = reduce_tree(child, new_node) |
| 130 | if reduced is not None: |
| 131 | new_node.children.append(reduced) |
| 132 | elif node.type == syms.Alternative: |
| 133 | if len(node.children) > 1: |
| 134 | |
| 135 | new_node = MinNode(type=TYPE_GROUP) |
| 136 | for child in node.children: |
| 137 | reduced = reduce_tree(child, new_node) |
| 138 | if reduced: |
| 139 | new_node.children.append(reduced) |
| 140 | if not new_node.children: |
| 141 | # delete the group if all of the children were reduced to None |
| 142 | new_node = None |
| 143 | |
| 144 | else: |
| 145 | new_node = reduce_tree(node.children[0], parent) |
| 146 | |
| 147 | elif node.type == syms.Unit: |
| 148 | if (isinstance(node.children[0], pytree.Leaf) and |
| 149 | node.children[0].value == '('): |
| 150 | #skip parentheses |
| 151 | return reduce_tree(node.children[1], parent) |
| 152 | if ((isinstance(node.children[0], pytree.Leaf) and |
| 153 | node.children[0].value == '[') |
| 154 | or |
| 155 | (len(node.children)>1 and |
| 156 | hasattr(node.children[1], "value") and |
| 157 | node.children[1].value == '[')): |
| 158 | #skip whole unit if its optional |
| 159 | return None |
| 160 | |
| 161 | leaf = True |
| 162 | details_node = None |
| 163 | alternatives_node = None |
| 164 | has_repeater = False |
| 165 | repeater_node = None |
| 166 | has_variable_name = False |
| 167 | |
| 168 | for child in node.children: |
| 169 | if child.type == syms.Details: |
| 170 | leaf = False |
| 171 | details_node = child |
| 172 | elif child.type == syms.Repeater: |
| 173 | has_repeater = True |
| 174 | repeater_node = child |
| 175 | elif child.type == syms.Alternatives: |
| 176 | alternatives_node = child |
| 177 | if hasattr(child, 'value') and child.value == '=': # variable name |
| 178 | has_variable_name = True |
| 179 | |
| 180 | #skip variable name |
| 181 | if has_variable_name: |
| 182 | #skip variable name, '=' |
| 183 | name_leaf = node.children[2] |
| 184 | if hasattr(name_leaf, 'value') and name_leaf.value == '(': |
| 185 | # skip parenthesis |
| 186 | name_leaf = node.children[3] |
| 187 | else: |
| 188 | name_leaf = node.children[0] |
| 189 | |
| 190 | #set node type |
| 191 | if name_leaf.type == token_labels.NAME: |
| 192 | #(python) non-name or wildcard |
| 193 | if name_leaf.value == 'any': |
| 194 | new_node = MinNode(type=TYPE_ANY) |
| 195 | else: |
| 196 | if hasattr(token_labels, name_leaf.value): |
| 197 | new_node = MinNode(type=getattr(token_labels, name_leaf.value)) |
| 198 | else: |
| 199 | new_node = MinNode(type=getattr(pysyms, name_leaf.value)) |
| 200 | |
| 201 | elif name_leaf.type == token_labels.STRING: |
| 202 | #(python) name or character; remove the apostrophes from |
| 203 | #the string value |
| 204 | name = name_leaf.value.strip("'") |
| 205 | if name in tokens: |
| 206 | new_node = MinNode(type=tokens[name]) |
| 207 | else: |
| 208 | new_node = MinNode(type=token_labels.NAME, name=name) |
| 209 | elif name_leaf.type == syms.Alternatives: |
| 210 | new_node = reduce_tree(alternatives_node, parent) |
| 211 | |
| 212 | #handle repeaters |
| 213 | if has_repeater: |
| 214 | if repeater_node.children[0].value == '*': |
| 215 | #reduce to None |
| 216 | new_node = None |
| 217 | elif repeater_node.children[0].value == '+': |
| 218 | #reduce to a single occurence i.e. do nothing |
| 219 | pass |
| 220 | else: |
| 221 | #TODO: handle {min, max} repeaters |
| 222 | raise NotImplementedError |
| 223 | pass |
| 224 | |
| 225 | #add children |
| 226 | if details_node and new_node is not None: |
| 227 | for child in details_node.children[1:-1]: |
| 228 | #skip '<', '>' markers |
| 229 | reduced = reduce_tree(child, new_node) |
| 230 | if reduced is not None: |
| 231 | new_node.children.append(reduced) |
| 232 | if new_node: |
| 233 | new_node.parent = parent |
| 234 | return new_node |
| 235 | |
| 236 | |
| 237 | def get_characteristic_subpattern(subpatterns): |
| 238 | """Picks the most characteristic from a list of linear patterns |
| 239 | Current order used is: |
| 240 | names > common_names > common_chars |
| 241 | """ |
| 242 | if not isinstance(subpatterns, list): |
| 243 | return subpatterns |
| 244 | if len(subpatterns)==1: |
| 245 | return subpatterns[0] |
| 246 | |
| 247 | # first pick out the ones containing variable names |
| 248 | subpatterns_with_names = [] |
| 249 | subpatterns_with_common_names = [] |
| 250 | common_names = ['in', 'for', 'if' , 'not', 'None'] |
| 251 | subpatterns_with_common_chars = [] |
| 252 | common_chars = "[]().,:" |
| 253 | for subpattern in subpatterns: |
| 254 | if any(rec_test(subpattern, lambda x: type(x) is str)): |
| 255 | if any(rec_test(subpattern, |
| 256 | lambda x: isinstance(x, str) and x in common_chars)): |
| 257 | subpatterns_with_common_chars.append(subpattern) |
| 258 | elif any(rec_test(subpattern, |
| 259 | lambda x: isinstance(x, str) and x in common_names)): |
| 260 | subpatterns_with_common_names.append(subpattern) |
| 261 | |
| 262 | else: |
| 263 | subpatterns_with_names.append(subpattern) |
| 264 | |
| 265 | if subpatterns_with_names: |
| 266 | subpatterns = subpatterns_with_names |
| 267 | elif subpatterns_with_common_names: |
| 268 | subpatterns = subpatterns_with_common_names |
| 269 | elif subpatterns_with_common_chars: |
| 270 | subpatterns = subpatterns_with_common_chars |
| 271 | # of the remaining subpatterns pick out the longest one |
| 272 | return max(subpatterns, key=len) |
| 273 | |
| 274 | def rec_test(sequence, test_func): |
| 275 | """Tests test_func on all items of sequence and items of included |
| 276 | sub-iterables""" |
| 277 | for x in sequence: |
| 278 | if isinstance(x, (list, tuple)): |
Andrew Svetlov | 7d14015 | 2012-10-06 17:11:45 +0300 | [diff] [blame] | 279 | yield from rec_test(x, test_func) |
Benjamin Peterson | f37eb3a | 2010-10-14 23:00:04 +0000 | [diff] [blame] | 280 | else: |
| 281 | yield test_func(x) |