| "Utility functions used by the btm_matcher module" |
| |
| from . import pytree |
| from .pgen2 import grammar, token |
| from .pygram import pattern_symbols, python_symbols |
| |
| syms = pattern_symbols |
| pysyms = python_symbols |
| tokens = grammar.opmap |
| token_labels = token |
| |
| TYPE_ANY = -1 |
| TYPE_ALTERNATIVES = -2 |
| TYPE_GROUP = -3 |
| |
| class MinNode(object): |
| """This class serves as an intermediate representation of the |
| pattern tree during the conversion to sets of leaf-to-root |
| subpatterns""" |
| |
| def __init__(self, type=None, name=None): |
| self.type = type |
| self.name = name |
| self.children = [] |
| self.leaf = False |
| self.parent = None |
| self.alternatives = [] |
| self.group = [] |
| |
| def __repr__(self): |
| return str(self.type) + ' ' + str(self.name) |
| |
| def leaf_to_root(self): |
| """Internal method. Returns a characteristic path of the |
| pattern tree. This method must be run for all leaves until the |
| linear subpatterns are merged into a single""" |
| node = self |
| subp = [] |
| while node: |
| if node.type == TYPE_ALTERNATIVES: |
| node.alternatives.append(subp) |
| if len(node.alternatives) == len(node.children): |
| #last alternative |
| subp = [tuple(node.alternatives)] |
| node.alternatives = [] |
| node = node.parent |
| continue |
| else: |
| node = node.parent |
| subp = None |
| break |
| |
| if node.type == TYPE_GROUP: |
| node.group.append(subp) |
| #probably should check the number of leaves |
| if len(node.group) == len(node.children): |
| subp = get_characteristic_subpattern(node.group) |
| node.group = [] |
| node = node.parent |
| continue |
| else: |
| node = node.parent |
| subp = None |
| break |
| |
| if node.type == token_labels.NAME and node.name: |
| #in case of type=name, use the name instead |
| subp.append(node.name) |
| else: |
| subp.append(node.type) |
| |
| node = node.parent |
| return subp |
| |
| def get_linear_subpattern(self): |
| """Drives the leaf_to_root method. The reason that |
| leaf_to_root must be run multiple times is because we need to |
| reject 'group' matches; for example the alternative form |
| (a | b c) creates a group [b c] that needs to be matched. Since |
| matching multiple linear patterns overcomes the automaton's |
| capabilities, leaf_to_root merges each group into a single |
| choice based on 'characteristic'ity, |
| |
| i.e. (a|b c) -> (a|b) if b more characteristic than c |
| |
| Returns: The most 'characteristic'(as defined by |
| get_characteristic_subpattern) path for the compiled pattern |
| tree. |
| """ |
| |
| for l in self.leaves(): |
| subp = l.leaf_to_root() |
| if subp: |
| return subp |
| |
| def leaves(self): |
| "Generator that returns the leaves of the tree" |
| for child in self.children: |
| for x in child.leaves(): |
| yield x |
| if not self.children: |
| yield self |
| |
| def reduce_tree(node, parent=None): |
| """ |
| Internal function. Reduces a compiled pattern tree to an |
| intermediate representation suitable for feeding the |
| automaton. This also trims off any optional pattern elements(like |
| [a], a*). |
| """ |
| |
| new_node = None |
| #switch on the node type |
| if node.type == syms.Matcher: |
| #skip |
| node = node.children[0] |
| |
| if node.type == syms.Alternatives : |
| #2 cases |
| if len(node.children) <= 2: |
| #just a single 'Alternative', skip this node |
| new_node = reduce_tree(node.children[0], parent) |
| else: |
| #real alternatives |
| new_node = MinNode(type=TYPE_ALTERNATIVES) |
| #skip odd children('|' tokens) |
| for child in node.children: |
| if node.children.index(child)%2: |
| continue |
| reduced = reduce_tree(child, new_node) |
| if reduced is not None: |
| new_node.children.append(reduced) |
| elif node.type == syms.Alternative: |
| if len(node.children) > 1: |
| |
| new_node = MinNode(type=TYPE_GROUP) |
| for child in node.children: |
| reduced = reduce_tree(child, new_node) |
| if reduced: |
| new_node.children.append(reduced) |
| if not new_node.children: |
| # delete the group if all of the children were reduced to None |
| new_node = None |
| |
| else: |
| new_node = reduce_tree(node.children[0], parent) |
| |
| elif node.type == syms.Unit: |
| if (isinstance(node.children[0], pytree.Leaf) and |
| node.children[0].value == '('): |
| #skip parentheses |
| return reduce_tree(node.children[1], parent) |
| if ((isinstance(node.children[0], pytree.Leaf) and |
| node.children[0].value == '[') |
| or |
| (len(node.children)>1 and |
| hasattr(node.children[1], "value") and |
| node.children[1].value == '[')): |
| #skip whole unit if its optional |
| return None |
| |
| leaf = True |
| details_node = None |
| alternatives_node = None |
| has_repeater = False |
| repeater_node = None |
| has_variable_name = False |
| |
| for child in node.children: |
| if child.type == syms.Details: |
| leaf = False |
| details_node = child |
| elif child.type == syms.Repeater: |
| has_repeater = True |
| repeater_node = child |
| elif child.type == syms.Alternatives: |
| alternatives_node = child |
| if hasattr(child, 'value') and child.value == '=': # variable name |
| has_variable_name = True |
| |
| #skip variable name |
| if has_variable_name: |
| #skip variable name, '=' |
| name_leaf = node.children[2] |
| if hasattr(name_leaf, 'value') and name_leaf.value == '(': |
| # skip parenthesis |
| name_leaf = node.children[3] |
| else: |
| name_leaf = node.children[0] |
| |
| #set node type |
| if name_leaf.type == token_labels.NAME: |
| #(python) non-name or wildcard |
| if name_leaf.value == 'any': |
| new_node = MinNode(type=TYPE_ANY) |
| else: |
| if hasattr(token_labels, name_leaf.value): |
| new_node = MinNode(type=getattr(token_labels, name_leaf.value)) |
| else: |
| new_node = MinNode(type=getattr(pysyms, name_leaf.value)) |
| |
| elif name_leaf.type == token_labels.STRING: |
| #(python) name or character; remove the apostrophes from |
| #the string value |
| name = name_leaf.value.strip("'") |
| if name in tokens: |
| new_node = MinNode(type=tokens[name]) |
| else: |
| new_node = MinNode(type=token_labels.NAME, name=name) |
| elif name_leaf.type == syms.Alternatives: |
| new_node = reduce_tree(alternatives_node, parent) |
| |
| #handle repeaters |
| if has_repeater: |
| if repeater_node.children[0].value == '*': |
| #reduce to None |
| new_node = None |
| elif repeater_node.children[0].value == '+': |
| #reduce to a single occurence i.e. do nothing |
| pass |
| else: |
| #TODO: handle {min, max} repeaters |
| raise NotImplementedError |
| pass |
| |
| #add children |
| if details_node and new_node is not None: |
| for child in details_node.children[1:-1]: |
| #skip '<', '>' markers |
| reduced = reduce_tree(child, new_node) |
| if reduced is not None: |
| new_node.children.append(reduced) |
| if new_node: |
| new_node.parent = parent |
| return new_node |
| |
| |
| def get_characteristic_subpattern(subpatterns): |
| """Picks the most characteristic from a list of linear patterns |
| Current order used is: |
| names > common_names > common_chars |
| """ |
| if not isinstance(subpatterns, list): |
| return subpatterns |
| if len(subpatterns)==1: |
| return subpatterns[0] |
| |
| # first pick out the ones containing variable names |
| subpatterns_with_names = [] |
| subpatterns_with_common_names = [] |
| common_names = ['in', 'for', 'if' , 'not', 'None'] |
| subpatterns_with_common_chars = [] |
| common_chars = "[]().,:" |
| for subpattern in subpatterns: |
| if any(rec_test(subpattern, lambda x: type(x) is str)): |
| if any(rec_test(subpattern, |
| lambda x: isinstance(x, str) and x in common_chars)): |
| subpatterns_with_common_chars.append(subpattern) |
| elif any(rec_test(subpattern, |
| lambda x: isinstance(x, str) and x in common_names)): |
| subpatterns_with_common_names.append(subpattern) |
| |
| else: |
| subpatterns_with_names.append(subpattern) |
| |
| if subpatterns_with_names: |
| subpatterns = subpatterns_with_names |
| elif subpatterns_with_common_names: |
| subpatterns = subpatterns_with_common_names |
| elif subpatterns_with_common_chars: |
| subpatterns = subpatterns_with_common_chars |
| # of the remaining subpatterns pick out the longest one |
| return max(subpatterns, key=len) |
| |
| def rec_test(sequence, test_func): |
| """Tests test_func on all items of sequence and items of included |
| sub-iterables""" |
| for x in sequence: |
| if isinstance(x, (list, tuple)): |
| for y in rec_test(x, test_func): |
| yield y |
| else: |
| yield test_func(x) |