Christian Heimes | 9054000 | 2008-05-08 14:29:10 +0000 | [diff] [blame] | 1 | """Iterator based sre token scanner |
| 2 | |
| 3 | """ |
| 4 | |
| 5 | import re |
| 6 | import sre_parse |
| 7 | import sre_compile |
| 8 | import sre_constants |
| 9 | |
| 10 | from re import VERBOSE, MULTILINE, DOTALL |
| 11 | from sre_constants import BRANCH, SUBPATTERN |
| 12 | |
# Public names of this module.
__all__ = ['Scanner', 'pattern']

# Default regex flags used by Scanner and pattern when the caller gives none.
FLAGS = VERBOSE | MULTILINE | DOTALL
| 16 | |
class Scanner(object):
    """Token scanner that matches every lexicon phrase with one compound regex.

    Each lexicon entry is a callable that exposes its regular expression
    source as a ``pattern`` attribute.  The phrases are joined into a single
    alternation in which entry number ``i`` (0-based) becomes group ``i + 1``;
    ``self.actions`` is indexed by that group number.

    NOTE(review): this relies on private ``sre_parse`` / ``sre_compile``
    structures whose layout differs between Python versions -- confirm
    against the interpreter this module targets.
    """

    def __init__(self, lexicon, flags=FLAGS):
        """Compile *lexicon* into the compound pattern.

        lexicon -- iterable of callables, each with a ``pattern`` attribute.
        flags   -- regex flags used when parsing every phrase.

        Any error raised while parsing a phrase propagates unchanged.
        """
        # Index 0 is a placeholder so that group number N maps to actions[N].
        self.actions = [None]
        # Combine phrases into a compound pattern
        s = sre_parse.Pattern()
        s.flags = flags
        p = []
        for idx, token in enumerate(lexicon):
            phrase = token.pattern
            subpattern = sre_parse.SubPattern(
                s, [(SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))])
            p.append(subpattern)
            self.actions.append(token)

        s.groups = len(p) + 1  # NOTE(guido): Added to make SRE validation work
        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
        self.scanner = sre_compile.compile(p)

    def iterscan(self, string, idx=0, context=None):
        """Yield ``(value, end_idx)`` for each match that has an action.

        string  -- text to scan.
        idx     -- offset in *string* at which scanning starts.
        context -- passed through as the second argument of every action.

        An action is called as ``action(match, context)`` and must return a
        ``(value, next_pos)`` pair.  When ``next_pos`` is not None and differs
        from the end of the match, scanning resumes at ``next_pos`` and that
        position is yielded as ``end_idx``.  Matches whose action is None are
        consumed without yielding.  Iteration stops when the scanner's
        ``match()`` returns None or when a match ends where the previous one
        ended.
        """
        match = self.scanner.scanner(string, idx).match
        actions = self.actions
        lastend = idx
        while True:
            m = match()
            if m is None:
                break
            matchend = m.end()
            if lastend == matchend:
                # The match ended where the previous one did: stop scanning.
                break
            action = actions[m.lastindex]
            if action is not None:
                rval, next_pos = action(m, context)
                if next_pos is not None and next_pos != matchend:
                    # "fast forward" the scanner
                    matchend = next_pos
                    match = self.scanner.scanner(string, matchend).match
                yield rval, matchend
            lastend = matchend
| 62 | |
| 63 | |
def pattern(pattern, flags=FLAGS):
    """Return a decorator that tags a function with its token regex.

    The decorated function receives two attributes: ``pattern``, the regex
    source exactly as given, and ``regex``, that source compiled with
    *flags*.  The function itself is returned, not a wrapper.
    """
    def attach(func):
        # Store the raw source first, then the compiled form.
        func.pattern = pattern
        func.regex = re.compile(pattern, flags)
        return func
    return attach