blob: 4b065abf753408b468672e2181dcbfa70f904aed [file] [log] [blame]
"""Iterator-based sre token scanner

"""
4
5import re
6import sre_parse
7import sre_compile
8import sre_constants
9
10from re import VERBOSE, MULTILINE, DOTALL
11from sre_constants import BRANCH, SUBPATTERN
12
# Names exported by a star-import of this module.
__all__ = [
    'Scanner',
    'pattern',
]
14
# Default regex flags shared by Scanner and pattern(): verbose syntax,
# ``^``/``$`` matching at line boundaries, and ``.`` matching newlines.
FLAGS = (VERBOSE | MULTILINE | DOTALL)


class Scanner(object):
    """Regex tokenizer driven by a lexicon of pattern-tagged callables.

    Every lexicon entry is a callable with a ``pattern`` attribute holding
    regex source.  The phrases are joined into one alternation in which
    entry *i* becomes capturing group *i + 1*; the number of the group that
    matched then selects the callable to run.

    NOTE(review): this is built on the undocumented ``sre_parse`` /
    ``sre_compile`` internals, whose shapes differ between interpreter
    versions (see the checks in ``__init__``) -- re-verify when targeting a
    new Python release.
    """

    def __init__(self, lexicon, flags=FLAGS):
        """Compile *lexicon* into a single compound pattern.

        lexicon -- iterable of callables, each exposing a ``pattern``
                   attribute (regex source string).
        flags   -- regex flags used when parsing every phrase.

        An invalid phrase raises ``sre_constants.error`` from the parser.
        """
        # Function-scope import: this module has no top-level ``import sys``.
        import sys

        # The parser state class is ``State`` on newer interpreters
        # (Python 3.8+) and ``Pattern`` on older ones.
        state_cls = getattr(sre_parse, 'State', None)
        if state_cls is None:
            state_cls = sre_parse.Pattern
        # Python 3.5+: the state counts groups itself (``groups`` is a
        # read-only property) and closegroup() takes the finished subpattern.
        managed_groups = sys.version_info >= (3, 5)
        # Python 3.6+: a SUBPATTERN node also carries add/del flag masks.
        flagged_groups = sys.version_info >= (3, 6)

        # Normalize enum flag objects to the plain int that the sre
        # internals are normally handed by ``re.compile``.
        flags = int(flags)

        # Slot 0 is a placeholder so a group number indexes its action
        # directly (group numbers start at 1).
        self.actions = [None]

        # Combine phrases into a compound pattern.
        state = state_cls()
        state.flags = flags
        branches = []
        for token in lexicon:
            if managed_groups:
                gid = state.opengroup()
            else:
                gid = len(branches) + 1
            # NOTE(review): each phrase is parsed with its own parser state,
            # so capturing groups inside a phrase are not renumbered against
            # the compound pattern -- presumably phrases should only use
            # non-capturing groups; confirm against the lexicons in use.
            parsed = sre_parse.parse(token.pattern, flags)
            if flagged_groups:
                node = (SUBPATTERN, (gid, 0, 0, parsed))
            else:
                node = (SUBPATTERN, (gid, parsed))
            subpattern = sre_parse.SubPattern(state, [node])
            if managed_groups:
                state.closegroup(gid, subpattern)
            branches.append(subpattern)
            self.actions.append(token)

        if not managed_groups:
            # Legacy states keep a plain counter that must be set by hand
            # for the compiled pattern to know its group count.
            state.groups = len(branches) + 1
        compound = sre_parse.SubPattern(state, [(BRANCH, (None, branches))])
        self.scanner = sre_compile.compile(compound)

    def iterscan(self, string, idx=0, context=None):
        """Yield ``(value, end_idx)`` for each token matched in *string*.

        string  -- text to scan.
        idx     -- offset at which scanning starts.
        context -- passed unchanged to every action as its second argument.

        Each action is called as ``action(match, context)`` and must return
        a ``(value, next_pos)`` pair.  When ``next_pos`` is None (or equals
        the match end) scanning continues right after the match; any other
        value restarts scanning at that offset, and that offset is what is
        reported as ``end_idx``.

        Iteration ends when no phrase matches, or when a match ends where
        the previous one ended (no progress was made).
        """
        match = self.scanner.scanner(string, idx).match
        actions = self.actions
        lastend = idx
        while True:
            m = match()
            if m is None:
                break
            matchend = m.end()
            if lastend == matchend:
                # Nothing was consumed since the last token; stop rather
                # than spin on the same position.
                break
            action = actions[m.lastindex]
            if action is not None:
                rval, next_pos = action(m, context)
                if next_pos is not None and next_pos != matchend:
                    # "Fast forward": the action consumed input on its own,
                    # so restart the low-level scanner where it left off.
                    matchend = next_pos
                    match = self.scanner.scanner(string, matchend).match
                yield rval, matchend
            lastend = matchend
62
63
def pattern(pattern, flags=FLAGS):
    """Build a decorator that tags a callable with a regular expression.

    pattern -- regex source string.
    flags   -- regex flags used to compile *pattern*.

    The decorated callable is returned as-is after gaining two attributes:
    ``pattern`` (the source string exactly as given) and ``regex`` (that
    source compiled with *flags*).  Compilation happens when the decorator
    is applied, so an invalid pattern raises at decoration time.
    """
    def decorate(func):
        # Store the raw source first, then the compiled form.
        func.pattern = pattern
        func.regex = re.compile(pattern, flags)
        return func

    return decorate