blob: 061d8e8e3e686836d0184dc486a206035881f69b [file] [log] [blame]
#
# Secret Labs' Regular Expression Engine
#
# re-compatible interface for the sre matching engine
#
# Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
#
# This version of the SRE library can be redistributed under CNRI's
# Python 1.6 license.  For any other use, please contact Secret Labs
# AB (info@pythonware.com).
#
# Portions of this engine have been developed in cooperation with
# CNRI.  Hewlett-Packard provided funding for 1.6 integration and
# other compatibility work.
#
16
Guido van Rossum7627c0d2000-03-31 14:58:54 +000017import sre_compile
Fredrik Lundh436c3d582000-06-29 08:58:44 +000018import sre_parse
Guido van Rossum7627c0d2000-03-31 14:58:54 +000019
# names exported by a star-import of this module
__all__ = ["match","search","sub","subn","split","findall","compile",
           "purge","template","escape","I","L","M","S","X","U","IGNORECASE",
           "LOCALE","MULTILINE","DOTALL","VERBOSE","UNICODE","error"]

# flags
# (each flag is bound twice: a one-letter alias and a long name share
# the same sre_compile value)
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case
L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale
U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale
M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline
S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments

# sre extensions (experimental, don't rely on these)
# (T, TEMPLATE and DEBUG are not listed in __all__ above)
T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking
DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation

# sre exception
# (alias of sre_compile.error; _compile below catches and re-raises it)
error = sre_compile.error
Fredrik Lundh436c3d582000-06-29 08:58:44 +000038
Guido van Rossum7627c0d2000-03-31 14:58:54 +000039# --------------------------------------------------------------------
40# public interface
41
def match(pattern, string, flags=0):
    """Apply the pattern at the start of the string.

    The pattern is compiled with the given flags first.  Returns a
    match object, or None if no match was found."""
    compiled = _compile(pattern, flags)
    return compiled.match(string)
Guido van Rossum7627c0d2000-03-31 14:58:54 +000046
def search(pattern, string, flags=0):
    """Scan through the string looking for a match to the pattern.

    The pattern is compiled with the given flags first.  Returns a
    match object, or None if no match was found."""
    compiled = _compile(pattern, flags)
    return compiled.search(string)
Guido van Rossum7627c0d2000-03-31 14:58:54 +000051
def sub(pattern, repl, string, count=0):
    """Replace occurrences of the pattern in string by repl.

    The leftmost non-overlapping occurrences are replaced and the
    resulting string is returned.  The pattern is compiled without
    flags; count is handed on to the compiled pattern's sub method."""
    compiled = _compile(pattern, 0)
    return compiled.sub(repl, string, count)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000057
def subn(pattern, repl, string, count=0):
    """Like sub, but also report how many substitutions were made.

    Returns a 2-tuple (new_string, number): new_string is the source
    string with the leftmost non-overlapping occurrences of the
    pattern replaced by repl, and number is the number of
    substitutions that were made.  The pattern is compiled without
    flags; count is handed on to the compiled pattern's subn method."""
    compiled = _compile(pattern, 0)
    return compiled.subn(repl, string, count)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000065
def split(pattern, string, maxsplit=0):
    """Split the source string at the occurrences of the pattern.

    Returns a list containing the resulting substrings.  The pattern
    is compiled without flags; maxsplit is handed on to the compiled
    pattern's split method."""
    compiled = _compile(pattern, 0)
    return compiled.split(string, maxsplit)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000070
def findall(pattern, string, maxsplit=0):
    """Return a list of all non-overlapping matches in the string.

    If one or more groups are present in the pattern, return a
    list of groups; this will be a list of tuples if the pattern
    has more than one group.

    Empty matches are included in the result.

    The pattern is compiled without flags.  maxsplit is accepted only
    so that existing callers keep working; it has no effect."""
    # maxsplit was carried over from split()'s signature.  The compiled
    # pattern's findall() has no split limit -- its optional positional
    # arguments are search bounds -- so the value must not be forwarded.
    return _compile(pattern, 0).findall(string)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000080
def compile(pattern, flags=0):
    """Compile a regular expression pattern with the given flags.

    Returns the resulting pattern object."""
    compiled = _compile(pattern, flags)
    return compiled
84
def purge():
    """Empty the module-level cache of compiled regular expressions."""
    _cache.clear()
88
def template(pattern, flags=0):
    """Compile a template pattern and return the pattern object.

    Same as compile, except that the T (TEMPLATE) flag is always
    added to the given flags."""
    template_flags = flags | T
    return _compile(pattern, template_flags)
93
def escape(pattern):
    """Escape all non-alphanumeric characters in pattern.

    Characters in a-z, A-Z and 0-9 are kept as they are.  A null
    character becomes the four characters backslash, 0, 0, 0; every
    other character gets a backslash in front of it.  The result has
    the same string type as pattern."""
    escaped = []
    for c in pattern:
        if "a" <= c <= "z" or "A" <= c <= "Z" or "0" <= c <= "9":
            escaped.append(c)
        elif c == "\000":
            escaped.append("\\000")
        else:
            escaped.append("\\" + c)
    return _join(escaped, pattern)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000105
106# --------------------------------------------------------------------
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000107# internals
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000108
# compiled patterns, keyed by the (pattern, flags) argument tuple
# that was passed to _compile
_cache = {}
# once the cache holds this many entries, _compile empties it
# completely before storing the next pattern
_MAXCACHE = 100
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000111
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000112def _join(seq, sep):
113 # internal: join into string having the same type as sep
Eric S. Raymondb08b2d32001-02-09 11:10:16 +0000114 return sep[:0].join(seq)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000115
def _compile(*key):
    # internal: compile pattern
    # key is the (pattern, flags) argument tuple; it is used unchanged
    # as the cache key
    p = _cache.get(key)
    if p is not None:
        return p
    pattern, flags = key
    if type(pattern) not in sre_compile.STRING_TYPES:
        # not a string: hand the object back as it is, without applying
        # flags and without caching it (presumably an already compiled
        # pattern -- TODO confirm against callers)
        return pattern
    try:
        p = sre_compile.compile(pattern, flags)
    except error, v:
        raise error, v # invalid expression
    if len(_cache) >= _MAXCACHE:
        # cache is full: drop every entry instead of evicting selectively
        _cache.clear()
    _cache[key] = p
    return p
132
def _expand(pattern, match, template):
    # internal: match.expand implementation hook
    # parse the template against the pattern, then expand the parsed
    # form using the given match
    return sre_parse.expand_template(
        sre_parse.parse_template(template, pattern), match
        )
137
def _sub(pattern, template, string, count=0):
    # internal: pattern.sub implementation hook
    # same work as _subn; only the new string (item 0 of its result)
    # is returned, the substitution count is dropped
    result = _subn(pattern, template, string, count)
    return result[0]
141
def _subn(pattern, template, string, count=0):
    # internal: pattern.subn implementation hook
    # returns (new string, number of substitutions).  template is
    # either a callable, which is called with each match object, or a
    # template that is parsed once and expanded for each match.  a
    # count of zero means no limit
    if callable(template):
        replace = template
    else:
        parsed = sre_parse.parse_template(template, pattern)
        # bound through a default argument, not through a closure
        def replace(match, parsed=parsed):
            return sre_parse.expand_template(parsed, match)
    pieces = []
    done = 0 # substitutions made so far
    pos = 0 # end of the previous match
    scanner = pattern.scanner(string)
    while not count or done < count:
        m = scanner.search()
        if not m:
            break
        start, end = m.span()
        if pos < start:
            # keep the unmatched text in front of this match
            pieces.append(string[pos:start])
        pieces.append(replace(m))
        pos = end
        done = done + 1
    # keep whatever follows the last match
    pieces.append(string[pos:])
    return _join(pieces, string[:0]), done
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000166
167def _split(pattern, string, maxsplit=0):
168 # internal: pattern.split implementation hook
169 n = i = 0
170 s = []
171 append = s.append
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000172 extend = s.extend
173 c = pattern.scanner(string)
Fredrik Lundh01016fe2000-06-30 00:27:46 +0000174 g = pattern.groups
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000175 while not maxsplit or n < maxsplit:
176 m = c.search()
177 if not m:
178 break
Fredrik Lundh90a07912000-06-30 07:50:59 +0000179 b, e = m.span()
180 if b == e:
181 if i >= len(string):
182 break
183 continue
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000184 append(string[i:b])
Fredrik Lundh90a07912000-06-30 07:50:59 +0000185 if g and b != e:
Fredrik Lundh1c5aa692001-01-16 07:37:30 +0000186 extend(list(m.groups()))
Fredrik Lundh90a07912000-06-30 07:50:59 +0000187 i = e
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000188 n = n + 1
Fredrik Lundh80946112000-06-29 18:03:25 +0000189 append(string[i:])
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000190 return s
Fredrik Lundh0640e112000-06-30 13:55:15 +0000191
192# register myself for pickling
193
194import copy_reg
195
def _pickle(p):
    # internal: describe how to rebuild the pattern object p, as a
    # (callable, argument tuple) pair: call _compile with the
    # pattern's source and flags
    args = (p.pattern, p.flags)
    return _compile, args
198
# the pattern type is obtained by compiling an empty pattern; _pickle is
# registered as its reduction function and _compile as its constructor
copy_reg.pickle(type(_compile("", 0)), _pickle, _compile)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000200
201# --------------------------------------------------------------------
202# experimental stuff (see python-dev discussions for details)
203
class Scanner:
    """Experimental scanner that splits a string into lexicon phrases.

    lexicon is a sequence of (phrase, action) pairs, where phrase is a
    regular expression.  All phrases are combined into one compound
    pattern with one alternative per phrase.

    scan(string) applies that pattern repeatedly from the start of
    the string and returns a 2-tuple (result, rest): result is the
    list of values produced by the actions, and rest is the part of
    the string that was not consumed."""
    def __init__(self, lexicon):
        from sre_constants import BRANCH, SUBPATTERN
        self.lexicon = lexicon
        # combine phrases into a compound pattern
        p = []
        s = sre_parse.Pattern()
        for phrase, action in lexicon:
            # group 0 is the whole match, so the phrase at lexicon
            # index k is wrapped in group k+1
            p.append(sre_parse.SubPattern(s, [
                (SUBPATTERN, (len(p)+1, sre_parse.parse(phrase))),
                ]))
        # record the group count while p is still the list of phrases
        # (it is rebound to the combined pattern on the next line).
        # NOTE(review): the +1 assumes the count includes group 0 --
        # confirm against sre_parse.Pattern
        s.groups = len(p)+1
        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
        self.scanner = sre_compile.compile(p)
    def scan(self, string):
        result = []
        append = result.append
        match = self.scanner.match
        i = 0
        while 1:
            m = match(string, i)
            if not m:
                break
            j = m.end()
            if i == j:
                # empty match: stop, since no progress can be made
                break
            # lastindex is the 1-based number of the group that
            # matched, i.e. the phrase's lexicon index plus one
            action = self.lexicon[m.lastindex-1][1]
            if callable(action):
                # expose the match object to the action, then replace
                # the action by the value it returns
                self.match = m
                action = action(self, m.group())
            if action is not None:
                # a None action (or a None return value) emits nothing
                append(action)
            i = j
        return result, string[i:]