blob: 7b79f43d8d62be8593cf9bc9e9759f99dd445449 [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossum7627c0d2000-03-31 14:58:54 +00003#
4# re-compatible interface for the sre matching engine
5#
Fredrik Lundh770617b2001-01-14 15:06:11 +00006# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00007#
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00008# This version of the SRE library can be redistributed under CNRI's
9# Python 1.6 license. For any other use, please contact Secret Labs
10# AB (info@pythonware.com).
11#
Guido van Rossum7627c0d2000-03-31 14:58:54 +000012# Portions of this engine have been developed in cooperation with
Fredrik Lundh29c4ba92000-08-01 18:20:07 +000013# CNRI. Hewlett-Packard provided funding for 1.6 integration and
Guido van Rossum7627c0d2000-03-31 14:58:54 +000014# other compatibility work.
15#
16
Guido van Rossum7627c0d2000-03-31 14:58:54 +000017import sre_compile
Fredrik Lundh436c3d582000-06-29 08:58:44 +000018import sre_parse
Guido van Rossum7627c0d2000-03-31 14:58:54 +000019
Fredrik Lundhf2989b22001-02-18 12:05:16 +000020# public symbols
__all__ = [
    # functions
    "match", "search", "sub", "subn", "split", "findall",
    "compile", "purge", "template", "escape",
    # flags: short names first, then the long spellings
    "I", "L", "M", "S", "X", "U",
    "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE", "UNICODE",
    # exception
    "error",
]

__version__ = "2.1b2"
27
Fredrik Lundhf2989b22001-02-18 12:05:16 +000028# this module works under 1.5.2 and later. don't use string methods
29import string
Skip Montanaro0de65802001-02-15 22:15:14 +000030
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000031# flags
IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case
LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale
UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale
MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline
DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline
VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments

# single-letter aliases for the flags above
I = IGNORECASE
L = LOCALE
U = UNICODE
M = MULTILINE
S = DOTALL
X = VERBOSE

# sre extensions (experimental, don't rely on these)
TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking
T = TEMPLATE
DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation

# sre exception
error = sre_compile.error
Fredrik Lundh436c3d582000-06-29 08:58:44 +000045
Guido van Rossum7627c0d2000-03-31 14:58:54 +000046# --------------------------------------------------------------------
47# public interface
48
def match(pattern, string, flags=0):
    """Apply the pattern at the start of the string.

    Returns a match object, or None if no match was found."""
    compiled = _compile(pattern, flags)
    return compiled.match(string)
Guido van Rossum7627c0d2000-03-31 14:58:54 +000053
def search(pattern, string, flags=0):
    """Scan through the string looking for a match to the pattern.

    Returns a match object, or None if no match was found."""
    compiled = _compile(pattern, flags)
    return compiled.search(string)
Guido van Rossum7627c0d2000-03-31 14:58:54 +000058
def sub(pattern, repl, string, count=0, flags=0):
    """Return the string obtained by replacing the leftmost
    non-overlapping occurrences of the pattern in string by the
    replacement repl.

    count is the maximum number of replacements; 0 (the default)
    replaces every occurrence.  flags is used when pattern is given
    as a string that has to be compiled, like in match and search."""
    # flags comes last so existing positional calls keep working
    return _compile(pattern, flags).sub(repl, string, count)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000064
def subn(pattern, repl, string, count=0, flags=0):
    """Return a 2-tuple containing (new_string, number).
    new_string is the string obtained by replacing the leftmost
    non-overlapping occurrences of the pattern in the source
    string by the replacement repl.  number is the number of
    substitutions that were made.

    count is the maximum number of replacements; 0 (the default)
    replaces every occurrence.  flags is used when pattern is given
    as a string that has to be compiled, like in match and search."""
    # flags comes last so existing positional calls keep working
    return _compile(pattern, flags).subn(repl, string, count)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000072
def split(pattern, string, maxsplit=0, flags=0):
    """Split the source string by the occurrences of the pattern,
    returning a list containing the resulting substrings.

    maxsplit is the maximum number of splits; 0 (the default) means
    no limit.  flags is used when pattern is given as a string that
    has to be compiled, like in match and search."""
    # flags comes last so existing positional calls keep working
    return _compile(pattern, flags).split(string, maxsplit)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000077
def findall(pattern, string, flags=0):
    """Return a list of all non-overlapping matches in the string.

    If one or more groups are present in the pattern, return a
    list of groups; this will be a list of tuples if the pattern
    has more than one group.

    Empty matches are included in the result.

    flags is used when pattern is given as a string that has to be
    compiled, like in match and search."""
    return _compile(pattern, flags).findall(string)
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +000087
def compile(pattern, flags=0):
    "Compile a regular expression pattern and return the pattern object."
    compiled = _compile(pattern, flags)
    return compiled
91
def purge():
    "Clear the compiled pattern cache and the replacement template cache"
    for cache in (_cache, _cache_repl):
        cache.clear()
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +000096
def template(pattern, flags=0):
    "Compile a template pattern, returning a pattern object"
    # same as compile, with the TEMPLATE flag forced on
    template_flags = flags | TEMPLATE
    return _compile(pattern, template_flags)
100
def escape(pattern):
    "Backslash-escape every character in pattern that is not in a-z, A-Z or 0-9."
    escaped = []
    append = escaped.append
    for c in pattern:
        if "a" <= c <= "z" or "A" <= c <= "Z" or "0" <= c <= "9":
            append(c)
        elif c == "\000":
            # a null character is written as an octal escape
            append("\\000")
        else:
            append("\\" + c)
    # join into a string of the same type as the original pattern
    return _join(escaped, pattern)
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000112
113# --------------------------------------------------------------------
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000114# internals
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000115
# compiled patterns, keyed by the (pattern, flags) tuple passed to _compile
_cache = {}
# parsed replacement templates, keyed by the (repl, pattern) tuple
# passed to _compile_repl
_cache_repl = {}

# a cache is emptied completely once it holds this many entries
_MAXCACHE = 100
Guido van Rossum7627c0d2000-03-31 14:58:54 +0000120
def _join(seq, sep):
    # internal: join into string having the same type as sep
    #
    # only the type of sep matters: the items of seq are joined with
    # the empty string of that type (sep[:0]).
    empty = sep[:0]
    if hasattr(empty, "join"):
        # string objects have methods; this also works where the
        # string module no longer provides a join function
        return empty.join(seq)
    # 1.5.2: strings have no methods, use the string module instead
    return string.join(seq, empty)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000124
def _compile(*key):
    # internal: compile pattern
    #
    # key is the (pattern, flags) argument tuple; it is also used
    # as the cache key.  returns a compiled pattern object.  raises
    # error if the pattern is not a valid expression.
    p = _cache.get(key)
    if p is not None:
        return p
    pattern, flags = key
    if type(pattern) not in sre_compile.STRING_TYPES:
        # not a string: hand it back untouched (flags are ignored)
        return pattern
    try:
        p = sre_compile.compile(pattern, flags)
    except error:
        # invalid expression: raise the same exception instance again
        # from here.  written without the "except error, v" form so
        # that it is valid syntax in every python version.
        import sys
        raise sys.exc_info()[1]
    if len(_cache) >= _MAXCACHE:
        # cache is full: throw everything away and start over
        _cache.clear()
    _cache[key] = p
    return p
141
def _compile_repl(*key):
    # internal: compile replacement pattern
    #
    # key is the (repl, pattern) argument tuple; it is also used as
    # the cache key.  returns the parsed template.  raises error if
    # the replacement is not a valid template.
    p = _cache_repl.get(key)
    if p is not None:
        return p
    repl, pattern = key
    try:
        p = sre_parse.parse_template(repl, pattern)
    except error:
        # invalid expression: raise the same exception instance again
        # from here.  written without the "except error, v" form so
        # that it is valid syntax in every python version.
        import sys
        raise sys.exc_info()[1]
    if len(_cache_repl) >= _MAXCACHE:
        # cache is full: throw everything away and start over
        _cache_repl.clear()
    _cache_repl[key] = p
    return p
156
def _expand(pattern, match, template):
    # internal: match.expand implementation hook
    #
    # parses the template against the pattern (not cached), then
    # expands it using the given match object
    parsed = sre_parse.parse_template(template, pattern)
    return sre_parse.expand_template(parsed, match)
161
def _sub(pattern, template, text, count=0):
    # internal: pattern.sub implementation hook
    #
    # same as _subn (called with sub=1), keeping only the new string
    result = _subn(pattern, template, text, count, 1)
    return result[0]
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000165
def _subn(pattern, template, text, count=0, sub=0):
    # internal: pattern.subn implementation hook
    #
    # template is either a callable taking a match object, or a
    # replacement string.  a count of zero means no limit.  returns
    # a (new text, number of substitutions) tuple.
    if not callable(template):
        template = _compile_repl(template, pattern)
        literals = template[1]
        sub = 0 # temporarily disabled, see bug #449000
        if (sub and not count and pattern._isliteral() and
            len(literals) == 1 and literals[0]):
            # shortcut: both pattern and string are literals
            return string.replace(text, pattern.pattern, literals[0]), 0
        # bind the parsed template as a default argument
        def expand(match, template=template):
            return sre_parse.expand_template(template, match)
    else:
        expand = template
    pieces = []
    emit = pieces.append
    scanner = pattern.scanner(text)
    pos = 0
    replaced = 0
    while not count or replaced < count:
        m = scanner.search()
        if not m:
            break
        start, end = m.span()
        if pos < start:
            # copy the text between the previous match and this one
            emit(text[pos:start])
        emit(expand(m))
        pos = end
        replaced = replaced + 1
    # whatever follows the last match is copied unchanged
    emit(text[pos:])
    return _join(pieces, text[:0]), replaced
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000196
def _split(pattern, text, maxsplit=0):
    # internal: pattern.split implementation hook
    #
    # returns the list of substrings between matches; if the pattern
    # has groups, the groups of each match are added to the list as
    # well.  a maxsplit of zero means no limit.
    pieces = []
    scanner = pattern.scanner(text)
    has_groups = pattern.groups
    pos = 0
    splits = 0
    while not maxsplit or splits < maxsplit:
        m = scanner.search()
        if not m:
            break
        start, end = m.span()
        if start == end:
            # empty match: never split here
            if pos >= len(text):
                break
            continue
        pieces.append(text[pos:start])
        if has_groups:
            pieces.extend(list(m.groups()))
        pos = end
        splits = splits + 1
    # the remainder after the last split point
    pieces.append(text[pos:])
    return pieces
Fredrik Lundh0640e112000-06-30 13:55:15 +0000221
222# register myself for pickling
223
224import copy_reg
225
def _pickle(p):
    # reduce a compiled pattern to the call that rebuilds it:
    # _compile(pattern string, flags)
    args = (p.pattern, p.flags)
    return _compile, args

# the pattern type is obtained by compiling an empty pattern
copy_reg.pickle(type(_compile("", 0)), _pickle, _compile)
Fredrik Lundh7cafe4d2000-07-02 17:33:27 +0000230
231# --------------------------------------------------------------------
232# experimental stuff (see python-dev discussions for details)
233
class Scanner:
    # experimental: a lexicon is a sequence of (phrase, action)
    # pairs.  all phrases are combined into a single pattern, and
    # scan() applies that pattern over and over again to a string.

    def __init__(self, lexicon):
        from sre_constants import BRANCH, SUBPATTERN
        self.lexicon = lexicon
        # combine phrases into a compound pattern
        state = sre_parse.Pattern()
        branches = []
        for phrase, action in lexicon:
            # each phrase becomes a group numbered by its position
            group = (SUBPATTERN, (len(branches), sre_parse.parse(phrase)))
            branches.append(sre_parse.SubPattern(state, [group]))
        compound = sre_parse.SubPattern(state, [(BRANCH, (None, branches))])
        # NOTE(review): this is the length of the compound pattern,
        # not the number of phrases -- confirm that this is intended
        state.groups = len(compound)
        self.scanner = sre_compile.compile(compound)

    def scan(self, string):
        # returns (list of results, unscanned remainder of string).
        # scanning stops when nothing matches at the current position,
        # or when a match is empty.
        tokens = []
        emit = tokens.append
        match = self.scanner.match
        pos = 0
        while 1:
            m = match(string, pos)
            if not m:
                break
            end = m.end()
            if end == pos:
                break
            action = self.lexicon[m.lastindex][1]
            if callable(action):
                # make the current match available to the action
                self.match = m
                action = action(self, m.group())
            if action is not None:
                emit(action)
            pos = end
        return tokens, string[pos:]