blob: a229e235ca202a22daa5411bcd483ae8ba221171 [file] [log] [blame]
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
Antoine Pitroud2cc7432014-02-03 20:59:59 +01002 cpython_only, captured_stdout
Benjamin Petersone48944b2012-03-07 14:50:25 -06003import io
Guido van Rossum8e0ce301997-07-11 19:34:44 +00004import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00005from re import Scanner
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02006import sre_compile
R David Murray26dfaac92013-04-14 13:00:54 -04007import sre_constants
Ezio Melottid2114eb2011-03-25 14:08:44 +02008import sys
9import string
10import traceback
Antoine Pitrou79aa68d2013-10-25 21:36:10 +020011import unittest
Raymond Hettinger027bb632004-05-31 03:09:25 +000012from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000013
Guido van Rossum23b22571997-07-17 22:36:14 +000014# Misc tests from Tim Peters' re.doc
15
Just van Rossum6802c6e2003-07-02 14:36:59 +000016# WARNING: Don't change details in these tests if you don't know
Ezio Melotti42da6632011-03-15 05:18:48 +020017# what you're doing. Some of these tests were carefully modeled to
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +000018# cover most of the code.
19
class S(str):
    """str subclass whose indexing and slicing results are S instances."""

    def __getitem__(self, index):
        piece = super().__getitem__(index)
        return S(piece)
23
class B(bytes):
    """bytes subclass whose __getitem__ wraps every result in B."""

    def __getitem__(self, index):
        piece = super().__getitem__(index)
        return B(piece)
27
Skip Montanaro8ed06da2003-04-24 19:43:18 +000028class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000029
def assertTypedEqual(self, actual, expect, msg=None):
    """Assert that *actual* equals *expect* and has the same types.

    Equality is checked first with assertEqual().  Tuples and lists in
    *expect* are then walked in parallel with *actual*, and every other
    value must be of exactly the same type as its counterpart.
    """
    self.assertEqual(actual, expect, msg)

    def check_types(got, wanted):
        if not isinstance(wanted, (tuple, list)):
            self.assertIs(type(got), type(wanted), msg)
            return
        for got_item, wanted_item in zip(got, wanted):
            check_types(got_item, wanted_item)

    check_types(actual, expect)
39
Benjamin Petersone48944b2012-03-07 14:50:25 -060040 def test_keep_buffer(self):
41 # See bug 14212
42 b = bytearray(b'x')
43 it = re.finditer(b'a', b)
44 with self.assertRaises(BufferError):
45 b.extend(b'x'*400)
46 list(it)
47 del it
48 gc_collect()
49 b.extend(b'x'*400)
50
Raymond Hettinger027bb632004-05-31 03:09:25 +000051 def test_weakref(self):
52 s = 'QabbbcR'
53 x = re.compile('ab+c')
54 y = proxy(x)
55 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
56
Skip Montanaro8ed06da2003-04-24 19:43:18 +000057 def test_search_star_plus(self):
58 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
59 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
60 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
61 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Skip Montanaro5ba00542003-04-25 16:00:14 +000062 self.assertEqual(re.search('x', 'aaa'), None)
Skip Montanaro8ed06da2003-04-24 19:43:18 +000063 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
64 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
65 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
66 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Skip Montanaro5ba00542003-04-25 16:00:14 +000067 self.assertEqual(re.match('a+', 'xxx'), None)
Guido van Rossum8430c581998-04-03 21:47:12 +000068
def bump_num(self, matchobj):
    """Return the integer matched by *matchobj*, plus one, as a string."""
    return str(int(matchobj.group(0)) + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000072
def test_basic_re_sub(self):
    # The result has the plain str/bytes type of the expected value,
    # whatever string-like types are passed in.
    self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
    self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
    self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
    self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
    for y in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')

    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    # A callable replacement; the last call limits the count to 3.
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                     '9.3 -3 23x99y')

    # Escapes are processed in a template string but not in the value
    # returned by a replacement function.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    # Raw strings keep the backslashes without relying on Python passing
    # unknown escape sequences through unchanged.
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D', 'a'),
                     '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))

    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000109
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000110 def test_bug_449964(self):
111 # fails for group followed by other escape
112 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
113 'xx\bxx\b')
114
115 def test_bug_449000(self):
116 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000117 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
118 'abc\ndef\n')
119 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
120 'abc\ndef\n')
121 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
122 'abc\ndef\n')
123 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
124 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000125
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000126 def test_bug_1661(self):
127 # Verify that flags do not get silently ignored with compiled patterns
128 pattern = re.compile('.')
129 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
130 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
131 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
132 self.assertRaises(ValueError, re.compile, pattern, re.I)
133
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000134 def test_bug_3629(self):
135 # A regex that triggered a bug in the sre-code validator
136 re.compile("(?P<quote>)(?(quote))")
137
def test_sub_template_numeric_escape(self):
    # bug 776311 and friends
    # (template, expected result) for templates that are octal escapes
    octal_cases = [
        (r'\0', '\0'),
        (r'\000', '\000'),
        (r'\001', '\001'),
        (r'\008', '\0' + '8'),
        (r'\009', '\0' + '9'),
        (r'\111', '\111'),
        (r'\117', '\117'),

        (r'\1111', '\1111'),
        (r'\1111', '\111' + '1'),

        (r'\00', '\x00'),
        (r'\07', '\x07'),
        (r'\08', '\0' + '8'),
        (r'\09', '\0' + '9'),
        (r'\0a', '\0' + 'a'),

        (r'\400', '\0'),
        (r'\777', '\377'),
    ]
    for template, expected in octal_cases:
        self.assertEqual(re.sub('x', template, 'x'), expected)

    # The pattern has no groups, so each of these templates must be
    # rejected.
    rejected = [
        r'\1', r'\8', r'\9', r'\11', r'\18', r'\1a', r'\90', r'\99',
        r'\118',  # r'\11' + '8'
        r'\11a',
        r'\181',  # r'\18' + '1'
        r'\800',  # r'\80' + '0'
    ]
    for template in rejected:
        self.assertRaises(re.error, re.sub, 'x', template, 'x')

    # in python2.3 (etc), these loop endlessly in sre_parser.py
    self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
    eleven_groups = '((((((((((y))))))))))(.)'
    self.assertEqual(re.sub(eleven_groups, r'\118', 'xyz'), 'xz8')
    self.assertEqual(re.sub(eleven_groups, r'\11a', 'xyz'), 'xza')
179
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000180 def test_qualified_re_sub(self):
181 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
182 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000183
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000184 def test_bug_114660(self):
185 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
186 'hello there')
187
def test_bug_462270(self):
    # Test for empty sub() behaviour, see SF bug #462270
    subject = 'abxd'
    for pattern, expected in (('x*', '-a-b-d-'), ('x+', 'ab-d')):
        self.assertEqual(re.sub(pattern, '-', subject), expected)
192
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200193 def test_symbolic_groups(self):
194 re.compile('(?P<a>x)(?P=a)(?(a)y)')
195 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
196 self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
197 self.assertRaises(re.error, re.compile, '(?Px)')
198 self.assertRaises(re.error, re.compile, '(?P=)')
199 self.assertRaises(re.error, re.compile, '(?P=1)')
200 self.assertRaises(re.error, re.compile, '(?P=a)')
201 self.assertRaises(re.error, re.compile, '(?P=a1)')
202 self.assertRaises(re.error, re.compile, '(?P=a.)')
203 self.assertRaises(re.error, re.compile, '(?P<)')
204 self.assertRaises(re.error, re.compile, '(?P<>)')
205 self.assertRaises(re.error, re.compile, '(?P<1>)')
206 self.assertRaises(re.error, re.compile, '(?P<a.>)')
207 self.assertRaises(re.error, re.compile, '(?())')
208 self.assertRaises(re.error, re.compile, '(?(a))')
209 self.assertRaises(re.error, re.compile, '(?(1a))')
210 self.assertRaises(re.error, re.compile, '(?(a.))')
Georg Brandl1d472b72013-04-14 11:40:00 +0200211 # New valid/invalid identifiers in Python 3
212 re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
213 re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
214 self.assertRaises(re.error, re.compile, '(?P<©>x)')
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200215
def test_symbolic_refs(self):
    # Templates are raw strings so the backslashes do not depend on
    # Python passing unknown escape sequences through unchanged.
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<a', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<a a>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<1a1>', 'xx')
    # A well-formed but unknown group name raises IndexError instead.
    self.assertRaises(IndexError, re.sub, '(?P<a>x)', r'\g<ab>', 'xx')
    # References to a group that did not take part in the match.
    self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', r'\2', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<-1>', 'xx')
    # New valid/invalid identifiers in Python 3
    self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
    self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000231
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000232 def test_re_subn(self):
233 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
234 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
235 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
236 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
237 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000238
def test_re_split(self):
    # The loop variables are named 'text'/'data' so they do not shadow
    # the 'string' module imported by this file.
    # str and str-subclass subjects: the pieces must be plain str.
    for text in ":a:b::c", S(":a:b::c"):
        self.assertTypedEqual(re.split(":", text),
                              ['', 'a', 'b', '', 'c'])
        self.assertTypedEqual(re.split(":*", text),
                              ['', 'a', 'b', 'c'])
        self.assertTypedEqual(re.split("(:*)", text),
                              ['', ':', 'a', ':', 'b', '::', 'c'])
    # bytes-like subjects: the pieces must be plain bytes.
    for data in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                 memoryview(b":a:b::c")):
        self.assertTypedEqual(re.split(b":", data),
                              [b'', b'a', b'b', b'', b'c'])
        self.assertTypedEqual(re.split(b":*", data),
                              [b'', b'a', b'b', b'c'])
        self.assertTypedEqual(re.split(b"(:*)", data),
                              [b'', b':', b'a', b':', b'b', b'::', b'c'])
    # Non-ASCII fields from three different code point ranges.
    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                    "\U0001d49c\U0001d49e\U0001d4b5"):
        text = ":%s:%s::%s" % (a, b, c)
        self.assertEqual(re.split(":", text), ['', a, b, '', c])
        self.assertEqual(re.split(":*", text), ['', a, b, c])
        self.assertEqual(re.split("(:*)", text),
                         ['', ':', a, ':', b, '::', c])

    self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
    self.assertEqual(re.split("(:)*", ":a:b::c"),
                     ['', ':', 'a', ':', 'b', ':', 'c'])
    self.assertEqual(re.split("([b:]+)", ":a:b::c"),
                     ['', ':', 'a', ':b::', 'c'])
    self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
                     ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c'])
    self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
                     ['', 'a', '', '', 'c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000273
def test_qualified_re_split(self):
    # Every call passes 2 as the third positional argument (maxsplit).
    cases = [
        (":", ":a:b::c", ['', 'a', 'b::c']),
        (':', 'a:b:c:d', ['a', 'b', 'c:d']),
        ("(:)", ":a:b::c", ['', ':', 'a', ':', 'b::c']),
        ("(:*)", ":a:b::c", ['', ':', 'a', ':', 'b::c']),
    ]
    for pattern, subject, expected in cases:
        self.assertEqual(re.split(pattern, subject, 2), expected)
Guido van Rossum49946571997-07-18 04:26:25 +0000281
def test_re_findall(self):
    # The loop variables are named 'text'/'data' so they do not shadow
    # the 'string' module imported by this file.
    self.assertEqual(re.findall(":+", "abc"), [])
    # str and str-subclass subjects: results must be plain str.
    for text in "a:b::c:::d", S("a:b::c:::d"):
        self.assertTypedEqual(re.findall(":+", text),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:+)", text),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:)(:*)", text),
                              [(":", ""), (":", ":"), (":", "::")])
    # bytes-like subjects: results must be plain bytes.
    for data in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
                 memoryview(b"a:b::c:::d")):
        self.assertTypedEqual(re.findall(b":+", data),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:+)", data),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:)(:*)", data),
                              [(b":", b""), (b":", b":"), (b":", b"::")])
    # The same checks with a non-ASCII character in place of ':'.
    for x in ("\xe0", "\u0430", "\U0001d49c"):
        xx = x * 2
        xxx = x * 3
        text = "a%sb%sc%sd" % (x, xx, xxx)
        self.assertEqual(re.findall("%s+" % x, text), [x, xx, xxx])
        self.assertEqual(re.findall("(%s+)" % x, text), [x, xx, xxx])
        self.assertEqual(re.findall("(%s)(%s*)" % (x, x), text),
                         [(x, ""), (x, x), (x, xx)])
Guido van Rossum49946571997-07-18 04:26:25 +0000307
Skip Montanaro5ba00542003-04-25 16:00:14 +0000308 def test_bug_117612(self):
309 self.assertEqual(re.findall(r"(a|(b))", "aba"),
310 [("a", ""),("b", "b"),("a", "")])
311
def test_re_match(self):
    # The loop variables are named 'text'/'data' so they do not shadow
    # the 'string' module imported by this file.
    for text in 'a', S('a'):
        self.assertEqual(re.match('a', text).groups(), ())
        self.assertEqual(re.match('(a)', text).groups(), ('a',))
        self.assertEqual(re.match('(a)', text).group(0), 'a')
        self.assertEqual(re.match('(a)', text).group(1), 'a')
        self.assertEqual(re.match('(a)', text).group(1, 1), ('a', 'a'))
    for data in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
        self.assertEqual(re.match(b'a', data).groups(), ())
        self.assertEqual(re.match(b'(a)', data).groups(), (b'a',))
        self.assertEqual(re.match(b'(a)', data).group(0), b'a')
        self.assertEqual(re.match(b'(a)', data).group(1), b'a')
        self.assertEqual(re.match(b'(a)', data).group(1, 1), (b'a', b'a'))
    for a in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.match(a, a).groups(), ())
        self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
        self.assertEqual(re.match('(%s)' % a, a).group(0), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))

    # Groups that did not take part in the match are None, or the
    # default passed to groups().
    pat = re.compile('((a)|(b))(c)?')
    self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
    self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
    self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
    self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
    self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

    # A single group; group() without arguments is checked as well as
    # the explicit group(0).
    m = re.match('(a)', 'a')
    self.assertEqual(m.group(), 'a')
    self.assertEqual(m.group(0), 'a')
    self.assertEqual(m.group(1), 'a')
    self.assertEqual(m.group(1, 1), ('a', 'a'))

    # Groups may be addressed by number, by name, or a mix of both.
    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                     (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000351
def test_re_fullmatch(self):
    # Issue 16203: Proposal: add re.fullmatch() method.
    # The loop variables are named 'text'/'data' so they do not shadow
    # the 'string' module imported by this file.
    self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
    for text in "ab", S("ab"):
        self.assertEqual(re.fullmatch(r"a|ab", text).span(), (0, 2))
    for data in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
        self.assertEqual(re.fullmatch(br"a|ab", data).span(), (0, 2))
    for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
        r = r"%s|%s" % (a, a + b)
        self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2))
    # Non-greedy repeats must still stretch to the end of the string.
    self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3))
    self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3))
    self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2))
    self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3))
    self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
    # Anything left over, including a trailing newline, is a failure.
    self.assertIsNone(re.fullmatch(r"a+", "ab"))
    self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
    self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
    self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
    self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
    self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4))
    self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2))

    # With pos/endpos the match has to cover exactly that window.
    self.assertEqual(
        re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
    self.assertEqual(
        re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
    self.assertEqual(
        re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
381
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000382 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000383 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
384 ('(', 'a'))
385 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
386 (None, 'a'))
387 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'), None)
388 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a'), None)
389 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
390 ('a', 'b'))
391 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
392 (None, 'd'))
393 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
394 (None, 'd'))
395 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
396 ('a', ''))
397
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000398 # Tests for bug #1177831: exercise groups other than the first group
399 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
400 self.assertEqual(p.match('abc').groups(),
401 ('a', 'b', 'c'))
402 self.assertEqual(p.match('ad').groups(),
403 ('a', None, 'd'))
404 self.assertEqual(p.match('abd'), None)
405 self.assertEqual(p.match('ac'), None)
406
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000407
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000408 def test_re_groupref(self):
409 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
410 ('|', 'a'))
411 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
412 (None, 'a'))
413 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', 'a|'), None)
414 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a'), None)
415 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
416 ('a', 'a'))
417 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
418 (None, None))
419
420 def test_groupdict(self):
421 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
422 'first second').groupdict(),
423 {'first':'first', 'second':'second'})
424
425 def test_expand(self):
426 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
427 "first second")
428 .expand(r"\2 \1 \g<second> \g<first>"),
429 "second first second first")
430
431 def test_repeat_minmax(self):
432 self.assertEqual(re.match("^(\w){1}$", "abc"), None)
433 self.assertEqual(re.match("^(\w){1}?$", "abc"), None)
434 self.assertEqual(re.match("^(\w){1,2}$", "abc"), None)
435 self.assertEqual(re.match("^(\w){1,2}?$", "abc"), None)
436
437 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
438 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
439 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
440 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
441 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
442 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
443 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
444 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
445
446 self.assertEqual(re.match("^x{1}$", "xxx"), None)
447 self.assertEqual(re.match("^x{1}?$", "xxx"), None)
448 self.assertEqual(re.match("^x{1,2}$", "xxx"), None)
449 self.assertEqual(re.match("^x{1,2}?$", "xxx"), None)
450
451 self.assertNotEqual(re.match("^x{3}$", "xxx"), None)
452 self.assertNotEqual(re.match("^x{1,3}$", "xxx"), None)
453 self.assertNotEqual(re.match("^x{1,4}$", "xxx"), None)
454 self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None)
455 self.assertNotEqual(re.match("^x{3}?$", "xxx"), None)
456 self.assertNotEqual(re.match("^x{1,3}?$", "xxx"), None)
457 self.assertNotEqual(re.match("^x{1,4}?$", "xxx"), None)
458 self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None)
459
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000460 self.assertEqual(re.match("^x{}$", "xxx"), None)
461 self.assertNotEqual(re.match("^x{}$", "x{}"), None)
462
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000463 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000464 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000465 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000466 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
467 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
468 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
469 {'first': 1, 'other': 2})
470
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000471 self.assertEqual(re.match("(a)", "a").pos, 0)
472 self.assertEqual(re.match("(a)", "a").endpos, 1)
473 self.assertEqual(re.match("(a)", "a").string, "a")
474 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
475 self.assertNotEqual(re.match("(a)", "a").re, None)
476
def test_special_escapes(self):
    # \b and \B with the default flags, LOCALE and UNICODE.
    for flags in (0, re.LOCALE, re.UNICODE):
        self.assertEqual(re.search(r"\b(b.)\b",
                                   "abcd abc bcd bx", flags).group(1), "bx")
        self.assertEqual(re.search(r"\B(b.)\B",
                                   "abc bcd bc abxd", flags).group(1), "bx")
    # Under MULTILINE, ^ and $ match at line boundaries, while \A and \Z
    # still match only at the ends of the string.
    self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
    # Character-class escapes with the same three flag settings.
    for flags in (0, re.LOCALE, re.UNICODE):
        self.assertEqual(re.search(r"\d\D\w\W\s\S",
                                   "1aa! a", flags).group(0), "1aa! a")
506
Ezio Melotti5a045b92012-02-29 11:48:44 +0200507 def test_string_boundaries(self):
508 # See http://bugs.python.org/issue10713
509 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
510 "abc")
511 # There's a word boundary at the start of a string.
512 self.assertTrue(re.match(r"\b", "abc"))
513 # A non-empty string includes a non-boundary zero-length match.
514 self.assertTrue(re.search(r"\B", "abc"))
515 # There is no non-boundary match at the start of a string.
516 self.assertFalse(re.match(r"\B", "abc"))
517 # However, an empty string contains no word boundaries, and also no
518 # non-boundaries.
519 self.assertEqual(re.search(r"\B", ""), None)
520 # This one is questionable and different from the perlre behaviour,
521 # but describes current behavior.
522 self.assertEqual(re.search(r"\b", ""), None)
523 # A single word-character string has two boundaries, but no
524 # non-boundary gaps.
525 self.assertEqual(len(re.findall(r"\b", "a")), 2)
526 self.assertEqual(len(re.findall(r"\B", "a")), 0)
527 # If there are no words, there are no boundaries
528 self.assertEqual(len(re.findall(r"\b", " ")), 0)
529 self.assertEqual(len(re.findall(r"\b", " ")), 0)
530 # Can match around the whitespace.
531 self.assertEqual(len(re.findall(r"\B", " ")), 2)
532
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000533 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000534 self.assertEqual(re.match("([\u2222\u2223])",
535 "\u2222").group(1), "\u2222")
536 self.assertEqual(re.match("([\u2222\u2223])",
537 "\u2222", re.UNICODE).group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300538 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
539 self.assertEqual(re.match(r,
540 "\uff01", re.UNICODE).group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000541
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100542 def test_big_codesize(self):
543 # Issue #1160
544 r = re.compile('|'.join(('%d'%x for x in range(10000))))
545 self.assertIsNotNone(r.match('1000'))
546 self.assertIsNotNone(r.match('9999'))
547
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000548 def test_anyall(self):
549 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
550 "a\nb")
551 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
552 "a\n\nb")
553
554 def test_non_consuming(self):
555 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
556 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
557 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
558 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
559 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
560 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
561 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
562
563 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
564 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
565 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
566 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
567
568 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000569 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
570 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000571 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
572 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
573 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
574 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
575 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
576 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
577 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
578 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
579
580 def test_category(self):
581 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
582
def test_getlower(self):
    import _sre
    # 'A' maps to 'a' with no flags, with LOCALE and with UNICODE.
    self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
    self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
    self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))

    # Case-insensitive matching, for a str and a bytes pattern.
    self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
    self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000591
592 def test_not_literal(self):
593 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
594 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
595
596 def test_search_coverage(self):
597 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
598 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
599
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    """Assert that matcher(pattern, text) succeeds with the given result.

    match and span have to be supplied together.  When both are left out,
    the pattern is expected to match the whole of text.
    """
    if (match is None) != (span is None):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if match is None:
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    found = matcher(pattern, text)
    self.assertTrue(found)
    self.assertEqual(found.group(), match)
    self.assertEqual(found.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000613
def test_re_escape(self):
    # Characters that escape() is expected to leave alone.
    unescaped = string.ascii_letters + string.digits + '_'
    every_char = ''.join(map(chr, range(256)))
    for ch in every_char:
        if ch in unescaped:
            expected = ch
        elif ch == '\x00':
            expected = '\\000'
        else:
            expected = '\\' + ch
        self.assertEqual(re.escape(ch), expected)
        # Whatever escape() produced must match the character itself.
        self.assertMatch(re.escape(ch), ch)
    self.assertMatch(re.escape(every_char), every_char)
Guido van Rossum49946571997-07-18 04:26:25 +0000626
def test_re_escape_byte(self):
    # Bytes that escape() is expected to leave alone.
    unescaped = (string.ascii_letters + string.digits + '_').encode('ascii')
    every_byte = bytes(range(256))
    for value in every_byte:
        single = bytes([value])
        if single in unescaped:
            expected = single
        elif value == 0:
            expected = b'\\000'
        else:
            expected = b'\\' + single
        self.assertEqual(re.escape(single), expected)
        # Whatever escape() produced must match the byte itself.
        self.assertMatch(re.escape(single), single)
    self.assertMatch(re.escape(every_byte), every_byte)
Guido van Rossum698280d2008-09-10 17:44:35 +0000640
def test_re_escape_non_ascii(self):
    text = 'xxx\u2620\u2620\u2620xxx'
    escaped = re.escape(text)
    self.assertEqual(escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
    self.assertMatch(escaped, text)
    # An escaped non-ASCII character can still be repeated with '+'.
    repeated = '.%s+.' % re.escape('\u2620')
    self.assertMatch(repeated, text,
                     'x\u2620\u2620\u2620x', (2, 7), re.search)
648
def test_re_escape_non_ascii_bytes(self):
    data = 'y\u2620y\u2620y'.encode('utf-8')
    escaped = re.escape(data)
    self.assertEqual(escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
    self.assertMatch(escaped, data)
    # The escaped UTF-8 sequence occurs twice in data.
    needle = re.escape('\u2620'.encode('utf-8'))
    hits = re.findall(needle, data)
    self.assertEqual(len(hits), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000656
def pickle_test(self, pickle):
    """Round-trip a compiled pattern through pickle.dumps()/pickle.loads().

    pickle is the module (or module-like object) providing dumps/loads.
    """
    original = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
    restored = pickle.loads(pickle.dumps(original))
    self.assertEqual(original, restored)
Guido van Rossum23b22571997-07-17 22:36:14 +0000662
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000663 def test_constants(self):
664 self.assertEqual(re.I, re.IGNORECASE)
665 self.assertEqual(re.L, re.LOCALE)
666 self.assertEqual(re.M, re.MULTILINE)
667 self.assertEqual(re.S, re.DOTALL)
668 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000669
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000670 def test_flags(self):
Skip Montanaro1e703c62003-04-25 15:40:28 +0000671 for flag in [re.I, re.M, re.X, re.S, re.L]:
672 self.assertNotEqual(re.compile('^pattern$', flag), None)
Guido van Rossumf473cb01998-01-14 16:42:17 +0000673
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000674 def test_sre_character_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200675 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
676 if i < 256:
677 self.assertIsNotNone(re.match(r"\%03o" % i, chr(i)))
678 self.assertIsNotNone(re.match(r"\%03o0" % i, chr(i)+"0"))
679 self.assertIsNotNone(re.match(r"\%03o8" % i, chr(i)+"8"))
680 self.assertIsNotNone(re.match(r"\x%02x" % i, chr(i)))
681 self.assertIsNotNone(re.match(r"\x%02x0" % i, chr(i)+"0"))
682 self.assertIsNotNone(re.match(r"\x%02xz" % i, chr(i)+"z"))
683 if i < 0x10000:
684 self.assertIsNotNone(re.match(r"\u%04x" % i, chr(i)))
685 self.assertIsNotNone(re.match(r"\u%04x0" % i, chr(i)+"0"))
686 self.assertIsNotNone(re.match(r"\u%04xz" % i, chr(i)+"z"))
687 self.assertIsNotNone(re.match(r"\U%08x" % i, chr(i)))
688 self.assertIsNotNone(re.match(r"\U%08x0" % i, chr(i)+"0"))
689 self.assertIsNotNone(re.match(r"\U%08xz" % i, chr(i)+"z"))
690 self.assertIsNotNone(re.match(r"\0", "\000"))
691 self.assertIsNotNone(re.match(r"\08", "\0008"))
692 self.assertIsNotNone(re.match(r"\01", "\001"))
693 self.assertIsNotNone(re.match(r"\018", "\0018"))
694 self.assertIsNotNone(re.match(r"\567", chr(0o167)))
695 self.assertRaises(re.error, re.match, r"\911", "")
696 self.assertRaises(re.error, re.match, r"\x1", "")
697 self.assertRaises(re.error, re.match, r"\x1z", "")
698 self.assertRaises(re.error, re.match, r"\u123", "")
699 self.assertRaises(re.error, re.match, r"\u123z", "")
700 self.assertRaises(re.error, re.match, r"\U0001234", "")
701 self.assertRaises(re.error, re.match, r"\U0001234z", "")
702 self.assertRaises(re.error, re.match, r"\U00110000", "")
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000703
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000704 def test_sre_character_class_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200705 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
706 if i < 256:
707 self.assertIsNotNone(re.match(r"[\%o]" % i, chr(i)))
708 self.assertIsNotNone(re.match(r"[\%o8]" % i, chr(i)))
709 self.assertIsNotNone(re.match(r"[\%03o]" % i, chr(i)))
710 self.assertIsNotNone(re.match(r"[\%03o0]" % i, chr(i)))
711 self.assertIsNotNone(re.match(r"[\%03o8]" % i, chr(i)))
712 self.assertIsNotNone(re.match(r"[\x%02x]" % i, chr(i)))
713 self.assertIsNotNone(re.match(r"[\x%02x0]" % i, chr(i)))
714 self.assertIsNotNone(re.match(r"[\x%02xz]" % i, chr(i)))
715 if i < 0x10000:
716 self.assertIsNotNone(re.match(r"[\u%04x]" % i, chr(i)))
717 self.assertIsNotNone(re.match(r"[\u%04x0]" % i, chr(i)))
718 self.assertIsNotNone(re.match(r"[\u%04xz]" % i, chr(i)))
719 self.assertIsNotNone(re.match(r"[\U%08x]" % i, chr(i)))
720 self.assertIsNotNone(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
721 self.assertIsNotNone(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
Ezio Melottieadece22013-02-23 08:40:07 +0200722 self.assertIsNotNone(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200723 self.assertRaises(re.error, re.match, r"[\911]", "")
724 self.assertRaises(re.error, re.match, r"[\x1z]", "")
725 self.assertRaises(re.error, re.match, r"[\u123z]", "")
726 self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
727 self.assertRaises(re.error, re.match, r"[\U00110000]", "")
728
729 def test_sre_byte_literals(self):
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000730 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Antoine Pitrou463badf2012-06-23 13:29:19 +0200731 self.assertIsNotNone(re.match((r"\%03o" % i).encode(), bytes([i])))
732 self.assertIsNotNone(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
733 self.assertIsNotNone(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
734 self.assertIsNotNone(re.match((r"\x%02x" % i).encode(), bytes([i])))
735 self.assertIsNotNone(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
736 self.assertIsNotNone(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
737 self.assertIsNotNone(re.match(br"\u", b'u'))
738 self.assertIsNotNone(re.match(br"\U", b'U'))
739 self.assertIsNotNone(re.match(br"\0", b"\000"))
740 self.assertIsNotNone(re.match(br"\08", b"\0008"))
741 self.assertIsNotNone(re.match(br"\01", b"\001"))
742 self.assertIsNotNone(re.match(br"\018", b"\0018"))
743 self.assertIsNotNone(re.match(br"\567", bytes([0o167])))
744 self.assertRaises(re.error, re.match, br"\911", b"")
745 self.assertRaises(re.error, re.match, br"\x1", b"")
746 self.assertRaises(re.error, re.match, br"\x1z", b"")
747
748 def test_sre_byte_class_literals(self):
749 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
750 self.assertIsNotNone(re.match((r"[\%o]" % i).encode(), bytes([i])))
751 self.assertIsNotNone(re.match((r"[\%o8]" % i).encode(), bytes([i])))
752 self.assertIsNotNone(re.match((r"[\%03o]" % i).encode(), bytes([i])))
753 self.assertIsNotNone(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
754 self.assertIsNotNone(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
755 self.assertIsNotNone(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
756 self.assertIsNotNone(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
757 self.assertIsNotNone(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
758 self.assertIsNotNone(re.match(br"[\u]", b'u'))
759 self.assertIsNotNone(re.match(br"[\U]", b'U'))
760 self.assertRaises(re.error, re.match, br"[\911]", "")
761 self.assertRaises(re.error, re.match, br"[\x1z]", "")
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000762
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000763 def test_bug_113254(self):
764 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
765 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
766 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
767
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000768 def test_bug_527371(self):
769 # bug described in patches 527371/672491
770 self.assertEqual(re.match(r'(a)?a','a').lastindex, None)
771 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
772 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
773 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
774 self.assertEqual(re.match("((a))", "a").lastindex, 1)
775
776 def test_bug_545855(self):
777 # bug 545855 -- This pattern failed to cause a compile error as it
778 # should, instead provoking a TypeError.
779 self.assertRaises(re.error, re.compile, 'foo[a-')
780
781 def test_bug_418626(self):
782 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
783 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
784 # pattern '*?' on a long string.
785 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
786 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
787 20003)
788 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000789 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +0000790 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000791 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000792
793 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000794 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000795 self.assertEqual(re.compile(pat) and 1, 1)
796
Skip Montanaro1e703c62003-04-25 15:40:28 +0000797 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000798 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +0000799 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000800 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
801 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
802 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +0000803
Serhiy Storchakafa468162013-02-16 21:23:53 +0200804 def test_unlimited_zero_width_repeat(self):
805 # Issue #9669
806 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
807 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
808 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
809 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
810 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
811 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
812
Skip Montanaro1e703c62003-04-25 15:40:28 +0000813 def test_scanner(self):
814 def s_ident(scanner, token): return token
815 def s_operator(scanner, token): return "op%s" % token
816 def s_float(scanner, token): return float(token)
817 def s_int(scanner, token): return int(token)
818
819 scanner = Scanner([
820 (r"[a-zA-Z_]\w*", s_ident),
821 (r"\d+\.\d*", s_float),
822 (r"\d+", s_int),
823 (r"=|\+|-|\*|/", s_operator),
824 (r"\s+", None),
825 ])
826
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000827 self.assertNotEqual(scanner.scanner.scanner("").pattern, None)
828
Skip Montanaro1e703c62003-04-25 15:40:28 +0000829 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
830 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
831 'op+', 'bar'], ''))
832
Skip Montanaro5ba00542003-04-25 16:00:14 +0000833 def test_bug_448951(self):
834 # bug 448951 (similar to 429357, but with single char match)
835 # (Also test greedy matches.)
836 for op in '','?','*':
837 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
838 (None, None))
839 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
840 ('a:', 'a'))
841
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000842 def test_bug_725106(self):
843 # capturing groups in alternatives in repeats
844 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
845 ('b', 'a'))
846 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
847 ('c', 'b'))
848 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
849 ('b', None))
850 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
851 ('b', None))
852 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
853 ('b', 'a'))
854 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
855 ('c', 'b'))
856 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
857 ('b', None))
858 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
859 ('b', None))
860
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000861 def test_bug_725149(self):
862 # mark_stack_base restoring before restoring marks
863 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
864 ('a', None))
865 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
866 ('a', None, None))
867
Just van Rossum12723ba2003-07-02 20:03:04 +0000868 def test_bug_764548(self):
869 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000870 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +0000871 pat = re.compile(my_unicode("abc"))
872 self.assertEqual(pat.match("xyz"), None)
873
Skip Montanaro5ba00542003-04-25 16:00:14 +0000874 def test_finditer(self):
875 iter = re.finditer(r":+", "a:b::c:::d")
876 self.assertEqual([item.group(0) for item in iter],
877 [":", "::", ":::"])
878
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600879 pat = re.compile(r":+")
880 iter = pat.finditer("a:b::c:::d", 1, 10)
881 self.assertEqual([item.group(0) for item in iter],
882 [":", "::", ":::"])
883
884 pat = re.compile(r":+")
885 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
886 self.assertEqual([item.group(0) for item in iter],
887 [":", "::", ":::"])
888
889 pat = re.compile(r":+")
890 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
891 self.assertEqual([item.group(0) for item in iter],
892 [":", "::", ":::"])
893
894 pat = re.compile(r":+")
895 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
896 self.assertEqual([item.group(0) for item in iter],
897 ["::", "::"])
898
Thomas Wouters40a088d2008-03-18 20:19:54 +0000899 def test_bug_926075(self):
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000900 self.assertTrue(re.compile('bug_926075') is not
Thomas Wouters40a088d2008-03-18 20:19:54 +0000901 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +0000902
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000903 def test_bug_931848(self):
Guido van Rossum7ebb9702007-05-15 21:39:58 +0000904 pattern = eval('"[\u002E\u3002\uFF0E\uFF61]"')
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000905 self.assertEqual(re.compile(pattern).split("a.b.c"),
906 ['a','b','c'])
907
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000908 def test_bug_581080(self):
909 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +0000910 self.assertEqual(next(iter).span(), (1,2))
911 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000912
913 scanner = re.compile(r"\s").scanner("a b")
914 self.assertEqual(scanner.search().span(), (1, 2))
915 self.assertEqual(scanner.search(), None)
916
917 def test_bug_817234(self):
918 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +0000919 self.assertEqual(next(iter).span(), (0, 4))
920 self.assertEqual(next(iter).span(), (4, 4))
921 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000922
Mark Dickinson1f268282009-07-28 17:22:36 +0000923 def test_bug_6561(self):
924 # '\d' should match characters in Unicode category 'Nd'
925 # (Number, Decimal Digit), but not those in 'Nl' (Number,
926 # Letter) or 'No' (Number, Other).
927 decimal_digits = [
928 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
929 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
930 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
931 ]
932 for x in decimal_digits:
933 self.assertEqual(re.match('^\d$', x).group(0), x)
934
935 not_decimal_digits = [
936 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
937 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
938 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
939 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
940 ]
941 for x in not_decimal_digits:
942 self.assertIsNone(re.match('^\d$', x))
943
Guido van Rossumd8faa362007-04-27 19:54:29 +0000944 def test_empty_array(self):
945 # SF buf 1647541
946 import array
Guido van Rossum166746c2007-07-03 15:39:16 +0000947 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +0000948 a = array.array(typecode)
Antoine Pitroufd036452008-08-19 17:56:33 +0000949 self.assertEqual(re.compile(b"bla").match(a), None)
950 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000951
Christian Heimes072c0f12008-01-03 23:01:04 +0000952 def test_inline_flags(self):
953 # Bug #1700
Christian Heimes2e1d0f02008-01-04 00:47:51 +0000954 upper_char = chr(0x1ea0) # Latin Capital Letter A with Dot Bellow
955 lower_char = chr(0x1ea1) # Latin Small Letter A with Dot Bellow
Christian Heimes072c0f12008-01-03 23:01:04 +0000956
957 p = re.compile(upper_char, re.I | re.U)
958 q = p.match(lower_char)
959 self.assertNotEqual(q, None)
960
961 p = re.compile(lower_char, re.I | re.U)
962 q = p.match(upper_char)
963 self.assertNotEqual(q, None)
964
965 p = re.compile('(?i)' + upper_char, re.U)
966 q = p.match(lower_char)
967 self.assertNotEqual(q, None)
968
969 p = re.compile('(?i)' + lower_char, re.U)
970 q = p.match(upper_char)
971 self.assertNotEqual(q, None)
972
973 p = re.compile('(?iu)' + upper_char)
974 q = p.match(lower_char)
975 self.assertNotEqual(q, None)
976
977 p = re.compile('(?iu)' + lower_char)
978 q = p.match(upper_char)
979 self.assertNotEqual(q, None)
980
Christian Heimes25bb7832008-01-11 16:17:00 +0000981 def test_dollar_matches_twice(self):
982 "$ matches the end of string, and just before the terminating \n"
983 pattern = re.compile('$')
984 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
985 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
986 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
987
988 pattern = re.compile('$', re.MULTILINE)
989 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
990 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
991 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
992
Antoine Pitroufd036452008-08-19 17:56:33 +0000993 def test_bytes_str_mixing(self):
994 # Mixing str and bytes is disallowed
995 pat = re.compile('.')
996 bpat = re.compile(b'.')
997 self.assertRaises(TypeError, pat.match, b'b')
998 self.assertRaises(TypeError, bpat.match, 'b')
999 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1000 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1001 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1002 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1003 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1004 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1005
1006 def test_ascii_and_unicode_flag(self):
1007 # String patterns
1008 for flags in (0, re.UNICODE):
1009 pat = re.compile('\xc0', flags | re.IGNORECASE)
1010 self.assertNotEqual(pat.match('\xe0'), None)
1011 pat = re.compile('\w', flags)
1012 self.assertNotEqual(pat.match('\xe0'), None)
1013 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
1014 self.assertEqual(pat.match('\xe0'), None)
1015 pat = re.compile('(?a)\xc0', re.IGNORECASE)
1016 self.assertEqual(pat.match('\xe0'), None)
1017 pat = re.compile('\w', re.ASCII)
1018 self.assertEqual(pat.match('\xe0'), None)
1019 pat = re.compile('(?a)\w')
1020 self.assertEqual(pat.match('\xe0'), None)
1021 # Bytes patterns
1022 for flags in (0, re.ASCII):
1023 pat = re.compile(b'\xc0', re.IGNORECASE)
1024 self.assertEqual(pat.match(b'\xe0'), None)
1025 pat = re.compile(b'\w')
1026 self.assertEqual(pat.match(b'\xe0'), None)
1027 # Incompatibilities
1028 self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
1029 self.assertRaises(ValueError, re.compile, b'(?u)\w')
1030 self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
1031 self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
1032 self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
1033 self.assertRaises(ValueError, re.compile, '(?au)\w')
1034
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001035 def test_bug_6509(self):
1036 # Replacement strings of both types must parse properly.
1037 # all strings
1038 pat = re.compile('a(\w)')
1039 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1040 pat = re.compile('a(.)')
1041 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1042 pat = re.compile('..')
1043 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1044
1045 # all bytes
1046 pat = re.compile(b'a(\w)')
1047 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1048 pat = re.compile(b'a(.)')
1049 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1050 pat = re.compile(b'..')
1051 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1052
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001053 def test_dealloc(self):
1054 # issue 3299: check for segfault in debug build
1055 import _sre
Ezio Melotti0f77f462010-01-23 10:49:39 +00001056 # the overflow limit is different on wide and narrow builds and it
1057 # depends on the definition of SRE_CODE (see sre.h).
1058 # 2**128 should be big enough to overflow on both. For smaller values
1059 # a RuntimeError is raised instead of OverflowError.
1060 long_overflow = 2**128
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001061 self.assertRaises(TypeError, re.finditer, "a", {})
1062 self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
Victor Stinner5abeafb2010-03-04 21:59:53 +00001063 self.assertRaises(TypeError, _sre.compile, {}, 0, [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 def test_search_dot_unicode(self):
1066 self.assertIsNotNone(re.search("123.*-", '123abc-'))
1067 self.assertIsNotNone(re.search("123.*-", '123\xe9-'))
1068 self.assertIsNotNone(re.search("123.*-", '123\u20ac-'))
1069 self.assertIsNotNone(re.search("123.*-", '123\U0010ffff-'))
1070 self.assertIsNotNone(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
1071
Ezio Melottidf723e12012-03-13 01:29:48 +02001072 def test_compile(self):
1073 # Test return value when given string and pattern as parameter
1074 pattern = re.compile('random pattern')
1075 self.assertIsInstance(pattern, re._pattern_type)
1076 same_pattern = re.compile(pattern)
1077 self.assertIsInstance(same_pattern, re._pattern_type)
1078 self.assertIs(same_pattern, pattern)
1079 # Test behaviour when not given a string or pattern as parameter
1080 self.assertRaises(TypeError, re.compile, 0)
1081
Ezio Melottife8e6e72013-01-11 08:32:01 +02001082 def test_bug_13899(self):
1083 # Issue #13899: re pattern r"[\A]" should work like "A" but matches
1084 # nothing. Ditto B and Z.
1085 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
1086 ['A', 'B', '\b', 'C', 'Z'])
1087
@bigmemtest(size=_2G, memuse=1)
def test_large_search(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    haystack = 'a' * size
    found = re.search('$', haystack)
    self.assertIsNotNone(found)
    # '$' can only match at the very end of the subject.
    self.assertEqual(found.start(), size)
    self.assertEqual(found.end(), size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001096
# The huge memuse is because of re.sub() using a list and a join()
# to create the replacement result.
@bigmemtest(size=_2G, memuse=16 + 2)
def test_large_subn(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    haystack = 'a' * size
    result, count = re.subn('', '', haystack)
    self.assertEqual(result, haystack)
    # One empty match before every character plus one at the end.
    self.assertEqual(count, size + 1)
1106
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001107 def test_bug_16688(self):
1108 # Issue 16688: Backreferences make case-insensitive regex fail on
1109 # non-ASCII strings.
1110 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1111 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001112
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001113 def test_repeat_minmax_overflow(self):
1114 # Issue #13169
1115 string = "x" * 100000
1116 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1117 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1118 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1119 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1120 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1121 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1122 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1123 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1124 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1125 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1126 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1127
@cpython_only
def test_repeat_minmax_overflow_maxrepeat(self):
    try:
        from _sre import MAXREPEAT
    except ImportError:
        self.skipTest('requires _sre.MAXREPEAT constant')
    subject = "x" * 100000
    # One below MAXREPEAT is still an ordinary (if enormous) repeat count.
    just_below = MAXREPEAT - 1
    self.assertIsNone(re.match(r".{%d}" % just_below, subject))
    self.assertEqual(re.match(r".{,%d}" % just_below, subject).span(),
                     (0, 100000))
    self.assertIsNone(re.match(r".{%d,}?" % just_below, subject))
    # MAXREPEAT itself is refused in every position.
    for template in (r".{%d}", r".{,%d}", r".{%d,}?"):
        self.assertRaises(OverflowError, re.compile, template % MAXREPEAT)
1142
R David Murray26dfaac92013-04-14 13:00:54 -04001143 def test_backref_group_name_in_exception(self):
1144 # Issue 17341: Poor error message when compiling invalid regex
1145 with self.assertRaisesRegex(sre_constants.error, '<foo>'):
1146 re.compile('(?P=<foo>)')
1147
1148 def test_group_name_in_exception(self):
1149 # Issue 17341: Poor error message when compiling invalid regex
1150 with self.assertRaisesRegex(sre_constants.error, '\?foo'):
1151 re.compile('(?P<?foo>)')
1152
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001153 def test_issue17998(self):
1154 for reps in '*', '+', '?', '{1}':
1155 for mod in '', '?':
1156 pattern = '.' + reps + mod + 'yz'
1157 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1158 ['xyz'], msg=pattern)
1159 pattern = pattern.encode()
1160 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1161 [b'xyz'], msg=pattern)
1162
def test_match_repr(self):
    # repr() of a match object is built from the module and qualified name
    # of its type, the span, and the matched text.
    # str subjects, including a str subclass
    for subject in '[abracadabra]', S('[abracadabra]'):
        m = re.search(r'(.+)(.*?)\1', subject)
        self.assertEqual(repr(m), "<%s.%s object; "
                                  "span=(1, 12), match='abracadabra'>" %
                         (type(m).__module__, type(m).__qualname__))
    # bytes-like subjects, including a bytes subclass
    for subject in (b'[abracadabra]', B(b'[abracadabra]'),
                    bytearray(b'[abracadabra]'),
                    memoryview(b'[abracadabra]')):
        m = re.search(rb'(.+)(.*?)\1', subject)
        self.assertEqual(repr(m), "<%s.%s object; "
                                  "span=(1, 12), match=b'abracadabra'>" %
                         (type(m).__module__, type(m).__qualname__))

    # Two matches from one finditer() call; each expected repr is derived
    # from the match it belongs to (the first used to take its module
    # name from the second match).
    first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
    self.assertEqual(repr(first), "<%s.%s object; "
                                  "span=(0, 2), match='aa'>" %
                     (type(first).__module__, type(first).__qualname__))
    self.assertEqual(repr(second), "<%s.%s object; "
                                   "span=(3, 5), match='bb'>" %
                     (type(second).__module__, type(second).__qualname__))
1184
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001185
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001186 def test_bug_2537(self):
1187 # issue 2537: empty submatches
1188 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1189 for inner_op in ('{0,}', '*', '?'):
1190 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1191 m = r.match("xyyzy")
1192 self.assertEqual(m.group(0), "xyy")
1193 self.assertEqual(m.group(1), "")
1194 self.assertEqual(m.group(2), "y")
1195
def test_debug_flag(self):
    expected_dump = ['literal 102 ', 'literal 111 ', 'literal 111 ']
    # Compile the same pattern twice.
    # Debug output is output again even a second time (bypassing
    # the cache -- issue #20426).
    for _attempt in range(2):
        with captured_stdout() as out:
            re.compile('foo', re.DEBUG)
        self.assertEqual(out.getvalue().splitlines(), expected_dump)
1207
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001208
class PatternReprTests(unittest.TestCase):
    # Checks of repr() for compiled pattern objects.

    def check(self, pattern, expected):
        # repr() of pattern compiled without a flags argument.
        compiled = re.compile(pattern)
        self.assertEqual(repr(compiled), expected)

    def check_flags(self, pattern, flags, expected):
        # repr() of pattern compiled with an explicit flags argument.
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        self.check('random pattern',
                   "re.compile('random pattern')")

    def test_single_flag(self):
        self.check_flags('random pattern', re.IGNORECASE,
                         "re.compile('random pattern', re.IGNORECASE)")

    def test_multiple_flags(self):
        combined = re.I|re.S|re.X
        self.check_flags('random pattern', combined,
                         "re.compile('random pattern', "
                         "re.IGNORECASE|re.DOTALL|re.VERBOSE)")

    def test_unicode_flag(self):
        # re.U is not shown for a str pattern, alone or combined.
        self.check_flags('random pattern', re.U,
                         "re.compile('random pattern')")
        self.check_flags('random pattern', re.I|re.S|re.U,
                         "re.compile('random pattern', "
                         "re.IGNORECASE|re.DOTALL)")

    def test_inline_flags(self):
        # A flag set inside the pattern is reported like a flags argument.
        self.check('(?i)pattern',
                   "re.compile('(?i)pattern', re.IGNORECASE)")

    def test_unknown_flags(self):
        # Bits without a name are shown in hex, after the named flags.
        self.check_flags('random pattern', 0x123000,
                         "re.compile('random pattern', 0x123000)")
        self.check_flags('random pattern', 0x123000|re.I,
                         "re.compile('random pattern', re.IGNORECASE|0x123000)")

    def test_bytes(self):
        self.check(b'bytes pattern',
                   "re.compile(b'bytes pattern')")
        self.check_flags(b'bytes pattern', re.A,
                         "re.compile(b'bytes pattern', re.ASCII)")

    def test_quotes(self):
        # The pattern is quoted the way repr() quotes a string.
        self.check('random "double quoted" pattern',
                   '''re.compile('random "double quoted" pattern')''')
        self.check("random 'single quoted' pattern",
                   '''re.compile("random 'single quoted' pattern")''')
        self.check('''both 'single' and "double" quotes''',
                   '''re.compile('both \\'single\\' and "double" quotes')''')

    def test_long_pattern(self):
        # The repr of a very long pattern is cut short, but keeps its
        # beginning and the flags at the end.
        pattern = 'Very %spattern' % ('long ' * 1000)
        plain = repr(re.compile(pattern))
        self.assertLess(len(plain), 300)
        self.assertEqual(plain[:30], "re.compile('Very long long lon")
        flagged = repr(re.compile(pattern, re.I))
        self.assertLess(len(flagged), 300)
        self.assertEqual(flagged[:30], "re.compile('Very long long lon")
        self.assertEqual(flagged[-16:], ", re.IGNORECASE)")
1269
1270
class ImplementationTest(unittest.TestCase):
    """
    Checks that target internals of the re implementation.
    """

    def test_overlap_table(self):
        # Each entry pairs an input string with the table that
        # sre_compile._generate_overlap_table is expected to return for it.
        expectations = [
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        ]
        build_table = sre_compile._generate_overlap_table
        for prefix, table in expectations:
            self.assertEqual(build_table(prefix), table)
1284
1285
def _re_test_namespace(match):
    """Return the names a re_tests result expression may refer to.

    The mapping holds 'found', 'groups' and 'flags', the numbered groups
    'g1' .. 'g99' and every named group of the pattern.  A group that did
    not take part in the match is stored as the string "None" (so that
    expressions concatenating strings keep working) and a group that does
    not exist is stored as the string "Error".
    """
    def group_text(group):
        try:
            value = match.group(group)
        except IndexError:
            return "Error"
        return "None" if value is None else value

    # NOTE(review): 'groups' is bound to match.group() (the whole match),
    # not match.groups(); kept unchanged -- confirm against the expressions
    # in test.re_tests before altering it.
    namespace = {'found': match.group(0),
                 'groups': match.group(),
                 'flags': match.re.flags}
    for number in range(1, 100):
        namespace['g%d' % number] = group_text(number)
    for name in match.re.groupindex:
        namespace[name] = group_text(name)
    return namespace


def run_re_tests():
    """Run the table-driven checks from test.re_tests, printing mismatches.

    Every entry of the table is a 3-tuple (pattern, string, outcome) or a
    5-tuple (pattern, string, outcome, expression, expected).  Nothing is
    returned: each discrepancy is reported on stdout as a line starting
    with '===' (or '***' for an unexpected exception while compiling).

    Raises ValueError if an entry has neither 3 nor 5 fields.
    """
    from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
    if verbose:
        print('Running re_tests test suite')

    for t in tests:
        sys.stdout.flush()
        pattern = s = outcome = repl = expected = None
        if len(t) == 5:
            pattern, s, outcome, repl, expected = t
        elif len(t) == 3:
            pattern, s, outcome = t
        else:
            raise ValueError('Test tuples should have 3 or 5 fields', t)

        try:
            obj = re.compile(pattern)
        except re.error:
            if outcome != SYNTAX_ERROR:
                print('=== Syntax error:', t)
            continue
        except Exception:
            # KeyboardInterrupt and SystemExit are not Exception subclasses,
            # so they propagate untouched, with their original traceback.
            print('*** Unexpected error ***', t)
            if verbose:
                traceback.print_exc(file=sys.stdout)
            continue

        try:
            result = obj.search(s)
        except re.error as msg:
            print('=== Unexpected exception', t, repr(msg))
            # There is no match object to inspect; carrying on would read
            # an unbound 'result' or the one left over from the previous
            # table entry.
            continue

        if outcome == SYNTAX_ERROR:
            # This should have been a syntax error; forget it.
            continue
        if outcome == FAIL:
            if result is not None:
                print('=== Succeeded incorrectly', t)
            continue
        if outcome != SUCCEED:
            continue

        if result is None:
            print('=== Failed incorrectly', t)
        else:
            # Matched, as expected, so now we compute the result string
            # and compare it to our expected result.  eval() is acceptable
            # here only because the expression comes from the test table
            # shipped with the test suite, never from external input.
            actual = eval(repl, _re_test_namespace(result))
            if actual != expected:
                print('=== grouping error', t, end=' ')
                print(repr(actual) + ' should be ' + repr(expected))

        # Try the match with both pattern and string converted to
        # bytes, and check that it still succeeds.
        try:
            bpat = bytes(pattern, "ascii")
            bs = bytes(s, "ascii")
        except UnicodeEncodeError:
            # skip non-ascii tests
            pass
        else:
            try:
                bpat = re.compile(bpat)
            except Exception:
                print('=== Fails on bytes pattern compile', t)
                if verbose:
                    traceback.print_exc(file=sys.stdout)
            else:
                if bpat.search(bs) is None:
                    print('=== Fails on bytes pattern match', t)

        # Try the match with the search area limited to the extent
        # of the match and see if it still succeeds.  \B will
        # break (because it won't match at the end or start of a
        # string), so we'll ignore patterns that feature it.
        if (result is not None
                and pattern[:2] != '\\B' and pattern[-2:] != '\\B'):
            result = obj.search(s, result.start(0), result.end(0) + 1)
            if result is None:
                print('=== Failed on range-limited match', t)

        # Try the match with IGNORECASE enabled, and check that it
        # still succeeds.
        obj = re.compile(pattern, re.IGNORECASE)
        result = obj.search(s)
        if result is None:
            print('=== Fails on case-insensitive match', t)

        # Try the match with LOCALE enabled, and check that it
        # still succeeds.
        if '(?u)' not in pattern:
            obj = re.compile(pattern, re.LOCALE)
            result = obj.search(s)
            if result is None:
                print('=== Fails on locale-sensitive match', t)

        # Try the match with UNICODE locale enabled, and check
        # that it still succeeds.
        obj = re.compile(pattern, re.UNICODE)
        result = obj.search(s)
        if result is None:
            print('=== Fails on unicode-sensitive match', t)
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001412
Gregory P. Smith5a631832010-07-27 05:31:29 +00001413
def test_main():
    """Run the unittest cases of this module, then the re_tests table."""
    run_unittest(__name__)
    run_re_tests()
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001417
# Allow the file to be executed directly as a script.
if __name__ == "__main__":
    test_main()