blob: 0c8a52f23a5ae62a8330d5c92601abbdfa3cd064 [file] [log] [blame]
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
Antoine Pitroud2cc7432014-02-03 20:59:59 +01002 cpython_only, captured_stdout
Benjamin Petersone48944b2012-03-07 14:50:25 -06003import io
Guido van Rossum8e0ce301997-07-11 19:34:44 +00004import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00005from re import Scanner
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02006import sre_compile
R David Murray26dfaac92013-04-14 13:00:54 -04007import sre_constants
Ezio Melottid2114eb2011-03-25 14:08:44 +02008import sys
9import string
10import traceback
Antoine Pitrou79aa68d2013-10-25 21:36:10 +020011import unittest
Raymond Hettinger027bb632004-05-31 03:09:25 +000012from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000013
Guido van Rossum23b22571997-07-17 22:36:14 +000014# Misc tests from Tim Peters' re.doc
15
Just van Rossum6802c6e2003-07-02 14:36:59 +000016# WARNING: Don't change details in these tests if you don't know
Ezio Melotti42da6632011-03-15 05:18:48 +020017# what you're doing. Some of these tests were carefully modeled to
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +000018# cover most of the code.
19
class S(str):
    """``str`` subclass whose indexing/slicing results are wrapped in ``S``."""

    def __getitem__(self, index):
        piece = str.__getitem__(self, index)
        return S(piece)
23
class B(bytes):
    """``bytes`` subclass whose indexing/slicing results are wrapped in ``B``.

    Whatever ``bytes.__getitem__`` returns is passed straight to ``B()``.
    """

    def __getitem__(self, index):
        piece = bytes.__getitem__(self, index)
        return B(piece)
27
Skip Montanaro8ed06da2003-04-24 19:43:18 +000028class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000029
def assertTypedEqual(self, actual, expect, msg=None):
    """Assert that *actual* equals *expect* and that the types agree too.

    After the ordinary equality check, tuples and lists in *expect* are
    walked in parallel with *actual*; every other value must have exactly
    the same type as its counterpart.
    """
    self.assertEqual(actual, expect, msg)
    # Explicit stack; pairs are pushed reversed so they are visited in
    # left-to-right, depth-first order.
    pending = [(actual, expect)]
    while pending:
        got, want = pending.pop()
        if not isinstance(want, (tuple, list)):
            self.assertIs(type(got), type(want), msg)
            continue
        pending.extend(reversed(list(zip(got, want))))
39
Benjamin Petersone48944b2012-03-07 14:50:25 -060040 def test_keep_buffer(self):
41 # See bug 14212
42 b = bytearray(b'x')
43 it = re.finditer(b'a', b)
44 with self.assertRaises(BufferError):
45 b.extend(b'x'*400)
46 list(it)
47 del it
48 gc_collect()
49 b.extend(b'x'*400)
50
Raymond Hettinger027bb632004-05-31 03:09:25 +000051 def test_weakref(self):
52 s = 'QabbbcR'
53 x = re.compile('ab+c')
54 y = proxy(x)
55 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
56
Skip Montanaro8ed06da2003-04-24 19:43:18 +000057 def test_search_star_plus(self):
58 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
59 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
60 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
61 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Skip Montanaro5ba00542003-04-25 16:00:14 +000062 self.assertEqual(re.search('x', 'aaa'), None)
Skip Montanaro8ed06da2003-04-24 19:43:18 +000063 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
64 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
65 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
66 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Skip Montanaro5ba00542003-04-25 16:00:14 +000067 self.assertEqual(re.match('a+', 'xxx'), None)
Guido van Rossum8430c581998-04-03 21:47:12 +000068
def bump_num(self, matchobj):
    """Return the integer matched by *matchobj*, plus one, as a string."""
    return str(int(matchobj.group(0)) + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000072
def test_basic_re_sub(self):
    # Plain substitution: the result is an exact str/bytes object no
    # matter which subclass or buffer type the inputs had.
    self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
    self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
    self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
    self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
    for y in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')

    # Inline flags, callable replacements and the optional count argument.
    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                     '9.3 -3 23x99y')

    # A callable's return value is used verbatim; a template string has
    # its escapes processed.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    # Raw strings: '\g' is not a valid Python string escape, so the
    # templates are spelled as raw literals (the values are unchanged).
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D', 'a'),
                     '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))

    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000109
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000110 def test_bug_449964(self):
111 # fails for group followed by other escape
112 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
113 'xx\bxx\b')
114
115 def test_bug_449000(self):
116 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000117 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
118 'abc\ndef\n')
119 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
120 'abc\ndef\n')
121 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
122 'abc\ndef\n')
123 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
124 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000125
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000126 def test_bug_1661(self):
127 # Verify that flags do not get silently ignored with compiled patterns
128 pattern = re.compile('.')
129 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
130 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
131 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
132 self.assertRaises(ValueError, re.compile, pattern, re.I)
133
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000134 def test_bug_3629(self):
135 # A regex that triggered a bug in the sre-code validator
136 re.compile("(?P<quote>)(?(quote))")
137
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000138 def test_sub_template_numeric_escape(self):
139 # bug 776311 and friends
140 self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
141 self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
142 self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
143 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
144 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
145 self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
146 self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
147
148 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
149 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
150
151 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
152 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
153 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
154 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
155 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
156
157 self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
158 self.assertEqual(re.sub('x', r'\777', 'x'), '\377')
Tim Peters0e9980f2004-09-12 03:49:31 +0000159
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000160 self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
161 self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
162 self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
163 self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
164 self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
165 self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
166 self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
167 self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
168 self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
169 self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
170 self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
171 self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
172
173 # in python2.3 (etc), these loop endlessly in sre_parser.py
174 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
175 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
176 'xz8')
177 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
178 'xza')
179
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000180 def test_qualified_re_sub(self):
181 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
182 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000183
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000184 def test_bug_114660(self):
185 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
186 'hello there')
187
188 def test_bug_462270(self):
189 # Test for empty sub() behaviour, see SF bug #462270
190 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
191 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
192
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200193 def test_symbolic_groups(self):
194 re.compile('(?P<a>x)(?P=a)(?(a)y)')
195 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
196 self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
197 self.assertRaises(re.error, re.compile, '(?Px)')
198 self.assertRaises(re.error, re.compile, '(?P=)')
199 self.assertRaises(re.error, re.compile, '(?P=1)')
200 self.assertRaises(re.error, re.compile, '(?P=a)')
201 self.assertRaises(re.error, re.compile, '(?P=a1)')
202 self.assertRaises(re.error, re.compile, '(?P=a.)')
203 self.assertRaises(re.error, re.compile, '(?P<)')
204 self.assertRaises(re.error, re.compile, '(?P<>)')
205 self.assertRaises(re.error, re.compile, '(?P<1>)')
206 self.assertRaises(re.error, re.compile, '(?P<a.>)')
207 self.assertRaises(re.error, re.compile, '(?())')
208 self.assertRaises(re.error, re.compile, '(?(a))')
209 self.assertRaises(re.error, re.compile, '(?(1a))')
210 self.assertRaises(re.error, re.compile, '(?(a.))')
Georg Brandl1d472b72013-04-14 11:40:00 +0200211 # New valid/invalid identifiers in Python 3
212 re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
213 re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
214 self.assertRaises(re.error, re.compile, '(?P<©>x)')
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200215
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000216 def test_symbolic_refs(self):
217 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
218 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
219 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
220 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200221 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<>', 'xx')
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000222 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
223 self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
224 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
225 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000226 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
Georg Brandl1d472b72013-04-14 11:40:00 +0200227 # New valid/invalid identifiers in Python 3
228 self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
229 self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
230 self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000231
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000232 def test_re_subn(self):
233 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
234 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
235 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
236 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
237 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000238
def test_re_split(self):
    # split() on str inputs: pieces must be exact str objects.
    for text in (":a:b::c", S(":a:b::c")):
        self.assertTypedEqual(re.split(":", text),
                              ['', 'a', 'b', '', 'c'])
        self.assertTypedEqual(re.split(":*", text),
                              ['', 'a', 'b', 'c'])
        self.assertTypedEqual(re.split("(:*)", text),
                              ['', ':', 'a', ':', 'b', '::', 'c'])
    # split() on bytes-like inputs: pieces must be exact bytes objects.
    bytes_like = (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                  memoryview(b":a:b::c"))
    for data in bytes_like:
        self.assertTypedEqual(re.split(b":", data),
                              [b'', b'a', b'b', b'', b'c'])
        self.assertTypedEqual(re.split(b":*", data),
                              [b'', b'a', b'b', b'c'])
        self.assertTypedEqual(re.split(b"(:*)", data),
                              [b'', b':', b'a', b':', b'b', b'::', b'c'])
    # Non-ASCII fields, one triple per code point range.
    triples = ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
               "\U0001d49c\U0001d49e\U0001d4b5")
    for a, b, c in triples:
        text = ":%s:%s::%s" % (a, b, c)
        self.assertEqual(re.split(":", text), ['', a, b, '', c])
        self.assertEqual(re.split(":*", text), ['', a, b, c])
        self.assertEqual(re.split("(:*)", text),
                         ['', ':', a, ':', b, '::', c])

    # Capturing vs. non-capturing groups in the separator pattern.
    group_cases = [
        ("(?::*)", ['', 'a', 'b', 'c']),
        ("(:)*", ['', ':', 'a', ':', 'b', ':', 'c']),
        ("([b:]+)", ['', ':', 'a', ':b::', 'c']),
        ("(b)|(:+)", ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c']),
        ("(?:b)|(?::+)", ['', 'a', '', '', 'c']),
    ]
    for pattern, expected in group_cases:
        self.assertEqual(re.split(pattern, ":a:b::c"), expected)
Guido van Rossum49946571997-07-18 04:26:25 +0000273
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000274 def test_qualified_re_split(self):
275 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
276 self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
277 self.assertEqual(re.split("(:)", ":a:b::c", 2),
278 ['', ':', 'a', ':', 'b::c'])
279 self.assertEqual(re.split("(:*)", ":a:b::c", 2),
280 ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000281
def test_re_findall(self):
    self.assertEqual(re.findall(":+", "abc"), [])
    # str inputs: results must be exact str objects.
    for text in ("a:b::c:::d", S("a:b::c:::d")):
        self.assertTypedEqual(re.findall(":+", text),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:+)", text),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:)(:*)", text),
                              [(":", ""), (":", ":"), (":", "::")])
    # bytes-like inputs: results must be exact bytes objects.
    bytes_like = (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
                  memoryview(b"a:b::c:::d"))
    for data in bytes_like:
        self.assertTypedEqual(re.findall(b":+", data),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:+)", data),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:)(:*)", data),
                              [(b":", b""), (b":", b":"), (b":", b"::")])
    # Non-ASCII characters used as the repeated element.
    for x in ("\xe0", "\u0430", "\U0001d49c"):
        double, triple = x * 2, x * 3
        text = "a%sb%sc%sd" % (x, double, triple)
        self.assertEqual(re.findall("%s+" % x, text), [x, double, triple])
        self.assertEqual(re.findall("(%s+)" % x, text), [x, double, triple])
        self.assertEqual(re.findall("(%s)(%s*)" % (x, x), text),
                         [(x, ""), (x, x), (x, double)])
Guido van Rossum49946571997-07-18 04:26:25 +0000307
Skip Montanaro5ba00542003-04-25 16:00:14 +0000308 def test_bug_117612(self):
309 self.assertEqual(re.findall(r"(a|(b))", "aba"),
310 [("a", ""),("b", "b"),("a", "")])
311
def test_re_match(self):
    # groups()/group() on str inputs, including a str subclass.
    for text in 'a', S('a'):
        self.assertEqual(re.match('a', text).groups(), ())
        self.assertEqual(re.match('(a)', text).groups(), ('a',))
        self.assertEqual(re.match('(a)', text).group(0), 'a')
        self.assertEqual(re.match('(a)', text).group(1), 'a')
        self.assertEqual(re.match('(a)', text).group(1, 1), ('a', 'a'))
    # The same checks on bytes-like inputs.
    for data in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
        self.assertEqual(re.match(b'a', data).groups(), ())
        self.assertEqual(re.match(b'(a)', data).groups(), (b'a',))
        self.assertEqual(re.match(b'(a)', data).group(0), b'a')
        self.assertEqual(re.match(b'(a)', data).group(1), b'a')
        self.assertEqual(re.match(b'(a)', data).group(1, 1), (b'a', b'a'))
    # Non-ASCII characters.
    for a in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.match(a, a).groups(), ())
        self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
        self.assertEqual(re.match('(%s)' % a, a).group(0), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))

    # Groups that did not participate are None, or the supplied default.
    pat = re.compile('((a)|(b))(c)?')
    self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
    self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
    self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
    self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
    self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

    # A single group
    m = re.match('(a)', 'a')
    # group() with no argument and group(0) both give the whole match;
    # previously group(0) was asserted twice and group() not at all.
    self.assertEqual(m.group(), 'a')
    self.assertEqual(m.group(0), 'a')
    self.assertEqual(m.group(1), 'a')
    self.assertEqual(m.group(1, 1), ('a', 'a'))

    # Mixing numeric and named group references.
    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                     (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000351
def test_re_fullmatch(self):
    # Issue 16203: Proposal: add re.fullmatch() method.
    self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
    for text in ("ab", S("ab")):
        self.assertEqual(re.fullmatch(r"a|ab", text).span(), (0, 2))
    for data in (b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab")):
        self.assertEqual(re.fullmatch(br"a|ab", data).span(), (0, 2))
    for first, second in ("\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e"):
        both = first + second
        alternation = r"%s|%s" % (first, both)
        self.assertEqual(re.fullmatch(alternation, both).span(), (0, 2))

    # (pattern, text, expected span) for patterns that cover the whole text.
    covering = [
        (r".*?$", "abc", (0, 3)),
        (r".*?", "abc", (0, 3)),
        (r"a.*?b", "ab", (0, 2)),
        (r"a.*?b", "abb", (0, 3)),
        (r"a.*?b", "axxb", (0, 4)),
        (r"ab(?=c)cd", "abcd", (0, 4)),
        (r"ab(?<=b)cd", "abcd", (0, 4)),
        (r"(?=a|ab)ab", "ab", (0, 2)),
    ]
    for pattern, text, expected in covering:
        self.assertEqual(re.fullmatch(pattern, text).span(), expected)

    # (pattern, text) pairs that must not fully match.
    not_covering = [
        (r"a+", "ab"),
        (r"abc$", "abc\n"),
        (r"abc\Z", "abc\n"),
        (r"(?m)abc$", "abc\n"),
    ]
    for pattern, text in not_covering:
        self.assertIsNone(re.fullmatch(pattern, text))

    # pos/endpos restrict the region that has to be covered.
    for pattern in (r"bc", r".*?$", r".*?"):
        compiled = re.compile(pattern)
        self.assertEqual(
            compiled.fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
381
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000382 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000383 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
384 ('(', 'a'))
385 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
386 (None, 'a'))
387 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'), None)
388 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a'), None)
389 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
390 ('a', 'b'))
391 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
392 (None, 'd'))
393 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
394 (None, 'd'))
395 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
396 ('a', ''))
397
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000398 # Tests for bug #1177831: exercise groups other than the first group
399 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
400 self.assertEqual(p.match('abc').groups(),
401 ('a', 'b', 'c'))
402 self.assertEqual(p.match('ad').groups(),
403 ('a', None, 'd'))
404 self.assertEqual(p.match('abd'), None)
405 self.assertEqual(p.match('ac'), None)
406
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000407
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000408 def test_re_groupref(self):
409 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
410 ('|', 'a'))
411 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
412 (None, 'a'))
413 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', 'a|'), None)
414 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a'), None)
415 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
416 ('a', 'a'))
417 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
418 (None, None))
419
420 def test_groupdict(self):
421 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
422 'first second').groupdict(),
423 {'first':'first', 'second':'second'})
424
425 def test_expand(self):
426 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
427 "first second")
428 .expand(r"\2 \1 \g<second> \g<first>"),
429 "second first second first")
430
431 def test_repeat_minmax(self):
432 self.assertEqual(re.match("^(\w){1}$", "abc"), None)
433 self.assertEqual(re.match("^(\w){1}?$", "abc"), None)
434 self.assertEqual(re.match("^(\w){1,2}$", "abc"), None)
435 self.assertEqual(re.match("^(\w){1,2}?$", "abc"), None)
436
437 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
438 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
439 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
440 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
441 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
442 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
443 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
444 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
445
446 self.assertEqual(re.match("^x{1}$", "xxx"), None)
447 self.assertEqual(re.match("^x{1}?$", "xxx"), None)
448 self.assertEqual(re.match("^x{1,2}$", "xxx"), None)
449 self.assertEqual(re.match("^x{1,2}?$", "xxx"), None)
450
451 self.assertNotEqual(re.match("^x{3}$", "xxx"), None)
452 self.assertNotEqual(re.match("^x{1,3}$", "xxx"), None)
453 self.assertNotEqual(re.match("^x{1,4}$", "xxx"), None)
454 self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None)
455 self.assertNotEqual(re.match("^x{3}?$", "xxx"), None)
456 self.assertNotEqual(re.match("^x{1,3}?$", "xxx"), None)
457 self.assertNotEqual(re.match("^x{1,4}?$", "xxx"), None)
458 self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None)
459
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000460 self.assertEqual(re.match("^x{}$", "xxx"), None)
461 self.assertNotEqual(re.match("^x{}$", "x{}"), None)
462
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000463 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000464 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000465 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000466 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
467 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
468 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
469 {'first': 1, 'other': 2})
470
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000471 self.assertEqual(re.match("(a)", "a").pos, 0)
472 self.assertEqual(re.match("(a)", "a").endpos, 1)
473 self.assertEqual(re.match("(a)", "a").string, "a")
474 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
475 self.assertNotEqual(re.match("(a)", "a").re, None)
476
477 def test_special_escapes(self):
478 self.assertEqual(re.search(r"\b(b.)\b",
479 "abcd abc bcd bx").group(1), "bx")
480 self.assertEqual(re.search(r"\B(b.)\B",
481 "abc bcd bc abxd").group(1), "bx")
482 self.assertEqual(re.search(r"\b(b.)\b",
483 "abcd abc bcd bx", re.LOCALE).group(1), "bx")
484 self.assertEqual(re.search(r"\B(b.)\B",
485 "abc bcd bc abxd", re.LOCALE).group(1), "bx")
486 self.assertEqual(re.search(r"\b(b.)\b",
487 "abcd abc bcd bx", re.UNICODE).group(1), "bx")
488 self.assertEqual(re.search(r"\B(b.)\B",
489 "abc bcd bc abxd", re.UNICODE).group(1), "bx")
490 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
491 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
492 self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None)
493 self.assertEqual(re.search(r"\b(b.)\b",
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000494 "abcd abc bcd bx").group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000495 self.assertEqual(re.search(r"\B(b.)\B",
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000496 "abc bcd bc abxd").group(1), "bx")
497 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
498 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
499 self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000500 self.assertEqual(re.search(r"\d\D\w\W\s\S",
501 "1aa! a").group(0), "1aa! a")
502 self.assertEqual(re.search(r"\d\D\w\W\s\S",
503 "1aa! a", re.LOCALE).group(0), "1aa! a")
504 self.assertEqual(re.search(r"\d\D\w\W\s\S",
505 "1aa! a", re.UNICODE).group(0), "1aa! a")
506
Ezio Melotti5a045b92012-02-29 11:48:44 +0200507 def test_string_boundaries(self):
508 # See http://bugs.python.org/issue10713
509 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
510 "abc")
511 # There's a word boundary at the start of a string.
512 self.assertTrue(re.match(r"\b", "abc"))
513 # A non-empty string includes a non-boundary zero-length match.
514 self.assertTrue(re.search(r"\B", "abc"))
515 # There is no non-boundary match at the start of a string.
516 self.assertFalse(re.match(r"\B", "abc"))
517 # However, an empty string contains no word boundaries, and also no
518 # non-boundaries.
519 self.assertEqual(re.search(r"\B", ""), None)
520 # This one is questionable and different from the perlre behaviour,
521 # but describes current behavior.
522 self.assertEqual(re.search(r"\b", ""), None)
523 # A single word-character string has two boundaries, but no
524 # non-boundary gaps.
525 self.assertEqual(len(re.findall(r"\b", "a")), 2)
526 self.assertEqual(len(re.findall(r"\B", "a")), 0)
527 # If there are no words, there are no boundaries
528 self.assertEqual(len(re.findall(r"\b", " ")), 0)
529 self.assertEqual(len(re.findall(r"\b", " ")), 0)
530 # Can match around the whitespace.
531 self.assertEqual(len(re.findall(r"\B", " ")), 2)
532
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000533 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000534 self.assertEqual(re.match("([\u2222\u2223])",
535 "\u2222").group(1), "\u2222")
536 self.assertEqual(re.match("([\u2222\u2223])",
537 "\u2222", re.UNICODE).group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300538 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
539 self.assertEqual(re.match(r,
540 "\uff01", re.UNICODE).group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000541
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100542 def test_big_codesize(self):
543 # Issue #1160
544 r = re.compile('|'.join(('%d'%x for x in range(10000))))
545 self.assertIsNotNone(r.match('1000'))
546 self.assertIsNotNone(r.match('9999'))
547
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000548 def test_anyall(self):
549 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
550 "a\nb")
551 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
552 "a\n\nb")
553
554 def test_non_consuming(self):
555 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
556 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
557 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
558 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
559 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
560 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
561 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
562
563 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
564 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
565 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
566 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
567
568 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000569 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
570 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000571 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
572 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
573 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
574 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
575 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
576 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
577 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
578 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
579
580 def test_category(self):
581 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
582
583 def test_getlower(self):
584 import _sre
585 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
586 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
587 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
588
589 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000590 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000591
592 def test_not_literal(self):
593 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
594 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
595
596 def test_search_coverage(self):
597 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
598 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
599
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    """Assert that matcher(pattern, text) matches with the given result.

    When neither match nor span is given, the pattern is expected to
    match the whole text.  Giving only one of the two raises ValueError.
    """
    if (match is None) != (span is None):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if match is None:
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    found = matcher(pattern, text)
    self.assertTrue(found)
    self.assertEqual(found.group(), match)
    self.assertEqual(found.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000613
def test_re_escape(self):
    # Characters that re.escape() is expected to leave untouched.
    literal_chars = string.ascii_letters + string.digits + '_'
    every_char = ''.join(map(chr, range(256)))
    for ch in every_char:
        if ch in literal_chars:
            expected = ch
        elif ch == '\x00':
            expected = '\\000'
        else:
            expected = '\\' + ch
        self.assertEqual(re.escape(ch), expected)
        # Whatever the escaped form, it must match the character itself.
        self.assertMatch(re.escape(ch), ch)
    self.assertMatch(re.escape(every_char), every_char)
Guido van Rossum49946571997-07-18 04:26:25 +0000626
def test_re_escape_byte(self):
    # Bytes counterpart of the str escape test.
    literal_bytes = (string.ascii_letters + string.digits + '_').encode('ascii')
    every_byte = bytes(range(256))
    for value in every_byte:
        single = bytes([value])
        if single in literal_bytes:
            expected = single
        elif value == 0:
            expected = b'\\000'
        else:
            expected = b'\\' + single
        self.assertEqual(re.escape(single), expected)
        # The escaped form must match the original byte.
        self.assertMatch(re.escape(single), single)
    self.assertMatch(re.escape(every_byte), every_byte)
Guido van Rossum698280d2008-09-10 17:44:35 +0000640
def test_re_escape_non_ascii(self):
    text = 'xxx\u2620\u2620\u2620xxx'
    escaped = re.escape(text)
    # Each non-ASCII character is expected to get its own backslash.
    self.assertEqual(escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
    self.assertMatch(escaped, text)
    # The escaped character can be embedded in a larger pattern.
    embedded = '.%s+.' % re.escape('\u2620')
    self.assertMatch(embedded, text, match='x\u2620\u2620\u2620x',
                     span=(2, 7), matcher=re.search)
648
def test_re_escape_non_ascii_bytes(self):
    data = 'y\u2620y\u2620y'.encode('utf-8')
    escaped = re.escape(data)
    # Every byte of the UTF-8 sequence is expected to be escaped.
    self.assertEqual(escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
    self.assertMatch(escaped, data)
    needle = re.escape('\u2620'.encode('utf-8'))
    hits = re.findall(needle, data)
    self.assertEqual(len(hits), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000656
def pickle_test(self, pickle):
    # Round-trip a compiled pattern through the given pickle module and
    # check that the reloaded pattern compares equal to the original.
    original = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
    restored = pickle.loads(pickle.dumps(original))
    self.assertEqual(original, restored)
Guido van Rossum23b22571997-07-17 22:36:14 +0000662
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000663 def test_constants(self):
664 self.assertEqual(re.I, re.IGNORECASE)
665 self.assertEqual(re.L, re.LOCALE)
666 self.assertEqual(re.M, re.MULTILINE)
667 self.assertEqual(re.S, re.DOTALL)
668 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000669
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000670 def test_flags(self):
Skip Montanaro1e703c62003-04-25 15:40:28 +0000671 for flag in [re.I, re.M, re.X, re.S, re.L]:
672 self.assertNotEqual(re.compile('^pattern$', flag), None)
Guido van Rossumf473cb01998-01-14 16:42:17 +0000673
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000674 def test_sre_character_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200675 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
676 if i < 256:
677 self.assertIsNotNone(re.match(r"\%03o" % i, chr(i)))
678 self.assertIsNotNone(re.match(r"\%03o0" % i, chr(i)+"0"))
679 self.assertIsNotNone(re.match(r"\%03o8" % i, chr(i)+"8"))
680 self.assertIsNotNone(re.match(r"\x%02x" % i, chr(i)))
681 self.assertIsNotNone(re.match(r"\x%02x0" % i, chr(i)+"0"))
682 self.assertIsNotNone(re.match(r"\x%02xz" % i, chr(i)+"z"))
683 if i < 0x10000:
684 self.assertIsNotNone(re.match(r"\u%04x" % i, chr(i)))
685 self.assertIsNotNone(re.match(r"\u%04x0" % i, chr(i)+"0"))
686 self.assertIsNotNone(re.match(r"\u%04xz" % i, chr(i)+"z"))
687 self.assertIsNotNone(re.match(r"\U%08x" % i, chr(i)))
688 self.assertIsNotNone(re.match(r"\U%08x0" % i, chr(i)+"0"))
689 self.assertIsNotNone(re.match(r"\U%08xz" % i, chr(i)+"z"))
690 self.assertIsNotNone(re.match(r"\0", "\000"))
691 self.assertIsNotNone(re.match(r"\08", "\0008"))
692 self.assertIsNotNone(re.match(r"\01", "\001"))
693 self.assertIsNotNone(re.match(r"\018", "\0018"))
694 self.assertIsNotNone(re.match(r"\567", chr(0o167)))
695 self.assertRaises(re.error, re.match, r"\911", "")
696 self.assertRaises(re.error, re.match, r"\x1", "")
697 self.assertRaises(re.error, re.match, r"\x1z", "")
698 self.assertRaises(re.error, re.match, r"\u123", "")
699 self.assertRaises(re.error, re.match, r"\u123z", "")
700 self.assertRaises(re.error, re.match, r"\U0001234", "")
701 self.assertRaises(re.error, re.match, r"\U0001234z", "")
702 self.assertRaises(re.error, re.match, r"\U00110000", "")
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000703
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000704 def test_sre_character_class_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200705 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
706 if i < 256:
707 self.assertIsNotNone(re.match(r"[\%o]" % i, chr(i)))
708 self.assertIsNotNone(re.match(r"[\%o8]" % i, chr(i)))
709 self.assertIsNotNone(re.match(r"[\%03o]" % i, chr(i)))
710 self.assertIsNotNone(re.match(r"[\%03o0]" % i, chr(i)))
711 self.assertIsNotNone(re.match(r"[\%03o8]" % i, chr(i)))
712 self.assertIsNotNone(re.match(r"[\x%02x]" % i, chr(i)))
713 self.assertIsNotNone(re.match(r"[\x%02x0]" % i, chr(i)))
714 self.assertIsNotNone(re.match(r"[\x%02xz]" % i, chr(i)))
715 if i < 0x10000:
716 self.assertIsNotNone(re.match(r"[\u%04x]" % i, chr(i)))
717 self.assertIsNotNone(re.match(r"[\u%04x0]" % i, chr(i)))
718 self.assertIsNotNone(re.match(r"[\u%04xz]" % i, chr(i)))
719 self.assertIsNotNone(re.match(r"[\U%08x]" % i, chr(i)))
720 self.assertIsNotNone(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
721 self.assertIsNotNone(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
Ezio Melottieadece22013-02-23 08:40:07 +0200722 self.assertIsNotNone(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200723 self.assertRaises(re.error, re.match, r"[\911]", "")
724 self.assertRaises(re.error, re.match, r"[\x1z]", "")
725 self.assertRaises(re.error, re.match, r"[\u123z]", "")
726 self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
727 self.assertRaises(re.error, re.match, r"[\U00110000]", "")
728
729 def test_sre_byte_literals(self):
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000730 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Antoine Pitrou463badf2012-06-23 13:29:19 +0200731 self.assertIsNotNone(re.match((r"\%03o" % i).encode(), bytes([i])))
732 self.assertIsNotNone(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
733 self.assertIsNotNone(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
734 self.assertIsNotNone(re.match((r"\x%02x" % i).encode(), bytes([i])))
735 self.assertIsNotNone(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
736 self.assertIsNotNone(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
737 self.assertIsNotNone(re.match(br"\u", b'u'))
738 self.assertIsNotNone(re.match(br"\U", b'U'))
739 self.assertIsNotNone(re.match(br"\0", b"\000"))
740 self.assertIsNotNone(re.match(br"\08", b"\0008"))
741 self.assertIsNotNone(re.match(br"\01", b"\001"))
742 self.assertIsNotNone(re.match(br"\018", b"\0018"))
743 self.assertIsNotNone(re.match(br"\567", bytes([0o167])))
744 self.assertRaises(re.error, re.match, br"\911", b"")
745 self.assertRaises(re.error, re.match, br"\x1", b"")
746 self.assertRaises(re.error, re.match, br"\x1z", b"")
747
748 def test_sre_byte_class_literals(self):
749 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
750 self.assertIsNotNone(re.match((r"[\%o]" % i).encode(), bytes([i])))
751 self.assertIsNotNone(re.match((r"[\%o8]" % i).encode(), bytes([i])))
752 self.assertIsNotNone(re.match((r"[\%03o]" % i).encode(), bytes([i])))
753 self.assertIsNotNone(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
754 self.assertIsNotNone(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
755 self.assertIsNotNone(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
756 self.assertIsNotNone(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
757 self.assertIsNotNone(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
758 self.assertIsNotNone(re.match(br"[\u]", b'u'))
759 self.assertIsNotNone(re.match(br"[\U]", b'U'))
760 self.assertRaises(re.error, re.match, br"[\911]", "")
761 self.assertRaises(re.error, re.match, br"[\x1z]", "")
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000762
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000763 def test_bug_113254(self):
764 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
765 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
766 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
767
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000768 def test_bug_527371(self):
769 # bug described in patches 527371/672491
770 self.assertEqual(re.match(r'(a)?a','a').lastindex, None)
771 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
772 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
773 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
774 self.assertEqual(re.match("((a))", "a").lastindex, 1)
775
776 def test_bug_545855(self):
777 # bug 545855 -- This pattern failed to cause a compile error as it
778 # should, instead provoking a TypeError.
779 self.assertRaises(re.error, re.compile, 'foo[a-')
780
781 def test_bug_418626(self):
782 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
783 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
784 # pattern '*?' on a long string.
785 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
786 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
787 20003)
788 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000789 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +0000790 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000791 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000792
793 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000794 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000795 self.assertEqual(re.compile(pat) and 1, 1)
796
Skip Montanaro1e703c62003-04-25 15:40:28 +0000797 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000798 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +0000799 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000800 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
801 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
802 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +0000803
Serhiy Storchakafa468162013-02-16 21:23:53 +0200804 def test_unlimited_zero_width_repeat(self):
805 # Issue #9669
806 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
807 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
808 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
809 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
810 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
811 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
812
Skip Montanaro1e703c62003-04-25 15:40:28 +0000813 def test_scanner(self):
814 def s_ident(scanner, token): return token
815 def s_operator(scanner, token): return "op%s" % token
816 def s_float(scanner, token): return float(token)
817 def s_int(scanner, token): return int(token)
818
819 scanner = Scanner([
820 (r"[a-zA-Z_]\w*", s_ident),
821 (r"\d+\.\d*", s_float),
822 (r"\d+", s_int),
823 (r"=|\+|-|\*|/", s_operator),
824 (r"\s+", None),
825 ])
826
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000827 self.assertNotEqual(scanner.scanner.scanner("").pattern, None)
828
Skip Montanaro1e703c62003-04-25 15:40:28 +0000829 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
830 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
831 'op+', 'bar'], ''))
832
Skip Montanaro5ba00542003-04-25 16:00:14 +0000833 def test_bug_448951(self):
834 # bug 448951 (similar to 429357, but with single char match)
835 # (Also test greedy matches.)
836 for op in '','?','*':
837 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
838 (None, None))
839 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
840 ('a:', 'a'))
841
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000842 def test_bug_725106(self):
843 # capturing groups in alternatives in repeats
844 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
845 ('b', 'a'))
846 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
847 ('c', 'b'))
848 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
849 ('b', None))
850 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
851 ('b', None))
852 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
853 ('b', 'a'))
854 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
855 ('c', 'b'))
856 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
857 ('b', None))
858 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
859 ('b', None))
860
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000861 def test_bug_725149(self):
862 # mark_stack_base restoring before restoring marks
863 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
864 ('a', None))
865 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
866 ('a', None, None))
867
Just van Rossum12723ba2003-07-02 20:03:04 +0000868 def test_bug_764548(self):
869 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000870 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +0000871 pat = re.compile(my_unicode("abc"))
872 self.assertEqual(pat.match("xyz"), None)
873
Skip Montanaro5ba00542003-04-25 16:00:14 +0000874 def test_finditer(self):
875 iter = re.finditer(r":+", "a:b::c:::d")
876 self.assertEqual([item.group(0) for item in iter],
877 [":", "::", ":::"])
878
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600879 pat = re.compile(r":+")
880 iter = pat.finditer("a:b::c:::d", 1, 10)
881 self.assertEqual([item.group(0) for item in iter],
882 [":", "::", ":::"])
883
884 pat = re.compile(r":+")
885 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
886 self.assertEqual([item.group(0) for item in iter],
887 [":", "::", ":::"])
888
889 pat = re.compile(r":+")
890 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
891 self.assertEqual([item.group(0) for item in iter],
892 [":", "::", ":::"])
893
894 pat = re.compile(r":+")
895 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
896 self.assertEqual([item.group(0) for item in iter],
897 ["::", "::"])
898
Thomas Wouters40a088d2008-03-18 20:19:54 +0000899 def test_bug_926075(self):
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000900 self.assertTrue(re.compile('bug_926075') is not
Thomas Wouters40a088d2008-03-18 20:19:54 +0000901 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +0000902
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000903 def test_bug_931848(self):
Guido van Rossum7ebb9702007-05-15 21:39:58 +0000904 pattern = eval('"[\u002E\u3002\uFF0E\uFF61]"')
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000905 self.assertEqual(re.compile(pattern).split("a.b.c"),
906 ['a','b','c'])
907
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000908 def test_bug_581080(self):
909 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +0000910 self.assertEqual(next(iter).span(), (1,2))
911 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000912
913 scanner = re.compile(r"\s").scanner("a b")
914 self.assertEqual(scanner.search().span(), (1, 2))
915 self.assertEqual(scanner.search(), None)
916
917 def test_bug_817234(self):
918 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +0000919 self.assertEqual(next(iter).span(), (0, 4))
920 self.assertEqual(next(iter).span(), (4, 4))
921 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000922
Mark Dickinson1f268282009-07-28 17:22:36 +0000923 def test_bug_6561(self):
924 # '\d' should match characters in Unicode category 'Nd'
925 # (Number, Decimal Digit), but not those in 'Nl' (Number,
926 # Letter) or 'No' (Number, Other).
927 decimal_digits = [
928 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
929 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
930 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
931 ]
932 for x in decimal_digits:
933 self.assertEqual(re.match('^\d$', x).group(0), x)
934
935 not_decimal_digits = [
936 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
937 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
938 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
939 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
940 ]
941 for x in not_decimal_digits:
942 self.assertIsNone(re.match('^\d$', x))
943
Guido van Rossumd8faa362007-04-27 19:54:29 +0000944 def test_empty_array(self):
945 # SF buf 1647541
946 import array
Guido van Rossum166746c2007-07-03 15:39:16 +0000947 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +0000948 a = array.array(typecode)
Antoine Pitroufd036452008-08-19 17:56:33 +0000949 self.assertEqual(re.compile(b"bla").match(a), None)
950 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000951
Christian Heimes072c0f12008-01-03 23:01:04 +0000952 def test_inline_flags(self):
953 # Bug #1700
Christian Heimes2e1d0f02008-01-04 00:47:51 +0000954 upper_char = chr(0x1ea0) # Latin Capital Letter A with Dot Bellow
955 lower_char = chr(0x1ea1) # Latin Small Letter A with Dot Bellow
Christian Heimes072c0f12008-01-03 23:01:04 +0000956
957 p = re.compile(upper_char, re.I | re.U)
958 q = p.match(lower_char)
959 self.assertNotEqual(q, None)
960
961 p = re.compile(lower_char, re.I | re.U)
962 q = p.match(upper_char)
963 self.assertNotEqual(q, None)
964
965 p = re.compile('(?i)' + upper_char, re.U)
966 q = p.match(lower_char)
967 self.assertNotEqual(q, None)
968
969 p = re.compile('(?i)' + lower_char, re.U)
970 q = p.match(upper_char)
971 self.assertNotEqual(q, None)
972
973 p = re.compile('(?iu)' + upper_char)
974 q = p.match(lower_char)
975 self.assertNotEqual(q, None)
976
977 p = re.compile('(?iu)' + lower_char)
978 q = p.match(upper_char)
979 self.assertNotEqual(q, None)
980
Christian Heimes25bb7832008-01-11 16:17:00 +0000981 def test_dollar_matches_twice(self):
982 "$ matches the end of string, and just before the terminating \n"
983 pattern = re.compile('$')
984 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
985 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
986 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
987
988 pattern = re.compile('$', re.MULTILINE)
989 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
990 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
991 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
992
Antoine Pitroufd036452008-08-19 17:56:33 +0000993 def test_bytes_str_mixing(self):
994 # Mixing str and bytes is disallowed
995 pat = re.compile('.')
996 bpat = re.compile(b'.')
997 self.assertRaises(TypeError, pat.match, b'b')
998 self.assertRaises(TypeError, bpat.match, 'b')
999 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1000 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1001 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1002 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1003 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1004 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1005
1006 def test_ascii_and_unicode_flag(self):
1007 # String patterns
1008 for flags in (0, re.UNICODE):
1009 pat = re.compile('\xc0', flags | re.IGNORECASE)
1010 self.assertNotEqual(pat.match('\xe0'), None)
1011 pat = re.compile('\w', flags)
1012 self.assertNotEqual(pat.match('\xe0'), None)
1013 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
1014 self.assertEqual(pat.match('\xe0'), None)
1015 pat = re.compile('(?a)\xc0', re.IGNORECASE)
1016 self.assertEqual(pat.match('\xe0'), None)
1017 pat = re.compile('\w', re.ASCII)
1018 self.assertEqual(pat.match('\xe0'), None)
1019 pat = re.compile('(?a)\w')
1020 self.assertEqual(pat.match('\xe0'), None)
1021 # Bytes patterns
1022 for flags in (0, re.ASCII):
1023 pat = re.compile(b'\xc0', re.IGNORECASE)
1024 self.assertEqual(pat.match(b'\xe0'), None)
1025 pat = re.compile(b'\w')
1026 self.assertEqual(pat.match(b'\xe0'), None)
1027 # Incompatibilities
1028 self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
1029 self.assertRaises(ValueError, re.compile, b'(?u)\w')
1030 self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
1031 self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
1032 self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
1033 self.assertRaises(ValueError, re.compile, '(?au)\w')
1034
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001035 def test_bug_6509(self):
1036 # Replacement strings of both types must parse properly.
1037 # all strings
1038 pat = re.compile('a(\w)')
1039 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1040 pat = re.compile('a(.)')
1041 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1042 pat = re.compile('..')
1043 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1044
1045 # all bytes
1046 pat = re.compile(b'a(\w)')
1047 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1048 pat = re.compile(b'a(.)')
1049 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1050 pat = re.compile(b'..')
1051 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1052
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001053 def test_dealloc(self):
1054 # issue 3299: check for segfault in debug build
1055 import _sre
Ezio Melotti0f77f462010-01-23 10:49:39 +00001056 # the overflow limit is different on wide and narrow builds and it
1057 # depends on the definition of SRE_CODE (see sre.h).
1058 # 2**128 should be big enough to overflow on both. For smaller values
1059 # a RuntimeError is raised instead of OverflowError.
1060 long_overflow = 2**128
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001061 self.assertRaises(TypeError, re.finditer, "a", {})
1062 self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
Victor Stinner5abeafb2010-03-04 21:59:53 +00001063 self.assertRaises(TypeError, _sre.compile, {}, 0, [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 def test_search_dot_unicode(self):
1066 self.assertIsNotNone(re.search("123.*-", '123abc-'))
1067 self.assertIsNotNone(re.search("123.*-", '123\xe9-'))
1068 self.assertIsNotNone(re.search("123.*-", '123\u20ac-'))
1069 self.assertIsNotNone(re.search("123.*-", '123\U0010ffff-'))
1070 self.assertIsNotNone(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
1071
Ezio Melottidf723e12012-03-13 01:29:48 +02001072 def test_compile(self):
1073 # Test return value when given string and pattern as parameter
1074 pattern = re.compile('random pattern')
1075 self.assertIsInstance(pattern, re._pattern_type)
1076 same_pattern = re.compile(pattern)
1077 self.assertIsInstance(same_pattern, re._pattern_type)
1078 self.assertIs(same_pattern, pattern)
1079 # Test behaviour when not given a string or pattern as parameter
1080 self.assertRaises(TypeError, re.compile, 0)
1081
Ezio Melottife8e6e72013-01-11 08:32:01 +02001082 def test_bug_13899(self):
1083 # Issue #13899: re pattern r"[\A]" should work like "A" but matches
1084 # nothing. Ditto B and Z.
1085 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
1086 ['A', 'B', '\b', 'C', 'Z'])
1087
@bigmemtest(size=_2G, memuse=1)
def test_large_search(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    haystack = 'a' * size
    m = re.search('$', haystack)
    self.assertIsNotNone(m)
    # The end-of-string match must sit exactly at index `size`.
    self.assertEqual((m.start(), m.end()), (size, size))
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001096
# The huge memuse is because of re.sub() using a list and a join()
# to create the replacement result.
@bigmemtest(size=_2G, memuse=16 + 2)
def test_large_subn(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    subject = 'a' * size
    result, count = re.subn('', '', subject)
    self.assertEqual(result, subject)
    # One empty match before each character plus one at the end.
    self.assertEqual(count, size + 1)
1106
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001107 def test_bug_16688(self):
1108 # Issue 16688: Backreferences make case-insensitive regex fail on
1109 # non-ASCII strings.
1110 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1111 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001112
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001113 def test_repeat_minmax_overflow(self):
1114 # Issue #13169
1115 string = "x" * 100000
1116 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1117 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1118 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1119 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1120 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1121 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1122 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1123 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1124 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1125 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1126 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1127
@cpython_only
def test_repeat_minmax_overflow_maxrepeat(self):
    try:
        from _sre import MAXREPEAT
    except ImportError:
        self.skipTest('requires _sre.MAXREPEAT constant')
    subject = "x" * 100000
    just_below = MAXREPEAT - 1
    # One below the limit compiles; only the maximum-only form can match
    # the 100000-character subject.
    self.assertIsNone(re.match(r".{%d}" % just_below, subject))
    self.assertEqual(re.match(r".{,%d}" % just_below, subject).span(),
                     (0, 100000))
    self.assertIsNone(re.match(r".{%d,}?" % just_below, subject))
    # The limit itself must be refused at compile time.
    for template in (r".{%d}", r".{,%d}", r".{%d,}?"):
        self.assertRaises(OverflowError, re.compile, template % MAXREPEAT)
1142
R David Murray26dfaac92013-04-14 13:00:54 -04001143 def test_backref_group_name_in_exception(self):
1144 # Issue 17341: Poor error message when compiling invalid regex
1145 with self.assertRaisesRegex(sre_constants.error, '<foo>'):
1146 re.compile('(?P=<foo>)')
1147
1148 def test_group_name_in_exception(self):
1149 # Issue 17341: Poor error message when compiling invalid regex
1150 with self.assertRaisesRegex(sre_constants.error, '\?foo'):
1151 re.compile('(?P<?foo>)')
1152
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001153 def test_issue17998(self):
1154 for reps in '*', '+', '?', '{1}':
1155 for mod in '', '?':
1156 pattern = '.' + reps + mod + 'yz'
1157 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1158 ['xyz'], msg=pattern)
1159 pattern = pattern.encode()
1160 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1161 [b'xyz'], msg=pattern)
1162
def test_match_repr(self):
    # repr() of a match shows its type, span and matched text, for str
    # subjects (including a str subclass) ...
    for subject in '[abracadabra]', S('[abracadabra]'):
        m = re.search(r'(.+)(.*?)\1', subject)
        self.assertEqual(repr(m), "<%s.%s object; "
                                  "span=(1, 12), match='abracadabra'>" %
                         (type(m).__module__, type(m).__qualname__))
    # ... and for bytes-like subjects.
    for subject in (b'[abracadabra]', B(b'[abracadabra]'),
                    bytearray(b'[abracadabra]'),
                    memoryview(b'[abracadabra]')):
        m = re.search(rb'(.+)(.*?)\1', subject)
        self.assertEqual(repr(m), "<%s.%s object; "
                                  "span=(1, 12), match=b'abracadabra'>" %
                         (type(m).__module__, type(m).__qualname__))

    first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
    # The expected text for `first` is built from type(first) only; it
    # used to take the module name from type(second).
    self.assertEqual(repr(first), "<%s.%s object; "
                                  "span=(0, 2), match='aa'>" %
                     (type(first).__module__, type(first).__qualname__))
    self.assertEqual(repr(second), "<%s.%s object; "
                                   "span=(3, 5), match='bb'>" %
                     (type(second).__module__, type(second).__qualname__))
1184
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001185
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001186 def test_bug_2537(self):
1187 # issue 2537: empty submatches
1188 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1189 for inner_op in ('{0,}', '*', '?'):
1190 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1191 m = r.match("xyyzy")
1192 self.assertEqual(m.group(0), "xyy")
1193 self.assertEqual(m.group(1), "")
1194 self.assertEqual(m.group(2), "y")
1195
def test_debug_flag(self):
    expected_lines = ['literal 102 ', 'literal 111 ', 'literal 111 ']
    # Debug output is output again even a second time (bypassing
    # the cache -- issue #20426), so the same compile runs twice.
    for _attempt in range(2):
        with captured_stdout() as out:
            re.compile('foo', re.DEBUG)
        self.assertEqual(out.getvalue().splitlines(), expected_lines)
1207
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001208 def test_keyword_parameters(self):
1209 # Issue #20283: Accepting the string keyword parameter.
1210 pat = re.compile(r'(ab)')
1211 self.assertEqual(
1212 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1213 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001214 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1215 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001216 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1217 self.assertEqual(
1218 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1219 self.assertEqual(
1220 pat.split(string='abracadabra', maxsplit=1),
1221 ['', 'ab', 'racadabra'])
1222 self.assertEqual(
1223 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1224 (7, 9))
1225
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001226 def test_bug_20998(self):
1227 # Issue #20998: Fullmatch of repeated single character pattern
1228 # with ignore case.
1229 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1230
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001231
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001232class PatternReprTests(unittest.TestCase):
def check(self, pattern, expected):
    """Assert that repr() of `pattern` compiled without flags equals `expected`."""
    compiled = re.compile(pattern)
    self.assertEqual(repr(compiled), expected)
1235
1236 def check_flags(self, pattern, flags, expected):
1237 self.assertEqual(repr(re.compile(pattern, flags)), expected)
1238
1239 def test_without_flags(self):
1240 self.check('random pattern',
1241 "re.compile('random pattern')")
1242
1243 def test_single_flag(self):
1244 self.check_flags('random pattern', re.IGNORECASE,
1245 "re.compile('random pattern', re.IGNORECASE)")
1246
1247 def test_multiple_flags(self):
1248 self.check_flags('random pattern', re.I|re.S|re.X,
1249 "re.compile('random pattern', "
1250 "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
1251
1252 def test_unicode_flag(self):
1253 self.check_flags('random pattern', re.U,
1254 "re.compile('random pattern')")
1255 self.check_flags('random pattern', re.I|re.S|re.U,
1256 "re.compile('random pattern', "
1257 "re.IGNORECASE|re.DOTALL)")
1258
1259 def test_inline_flags(self):
1260 self.check('(?i)pattern',
1261 "re.compile('(?i)pattern', re.IGNORECASE)")
1262
1263 def test_unknown_flags(self):
1264 self.check_flags('random pattern', 0x123000,
1265 "re.compile('random pattern', 0x123000)")
1266 self.check_flags('random pattern', 0x123000|re.I,
1267 "re.compile('random pattern', re.IGNORECASE|0x123000)")
1268
1269 def test_bytes(self):
1270 self.check(b'bytes pattern',
1271 "re.compile(b'bytes pattern')")
1272 self.check_flags(b'bytes pattern', re.A,
1273 "re.compile(b'bytes pattern', re.ASCII)")
1274
1275 def test_quotes(self):
1276 self.check('random "double quoted" pattern',
1277 '''re.compile('random "double quoted" pattern')''')
1278 self.check("random 'single quoted' pattern",
1279 '''re.compile("random 'single quoted' pattern")''')
1280 self.check('''both 'single' and "double" quotes''',
1281 '''re.compile('both \\'single\\' and "double" quotes')''')
1282
1283 def test_long_pattern(self):
1284 pattern = 'Very %spattern' % ('long ' * 1000)
1285 r = repr(re.compile(pattern))
1286 self.assertLess(len(r), 300)
1287 self.assertEqual(r[:30], "re.compile('Very long long lon")
1288 r = repr(re.compile(pattern, re.I))
1289 self.assertLess(len(r), 300)
1290 self.assertEqual(r[:30], "re.compile('Very long long lon")
1291 self.assertEqual(r[-16:], ", re.IGNORECASE)")
1292
1293
class ImplementationTest(unittest.TestCase):
    """
    Checks that reach into implementation details of the re module.
    """

    def test_overlap_table(self):
        # Each input is paired with the table that
        # sre_compile._generate_overlap_table is expected to build for it.
        expectations = (
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        )
        build_table = sre_compile._generate_overlap_table
        for prefix, table in expectations:
            self.assertEqual(build_table(prefix), table)
1307
1308
def _group_or_marker(match, key):
    # Return group *key* of *match* in a form that is safe to use in the
    # expressions evaluated by run_re_tests().
    try:
        value = match.group(key)
    except IndexError:
        # The pattern has no such group.
        return "Error"
    # Special hack because else the string concat fails:
    return "None" if value is None else value


def run_re_tests():
    """Run the table-driven regular expression cases from test.re_tests.

    Each entry of the table is either ``(pattern, string, outcome)`` or
    ``(pattern, string, outcome, repl, expected)``.  The pattern is
    compiled and searched for in the string, and the result is compared
    with the declared outcome.  For cases declared as SUCCEED the match
    is additionally repeated with bytes arguments, with the search
    range limited to the match, and with the IGNORECASE, LOCALE and
    UNICODE flags.

    Problems are reported by printing a line to stdout; nothing is
    returned.  A case whose search raises re.error is reported and then
    skipped, since there is no match result to examine.

    Raises:
        ValueError: if a table entry has neither 3 nor 5 fields.
    """
    from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
    if verbose:
        print('Running re_tests test suite')

    for t in tests:
        sys.stdout.flush()
        pattern = s = outcome = repl = expected = None
        if len(t) == 5:
            pattern, s, outcome, repl, expected = t
        elif len(t) == 3:
            pattern, s, outcome = t
        else:
            raise ValueError('Test tuples should have 3 or 5 fields', t)

        try:
            obj = re.compile(pattern)
        except re.error:
            if outcome != SYNTAX_ERROR:
                print('=== Syntax error:', t)
            continue
        except Exception:
            # KeyboardInterrupt and SystemExit are not Exception
            # subclasses, so they propagate with their own traceback.
            print('*** Unexpected error ***', t)
            if verbose:
                traceback.print_exc(file=sys.stdout)
            continue

        try:
            result = obj.search(s)
        except re.error as msg:
            print('=== Unexpected exception', t, repr(msg))
            # No match object was produced for this case; going on would
            # read an unbound name or the result of an earlier case.
            continue

        if outcome == SYNTAX_ERROR:
            # This should have been a syntax error; forget it.
            continue
        if outcome == FAIL:
            if result is not None:
                print('=== Succeeded incorrectly', t)
            continue
        if outcome != SUCCEED:
            continue

        if result is None:
            print('=== Failed incorrectly', t)
        else:
            # Matched, as expected, so now we compute the
            # result string and compare it to our expected result.
            # NOTE(review): 'groups' is bound to group(), i.e. the whole
            # match, not to groups() -- confirm against the expressions
            # in test.re_tests before changing it.
            vardict = {'found': result.group(0),
                       'groups': result.group(),
                       'flags': result.re.flags}
            for i in range(1, 100):
                vardict['g%d' % i] = _group_or_marker(result, i)
            for name in result.re.groupindex:
                vardict[name] = _group_or_marker(result, name)
            # NOTE: eval() is tolerable here only because the expression
            # comes from the table imported from test.re_tests; never
            # feed it text from an external source.
            repl = eval(repl, vardict)
            if repl != expected:
                print('=== grouping error', t, end=' ')
                print(repr(repl) + ' should be ' + repr(expected))

        # Try the match with both pattern and string converted to
        # bytes, and check that it still succeeds.
        try:
            bpat = bytes(pattern, "ascii")
            bs = bytes(s, "ascii")
        except UnicodeEncodeError:
            # skip non-ascii tests
            pass
        else:
            try:
                bpat = re.compile(bpat)
            except Exception:
                print('=== Fails on bytes pattern compile', t)
                if verbose:
                    traceback.print_exc(file=sys.stdout)
            else:
                bytes_result = bpat.search(bs)
                if bytes_result is None:
                    print('=== Fails on bytes pattern match', t)

        # Try the match with the search area limited to the extent
        # of the match and see if it still succeeds.  \B will
        # break (because it won't match at the end or start of a
        # string), so we'll ignore patterns that feature it.
        if (pattern[:2] != '\\B' and pattern[-2:] != '\\B'
                and result is not None):
            obj = re.compile(pattern)
            result = obj.search(s, result.start(0), result.end(0) + 1)
            if result is None:
                print('=== Failed on range-limited match', t)

        # Try the match with IGNORECASE enabled, and check that it
        # still succeeds.
        obj = re.compile(pattern, re.IGNORECASE)
        result = obj.search(s)
        if result is None:
            print('=== Fails on case-insensitive match', t)

        # Try the match with LOCALE enabled, and check that it
        # still succeeds.
        if '(?u)' not in pattern:
            obj = re.compile(pattern, re.LOCALE)
            result = obj.search(s)
            if result is None:
                print('=== Fails on locale-sensitive match', t)

        # Try the match with UNICODE locale enabled, and check
        # that it still succeeds.
        obj = re.compile(pattern, re.UNICODE)
        result = obj.search(s)
        if result is None:
            print('=== Fails on unicode-sensitive match', t)
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001435
Gregory P. Smith5a631832010-07-27 05:31:29 +00001436
def test_main():
    """Run this module's unittest cases, then the re_tests table checks."""
    this_module = __name__
    run_unittest(this_module)
    run_re_tests()
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001440
# Run the complete suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()