blob: 8d63fac0beefe3ab40c9d8328f69d582971e6815 [file] [log] [blame]
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
2 cpython_only
Benjamin Petersone48944b2012-03-07 14:50:25 -06003import io
Guido van Rossum8e0ce301997-07-11 19:34:44 +00004import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00005from re import Scanner
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02006import sre_compile
R David Murray26dfaac92013-04-14 13:00:54 -04007import sre_constants
Ezio Melottid2114eb2011-03-25 14:08:44 +02008import sys
9import string
10import traceback
Antoine Pitrou79aa68d2013-10-25 21:36:10 +020011import unittest
Raymond Hettinger027bb632004-05-31 03:09:25 +000012from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000013
# Misc tests from Tim Peters' re.doc
15
# WARNING: Don't change details in these tests if you don't know
# what you're doing. Some of these tests were carefully modeled to
# cover most of the code.
19
class S(str):
    # str subclass whose indexing and slicing return S instead of plain str.
    def __getitem__(self, index):
        return S(super().__getitem__(index))
23
class B(bytes):
    # bytes subclass whose slicing returns B instead of plain bytes.
    # Note: an integer index makes bytes.__getitem__ return an int, which
    # B() then interprets as a length, so only slices round-trip.
    def __getitem__(self, index):
        return B(super().__getitem__(index))
27
Skip Montanaro8ed06da2003-04-24 19:43:18 +000028class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000029
def assertTypedEqual(self, actual, expect, msg=None):
    # Assert that actual == expect and, in addition, that every leaf value
    # has exactly the same type as its expected counterpart.  Tuples and
    # lists in `expect` are walked element by element; anything else is
    # treated as a leaf and compared with type(...) identity.
    self.assertEqual(actual, expect, msg)

    def recurse(actual, expect):
        if isinstance(expect, (tuple, list)):
            for x, y in zip(actual, expect):
                recurse(x, y)
        else:
            self.assertIs(type(actual), type(expect), msg)

    recurse(actual, expect)
39
def test_keep_buffer(self):
    # See bug 14212
    b = bytearray(b'x')
    it = re.finditer(b'a', b)
    # While the iterator is alive the bytearray cannot be resized.
    with self.assertRaises(BufferError):
        b.extend(b'x'*400)
    list(it)
    del it
    gc_collect()
    # Once the iterator is gone, resizing must work again.
    b.extend(b'x'*400)
50
Raymond Hettinger027bb632004-05-31 03:09:25 +000051 def test_weakref(self):
52 s = 'QabbbcR'
53 x = re.compile('ab+c')
54 y = proxy(x)
55 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
56
Skip Montanaro8ed06da2003-04-24 19:43:18 +000057 def test_search_star_plus(self):
58 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
59 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
60 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
61 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Skip Montanaro5ba00542003-04-25 16:00:14 +000062 self.assertEqual(re.search('x', 'aaa'), None)
Skip Montanaro8ed06da2003-04-24 19:43:18 +000063 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
64 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
65 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
66 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Skip Montanaro5ba00542003-04-25 16:00:14 +000067 self.assertEqual(re.match('a+', 'xxx'), None)
Guido van Rossum8430c581998-04-03 21:47:12 +000068
def bump_num(self, matchobj):
    # Replacement callback for re.sub(): parse the whole match as an
    # integer and return that integer plus one, as a string.
    int_value = int(matchobj.group(0))
    return str(int_value + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000072
def test_basic_re_sub(self):
    # The result of sub() is plain str/bytes, even for subclass or other
    # bytes-like inputs.
    self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
    self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
    self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
    self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
    for y in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')

    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                     '9.3 -3 23x99y')

    # A callable's return value is used literally; a string template has
    # its escapes processed.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    # Raw strings: '\g' is not a valid Python string escape.
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
                     '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))

    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000109
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000110 def test_bug_449964(self):
111 # fails for group followed by other escape
112 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
113 'xx\bxx\b')
114
115 def test_bug_449000(self):
116 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000117 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
118 'abc\ndef\n')
119 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
120 'abc\ndef\n')
121 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
122 'abc\ndef\n')
123 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
124 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000125
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000126 def test_bug_1661(self):
127 # Verify that flags do not get silently ignored with compiled patterns
128 pattern = re.compile('.')
129 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
130 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
131 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
132 self.assertRaises(ValueError, re.compile, pattern, re.I)
133
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000134 def test_bug_3629(self):
135 # A regex that triggered a bug in the sre-code validator
136 re.compile("(?P<quote>)(?(quote))")
137
def test_sub_template_numeric_escape(self):
    # bug 776311 and friends
    self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
    self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
    self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
    self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
    self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
    self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
    self.assertEqual(re.sub('x', r'\117', 'x'), '\117')

    self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
    self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')

    self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
    self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
    self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
    self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
    self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')

    self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
    self.assertEqual(re.sub('x', r'\777', 'x'), '\377')

    # The pattern 'x' has no groups, so every numeric group reference
    # below must be rejected.
    self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
    self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
    self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'

    # in python2.3 (etc), these loop endlessly in sre_parser.py
    self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
    self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
                     'xz8')
    self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
                     'xza')
178 'xza')
179
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000180 def test_qualified_re_sub(self):
181 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
182 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000183
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000184 def test_bug_114660(self):
185 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
186 'hello there')
187
def test_bug_462270(self):
    # Test for empty sub() behaviour, see SF bug #462270
    self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
    self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
192
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200193 def test_symbolic_groups(self):
194 re.compile('(?P<a>x)(?P=a)(?(a)y)')
195 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
196 self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
197 self.assertRaises(re.error, re.compile, '(?Px)')
198 self.assertRaises(re.error, re.compile, '(?P=)')
199 self.assertRaises(re.error, re.compile, '(?P=1)')
200 self.assertRaises(re.error, re.compile, '(?P=a)')
201 self.assertRaises(re.error, re.compile, '(?P=a1)')
202 self.assertRaises(re.error, re.compile, '(?P=a.)')
203 self.assertRaises(re.error, re.compile, '(?P<)')
204 self.assertRaises(re.error, re.compile, '(?P<>)')
205 self.assertRaises(re.error, re.compile, '(?P<1>)')
206 self.assertRaises(re.error, re.compile, '(?P<a.>)')
207 self.assertRaises(re.error, re.compile, '(?())')
208 self.assertRaises(re.error, re.compile, '(?(a))')
209 self.assertRaises(re.error, re.compile, '(?(1a))')
210 self.assertRaises(re.error, re.compile, '(?(a.))')
Georg Brandl1d472b72013-04-14 11:40:00 +0200211 # New valid/invalid identifiers in Python 3
212 re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
213 re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
214 self.assertRaises(re.error, re.compile, '(?P<©>x)')
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200215
def test_symbolic_refs(self):
    # Malformed or unresolvable group references in a replacement template.
    # Raw strings: '\g' is not a valid Python string escape.
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<a', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<a a>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<1a1>', 'xx')
    self.assertRaises(IndexError, re.sub, '(?P<a>x)', r'\g<ab>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', r'\2', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<-1>', 'xx')
    # New valid/invalid identifiers in Python 3
    self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
    self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000231
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000232 def test_re_subn(self):
233 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
234 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
235 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
236 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
237 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000238
def test_re_split(self):
    # The loop variable is called `subject` so that it does not shadow
    # the imported `string` module.
    for subject in ":a:b::c", S(":a:b::c"):
        self.assertTypedEqual(re.split(":", subject),
                              ['', 'a', 'b', '', 'c'])
        self.assertTypedEqual(re.split(":*", subject),
                              ['', 'a', 'b', 'c'])
        self.assertTypedEqual(re.split("(:*)", subject),
                              ['', ':', 'a', ':', 'b', '::', 'c'])
    for subject in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                    memoryview(b":a:b::c")):
        self.assertTypedEqual(re.split(b":", subject),
                              [b'', b'a', b'b', b'', b'c'])
        self.assertTypedEqual(re.split(b":*", subject),
                              [b'', b'a', b'b', b'c'])
        self.assertTypedEqual(re.split(b"(:*)", subject),
                              [b'', b':', b'a', b':', b'b', b'::', b'c'])
    # Each 3-character string below unpacks into the three field values.
    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                    "\U0001d49c\U0001d49e\U0001d4b5"):
        subject = ":%s:%s::%s" % (a, b, c)
        self.assertEqual(re.split(":", subject), ['', a, b, '', c])
        self.assertEqual(re.split(":*", subject), ['', a, b, c])
        self.assertEqual(re.split("(:*)", subject),
                         ['', ':', a, ':', b, '::', c])

    self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
    self.assertEqual(re.split("(:)*", ":a:b::c"),
                     ['', ':', 'a', ':', 'b', ':', 'c'])
    self.assertEqual(re.split("([b:]+)", ":a:b::c"),
                     ['', ':', 'a', ':b::', 'c'])
    self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
                     ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c'])
    self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
                     ['', 'a', '', '', 'c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000273
def test_qualified_re_split(self):
    # split() with an explicit maxsplit of 2.
    self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
    self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
    self.assertEqual(re.split("(:)", ":a:b::c", 2),
                     ['', ':', 'a', ':', 'b::c'])
    self.assertEqual(re.split("(:*)", ":a:b::c", 2),
                     ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000281
def test_re_findall(self):
    self.assertEqual(re.findall(":+", "abc"), [])
    # The loop variable is called `subject` so that it does not shadow
    # the imported `string` module.
    for subject in "a:b::c:::d", S("a:b::c:::d"):
        self.assertTypedEqual(re.findall(":+", subject),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:+)", subject),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:)(:*)", subject),
                              [(":", ""), (":", ":"), (":", "::")])
    for subject in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
                    memoryview(b"a:b::c:::d")):
        self.assertTypedEqual(re.findall(b":+", subject),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:+)", subject),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:)(:*)", subject),
                              [(b":", b""), (b":", b":"), (b":", b"::")])
    for x in ("\xe0", "\u0430", "\U0001d49c"):
        xx = x * 2
        xxx = x * 3
        subject = "a%sb%sc%sd" % (x, xx, xxx)
        self.assertEqual(re.findall("%s+" % x, subject), [x, xx, xxx])
        self.assertEqual(re.findall("(%s+)" % x, subject), [x, xx, xxx])
        self.assertEqual(re.findall("(%s)(%s*)" % (x, x), subject),
                         [(x, ""), (x, x), (x, xx)])
Guido van Rossum49946571997-07-18 04:26:25 +0000307
Skip Montanaro5ba00542003-04-25 16:00:14 +0000308 def test_bug_117612(self):
309 self.assertEqual(re.findall(r"(a|(b))", "aba"),
310 [("a", ""),("b", "b"),("a", "")])
311
def test_re_match(self):
    # The loop variable is called `subject` so that it does not shadow
    # the imported `string` module.
    for subject in 'a', S('a'):
        self.assertEqual(re.match('a', subject).groups(), ())
        self.assertEqual(re.match('(a)', subject).groups(), ('a',))
        self.assertEqual(re.match('(a)', subject).group(0), 'a')
        self.assertEqual(re.match('(a)', subject).group(1), 'a')
        self.assertEqual(re.match('(a)', subject).group(1, 1), ('a', 'a'))
    for subject in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
        self.assertEqual(re.match(b'a', subject).groups(), ())
        self.assertEqual(re.match(b'(a)', subject).groups(), (b'a',))
        self.assertEqual(re.match(b'(a)', subject).group(0), b'a')
        self.assertEqual(re.match(b'(a)', subject).group(1), b'a')
        self.assertEqual(re.match(b'(a)', subject).group(1, 1), (b'a', b'a'))
    for a in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.match(a, a).groups(), ())
        self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
        self.assertEqual(re.match('(%s)' % a, a).group(0), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))

    pat = re.compile('((a)|(b))(c)?')
    self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
    self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
    self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
    self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
    self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

    # A single group
    m = re.match('(a)', 'a')
    # group() with no argument and group(0) both give the whole match.
    self.assertEqual(m.group(), 'a')
    self.assertEqual(m.group(0), 'a')
    self.assertEqual(m.group(1), 'a')
    self.assertEqual(m.group(1, 1), ('a', 'a'))

    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                     (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000351
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000352 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000353 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
354 ('(', 'a'))
355 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
356 (None, 'a'))
357 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'), None)
358 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a'), None)
359 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
360 ('a', 'b'))
361 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
362 (None, 'd'))
363 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
364 (None, 'd'))
365 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
366 ('a', ''))
367
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000368 # Tests for bug #1177831: exercise groups other than the first group
369 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
370 self.assertEqual(p.match('abc').groups(),
371 ('a', 'b', 'c'))
372 self.assertEqual(p.match('ad').groups(),
373 ('a', None, 'd'))
374 self.assertEqual(p.match('abd'), None)
375 self.assertEqual(p.match('ac'), None)
376
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000377
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000378 def test_re_groupref(self):
379 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
380 ('|', 'a'))
381 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
382 (None, 'a'))
383 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', 'a|'), None)
384 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a'), None)
385 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
386 ('a', 'a'))
387 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
388 (None, None))
389
390 def test_groupdict(self):
391 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
392 'first second').groupdict(),
393 {'first':'first', 'second':'second'})
394
395 def test_expand(self):
396 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
397 "first second")
398 .expand(r"\2 \1 \g<second> \g<first>"),
399 "second first second first")
400
401 def test_repeat_minmax(self):
402 self.assertEqual(re.match("^(\w){1}$", "abc"), None)
403 self.assertEqual(re.match("^(\w){1}?$", "abc"), None)
404 self.assertEqual(re.match("^(\w){1,2}$", "abc"), None)
405 self.assertEqual(re.match("^(\w){1,2}?$", "abc"), None)
406
407 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
408 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
409 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
410 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
411 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
412 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
413 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
414 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
415
416 self.assertEqual(re.match("^x{1}$", "xxx"), None)
417 self.assertEqual(re.match("^x{1}?$", "xxx"), None)
418 self.assertEqual(re.match("^x{1,2}$", "xxx"), None)
419 self.assertEqual(re.match("^x{1,2}?$", "xxx"), None)
420
421 self.assertNotEqual(re.match("^x{3}$", "xxx"), None)
422 self.assertNotEqual(re.match("^x{1,3}$", "xxx"), None)
423 self.assertNotEqual(re.match("^x{1,4}$", "xxx"), None)
424 self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None)
425 self.assertNotEqual(re.match("^x{3}?$", "xxx"), None)
426 self.assertNotEqual(re.match("^x{1,3}?$", "xxx"), None)
427 self.assertNotEqual(re.match("^x{1,4}?$", "xxx"), None)
428 self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None)
429
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000430 self.assertEqual(re.match("^x{}$", "xxx"), None)
431 self.assertNotEqual(re.match("^x{}$", "x{}"), None)
432
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000433 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000434 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000435 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000436 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
437 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
438 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
439 {'first': 1, 'other': 2})
440
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000441 self.assertEqual(re.match("(a)", "a").pos, 0)
442 self.assertEqual(re.match("(a)", "a").endpos, 1)
443 self.assertEqual(re.match("(a)", "a").string, "a")
444 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
445 self.assertNotEqual(re.match("(a)", "a").re, None)
446
def test_special_escapes(self):
    # \b, \B, \A, \Z and the character-class escapes, with the default
    # flags and with re.LOCALE / re.UNICODE.
    self.assertEqual(re.search(r"\b(b.)\b",
                               "abcd abc bcd bx").group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B",
                               "abc bcd bc abxd").group(1), "bx")
    self.assertEqual(re.search(r"\b(b.)\b",
                               "abcd abc bcd bx", re.LOCALE).group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B",
                               "abc bcd bc abxd", re.LOCALE).group(1), "bx")
    self.assertEqual(re.search(r"\b(b.)\b",
                               "abcd abc bcd bx", re.UNICODE).group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B",
                               "abc bcd bc abxd", re.UNICODE).group(1), "bx")
    self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
    self.assertEqual(re.search(r"\d\D\w\W\s\S",
                               "1aa! a").group(0), "1aa! a")
    self.assertEqual(re.search(r"\d\D\w\W\s\S",
                               "1aa! a", re.LOCALE).group(0), "1aa! a")
    self.assertEqual(re.search(r"\d\D\w\W\s\S",
                               "1aa! a", re.UNICODE).group(0), "1aa! a")
476
Ezio Melotti5a045b92012-02-29 11:48:44 +0200477 def test_string_boundaries(self):
478 # See http://bugs.python.org/issue10713
479 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
480 "abc")
481 # There's a word boundary at the start of a string.
482 self.assertTrue(re.match(r"\b", "abc"))
483 # A non-empty string includes a non-boundary zero-length match.
484 self.assertTrue(re.search(r"\B", "abc"))
485 # There is no non-boundary match at the start of a string.
486 self.assertFalse(re.match(r"\B", "abc"))
487 # However, an empty string contains no word boundaries, and also no
488 # non-boundaries.
489 self.assertEqual(re.search(r"\B", ""), None)
490 # This one is questionable and different from the perlre behaviour,
491 # but describes current behavior.
492 self.assertEqual(re.search(r"\b", ""), None)
493 # A single word-character string has two boundaries, but no
494 # non-boundary gaps.
495 self.assertEqual(len(re.findall(r"\b", "a")), 2)
496 self.assertEqual(len(re.findall(r"\B", "a")), 0)
497 # If there are no words, there are no boundaries
498 self.assertEqual(len(re.findall(r"\b", " ")), 0)
499 self.assertEqual(len(re.findall(r"\b", " ")), 0)
500 # Can match around the whitespace.
501 self.assertEqual(len(re.findall(r"\B", " ")), 2)
502
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000503 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000504 self.assertEqual(re.match("([\u2222\u2223])",
505 "\u2222").group(1), "\u2222")
506 self.assertEqual(re.match("([\u2222\u2223])",
507 "\u2222", re.UNICODE).group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300508 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
509 self.assertEqual(re.match(r,
510 "\uff01", re.UNICODE).group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000511
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100512 def test_big_codesize(self):
513 # Issue #1160
514 r = re.compile('|'.join(('%d'%x for x in range(10000))))
515 self.assertIsNotNone(r.match('1000'))
516 self.assertIsNotNone(r.match('9999'))
517
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000518 def test_anyall(self):
519 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
520 "a\nb")
521 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
522 "a\n\nb")
523
524 def test_non_consuming(self):
525 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
526 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
527 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
528 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
529 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
530 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
531 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
532
533 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
534 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
535 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
536 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
537
538 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000539 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
540 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000541 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
542 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
543 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
544 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
545 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
546 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
547 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
548 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
549
550 def test_category(self):
551 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
552
def test_getlower(self):
    # _sre.getlower() maps 'A' to 'a' under every flag combination tried.
    import _sre
    self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
    self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
    self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))

    # Case-insensitive matching for both str and bytes patterns.
    self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
    self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000561
562 def test_not_literal(self):
563 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
564 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
565
566 def test_search_coverage(self):
567 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
568 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
569
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    # Assert that matcher(pattern, text) succeeds and that the match has
    # the expected text and span.
    #
    # With neither `match` nor `span` given, the pattern is expected to
    # match the whole of `text`.  Giving only one of the two is an error
    # and raises ValueError.
    if match is None and span is None:
        # the pattern matches the whole text
        match = text
        span = (0, len(text))
    elif match is None or span is None:
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    m = matcher(pattern, text)
    self.assertIsNotNone(m)
    self.assertEqual(m.group(), match)
    self.assertEqual(m.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000583
def test_re_escape(self):
    # Check re.escape() for each of the first 256 code points, and check
    # that every escaped form still matches the original character.
    alnum_chars = string.ascii_letters + string.digits + '_'
    p = ''.join(chr(i) for i in range(256))
    for c in p:
        if c in alnum_chars:
            self.assertEqual(re.escape(c), c)
        elif c == '\x00':
            self.assertEqual(re.escape(c), '\\000')
        else:
            self.assertEqual(re.escape(c), '\\' + c)
        self.assertMatch(re.escape(c), c)
    self.assertMatch(re.escape(p), p)
Guido van Rossum49946571997-07-18 04:26:25 +0000596
def test_re_escape_byte(self):
    # Bytes counterpart of test_re_escape: check re.escape() for every
    # single-byte value.
    alnum_chars = (string.ascii_letters + string.digits + '_').encode('ascii')
    p = bytes(range(256))
    for i in p:
        # Iterating bytes yields ints; rebuild a 1-byte object.
        b = bytes([i])
        if b in alnum_chars:
            self.assertEqual(re.escape(b), b)
        elif i == 0:
            self.assertEqual(re.escape(b), b'\\000')
        else:
            self.assertEqual(re.escape(b), b'\\' + b)
        self.assertMatch(re.escape(b), b)
    self.assertMatch(re.escape(p), p)
Guido van Rossum698280d2008-09-10 17:44:35 +0000610
Ezio Melotti7b9e97b2011-03-25 14:09:33 +0200611 def test_re_escape_non_ascii(self):
612 s = 'xxx\u2620\u2620\u2620xxx'
613 s_escaped = re.escape(s)
614 self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
615 self.assertMatch(s_escaped, s)
616 self.assertMatch('.%s+.' % re.escape('\u2620'), s,
617 'x\u2620\u2620\u2620x', (2, 7), re.search)
618
619 def test_re_escape_non_ascii_bytes(self):
620 b = 'y\u2620y\u2620y'.encode('utf-8')
621 b_escaped = re.escape(b)
622 self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
623 self.assertMatch(b_escaped, b)
624 res = re.findall(re.escape('\u2620'.encode('utf-8')), b)
625 self.assertEqual(len(res), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000626
def pickle_test(self, pickle):
    # Round-trip a compiled pattern through the given pickle-like module
    # (anything offering dumps()/loads()) and compare it with the original.
    original = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
    restored = pickle.loads(pickle.dumps(original))
    self.assertEqual(original, restored)
Guido van Rossum23b22571997-07-17 22:36:14 +0000632
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000633 def test_constants(self):
634 self.assertEqual(re.I, re.IGNORECASE)
635 self.assertEqual(re.L, re.LOCALE)
636 self.assertEqual(re.M, re.MULTILINE)
637 self.assertEqual(re.S, re.DOTALL)
638 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000639
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000640 def test_flags(self):
Skip Montanaro1e703c62003-04-25 15:40:28 +0000641 for flag in [re.I, re.M, re.X, re.S, re.L]:
642 self.assertNotEqual(re.compile('^pattern$', flag), None)
Guido van Rossumf473cb01998-01-14 16:42:17 +0000643
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000644 def test_sre_character_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200645 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
646 if i < 256:
647 self.assertIsNotNone(re.match(r"\%03o" % i, chr(i)))
648 self.assertIsNotNone(re.match(r"\%03o0" % i, chr(i)+"0"))
649 self.assertIsNotNone(re.match(r"\%03o8" % i, chr(i)+"8"))
650 self.assertIsNotNone(re.match(r"\x%02x" % i, chr(i)))
651 self.assertIsNotNone(re.match(r"\x%02x0" % i, chr(i)+"0"))
652 self.assertIsNotNone(re.match(r"\x%02xz" % i, chr(i)+"z"))
653 if i < 0x10000:
654 self.assertIsNotNone(re.match(r"\u%04x" % i, chr(i)))
655 self.assertIsNotNone(re.match(r"\u%04x0" % i, chr(i)+"0"))
656 self.assertIsNotNone(re.match(r"\u%04xz" % i, chr(i)+"z"))
657 self.assertIsNotNone(re.match(r"\U%08x" % i, chr(i)))
658 self.assertIsNotNone(re.match(r"\U%08x0" % i, chr(i)+"0"))
659 self.assertIsNotNone(re.match(r"\U%08xz" % i, chr(i)+"z"))
660 self.assertIsNotNone(re.match(r"\0", "\000"))
661 self.assertIsNotNone(re.match(r"\08", "\0008"))
662 self.assertIsNotNone(re.match(r"\01", "\001"))
663 self.assertIsNotNone(re.match(r"\018", "\0018"))
664 self.assertIsNotNone(re.match(r"\567", chr(0o167)))
665 self.assertRaises(re.error, re.match, r"\911", "")
666 self.assertRaises(re.error, re.match, r"\x1", "")
667 self.assertRaises(re.error, re.match, r"\x1z", "")
668 self.assertRaises(re.error, re.match, r"\u123", "")
669 self.assertRaises(re.error, re.match, r"\u123z", "")
670 self.assertRaises(re.error, re.match, r"\U0001234", "")
671 self.assertRaises(re.error, re.match, r"\U0001234z", "")
672 self.assertRaises(re.error, re.match, r"\U00110000", "")
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000673
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000674 def test_sre_character_class_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200675 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
676 if i < 256:
677 self.assertIsNotNone(re.match(r"[\%o]" % i, chr(i)))
678 self.assertIsNotNone(re.match(r"[\%o8]" % i, chr(i)))
679 self.assertIsNotNone(re.match(r"[\%03o]" % i, chr(i)))
680 self.assertIsNotNone(re.match(r"[\%03o0]" % i, chr(i)))
681 self.assertIsNotNone(re.match(r"[\%03o8]" % i, chr(i)))
682 self.assertIsNotNone(re.match(r"[\x%02x]" % i, chr(i)))
683 self.assertIsNotNone(re.match(r"[\x%02x0]" % i, chr(i)))
684 self.assertIsNotNone(re.match(r"[\x%02xz]" % i, chr(i)))
685 if i < 0x10000:
686 self.assertIsNotNone(re.match(r"[\u%04x]" % i, chr(i)))
687 self.assertIsNotNone(re.match(r"[\u%04x0]" % i, chr(i)))
688 self.assertIsNotNone(re.match(r"[\u%04xz]" % i, chr(i)))
689 self.assertIsNotNone(re.match(r"[\U%08x]" % i, chr(i)))
690 self.assertIsNotNone(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
691 self.assertIsNotNone(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
Ezio Melottieadece22013-02-23 08:40:07 +0200692 self.assertIsNotNone(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200693 self.assertRaises(re.error, re.match, r"[\911]", "")
694 self.assertRaises(re.error, re.match, r"[\x1z]", "")
695 self.assertRaises(re.error, re.match, r"[\u123z]", "")
696 self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
697 self.assertRaises(re.error, re.match, r"[\U00110000]", "")
698
699 def test_sre_byte_literals(self):
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000700 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Antoine Pitrou463badf2012-06-23 13:29:19 +0200701 self.assertIsNotNone(re.match((r"\%03o" % i).encode(), bytes([i])))
702 self.assertIsNotNone(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
703 self.assertIsNotNone(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
704 self.assertIsNotNone(re.match((r"\x%02x" % i).encode(), bytes([i])))
705 self.assertIsNotNone(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
706 self.assertIsNotNone(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
707 self.assertIsNotNone(re.match(br"\u", b'u'))
708 self.assertIsNotNone(re.match(br"\U", b'U'))
709 self.assertIsNotNone(re.match(br"\0", b"\000"))
710 self.assertIsNotNone(re.match(br"\08", b"\0008"))
711 self.assertIsNotNone(re.match(br"\01", b"\001"))
712 self.assertIsNotNone(re.match(br"\018", b"\0018"))
713 self.assertIsNotNone(re.match(br"\567", bytes([0o167])))
714 self.assertRaises(re.error, re.match, br"\911", b"")
715 self.assertRaises(re.error, re.match, br"\x1", b"")
716 self.assertRaises(re.error, re.match, br"\x1z", b"")
717
718 def test_sre_byte_class_literals(self):
719 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
720 self.assertIsNotNone(re.match((r"[\%o]" % i).encode(), bytes([i])))
721 self.assertIsNotNone(re.match((r"[\%o8]" % i).encode(), bytes([i])))
722 self.assertIsNotNone(re.match((r"[\%03o]" % i).encode(), bytes([i])))
723 self.assertIsNotNone(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
724 self.assertIsNotNone(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
725 self.assertIsNotNone(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
726 self.assertIsNotNone(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
727 self.assertIsNotNone(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
728 self.assertIsNotNone(re.match(br"[\u]", b'u'))
729 self.assertIsNotNone(re.match(br"[\U]", b'U'))
730 self.assertRaises(re.error, re.match, br"[\911]", "")
731 self.assertRaises(re.error, re.match, br"[\x1z]", "")
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000732
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000733 def test_bug_113254(self):
734 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
735 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
736 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
737
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000738 def test_bug_527371(self):
739 # bug described in patches 527371/672491
740 self.assertEqual(re.match(r'(a)?a','a').lastindex, None)
741 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
742 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
743 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
744 self.assertEqual(re.match("((a))", "a").lastindex, 1)
745
746 def test_bug_545855(self):
747 # bug 545855 -- This pattern failed to cause a compile error as it
748 # should, instead provoking a TypeError.
749 self.assertRaises(re.error, re.compile, 'foo[a-')
750
751 def test_bug_418626(self):
752 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
753 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
754 # pattern '*?' on a long string.
755 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
756 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
757 20003)
758 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000759 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +0000760 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000761 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000762
763 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000764 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000765 self.assertEqual(re.compile(pat) and 1, 1)
766
Skip Montanaro1e703c62003-04-25 15:40:28 +0000767 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000768 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +0000769 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000770 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
771 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
772 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +0000773
Serhiy Storchakafa468162013-02-16 21:23:53 +0200774 def test_unlimited_zero_width_repeat(self):
775 # Issue #9669
776 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
777 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
778 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
779 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
780 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
781 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
782
Skip Montanaro1e703c62003-04-25 15:40:28 +0000783 def test_scanner(self):
784 def s_ident(scanner, token): return token
785 def s_operator(scanner, token): return "op%s" % token
786 def s_float(scanner, token): return float(token)
787 def s_int(scanner, token): return int(token)
788
789 scanner = Scanner([
790 (r"[a-zA-Z_]\w*", s_ident),
791 (r"\d+\.\d*", s_float),
792 (r"\d+", s_int),
793 (r"=|\+|-|\*|/", s_operator),
794 (r"\s+", None),
795 ])
796
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000797 self.assertNotEqual(scanner.scanner.scanner("").pattern, None)
798
Skip Montanaro1e703c62003-04-25 15:40:28 +0000799 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
800 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
801 'op+', 'bar'], ''))
802
Skip Montanaro5ba00542003-04-25 16:00:14 +0000803 def test_bug_448951(self):
804 # bug 448951 (similar to 429357, but with single char match)
805 # (Also test greedy matches.)
806 for op in '','?','*':
807 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
808 (None, None))
809 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
810 ('a:', 'a'))
811
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000812 def test_bug_725106(self):
813 # capturing groups in alternatives in repeats
814 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
815 ('b', 'a'))
816 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
817 ('c', 'b'))
818 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
819 ('b', None))
820 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
821 ('b', None))
822 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
823 ('b', 'a'))
824 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
825 ('c', 'b'))
826 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
827 ('b', None))
828 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
829 ('b', None))
830
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000831 def test_bug_725149(self):
832 # mark_stack_base restoring before restoring marks
833 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
834 ('a', None))
835 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
836 ('a', None, None))
837
Just van Rossum12723ba2003-07-02 20:03:04 +0000838 def test_bug_764548(self):
839 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000840 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +0000841 pat = re.compile(my_unicode("abc"))
842 self.assertEqual(pat.match("xyz"), None)
843
Skip Montanaro5ba00542003-04-25 16:00:14 +0000844 def test_finditer(self):
845 iter = re.finditer(r":+", "a:b::c:::d")
846 self.assertEqual([item.group(0) for item in iter],
847 [":", "::", ":::"])
848
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600849 pat = re.compile(r":+")
850 iter = pat.finditer("a:b::c:::d", 1, 10)
851 self.assertEqual([item.group(0) for item in iter],
852 [":", "::", ":::"])
853
854 pat = re.compile(r":+")
855 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
856 self.assertEqual([item.group(0) for item in iter],
857 [":", "::", ":::"])
858
859 pat = re.compile(r":+")
860 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
861 self.assertEqual([item.group(0) for item in iter],
862 [":", "::", ":::"])
863
864 pat = re.compile(r":+")
865 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
866 self.assertEqual([item.group(0) for item in iter],
867 ["::", "::"])
868
Thomas Wouters40a088d2008-03-18 20:19:54 +0000869 def test_bug_926075(self):
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000870 self.assertTrue(re.compile('bug_926075') is not
Thomas Wouters40a088d2008-03-18 20:19:54 +0000871 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +0000872
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000873 def test_bug_931848(self):
Guido van Rossum7ebb9702007-05-15 21:39:58 +0000874 pattern = eval('"[\u002E\u3002\uFF0E\uFF61]"')
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000875 self.assertEqual(re.compile(pattern).split("a.b.c"),
876 ['a','b','c'])
877
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000878 def test_bug_581080(self):
879 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +0000880 self.assertEqual(next(iter).span(), (1,2))
881 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000882
883 scanner = re.compile(r"\s").scanner("a b")
884 self.assertEqual(scanner.search().span(), (1, 2))
885 self.assertEqual(scanner.search(), None)
886
887 def test_bug_817234(self):
888 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +0000889 self.assertEqual(next(iter).span(), (0, 4))
890 self.assertEqual(next(iter).span(), (4, 4))
891 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000892
Mark Dickinson1f268282009-07-28 17:22:36 +0000893 def test_bug_6561(self):
894 # '\d' should match characters in Unicode category 'Nd'
895 # (Number, Decimal Digit), but not those in 'Nl' (Number,
896 # Letter) or 'No' (Number, Other).
897 decimal_digits = [
898 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
899 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
900 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
901 ]
902 for x in decimal_digits:
903 self.assertEqual(re.match('^\d$', x).group(0), x)
904
905 not_decimal_digits = [
906 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
907 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
908 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
909 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
910 ]
911 for x in not_decimal_digits:
912 self.assertIsNone(re.match('^\d$', x))
913
Guido van Rossumd8faa362007-04-27 19:54:29 +0000914 def test_empty_array(self):
915 # SF buf 1647541
916 import array
Guido van Rossum166746c2007-07-03 15:39:16 +0000917 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +0000918 a = array.array(typecode)
Antoine Pitroufd036452008-08-19 17:56:33 +0000919 self.assertEqual(re.compile(b"bla").match(a), None)
920 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000921
Christian Heimes072c0f12008-01-03 23:01:04 +0000922 def test_inline_flags(self):
923 # Bug #1700
Christian Heimes2e1d0f02008-01-04 00:47:51 +0000924 upper_char = chr(0x1ea0) # Latin Capital Letter A with Dot Bellow
925 lower_char = chr(0x1ea1) # Latin Small Letter A with Dot Bellow
Christian Heimes072c0f12008-01-03 23:01:04 +0000926
927 p = re.compile(upper_char, re.I | re.U)
928 q = p.match(lower_char)
929 self.assertNotEqual(q, None)
930
931 p = re.compile(lower_char, re.I | re.U)
932 q = p.match(upper_char)
933 self.assertNotEqual(q, None)
934
935 p = re.compile('(?i)' + upper_char, re.U)
936 q = p.match(lower_char)
937 self.assertNotEqual(q, None)
938
939 p = re.compile('(?i)' + lower_char, re.U)
940 q = p.match(upper_char)
941 self.assertNotEqual(q, None)
942
943 p = re.compile('(?iu)' + upper_char)
944 q = p.match(lower_char)
945 self.assertNotEqual(q, None)
946
947 p = re.compile('(?iu)' + lower_char)
948 q = p.match(upper_char)
949 self.assertNotEqual(q, None)
950
Christian Heimes25bb7832008-01-11 16:17:00 +0000951 def test_dollar_matches_twice(self):
952 "$ matches the end of string, and just before the terminating \n"
953 pattern = re.compile('$')
954 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
955 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
956 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
957
958 pattern = re.compile('$', re.MULTILINE)
959 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
960 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
961 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
962
Antoine Pitroufd036452008-08-19 17:56:33 +0000963 def test_bytes_str_mixing(self):
964 # Mixing str and bytes is disallowed
965 pat = re.compile('.')
966 bpat = re.compile(b'.')
967 self.assertRaises(TypeError, pat.match, b'b')
968 self.assertRaises(TypeError, bpat.match, 'b')
969 self.assertRaises(TypeError, pat.sub, b'b', 'c')
970 self.assertRaises(TypeError, pat.sub, 'b', b'c')
971 self.assertRaises(TypeError, pat.sub, b'b', b'c')
972 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
973 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
974 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
975
976 def test_ascii_and_unicode_flag(self):
977 # String patterns
978 for flags in (0, re.UNICODE):
979 pat = re.compile('\xc0', flags | re.IGNORECASE)
980 self.assertNotEqual(pat.match('\xe0'), None)
981 pat = re.compile('\w', flags)
982 self.assertNotEqual(pat.match('\xe0'), None)
983 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
984 self.assertEqual(pat.match('\xe0'), None)
985 pat = re.compile('(?a)\xc0', re.IGNORECASE)
986 self.assertEqual(pat.match('\xe0'), None)
987 pat = re.compile('\w', re.ASCII)
988 self.assertEqual(pat.match('\xe0'), None)
989 pat = re.compile('(?a)\w')
990 self.assertEqual(pat.match('\xe0'), None)
991 # Bytes patterns
992 for flags in (0, re.ASCII):
993 pat = re.compile(b'\xc0', re.IGNORECASE)
994 self.assertEqual(pat.match(b'\xe0'), None)
995 pat = re.compile(b'\w')
996 self.assertEqual(pat.match(b'\xe0'), None)
997 # Incompatibilities
998 self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
999 self.assertRaises(ValueError, re.compile, b'(?u)\w')
1000 self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
1001 self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
1002 self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
1003 self.assertRaises(ValueError, re.compile, '(?au)\w')
1004
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001005 def test_bug_6509(self):
1006 # Replacement strings of both types must parse properly.
1007 # all strings
1008 pat = re.compile('a(\w)')
1009 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1010 pat = re.compile('a(.)')
1011 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1012 pat = re.compile('..')
1013 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1014
1015 # all bytes
1016 pat = re.compile(b'a(\w)')
1017 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1018 pat = re.compile(b'a(.)')
1019 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1020 pat = re.compile(b'..')
1021 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1022
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001023 def test_dealloc(self):
1024 # issue 3299: check for segfault in debug build
1025 import _sre
Ezio Melotti0f77f462010-01-23 10:49:39 +00001026 # the overflow limit is different on wide and narrow builds and it
1027 # depends on the definition of SRE_CODE (see sre.h).
1028 # 2**128 should be big enough to overflow on both. For smaller values
1029 # a RuntimeError is raised instead of OverflowError.
1030 long_overflow = 2**128
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001031 self.assertRaises(TypeError, re.finditer, "a", {})
1032 self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
Victor Stinner5abeafb2010-03-04 21:59:53 +00001033 self.assertRaises(TypeError, _sre.compile, {}, 0, [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001035 def test_search_dot_unicode(self):
1036 self.assertIsNotNone(re.search("123.*-", '123abc-'))
1037 self.assertIsNotNone(re.search("123.*-", '123\xe9-'))
1038 self.assertIsNotNone(re.search("123.*-", '123\u20ac-'))
1039 self.assertIsNotNone(re.search("123.*-", '123\U0010ffff-'))
1040 self.assertIsNotNone(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
1041
Ezio Melottidf723e12012-03-13 01:29:48 +02001042 def test_compile(self):
1043 # Test return value when given string and pattern as parameter
1044 pattern = re.compile('random pattern')
1045 self.assertIsInstance(pattern, re._pattern_type)
1046 same_pattern = re.compile(pattern)
1047 self.assertIsInstance(same_pattern, re._pattern_type)
1048 self.assertIs(same_pattern, pattern)
1049 # Test behaviour when not given a string or pattern as parameter
1050 self.assertRaises(TypeError, re.compile, 0)
1051
Ezio Melottife8e6e72013-01-11 08:32:01 +02001052 def test_bug_13899(self):
1053 # Issue #13899: re pattern r"[\A]" should work like "A" but matches
1054 # nothing. Ditto B and Z.
1055 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
1056 ['A', 'B', '\b', 'C', 'Z'])
1057
Antoine Pitroub33941a2012-12-03 20:55:56 +01001058 @bigmemtest(size=_2G, memuse=1)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001059 def test_large_search(self, size):
1060 # Issue #10182: indices were 32-bit-truncated.
1061 s = 'a' * size
1062 m = re.search('$', s)
1063 self.assertIsNotNone(m)
Antoine Pitrou86067c22012-12-03 21:08:43 +01001064 self.assertEqual(m.start(), size)
1065 self.assertEqual(m.end(), size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001066
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001067 # The huge memuse is because of re.sub() using a list and a join()
1068 # to create the replacement result.
Antoine Pitroub33941a2012-12-03 20:55:56 +01001069 @bigmemtest(size=_2G, memuse=16 + 2)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001070 def test_large_subn(self, size):
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001071 # Issue #10182: indices were 32-bit-truncated.
1072 s = 'a' * size
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001073 r, n = re.subn('', '', s)
1074 self.assertEqual(r, s)
1075 self.assertEqual(n, size + 1)
1076
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001077 def test_bug_16688(self):
1078 # Issue 16688: Backreferences make case-insensitive regex fail on
1079 # non-ASCII strings.
1080 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1081 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001082
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001083 def test_repeat_minmax_overflow(self):
1084 # Issue #13169
1085 string = "x" * 100000
1086 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1087 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1088 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1089 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1090 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1091 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1092 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1093 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1094 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1095 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1096 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1097
1098 @cpython_only
1099 def test_repeat_minmax_overflow_maxrepeat(self):
1100 try:
1101 from _sre import MAXREPEAT
1102 except ImportError:
1103 self.skipTest('requires _sre.MAXREPEAT constant')
1104 string = "x" * 100000
1105 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1106 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1107 (0, 100000))
1108 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1109 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1110 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1111 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1112
R David Murray26dfaac92013-04-14 13:00:54 -04001113 def test_backref_group_name_in_exception(self):
1114 # Issue 17341: Poor error message when compiling invalid regex
1115 with self.assertRaisesRegex(sre_constants.error, '<foo>'):
1116 re.compile('(?P=<foo>)')
1117
1118 def test_group_name_in_exception(self):
1119 # Issue 17341: Poor error message when compiling invalid regex
1120 with self.assertRaisesRegex(sre_constants.error, '\?foo'):
1121 re.compile('(?P<?foo>)')
1122
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001123 def test_issue17998(self):
1124 for reps in '*', '+', '?', '{1}':
1125 for mod in '', '?':
1126 pattern = '.' + reps + mod + 'yz'
1127 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1128 ['xyz'], msg=pattern)
1129 pattern = pattern.encode()
1130 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1131 [b'xyz'], msg=pattern)
1132
def test_match_repr(self):
    # repr() of a match object shows its type, span and matched text,
    # for str-like and bytes-like subjects alike.
    for string in '[abracadabra]', S('[abracadabra]'):
        m = re.search(r'(.+)(.*?)\1', string)
        self.assertEqual(repr(m), "<%s.%s object; "
                         "span=(1, 12), match='abracadabra'>" %
                         (type(m).__module__, type(m).__qualname__))
    for string in (b'[abracadabra]', B(b'[abracadabra]'),
                   bytearray(b'[abracadabra]'),
                   memoryview(b'[abracadabra]')):
        m = re.search(rb'(.+)(.*?)\1', string)
        self.assertEqual(repr(m), "<%s.%s object; "
                         "span=(1, 12), match=b'abracadabra'>" %
                         (type(m).__module__, type(m).__qualname__))

    first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
    # Each expected repr is built from the type of the object under test.
    self.assertEqual(repr(first), "<%s.%s object; "
                     "span=(0, 2), match='aa'>" %
                     (type(first).__module__, type(first).__qualname__))
    self.assertEqual(repr(second), "<%s.%s object; "
                     "span=(3, 5), match='bb'>" %
                     (type(second).__module__, type(second).__qualname__))
1154
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001155
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001156 def test_bug_2537(self):
1157 # issue 2537: empty submatches
1158 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1159 for inner_op in ('{0,}', '*', '?'):
1160 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1161 m = r.match("xyyzy")
1162 self.assertEqual(m.group(0), "xyy")
1163 self.assertEqual(m.group(1), "")
1164 self.assertEqual(m.group(2), "y")
1165
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001166
class ImplementationTest(unittest.TestCase):
    """
    Test implementation details of the re module.
    """

    def test_overlap_table(self):
        # Each input is paired with the table the private helper
        # sre_compile._generate_overlap_table() must return for it.
        expected_tables = (
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        )
        build_table = sre_compile._generate_overlap_table
        for prefix, table in expected_tables:
            self.assertEqual(build_table(prefix), table)
1180
1181
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001182def run_re_tests():
Georg Brandl1b37e872010-03-14 10:45:50 +00001183 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001184 if verbose:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001185 print('Running re_tests test suite')
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001186 else:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001187 # To save time, only run the first and last 10 tests
1188 #tests = tests[:10] + tests[-10:]
1189 pass
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001190
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001191 for t in tests:
1192 sys.stdout.flush()
1193 pattern = s = outcome = repl = expected = None
1194 if len(t) == 5:
1195 pattern, s, outcome, repl, expected = t
1196 elif len(t) == 3:
1197 pattern, s, outcome = t
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001198 else:
Collin Winter3add4d72007-08-29 23:37:32 +00001199 raise ValueError('Test tuples should have 3 or 5 fields', t)
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001200
Guido van Rossum41360a41998-03-26 19:42:58 +00001201 try:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001202 obj = re.compile(pattern)
1203 except re.error:
1204 if outcome == SYNTAX_ERROR: pass # Expected a syntax error
Guido van Rossum41360a41998-03-26 19:42:58 +00001205 else:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001206 print('=== Syntax error:', t)
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001207 except KeyboardInterrupt: raise KeyboardInterrupt
1208 except:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001209 print('*** Unexpected error ***', t)
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001210 if verbose:
1211 traceback.print_exc(file=sys.stdout)
1212 else:
Fredrik Lundh17741be2001-03-22 15:51:28 +00001213 try:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001214 result = obj.search(s)
Guido van Rossumb940e112007-01-10 16:19:56 +00001215 except re.error as msg:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001216 print('=== Unexpected exception', t, repr(msg))
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001217 if outcome == SYNTAX_ERROR:
1218 # This should have been a syntax error; forget it.
1219 pass
1220 elif outcome == FAIL:
1221 if result is None: pass # No match, as expected
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001222 else: print('=== Succeeded incorrectly', t)
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001223 elif outcome == SUCCEED:
1224 if result is not None:
1225 # Matched, as expected, so now we compute the
1226 # result string and compare it to our expected result.
1227 start, end = result.span(0)
1228 vardict={'found': result.group(0),
1229 'groups': result.group(),
1230 'flags': result.re.flags}
1231 for i in range(1, 100):
1232 try:
1233 gi = result.group(i)
1234 # Special hack because else the string concat fails:
1235 if gi is None:
1236 gi = "None"
1237 except IndexError:
1238 gi = "Error"
1239 vardict['g%d' % i] = gi
1240 for i in result.re.groupindex.keys():
1241 try:
1242 gi = result.group(i)
1243 if gi is None:
1244 gi = "None"
1245 except IndexError:
1246 gi = "Error"
1247 vardict[i] = gi
1248 repl = eval(repl, vardict)
1249 if repl != expected:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001250 print('=== grouping error', t, end=' ')
1251 print(repr(repl) + ' should be ' + repr(expected))
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001252 else:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001253 print('=== Failed incorrectly', t)
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001254
Antoine Pitrou22628c42008-07-22 17:53:22 +00001255 # Try the match with both pattern and string converted to
1256 # bytes, and check that it still succeeds.
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001257 try:
Antoine Pitrou22628c42008-07-22 17:53:22 +00001258 bpat = bytes(pattern, "ascii")
1259 bs = bytes(s, "ascii")
1260 except UnicodeEncodeError:
1261 # skip non-ascii tests
1262 pass
1263 else:
1264 try:
1265 bpat = re.compile(bpat)
1266 except Exception:
1267 print('=== Fails on bytes pattern compile', t)
1268 if verbose:
1269 traceback.print_exc(file=sys.stdout)
1270 else:
1271 bytes_result = bpat.search(bs)
1272 if bytes_result is None:
1273 print('=== Fails on bytes pattern match', t)
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001274
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001275 # Try the match with the search area limited to the extent
1276 # of the match and see if it still succeeds. \B will
1277 # break (because it won't match at the end or start of a
1278 # string), so we'll ignore patterns that feature it.
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001279
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001280 if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \
1281 and result is not None:
1282 obj = re.compile(pattern)
1283 result = obj.search(s, result.start(0), result.end(0) + 1)
1284 if result is None:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001285 print('=== Failed on range-limited match', t)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001286
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001287 # Try the match with IGNORECASE enabled, and check that it
1288 # still succeeds.
1289 obj = re.compile(pattern, re.IGNORECASE)
1290 result = obj.search(s)
Fred Drake132dce22000-12-12 23:11:42 +00001291 if result is None:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001292 print('=== Fails on case-insensitive match', t)
Guido van Rossumdfa67901997-12-08 17:12:06 +00001293
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001294 # Try the match with LOCALE enabled, and check that it
1295 # still succeeds.
Antoine Pitrou22628c42008-07-22 17:53:22 +00001296 if '(?u)' not in pattern:
1297 obj = re.compile(pattern, re.LOCALE)
1298 result = obj.search(s)
1299 if result is None:
1300 print('=== Fails on locale-sensitive match', t)
Guido van Rossumdfa67901997-12-08 17:12:06 +00001301
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001302 # Try the match with UNICODE locale enabled, and check
1303 # that it still succeeds.
1304 obj = re.compile(pattern, re.UNICODE)
1305 result = obj.search(s)
1306 if result is None:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001307 print('=== Fails on unicode-sensitive match', t)
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001308
Gregory P. Smith5a631832010-07-27 05:31:29 +00001309
def test_main():
    """Entry point for the regression-test driver.

    Runs the unittest-based test cases defined in this module first,
    then the print-based pattern checks performed by run_re_tests().
    """
    # unittest-discovered cases for this module.
    run_unittest(__name__)
    # Legacy checks that report mismatches by printing '===' lines.
    run_re_tests()
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001313
# Run the whole suite when this file is executed directly as a script;
# importing the module does not trigger a test run.
if __name__ == "__main__":
    test_main()