blob: 20b1a1488dc00824b413057dc29c84b680a3fd3f [file] [log] [blame]
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
2 cpython_only
Benjamin Petersone48944b2012-03-07 14:50:25 -06003import io
Guido van Rossum8e0ce301997-07-11 19:34:44 +00004import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00005from re import Scanner
R David Murray26dfaac92013-04-14 13:00:54 -04006import sre_constants
Ezio Melottid2114eb2011-03-25 14:08:44 +02007import sys
8import string
9import traceback
Raymond Hettinger027bb632004-05-31 03:09:25 +000010from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000011
# Misc tests from Tim Peters' re.doc
13
# WARNING: Don't change details in these tests if you don't know
# what you're doing. Some of these tests were carefully modeled to
# cover most of the code.
17
Skip Montanaro8ed06da2003-04-24 19:43:18 +000018import unittest
Guido van Rossum8430c581998-04-03 21:47:12 +000019
class S(str):
    # str subclass whose item access wraps the result in S again, so the
    # tests can feed the re functions a str subclass as input.
    def __getitem__(self, index):
        return S(super().__getitem__(index))
23
class B(bytes):
    # bytes subclass whose item access wraps the result in B again, so the
    # tests can feed the re functions a bytes subclass as input.
    def __getitem__(self, index):
        return B(super().__getitem__(index))
27
Skip Montanaro8ed06da2003-04-24 19:43:18 +000028class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000029
def assertTypedEqual(self, actual, expect, msg=None):
    # Assert that actual == expect and, descending through any tuples or
    # lists found in expect, that every leaf of actual has exactly the
    # same type as the corresponding leaf of expect.
    self.assertEqual(actual, expect, msg)

    def recurse(actual, expect):
        if isinstance(expect, (tuple, list)):
            for x, y in zip(actual, expect):
                recurse(x, y)
        else:
            self.assertIs(type(actual), type(expect), msg)

    recurse(actual, expect)
39
Benjamin Petersone48944b2012-03-07 14:50:25 -060040 def test_keep_buffer(self):
41 # See bug 14212
42 b = bytearray(b'x')
43 it = re.finditer(b'a', b)
44 with self.assertRaises(BufferError):
45 b.extend(b'x'*400)
46 list(it)
47 del it
48 gc_collect()
49 b.extend(b'x'*400)
50
Raymond Hettinger027bb632004-05-31 03:09:25 +000051 def test_weakref(self):
52 s = 'QabbbcR'
53 x = re.compile('ab+c')
54 y = proxy(x)
55 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
56
Skip Montanaro8ed06da2003-04-24 19:43:18 +000057 def test_search_star_plus(self):
58 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
59 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
60 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
61 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Skip Montanaro5ba00542003-04-25 16:00:14 +000062 self.assertEqual(re.search('x', 'aaa'), None)
Skip Montanaro8ed06da2003-04-24 19:43:18 +000063 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
64 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
65 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
66 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Skip Montanaro5ba00542003-04-25 16:00:14 +000067 self.assertEqual(re.match('a+', 'xxx'), None)
Guido van Rossum8430c581998-04-03 21:47:12 +000068
def bump_num(self, matchobj):
    # Replacement callback: read the whole match as an integer and return
    # that integer plus one, as a string.
    int_value = int(matchobj.group(0))
    return str(int_value + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000072
def test_basic_re_sub(self):
    # The result of re.sub() is compared by value and by exact type for
    # str-like and bytes-like replacement/subject arguments.
    self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
    self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
    self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
    self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')

    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                     '9.3 -3 23x99y')

    # A callable's return value is inserted as is; a string template has
    # its escapes processed.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    # Raw strings: '\g' is not a valid Python string escape.
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
                     '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))

    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000107
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000108 def test_bug_449964(self):
109 # fails for group followed by other escape
110 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
111 'xx\bxx\b')
112
113 def test_bug_449000(self):
114 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000115 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
116 'abc\ndef\n')
117 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
118 'abc\ndef\n')
119 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
120 'abc\ndef\n')
121 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
122 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000123
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000124 def test_bug_1661(self):
125 # Verify that flags do not get silently ignored with compiled patterns
126 pattern = re.compile('.')
127 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
128 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
129 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
130 self.assertRaises(ValueError, re.compile, pattern, re.I)
131
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000132 def test_bug_3629(self):
133 # A regex that triggered a bug in the sre-code validator
134 re.compile("(?P<quote>)(?(quote))")
135
def test_sub_template_numeric_escape(self):
    # bug 776311 and friends
    # Numeric escapes in a replacement template: which forms are read as
    # octal character escapes and which as group references.
    self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
    self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
    self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
    self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
    self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
    self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
    self.assertEqual(re.sub('x', r'\117', 'x'), '\117')

    self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
    self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')

    self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
    self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
    self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
    self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
    self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')

    self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
    self.assertEqual(re.sub('x', r'\777', 'x'), '\377')

    # The pattern 'x' has no groups, so anything read as a group
    # reference must raise re.error.
    self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
    self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
    self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'

    # in python2.3 (etc), these loop endlessly in sre_parser.py
    self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
    self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
                     'xz8')
    self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
                     'xza')
177
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000178 def test_qualified_re_sub(self):
179 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
180 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000181
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000182 def test_bug_114660(self):
183 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
184 'hello there')
185
def test_bug_462270(self):
    # Test for empty sub() behaviour, see SF bug #462270
    self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
    self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
190
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200191 def test_symbolic_groups(self):
192 re.compile('(?P<a>x)(?P=a)(?(a)y)')
193 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
194 self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
195 self.assertRaises(re.error, re.compile, '(?Px)')
196 self.assertRaises(re.error, re.compile, '(?P=)')
197 self.assertRaises(re.error, re.compile, '(?P=1)')
198 self.assertRaises(re.error, re.compile, '(?P=a)')
199 self.assertRaises(re.error, re.compile, '(?P=a1)')
200 self.assertRaises(re.error, re.compile, '(?P=a.)')
201 self.assertRaises(re.error, re.compile, '(?P<)')
202 self.assertRaises(re.error, re.compile, '(?P<>)')
203 self.assertRaises(re.error, re.compile, '(?P<1>)')
204 self.assertRaises(re.error, re.compile, '(?P<a.>)')
205 self.assertRaises(re.error, re.compile, '(?())')
206 self.assertRaises(re.error, re.compile, '(?(a))')
207 self.assertRaises(re.error, re.compile, '(?(1a))')
208 self.assertRaises(re.error, re.compile, '(?(a.))')
Georg Brandl1d472b72013-04-14 11:40:00 +0200209 # New valid/invalid identifiers in Python 3
210 re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
211 re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
212 self.assertRaises(re.error, re.compile, '(?P<©>x)')
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200213
def test_symbolic_refs(self):
    # Malformed or unresolvable \g<...> references in a replacement
    # template. Templates are raw strings because '\g' is not a valid
    # Python string escape.
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<a', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<a a>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<1a1>', 'xx')
    self.assertRaises(IndexError, re.sub, '(?P<a>x)', r'\g<ab>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<-1>', 'xx')
    # New valid/invalid identifiers in Python 3
    self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
    self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000229
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000230 def test_re_subn(self):
231 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
232 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
233 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
234 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
235 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000236
def test_re_split(self):
    # split() results are compared by value and by exact element type
    # for str-like and bytes-like subjects.
    # The loop variable is named 'text' so it does not shadow the
    # 'string' module imported by this file.
    for text in ":a:b::c", S(":a:b::c"):
        self.assertTypedEqual(re.split(":", text),
                              ['', 'a', 'b', '', 'c'])
        self.assertTypedEqual(re.split(":*", text),
                              ['', 'a', 'b', 'c'])
        self.assertTypedEqual(re.split("(:*)", text),
                              ['', ':', 'a', ':', 'b', '::', 'c'])
    for data in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                 memoryview(b":a:b::c")):
        self.assertTypedEqual(re.split(b":", data),
                              [b'', b'a', b'b', b'', b'c'])
        self.assertTypedEqual(re.split(b":*", data),
                              [b'', b'a', b'b', b'c'])
        self.assertTypedEqual(re.split(b"(:*)", data),
                              [b'', b':', b'a', b':', b'b', b'::', b'c'])

    self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
    self.assertEqual(re.split("(:)*", ":a:b::c"),
                     ['', ':', 'a', ':', 'b', ':', 'c'])
    self.assertEqual(re.split("([b:]+)", ":a:b::c"),
                     ['', ':', 'a', ':b::', 'c'])
    self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
                     ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c'])
    self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
                     ['', 'a', '', '', 'c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000264
def test_qualified_re_split(self):
    # split() with an explicit maxsplit of 2.
    self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
    self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
    self.assertEqual(re.split("(:)", ":a:b::c", 2),
                     ['', ':', 'a', ':', 'b::c'])
    self.assertEqual(re.split("(:*)", ":a:b::c", 2),
                     ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000272
def test_re_findall(self):
    # findall() results are compared by value and by exact element type
    # for str-like and bytes-like subjects.
    # The loop variable is named 'text' so it does not shadow the
    # 'string' module imported by this file.
    self.assertEqual(re.findall(":+", "abc"), [])
    for text in "a:b::c:::d", S("a:b::c:::d"):
        self.assertTypedEqual(re.findall(":+", text),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:+)", text),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:)(:*)", text),
                              [(":", ""), (":", ":"), (":", "::")])
    for data in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
                 memoryview(b"a:b::c:::d")):
        self.assertTypedEqual(re.findall(b":+", data),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:+)", data),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:)(:*)", data),
                              [(b":", b""), (b":", b":"), (b":", b"::")])
Guido van Rossum49946571997-07-18 04:26:25 +0000290
Skip Montanaro5ba00542003-04-25 16:00:14 +0000291 def test_bug_117612(self):
292 self.assertEqual(re.findall(r"(a|(b))", "aba"),
293 [("a", ""),("b", "b"),("a", "")])
294
def test_re_match(self):
    # groups()/group() on match objects for str-like and bytes-like
    # subjects, then for optional, single and named groups.
    # The loop variable is named 'text' so it does not shadow the
    # 'string' module imported by this file.
    for text in 'a', S('a'):
        self.assertEqual(re.match('a', text).groups(), ())
        self.assertEqual(re.match('(a)', text).groups(), ('a',))
        self.assertEqual(re.match('(a)', text).group(0), 'a')
        self.assertEqual(re.match('(a)', text).group(1), 'a')
        self.assertEqual(re.match('(a)', text).group(1, 1), ('a', 'a'))
    for data in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
        self.assertEqual(re.match(b'a', data).groups(), ())
        self.assertEqual(re.match(b'(a)', data).groups(), (b'a',))
        self.assertEqual(re.match(b'(a)', data).group(0), b'a')
        self.assertEqual(re.match(b'(a)', data).group(1), b'a')
        self.assertEqual(re.match(b'(a)', data).group(1, 1), (b'a', b'a'))

    pat = re.compile('((a)|(b))(c)?')
    self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
    self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
    self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
    self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
    self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

    # A single group
    m = re.match('(a)', 'a')
    self.assertEqual(m.group(0), 'a')
    self.assertEqual(m.group(0), 'a')
    self.assertEqual(m.group(1), 'a')
    self.assertEqual(m.group(1, 1), ('a', 'a'))

    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                     (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000328
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000329 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000330 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
331 ('(', 'a'))
332 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
333 (None, 'a'))
334 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'), None)
335 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a'), None)
336 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
337 ('a', 'b'))
338 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
339 (None, 'd'))
340 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
341 (None, 'd'))
342 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
343 ('a', ''))
344
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000345 # Tests for bug #1177831: exercise groups other than the first group
346 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
347 self.assertEqual(p.match('abc').groups(),
348 ('a', 'b', 'c'))
349 self.assertEqual(p.match('ad').groups(),
350 ('a', None, 'd'))
351 self.assertEqual(p.match('abd'), None)
352 self.assertEqual(p.match('ac'), None)
353
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000354
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000355 def test_re_groupref(self):
356 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
357 ('|', 'a'))
358 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
359 (None, 'a'))
360 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', 'a|'), None)
361 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a'), None)
362 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
363 ('a', 'a'))
364 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
365 (None, None))
366
367 def test_groupdict(self):
368 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
369 'first second').groupdict(),
370 {'first':'first', 'second':'second'})
371
372 def test_expand(self):
373 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
374 "first second")
375 .expand(r"\2 \1 \g<second> \g<first>"),
376 "second first second first")
377
378 def test_repeat_minmax(self):
379 self.assertEqual(re.match("^(\w){1}$", "abc"), None)
380 self.assertEqual(re.match("^(\w){1}?$", "abc"), None)
381 self.assertEqual(re.match("^(\w){1,2}$", "abc"), None)
382 self.assertEqual(re.match("^(\w){1,2}?$", "abc"), None)
383
384 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
385 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
386 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
387 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
388 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
389 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
390 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
391 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
392
393 self.assertEqual(re.match("^x{1}$", "xxx"), None)
394 self.assertEqual(re.match("^x{1}?$", "xxx"), None)
395 self.assertEqual(re.match("^x{1,2}$", "xxx"), None)
396 self.assertEqual(re.match("^x{1,2}?$", "xxx"), None)
397
398 self.assertNotEqual(re.match("^x{3}$", "xxx"), None)
399 self.assertNotEqual(re.match("^x{1,3}$", "xxx"), None)
400 self.assertNotEqual(re.match("^x{1,4}$", "xxx"), None)
401 self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None)
402 self.assertNotEqual(re.match("^x{3}?$", "xxx"), None)
403 self.assertNotEqual(re.match("^x{1,3}?$", "xxx"), None)
404 self.assertNotEqual(re.match("^x{1,4}?$", "xxx"), None)
405 self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None)
406
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000407 self.assertEqual(re.match("^x{}$", "xxx"), None)
408 self.assertNotEqual(re.match("^x{}$", "x{}"), None)
409
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000410 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000411 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000412 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000413 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
414 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
415 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
416 {'first': 1, 'other': 2})
417
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000418 self.assertEqual(re.match("(a)", "a").pos, 0)
419 self.assertEqual(re.match("(a)", "a").endpos, 1)
420 self.assertEqual(re.match("(a)", "a").string, "a")
421 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
422 self.assertNotEqual(re.match("(a)", "a").re, None)
423
def test_special_escapes(self):
    # \b, \B, \A, \Z and the character-class escapes, with no flag and
    # with the LOCALE, UNICODE and MULTILINE flags.
    self.assertEqual(re.search(r"\b(b.)\b",
                               "abcd abc bcd bx").group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B",
                               "abc bcd bc abxd").group(1), "bx")
    self.assertEqual(re.search(r"\b(b.)\b",
                               "abcd abc bcd bx", re.LOCALE).group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B",
                               "abc bcd bc abxd", re.LOCALE).group(1), "bx")
    self.assertEqual(re.search(r"\b(b.)\b",
                               "abcd abc bcd bx", re.UNICODE).group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B",
                               "abc bcd bc abxd", re.UNICODE).group(1), "bx")
    self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
    self.assertEqual(re.search(r"\b(b.)\b",
                               "abcd abc bcd bx").group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B",
                               "abc bcd bc abxd").group(1), "bx")
    self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
    self.assertEqual(re.search(r"\d\D\w\W\s\S",
                               "1aa! a").group(0), "1aa! a")
    self.assertEqual(re.search(r"\d\D\w\W\s\S",
                               "1aa! a", re.LOCALE).group(0), "1aa! a")
    self.assertEqual(re.search(r"\d\D\w\W\s\S",
                               "1aa! a", re.UNICODE).group(0), "1aa! a")
453
Ezio Melotti5a045b92012-02-29 11:48:44 +0200454 def test_string_boundaries(self):
455 # See http://bugs.python.org/issue10713
456 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
457 "abc")
458 # There's a word boundary at the start of a string.
459 self.assertTrue(re.match(r"\b", "abc"))
460 # A non-empty string includes a non-boundary zero-length match.
461 self.assertTrue(re.search(r"\B", "abc"))
462 # There is no non-boundary match at the start of a string.
463 self.assertFalse(re.match(r"\B", "abc"))
464 # However, an empty string contains no word boundaries, and also no
465 # non-boundaries.
466 self.assertEqual(re.search(r"\B", ""), None)
467 # This one is questionable and different from the perlre behaviour,
468 # but describes current behavior.
469 self.assertEqual(re.search(r"\b", ""), None)
470 # A single word-character string has two boundaries, but no
471 # non-boundary gaps.
472 self.assertEqual(len(re.findall(r"\b", "a")), 2)
473 self.assertEqual(len(re.findall(r"\B", "a")), 0)
474 # If there are no words, there are no boundaries
475 self.assertEqual(len(re.findall(r"\b", " ")), 0)
476 self.assertEqual(len(re.findall(r"\b", " ")), 0)
477 # Can match around the whitespace.
478 self.assertEqual(len(re.findall(r"\B", " ")), 2)
479
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000480 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000481 self.assertEqual(re.match("([\u2222\u2223])",
482 "\u2222").group(1), "\u2222")
483 self.assertEqual(re.match("([\u2222\u2223])",
484 "\u2222", re.UNICODE).group(1), "\u2222")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000485
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100486 def test_big_codesize(self):
487 # Issue #1160
488 r = re.compile('|'.join(('%d'%x for x in range(10000))))
489 self.assertIsNotNone(r.match('1000'))
490 self.assertIsNotNone(r.match('9999'))
491
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000492 def test_anyall(self):
493 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
494 "a\nb")
495 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
496 "a\n\nb")
497
498 def test_non_consuming(self):
499 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
500 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
501 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
502 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
503 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
504 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
505 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
506
507 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
508 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
509 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
510 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
511
512 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000513 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
514 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000515 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
516 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
517 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
518 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
519 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
520 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
521 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
522 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
523
524 def test_category(self):
525 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
526
def test_getlower(self):
    # _sre.getlower() must map 'A' to 'a' with no flag and with the
    # LOCALE and UNICODE flags.
    import _sre
    self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
    self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
    self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))

    self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
    self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000535
536 def test_not_literal(self):
537 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
538 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
539
540 def test_search_coverage(self):
541 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
542 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
543
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    # Assert that matcher(pattern, text) succeeds and that the match has
    # the expected text and span.
    #
    # match and span must be given together. When both are omitted the
    # pattern is expected to match the whole of text.
    # Raises ValueError when only one of match/span is supplied.
    if match is None and span is None:
        # the pattern matches the whole text
        match = text
        span = (0, len(text))
    elif match is None or span is None:
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    m = matcher(pattern, text)
    self.assertTrue(m)
    self.assertEqual(m.group(), match)
    self.assertEqual(m.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000557
def test_re_escape(self):
    # For each of the first 256 code points: ASCII letters, digits and
    # '_' are expected unchanged, NUL is expected as '\000', and every
    # other character is expected with a backslash in front. Each
    # escaped form must also match the original text.
    alnum_chars = string.ascii_letters + string.digits + '_'
    p = ''.join(chr(i) for i in range(256))
    for c in p:
        if c in alnum_chars:
            self.assertEqual(re.escape(c), c)
        elif c == '\x00':
            self.assertEqual(re.escape(c), '\\000')
        else:
            self.assertEqual(re.escape(c), '\\' + c)
        self.assertMatch(re.escape(c), c)
    self.assertMatch(re.escape(p), p)
Guido van Rossum49946571997-07-18 04:26:25 +0000570
def test_re_escape_byte(self):
    # Bytes counterpart of the str escape test: every byte value 0..255
    # is escaped on its own, and then all of them together.
    alnum_chars = (string.ascii_letters + string.digits + '_').encode('ascii')
    p = bytes(range(256))
    for i in p:
        b = bytes([i])
        if b in alnum_chars:
            self.assertEqual(re.escape(b), b)
        elif i == 0:
            self.assertEqual(re.escape(b), b'\\000')
        else:
            self.assertEqual(re.escape(b), b'\\' + b)
        self.assertMatch(re.escape(b), b)
    self.assertMatch(re.escape(p), p)
Guido van Rossum698280d2008-09-10 17:44:35 +0000584
def test_re_escape_non_ascii(self):
    # Each non-ASCII character is expected with a backslash in front,
    # and the escaped text must still match the original.
    s = 'xxx\u2620\u2620\u2620xxx'
    s_escaped = re.escape(s)
    self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
    self.assertMatch(s_escaped, s)
    self.assertMatch('.%s+.' % re.escape('\u2620'), s,
                     'x\u2620\u2620\u2620x', (2, 7), re.search)
592
def test_re_escape_non_ascii_bytes(self):
    # In UTF-8 encoded input, each non-ASCII byte is expected with a
    # backslash in front, and the escaped bytes must still match.
    b = 'y\u2620y\u2620y'.encode('utf-8')
    b_escaped = re.escape(b)
    self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
    self.assertMatch(b_escaped, b)
    res = re.findall(re.escape('\u2620'.encode('utf-8')), b)
    self.assertEqual(len(res), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000600
def pickle_test(self, pickle):
    # Helper: round-trip a compiled pattern through the given pickle
    # module (anything providing dumps/loads) and check that the result
    # compares equal to the original.
    oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
    s = pickle.dumps(oldpat)
    newpat = pickle.loads(s)
    self.assertEqual(oldpat, newpat)
Guido van Rossum23b22571997-07-17 22:36:14 +0000606
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000607 def test_constants(self):
608 self.assertEqual(re.I, re.IGNORECASE)
609 self.assertEqual(re.L, re.LOCALE)
610 self.assertEqual(re.M, re.MULTILINE)
611 self.assertEqual(re.S, re.DOTALL)
612 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000613
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000614 def test_flags(self):
Skip Montanaro1e703c62003-04-25 15:40:28 +0000615 for flag in [re.I, re.M, re.X, re.S, re.L]:
616 self.assertNotEqual(re.compile('^pattern$', flag), None)
Guido van Rossumf473cb01998-01-14 16:42:17 +0000617
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000618 def test_sre_character_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200619 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
620 if i < 256:
621 self.assertIsNotNone(re.match(r"\%03o" % i, chr(i)))
622 self.assertIsNotNone(re.match(r"\%03o0" % i, chr(i)+"0"))
623 self.assertIsNotNone(re.match(r"\%03o8" % i, chr(i)+"8"))
624 self.assertIsNotNone(re.match(r"\x%02x" % i, chr(i)))
625 self.assertIsNotNone(re.match(r"\x%02x0" % i, chr(i)+"0"))
626 self.assertIsNotNone(re.match(r"\x%02xz" % i, chr(i)+"z"))
627 if i < 0x10000:
628 self.assertIsNotNone(re.match(r"\u%04x" % i, chr(i)))
629 self.assertIsNotNone(re.match(r"\u%04x0" % i, chr(i)+"0"))
630 self.assertIsNotNone(re.match(r"\u%04xz" % i, chr(i)+"z"))
631 self.assertIsNotNone(re.match(r"\U%08x" % i, chr(i)))
632 self.assertIsNotNone(re.match(r"\U%08x0" % i, chr(i)+"0"))
633 self.assertIsNotNone(re.match(r"\U%08xz" % i, chr(i)+"z"))
634 self.assertIsNotNone(re.match(r"\0", "\000"))
635 self.assertIsNotNone(re.match(r"\08", "\0008"))
636 self.assertIsNotNone(re.match(r"\01", "\001"))
637 self.assertIsNotNone(re.match(r"\018", "\0018"))
638 self.assertIsNotNone(re.match(r"\567", chr(0o167)))
639 self.assertRaises(re.error, re.match, r"\911", "")
640 self.assertRaises(re.error, re.match, r"\x1", "")
641 self.assertRaises(re.error, re.match, r"\x1z", "")
642 self.assertRaises(re.error, re.match, r"\u123", "")
643 self.assertRaises(re.error, re.match, r"\u123z", "")
644 self.assertRaises(re.error, re.match, r"\U0001234", "")
645 self.assertRaises(re.error, re.match, r"\U0001234z", "")
646 self.assertRaises(re.error, re.match, r"\U00110000", "")
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000647
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000648 def test_sre_character_class_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200649 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
650 if i < 256:
651 self.assertIsNotNone(re.match(r"[\%o]" % i, chr(i)))
652 self.assertIsNotNone(re.match(r"[\%o8]" % i, chr(i)))
653 self.assertIsNotNone(re.match(r"[\%03o]" % i, chr(i)))
654 self.assertIsNotNone(re.match(r"[\%03o0]" % i, chr(i)))
655 self.assertIsNotNone(re.match(r"[\%03o8]" % i, chr(i)))
656 self.assertIsNotNone(re.match(r"[\x%02x]" % i, chr(i)))
657 self.assertIsNotNone(re.match(r"[\x%02x0]" % i, chr(i)))
658 self.assertIsNotNone(re.match(r"[\x%02xz]" % i, chr(i)))
659 if i < 0x10000:
660 self.assertIsNotNone(re.match(r"[\u%04x]" % i, chr(i)))
661 self.assertIsNotNone(re.match(r"[\u%04x0]" % i, chr(i)))
662 self.assertIsNotNone(re.match(r"[\u%04xz]" % i, chr(i)))
663 self.assertIsNotNone(re.match(r"[\U%08x]" % i, chr(i)))
664 self.assertIsNotNone(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
665 self.assertIsNotNone(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
Ezio Melottieadece22013-02-23 08:40:07 +0200666 self.assertIsNotNone(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200667 self.assertRaises(re.error, re.match, r"[\911]", "")
668 self.assertRaises(re.error, re.match, r"[\x1z]", "")
669 self.assertRaises(re.error, re.match, r"[\u123z]", "")
670 self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
671 self.assertRaises(re.error, re.match, r"[\U00110000]", "")
672
673 def test_sre_byte_literals(self):
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000674 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Antoine Pitrou463badf2012-06-23 13:29:19 +0200675 self.assertIsNotNone(re.match((r"\%03o" % i).encode(), bytes([i])))
676 self.assertIsNotNone(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
677 self.assertIsNotNone(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
678 self.assertIsNotNone(re.match((r"\x%02x" % i).encode(), bytes([i])))
679 self.assertIsNotNone(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
680 self.assertIsNotNone(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
681 self.assertIsNotNone(re.match(br"\u", b'u'))
682 self.assertIsNotNone(re.match(br"\U", b'U'))
683 self.assertIsNotNone(re.match(br"\0", b"\000"))
684 self.assertIsNotNone(re.match(br"\08", b"\0008"))
685 self.assertIsNotNone(re.match(br"\01", b"\001"))
686 self.assertIsNotNone(re.match(br"\018", b"\0018"))
687 self.assertIsNotNone(re.match(br"\567", bytes([0o167])))
688 self.assertRaises(re.error, re.match, br"\911", b"")
689 self.assertRaises(re.error, re.match, br"\x1", b"")
690 self.assertRaises(re.error, re.match, br"\x1z", b"")
691
692 def test_sre_byte_class_literals(self):
693 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
694 self.assertIsNotNone(re.match((r"[\%o]" % i).encode(), bytes([i])))
695 self.assertIsNotNone(re.match((r"[\%o8]" % i).encode(), bytes([i])))
696 self.assertIsNotNone(re.match((r"[\%03o]" % i).encode(), bytes([i])))
697 self.assertIsNotNone(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
698 self.assertIsNotNone(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
699 self.assertIsNotNone(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
700 self.assertIsNotNone(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
701 self.assertIsNotNone(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
702 self.assertIsNotNone(re.match(br"[\u]", b'u'))
703 self.assertIsNotNone(re.match(br"[\U]", b'U'))
704 self.assertRaises(re.error, re.match, br"[\911]", "")
705 self.assertRaises(re.error, re.match, br"[\x1z]", "")
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000706
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000707 def test_bug_113254(self):
708 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
709 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
710 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
711
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000712 def test_bug_527371(self):
713 # bug described in patches 527371/672491
714 self.assertEqual(re.match(r'(a)?a','a').lastindex, None)
715 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
716 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
717 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
718 self.assertEqual(re.match("((a))", "a").lastindex, 1)
719
720 def test_bug_545855(self):
721 # bug 545855 -- This pattern failed to cause a compile error as it
722 # should, instead provoking a TypeError.
723 self.assertRaises(re.error, re.compile, 'foo[a-')
724
725 def test_bug_418626(self):
726 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
727 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
728 # pattern '*?' on a long string.
729 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
730 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
731 20003)
732 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000733 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +0000734 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000735 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000736
737 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000738 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000739 self.assertEqual(re.compile(pat) and 1, 1)
740
Skip Montanaro1e703c62003-04-25 15:40:28 +0000741 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000742 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +0000743 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000744 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
745 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
746 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +0000747
Serhiy Storchakafa468162013-02-16 21:23:53 +0200748 def test_unlimited_zero_width_repeat(self):
749 # Issue #9669
750 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
751 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
752 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
753 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
754 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
755 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
756
Skip Montanaro1e703c62003-04-25 15:40:28 +0000757 def test_scanner(self):
758 def s_ident(scanner, token): return token
759 def s_operator(scanner, token): return "op%s" % token
760 def s_float(scanner, token): return float(token)
761 def s_int(scanner, token): return int(token)
762
763 scanner = Scanner([
764 (r"[a-zA-Z_]\w*", s_ident),
765 (r"\d+\.\d*", s_float),
766 (r"\d+", s_int),
767 (r"=|\+|-|\*|/", s_operator),
768 (r"\s+", None),
769 ])
770
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000771 self.assertNotEqual(scanner.scanner.scanner("").pattern, None)
772
Skip Montanaro1e703c62003-04-25 15:40:28 +0000773 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
774 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
775 'op+', 'bar'], ''))
776
Skip Montanaro5ba00542003-04-25 16:00:14 +0000777 def test_bug_448951(self):
778 # bug 448951 (similar to 429357, but with single char match)
779 # (Also test greedy matches.)
780 for op in '','?','*':
781 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
782 (None, None))
783 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
784 ('a:', 'a'))
785
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000786 def test_bug_725106(self):
787 # capturing groups in alternatives in repeats
788 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
789 ('b', 'a'))
790 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
791 ('c', 'b'))
792 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
793 ('b', None))
794 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
795 ('b', None))
796 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
797 ('b', 'a'))
798 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
799 ('c', 'b'))
800 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
801 ('b', None))
802 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
803 ('b', None))
804
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000805 def test_bug_725149(self):
806 # mark_stack_base restoring before restoring marks
807 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
808 ('a', None))
809 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
810 ('a', None, None))
811
Just van Rossum12723ba2003-07-02 20:03:04 +0000812 def test_bug_764548(self):
813 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000814 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +0000815 pat = re.compile(my_unicode("abc"))
816 self.assertEqual(pat.match("xyz"), None)
817
Skip Montanaro5ba00542003-04-25 16:00:14 +0000818 def test_finditer(self):
819 iter = re.finditer(r":+", "a:b::c:::d")
820 self.assertEqual([item.group(0) for item in iter],
821 [":", "::", ":::"])
822
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600823 pat = re.compile(r":+")
824 iter = pat.finditer("a:b::c:::d", 1, 10)
825 self.assertEqual([item.group(0) for item in iter],
826 [":", "::", ":::"])
827
828 pat = re.compile(r":+")
829 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
830 self.assertEqual([item.group(0) for item in iter],
831 [":", "::", ":::"])
832
833 pat = re.compile(r":+")
834 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
835 self.assertEqual([item.group(0) for item in iter],
836 [":", "::", ":::"])
837
838 pat = re.compile(r":+")
839 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
840 self.assertEqual([item.group(0) for item in iter],
841 ["::", "::"])
842
Thomas Wouters40a088d2008-03-18 20:19:54 +0000843 def test_bug_926075(self):
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000844 self.assertTrue(re.compile('bug_926075') is not
Thomas Wouters40a088d2008-03-18 20:19:54 +0000845 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +0000846
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000847 def test_bug_931848(self):
Guido van Rossum7ebb9702007-05-15 21:39:58 +0000848 pattern = eval('"[\u002E\u3002\uFF0E\uFF61]"')
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000849 self.assertEqual(re.compile(pattern).split("a.b.c"),
850 ['a','b','c'])
851
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000852 def test_bug_581080(self):
853 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +0000854 self.assertEqual(next(iter).span(), (1,2))
855 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000856
857 scanner = re.compile(r"\s").scanner("a b")
858 self.assertEqual(scanner.search().span(), (1, 2))
859 self.assertEqual(scanner.search(), None)
860
861 def test_bug_817234(self):
862 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +0000863 self.assertEqual(next(iter).span(), (0, 4))
864 self.assertEqual(next(iter).span(), (4, 4))
865 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000866
Mark Dickinson1f268282009-07-28 17:22:36 +0000867 def test_bug_6561(self):
868 # '\d' should match characters in Unicode category 'Nd'
869 # (Number, Decimal Digit), but not those in 'Nl' (Number,
870 # Letter) or 'No' (Number, Other).
871 decimal_digits = [
872 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
873 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
874 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
875 ]
876 for x in decimal_digits:
877 self.assertEqual(re.match('^\d$', x).group(0), x)
878
879 not_decimal_digits = [
880 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
881 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
882 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
883 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
884 ]
885 for x in not_decimal_digits:
886 self.assertIsNone(re.match('^\d$', x))
887
Guido van Rossumd8faa362007-04-27 19:54:29 +0000888 def test_empty_array(self):
889 # SF buf 1647541
890 import array
Guido van Rossum166746c2007-07-03 15:39:16 +0000891 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +0000892 a = array.array(typecode)
Antoine Pitroufd036452008-08-19 17:56:33 +0000893 self.assertEqual(re.compile(b"bla").match(a), None)
894 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000895
Christian Heimes072c0f12008-01-03 23:01:04 +0000896 def test_inline_flags(self):
897 # Bug #1700
Christian Heimes2e1d0f02008-01-04 00:47:51 +0000898 upper_char = chr(0x1ea0) # Latin Capital Letter A with Dot Bellow
899 lower_char = chr(0x1ea1) # Latin Small Letter A with Dot Bellow
Christian Heimes072c0f12008-01-03 23:01:04 +0000900
901 p = re.compile(upper_char, re.I | re.U)
902 q = p.match(lower_char)
903 self.assertNotEqual(q, None)
904
905 p = re.compile(lower_char, re.I | re.U)
906 q = p.match(upper_char)
907 self.assertNotEqual(q, None)
908
909 p = re.compile('(?i)' + upper_char, re.U)
910 q = p.match(lower_char)
911 self.assertNotEqual(q, None)
912
913 p = re.compile('(?i)' + lower_char, re.U)
914 q = p.match(upper_char)
915 self.assertNotEqual(q, None)
916
917 p = re.compile('(?iu)' + upper_char)
918 q = p.match(lower_char)
919 self.assertNotEqual(q, None)
920
921 p = re.compile('(?iu)' + lower_char)
922 q = p.match(upper_char)
923 self.assertNotEqual(q, None)
924
Christian Heimes25bb7832008-01-11 16:17:00 +0000925 def test_dollar_matches_twice(self):
926 "$ matches the end of string, and just before the terminating \n"
927 pattern = re.compile('$')
928 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
929 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
930 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
931
932 pattern = re.compile('$', re.MULTILINE)
933 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
934 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
935 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
936
Antoine Pitroufd036452008-08-19 17:56:33 +0000937 def test_bytes_str_mixing(self):
938 # Mixing str and bytes is disallowed
939 pat = re.compile('.')
940 bpat = re.compile(b'.')
941 self.assertRaises(TypeError, pat.match, b'b')
942 self.assertRaises(TypeError, bpat.match, 'b')
943 self.assertRaises(TypeError, pat.sub, b'b', 'c')
944 self.assertRaises(TypeError, pat.sub, 'b', b'c')
945 self.assertRaises(TypeError, pat.sub, b'b', b'c')
946 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
947 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
948 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
949
950 def test_ascii_and_unicode_flag(self):
951 # String patterns
952 for flags in (0, re.UNICODE):
953 pat = re.compile('\xc0', flags | re.IGNORECASE)
954 self.assertNotEqual(pat.match('\xe0'), None)
955 pat = re.compile('\w', flags)
956 self.assertNotEqual(pat.match('\xe0'), None)
957 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
958 self.assertEqual(pat.match('\xe0'), None)
959 pat = re.compile('(?a)\xc0', re.IGNORECASE)
960 self.assertEqual(pat.match('\xe0'), None)
961 pat = re.compile('\w', re.ASCII)
962 self.assertEqual(pat.match('\xe0'), None)
963 pat = re.compile('(?a)\w')
964 self.assertEqual(pat.match('\xe0'), None)
965 # Bytes patterns
966 for flags in (0, re.ASCII):
967 pat = re.compile(b'\xc0', re.IGNORECASE)
968 self.assertEqual(pat.match(b'\xe0'), None)
969 pat = re.compile(b'\w')
970 self.assertEqual(pat.match(b'\xe0'), None)
971 # Incompatibilities
972 self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
973 self.assertRaises(ValueError, re.compile, b'(?u)\w')
974 self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
975 self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
976 self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
977 self.assertRaises(ValueError, re.compile, '(?au)\w')
978
Ezio Melottib92ed7c2010-03-06 15:24:08 +0000979 def test_bug_6509(self):
980 # Replacement strings of both types must parse properly.
981 # all strings
982 pat = re.compile('a(\w)')
983 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
984 pat = re.compile('a(.)')
985 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
986 pat = re.compile('..')
987 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
988
989 # all bytes
990 pat = re.compile(b'a(\w)')
991 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
992 pat = re.compile(b'a(.)')
993 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
994 pat = re.compile(b'..')
995 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
996
Antoine Pitrou82feb1f2010-01-14 17:34:48 +0000997 def test_dealloc(self):
998 # issue 3299: check for segfault in debug build
999 import _sre
Ezio Melotti0f77f462010-01-23 10:49:39 +00001000 # the overflow limit is different on wide and narrow builds and it
1001 # depends on the definition of SRE_CODE (see sre.h).
1002 # 2**128 should be big enough to overflow on both. For smaller values
1003 # a RuntimeError is raised instead of OverflowError.
1004 long_overflow = 2**128
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001005 self.assertRaises(TypeError, re.finditer, "a", {})
1006 self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
Victor Stinner5abeafb2010-03-04 21:59:53 +00001007 self.assertRaises(TypeError, _sre.compile, {}, 0, [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009 def test_search_dot_unicode(self):
1010 self.assertIsNotNone(re.search("123.*-", '123abc-'))
1011 self.assertIsNotNone(re.search("123.*-", '123\xe9-'))
1012 self.assertIsNotNone(re.search("123.*-", '123\u20ac-'))
1013 self.assertIsNotNone(re.search("123.*-", '123\U0010ffff-'))
1014 self.assertIsNotNone(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
1015
Ezio Melottidf723e12012-03-13 01:29:48 +02001016 def test_compile(self):
1017 # Test return value when given string and pattern as parameter
1018 pattern = re.compile('random pattern')
1019 self.assertIsInstance(pattern, re._pattern_type)
1020 same_pattern = re.compile(pattern)
1021 self.assertIsInstance(same_pattern, re._pattern_type)
1022 self.assertIs(same_pattern, pattern)
1023 # Test behaviour when not given a string or pattern as parameter
1024 self.assertRaises(TypeError, re.compile, 0)
1025
Ezio Melottife8e6e72013-01-11 08:32:01 +02001026 def test_bug_13899(self):
1027 # Issue #13899: re pattern r"[\A]" should work like "A" but matches
1028 # nothing. Ditto B and Z.
1029 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
1030 ['A', 'B', '\b', 'C', 'Z'])
1031
@bigmemtest(size=_2G, memuse=1)
def test_large_search(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    haystack = 'a' * size
    found = re.search('$', haystack)
    self.assertIsNotNone(found)
    # Both ends of the empty end-of-string match sit at offset `size`.
    for position in (found.start(), found.end()):
        self.assertEqual(position, size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001040
# re.sub() builds the replacement result from a list and a join(),
# which is why the memuse factor is so large.
@bigmemtest(size=_2G, memuse=16 + 2)
def test_large_subn(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    source = 'a' * size
    replaced, count = re.subn('', '', source)
    self.assertEqual(replaced, source)
    # One empty match before each character plus one at the very end.
    self.assertEqual(count, size + 1)
1050
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001051 def test_bug_16688(self):
1052 # Issue 16688: Backreferences make case-insensitive regex fail on
1053 # non-ASCII strings.
1054 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1055 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001056
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001057 def test_repeat_minmax_overflow(self):
1058 # Issue #13169
1059 string = "x" * 100000
1060 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1061 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1062 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1063 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1064 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1065 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1066 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1067 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1068 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1069 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1070 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1071
@cpython_only
def test_repeat_minmax_overflow_maxrepeat(self):
    # Repeat counts one below _sre.MAXREPEAT must still compile; counts
    # equal to MAXREPEAT must raise OverflowError.
    try:
        from _sre import MAXREPEAT
    except ImportError:
        self.skipTest('requires _sre.MAXREPEAT constant')
    subject = "x" * 100000
    largest = MAXREPEAT - 1
    self.assertIsNone(re.match(r".{%d}" % largest, subject))
    bounded = re.match(r".{,%d}" % largest, subject)
    self.assertEqual(bounded.span(), (0, 100000))
    self.assertIsNone(re.match(r".{%d,}?" % largest, subject))
    for template in (r".{%d}", r".{,%d}", r".{%d,}?"):
        self.assertRaises(OverflowError, re.compile, template % MAXREPEAT)
1086
R David Murray26dfaac92013-04-14 13:00:54 -04001087 def test_backref_group_name_in_exception(self):
1088 # Issue 17341: Poor error message when compiling invalid regex
1089 with self.assertRaisesRegex(sre_constants.error, '<foo>'):
1090 re.compile('(?P=<foo>)')
1091
1092 def test_group_name_in_exception(self):
1093 # Issue 17341: Poor error message when compiling invalid regex
1094 with self.assertRaisesRegex(sre_constants.error, '\?foo'):
1095 re.compile('(?P<?foo>)')
1096
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001097 def test_issue17998(self):
1098 for reps in '*', '+', '?', '{1}':
1099 for mod in '', '?':
1100 pattern = '.' + reps + mod + 'yz'
1101 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1102 ['xyz'], msg=pattern)
1103 pattern = pattern.encode()
1104 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1105 [b'xyz'], msg=pattern)
1106
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001107
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001108 def test_bug_2537(self):
1109 # issue 2537: empty submatches
1110 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1111 for inner_op in ('{0,}', '*', '?'):
1112 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1113 m = r.match("xyyzy")
1114 self.assertEqual(m.group(0), "xyy")
1115 self.assertEqual(m.group(1), "")
1116 self.assertEqual(m.group(2), "y")
1117
def run_re_tests():
    """Run the table of cases from test.re_tests and print every mismatch.

    Each case is a 3-tuple (pattern, string, outcome) or a 5-tuple
    (pattern, string, outcome, repl, expected).  Problems are reported
    with print(); nothing is returned.  A case of any other length
    raises ValueError.
    """
    # NOTE(review): the nesting below was reconstructed from a listing
    # whose indentation had been flattened -- confirm against the file.
    from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
    if verbose:
        print('Running re_tests test suite')
    else:
        # To save time, only run the first and last 10 tests
        #tests = tests[:10] + tests[-10:]
        pass

    for t in tests:
        sys.stdout.flush()
        pattern = s = outcome = repl = expected = None
        if len(t) == 5:
            pattern, s, outcome, repl, expected = t
        elif len(t) == 3:
            pattern, s, outcome = t
        else:
            raise ValueError('Test tuples should have 3 or 5 fields', t)

        try:
            obj = re.compile(pattern)
        except re.error:
            if outcome == SYNTAX_ERROR: pass  # Expected a syntax error
            else:
                print('=== Syntax error:', t)
        except KeyboardInterrupt: raise KeyboardInterrupt
        # NOTE(review): bare except -- anything other than re.error and
        # KeyboardInterrupt is reported here and the loop carries on.
        except:
            print('*** Unexpected error ***', t)
            if verbose:
                traceback.print_exc(file=sys.stdout)
        else:
            try:
                result = obj.search(s)
            except re.error as msg:
                # NOTE(review): `result` is not assigned on this path, so
                # the checks below see the previous case's value (or an
                # unbound name on the first case) -- confirm this is
                # acceptable.
                print('=== Unexpected exception', t, repr(msg))
            if outcome == SYNTAX_ERROR:
                # This should have been a syntax error; forget it.
                pass
            elif outcome == FAIL:
                if result is None: pass   # No match, as expected
                else: print('=== Succeeded incorrectly', t)
            elif outcome == SUCCEED:
                if result is not None:
                    # Matched, as expected, so now we compute the
                    # result string and compare it to our expected result.
                    # NOTE(review): start and end are not read again below.
                    start, end = result.span(0)
                    # Namespace in which the case's `repl` expression is
                    # evaluated.
                    vardict={'found': result.group(0),
                             'groups': result.group(),
                             'flags': result.re.flags}
                    # g1..g99: "None" for a group that did not take part,
                    # "Error" for a group index the pattern does not have.
                    for i in range(1, 100):
                        try:
                            gi = result.group(i)
                            # Special hack because else the string concat fails:
                            if gi is None:
                                gi = "None"
                        except IndexError:
                            gi = "Error"
                        vardict['g%d' % i] = gi
                    # Named groups are exposed under their own names.
                    for i in result.re.groupindex.keys():
                        try:
                            gi = result.group(i)
                            if gi is None:
                                gi = "None"
                        except IndexError:
                            gi = "Error"
                        vardict[i] = gi
                    # `repl` comes from the test.re_tests table, not from
                    # external input.
                    repl = eval(repl, vardict)
                    if repl != expected:
                        print('=== grouping error', t, end=' ')
                        print(repr(repl) + ' should be ' + repr(expected))
                else:
                    print('=== Failed incorrectly', t)

                # Try the match with both pattern and string converted to
                # bytes, and check that it still succeeds.
                try:
                    bpat = bytes(pattern, "ascii")
                    bs = bytes(s, "ascii")
                except UnicodeEncodeError:
                    # skip non-ascii tests
                    pass
                else:
                    try:
                        bpat = re.compile(bpat)
                    except Exception:
                        print('=== Fails on bytes pattern compile', t)
                        if verbose:
                            traceback.print_exc(file=sys.stdout)
                    else:
                        bytes_result = bpat.search(bs)
                        if bytes_result is None:
                            print('=== Fails on bytes pattern match', t)

                # Try the match with the search area limited to the extent
                # of the match and see if it still succeeds.  \B will
                # break (because it won't match at the end or start of a
                # string), so we'll ignore patterns that feature it.

                if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \
                               and result is not None:
                    obj = re.compile(pattern)
                    result = obj.search(s, result.start(0), result.end(0) + 1)
                    if result is None:
                        print('=== Failed on range-limited match', t)

                # Try the match with IGNORECASE enabled, and check that it
                # still succeeds.
                obj = re.compile(pattern, re.IGNORECASE)
                result = obj.search(s)
                if result is None:
                    print('=== Fails on case-insensitive match', t)

                # Try the match with LOCALE enabled, and check that it
                # still succeeds.
                # Patterns containing an inline (?u) are skipped here.
                if '(?u)' not in pattern:
                    obj = re.compile(pattern, re.LOCALE)
                    result = obj.search(s)
                    if result is None:
                        print('=== Fails on locale-sensitive match', t)

                # Try the match with UNICODE locale enabled, and check
                # that it still succeeds.
                obj = re.compile(pattern, re.UNICODE)
                result = obj.search(s)
                if result is None:
                    print('=== Fails on unicode-sensitive match', t)
Gregory P. Smith5a631832010-07-27 05:31:29 +00001245
def test_main():
    """Run the ReTests unittest class, then the run_re_tests() driver."""
    run_unittest(ReTests)
    run_re_tests()
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001249
# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()