blob: 42672f578ea3a89da7024dd7fe4ceebb52a5879a [file] [log] [blame]
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
Antoine Pitroud2cc7432014-02-03 20:59:59 +01002 cpython_only, captured_stdout
Benjamin Petersone48944b2012-03-07 14:50:25 -06003import io
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02004import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00005import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00006from re import Scanner
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02007import sre_compile
R David Murray26dfaac92013-04-14 13:00:54 -04008import sre_constants
Ezio Melottid2114eb2011-03-25 14:08:44 +02009import sys
10import string
11import traceback
Antoine Pitrou79aa68d2013-10-25 21:36:10 +020012import unittest
Raymond Hettinger027bb632004-05-31 03:09:25 +000013from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000014
# Misc tests from Tim Peters' re.doc
16
# WARNING: Don't change details in these tests if you don't know
# what you're doing. Some of these tests were carefully modeled to
# cover most of the code.
20
class S(str):
    # str subclass whose indexing/slicing results are wrapped in S again,
    # used by the tests below to feed str subclasses to the re functions.
    def __getitem__(self, index):
        return S(super().__getitem__(index))
24
class B(bytes):
    # bytes subclass whose indexing/slicing results are wrapped in B again,
    # used by the tests below to feed bytes subclasses to the re functions.
    def __getitem__(self, index):
        return B(super().__getitem__(index))
28
Skip Montanaro8ed06da2003-04-24 19:43:18 +000029class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000030
def assertTypedEqual(self, actual, expect, msg=None):
    # Like assertEqual(), but additionally require that every leaf of
    # (possibly nested) tuples/lists has exactly the same type in
    # `actual` as in `expect`, so a subclass instance is not accepted
    # where the base type is expected.
    self.assertEqual(actual, expect, msg)

    def recurse(actual, expect):
        if isinstance(expect, (tuple, list)):
            # Walk both containers in lockstep; equality was already
            # checked by assertEqual() above.
            for x, y in zip(actual, expect):
                recurse(x, y)
        else:
            self.assertIs(type(actual), type(expect), msg)

    recurse(actual, expect)
40
def test_keep_buffer(self):
    # See bug 14212
    # While the finditer() iterator over the bytearray is alive, resizing
    # the bytearray must raise BufferError; once the iterator has been
    # exhausted, deleted and collected, resizing must work again.
    b = bytearray(b'x')
    it = re.finditer(b'a', b)
    with self.assertRaises(BufferError):
        b.extend(b'x'*400)
    list(it)
    del it
    gc_collect()
    b.extend(b'x'*400)
51
Raymond Hettinger027bb632004-05-31 03:09:25 +000052 def test_weakref(self):
53 s = 'QabbbcR'
54 x = re.compile('ab+c')
55 y = proxy(x)
56 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
57
Skip Montanaro8ed06da2003-04-24 19:43:18 +000058 def test_search_star_plus(self):
59 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
60 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
61 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
62 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030063 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000064 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
65 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
66 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
67 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030068 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000069
def bump_num(self, matchobj):
    # Replacement callback for re.sub(): read the whole match as an
    # integer and return that value plus one, as a string.
    int_value = int(matchobj.group(0))
    return str(int_value + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000073
def test_basic_re_sub(self):
    # sub() on str/bytes inputs (including subclasses and buffer
    # objects), with callable replacements and template escapes.
    self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
    self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
    self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
    self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
    self.assertTypedEqual(
        re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
    self.assertTypedEqual(
        re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
    for y in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')

    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    self.assertEqual(
        re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3),
        '9.3 -3 23x99y')

    # A callable's return value is inserted literally; a template
    # string has its escapes processed.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    self.assertEqual(
        re.sub('a', r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D', 'a'),
        '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))

    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000110
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000111 def test_bug_449964(self):
112 # fails for group followed by other escape
113 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
114 'xx\bxx\b')
115
116 def test_bug_449000(self):
117 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000118 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
119 'abc\ndef\n')
120 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
121 'abc\ndef\n')
122 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
123 'abc\ndef\n')
124 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
125 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000126
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000127 def test_bug_1661(self):
128 # Verify that flags do not get silently ignored with compiled patterns
129 pattern = re.compile('.')
130 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
131 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
132 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
133 self.assertRaises(ValueError, re.compile, pattern, re.I)
134
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000135 def test_bug_3629(self):
136 # A regex that triggered a bug in the sre-code validator
137 re.compile("(?P<quote>)(?(quote))")
138
def test_sub_template_numeric_escape(self):
    # bug 776311 and friends
    # Numeric escapes in replacement templates: octal escapes versus
    # group references.
    self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
    self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
    self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
    self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
    self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
    self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
    self.assertEqual(re.sub('x', r'\117', 'x'), '\117')

    self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
    self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')

    self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
    self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
    self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
    self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
    self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')

    self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
    self.assertEqual(re.sub('x', r'\777', 'x'), '\377')

    # The pattern has no groups, so these group references are invalid.
    self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
    self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
    self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'

    # in python2.3 (etc), these loop endlessly in sre_parser.py
    self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
    self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
                     'xz8')
    self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
                     'xza')
180
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000181 def test_qualified_re_sub(self):
182 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
183 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000184
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000185 def test_bug_114660(self):
186 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
187 'hello there')
188
def test_bug_462270(self):
    # Test for empty sub() behaviour, see SF bug #462270
    self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
    self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
193
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200194 def test_symbolic_groups(self):
195 re.compile('(?P<a>x)(?P=a)(?(a)y)')
196 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
197 self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
198 self.assertRaises(re.error, re.compile, '(?Px)')
199 self.assertRaises(re.error, re.compile, '(?P=)')
200 self.assertRaises(re.error, re.compile, '(?P=1)')
201 self.assertRaises(re.error, re.compile, '(?P=a)')
202 self.assertRaises(re.error, re.compile, '(?P=a1)')
203 self.assertRaises(re.error, re.compile, '(?P=a.)')
204 self.assertRaises(re.error, re.compile, '(?P<)')
205 self.assertRaises(re.error, re.compile, '(?P<>)')
206 self.assertRaises(re.error, re.compile, '(?P<1>)')
207 self.assertRaises(re.error, re.compile, '(?P<a.>)')
208 self.assertRaises(re.error, re.compile, '(?())')
209 self.assertRaises(re.error, re.compile, '(?(a))')
210 self.assertRaises(re.error, re.compile, '(?(1a))')
211 self.assertRaises(re.error, re.compile, '(?(a.))')
Georg Brandl1d472b72013-04-14 11:40:00 +0200212 # New valid/invalid identifiers in Python 3
213 re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
214 re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
215 self.assertRaises(re.error, re.compile, '(?P<©>x)')
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200216
def test_symbolic_refs(self):
    # Malformed or unresolvable group references in a replacement
    # template must raise.  Templates are raw strings so that '\g' is
    # not an invalid string-literal escape.
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<a', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<a a>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<1a1>', 'xx')
    self.assertRaises(IndexError, re.sub, '(?P<a>x)', r'\g<ab>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', r'\2', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<-1>', 'xx')
    # New valid/invalid identifiers in Python 3
    self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
    self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000232
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000233 def test_re_subn(self):
234 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
235 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
236 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
237 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
238 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000239
def test_re_split(self):
    # split() on str/bytes inputs (including subclasses and buffer
    # objects) and on non-ASCII text, with and without capturing groups.
    # The loop variable is named `text` so it does not shadow the
    # imported `string` module.
    for text in ":a:b::c", S(":a:b::c"):
        self.assertTypedEqual(re.split(":", text),
                              ['', 'a', 'b', '', 'c'])
        self.assertTypedEqual(re.split(":*", text),
                              ['', 'a', 'b', 'c'])
        self.assertTypedEqual(re.split("(:*)", text),
                              ['', ':', 'a', ':', 'b', '::', 'c'])
    for text in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                 memoryview(b":a:b::c")):
        self.assertTypedEqual(re.split(b":", text),
                              [b'', b'a', b'b', b'', b'c'])
        self.assertTypedEqual(re.split(b":*", text),
                              [b'', b'a', b'b', b'c'])
        self.assertTypedEqual(re.split(b"(:*)", text),
                              [b'', b':', b'a', b':', b'b', b'::', b'c'])
    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                    "\U0001d49c\U0001d49e\U0001d4b5"):
        text = ":%s:%s::%s" % (a, b, c)
        self.assertEqual(re.split(":", text), ['', a, b, '', c])
        self.assertEqual(re.split(":*", text), ['', a, b, c])
        self.assertEqual(re.split("(:*)", text),
                         ['', ':', a, ':', b, '::', c])

    self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
    self.assertEqual(re.split("(:)*", ":a:b::c"),
                     ['', ':', 'a', ':', 'b', ':', 'c'])
    self.assertEqual(re.split("([b:]+)", ":a:b::c"),
                     ['', ':', 'a', ':b::', 'c'])
    self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
                     ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c'])
    self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
                     ['', 'a', '', '', 'c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000274
def test_qualified_re_split(self):
    # split() with a maximum number of splits.
    self.assertEqual(re.split(":", ":a:b::c", maxsplit=2),
                     ['', 'a', 'b::c'])
    self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2),
                     ['a', 'b', 'c:d'])
    self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
                     ['', ':', 'a', ':', 'b::c'])
    self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
                     ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000282
def test_re_findall(self):
    # findall() on str/bytes inputs (including subclasses and buffer
    # objects) and on non-ASCII text, with zero, one and two groups.
    # The loop variable is named `text` so it does not shadow the
    # imported `string` module.
    self.assertEqual(re.findall(":+", "abc"), [])
    for text in "a:b::c:::d", S("a:b::c:::d"):
        self.assertTypedEqual(re.findall(":+", text),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:+)", text),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:)(:*)", text),
                              [(":", ""), (":", ":"), (":", "::")])
    for text in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
                 memoryview(b"a:b::c:::d")):
        self.assertTypedEqual(re.findall(b":+", text),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:+)", text),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:)(:*)", text),
                              [(b":", b""), (b":", b":"), (b":", b"::")])
    for x in ("\xe0", "\u0430", "\U0001d49c"):
        xx = x * 2
        xxx = x * 3
        text = "a%sb%sc%sd" % (x, xx, xxx)
        self.assertEqual(re.findall("%s+" % x, text), [x, xx, xxx])
        self.assertEqual(re.findall("(%s+)" % x, text), [x, xx, xxx])
        self.assertEqual(re.findall("(%s)(%s*)" % (x, x), text),
                         [(x, ""), (x, x), (x, xx)])
Guido van Rossum49946571997-07-18 04:26:25 +0000308
Skip Montanaro5ba00542003-04-25 16:00:14 +0000309 def test_bug_117612(self):
310 self.assertEqual(re.findall(r"(a|(b))", "aba"),
311 [("a", ""),("b", "b"),("a", "")])
312
def test_re_match(self):
    # match() plus group()/groups() on str/bytes inputs (including
    # subclasses and buffer objects) and on non-ASCII text.
    # The loop variable is named `text` so it does not shadow the
    # imported `string` module.
    for text in 'a', S('a'):
        self.assertEqual(re.match('a', text).groups(), ())
        self.assertEqual(re.match('(a)', text).groups(), ('a',))
        self.assertEqual(re.match('(a)', text).group(0), 'a')
        self.assertEqual(re.match('(a)', text).group(1), 'a')
        self.assertEqual(re.match('(a)', text).group(1, 1), ('a', 'a'))
    for text in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
        self.assertEqual(re.match(b'a', text).groups(), ())
        self.assertEqual(re.match(b'(a)', text).groups(), (b'a',))
        self.assertEqual(re.match(b'(a)', text).group(0), b'a')
        self.assertEqual(re.match(b'(a)', text).group(1), b'a')
        self.assertEqual(re.match(b'(a)', text).group(1, 1), (b'a', b'a'))
    for a in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.match(a, a).groups(), ())
        self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
        self.assertEqual(re.match('(%s)' % a, a).group(0), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))

    pat = re.compile('((a)|(b))(c)?')
    self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
    self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
    self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
    self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
    self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

    # A single group
    m = re.match('(a)', 'a')
    # group() with no argument is the whole match, same as group(0).
    self.assertEqual(m.group(), 'a')
    self.assertEqual(m.group(0), 'a')
    self.assertEqual(m.group(1), 'a')
    self.assertEqual(m.group(1, 1), ('a', 'a'))

    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                     (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000352
def test_re_fullmatch(self):
    # Issue 16203: Proposal: add re.fullmatch() method.
    # The loop variable is named `text` so it does not shadow the
    # imported `string` module.
    self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
    for text in "ab", S("ab"):
        self.assertEqual(re.fullmatch(r"a|ab", text).span(), (0, 2))
    for text in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
        self.assertEqual(re.fullmatch(br"a|ab", text).span(), (0, 2))
    # Each two-character string is unpacked into its two characters.
    for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
        r = r"%s|%s" % (a, a + b)
        self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2))
    self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3))
    self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3))
    self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2))
    self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3))
    self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
    self.assertIsNone(re.fullmatch(r"a+", "ab"))
    self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
    self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
    self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
    self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
    self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4))
    self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2))

    # pos/endpos restrict the region that has to be matched in full.
    self.assertEqual(
        re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
    self.assertEqual(
        re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
    self.assertEqual(
        re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
382
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000383 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000384 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
385 ('(', 'a'))
386 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
387 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300388 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
389 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000390 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
391 ('a', 'b'))
392 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
393 (None, 'd'))
394 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
395 (None, 'd'))
396 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
397 ('a', ''))
398
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000399 # Tests for bug #1177831: exercise groups other than the first group
400 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
401 self.assertEqual(p.match('abc').groups(),
402 ('a', 'b', 'c'))
403 self.assertEqual(p.match('ad').groups(),
404 ('a', None, 'd'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300405 self.assertIsNone(p.match('abd'))
406 self.assertIsNone(p.match('ac'))
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000407
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000408
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000409 def test_re_groupref(self):
410 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
411 ('|', 'a'))
412 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
413 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300414 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
415 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000416 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
417 ('a', 'a'))
418 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
419 (None, None))
420
421 def test_groupdict(self):
422 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
423 'first second').groupdict(),
424 {'first':'first', 'second':'second'})
425
426 def test_expand(self):
427 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
428 "first second")
429 .expand(r"\2 \1 \g<second> \g<first>"),
430 "second first second first")
431
432 def test_repeat_minmax(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300433 self.assertIsNone(re.match("^(\w){1}$", "abc"))
434 self.assertIsNone(re.match("^(\w){1}?$", "abc"))
435 self.assertIsNone(re.match("^(\w){1,2}$", "abc"))
436 self.assertIsNone(re.match("^(\w){1,2}?$", "abc"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000437
438 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
439 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
440 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
441 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
442 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
443 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
444 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
445 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
446
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300447 self.assertIsNone(re.match("^x{1}$", "xxx"))
448 self.assertIsNone(re.match("^x{1}?$", "xxx"))
449 self.assertIsNone(re.match("^x{1,2}$", "xxx"))
450 self.assertIsNone(re.match("^x{1,2}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000451
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300452 self.assertTrue(re.match("^x{3}$", "xxx"))
453 self.assertTrue(re.match("^x{1,3}$", "xxx"))
454 self.assertTrue(re.match("^x{1,4}$", "xxx"))
455 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
456 self.assertTrue(re.match("^x{3}?$", "xxx"))
457 self.assertTrue(re.match("^x{1,3}?$", "xxx"))
458 self.assertTrue(re.match("^x{1,4}?$", "xxx"))
459 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000460
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300461 self.assertIsNone(re.match("^x{}$", "xxx"))
462 self.assertTrue(re.match("^x{}$", "x{}"))
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000463
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000464 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000465 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000466 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000467 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
468 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
469 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
470 {'first': 1, 'other': 2})
471
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000472 self.assertEqual(re.match("(a)", "a").pos, 0)
473 self.assertEqual(re.match("(a)", "a").endpos, 1)
474 self.assertEqual(re.match("(a)", "a").string, "a")
475 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300476 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000477
def test_special_escapes(self):
    # \b, \B, \A, \Z and the \d\D\w\W\s\S classes, for str and bytes
    # patterns, with default flags and with re.ASCII / re.LOCALE.
    self.assertEqual(re.search(r"\b(b.)\b",
                               "abcd abc bcd bx").group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B",
                               "abc bcd bc abxd").group(1), "bx")
    self.assertEqual(re.search(r"\b(b.)\b",
                               "abcd abc bcd bx", re.ASCII).group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B",
                               "abc bcd bc abxd", re.ASCII).group(1), "bx")
    self.assertEqual(re.search(r"\b(b.)\b",
                               "abcd abc bcd bx", re.LOCALE).group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B",
                               "abc bcd bc abxd", re.LOCALE).group(1), "bx")
    self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
    self.assertEqual(re.search(br"\b(b.)\b",
                               b"abcd abc bcd bx").group(1), b"bx")
    self.assertEqual(re.search(br"\B(b.)\B",
                               b"abc bcd bc abxd").group(1), b"bx")
    self.assertEqual(re.search(br"\b(b.)\b",
                               b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
    self.assertEqual(re.search(br"\B(b.)\B",
                               b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
    self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
    self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
    self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
    self.assertEqual(re.search(r"\d\D\w\W\s\S",
                               "1aa! a").group(0), "1aa! a")
    self.assertEqual(re.search(br"\d\D\w\W\s\S",
                               b"1aa! a").group(0), b"1aa! a")
    self.assertEqual(re.search(r"\d\D\w\W\s\S",
                               "1aa! a", re.ASCII).group(0), "1aa! a")
    self.assertEqual(re.search(r"\d\D\w\W\s\S",
                               "1aa! a", re.LOCALE).group(0), "1aa! a")
    self.assertEqual(re.search(br"\d\D\w\W\s\S",
                               b"1aa! a", re.LOCALE).group(0), b"1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000515
Ezio Melotti5a045b92012-02-29 11:48:44 +0200516 def test_string_boundaries(self):
517 # See http://bugs.python.org/issue10713
518 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
519 "abc")
520 # There's a word boundary at the start of a string.
521 self.assertTrue(re.match(r"\b", "abc"))
522 # A non-empty string includes a non-boundary zero-length match.
523 self.assertTrue(re.search(r"\B", "abc"))
524 # There is no non-boundary match at the start of a string.
525 self.assertFalse(re.match(r"\B", "abc"))
526 # However, an empty string contains no word boundaries, and also no
527 # non-boundaries.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300528 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200529 # This one is questionable and different from the perlre behaviour,
530 # but describes current behavior.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300531 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200532 # A single word-character string has two boundaries, but no
533 # non-boundary gaps.
534 self.assertEqual(len(re.findall(r"\b", "a")), 2)
535 self.assertEqual(len(re.findall(r"\B", "a")), 0)
536 # If there are no words, there are no boundaries
537 self.assertEqual(len(re.findall(r"\b", " ")), 0)
538 self.assertEqual(len(re.findall(r"\b", " ")), 0)
539 # Can match around the whitespace.
540 self.assertEqual(len(re.findall(r"\B", " ")), 2)
541
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000542 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000543 self.assertEqual(re.match("([\u2222\u2223])",
544 "\u2222").group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300545 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300546 self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000547
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100548 def test_big_codesize(self):
549 # Issue #1160
550 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300551 self.assertTrue(r.match('1000'))
552 self.assertTrue(r.match('9999'))
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100553
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000554 def test_anyall(self):
555 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
556 "a\nb")
557 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
558 "a\n\nb")
559
Serhiy Storchaka84df7fe2014-11-07 21:43:57 +0200560 def test_lookahead(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000561 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
562 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
563 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
564 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
565 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
566 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
567 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
568
569 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
570 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
571 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
572 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
573
Serhiy Storchaka84df7fe2014-11-07 21:43:57 +0200574 # Group reference.
575 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
576 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
577 # Conditional group reference.
578 self.assertTrue(re.match('(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
579 self.assertIsNone(re.match('(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
580 self.assertTrue(re.match('(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
581 self.assertIsNone(re.match('(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
582 self.assertTrue(re.match('(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
583 # Group used before defined.
584 self.assertTrue(re.match('(a)b(?=(?(2)x|c))(c)', 'abc'))
585 self.assertIsNone(re.match('(a)b(?=(?(2)b|x))(c)', 'abc'))
586 self.assertTrue(re.match('(a)b(?=(?(1)c|x))(c)', 'abc'))
587
588 def test_lookbehind(self):
589 self.assertTrue(re.match('ab(?<=b)c', 'abc'))
590 self.assertIsNone(re.match('ab(?<=c)c', 'abc'))
591 self.assertIsNone(re.match('ab(?<!b)c', 'abc'))
592 self.assertTrue(re.match('ab(?<!c)c', 'abc'))
593 # Group reference.
594 self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
595 self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
596 self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
597 self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
598 # Conditional group reference.
599 self.assertIsNone(re.match('(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
600 self.assertIsNone(re.match('(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
601 self.assertTrue(re.match('(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
602 self.assertIsNone(re.match('(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
603 self.assertTrue(re.match('(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
604 # Group used before defined.
605 self.assertIsNone(re.match('(a)b(?<=(?(2)x|c))(c)', 'abc'))
606 self.assertIsNone(re.match('(a)b(?<=(?(2)b|x))(c)', 'abc'))
607 self.assertIsNone(re.match('(a)b(?<=(?(1)c|x))(c)', 'abc'))
608 self.assertTrue(re.match('(a)b(?<=(?(1)b|x))(c)', 'abc'))
609
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000610 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000611 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300612 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000613 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
614 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
615 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
616 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
617 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
618 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
619 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
620 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
621
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200622 assert '\u212a'.lower() == 'k' # 'K'
623 self.assertTrue(re.match(r'K', '\u212a', re.I))
624 self.assertTrue(re.match(r'k', '\u212a', re.I))
625 self.assertTrue(re.match(r'\u212a', 'K', re.I))
626 self.assertTrue(re.match(r'\u212a', 'k', re.I))
627 assert '\u017f'.upper() == 'S' # 'ſ'
628 self.assertTrue(re.match(r'S', '\u017f', re.I))
629 self.assertTrue(re.match(r's', '\u017f', re.I))
630 self.assertTrue(re.match(r'\u017f', 'S', re.I))
631 self.assertTrue(re.match(r'\u017f', 's', re.I))
632 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
633 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
634 self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
635
636 def test_ignore_case_set(self):
637 self.assertTrue(re.match(r'[19A]', 'A', re.I))
638 self.assertTrue(re.match(r'[19a]', 'a', re.I))
639 self.assertTrue(re.match(r'[19a]', 'A', re.I))
640 self.assertTrue(re.match(r'[19A]', 'a', re.I))
641 self.assertTrue(re.match(br'[19A]', b'A', re.I))
642 self.assertTrue(re.match(br'[19a]', b'a', re.I))
643 self.assertTrue(re.match(br'[19a]', b'A', re.I))
644 self.assertTrue(re.match(br'[19A]', b'a', re.I))
645 assert '\u212a'.lower() == 'k' # 'K'
646 self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
647 self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
648 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
649 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
650 assert '\u017f'.upper() == 'S' # 'ſ'
651 self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
652 self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
653 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
654 self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
655 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
656 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
657 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
658
Serhiy Storchakab1847e72014-10-31 12:37:50 +0200659 def test_ignore_case_range(self):
660 # Issues #3511, #17381.
661 self.assertTrue(re.match(r'[9-a]', '_', re.I))
662 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
663 self.assertTrue(re.match(br'[9-a]', b'_', re.I))
664 self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
665 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
666 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
667 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
668 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
669 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
670 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
671 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
672 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
673 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
674 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
675 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
676 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
677
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200678 assert '\u212a'.lower() == 'k' # 'K'
679 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
680 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
681 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
682 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
683 assert '\u017f'.upper() == 'S' # 'ſ'
684 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
685 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
686 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
687 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
688 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
689 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
690 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
691
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000692 def test_category(self):
693 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
694
695 def test_getlower(self):
696 import _sre
697 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
698 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
699 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
700
701 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300702 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000703
704 def test_not_literal(self):
705 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
706 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
707
708 def test_search_coverage(self):
709 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
710 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
711
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    """Assert that *matcher* finds *pattern* in *text*.

    With neither *match* nor *span* given, the pattern is expected to match
    the whole of *text*.  Otherwise both must be given: *match* is the
    expected matched string and *span* the expected (start, end) pair.
    Supplying only one of them raises ValueError.
    """
    if (match is None) != (span is None):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if match is None:
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    found = matcher(pattern, text)
    self.assertTrue(found)
    self.assertEqual(found.group(), match)
    self.assertEqual(found.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000725
def test_re_escape(self):
    # Characters that re.escape() is expected to leave untouched.
    unescaped = string.ascii_letters + string.digits + '_'
    latin1 = ''.join(map(chr, range(256)))
    for char in latin1:
        if char in unescaped:
            expected = char
        elif char == '\x00':
            # NUL is expected to be spelled as an octal escape.
            expected = '\\000'
        else:
            expected = '\\' + char
        self.assertEqual(re.escape(char), expected)
        # The escaped form must match the original character literally.
        self.assertMatch(re.escape(char), char)
    self.assertMatch(re.escape(latin1), latin1)
Guido van Rossum49946571997-07-18 04:26:25 +0000738
def test_re_escape_byte(self):
    # Bytes that re.escape() is expected to leave untouched.
    unescaped = (string.ascii_letters + string.digits + '_').encode('ascii')
    all_bytes = bytes(range(256))
    for code in all_bytes:
        single = bytes([code])
        if single in unescaped:
            expected = single
        elif code == 0:
            # NUL is expected to be spelled as an octal escape.
            expected = b'\\000'
        else:
            expected = b'\\' + single
        self.assertEqual(re.escape(single), expected)
        # The escaped form must match the original byte literally.
        self.assertMatch(re.escape(single), single)
    self.assertMatch(re.escape(all_bytes), all_bytes)
Guido van Rossum698280d2008-09-10 17:44:35 +0000752
def test_re_escape_non_ascii(self):
    subject = 'xxx\u2620\u2620\u2620xxx'
    escaped = re.escape(subject)
    self.assertEqual(escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
    self.assertMatch(escaped, subject)
    # An escaped non-ASCII character can still take a quantifier.
    repeated = '.%s+.' % re.escape('\u2620')
    self.assertMatch(repeated, subject,
                     match='x\u2620\u2620\u2620x', span=(2, 7),
                     matcher=re.search)
760
def test_re_escape_non_ascii_bytes(self):
    subject = 'y\u2620y\u2620y'.encode('utf-8')
    escaped = re.escape(subject)
    # Every byte of the UTF-8 encoded character is escaped individually.
    self.assertEqual(escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
    self.assertMatch(escaped, subject)
    needle = re.escape('\u2620'.encode('utf-8'))
    occurrences = re.findall(needle, subject)
    self.assertEqual(len(occurrences), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000768
Serhiy Storchakab85a9762014-09-15 11:33:19 +0300769 def test_pickling(self):
770 import pickle
771 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
772 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
773 pickled = pickle.dumps(oldpat, proto)
774 newpat = pickle.loads(pickled)
775 self.assertEqual(newpat, oldpat)
776 # current pickle expects the _compile() reconstructor in re module
777 from re import _compile
Guido van Rossum23b22571997-07-17 22:36:14 +0000778
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000779 def test_constants(self):
780 self.assertEqual(re.I, re.IGNORECASE)
781 self.assertEqual(re.L, re.LOCALE)
782 self.assertEqual(re.M, re.MULTILINE)
783 self.assertEqual(re.S, re.DOTALL)
784 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000785
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000786 def test_flags(self):
Skip Montanaro1e703c62003-04-25 15:40:28 +0000787 for flag in [re.I, re.M, re.X, re.S, re.L]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300788 self.assertTrue(re.compile('^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +0000789
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000790 def test_sre_character_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200791 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
792 if i < 256:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300793 self.assertTrue(re.match(r"\%03o" % i, chr(i)))
794 self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
795 self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
796 self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
797 self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
798 self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200799 if i < 0x10000:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300800 self.assertTrue(re.match(r"\u%04x" % i, chr(i)))
801 self.assertTrue(re.match(r"\u%04x0" % i, chr(i)+"0"))
802 self.assertTrue(re.match(r"\u%04xz" % i, chr(i)+"z"))
803 self.assertTrue(re.match(r"\U%08x" % i, chr(i)))
804 self.assertTrue(re.match(r"\U%08x0" % i, chr(i)+"0"))
805 self.assertTrue(re.match(r"\U%08xz" % i, chr(i)+"z"))
806 self.assertTrue(re.match(r"\0", "\000"))
807 self.assertTrue(re.match(r"\08", "\0008"))
808 self.assertTrue(re.match(r"\01", "\001"))
809 self.assertTrue(re.match(r"\018", "\0018"))
810 self.assertTrue(re.match(r"\567", chr(0o167)))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200811 self.assertRaises(re.error, re.match, r"\911", "")
812 self.assertRaises(re.error, re.match, r"\x1", "")
813 self.assertRaises(re.error, re.match, r"\x1z", "")
814 self.assertRaises(re.error, re.match, r"\u123", "")
815 self.assertRaises(re.error, re.match, r"\u123z", "")
816 self.assertRaises(re.error, re.match, r"\U0001234", "")
817 self.assertRaises(re.error, re.match, r"\U0001234z", "")
818 self.assertRaises(re.error, re.match, r"\U00110000", "")
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000819
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000820 def test_sre_character_class_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200821 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
822 if i < 256:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300823 self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
824 self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
825 self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
826 self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
827 self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
828 self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
829 self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
830 self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200831 if i < 0x10000:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300832 self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
833 self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
834 self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
835 self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
836 self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
837 self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
838 self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200839 self.assertRaises(re.error, re.match, r"[\911]", "")
840 self.assertRaises(re.error, re.match, r"[\x1z]", "")
841 self.assertRaises(re.error, re.match, r"[\u123z]", "")
842 self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
843 self.assertRaises(re.error, re.match, r"[\U00110000]", "")
844
845 def test_sre_byte_literals(self):
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000846 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300847 self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
848 self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
849 self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
850 self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
851 self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
852 self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
853 self.assertTrue(re.match(br"\u", b'u'))
854 self.assertTrue(re.match(br"\U", b'U'))
855 self.assertTrue(re.match(br"\0", b"\000"))
856 self.assertTrue(re.match(br"\08", b"\0008"))
857 self.assertTrue(re.match(br"\01", b"\001"))
858 self.assertTrue(re.match(br"\018", b"\0018"))
859 self.assertTrue(re.match(br"\567", bytes([0o167])))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200860 self.assertRaises(re.error, re.match, br"\911", b"")
861 self.assertRaises(re.error, re.match, br"\x1", b"")
862 self.assertRaises(re.error, re.match, br"\x1z", b"")
863
864 def test_sre_byte_class_literals(self):
865 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300866 self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i])))
867 self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i])))
868 self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i])))
869 self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
870 self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
871 self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
872 self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
873 self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
874 self.assertTrue(re.match(br"[\u]", b'u'))
875 self.assertTrue(re.match(br"[\U]", b'U'))
Serhiy Storchakacd9032d2014-09-23 23:04:21 +0300876 self.assertRaises(re.error, re.match, br"[\911]", b"")
877 self.assertRaises(re.error, re.match, br"[\x1z]", b"")
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000878
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000879 def test_bug_113254(self):
880 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
881 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
882 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
883
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000884 def test_bug_527371(self):
885 # bug described in patches 527371/672491
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300886 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000887 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
888 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
889 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
890 self.assertEqual(re.match("((a))", "a").lastindex, 1)
891
892 def test_bug_545855(self):
893 # bug 545855 -- This pattern failed to cause a compile error as it
894 # should, instead provoking a TypeError.
895 self.assertRaises(re.error, re.compile, 'foo[a-')
896
897 def test_bug_418626(self):
898 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
899 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
900 # pattern '*?' on a long string.
901 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
902 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
903 20003)
904 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000905 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +0000906 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000907 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000908
909 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000910 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000911 self.assertEqual(re.compile(pat) and 1, 1)
912
Skip Montanaro1e703c62003-04-25 15:40:28 +0000913 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000914 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +0000915 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000916 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
917 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
918 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +0000919
Serhiy Storchakafa468162013-02-16 21:23:53 +0200920 def test_unlimited_zero_width_repeat(self):
921 # Issue #9669
922 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
923 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
924 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
925 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
926 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
927 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
928
Skip Montanaro1e703c62003-04-25 15:40:28 +0000929 def test_scanner(self):
930 def s_ident(scanner, token): return token
931 def s_operator(scanner, token): return "op%s" % token
932 def s_float(scanner, token): return float(token)
933 def s_int(scanner, token): return int(token)
934
935 scanner = Scanner([
936 (r"[a-zA-Z_]\w*", s_ident),
937 (r"\d+\.\d*", s_float),
938 (r"\d+", s_int),
939 (r"=|\+|-|\*|/", s_operator),
940 (r"\s+", None),
941 ])
942
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300943 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000944
Skip Montanaro1e703c62003-04-25 15:40:28 +0000945 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
946 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
947 'op+', 'bar'], ''))
948
Skip Montanaro5ba00542003-04-25 16:00:14 +0000949 def test_bug_448951(self):
950 # bug 448951 (similar to 429357, but with single char match)
951 # (Also test greedy matches.)
952 for op in '','?','*':
953 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
954 (None, None))
955 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
956 ('a:', 'a'))
957
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000958 def test_bug_725106(self):
959 # capturing groups in alternatives in repeats
960 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
961 ('b', 'a'))
962 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
963 ('c', 'b'))
964 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
965 ('b', None))
966 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
967 ('b', None))
968 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
969 ('b', 'a'))
970 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
971 ('c', 'b'))
972 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
973 ('b', None))
974 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
975 ('b', None))
976
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000977 def test_bug_725149(self):
978 # mark_stack_base restoring before restoring marks
979 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
980 ('a', None))
981 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
982 ('a', None, None))
983
Just van Rossum12723ba2003-07-02 20:03:04 +0000984 def test_bug_764548(self):
985 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000986 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +0000987 pat = re.compile(my_unicode("abc"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300988 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +0000989
Skip Montanaro5ba00542003-04-25 16:00:14 +0000990 def test_finditer(self):
991 iter = re.finditer(r":+", "a:b::c:::d")
992 self.assertEqual([item.group(0) for item in iter],
993 [":", "::", ":::"])
994
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600995 pat = re.compile(r":+")
996 iter = pat.finditer("a:b::c:::d", 1, 10)
997 self.assertEqual([item.group(0) for item in iter],
998 [":", "::", ":::"])
999
1000 pat = re.compile(r":+")
1001 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
1002 self.assertEqual([item.group(0) for item in iter],
1003 [":", "::", ":::"])
1004
1005 pat = re.compile(r":+")
1006 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1007 self.assertEqual([item.group(0) for item in iter],
1008 [":", "::", ":::"])
1009
1010 pat = re.compile(r":+")
1011 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1012 self.assertEqual([item.group(0) for item in iter],
1013 ["::", "::"])
1014
Thomas Wouters40a088d2008-03-18 20:19:54 +00001015 def test_bug_926075(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001016 self.assertIsNot(re.compile('bug_926075'),
1017 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +00001018
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001019 def test_bug_931848(self):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001020 pattern = "[\u002E\u3002\uFF0E\uFF61]"
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001021 self.assertEqual(re.compile(pattern).split("a.b.c"),
1022 ['a','b','c'])
1023
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001024 def test_bug_581080(self):
1025 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +00001026 self.assertEqual(next(iter).span(), (1,2))
1027 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001028
1029 scanner = re.compile(r"\s").scanner("a b")
1030 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001031 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001032
1033 def test_bug_817234(self):
1034 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +00001035 self.assertEqual(next(iter).span(), (0, 4))
1036 self.assertEqual(next(iter).span(), (4, 4))
1037 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001038
Mark Dickinson1f268282009-07-28 17:22:36 +00001039 def test_bug_6561(self):
1040 # '\d' should match characters in Unicode category 'Nd'
1041 # (Number, Decimal Digit), but not those in 'Nl' (Number,
1042 # Letter) or 'No' (Number, Other).
1043 decimal_digits = [
1044 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1045 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1046 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1047 ]
1048 for x in decimal_digits:
1049 self.assertEqual(re.match('^\d$', x).group(0), x)
1050
1051 not_decimal_digits = [
1052 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1053 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1054 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1055 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1056 ]
1057 for x in not_decimal_digits:
1058 self.assertIsNone(re.match('^\d$', x))
1059
Guido van Rossumd8faa362007-04-27 19:54:29 +00001060 def test_empty_array(self):
1061 # SF buf 1647541
1062 import array
Guido van Rossum166746c2007-07-03 15:39:16 +00001063 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +00001064 a = array.array(typecode)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001065 self.assertIsNone(re.compile(b"bla").match(a))
Antoine Pitroufd036452008-08-19 17:56:33 +00001066 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001067
Christian Heimes072c0f12008-01-03 23:01:04 +00001068 def test_inline_flags(self):
1069 # Bug #1700
Christian Heimes2e1d0f02008-01-04 00:47:51 +00001070 upper_char = chr(0x1ea0) # Latin Capital Letter A with Dot Bellow
1071 lower_char = chr(0x1ea1) # Latin Small Letter A with Dot Bellow
Christian Heimes072c0f12008-01-03 23:01:04 +00001072
1073 p = re.compile(upper_char, re.I | re.U)
1074 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001075 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001076
1077 p = re.compile(lower_char, re.I | re.U)
1078 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001079 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001080
1081 p = re.compile('(?i)' + upper_char, re.U)
1082 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001083 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001084
1085 p = re.compile('(?i)' + lower_char, re.U)
1086 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001087 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001088
1089 p = re.compile('(?iu)' + upper_char)
1090 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001091 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001092
1093 p = re.compile('(?iu)' + lower_char)
1094 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001095 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001096
Christian Heimes25bb7832008-01-11 16:17:00 +00001097 def test_dollar_matches_twice(self):
1098 "$ matches the end of string, and just before the terminating \n"
1099 pattern = re.compile('$')
1100 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1101 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1102 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1103
1104 pattern = re.compile('$', re.MULTILINE)
1105 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1106 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1107 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1108
Antoine Pitroufd036452008-08-19 17:56:33 +00001109 def test_bytes_str_mixing(self):
1110 # Mixing str and bytes is disallowed
1111 pat = re.compile('.')
1112 bpat = re.compile(b'.')
1113 self.assertRaises(TypeError, pat.match, b'b')
1114 self.assertRaises(TypeError, bpat.match, 'b')
1115 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1116 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1117 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1118 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1119 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1120 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1121
1122 def test_ascii_and_unicode_flag(self):
1123 # String patterns
1124 for flags in (0, re.UNICODE):
1125 pat = re.compile('\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001126 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001127 pat = re.compile('\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001128 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001129 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001130 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001131 pat = re.compile('(?a)\xc0', re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001132 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001133 pat = re.compile('\w', re.ASCII)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001134 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001135 pat = re.compile('(?a)\w')
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001136 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001137 # Bytes patterns
1138 for flags in (0, re.ASCII):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001139 pat = re.compile(b'\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001140 self.assertIsNone(pat.match(b'\xe0'))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001141 pat = re.compile(b'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001142 self.assertIsNone(pat.match(b'\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001143 # Incompatibilities
1144 self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
1145 self.assertRaises(ValueError, re.compile, b'(?u)\w')
1146 self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
1147 self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
1148 self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
1149 self.assertRaises(ValueError, re.compile, '(?au)\w')
1150
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001151 def test_bug_6509(self):
1152 # Replacement strings of both types must parse properly.
1153 # all strings
1154 pat = re.compile('a(\w)')
1155 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1156 pat = re.compile('a(.)')
1157 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1158 pat = re.compile('..')
1159 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1160
1161 # all bytes
1162 pat = re.compile(b'a(\w)')
1163 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1164 pat = re.compile(b'a(.)')
1165 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1166 pat = re.compile(b'..')
1167 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1168
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001169 def test_dealloc(self):
1170 # issue 3299: check for segfault in debug build
1171 import _sre
Ezio Melotti0f77f462010-01-23 10:49:39 +00001172 # the overflow limit is different on wide and narrow builds and it
1173 # depends on the definition of SRE_CODE (see sre.h).
1174 # 2**128 should be big enough to overflow on both. For smaller values
1175 # a RuntimeError is raised instead of OverflowError.
1176 long_overflow = 2**128
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001177 self.assertRaises(TypeError, re.finditer, "a", {})
1178 self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
Victor Stinner5abeafb2010-03-04 21:59:53 +00001179 self.assertRaises(TypeError, _sre.compile, {}, 0, [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181 def test_search_dot_unicode(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001182 self.assertTrue(re.search("123.*-", '123abc-'))
1183 self.assertTrue(re.search("123.*-", '123\xe9-'))
1184 self.assertTrue(re.search("123.*-", '123\u20ac-'))
1185 self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1186 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001187
Ezio Melottidf723e12012-03-13 01:29:48 +02001188 def test_compile(self):
1189 # Test return value when given string and pattern as parameter
1190 pattern = re.compile('random pattern')
1191 self.assertIsInstance(pattern, re._pattern_type)
1192 same_pattern = re.compile(pattern)
1193 self.assertIsInstance(same_pattern, re._pattern_type)
1194 self.assertIs(same_pattern, pattern)
1195 # Test behaviour when not given a string or pattern as parameter
1196 self.assertRaises(TypeError, re.compile, 0)
1197
Ezio Melottife8e6e72013-01-11 08:32:01 +02001198 def test_bug_13899(self):
1199 # Issue #13899: re pattern r"[\A]" should work like "A" but matches
1200 # nothing. Ditto B and Z.
1201 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
1202 ['A', 'B', '\b', 'C', 'Z'])
1203
Antoine Pitroub33941a2012-12-03 20:55:56 +01001204 @bigmemtest(size=_2G, memuse=1)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001205 def test_large_search(self, size):
1206 # Issue #10182: indices were 32-bit-truncated.
1207 s = 'a' * size
1208 m = re.search('$', s)
1209 self.assertIsNotNone(m)
Antoine Pitrou86067c22012-12-03 21:08:43 +01001210 self.assertEqual(m.start(), size)
1211 self.assertEqual(m.end(), size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001212
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001213 # The huge memuse is because of re.sub() using a list and a join()
1214 # to create the replacement result.
Antoine Pitroub33941a2012-12-03 20:55:56 +01001215 @bigmemtest(size=_2G, memuse=16 + 2)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001216 def test_large_subn(self, size):
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001217 # Issue #10182: indices were 32-bit-truncated.
1218 s = 'a' * size
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001219 r, n = re.subn('', '', s)
1220 self.assertEqual(r, s)
1221 self.assertEqual(n, size + 1)
1222
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001223 def test_bug_16688(self):
1224 # Issue 16688: Backreferences make case-insensitive regex fail on
1225 # non-ASCII strings.
1226 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1227 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001228
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001229 def test_repeat_minmax_overflow(self):
1230 # Issue #13169
1231 string = "x" * 100000
1232 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1233 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1234 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1235 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1236 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1237 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1238 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1239 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1240 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1241 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1242 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1243
1244 @cpython_only
1245 def test_repeat_minmax_overflow_maxrepeat(self):
1246 try:
1247 from _sre import MAXREPEAT
1248 except ImportError:
1249 self.skipTest('requires _sre.MAXREPEAT constant')
1250 string = "x" * 100000
1251 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1252 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1253 (0, 100000))
1254 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1255 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1256 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1257 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1258
R David Murray26dfaac92013-04-14 13:00:54 -04001259 def test_backref_group_name_in_exception(self):
1260 # Issue 17341: Poor error message when compiling invalid regex
1261 with self.assertRaisesRegex(sre_constants.error, '<foo>'):
1262 re.compile('(?P=<foo>)')
1263
1264 def test_group_name_in_exception(self):
1265 # Issue 17341: Poor error message when compiling invalid regex
1266 with self.assertRaisesRegex(sre_constants.error, '\?foo'):
1267 re.compile('(?P<?foo>)')
1268
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001269 def test_issue17998(self):
1270 for reps in '*', '+', '?', '{1}':
1271 for mod in '', '?':
1272 pattern = '.' + reps + mod + 'yz'
1273 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1274 ['xyz'], msg=pattern)
1275 pattern = pattern.encode()
1276 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1277 [b'xyz'], msg=pattern)
1278
def test_match_repr(self):
    """repr() of a match object shows its type, its span and the matched text.

    Checked for str subjects (including the str subclass S), for
    bytes-like subjects (bytes, the bytes subclass B, bytearray and
    memoryview), and for both matches produced by a single finditer() call.
    """
    for subject in '[abracadabra]', S('[abracadabra]'):
        m = re.search(r'(.+)(.*?)\1', subject)
        self.assertEqual(repr(m), "<%s.%s object; "
                                  "span=(1, 12), match='abracadabra'>" %
                         (type(m).__module__, type(m).__qualname__))
    for subject in (b'[abracadabra]', B(b'[abracadabra]'),
                    bytearray(b'[abracadabra]'),
                    memoryview(b'[abracadabra]')):
        m = re.search(rb'(.+)(.*?)\1', subject)
        self.assertEqual(repr(m), "<%s.%s object; "
                                  "span=(1, 12), match=b'abracadabra'>" %
                         (type(m).__module__, type(m).__qualname__))

    first, second = re.finditer("(aa)|(bb)", "aa bb")
    # Each expected repr is built from the type of the match it describes.
    self.assertEqual(repr(first), "<%s.%s object; "
                                  "span=(0, 2), match='aa'>" %
                     (type(first).__module__, type(first).__qualname__))
    self.assertEqual(repr(second), "<%s.%s object; "
                                   "span=(3, 5), match='bb'>" %
                     (type(second).__module__, type(second).__qualname__))
1300
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001301
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001302 def test_bug_2537(self):
1303 # issue 2537: empty submatches
1304 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1305 for inner_op in ('{0,}', '*', '?'):
1306 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1307 m = r.match("xyyzy")
1308 self.assertEqual(m.group(0), "xyy")
1309 self.assertEqual(m.group(1), "")
1310 self.assertEqual(m.group(2), "y")
1311
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001312 def test_debug_flag(self):
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001313 pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001314 with captured_stdout() as out:
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001315 re.compile(pat, re.DEBUG)
1316 dump = '''\
1317subpattern 1
1318 literal 46
1319subpattern None
1320 branch
1321 in
1322 literal 99
1323 literal 104
1324 or
1325 literal 112
1326 literal 121
1327subpattern None
1328 groupref_exists 1
1329 at at_end
1330 else
1331 literal 58
1332 literal 32
1333'''
1334 self.assertEqual(out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001335 # Debug output is output again even a second time (bypassing
1336 # the cache -- issue #20426).
1337 with captured_stdout() as out:
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001338 re.compile(pat, re.DEBUG)
1339 self.assertEqual(out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001340
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001341 def test_keyword_parameters(self):
1342 # Issue #20283: Accepting the string keyword parameter.
1343 pat = re.compile(r'(ab)')
1344 self.assertEqual(
1345 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1346 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001347 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1348 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001349 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1350 self.assertEqual(
1351 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1352 self.assertEqual(
1353 pat.split(string='abracadabra', maxsplit=1),
1354 ['', 'ab', 'racadabra'])
1355 self.assertEqual(
1356 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1357 (7, 9))
1358
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001359 def test_bug_20998(self):
1360 # Issue #20998: Fullmatch of repeated single character pattern
1361 # with ignore case.
1362 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1363
def test_locale_caching(self):
    """Issue #22410: run the locale checks in both orders after re.purge()."""
    saved_locale = locale.setlocale(locale.LC_CTYPE)
    self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_locale)
    # Both locales have to be available, otherwise the test is skipped.
    for loc in ('en_US.iso88591', 'en_US.utf8'):
        try:
            locale.setlocale(locale.LC_CTYPE, loc)
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs %s locale' % loc)

    orderings = (
        (self.check_en_US_iso88591, self.check_en_US_utf8),
        (self.check_en_US_utf8, self.check_en_US_iso88591),
    )
    for ordering in orderings:
        re.purge()
        for check in ordering:
            check()
1381
def check_en_US_iso88591(self):
    """Under en_US.iso88591, b'\\xc5' and b'\\xe5' match each other with L|I."""
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    # First with the flags passed as an argument, then given inline.
    for prefix, flags in ((b'', re.L|re.I), (b'(?Li)', 0)):
        self.assertTrue(re.match(prefix + b'\xc5\xe5', b'\xc5\xe5', flags))
        self.assertTrue(re.match(prefix + b'\xc5', b'\xe5', flags))
        self.assertTrue(re.match(prefix + b'\xe5', b'\xc5', flags))
1390
def check_en_US_utf8(self):
    """Under en_US.utf8, b'\\xc5' and b'\\xe5' must not match each other."""
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    # First with the flags passed as an argument, then given inline.
    for prefix, flags in ((b'', re.L|re.I), (b'(?Li)', 0)):
        # Identical bytes still match ...
        self.assertTrue(re.match(prefix + b'\xc5\xe5', b'\xc5\xe5', flags))
        # ... but the two bytes are not case variants of each other here.
        self.assertIsNone(re.match(prefix + b'\xc5', b'\xe5', flags))
        self.assertIsNone(re.match(prefix + b'\xe5', b'\xc5', flags))
1399
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001400
class PatternReprTests(unittest.TestCase):
    """Tests for repr() of compiled pattern objects."""

    def check(self, pattern, expected):
        """Compile *pattern* without flags and compare its repr()."""
        compiled = re.compile(pattern)
        self.assertEqual(repr(compiled), expected)

    def check_flags(self, pattern, flags, expected):
        """Compile *pattern* with *flags* and compare its repr()."""
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        self.check('random pattern', "re.compile('random pattern')")

    def test_single_flag(self):
        expected = "re.compile('random pattern', re.IGNORECASE)"
        self.check_flags('random pattern', re.IGNORECASE, expected)

    def test_multiple_flags(self):
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.check_flags('random pattern', re.I|re.S|re.X, expected)

    def test_unicode_flag(self):
        # re.U does not show up in the repr of a str pattern.
        self.check_flags('random pattern', re.U,
                         "re.compile('random pattern')")
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL)")
        self.check_flags('random pattern', re.I|re.S|re.U, expected)

    def test_inline_flags(self):
        expected = "re.compile('(?i)pattern', re.IGNORECASE)"
        self.check('(?i)pattern', expected)

    def test_unknown_flags(self):
        # Flag bits without a name are shown as a hexadecimal number.
        self.check_flags('random pattern', 0x123000,
                         "re.compile('random pattern', 0x123000)")
        self.check_flags(
            'random pattern', 0x123000|re.I,
            "re.compile('random pattern', re.IGNORECASE|0x123000)")

    def test_bytes(self):
        self.check(b'bytes pattern', "re.compile(b'bytes pattern')")
        self.check_flags(b'bytes pattern', re.A,
                         "re.compile(b'bytes pattern', re.ASCII)")

    def test_quotes(self):
        cases = (
            ('random "double quoted" pattern',
             '''re.compile('random "double quoted" pattern')'''),
            ("random 'single quoted' pattern",
             '''re.compile("random 'single quoted' pattern")'''),
            ('''both 'single' and "double" quotes''',
             '''re.compile('both \\'single\\' and "double" quotes')'''),
        )
        for pattern, expected in cases:
            self.check(pattern, expected)

    def test_long_pattern(self):
        pattern = 'Very %spattern' % ('long ' * 1000)
        head = "re.compile('Very long long lon"

        plain = repr(re.compile(pattern))
        self.assertLess(len(plain), 300)
        self.assertEqual(plain[:30], head)

        flagged = repr(re.compile(pattern, re.I))
        self.assertLess(len(flagged), 300)
        self.assertEqual(flagged[:30], head)
        self.assertEqual(flagged[-16:], ", re.IGNORECASE)")
1461
1462
class ImplementationTest(unittest.TestCase):
    """
    Test implementation details of the re module.
    """

    def test_overlap_table(self):
        """Check sre_compile._generate_overlap_table() on known prefixes."""
        overlap_table = sre_compile._generate_overlap_table
        cases = (
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        )
        for prefix, expected in cases:
            self.assertEqual(overlap_table(prefix), expected)
1476
1477
def _group_text(match, key):
    """Return match.group(key), mapping a missing group to "Error" and an
    unmatched group to "None" so the value can be used in string concats."""
    try:
        value = match.group(key)
    except IndexError:
        return "Error"
    return "None" if value is None else value


def run_re_tests():
    """Run the table-driven cases from test.re_tests, printing every failure.

    Each entry of the table is either (pattern, string, outcome) or
    (pattern, string, outcome, repl, expected).  The pattern is compiled
    and searched for in the string; for entries expected to SUCCEED the
    expression `repl` is evaluated against the match and compared with
    `expected`, and the search is repeated with bytes arguments, with the
    search range limited to the match, and with the IGNORECASE, LOCALE and
    UNICODE flags.

    Problems are reported by printing a line starting with '===' or '***';
    nothing is returned.

    Raises:
        ValueError: if a table entry has neither 3 nor 5 fields.
    """
    from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
    if verbose:
        print('Running re_tests test suite')

    for t in tests:
        sys.stdout.flush()
        pattern = s = outcome = repl = expected = None
        if len(t) == 5:
            pattern, s, outcome, repl, expected = t
        elif len(t) == 3:
            pattern, s, outcome = t
        else:
            raise ValueError('Test tuples should have 3 or 5 fields', t)

        try:
            obj = re.compile(pattern)
        except re.error:
            if outcome != SYNTAX_ERROR:
                print('=== Syntax error:', t)
            continue
        except Exception:
            # KeyboardInterrupt and SystemExit are not Exception subclasses
            # and propagate untouched, with their original traceback.
            print('*** Unexpected error ***', t)
            if verbose:
                traceback.print_exc(file=sys.stdout)
            continue

        try:
            result = obj.search(s)
        except re.error as msg:
            print('=== Unexpected exception', t, repr(msg))
            # There is no result for this entry; going on would use an
            # unbound name or the stale result of the previous entry.
            continue

        if outcome == SYNTAX_ERROR:
            # This should have been a syntax error; forget it.
            continue
        if outcome == FAIL:
            if result is not None:
                print('=== Succeeded incorrectly', t)
            continue
        if outcome != SUCCEED:
            continue

        if result is not None:
            # Matched, as expected, so now we compute the
            # result string and compare it to our expected result.
            vardict = {'found': result.group(0),
                       'groups': result.group(),
                       'flags': result.re.flags}
            for i in range(1, 100):
                vardict['g%d' % i] = _group_text(result, i)
            for name in result.re.groupindex:
                vardict[name] = _group_text(result, name)
            # NOTE: eval() is acceptable here only because `repl` comes
            # from the test.re_tests table, never from external input.
            repl = eval(repl, vardict)
            if repl != expected:
                print('=== grouping error', t, end=' ')
                print(repr(repl) + ' should be ' + repr(expected))
        else:
            print('=== Failed incorrectly', t)

        # Try the match with both pattern and string converted to
        # bytes, and check that it still succeeds.
        try:
            bpat = bytes(pattern, "ascii")
            bs = bytes(s, "ascii")
        except UnicodeEncodeError:
            # skip non-ascii tests
            pass
        else:
            try:
                bpat = re.compile(bpat)
            except Exception:
                print('=== Fails on bytes pattern compile', t)
                if verbose:
                    traceback.print_exc(file=sys.stdout)
            else:
                bytes_result = bpat.search(bs)
                if bytes_result is None:
                    print('=== Fails on bytes pattern match', t)

        # Try the match with the search area limited to the extent
        # of the match and see if it still succeeds.  \B will
        # break (because it won't match at the end or start of a
        # string), so we'll ignore patterns that feature it.
        if (pattern[:2] != '\\B' and pattern[-2:] != '\\B'
                and result is not None):
            obj = re.compile(pattern)
            result = obj.search(s, result.start(0), result.end(0) + 1)
            if result is None:
                print('=== Failed on range-limited match', t)

        # Try the match with IGNORECASE enabled, and check that it
        # still succeeds.
        obj = re.compile(pattern, re.IGNORECASE)
        result = obj.search(s)
        if result is None:
            print('=== Fails on case-insensitive match', t)

        # Try the match with LOCALE enabled, and check that it
        # still succeeds.
        if '(?u)' not in pattern:
            obj = re.compile(pattern, re.LOCALE)
            result = obj.search(s)
            if result is None:
                print('=== Fails on locale-sensitive match', t)

        # Try the match with UNICODE locale enabled, and check
        # that it still succeeds.
        obj = re.compile(pattern, re.UNICODE)
        result = obj.search(s)
        if result is None:
            print('=== Fails on unicode-sensitive match', t)
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001604
Gregory P. Smith5a631832010-07-27 05:31:29 +00001605
def test_main():
    """Run the unittest classes of this module, then the re_tests table."""
    run_unittest(__name__)
    run_re_tests()


# Allow running this test file directly as a script.
if __name__ == "__main__":
    test_main()