blob: 7348af3f1aaeb32032cacaf9369f16dae1225477 [file] [log] [blame]
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
Antoine Pitroud2cc7432014-02-03 20:59:59 +01002 cpython_only, captured_stdout
Benjamin Petersone48944b2012-03-07 14:50:25 -06003import io
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02004import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00005import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00006from re import Scanner
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02007import sre_compile
R David Murray26dfaac92013-04-14 13:00:54 -04008import sre_constants
Ezio Melottid2114eb2011-03-25 14:08:44 +02009import sys
10import string
11import traceback
Antoine Pitrou79aa68d2013-10-25 21:36:10 +020012import unittest
Raymond Hettinger027bb632004-05-31 03:09:25 +000013from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000014
Guido van Rossum23b22571997-07-17 22:36:14 +000015# Misc tests from Tim Peters' re.doc
16
Just van Rossum6802c6e2003-07-02 14:36:59 +000017# WARNING: Don't change details in these tests if you don't know
Ezio Melotti42da6632011-03-15 05:18:48 +020018# what you're doing. Some of these tests were carefully modeled to
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +000019# cover most of the code.
20
# str subclass whose indexing/slicing results are wrapped back into S, so the
# tests can feed the re module a str subclass.
class S(str):
    def __getitem__(self, index):
        piece = super().__getitem__(index)
        return S(piece)
24
# bytes subclass whose indexing/slicing results are passed back through B(),
# so the tests can feed the re module a bytes subclass.
class B(bytes):
    def __getitem__(self, index):
        piece = super().__getitem__(index)
        return B(piece)
28
Skip Montanaro8ed06da2003-04-24 19:43:18 +000029class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000030
def assertTypedEqual(self, actual, expect, msg=None):
    # Assert that actual == expect and, additionally, that every leaf value
    # has exactly the same type as its counterpart.  Tuples and lists in
    # `expect` are walked element by element; anything else is a leaf.
    self.assertEqual(actual, expect, msg)

    def check_types(got, wanted):
        if not isinstance(wanted, (tuple, list)):
            self.assertIs(type(got), type(wanted), msg)
            return
        for got_item, wanted_item in zip(got, wanted):
            check_types(got_item, wanted_item)

    check_types(actual, expect)
40
Benjamin Petersone48944b2012-03-07 14:50:25 -060041 def test_keep_buffer(self):
42 # See bug 14212
43 b = bytearray(b'x')
44 it = re.finditer(b'a', b)
45 with self.assertRaises(BufferError):
46 b.extend(b'x'*400)
47 list(it)
48 del it
49 gc_collect()
50 b.extend(b'x'*400)
51
Raymond Hettinger027bb632004-05-31 03:09:25 +000052 def test_weakref(self):
53 s = 'QabbbcR'
54 x = re.compile('ab+c')
55 y = proxy(x)
56 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
57
Skip Montanaro8ed06da2003-04-24 19:43:18 +000058 def test_search_star_plus(self):
59 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
60 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
61 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
62 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030063 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000064 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
65 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
66 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
67 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030068 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000069
def bump_num(self, matchobj):
    # Replacement callback: parse the whole match as an integer and return
    # the text of that integer plus one.
    return str(int(matchobj.group(0)) + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000073
def test_basic_re_sub(self):
    # re.sub() with literal, callable and template replacements.
    # Regex and template literals containing backslashes are raw strings so
    # that they do not rely on Python passing unknown escapes through.

    # The result has exactly the type of the plain 'xaz' / b'xaz' literal,
    # whatever str/bytes subclass or buffer object was passed in.
    self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
    self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
    self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
    self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
    for y in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')

    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    # Callable replacement, without and with a count limit of 3.
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                     '9.3 -3 23x99y')

    # A callable's return value is inserted as-is; a template string has
    # its escapes processed.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    # Named groups can be referenced by name or by number.
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D', 'a'),
                     '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))

    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000110
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000111 def test_bug_449964(self):
112 # fails for group followed by other escape
113 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
114 'xx\bxx\b')
115
116 def test_bug_449000(self):
117 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000118 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
119 'abc\ndef\n')
120 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
121 'abc\ndef\n')
122 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
123 'abc\ndef\n')
124 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
125 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000126
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000127 def test_bug_1661(self):
128 # Verify that flags do not get silently ignored with compiled patterns
129 pattern = re.compile('.')
130 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
131 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
132 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
133 self.assertRaises(ValueError, re.compile, pattern, re.I)
134
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000135 def test_bug_3629(self):
136 # A regex that triggered a bug in the sre-code validator
137 re.compile("(?P<quote>)(?(quote))")
138
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000139 def test_sub_template_numeric_escape(self):
140 # bug 776311 and friends
141 self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
142 self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
143 self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
144 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
145 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
146 self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
147 self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
148
149 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
150 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
151
152 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
153 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
154 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
155 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
156 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
157
158 self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
159 self.assertEqual(re.sub('x', r'\777', 'x'), '\377')
Tim Peters0e9980f2004-09-12 03:49:31 +0000160
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000161 self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
162 self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
163 self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
164 self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
165 self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
166 self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
167 self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
168 self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
169 self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
170 self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
171 self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
172 self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
173
174 # in python2.3 (etc), these loop endlessly in sre_parser.py
175 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
176 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
177 'xz8')
178 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
179 'xza')
180
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000181 def test_qualified_re_sub(self):
182 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
183 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000184
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000185 def test_bug_114660(self):
186 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
187 'hello there')
188
189 def test_bug_462270(self):
190 # Test for empty sub() behaviour, see SF bug #462270
191 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
192 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
193
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200194 def test_symbolic_groups(self):
195 re.compile('(?P<a>x)(?P=a)(?(a)y)')
196 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
197 self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
198 self.assertRaises(re.error, re.compile, '(?Px)')
199 self.assertRaises(re.error, re.compile, '(?P=)')
200 self.assertRaises(re.error, re.compile, '(?P=1)')
201 self.assertRaises(re.error, re.compile, '(?P=a)')
202 self.assertRaises(re.error, re.compile, '(?P=a1)')
203 self.assertRaises(re.error, re.compile, '(?P=a.)')
204 self.assertRaises(re.error, re.compile, '(?P<)')
205 self.assertRaises(re.error, re.compile, '(?P<>)')
206 self.assertRaises(re.error, re.compile, '(?P<1>)')
207 self.assertRaises(re.error, re.compile, '(?P<a.>)')
208 self.assertRaises(re.error, re.compile, '(?())')
209 self.assertRaises(re.error, re.compile, '(?(a))')
210 self.assertRaises(re.error, re.compile, '(?(1a))')
211 self.assertRaises(re.error, re.compile, '(?(a.))')
Georg Brandl1d472b72013-04-14 11:40:00 +0200212 # New valid/invalid identifiers in Python 3
213 re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
214 re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
215 self.assertRaises(re.error, re.compile, '(?P<©>x)')
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200216
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000217 def test_symbolic_refs(self):
218 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
219 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
220 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
221 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200222 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<>', 'xx')
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000223 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
224 self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
225 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
226 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000227 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
Georg Brandl1d472b72013-04-14 11:40:00 +0200228 # New valid/invalid identifiers in Python 3
229 self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
230 self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
231 self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000232
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000233 def test_re_subn(self):
234 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
235 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
236 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
237 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
238 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000239
def test_re_split(self):
    # re.split() on str and bytes-like subjects, with and without capturing
    # groups in the separator.  The loop variable is named `subject` so it
    # does not shadow the `string` module imported by this file.
    for subject in ":a:b::c", S(":a:b::c"):
        self.assertTypedEqual(re.split(":", subject),
                              ['', 'a', 'b', '', 'c'])
        self.assertTypedEqual(re.split(":*", subject),
                              ['', 'a', 'b', 'c'])
        self.assertTypedEqual(re.split("(:*)", subject),
                              ['', ':', 'a', ':', 'b', '::', 'c'])
    for subject in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                    memoryview(b":a:b::c")):
        self.assertTypedEqual(re.split(b":", subject),
                              [b'', b'a', b'b', b'', b'c'])
        self.assertTypedEqual(re.split(b":*", subject),
                              [b'', b'a', b'b', b'c'])
        self.assertTypedEqual(re.split(b"(:*)", subject),
                              [b'', b':', b'a', b':', b'b', b'::', b'c'])
    # Each three-character string unpacks into the fields a, b and c.
    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                    "\U0001d49c\U0001d49e\U0001d4b5"):
        subject = ":%s:%s::%s" % (a, b, c)
        self.assertEqual(re.split(":", subject), ['', a, b, '', c])
        self.assertEqual(re.split(":*", subject), ['', a, b, c])
        self.assertEqual(re.split("(:*)", subject),
                         ['', ':', a, ':', b, '::', c])

    self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
    self.assertEqual(re.split("(:)*", ":a:b::c"),
                     ['', ':', 'a', ':', 'b', ':', 'c'])
    self.assertEqual(re.split("([b:]+)", ":a:b::c"),
                     ['', ':', 'a', ':b::', 'c'])
    self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
                     ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c'])
    self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
                     ['', 'a', '', '', 'c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000274
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000275 def test_qualified_re_split(self):
276 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
277 self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
278 self.assertEqual(re.split("(:)", ":a:b::c", 2),
279 ['', ':', 'a', ':', 'b::c'])
280 self.assertEqual(re.split("(:*)", ":a:b::c", 2),
281 ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000282
def test_re_findall(self):
    # re.findall() with zero, one and two capturing groups on str and
    # bytes-like subjects.  The loop variable is named `subject` so it does
    # not shadow the `string` module imported by this file.
    self.assertEqual(re.findall(":+", "abc"), [])
    for subject in "a:b::c:::d", S("a:b::c:::d"):
        self.assertTypedEqual(re.findall(":+", subject),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:+)", subject),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:)(:*)", subject),
                              [(":", ""), (":", ":"), (":", "::")])
    for subject in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
                    memoryview(b"a:b::c:::d")):
        self.assertTypedEqual(re.findall(b":+", subject),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:+)", subject),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:)(:*)", subject),
                              [(b":", b""), (b":", b":"), (b":", b"::")])
    # Same shape of subject, using a non-ASCII character as the separator.
    for x in ("\xe0", "\u0430", "\U0001d49c"):
        xx = x * 2
        xxx = x * 3
        subject = "a%sb%sc%sd" % (x, xx, xxx)
        self.assertEqual(re.findall("%s+" % x, subject), [x, xx, xxx])
        self.assertEqual(re.findall("(%s+)" % x, subject), [x, xx, xxx])
        self.assertEqual(re.findall("(%s)(%s*)" % (x, x), subject),
                         [(x, ""), (x, x), (x, xx)])
Guido van Rossum49946571997-07-18 04:26:25 +0000308
Skip Montanaro5ba00542003-04-25 16:00:14 +0000309 def test_bug_117612(self):
310 self.assertEqual(re.findall(r"(a|(b))", "aba"),
311 [("a", ""),("b", "b"),("a", "")])
312
def test_re_match(self):
    # Match.groups() and Match.group() on str, bytes-like and non-ASCII
    # subjects.  The loop variable is named `subject` so it does not shadow
    # the `string` module imported by this file.
    for subject in 'a', S('a'):
        self.assertEqual(re.match('a', subject).groups(), ())
        self.assertEqual(re.match('(a)', subject).groups(), ('a',))
        self.assertEqual(re.match('(a)', subject).group(0), 'a')
        self.assertEqual(re.match('(a)', subject).group(1), 'a')
        self.assertEqual(re.match('(a)', subject).group(1, 1), ('a', 'a'))
    for subject in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
        self.assertEqual(re.match(b'a', subject).groups(), ())
        self.assertEqual(re.match(b'(a)', subject).groups(), (b'a',))
        self.assertEqual(re.match(b'(a)', subject).group(0), b'a')
        self.assertEqual(re.match(b'(a)', subject).group(1), b'a')
        self.assertEqual(re.match(b'(a)', subject).group(1, 1), (b'a', b'a'))
    for a in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.match(a, a).groups(), ())
        self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
        self.assertEqual(re.match('(%s)' % a, a).group(0), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))

    # Groups that did not participate are None, or the default passed to
    # groups().
    pat = re.compile('((a)|(b))(c)?')
    self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
    self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
    self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
    self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
    self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

    # A single group
    m = re.match('(a)', 'a')
    # group() with no argument and group(0) must both give the whole match
    # (previously group(0) was asserted twice).
    self.assertEqual(m.group(), 'a')
    self.assertEqual(m.group(0), 'a')
    self.assertEqual(m.group(1), 'a')
    self.assertEqual(m.group(1, 1), ('a', 'a'))

    # Groups can be selected by number, by name, or a mix of both.
    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                     (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000352
def test_re_fullmatch(self):
    # Issue 16203: Proposal: add re.fullmatch() method.
    # Local names `subject` and `pattern` are used so that nothing here
    # shadows the `string` module imported by this file.
    self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
    for subject in "ab", S("ab"):
        self.assertEqual(re.fullmatch(r"a|ab", subject).span(), (0, 2))
    for subject in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
        self.assertEqual(re.fullmatch(br"a|ab", subject).span(), (0, 2))
    # Each two-character string unpacks into the characters a and b.
    for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
        pattern = r"%s|%s" % (a, a + b)
        self.assertEqual(re.fullmatch(pattern, a + b).span(), (0, 2))
    # Lazy quantifiers must still stretch to cover the whole string.
    self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3))
    self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3))
    self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2))
    self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3))
    self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
    self.assertIsNone(re.fullmatch(r"a+", "ab"))
    # A trailing newline left over after the pattern prevents a full match.
    self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
    self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
    self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
    self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
    self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4))
    self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2))

    # With pos/endpos, the match must span exactly [pos, endpos).
    self.assertEqual(
        re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
    self.assertEqual(
        re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
    self.assertEqual(
        re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
382
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000383 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000384 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
385 ('(', 'a'))
386 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
387 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300388 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
389 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000390 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
391 ('a', 'b'))
392 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
393 (None, 'd'))
394 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
395 (None, 'd'))
396 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
397 ('a', ''))
398
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000399 # Tests for bug #1177831: exercise groups other than the first group
400 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
401 self.assertEqual(p.match('abc').groups(),
402 ('a', 'b', 'c'))
403 self.assertEqual(p.match('ad').groups(),
404 ('a', None, 'd'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300405 self.assertIsNone(p.match('abd'))
406 self.assertIsNone(p.match('ac'))
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000407
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000408
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000409 def test_re_groupref(self):
410 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
411 ('|', 'a'))
412 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
413 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300414 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
415 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000416 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
417 ('a', 'a'))
418 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
419 (None, None))
420
421 def test_groupdict(self):
422 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
423 'first second').groupdict(),
424 {'first':'first', 'second':'second'})
425
426 def test_expand(self):
427 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
428 "first second")
429 .expand(r"\2 \1 \g<second> \g<first>"),
430 "second first second first")
431
432 def test_repeat_minmax(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300433 self.assertIsNone(re.match("^(\w){1}$", "abc"))
434 self.assertIsNone(re.match("^(\w){1}?$", "abc"))
435 self.assertIsNone(re.match("^(\w){1,2}$", "abc"))
436 self.assertIsNone(re.match("^(\w){1,2}?$", "abc"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000437
438 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
439 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
440 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
441 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
442 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
443 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
444 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
445 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
446
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300447 self.assertIsNone(re.match("^x{1}$", "xxx"))
448 self.assertIsNone(re.match("^x{1}?$", "xxx"))
449 self.assertIsNone(re.match("^x{1,2}$", "xxx"))
450 self.assertIsNone(re.match("^x{1,2}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000451
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300452 self.assertTrue(re.match("^x{3}$", "xxx"))
453 self.assertTrue(re.match("^x{1,3}$", "xxx"))
454 self.assertTrue(re.match("^x{1,4}$", "xxx"))
455 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
456 self.assertTrue(re.match("^x{3}?$", "xxx"))
457 self.assertTrue(re.match("^x{1,3}?$", "xxx"))
458 self.assertTrue(re.match("^x{1,4}?$", "xxx"))
459 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000460
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300461 self.assertIsNone(re.match("^x{}$", "xxx"))
462 self.assertTrue(re.match("^x{}$", "x{}"))
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000463
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000464 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000465 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000466 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000467 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
468 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
469 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
470 {'first': 1, 'other': 2})
471
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000472 self.assertEqual(re.match("(a)", "a").pos, 0)
473 self.assertEqual(re.match("(a)", "a").endpos, 1)
474 self.assertEqual(re.match("(a)", "a").string, "a")
475 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300476 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000477
478 def test_special_escapes(self):
479 self.assertEqual(re.search(r"\b(b.)\b",
480 "abcd abc bcd bx").group(1), "bx")
481 self.assertEqual(re.search(r"\B(b.)\B",
482 "abc bcd bc abxd").group(1), "bx")
483 self.assertEqual(re.search(r"\b(b.)\b",
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300484 "abcd abc bcd bx", re.ASCII).group(1), "bx")
485 self.assertEqual(re.search(r"\B(b.)\B",
486 "abc bcd bc abxd", re.ASCII).group(1), "bx")
487 self.assertEqual(re.search(r"\b(b.)\b",
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000488 "abcd abc bcd bx", re.LOCALE).group(1), "bx")
489 self.assertEqual(re.search(r"\B(b.)\B",
490 "abc bcd bc abxd", re.LOCALE).group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000491 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
492 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300493 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300494 self.assertEqual(re.search(br"\b(b.)\b",
495 b"abcd abc bcd bx").group(1), b"bx")
496 self.assertEqual(re.search(br"\B(b.)\B",
497 b"abc bcd bc abxd").group(1), b"bx")
498 self.assertEqual(re.search(br"\b(b.)\b",
499 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
500 self.assertEqual(re.search(br"\B(b.)\B",
501 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
502 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
503 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300504 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000505 self.assertEqual(re.search(r"\d\D\w\W\s\S",
506 "1aa! a").group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300507 self.assertEqual(re.search(br"\d\D\w\W\s\S",
508 b"1aa! a").group(0), b"1aa! a")
509 self.assertEqual(re.search(r"\d\D\w\W\s\S",
510 "1aa! a", re.ASCII).group(0), "1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000511 self.assertEqual(re.search(r"\d\D\w\W\s\S",
512 "1aa! a", re.LOCALE).group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300513 self.assertEqual(re.search(br"\d\D\w\W\s\S",
514 b"1aa! a", re.LOCALE).group(0), b"1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000515
Ezio Melotti5a045b92012-02-29 11:48:44 +0200516 def test_string_boundaries(self):
517 # See http://bugs.python.org/issue10713
518 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
519 "abc")
520 # There's a word boundary at the start of a string.
521 self.assertTrue(re.match(r"\b", "abc"))
522 # A non-empty string includes a non-boundary zero-length match.
523 self.assertTrue(re.search(r"\B", "abc"))
524 # There is no non-boundary match at the start of a string.
525 self.assertFalse(re.match(r"\B", "abc"))
526 # However, an empty string contains no word boundaries, and also no
527 # non-boundaries.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300528 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200529 # This one is questionable and different from the perlre behaviour,
530 # but describes current behavior.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300531 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200532 # A single word-character string has two boundaries, but no
533 # non-boundary gaps.
534 self.assertEqual(len(re.findall(r"\b", "a")), 2)
535 self.assertEqual(len(re.findall(r"\B", "a")), 0)
536 # If there are no words, there are no boundaries
537 self.assertEqual(len(re.findall(r"\b", " ")), 0)
538 self.assertEqual(len(re.findall(r"\b", " ")), 0)
539 # Can match around the whitespace.
540 self.assertEqual(len(re.findall(r"\B", " ")), 2)
541
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000542 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000543 self.assertEqual(re.match("([\u2222\u2223])",
544 "\u2222").group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300545 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300546 self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000547
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100548 def test_big_codesize(self):
549 # Issue #1160
550 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300551 self.assertTrue(r.match('1000'))
552 self.assertTrue(r.match('9999'))
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100553
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000554 def test_anyall(self):
555 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
556 "a\nb")
557 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
558 "a\n\nb")
559
Serhiy Storchakaa3369a52015-02-21 12:08:52 +0200560 def test_lookahead(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000561 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
562 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
563 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
564 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
565 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
566 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
567 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
568
569 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
570 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
571 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
572 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
573
Serhiy Storchakaa3369a52015-02-21 12:08:52 +0200574 # Group reference.
575 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
576 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
577 # Named group reference.
578 self.assertTrue(re.match(r'(?P<g>a)b(?=(?P=g))a', 'aba'))
579 self.assertIsNone(re.match(r'(?P<g>a)b(?=(?P=g))c', 'abac'))
580 # Conditional group reference.
581 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
582 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
583 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
584 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
585 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
586 # Group used before defined.
587 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
588 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
589 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
590
def test_lookbehind(self):
    # Positive (?<=...) and negative (?<!...) lookbehind assertions placed
    # right after the literal "ab".
    self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
    self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
    self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
    self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
    # The remaining cases only compile the pattern: each one refers to a
    # group from inside a lookbehind and is expected to emit RuntimeWarning.
    # NOTE(review): this expectation looks tied to the interpreter version
    # this file targets -- confirm before running it on another version.
    # Group reference.
    self.assertWarns(RuntimeWarning, re.compile, r'(a)a(?<=\1)c')
    # Named group reference.
    self.assertWarns(RuntimeWarning, re.compile, r'(?P<g>a)a(?<=(?P=g))c')
    # Conditional group reference.
    self.assertWarns(RuntimeWarning, re.compile, r'(a)b(?<=(?(1)b|x))c')
    # Group used before defined.
    self.assertWarns(RuntimeWarning, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
604
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000605 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000606 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300607 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000608 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
609 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
610 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
611 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
612 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
613 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
614 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
615 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
616
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200617 assert '\u212a'.lower() == 'k' # 'K'
618 self.assertTrue(re.match(r'K', '\u212a', re.I))
619 self.assertTrue(re.match(r'k', '\u212a', re.I))
620 self.assertTrue(re.match(r'\u212a', 'K', re.I))
621 self.assertTrue(re.match(r'\u212a', 'k', re.I))
622 assert '\u017f'.upper() == 'S' # 'ſ'
623 self.assertTrue(re.match(r'S', '\u017f', re.I))
624 self.assertTrue(re.match(r's', '\u017f', re.I))
625 self.assertTrue(re.match(r'\u017f', 'S', re.I))
626 self.assertTrue(re.match(r'\u017f', 's', re.I))
627 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
628 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
629 self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
630
631 def test_ignore_case_set(self):
632 self.assertTrue(re.match(r'[19A]', 'A', re.I))
633 self.assertTrue(re.match(r'[19a]', 'a', re.I))
634 self.assertTrue(re.match(r'[19a]', 'A', re.I))
635 self.assertTrue(re.match(r'[19A]', 'a', re.I))
636 self.assertTrue(re.match(br'[19A]', b'A', re.I))
637 self.assertTrue(re.match(br'[19a]', b'a', re.I))
638 self.assertTrue(re.match(br'[19a]', b'A', re.I))
639 self.assertTrue(re.match(br'[19A]', b'a', re.I))
640 assert '\u212a'.lower() == 'k' # 'K'
641 self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
642 self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
643 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
644 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
645 assert '\u017f'.upper() == 'S' # 'ſ'
646 self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
647 self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
648 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
649 self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
650 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
651 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
652 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
653
Serhiy Storchakab1847e72014-10-31 12:37:50 +0200654 def test_ignore_case_range(self):
655 # Issues #3511, #17381.
656 self.assertTrue(re.match(r'[9-a]', '_', re.I))
657 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
658 self.assertTrue(re.match(br'[9-a]', b'_', re.I))
659 self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
660 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
661 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
662 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
663 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
664 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
665 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
666 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
667 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
668 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
669 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
670 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
671 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
672
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200673 assert '\u212a'.lower() == 'k' # 'K'
674 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
675 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
676 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
677 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
678 assert '\u017f'.upper() == 'S' # 'ſ'
679 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
680 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
681 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
682 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
683 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
684 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
685 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
686
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000687 def test_category(self):
688 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
689
def test_getlower(self):
    # Imported here because only this test calls into _sre directly.
    import _sre
    # getlower(code point, flags) has to map "A" to "a" for each flag value.
    # NOTE(review): _sre.getlower is a private helper -- confirm it exists
    # on the interpreter version this file targets.
    self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
    self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
    self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))

    # The same case folding seen through the public API, str then bytes.
    self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
    self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000698
699 def test_not_literal(self):
700 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
701 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
702
703 def test_search_coverage(self):
704 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
705 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
706
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    """Assert that *matcher*(pattern, text) finds *match* at *span*.

    With neither *match* nor *span* given, the pattern is expected to
    match the whole of *text*.  Giving only one of the two raises
    ValueError.
    """
    if (match is None) != (span is None):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if match is None:
        # Neither was given: expect the pattern to cover the whole text.
        match, span = text, (0, len(text))
    found = matcher(pattern, text)
    self.assertTrue(found)
    self.assertEqual(found.group(), match)
    self.assertEqual(found.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000720
def test_re_escape(self):
    # Characters that re.escape() is expected to return unchanged.
    alnum_chars = string.ascii_letters + string.digits + '_'
    # Every code point from U+0000 to U+00FF, in order.
    p = ''.join(chr(i) for i in range(256))
    for c in p:
        if c in alnum_chars:
            self.assertEqual(re.escape(c), c)
        elif c == '\x00':
            # NUL is expected as the octal escape \000.
            self.assertEqual(re.escape(c), '\\000')
        else:
            # Anything else is expected to gain one leading backslash.
            # NOTE(review): which characters re.escape() escapes may differ
            # between interpreter versions -- confirm against the target.
            self.assertEqual(re.escape(c), '\\' + c)
        # Whatever the escaped form is, it has to match the original.
        self.assertMatch(re.escape(c), c)
    # The escaped form of the full string has to match the full string.
    self.assertMatch(re.escape(p), p)
Guido van Rossum49946571997-07-18 04:26:25 +0000733
def test_re_escape_byte(self):
    # Byte values that re.escape() is expected to return unchanged.
    alnum_chars = (string.ascii_letters + string.digits + '_').encode('ascii')
    # Every byte value from 0 to 255, in order.
    p = bytes(range(256))
    for i in p:
        # Iterating bytes yields ints; rebuild a one-byte bytes object.
        b = bytes([i])
        if b in alnum_chars:
            self.assertEqual(re.escape(b), b)
        elif i == 0:
            # NUL is expected as the octal escape \000.
            self.assertEqual(re.escape(b), b'\\000')
        else:
            # Anything else is expected to gain one leading backslash.
            # NOTE(review): which bytes re.escape() escapes may differ
            # between interpreter versions -- confirm against the target.
            self.assertEqual(re.escape(b), b'\\' + b)
        # Whatever the escaped form is, it has to match the original.
        self.assertMatch(re.escape(b), b)
    # The escaped form of the full byte string has to match it.
    self.assertMatch(re.escape(p), p)
Guido van Rossum698280d2008-09-10 17:44:35 +0000747
def test_re_escape_non_ascii(self):
    # Three U+2620 characters surrounded by ASCII letters.
    s = 'xxx\u2620\u2620\u2620xxx'
    s_escaped = re.escape(s)
    # Each non-ASCII character is expected to gain a leading backslash.
    # NOTE(review): this exact output may be interpreter-version specific.
    self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
    self.assertMatch(s_escaped, s)
    # An escaped character can be embedded in a larger pattern: search for
    # one char, a run of U+2620, one char, expected at span (2, 7).
    self.assertMatch('.%s+.' % re.escape('\u2620'), s,
                     'x\u2620\u2620\u2620x', (2, 7), re.search)
755
def test_re_escape_non_ascii_bytes(self):
    # UTF-8 encoding of a string holding two U+2620 characters.
    b = 'y\u2620y\u2620y'.encode('utf-8')
    b_escaped = re.escape(b)
    # Each of the three bytes of an encoded U+2620 is expected to gain its
    # own leading backslash.
    # NOTE(review): this exact output may be interpreter-version specific.
    self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
    self.assertMatch(b_escaped, b)
    # The escaped encoded character is found twice in the byte string.
    res = re.findall(re.escape('\u2620'.encode('utf-8')), b)
    self.assertEqual(len(res), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000763
Serhiy Storchakab85a9762014-09-15 11:33:19 +0300764 def test_pickling(self):
765 import pickle
766 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
767 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
768 pickled = pickle.dumps(oldpat, proto)
769 newpat = pickle.loads(pickled)
770 self.assertEqual(newpat, oldpat)
771 # current pickle expects the _compile() reconstructor in re module
772 from re import _compile
Guido van Rossum23b22571997-07-17 22:36:14 +0000773
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000774 def test_constants(self):
775 self.assertEqual(re.I, re.IGNORECASE)
776 self.assertEqual(re.L, re.LOCALE)
777 self.assertEqual(re.M, re.MULTILINE)
778 self.assertEqual(re.S, re.DOTALL)
779 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000780
def test_flags(self):
    # Each flag, passed on its own, has to be accepted by re.compile() and
    # give back a truthy pattern object.
    # NOTE(review): re.L combined with a str pattern may be rejected by
    # other interpreter versions -- confirm against the target.
    for flag in [re.I, re.M, re.X, re.S, re.L]:
        self.assertTrue(re.compile('^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +0000784
def test_sre_character_literals(self):
    # For each code point, write it as a numeric escape and check that the
    # pattern matches the character itself.  Each form is tried alone and
    # followed by one more character ("0", "8" or "z").
    for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        if i < 256:
            # Three-digit octal and two-digit hex forms.
            self.assertTrue(re.match(r"\%03o" % i, chr(i)))
            self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
            self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
            self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
            self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
            self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
        if i < 0x10000:
            # Four-digit \u form.
            self.assertTrue(re.match(r"\u%04x" % i, chr(i)))
            self.assertTrue(re.match(r"\u%04x0" % i, chr(i)+"0"))
            self.assertTrue(re.match(r"\u%04xz" % i, chr(i)+"z"))
        # Eight-digit \U form, valid for every value in the list.
        self.assertTrue(re.match(r"\U%08x" % i, chr(i)))
        self.assertTrue(re.match(r"\U%08x0" % i, chr(i)+"0"))
        self.assertTrue(re.match(r"\U%08xz" % i, chr(i)+"z"))
    # Short octal escapes followed by a non-octal or further digit.
    self.assertTrue(re.match(r"\0", "\000"))
    self.assertTrue(re.match(r"\08", "\0008"))
    self.assertTrue(re.match(r"\01", "\001"))
    self.assertTrue(re.match(r"\018", "\0018"))
    # "\567" is expected to match chr(0o167).
    # NOTE(review): acceptance of this out-of-byte-range octal looks
    # interpreter-version specific -- confirm against the target.
    self.assertTrue(re.match(r"\567", chr(0o167)))
    # Malformed or out-of-range escapes have to be rejected at compile time.
    self.assertRaises(re.error, re.match, r"\911", "")
    self.assertRaises(re.error, re.match, r"\x1", "")
    self.assertRaises(re.error, re.match, r"\x1z", "")
    self.assertRaises(re.error, re.match, r"\u123", "")
    self.assertRaises(re.error, re.match, r"\u123z", "")
    self.assertRaises(re.error, re.match, r"\U0001234", "")
    self.assertRaises(re.error, re.match, r"\U0001234z", "")
    self.assertRaises(re.error, re.match, r"\U00110000", "")
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000814
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000815 def test_sre_character_class_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200816 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
817 if i < 256:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300818 self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
819 self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
820 self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
821 self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
822 self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
823 self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
824 self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
825 self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200826 if i < 0x10000:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300827 self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
828 self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
829 self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
830 self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
831 self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
832 self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
833 self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200834 self.assertRaises(re.error, re.match, r"[\911]", "")
835 self.assertRaises(re.error, re.match, r"[\x1z]", "")
836 self.assertRaises(re.error, re.match, r"[\u123z]", "")
837 self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
838 self.assertRaises(re.error, re.match, r"[\U00110000]", "")
839
def test_sre_byte_literals(self):
    # For each byte value, write it as a three-digit octal escape and as a
    # two-digit hex escape, alone and followed by one more character, and
    # check that the bytes pattern matches.
    for i in [0, 8, 16, 32, 64, 127, 128, 255]:
        self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
        self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
        self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
        self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
    # In a bytes pattern "\u" and "\U" are expected to match the plain
    # letters.
    # NOTE(review): other interpreter versions may reject these escapes --
    # confirm against the target.
    self.assertTrue(re.match(br"\u", b'u'))
    self.assertTrue(re.match(br"\U", b'U'))
    # Short octal escapes followed by a non-octal or further digit.
    self.assertTrue(re.match(br"\0", b"\000"))
    self.assertTrue(re.match(br"\08", b"\0008"))
    self.assertTrue(re.match(br"\01", b"\001"))
    self.assertTrue(re.match(br"\018", b"\0018"))
    # "\567" is expected to match the single byte 0o167.
    # NOTE(review): looks interpreter-version specific -- confirm.
    self.assertTrue(re.match(br"\567", bytes([0o167])))
    # Malformed escapes have to be rejected at compile time.
    self.assertRaises(re.error, re.match, br"\911", b"")
    self.assertRaises(re.error, re.match, br"\x1", b"")
    self.assertRaises(re.error, re.match, br"\x1z", b"")
858
def test_sre_byte_class_literals(self):
    # Numeric escapes written inside a character class of a bytes pattern.
    # Each form is filled with the byte value and must match that byte;
    # some forms add one more class member after the escape.
    for i in [0, 8, 16, 32, 64, 127, 128, 255]:
        self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
    # Inside a bytes class "\u" and "\U" are expected to match the plain
    # letters.
    # NOTE(review): other interpreter versions may reject these escapes --
    # confirm against the target.
    self.assertTrue(re.match(br"[\u]", b'u'))
    self.assertTrue(re.match(br"[\U]", b'U'))
    # Malformed escapes have to be rejected at compile time.
    self.assertRaises(re.error, re.match, br"[\911]", b"")
    self.assertRaises(re.error, re.match, br"[\x1z]", b"")
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000873
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000874 def test_bug_113254(self):
875 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
876 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
877 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
878
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000879 def test_bug_527371(self):
880 # bug described in patches 527371/672491
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300881 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000882 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
883 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
884 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
885 self.assertEqual(re.match("((a))", "a").lastindex, 1)
886
887 def test_bug_545855(self):
888 # bug 545855 -- This pattern failed to cause a compile error as it
889 # should, instead provoking a TypeError.
890 self.assertRaises(re.error, re.compile, 'foo[a-')
891
892 def test_bug_418626(self):
893 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
894 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
895 # pattern '*?' on a long string.
896 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
897 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
898 20003)
899 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000900 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +0000901 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000902 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000903
904 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000905 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000906 self.assertEqual(re.compile(pat) and 1, 1)
907
Skip Montanaro1e703c62003-04-25 15:40:28 +0000908 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000909 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +0000910 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000911 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
912 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
913 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +0000914
Serhiy Storchakafa468162013-02-16 21:23:53 +0200915 def test_unlimited_zero_width_repeat(self):
916 # Issue #9669
917 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
918 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
919 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
920 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
921 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
922 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
923
Skip Montanaro1e703c62003-04-25 15:40:28 +0000924 def test_scanner(self):
925 def s_ident(scanner, token): return token
926 def s_operator(scanner, token): return "op%s" % token
927 def s_float(scanner, token): return float(token)
928 def s_int(scanner, token): return int(token)
929
930 scanner = Scanner([
931 (r"[a-zA-Z_]\w*", s_ident),
932 (r"\d+\.\d*", s_float),
933 (r"\d+", s_int),
934 (r"=|\+|-|\*|/", s_operator),
935 (r"\s+", None),
936 ])
937
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300938 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000939
Skip Montanaro1e703c62003-04-25 15:40:28 +0000940 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
941 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
942 'op+', 'bar'], ''))
943
Skip Montanaro5ba00542003-04-25 16:00:14 +0000944 def test_bug_448951(self):
945 # bug 448951 (similar to 429357, but with single char match)
946 # (Also test greedy matches.)
947 for op in '','?','*':
948 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
949 (None, None))
950 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
951 ('a:', 'a'))
952
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000953 def test_bug_725106(self):
954 # capturing groups in alternatives in repeats
955 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
956 ('b', 'a'))
957 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
958 ('c', 'b'))
959 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
960 ('b', None))
961 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
962 ('b', None))
963 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
964 ('b', 'a'))
965 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
966 ('c', 'b'))
967 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
968 ('b', None))
969 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
970 ('b', None))
971
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000972 def test_bug_725149(self):
973 # mark_stack_base restoring before restoring marks
974 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
975 ('a', None))
976 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
977 ('a', None, None))
978
Just van Rossum12723ba2003-07-02 20:03:04 +0000979 def test_bug_764548(self):
980 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000981 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +0000982 pat = re.compile(my_unicode("abc"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300983 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +0000984
Skip Montanaro5ba00542003-04-25 16:00:14 +0000985 def test_finditer(self):
986 iter = re.finditer(r":+", "a:b::c:::d")
987 self.assertEqual([item.group(0) for item in iter],
988 [":", "::", ":::"])
989
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600990 pat = re.compile(r":+")
991 iter = pat.finditer("a:b::c:::d", 1, 10)
992 self.assertEqual([item.group(0) for item in iter],
993 [":", "::", ":::"])
994
995 pat = re.compile(r":+")
996 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
997 self.assertEqual([item.group(0) for item in iter],
998 [":", "::", ":::"])
999
1000 pat = re.compile(r":+")
1001 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1002 self.assertEqual([item.group(0) for item in iter],
1003 [":", "::", ":::"])
1004
1005 pat = re.compile(r":+")
1006 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1007 self.assertEqual([item.group(0) for item in iter],
1008 ["::", "::"])
1009
Thomas Wouters40a088d2008-03-18 20:19:54 +00001010 def test_bug_926075(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001011 self.assertIsNot(re.compile('bug_926075'),
1012 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +00001013
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001014 def test_bug_931848(self):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001015 pattern = "[\u002E\u3002\uFF0E\uFF61]"
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001016 self.assertEqual(re.compile(pattern).split("a.b.c"),
1017 ['a','b','c'])
1018
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001019 def test_bug_581080(self):
1020 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +00001021 self.assertEqual(next(iter).span(), (1,2))
1022 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001023
1024 scanner = re.compile(r"\s").scanner("a b")
1025 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001026 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001027
1028 def test_bug_817234(self):
1029 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +00001030 self.assertEqual(next(iter).span(), (0, 4))
1031 self.assertEqual(next(iter).span(), (4, 4))
1032 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001033
Mark Dickinson1f268282009-07-28 17:22:36 +00001034 def test_bug_6561(self):
1035 # '\d' should match characters in Unicode category 'Nd'
1036 # (Number, Decimal Digit), but not those in 'Nl' (Number,
1037 # Letter) or 'No' (Number, Other).
1038 decimal_digits = [
1039 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1040 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1041 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1042 ]
1043 for x in decimal_digits:
1044 self.assertEqual(re.match('^\d$', x).group(0), x)
1045
1046 not_decimal_digits = [
1047 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1048 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1049 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1050 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1051 ]
1052 for x in not_decimal_digits:
1053 self.assertIsNone(re.match('^\d$', x))
1054
Guido van Rossumd8faa362007-04-27 19:54:29 +00001055 def test_empty_array(self):
1056 # SF buf 1647541
1057 import array
Guido van Rossum166746c2007-07-03 15:39:16 +00001058 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +00001059 a = array.array(typecode)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001060 self.assertIsNone(re.compile(b"bla").match(a))
Antoine Pitroufd036452008-08-19 17:56:33 +00001061 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001062
Christian Heimes072c0f12008-01-03 23:01:04 +00001063 def test_inline_flags(self):
1064 # Bug #1700
Christian Heimes2e1d0f02008-01-04 00:47:51 +00001065 upper_char = chr(0x1ea0) # Latin Capital Letter A with Dot Bellow
1066 lower_char = chr(0x1ea1) # Latin Small Letter A with Dot Bellow
Christian Heimes072c0f12008-01-03 23:01:04 +00001067
1068 p = re.compile(upper_char, re.I | re.U)
1069 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001070 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001071
1072 p = re.compile(lower_char, re.I | re.U)
1073 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001074 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001075
1076 p = re.compile('(?i)' + upper_char, re.U)
1077 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001078 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001079
1080 p = re.compile('(?i)' + lower_char, re.U)
1081 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001082 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001083
1084 p = re.compile('(?iu)' + upper_char)
1085 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001086 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001087
1088 p = re.compile('(?iu)' + lower_char)
1089 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001090 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001091
Christian Heimes25bb7832008-01-11 16:17:00 +00001092 def test_dollar_matches_twice(self):
1093 "$ matches the end of string, and just before the terminating \n"
1094 pattern = re.compile('$')
1095 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1096 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1097 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1098
1099 pattern = re.compile('$', re.MULTILINE)
1100 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1101 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1102 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1103
Antoine Pitroufd036452008-08-19 17:56:33 +00001104 def test_bytes_str_mixing(self):
1105 # Mixing str and bytes is disallowed
1106 pat = re.compile('.')
1107 bpat = re.compile(b'.')
1108 self.assertRaises(TypeError, pat.match, b'b')
1109 self.assertRaises(TypeError, bpat.match, 'b')
1110 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1111 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1112 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1113 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1114 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1115 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1116
1117 def test_ascii_and_unicode_flag(self):
1118 # String patterns
1119 for flags in (0, re.UNICODE):
1120 pat = re.compile('\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001121 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001122 pat = re.compile('\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001123 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001124 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001125 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001126 pat = re.compile('(?a)\xc0', re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001127 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001128 pat = re.compile('\w', re.ASCII)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001129 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001130 pat = re.compile('(?a)\w')
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001131 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001132 # Bytes patterns
1133 for flags in (0, re.ASCII):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001134 pat = re.compile(b'\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001135 self.assertIsNone(pat.match(b'\xe0'))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001136 pat = re.compile(b'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001137 self.assertIsNone(pat.match(b'\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001138 # Incompatibilities
1139 self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
1140 self.assertRaises(ValueError, re.compile, b'(?u)\w')
1141 self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
1142 self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
1143 self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
1144 self.assertRaises(ValueError, re.compile, '(?au)\w')
1145
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001146 def test_bug_6509(self):
1147 # Replacement strings of both types must parse properly.
1148 # all strings
1149 pat = re.compile('a(\w)')
1150 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1151 pat = re.compile('a(.)')
1152 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1153 pat = re.compile('..')
1154 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1155
1156 # all bytes
1157 pat = re.compile(b'a(\w)')
1158 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1159 pat = re.compile(b'a(.)')
1160 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1161 pat = re.compile(b'..')
1162 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1163
def test_dealloc(self):
    # issue 3299: check for segfault in debug build
    import _sre
    # the overflow limit is different on wide and narrow builds and it
    # depends on the definition of SRE_CODE (see sre.h).
    # 2**128 should be big enough to overflow on both. For smaller values
    # a RuntimeError is raised instead of OverflowError.
    long_overflow = 2**128
    # A non-string subject has to be rejected with TypeError.
    self.assertRaises(TypeError, re.finditer, "a", {})
    # NOTE(review): the two calls below use a three-argument form of the
    # private _sre.compile(); confirm it matches the targeted interpreter.
    self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
    self.assertRaises(TypeError, _sre.compile, {}, 0, [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176 def test_search_dot_unicode(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001177 self.assertTrue(re.search("123.*-", '123abc-'))
1178 self.assertTrue(re.search("123.*-", '123\xe9-'))
1179 self.assertTrue(re.search("123.*-", '123\u20ac-'))
1180 self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1181 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182
Ezio Melottidf723e12012-03-13 01:29:48 +02001183 def test_compile(self):
1184 # Test return value when given string and pattern as parameter
1185 pattern = re.compile('random pattern')
1186 self.assertIsInstance(pattern, re._pattern_type)
1187 same_pattern = re.compile(pattern)
1188 self.assertIsInstance(same_pattern, re._pattern_type)
1189 self.assertIs(same_pattern, pattern)
1190 # Test behaviour when not given a string or pattern as parameter
1191 self.assertRaises(TypeError, re.compile, 0)
1192
Ezio Melottife8e6e72013-01-11 08:32:01 +02001193 def test_bug_13899(self):
1194 # Issue #13899: re pattern r"[\A]" should work like "A" but matches
1195 # nothing. Ditto B and Z.
1196 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
1197 ['A', 'B', '\b', 'C', 'Z'])
1198
@bigmemtest(size=_2G, memuse=1)
def test_large_search(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    subject = 'a' * size
    found = re.search('$', subject)
    self.assertIsNotNone(found)
    # "$" can only match at the very end, so both offsets equal size.
    self.assertEqual(found.start(), size)
    self.assertEqual(found.end(), size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001207
# memuse is this large because re.sub() collects the pieces in a list
# and then join()s them to build the replacement result.
@bigmemtest(size=_2G, memuse=16 + 2)
def test_large_subn(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    subject = 'a' * size
    replaced, count = re.subn('', '', subject)
    self.assertEqual(replaced, subject)
    # The empty pattern matches before every character and at the end.
    self.assertEqual(count, size + 1)
1217
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001218 def test_bug_16688(self):
1219 # Issue 16688: Backreferences make case-insensitive regex fail on
1220 # non-ASCII strings.
1221 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1222 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001223
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001224 def test_repeat_minmax_overflow(self):
1225 # Issue #13169
1226 string = "x" * 100000
1227 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1228 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1229 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1230 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1231 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1232 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1233 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1234 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1235 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1236 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1237 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1238
@cpython_only
def test_repeat_minmax_overflow_maxrepeat(self):
    try:
        from _sre import MAXREPEAT
    except ImportError:
        self.skipTest('requires _sre.MAXREPEAT constant')
    subject = "x" * 100000
    # One below MAXREPEAT still compiles; the subject is simply too
    # short for the patterns that demand that many repetitions.
    below_limit = MAXREPEAT - 1
    self.assertIsNone(re.match(r".{%d}" % below_limit, subject))
    at_most = re.match(r".{,%d}" % below_limit, subject)
    self.assertEqual(at_most.span(), (0, 100000))
    self.assertIsNone(re.match(r".{%d,}?" % below_limit, subject))
    # MAXREPEAT itself is rejected at compile time.
    for template in (r".{%d}", r".{,%d}", r".{%d,}?"):
        self.assertRaises(OverflowError, re.compile, template % MAXREPEAT)
1253
R David Murray26dfaac92013-04-14 13:00:54 -04001254 def test_backref_group_name_in_exception(self):
1255 # Issue 17341: Poor error message when compiling invalid regex
1256 with self.assertRaisesRegex(sre_constants.error, '<foo>'):
1257 re.compile('(?P=<foo>)')
1258
1259 def test_group_name_in_exception(self):
1260 # Issue 17341: Poor error message when compiling invalid regex
1261 with self.assertRaisesRegex(sre_constants.error, '\?foo'):
1262 re.compile('(?P<?foo>)')
1263
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001264 def test_issue17998(self):
1265 for reps in '*', '+', '?', '{1}':
1266 for mod in '', '?':
1267 pattern = '.' + reps + mod + 'yz'
1268 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1269 ['xyz'], msg=pattern)
1270 pattern = pattern.encode()
1271 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1272 [b'xyz'], msg=pattern)
1273
def test_match_repr(self):
    # repr() of a match object shows the span and the matched text,
    # prefixed with the module and qualified name of the match type.

    # str subjects, including a str subclass.
    for subject in '[abracadabra]', S('[abracadabra]'):
        m = re.search(r'(.+)(.*?)\1', subject)
        self.assertEqual(repr(m), "<%s.%s object; "
                                  "span=(1, 12), match='abracadabra'>" %
                         (type(m).__module__, type(m).__qualname__))

    # bytes-like subjects, including a bytes subclass.
    for subject in (b'[abracadabra]', B(b'[abracadabra]'),
                    bytearray(b'[abracadabra]'),
                    memoryview(b'[abracadabra]')):
        m = re.search(rb'(.+)(.*?)\1', subject)
        self.assertEqual(repr(m), "<%s.%s object; "
                                  "span=(1, 12), match=b'abracadabra'>" %
                         (type(m).__module__, type(m).__qualname__))

    # Two matches from one finditer() call; each expected repr is
    # built from the type of the very match it is compared against.
    first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
    self.assertEqual(repr(first), "<%s.%s object; "
                                  "span=(0, 2), match='aa'>" %
                     (type(first).__module__, type(first).__qualname__))
    self.assertEqual(repr(second), "<%s.%s object; "
                                   "span=(3, 5), match='bb'>" %
                     (type(second).__module__, type(second).__qualname__))
1295
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001296
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001297 def test_bug_2537(self):
1298 # issue 2537: empty submatches
1299 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1300 for inner_op in ('{0,}', '*', '?'):
1301 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1302 m = r.match("xyyzy")
1303 self.assertEqual(m.group(0), "xyy")
1304 self.assertEqual(m.group(1), "")
1305 self.assertEqual(m.group(2), "y")
1306
def test_debug_flag(self):
    pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
    # Expected parse-tree dump, one line per node.
    expected_dump = (
        'subpattern 1\n'
        '  literal 46\n'
        'subpattern None\n'
        '  branch\n'
        '    in\n'
        '      literal 99\n'
        '      literal 104\n'
        '  or\n'
        '    literal 112\n'
        '    literal 121\n'
        'subpattern None\n'
        '  groupref_exists 1\n'
        '    at at_end\n'
        '  else\n'
        '    literal 58\n'
        '    literal 32\n'
    )
    # Compile twice: debug output is output again even a second time
    # (bypassing the cache -- issue #20426).
    for _ in range(2):
        with captured_stdout() as out:
            re.compile(pat, re.DEBUG)
        self.assertEqual(out.getvalue(), expected_dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001335
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001336 def test_keyword_parameters(self):
1337 # Issue #20283: Accepting the string keyword parameter.
1338 pat = re.compile(r'(ab)')
1339 self.assertEqual(
1340 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1341 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001342 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1343 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001344 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1345 self.assertEqual(
1346 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1347 self.assertEqual(
1348 pat.split(string='abracadabra', maxsplit=1),
1349 ['', 'ab', 'racadabra'])
1350 self.assertEqual(
1351 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1352 (7, 9))
1353
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001354 def test_bug_20998(self):
1355 # Issue #20998: Fullmatch of repeated single character pattern
1356 # with ignore case.
1357 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1358
def test_locale_caching(self):
    # Issue #22410
    saved_locale = locale.setlocale(locale.LC_CTYPE)
    self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_locale)
    # Both locales are required; skip as soon as one cannot be set.
    for name in ('en_US.iso88591', 'en_US.utf8'):
        try:
            locale.setlocale(locale.LC_CTYPE, name)
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs %s locale' % name)

    # Run the per-locale checks in both orders, starting each round
    # from an empty pattern cache.
    orderings = (
        (self.check_en_US_iso88591, self.check_en_US_utf8),
        (self.check_en_US_utf8, self.check_en_US_iso88591),
    )
    for first_check, second_check in orderings:
        re.purge()
        first_check()
        second_check()
1376
def check_en_US_iso88591(self):
    # Under this locale the bytes 0xC5 and 0xE5 must match each other
    # case-insensitively, in both directions.
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    cases = (
        (b'\xc5\xe5', b'\xc5\xe5'),
        (b'\xc5', b'\xe5'),
        (b'\xe5', b'\xc5'),
    )
    # Flags passed as an argument ...
    for pattern, subject in cases:
        self.assertTrue(re.match(pattern, subject, re.L | re.I))
    # ... and the same flags given inline.
    for pattern, subject in cases:
        self.assertTrue(re.match(b'(?Li)' + pattern, subject))
1385
def check_en_US_utf8(self):
    # Under this locale a byte string still matches itself, but the
    # single bytes 0xC5 and 0xE5 must not match each other.
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    identical = b'\xc5\xe5'
    mismatched = ((b'\xc5', b'\xe5'), (b'\xe5', b'\xc5'))

    # Flags passed as an argument.
    self.assertTrue(re.match(identical, identical, re.L | re.I))
    for pattern, subject in mismatched:
        self.assertIsNone(re.match(pattern, subject, re.L | re.I))

    # The same flags given inline.
    self.assertTrue(re.match(b'(?Li)' + identical, identical))
    for pattern, subject in mismatched:
        self.assertIsNone(re.match(b'(?Li)' + pattern, subject))
1394
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001395
class PatternReprTests(unittest.TestCase):
    """Checks of repr() for compiled pattern objects."""

    def check(self, pattern, expected):
        # Compile without explicit flags and compare the repr.
        compiled = re.compile(pattern)
        self.assertEqual(repr(compiled), expected)

    def check_flags(self, pattern, flags, expected):
        # Compile with the given flags and compare the repr.
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        expected = "re.compile('random pattern')"
        self.check('random pattern', expected)

    def test_single_flag(self):
        expected = "re.compile('random pattern', re.IGNORECASE)"
        self.check_flags('random pattern', re.IGNORECASE, expected)

    def test_multiple_flags(self):
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.check_flags('random pattern', re.I | re.S | re.X, expected)

    def test_unicode_flag(self):
        # re.U is not shown in the repr of a str pattern.
        self.check_flags('random pattern', re.U,
                         "re.compile('random pattern')")
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL)")
        self.check_flags('random pattern', re.I | re.S | re.U, expected)

    def test_inline_flags(self):
        expected = "re.compile('(?i)pattern', re.IGNORECASE)"
        self.check('(?i)pattern', expected)

    def test_unknown_flags(self):
        # Bits without a name are shown in hexadecimal, after any
        # named flags.
        unknown = 0x123000
        self.check_flags('random pattern', unknown,
                         "re.compile('random pattern', 0x123000)")
        self.check_flags(
            'random pattern', unknown | re.I,
            "re.compile('random pattern', re.IGNORECASE|0x123000)")

    def test_bytes(self):
        self.check(b'bytes pattern',
                   "re.compile(b'bytes pattern')")
        self.check_flags(b'bytes pattern', re.A,
                         "re.compile(b'bytes pattern', re.ASCII)")

    def test_quotes(self):
        # The quoting of the pattern follows repr() of the pattern.
        self.check('random "double quoted" pattern',
                   '''re.compile('random "double quoted" pattern')''')
        self.check("random 'single quoted' pattern",
                   '''re.compile("random 'single quoted' pattern")''')
        self.check('''both 'single' and "double" quotes''',
                   '''re.compile('both \\'single\\' and "double" quotes')''')

    def test_long_pattern(self):
        # A very long pattern is shortened in the repr, while the
        # start of the pattern and the flags stay visible.
        pattern = 'Very %spattern' % ('long ' * 1000)
        head = "re.compile('Very long long lon"

        plain = repr(re.compile(pattern))
        self.assertLess(len(plain), 300)
        self.assertEqual(plain[:30], head)

        flagged = repr(re.compile(pattern, re.I))
        self.assertLess(len(flagged), 300)
        self.assertEqual(flagged[:30], head)
        self.assertEqual(flagged[-16:], ", re.IGNORECASE)")
1456
1457
class ImplementationTest(unittest.TestCase):
    """
    Test implementation details of the re module.
    """

    def test_overlap_table(self):
        # Each prefix is paired with the overlap table expected for it.
        generate = sre_compile._generate_overlap_table
        cases = (
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        )
        for prefix, table in cases:
            self.assertEqual(generate(prefix), table)
1471
1472
class ExternalTests(unittest.TestCase):
    """Run the pattern tables kept in test.re_tests."""

    @staticmethod
    def _group_text(match, key):
        # Text of one group for the eval() namespace: "Error" when the
        # group does not exist, "None" when it did not take part in the
        # match (a plain None would break string concatenation in the
        # expressions being evaluated).
        try:
            value = match.group(key)
        except IndexError:
            return "Error"
        return "None" if value is None else value

    def test_re_benchmarks(self):
        're_tests benchmarks'
        from test.re_tests import benchmarks
        padding = ' ' * 10000
        for pattern, s in benchmarks:
            with self.subTest(pattern=pattern, string=s):
                compiled = re.compile(pattern)
                self.assertTrue(compiled.search(s))
                self.assertTrue(compiled.match(s))
                self.assertTrue(compiled.fullmatch(s))
                # Same string, surrounded by 10000 spaces on each side.
                padded = padding + s + padding
                stop = 10000 + len(s)
                self.assertTrue(compiled.search(padded))
                self.assertTrue(compiled.match(padded, 10000))
                self.assertTrue(compiled.match(padded, 10000, stop))
                self.assertTrue(compiled.fullmatch(padded, 10000, stop))

    def test_re_tests(self):
        're_tests test suite'
        from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
        for t in tests:
            if len(t) == 5:
                pattern, s, outcome, repl, expected = t
            elif len(t) == 3:
                pattern, s, outcome = t
                repl = expected = None
            else:
                raise ValueError('Test tuples should have 3 or 5 fields', t)

            with self.subTest(pattern=pattern, string=s):
                if outcome == SYNTAX_ERROR:  # Expected a syntax error
                    with self.assertRaises(re.error):
                        re.compile(pattern)
                    continue

                result = re.compile(pattern).search(s)
                if outcome == FAIL:
                    self.assertIsNone(result, 'Succeeded incorrectly')
                    continue

                with self.subTest():
                    self.assertTrue(result, 'Failed incorrectly')
                    # Matched, as expected, so now we compute the
                    # result string and compare it to our expected result.
                    start, end = result.span(0)
                    namespace = {'found': result.group(0),
                                 'groups': result.group(),
                                 'flags': result.re.flags}
                    for number in range(1, 100):
                        namespace['g%d' % number] = self._group_text(
                            result, number)
                    for group_name in result.re.groupindex:
                        namespace[group_name] = self._group_text(
                            result, group_name)
                    # repl is an expression from the test table; it is
                    # evaluated against the values gathered above.
                    self.assertEqual(eval(repl, namespace), expected,
                                     'grouping error')

                # Try the match with both pattern and string converted to
                # bytes, and check that it still succeeds.
                try:
                    bytes_pattern = pattern.encode("ascii")
                    bytes_subject = s.encode("ascii")
                except UnicodeEncodeError:
                    # skip non-ascii tests
                    pass
                else:
                    with self.subTest('bytes pattern match'):
                        compiled = re.compile(bytes_pattern)
                        self.assertTrue(compiled.search(bytes_subject))

                # Try the match with the search area limited to the extent
                # of the match and see if it still succeeds.  \B will
                # break (because it won't match at the end or start of a
                # string), so we'll ignore patterns that feature it.
                uses_edge_B = (pattern[:2] == r'\B'
                               or pattern[-2:] == r'\B')
                if not uses_edge_B and result is not None:
                    with self.subTest('range-limited match'):
                        compiled = re.compile(pattern)
                        self.assertTrue(compiled.search(s, start, end + 1))

                # Try the match with IGNORECASE enabled, and check that it
                # still succeeds.
                with self.subTest('case-insensitive match'):
                    compiled = re.compile(pattern, re.IGNORECASE)
                    self.assertTrue(compiled.search(s))

                # Try the match with LOCALE enabled, and check that it
                # still succeeds.
                if '(?u)' not in pattern:
                    with self.subTest('locale-sensitive match'):
                        compiled = re.compile(pattern, re.LOCALE)
                        self.assertTrue(compiled.search(s))

                # Try the match with UNICODE locale enabled, and check
                # that it still succeeds.
                with self.subTest('unicode-sensitive match'):
                    compiled = re.compile(pattern, re.UNICODE)
                    self.assertTrue(compiled.search(s))
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001583
Gregory P. Smith5a631832010-07-27 05:31:29 +00001584
# Run every test case defined in this module when it is executed as a
# script; importing the module does not run anything.
if __name__ == "__main__":
    unittest.main()