blob: 0834fe0f403e91a8b6efbb8ee211f3053bb4eedb [file] [log] [blame]
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
Antoine Pitroud2cc7432014-02-03 20:59:59 +01002 cpython_only, captured_stdout
Benjamin Petersone48944b2012-03-07 14:50:25 -06003import io
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02004import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00005import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00006from re import Scanner
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02007import sre_compile
R David Murray26dfaac92013-04-14 13:00:54 -04008import sre_constants
Ezio Melottid2114eb2011-03-25 14:08:44 +02009import sys
10import string
11import traceback
Antoine Pitrou79aa68d2013-10-25 21:36:10 +020012import unittest
Raymond Hettinger027bb632004-05-31 03:09:25 +000013from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000014
# Misc tests from Tim Peters' re.doc

# WARNING: Don't change details in these tests if you don't know
# what you're doing. Some of these tests were carefully modeled to
# cover most of the code.
20
class S(str):
    """str subclass whose item/slice access wraps the result in S again."""

    def __getitem__(self, index):
        piece = super().__getitem__(index)
        return S(piece)
24
class B(bytes):
    """bytes subclass whose item/slice access passes the result through B()."""

    def __getitem__(self, index):
        # Whatever bytes.__getitem__ returns is handed to the B constructor.
        piece = super().__getitem__(index)
        return B(piece)
28
Skip Montanaro8ed06da2003-04-24 19:43:18 +000029class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000030
Serhiy Storchaka25324972013-10-16 12:46:28 +030031 def assertTypedEqual(self, actual, expect, msg=None):
32 self.assertEqual(actual, expect, msg)
33 def recurse(actual, expect):
34 if isinstance(expect, (tuple, list)):
35 for x, y in zip(actual, expect):
36 recurse(x, y)
37 else:
38 self.assertIs(type(actual), type(expect), msg)
39 recurse(actual, expect)
40
Serhiy Storchaka632a77e2015-03-25 21:03:47 +020041 def checkPatternError(self, pattern, errmsg, pos=None):
42 with self.assertRaises(re.error) as cm:
43 re.compile(pattern)
44 with self.subTest(pattern=pattern):
45 err = cm.exception
46 self.assertEqual(err.msg, errmsg)
47 if pos is not None:
48 self.assertEqual(err.pos, pos)
49
50 def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
51 with self.assertRaises(re.error) as cm:
52 re.sub(pattern, repl, string)
53 with self.subTest(pattern=pattern, repl=repl):
54 err = cm.exception
55 self.assertEqual(err.msg, errmsg)
56 if pos is not None:
57 self.assertEqual(err.pos, pos)
58
Benjamin Petersone48944b2012-03-07 14:50:25 -060059 def test_keep_buffer(self):
60 # See bug 14212
61 b = bytearray(b'x')
62 it = re.finditer(b'a', b)
63 with self.assertRaises(BufferError):
64 b.extend(b'x'*400)
65 list(it)
66 del it
67 gc_collect()
68 b.extend(b'x'*400)
69
Raymond Hettinger027bb632004-05-31 03:09:25 +000070 def test_weakref(self):
71 s = 'QabbbcR'
72 x = re.compile('ab+c')
73 y = proxy(x)
74 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
75
Skip Montanaro8ed06da2003-04-24 19:43:18 +000076 def test_search_star_plus(self):
77 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
78 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
79 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
80 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030081 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000082 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
83 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
84 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
85 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030086 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000087
Skip Montanaro8ed06da2003-04-24 19:43:18 +000088 def bump_num(self, matchobj):
Guido van Rossum41360a41998-03-26 19:42:58 +000089 int_value = int(matchobj.group(0))
90 return str(int_value + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000091
Skip Montanaro8ed06da2003-04-24 19:43:18 +000092 def test_basic_re_sub(self):
Serhiy Storchaka25324972013-10-16 12:46:28 +030093 self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
94 self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
95 self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
96 self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
97 self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
98 self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
Serhiy Storchaka9eabac62013-10-26 10:45:48 +030099 for y in ("\xe0", "\u0430", "\U0001d49c"):
100 self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')
Serhiy Storchaka25324972013-10-16 12:46:28 +0300101
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000102 self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
103 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
104 '9.3 -3 24x100y')
Serhiy Storchakab02f8fc2016-09-25 20:36:23 +0300105 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
106 '9.3 -3 23x99y')
Victor Stinner55e614a2014-10-29 16:58:59 +0100107 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000108 '9.3 -3 23x99y')
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000109
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000110 self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
111 self.assertEqual(re.sub('.', r"\n", 'x'), '\n')
Guido van Rossumdfa67901997-12-08 17:12:06 +0000112
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000113 s = r"\1\1"
114 self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
115 self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
116 self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)
Guido van Rossum23b22571997-07-17 22:36:14 +0000117
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000118 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
119 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
120 self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
121 self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')
Guido van Rossum49946571997-07-18 04:26:25 +0000122
Serhiy Storchakaa54aae02015-03-24 22:58:14 +0200123 self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
124 self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
125 self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
126 (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
127 for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
128 with self.subTest(c):
129 with self.assertWarns(DeprecationWarning):
130 self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)
Guido van Rossum95e80531997-08-13 22:34:14 +0000131
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000132 self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000133
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000134 def test_bug_449964(self):
135 # fails for group followed by other escape
136 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
137 'xx\bxx\b')
138
139 def test_bug_449000(self):
140 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000141 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
142 'abc\ndef\n')
143 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
144 'abc\ndef\n')
145 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
146 'abc\ndef\n')
147 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
148 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000149
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000150 def test_bug_1661(self):
151 # Verify that flags do not get silently ignored with compiled patterns
152 pattern = re.compile('.')
153 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
154 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
155 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
156 self.assertRaises(ValueError, re.compile, pattern, re.I)
157
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000158 def test_bug_3629(self):
159 # A regex that triggered a bug in the sre-code validator
160 re.compile("(?P<quote>)(?(quote))")
161
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000162 def test_sub_template_numeric_escape(self):
163 # bug 776311 and friends
164 self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
165 self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
166 self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
167 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
168 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
169 self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
170 self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200171 self.assertEqual(re.sub('x', r'\377', 'x'), '\377')
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000172
173 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
174 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
175
176 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
177 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
178 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
179 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
180 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
181
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200182 self.checkTemplateError('x', r'\400', 'x',
183 r'octal escape value \400 outside of '
184 r'range 0-0o377', 0)
185 self.checkTemplateError('x', r'\777', 'x',
186 r'octal escape value \777 outside of '
187 r'range 0-0o377', 0)
Tim Peters0e9980f2004-09-12 03:49:31 +0000188
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200189 self.checkTemplateError('x', r'\1', 'x', 'invalid group reference')
190 self.checkTemplateError('x', r'\8', 'x', 'invalid group reference')
191 self.checkTemplateError('x', r'\9', 'x', 'invalid group reference')
192 self.checkTemplateError('x', r'\11', 'x', 'invalid group reference')
193 self.checkTemplateError('x', r'\18', 'x', 'invalid group reference')
194 self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference')
195 self.checkTemplateError('x', r'\90', 'x', 'invalid group reference')
196 self.checkTemplateError('x', r'\99', 'x', 'invalid group reference')
197 self.checkTemplateError('x', r'\118', 'x', 'invalid group reference') # r'\11' + '8'
198 self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference')
199 self.checkTemplateError('x', r'\181', 'x', 'invalid group reference') # r'\18' + '1'
200 self.checkTemplateError('x', r'\800', 'x', 'invalid group reference') # r'\80' + '0'
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000201
202 # in python2.3 (etc), these loop endlessly in sre_parser.py
203 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
204 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
205 'xz8')
206 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
207 'xza')
208
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000209 def test_qualified_re_sub(self):
210 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
Serhiy Storchakab02f8fc2016-09-25 20:36:23 +0300211 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Victor Stinner55e614a2014-10-29 16:58:59 +0100212 self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000213
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000214 def test_bug_114660(self):
215 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
216 'hello there')
217
218 def test_bug_462270(self):
219 # Test for empty sub() behaviour, see SF bug #462270
220 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
221 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
222
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200223 def test_symbolic_groups(self):
224 re.compile('(?P<a>x)(?P=a)(?(a)y)')
225 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300226 re.compile('(?P<a1>x)\1(?(1)y)')
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200227 self.checkPatternError('(?P<a>)(?P<a>)',
228 "redefinition of group name 'a' as group 2; "
229 "was group 1")
Serhiy Storchaka485407c2015-07-18 23:27:00 +0300230 self.checkPatternError('(?P<a>(?P=a))',
231 "cannot refer to an open group", 10)
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200232 self.checkPatternError('(?Pxy)', 'unknown extension ?Px')
233 self.checkPatternError('(?P<a>)(?P=a', 'missing ), unterminated name', 11)
234 self.checkPatternError('(?P=', 'missing group name', 4)
235 self.checkPatternError('(?P=)', 'missing group name', 4)
236 self.checkPatternError('(?P=1)', "bad character in group name '1'", 4)
237 self.checkPatternError('(?P=a)', "unknown group name 'a'")
238 self.checkPatternError('(?P=a1)', "unknown group name 'a1'")
239 self.checkPatternError('(?P=a.)', "bad character in group name 'a.'", 4)
240 self.checkPatternError('(?P<)', 'missing >, unterminated name', 4)
241 self.checkPatternError('(?P<a', 'missing >, unterminated name', 4)
242 self.checkPatternError('(?P<', 'missing group name', 4)
243 self.checkPatternError('(?P<>)', 'missing group name', 4)
244 self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
245 self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
246 self.checkPatternError(r'(?(', 'missing group name', 3)
247 self.checkPatternError(r'(?())', 'missing group name', 3)
248 self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
249 self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
250 self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
251 self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
Georg Brandl1d472b72013-04-14 11:40:00 +0200252 # New valid/invalid identifiers in Python 3
253 re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
254 re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200255 self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300256 # Support > 100 groups.
257 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
258 pat = '(?:%s)(?(200)z|t)' % pat
259 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200260
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000261 def test_symbolic_refs(self):
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200262 self.checkTemplateError('(?P<a>x)', '\g<a', 'xx',
263 'missing >, unterminated name', 3)
264 self.checkTemplateError('(?P<a>x)', '\g<', 'xx',
265 'missing group name', 3)
266 self.checkTemplateError('(?P<a>x)', '\g', 'xx', 'missing <', 2)
267 self.checkTemplateError('(?P<a>x)', '\g<a a>', 'xx',
268 "bad character in group name 'a a'", 3)
269 self.checkTemplateError('(?P<a>x)', '\g<>', 'xx',
270 'missing group name', 3)
271 self.checkTemplateError('(?P<a>x)', '\g<1a1>', 'xx',
272 "bad character in group name '1a1'", 3)
273 self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
274 'invalid group reference')
275 self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
276 'invalid group reference')
277 with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
278 re.sub('(?P<a>x)', '\g<ab>', 'xx')
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300279 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
280 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200281 self.checkTemplateError('(?P<a>x)', '\g<-1>', 'xx',
282 "bad character in group name '-1'", 3)
Georg Brandl1d472b72013-04-14 11:40:00 +0200283 # New valid/invalid identifiers in Python 3
284 self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
285 self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200286 self.checkTemplateError('(?P<a>x)', '\g<©>', 'xx',
287 "bad character in group name '©'", 3)
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300288 # Support > 100 groups.
289 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
290 self.assertEqual(re.sub(pat, '\g<200>', 'xc8yzxc8y'), 'c8zc8')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000291
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000292 def test_re_subn(self):
293 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
294 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
295 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
296 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
Serhiy Storchakab02f8fc2016-09-25 20:36:23 +0300297 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Victor Stinner55e614a2014-10-29 16:58:59 +0100298 self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000299
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000300 def test_re_split(self):
Serhiy Storchaka25324972013-10-16 12:46:28 +0300301 for string in ":a:b::c", S(":a:b::c"):
302 self.assertTypedEqual(re.split(":", string),
303 ['', 'a', 'b', '', 'c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200304 self.assertTypedEqual(re.split(":+", string),
Serhiy Storchaka25324972013-10-16 12:46:28 +0300305 ['', 'a', 'b', 'c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200306 self.assertTypedEqual(re.split("(:+)", string),
Serhiy Storchaka25324972013-10-16 12:46:28 +0300307 ['', ':', 'a', ':', 'b', '::', 'c'])
308 for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
309 memoryview(b":a:b::c")):
310 self.assertTypedEqual(re.split(b":", string),
311 [b'', b'a', b'b', b'', b'c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200312 self.assertTypedEqual(re.split(b":+", string),
Serhiy Storchaka25324972013-10-16 12:46:28 +0300313 [b'', b'a', b'b', b'c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200314 self.assertTypedEqual(re.split(b"(:+)", string),
Serhiy Storchaka25324972013-10-16 12:46:28 +0300315 [b'', b':', b'a', b':', b'b', b'::', b'c'])
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300316 for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
317 "\U0001d49c\U0001d49e\U0001d4b5"):
318 string = ":%s:%s::%s" % (a, b, c)
319 self.assertEqual(re.split(":", string), ['', a, b, '', c])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200320 self.assertEqual(re.split(":+", string), ['', a, b, c])
321 self.assertEqual(re.split("(:+)", string),
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300322 ['', ':', a, ':', b, '::', c])
Serhiy Storchaka25324972013-10-16 12:46:28 +0300323
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200324 self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
325 self.assertEqual(re.split("(:)+", ":a:b::c"),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000326 ['', ':', 'a', ':', 'b', ':', 'c'])
327 self.assertEqual(re.split("([b:]+)", ":a:b::c"),
328 ['', ':', 'a', ':b::', 'c'])
329 self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
330 ['', None, ':', 'a', None, ':', '', 'b', None, '',
331 None, '::', 'c'])
332 self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
333 ['', 'a', '', '', 'c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000334
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200335 for sep, expected in [
336 (':*', ['', 'a', 'b', 'c']),
337 ('(?::*)', ['', 'a', 'b', 'c']),
338 ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
339 ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
340 ]:
341 with self.subTest(sep=sep), self.assertWarns(FutureWarning):
342 self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
343
344 for sep, expected in [
345 ('', [':a:b::c']),
346 (r'\b', [':a:b::c']),
347 (r'(?=:)', [':a:b::c']),
348 (r'(?<=:)', [':a:b::c']),
349 ]:
350 with self.subTest(sep=sep), self.assertRaises(ValueError):
351 self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
352
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000353 def test_qualified_re_split(self):
Serhiy Storchakab02f8fc2016-09-25 20:36:23 +0300354 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
Victor Stinner55e614a2014-10-29 16:58:59 +0100355 self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
356 self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
357 self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000358 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200359 self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000360 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200361 with self.assertWarns(FutureWarning):
362 self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
363 ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000364
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000365 def test_re_findall(self):
366 self.assertEqual(re.findall(":+", "abc"), [])
Serhiy Storchaka25324972013-10-16 12:46:28 +0300367 for string in "a:b::c:::d", S("a:b::c:::d"):
368 self.assertTypedEqual(re.findall(":+", string),
369 [":", "::", ":::"])
370 self.assertTypedEqual(re.findall("(:+)", string),
371 [":", "::", ":::"])
372 self.assertTypedEqual(re.findall("(:)(:*)", string),
373 [(":", ""), (":", ":"), (":", "::")])
374 for string in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
375 memoryview(b"a:b::c:::d")):
376 self.assertTypedEqual(re.findall(b":+", string),
377 [b":", b"::", b":::"])
378 self.assertTypedEqual(re.findall(b"(:+)", string),
379 [b":", b"::", b":::"])
380 self.assertTypedEqual(re.findall(b"(:)(:*)", string),
381 [(b":", b""), (b":", b":"), (b":", b"::")])
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300382 for x in ("\xe0", "\u0430", "\U0001d49c"):
383 xx = x * 2
384 xxx = x * 3
385 string = "a%sb%sc%sd" % (x, xx, xxx)
386 self.assertEqual(re.findall("%s+" % x, string), [x, xx, xxx])
387 self.assertEqual(re.findall("(%s+)" % x, string), [x, xx, xxx])
388 self.assertEqual(re.findall("(%s)(%s*)" % (x, x), string),
389 [(x, ""), (x, x), (x, xx)])
Guido van Rossum49946571997-07-18 04:26:25 +0000390
Skip Montanaro5ba00542003-04-25 16:00:14 +0000391 def test_bug_117612(self):
392 self.assertEqual(re.findall(r"(a|(b))", "aba"),
393 [("a", ""),("b", "b"),("a", "")])
394
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000395 def test_re_match(self):
Serhiy Storchaka25324972013-10-16 12:46:28 +0300396 for string in 'a', S('a'):
397 self.assertEqual(re.match('a', string).groups(), ())
398 self.assertEqual(re.match('(a)', string).groups(), ('a',))
399 self.assertEqual(re.match('(a)', string).group(0), 'a')
400 self.assertEqual(re.match('(a)', string).group(1), 'a')
401 self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a'))
402 for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
403 self.assertEqual(re.match(b'a', string).groups(), ())
404 self.assertEqual(re.match(b'(a)', string).groups(), (b'a',))
405 self.assertEqual(re.match(b'(a)', string).group(0), b'a')
406 self.assertEqual(re.match(b'(a)', string).group(1), b'a')
407 self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300408 for a in ("\xe0", "\u0430", "\U0001d49c"):
409 self.assertEqual(re.match(a, a).groups(), ())
410 self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
411 self.assertEqual(re.match('(%s)' % a, a).group(0), a)
412 self.assertEqual(re.match('(%s)' % a, a).group(1), a)
413 self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))
Guido van Rossum49946571997-07-18 04:26:25 +0000414
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000415 pat = re.compile('((a)|(b))(c)?')
416 self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
417 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
418 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
419 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
420 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
Guido van Rossum8430c581998-04-03 21:47:12 +0000421
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000422 # A single group
423 m = re.match('(a)', 'a')
424 self.assertEqual(m.group(0), 'a')
425 self.assertEqual(m.group(0), 'a')
426 self.assertEqual(m.group(1), 'a')
427 self.assertEqual(m.group(1, 1), ('a', 'a'))
Guido van Rossum49946571997-07-18 04:26:25 +0000428
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000429 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
430 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
431 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
432 (None, 'b', None))
433 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000434
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200435 def test_re_fullmatch(self):
436 # Issue 16203: Proposal: add re.fullmatch() method.
437 self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
438 for string in "ab", S("ab"):
439 self.assertEqual(re.fullmatch(r"a|ab", string).span(), (0, 2))
440 for string in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
441 self.assertEqual(re.fullmatch(br"a|ab", string).span(), (0, 2))
442 for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
443 r = r"%s|%s" % (a, a + b)
444 self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2))
445 self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3))
446 self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3))
447 self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2))
448 self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3))
449 self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
450 self.assertIsNone(re.fullmatch(r"a+", "ab"))
451 self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
452 self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
453 self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
454 self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
455 self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4))
456 self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2))
457
458 self.assertEqual(
459 re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
460 self.assertEqual(
461 re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
462 self.assertEqual(
463 re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
464
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000465 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000466 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
467 ('(', 'a'))
468 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
469 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300470 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
471 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000472 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
473 ('a', 'b'))
474 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
475 (None, 'd'))
476 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
477 (None, 'd'))
478 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
479 ('a', ''))
480
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000481 # Tests for bug #1177831: exercise groups other than the first group
482 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
483 self.assertEqual(p.match('abc').groups(),
484 ('a', 'b', 'c'))
485 self.assertEqual(p.match('ad').groups(),
486 ('a', None, 'd'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300487 self.assertIsNone(p.match('abd'))
488 self.assertIsNone(p.match('ac'))
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000489
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300490 # Support > 100 groups.
491 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
492 pat = '(?:%s)(?(200)z)' % pat
493 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000494
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200495 self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
496 self.checkPatternError(r'()(?(1)a|b',
497 'missing ), unterminated subpattern', 2)
498 self.checkPatternError(r'()(?(1)a|b|c)',
499 'conditional backref with more than '
500 'two branches', 10)
501
502 def test_re_groupref_overflow(self):
503 self.checkTemplateError('()', '\g<%s>' % sre_constants.MAXGROUPS, 'xx',
504 'invalid group reference', 3)
505 self.checkPatternError(r'(?P<a>)(?(%d))' % sre_constants.MAXGROUPS,
506 'invalid group reference', 10)
507
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000508 def test_re_groupref(self):
509 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
510 ('|', 'a'))
511 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
512 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300513 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
514 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000515 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
516 ('a', 'a'))
517 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
518 (None, None))
519
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200520 self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
521
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000522 def test_groupdict(self):
523 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
524 'first second').groupdict(),
525 {'first':'first', 'second':'second'})
526
527 def test_expand(self):
528 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
529 "first second")
530 .expand(r"\2 \1 \g<second> \g<first>"),
531 "second first second first")
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300532 self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
533 "first")
534 .expand(r"\2 \g<second>"),
535 " ")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000536
def test_repeat_minmax(self):
    # Bounded repeats ({m} and {m,n}) in greedy and lazy ("?"-suffixed) form,
    # applied first to a capturing group and then to a plain literal.
    # Patterns are raw strings so "\w" reaches the regex parser without
    # relying on Python passing an unknown string escape through.

    # Upper bound below the three-character subject: anchored match fails.
    self.assertIsNone(re.match(r"^(\w){1}$", "abc"))
    self.assertIsNone(re.match(r"^(\w){1}?$", "abc"))
    self.assertIsNone(re.match(r"^(\w){1,2}$", "abc"))
    self.assertIsNone(re.match(r"^(\w){1,2}?$", "abc"))

    # Greedy forms: the group reports its last iteration ("c").
    self.assertEqual(re.match(r"^(\w){3}$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){1,3}$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){1,4}$", "abc").group(1), "c")
    # Previously this line repeated the lazy {3,4}? case below, leaving
    # the greedy {3,4} form untested.
    self.assertEqual(re.match(r"^(\w){3,4}$", "abc").group(1), "c")
    # Lazy forms.
    self.assertEqual(re.match(r"^(\w){3}?$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){1,3}?$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){1,4}?$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c")

    # Same checks with a repeated literal instead of a group.
    self.assertIsNone(re.match("^x{1}$", "xxx"))
    self.assertIsNone(re.match("^x{1}?$", "xxx"))
    self.assertIsNone(re.match("^x{1,2}$", "xxx"))
    self.assertIsNone(re.match("^x{1,2}?$", "xxx"))

    self.assertTrue(re.match("^x{3}$", "xxx"))
    self.assertTrue(re.match("^x{1,3}$", "xxx"))
    self.assertTrue(re.match("^x{3,3}$", "xxx"))
    self.assertTrue(re.match("^x{1,4}$", "xxx"))
    # Greedy {3,4}; the original duplicated the lazy variant here.
    self.assertTrue(re.match("^x{3,4}$", "xxx"))
    self.assertTrue(re.match("^x{3}?$", "xxx"))
    self.assertTrue(re.match("^x{1,3}?$", "xxx"))
    self.assertTrue(re.match("^x{1,4}?$", "xxx"))
    self.assertTrue(re.match("^x{3,4}?$", "xxx"))

    # Empty braces are not a repeat: "{}" is matched literally.
    self.assertIsNone(re.match("^x{}$", "xxx"))
    self.assertTrue(re.match("^x{}$", "x{}"))

    # min > max is rejected; checkPatternError is a helper defined outside
    # this excerpt.
    self.checkPatternError(r'x{2,1}',
                           'min repeat greater than max repeat', 2)
572
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000573 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000574 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000575 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000576 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
577 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
578 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
579 {'first': 1, 'other': 2})
580
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000581 self.assertEqual(re.match("(a)", "a").pos, 0)
582 self.assertEqual(re.match("(a)", "a").endpos, 1)
583 self.assertEqual(re.match("(a)", "a").string, "a")
584 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300585 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000586
Serhiy Storchaka07360df2015-03-30 01:01:48 +0300587 # Issue 14260. groupindex should be non-modifiable mapping.
588 p = re.compile(r'(?i)(?P<first>a)(?P<other>b)')
589 self.assertEqual(sorted(p.groupindex), ['first', 'other'])
590 self.assertEqual(p.groupindex['other'], 2)
591 with self.assertRaises(TypeError):
592 p.groupindex['other'] = 0
593 self.assertEqual(p.groupindex['other'], 2)
594
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000595 def test_special_escapes(self):
596 self.assertEqual(re.search(r"\b(b.)\b",
597 "abcd abc bcd bx").group(1), "bx")
598 self.assertEqual(re.search(r"\B(b.)\B",
599 "abc bcd bc abxd").group(1), "bx")
600 self.assertEqual(re.search(r"\b(b.)\b",
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300601 "abcd abc bcd bx", re.ASCII).group(1), "bx")
602 self.assertEqual(re.search(r"\B(b.)\B",
603 "abc bcd bc abxd", re.ASCII).group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000604 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
605 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300606 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300607 self.assertEqual(re.search(br"\b(b.)\b",
608 b"abcd abc bcd bx").group(1), b"bx")
609 self.assertEqual(re.search(br"\B(b.)\B",
610 b"abc bcd bc abxd").group(1), b"bx")
611 self.assertEqual(re.search(br"\b(b.)\b",
612 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
613 self.assertEqual(re.search(br"\B(b.)\B",
614 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
615 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
616 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300617 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000618 self.assertEqual(re.search(r"\d\D\w\W\s\S",
619 "1aa! a").group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300620 self.assertEqual(re.search(br"\d\D\w\W\s\S",
621 b"1aa! a").group(0), b"1aa! a")
622 self.assertEqual(re.search(r"\d\D\w\W\s\S",
623 "1aa! a", re.ASCII).group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300624 self.assertEqual(re.search(br"\d\D\w\W\s\S",
625 b"1aa! a", re.LOCALE).group(0), b"1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000626
def test_other_escapes(self):
    # Backslash escapes of punctuation, inside and outside character sets,
    # followed by escapes of ASCII letters that have no special meaning.
    # checkPatternError is a helper defined outside this excerpt.
    self.checkPatternError("\\", 'bad escape (end of pattern)', 0)
    self.assertEqual(re.match(r"\(", '(').group(), '(')
    self.assertIsNone(re.match(r"\(", ')'))
    self.assertEqual(re.match(r"\\", '\\').group(), '\\')
    self.assertEqual(re.match(r"[\]]", ']').group(), ']')
    self.assertIsNone(re.match(r"[\]]", '['))
    self.assertEqual(re.match(r"[a\-c]", '-').group(), '-')
    self.assertIsNone(re.match(r"[a\-c]", 'b'))
    self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
    self.assertIsNone(re.match(r"[\^a]+", 'b'))
    re.purge()  # for warnings
    # Each listed letter, when escaped, is expected to match itself while
    # emitting a DeprecationWarning.
    # NOTE(review): the nesting of the assertIsNone line inside assertWarns
    # is reconstructed; the indentation was lost in this copy -- confirm.
    for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY':
        with self.subTest(c):
            with self.assertWarns(DeprecationWarning):
                self.assertEqual(re.fullmatch('\\%c' % c, c).group(), c)
                self.assertIsNone(re.match('\\%c' % c, 'a'))
    # Same check inside a character set, over a slightly larger letter list.
    for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
        with self.subTest(c):
            with self.assertWarns(DeprecationWarning):
                self.assertEqual(re.fullmatch('[\\%c]' % c, c).group(), c)
                self.assertIsNone(re.match('[\\%c]' % c, 'a'))
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200649
Ezio Melotti5a045b92012-02-29 11:48:44 +0200650 def test_string_boundaries(self):
651 # See http://bugs.python.org/issue10713
652 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
653 "abc")
654 # There's a word boundary at the start of a string.
655 self.assertTrue(re.match(r"\b", "abc"))
656 # A non-empty string includes a non-boundary zero-length match.
657 self.assertTrue(re.search(r"\B", "abc"))
658 # There is no non-boundary match at the start of a string.
659 self.assertFalse(re.match(r"\B", "abc"))
660 # However, an empty string contains no word boundaries, and also no
661 # non-boundaries.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300662 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200663 # This one is questionable and different from the perlre behaviour,
664 # but describes current behavior.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300665 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200666 # A single word-character string has two boundaries, but no
667 # non-boundary gaps.
668 self.assertEqual(len(re.findall(r"\b", "a")), 2)
669 self.assertEqual(len(re.findall(r"\B", "a")), 0)
670 # If there are no words, there are no boundaries
671 self.assertEqual(len(re.findall(r"\b", " ")), 0)
672 self.assertEqual(len(re.findall(r"\b", " ")), 0)
673 # Can match around the whitespace.
674 self.assertEqual(len(re.findall(r"\B", " ")), 2)
675
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000676 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000677 self.assertEqual(re.match("([\u2222\u2223])",
678 "\u2222").group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300679 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300680 self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000681
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100682 def test_big_codesize(self):
683 # Issue #1160
684 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300685 self.assertTrue(r.match('1000'))
686 self.assertTrue(r.match('9999'))
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100687
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000688 def test_anyall(self):
689 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
690 "a\nb")
691 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
692 "a\n\nb")
693
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200694 def test_lookahead(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000695 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
696 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
697 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
698 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
699 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
700 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
701 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
702
703 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
704 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
705 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
706 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
707
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200708 # Group reference.
709 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
710 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
711 # Conditional group reference.
712 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
713 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
714 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
715 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
716 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
717 # Group used before defined.
718 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
719 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
720 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
721
722 def test_lookbehind(self):
723 self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
724 self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
725 self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
726 self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
727 # Group reference.
728 self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
729 self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
730 self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
731 self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
732 # Conditional group reference.
733 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
734 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
735 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
736 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
737 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
738 # Group used before defined.
739 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
740 self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc'))
741 self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc'))
742 # Group defined in the same lookbehind pattern
743 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)')
744 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)')
745 self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)')
746 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)')
747
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000748 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000749 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300750 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000751 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
752 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
753 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
754 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
755 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
756 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
757 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
758 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
759
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200760 assert '\u212a'.lower() == 'k' # 'K'
761 self.assertTrue(re.match(r'K', '\u212a', re.I))
762 self.assertTrue(re.match(r'k', '\u212a', re.I))
763 self.assertTrue(re.match(r'\u212a', 'K', re.I))
764 self.assertTrue(re.match(r'\u212a', 'k', re.I))
765 assert '\u017f'.upper() == 'S' # 'ſ'
766 self.assertTrue(re.match(r'S', '\u017f', re.I))
767 self.assertTrue(re.match(r's', '\u017f', re.I))
768 self.assertTrue(re.match(r'\u017f', 'S', re.I))
769 self.assertTrue(re.match(r'\u017f', 's', re.I))
770 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
771 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
772 self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
773
774 def test_ignore_case_set(self):
775 self.assertTrue(re.match(r'[19A]', 'A', re.I))
776 self.assertTrue(re.match(r'[19a]', 'a', re.I))
777 self.assertTrue(re.match(r'[19a]', 'A', re.I))
778 self.assertTrue(re.match(r'[19A]', 'a', re.I))
779 self.assertTrue(re.match(br'[19A]', b'A', re.I))
780 self.assertTrue(re.match(br'[19a]', b'a', re.I))
781 self.assertTrue(re.match(br'[19a]', b'A', re.I))
782 self.assertTrue(re.match(br'[19A]', b'a', re.I))
783 assert '\u212a'.lower() == 'k' # 'K'
784 self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
785 self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
786 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
787 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
788 assert '\u017f'.upper() == 'S' # 'ſ'
789 self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
790 self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
791 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
792 self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
793 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
794 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
795 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
796
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200797 def test_ignore_case_range(self):
798 # Issues #3511, #17381.
799 self.assertTrue(re.match(r'[9-a]', '_', re.I))
800 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
801 self.assertTrue(re.match(br'[9-a]', b'_', re.I))
802 self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
803 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
804 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
805 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
806 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
807 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
808 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
809 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
810 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
811 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
812 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
813 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
814 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
815
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200816 assert '\u212a'.lower() == 'k' # 'K'
817 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
818 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
819 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
820 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
821 assert '\u017f'.upper() == 'S' # 'ſ'
822 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
823 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
824 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
825 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
826 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
827 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
828 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
829
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000830 def test_category(self):
831 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
832
def test_getlower(self):
    # Call the private _sre.getlower() helper with no flag and with each of
    # LOCALE, UNICODE and ASCII; 'A' is expected to map to 'a' every time.
    # NOTE(review): _sre is an internal extension module; its availability
    # depends on the interpreter version -- confirm before relying on it.
    import _sre
    self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
    self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
    self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
    self.assertEqual(_sre.getlower(ord('A'), re.ASCII), ord('a'))

    # The same case folding observed through the public API, for str and
    # bytes, with and without an explicit ASCII / LOCALE flag.
    self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
    self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
    self.assertEqual(re.match("abc", "ABC", re.I|re.A).group(0), "ABC")
    self.assertEqual(re.match(b"abc", b"ABC", re.I|re.L).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000844
845 def test_not_literal(self):
846 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
847 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
848
849 def test_search_coverage(self):
850 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
851 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
852
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    """Assert that *matcher*(pattern, text) matches with the given result.

    With neither *match* nor *span* given, the pattern must match the
    whole of *text*.  Otherwise both must be supplied: *match* is the
    expected ``group()`` and *span* the expected ``span()``.

    Raises ValueError when exactly one of *match* / *span* is provided.
    """
    if (match is None) != (span is None):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if match is None:
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    found = matcher(pattern, text)
    self.assertTrue(found)
    self.assertEqual(found.group(), match)
    self.assertEqual(found.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000866
def test_re_escape(self):
    # re.escape() on each of the first 256 code points, and on all of them
    # joined into one string.
    passthrough = string.ascii_letters + string.digits + '_'
    every_char = ''.join(map(chr, range(256)))
    for ch in every_char:
        escaped = re.escape(ch)
        if ch in passthrough:
            expected = ch
        elif ch == '\x00':
            expected = '\\000'
        else:
            expected = '\\' + ch
        self.assertEqual(escaped, expected)
        # The escaped form must still match the original character.
        self.assertMatch(escaped, ch)
    self.assertMatch(re.escape(every_char), every_char)
Guido van Rossum49946571997-07-18 04:26:25 +0000879
def test_re_escape_byte(self):
    # bytes counterpart of test_re_escape: every byte value 0..255.
    passthrough = (string.ascii_letters + string.digits + '_').encode('ascii')
    every_byte = bytes(range(256))
    for value in every_byte:
        single = bytes([value])
        escaped = re.escape(single)
        if single in passthrough:
            expected = single
        elif value == 0:
            expected = b'\\000'
        else:
            expected = b'\\' + single
        self.assertEqual(escaped, expected)
        # The escaped form must still match the original byte.
        self.assertMatch(escaped, single)
    self.assertMatch(re.escape(every_byte), every_byte)
Guido van Rossum698280d2008-09-10 17:44:35 +0000893
def test_re_escape_non_ascii(self):
    # re.escape() on a str containing U+2620 between ASCII letters.
    subject = 'xxx\u2620\u2620\u2620xxx'
    escaped = re.escape(subject)
    self.assertEqual(escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
    self.assertMatch(escaped, subject)
    # An escaped character can be used as a building block of a bigger
    # pattern; search for the run of U+2620 with one neighbour each side.
    run_pattern = '.%s+.' % re.escape('\u2620')
    self.assertMatch(run_pattern, subject,
                     match='x\u2620\u2620\u2620x', span=(2, 7),
                     matcher=re.search)
901
def test_re_escape_non_ascii_bytes(self):
    # re.escape() on the UTF-8 encoding of a string containing U+2620.
    encoded = 'y\u2620y\u2620y'.encode('utf-8')
    escaped = re.escape(encoded)
    self.assertEqual(escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
    self.assertMatch(escaped, encoded)
    # The escaped three-byte sequence occurs twice in the subject.
    needle = re.escape('\u2620'.encode('utf-8'))
    occurrences = re.findall(needle, encoded)
    self.assertEqual(len(occurrences), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000909
Serhiy Storchakab85a9762014-09-15 11:33:19 +0300910 def test_pickling(self):
911 import pickle
912 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
913 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
914 pickled = pickle.dumps(oldpat, proto)
915 newpat = pickle.loads(pickled)
916 self.assertEqual(newpat, oldpat)
917 # current pickle expects the _compile() reconstructor in re module
918 from re import _compile
Guido van Rossum23b22571997-07-17 22:36:14 +0000919
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000920 def test_constants(self):
921 self.assertEqual(re.I, re.IGNORECASE)
922 self.assertEqual(re.L, re.LOCALE)
923 self.assertEqual(re.M, re.MULTILINE)
924 self.assertEqual(re.S, re.DOTALL)
925 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000926
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000927 def test_flags(self):
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200928 for flag in [re.I, re.M, re.X, re.S, re.A, re.U]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300929 self.assertTrue(re.compile('^pattern$', flag))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200930 for flag in [re.I, re.M, re.X, re.S, re.A, re.L]:
931 self.assertTrue(re.compile(b'^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +0000932
def test_sre_character_literals(self):
    # Numeric character escapes (octal, \x, \u, \U) in str patterns.
    # Each escape is also followed by a digit or letter to check where the
    # escape ends.
    for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        if i < 256:
            # Three-digit octal and two-digit hex only cover 0..255.
            self.assertTrue(re.match(r"\%03o" % i, chr(i)))
            self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
            self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
            self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
            self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
            self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
        if i < 0x10000:
            # \u takes exactly four hex digits.
            self.assertTrue(re.match(r"\u%04x" % i, chr(i)))
            self.assertTrue(re.match(r"\u%04x0" % i, chr(i)+"0"))
            self.assertTrue(re.match(r"\u%04xz" % i, chr(i)+"z"))
        # \U takes eight hex digits and is exercised for every value.
        self.assertTrue(re.match(r"\U%08x" % i, chr(i)))
        self.assertTrue(re.match(r"\U%08x0" % i, chr(i)+"0"))
        self.assertTrue(re.match(r"\U%08xz" % i, chr(i)+"z"))
    # Short octal escapes starting with 0, followed by a non-octal digit.
    self.assertTrue(re.match(r"\0", "\000"))
    self.assertTrue(re.match(r"\08", "\0008"))
    self.assertTrue(re.match(r"\01", "\001"))
    self.assertTrue(re.match(r"\018", "\0018"))
    # Malformed or out-of-range escapes; checkPatternError is a helper
    # defined outside this excerpt.
    self.checkPatternError(r"\567",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 0)
    self.checkPatternError(r"\911", 'invalid group reference', 0)
    self.checkPatternError(r"\x1", r'incomplete escape \x1', 0)
    self.checkPatternError(r"\x1z", r'incomplete escape \x1', 0)
    self.checkPatternError(r"\u123", r'incomplete escape \u123', 0)
    self.checkPatternError(r"\u123z", r'incomplete escape \u123', 0)
    self.checkPatternError(r"\U0001234", r'incomplete escape \U0001234', 0)
    self.checkPatternError(r"\U0001234z", r'incomplete escape \U0001234', 0)
    self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0)
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000964
def test_sre_character_class_literals(self):
    # Numeric character escapes (octal, \x, \u, \U) inside a character set
    # of a str pattern.
    for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        if i < 256:
            # Octal (with and without zero padding) and two-digit hex.
            self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
            self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
            self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
            self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
            self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
            self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
            self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
            self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
        if i < 0x10000:
            # \u takes exactly four hex digits.
            self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
            self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
            self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
        # \U takes eight hex digits and is exercised for every value.
        self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
        self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
        self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
    # Malformed or out-of-range escapes; checkPatternError is a helper
    # defined outside this excerpt.
    self.checkPatternError(r"[\567]",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 1)
    self.checkPatternError(r"[\911]", r'bad escape \9', 1)
    self.checkPatternError(r"[\x1z]", r'incomplete escape \x1', 1)
    self.checkPatternError(r"[\u123z]", r'incomplete escape \u123', 1)
    self.checkPatternError(r"[\U0001234z]", r'incomplete escape \U0001234', 1)
    self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1)
    # A range whose endpoints are both written as \U escapes.
    self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200992
def test_sre_byte_literals(self):
    # Numeric escapes (octal and \x) in bytes patterns.
    for i in [0, 8, 16, 32, 64, 127, 128, 255]:
        self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
        self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
        self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
        self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
    # In bytes patterns \u and \U are expected to match the literal letter
    # followed by the digits, and to emit a DeprecationWarning.
    with self.assertWarns(DeprecationWarning):
        self.assertTrue(re.match(br"\u1234", b'u1234'))
    with self.assertWarns(DeprecationWarning):
        self.assertTrue(re.match(br"\U00012345", b'U00012345'))
    # Short octal escapes starting with 0, followed by a non-octal digit.
    self.assertTrue(re.match(br"\0", b"\000"))
    self.assertTrue(re.match(br"\08", b"\0008"))
    self.assertTrue(re.match(br"\01", b"\001"))
    self.assertTrue(re.match(br"\018", b"\0018"))
    # Malformed or out-of-range escapes; checkPatternError is a helper
    # defined outside this excerpt.
    self.checkPatternError(br"\567",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 0)
    self.checkPatternError(br"\911", 'invalid group reference', 0)
    self.checkPatternError(br"\x1", r'incomplete escape \x1', 0)
    self.checkPatternError(br"\x1z", r'incomplete escape \x1', 0)
Antoine Pitrou463badf2012-06-23 13:29:19 +02001015
def test_sre_byte_class_literals(self):
    # Numeric escapes (octal and \x) inside a character set of a bytes
    # pattern.
    for i in [0, 8, 16, 32, 64, 127, 128, 255]:
        self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
    # In a bytes set \u and \U are expected to stand for the literal letter
    # and to emit a DeprecationWarning.
    with self.assertWarns(DeprecationWarning):
        self.assertTrue(re.match(br"[\u1234]", b'u'))
    with self.assertWarns(DeprecationWarning):
        self.assertTrue(re.match(br"[\U00012345]", b'U'))
    # Malformed or out-of-range escapes; checkPatternError is a helper
    # defined outside this excerpt.
    self.checkPatternError(br"[\567]",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 1)
    self.checkPatternError(br"[\911]", r'bad escape \9', 1)
    self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1)
1035
def test_character_set_errors(self):
    # Malformed character sets must be reported as pattern errors.
    # Each row is (pattern, expected message, expected error position);
    # checkPatternError is a helper defined outside this excerpt.
    malformed = [
        (r'[', 'unterminated character set', 0),
        (r'[^', 'unterminated character set', 0),
        (r'[a', 'unterminated character set', 0),
        # bug 545855 -- This pattern failed to cause a compile error as it
        # should, instead provoking a TypeError.
        (r"[a-", 'unterminated character set', 0),
        (r"[\w-b]", r'bad character range \w-b', 1),
        (r"[a-\w]", r'bad character range a-\w', 1),
        (r"[b-a]", 'bad character range b-a', 1),
    ]
    for pattern, message, position in malformed:
        self.checkPatternError(pattern, message, position)
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +00001046
Skip Montanaro7d9963f2003-04-25 14:12:40 +00001047 def test_bug_113254(self):
1048 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
1049 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
1050 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
1051
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001052 def test_bug_527371(self):
1053 # bug described in patches 527371/672491
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001054 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001055 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
1056 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
1057 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
1058 self.assertEqual(re.match("((a))", "a").lastindex, 1)
1059
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001060 def test_bug_418626(self):
1061 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
1062 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
1063 # pattern '*?' on a long string.
1064 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
1065 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
1066 20003)
1067 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001068 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +00001069 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001070 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001071
1072 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001073 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001074 self.assertEqual(re.compile(pat) and 1, 1)
1075
Skip Montanaro1e703c62003-04-25 15:40:28 +00001076 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001077 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +00001078 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001079 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
1080 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
1081 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +00001082
def test_nothing_to_repeat(self):
    # A quantifier with nothing before it is an error, both at the start
    # of the pattern and at the start of a group.  checkPatternError is a
    # helper defined outside this excerpt.
    quantifiers = [(reps, mod)
                   for reps in ('*', '+', '?', '{1,2}')
                   for mod in ('', '?')]
    for reps, mod in quantifiers:
        self.checkPatternError('%s%s' % (reps, mod),
                               'nothing to repeat', 0)
        self.checkPatternError('(?:%s%s)' % (reps, mod),
                               'nothing to repeat', 3)
1090
def test_multiple_repeat(self):
    # Stacking one quantifier directly on another is an error reported at
    # the position of the second quantifier.  checkPatternError is a
    # helper defined outside this excerpt.
    outer_ops = [reps + mod
                 for reps in ('*', '+', '{1,2}')
                 for mod in ('', '?')]
    inner_ops = [reps + mod
                 for reps in ('*', '+', '?', '{1,2}')
                 for mod in ('', '?')]
    for outer_op in outer_ops:
        for inner_op in inner_ops:
            self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
                                   'multiple repeat', 1 + len(inner_op))
1100
Serhiy Storchakafa468162013-02-16 21:23:53 +02001101 def test_unlimited_zero_width_repeat(self):
1102 # Issue #9669
1103 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
1104 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
1105 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
1106 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
1107 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
1108 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
1109
Skip Montanaro1e703c62003-04-25 15:40:28 +00001110 def test_scanner(self):
1111 def s_ident(scanner, token): return token
1112 def s_operator(scanner, token): return "op%s" % token
1113 def s_float(scanner, token): return float(token)
1114 def s_int(scanner, token): return int(token)
1115
1116 scanner = Scanner([
1117 (r"[a-zA-Z_]\w*", s_ident),
1118 (r"\d+\.\d*", s_float),
1119 (r"\d+", s_int),
1120 (r"=|\+|-|\*|/", s_operator),
1121 (r"\s+", None),
1122 ])
1123
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001124 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +00001125
Skip Montanaro1e703c62003-04-25 15:40:28 +00001126 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
1127 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
1128 'op+', 'bar'], ''))
1129
Skip Montanaro5ba00542003-04-25 16:00:14 +00001130 def test_bug_448951(self):
1131 # bug 448951 (similar to 429357, but with single char match)
1132 # (Also test greedy matches.)
1133 for op in '','?','*':
1134 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
1135 (None, None))
1136 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
1137 ('a:', 'a'))
1138
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001139 def test_bug_725106(self):
1140 # capturing groups in alternatives in repeats
1141 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
1142 ('b', 'a'))
1143 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
1144 ('c', 'b'))
1145 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
1146 ('b', None))
1147 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
1148 ('b', None))
1149 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
1150 ('b', 'a'))
1151 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
1152 ('c', 'b'))
1153 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
1154 ('b', None))
1155 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
1156 ('b', None))
1157
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +00001158 def test_bug_725149(self):
1159 # mark_stack_base restoring before restoring marks
1160 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
1161 ('a', None))
1162 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
1163 ('a', None, None))
1164
Just van Rossum12723ba2003-07-02 20:03:04 +00001165 def test_bug_764548(self):
1166 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001167 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +00001168 pat = re.compile(my_unicode("abc"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001169 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +00001170
Skip Montanaro5ba00542003-04-25 16:00:14 +00001171 def test_finditer(self):
1172 iter = re.finditer(r":+", "a:b::c:::d")
1173 self.assertEqual([item.group(0) for item in iter],
1174 [":", "::", ":::"])
1175
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001176 pat = re.compile(r":+")
1177 iter = pat.finditer("a:b::c:::d", 1, 10)
1178 self.assertEqual([item.group(0) for item in iter],
1179 [":", "::", ":::"])
1180
1181 pat = re.compile(r":+")
1182 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
1183 self.assertEqual([item.group(0) for item in iter],
1184 [":", "::", ":::"])
1185
1186 pat = re.compile(r":+")
1187 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1188 self.assertEqual([item.group(0) for item in iter],
1189 [":", "::", ":::"])
1190
1191 pat = re.compile(r":+")
1192 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1193 self.assertEqual([item.group(0) for item in iter],
1194 ["::", "::"])
1195
Thomas Wouters40a088d2008-03-18 20:19:54 +00001196 def test_bug_926075(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001197 self.assertIsNot(re.compile('bug_926075'),
1198 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +00001199
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001200 def test_bug_931848(self):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001201 pattern = "[\u002E\u3002\uFF0E\uFF61]"
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001202 self.assertEqual(re.compile(pattern).split("a.b.c"),
1203 ['a','b','c'])
1204
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001205 def test_bug_581080(self):
1206 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +00001207 self.assertEqual(next(iter).span(), (1,2))
1208 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001209
1210 scanner = re.compile(r"\s").scanner("a b")
1211 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001212 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001213
1214 def test_bug_817234(self):
1215 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +00001216 self.assertEqual(next(iter).span(), (0, 4))
1217 self.assertEqual(next(iter).span(), (4, 4))
1218 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001219
Mark Dickinson1f268282009-07-28 17:22:36 +00001220 def test_bug_6561(self):
1221 # '\d' should match characters in Unicode category 'Nd'
1222 # (Number, Decimal Digit), but not those in 'Nl' (Number,
1223 # Letter) or 'No' (Number, Other).
1224 decimal_digits = [
1225 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1226 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1227 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1228 ]
1229 for x in decimal_digits:
1230 self.assertEqual(re.match('^\d$', x).group(0), x)
1231
1232 not_decimal_digits = [
1233 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1234 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1235 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1236 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1237 ]
1238 for x in not_decimal_digits:
1239 self.assertIsNone(re.match('^\d$', x))
1240
Guido van Rossumd8faa362007-04-27 19:54:29 +00001241 def test_empty_array(self):
1242 # SF buf 1647541
1243 import array
Guido van Rossum166746c2007-07-03 15:39:16 +00001244 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +00001245 a = array.array(typecode)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001246 self.assertIsNone(re.compile(b"bla").match(a))
Antoine Pitroufd036452008-08-19 17:56:33 +00001247 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001248
Christian Heimes072c0f12008-01-03 23:01:04 +00001249 def test_inline_flags(self):
1250 # Bug #1700
Serhiy Storchakaab140882014-11-11 21:13:28 +02001251 upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
1252 lower_char = '\u1ea1' # Latin Small Letter A with Dot Below
Christian Heimes072c0f12008-01-03 23:01:04 +00001253
1254 p = re.compile(upper_char, re.I | re.U)
1255 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001256 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001257
1258 p = re.compile(lower_char, re.I | re.U)
1259 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001260 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001261
1262 p = re.compile('(?i)' + upper_char, re.U)
1263 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001264 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001265
1266 p = re.compile('(?i)' + lower_char, re.U)
1267 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001268 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001269
1270 p = re.compile('(?iu)' + upper_char)
1271 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001272 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001273
1274 p = re.compile('(?iu)' + lower_char)
1275 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001276 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001277
Serhiy Storchakacc66a652016-09-11 01:39:51 +03001278 self.assertTrue(re.match('(?ixu) ' + upper_char, lower_char))
1279 self.assertTrue(re.match('(?ixu) ' + lower_char, upper_char))
1280
Christian Heimes25bb7832008-01-11 16:17:00 +00001281 def test_dollar_matches_twice(self):
1282 "$ matches the end of string, and just before the terminating \n"
1283 pattern = re.compile('$')
1284 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1285 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1286 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1287
1288 pattern = re.compile('$', re.MULTILINE)
1289 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1290 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1291 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1292
Antoine Pitroufd036452008-08-19 17:56:33 +00001293 def test_bytes_str_mixing(self):
1294 # Mixing str and bytes is disallowed
1295 pat = re.compile('.')
1296 bpat = re.compile(b'.')
1297 self.assertRaises(TypeError, pat.match, b'b')
1298 self.assertRaises(TypeError, bpat.match, 'b')
1299 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1300 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1301 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1302 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1303 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1304 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1305
1306 def test_ascii_and_unicode_flag(self):
1307 # String patterns
1308 for flags in (0, re.UNICODE):
1309 pat = re.compile('\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001310 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001311 pat = re.compile('\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001312 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001313 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001314 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001315 pat = re.compile('(?a)\xc0', re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001316 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001317 pat = re.compile('\w', re.ASCII)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001318 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001319 pat = re.compile('(?a)\w')
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001320 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001321 # Bytes patterns
1322 for flags in (0, re.ASCII):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001323 pat = re.compile(b'\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001324 self.assertIsNone(pat.match(b'\xe0'))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001325 pat = re.compile(b'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001326 self.assertIsNone(pat.match(b'\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001327 # Incompatibilities
1328 self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
1329 self.assertRaises(ValueError, re.compile, b'(?u)\w')
1330 self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
1331 self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
1332 self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
1333 self.assertRaises(ValueError, re.compile, '(?au)\w')
1334
def test_locale_flag(self):
    """Check LOCALE flag handling for bytes patterns and flag conflicts.

    The test first looks for a byte in 128..255 that the current
    LC_CTYPE encoding decodes to a letter whose lowercase form encodes
    back to a single, different byte.  If none is found, the
    matching assertions are skipped but every pattern is still compiled.

    Regex escapes (``\\w``) are written as raw bytes literals, because
    ``b'\\w'`` in a plain literal is an invalid escape sequence.
    """
    import locale
    _, enc = locale.getlocale(locale.LC_CTYPE)
    # Search non-ASCII letter
    for i in range(128, 256):
        try:
            c = bytes([i]).decode(enc)
            sletter = c.lower()
            if sletter == c: continue
            bletter = sletter.encode(enc)
            if len(bletter) != 1: continue
            if bletter.decode(enc) != sletter: continue
            bpat = re.escape(bytes([i]))
            break
        except (UnicodeError, TypeError):
            # Undecodable byte, or no usable encoding name (enc is None).
            pass
    else:
        bletter = None
        bpat = b'A'
    # Bytes patterns
    pat = re.compile(bpat, re.LOCALE | re.IGNORECASE)
    if bletter:
        self.assertTrue(pat.match(bletter))
    pat = re.compile(b'(?L)' + bpat, re.IGNORECASE)
    if bletter:
        self.assertTrue(pat.match(bletter))
    pat = re.compile(bpat, re.IGNORECASE)
    if bletter:
        self.assertIsNone(pat.match(bletter))
    pat = re.compile(br'\w', re.LOCALE)
    if bletter:
        self.assertTrue(pat.match(bletter))
    pat = re.compile(br'(?L)\w')
    if bletter:
        self.assertTrue(pat.match(bletter))
    pat = re.compile(br'\w')
    if bletter:
        self.assertIsNone(pat.match(bletter))
    # Incompatibilities
    self.assertWarns(DeprecationWarning, re.compile, '', re.LOCALE)
    self.assertWarns(DeprecationWarning, re.compile, '(?L)')
    self.assertWarns(DeprecationWarning, re.compile, b'', re.LOCALE | re.ASCII)
    self.assertWarns(DeprecationWarning, re.compile, b'(?L)', re.ASCII)
    self.assertWarns(DeprecationWarning, re.compile, b'(?a)', re.LOCALE)
    self.assertWarns(DeprecationWarning, re.compile, b'(?aL)')
1380
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001381 def test_bug_6509(self):
1382 # Replacement strings of both types must parse properly.
1383 # all strings
1384 pat = re.compile('a(\w)')
1385 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1386 pat = re.compile('a(.)')
1387 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1388 pat = re.compile('..')
1389 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1390
1391 # all bytes
1392 pat = re.compile(b'a(\w)')
1393 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1394 pat = re.compile(b'a(.)')
1395 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1396 pat = re.compile(b'..')
1397 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1398
def test_dealloc(self):
    """Invalid arguments must raise cleanly instead of crashing."""
    # issue 3299: check for segfault in debug build
    import _sre
    # the overflow limit is different on wide and narrow builds and it
    # depends on the definition of SRE_CODE (see sre.h).
    # 2**128 should be big enough to overflow on both. For smaller values
    # a RuntimeError is raised instead of OverflowError.
    long_overflow = 2**128
    with self.assertRaises(TypeError):
        re.finditer("a", {})
    self.assertRaises(OverflowError, _sre.compile,
                      "abc", 0, [long_overflow], 0, [], [])
    self.assertRaises(TypeError, _sre.compile,
                      {}, 0, [], 0, [], [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 def test_search_dot_unicode(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001414 self.assertTrue(re.search("123.*-", '123abc-'))
1415 self.assertTrue(re.search("123.*-", '123\xe9-'))
1416 self.assertTrue(re.search("123.*-", '123\u20ac-'))
1417 self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1418 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419
Ezio Melottidf723e12012-03-13 01:29:48 +02001420 def test_compile(self):
1421 # Test return value when given string and pattern as parameter
1422 pattern = re.compile('random pattern')
1423 self.assertIsInstance(pattern, re._pattern_type)
1424 same_pattern = re.compile(pattern)
1425 self.assertIsInstance(same_pattern, re._pattern_type)
1426 self.assertIs(same_pattern, pattern)
1427 # Test behaviour when not given a string or pattern as parameter
1428 self.assertRaises(TypeError, re.compile, 0)
1429
def test_bug_13899(self):
    """Letter escapes inside a character class match the literal letters (with a warning)."""
    # Issue #13899: re pattern r"[\A]" should work like "A" but matches
    # nothing. Ditto B and Z.
    with self.assertWarns(DeprecationWarning):
        found = re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ')
        self.assertEqual(found, ['A', 'B', '\b', 'C', 'Z'])
Ezio Melottife8e6e72013-01-11 08:32:01 +02001436
@bigmemtest(size=_2G, memuse=1)
def test_large_search(self, size):
    """Search for the end of a string of ``size`` characters."""
    # Issue #10182: indices were 32-bit-truncated.
    subject = 'a' * size
    m = re.search('$', subject)
    self.assertIsNotNone(m)
    # The empty match at end-of-string starts and ends at ``size``.
    for position in (m.start(), m.end()):
        self.assertEqual(position, size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001445
# The huge memuse is because of re.sub() using a list and a join()
# to create the replacement result.
@bigmemtest(size=_2G, memuse=16 + 2)
def test_large_subn(self, size):
    """Run an empty-pattern ``subn`` over a string of ``size`` characters."""
    # Issue #10182: indices were 32-bit-truncated.
    subject = 'a' * size
    result, count = re.subn('', '', subject)
    self.assertEqual(result, subject)
    # The empty pattern matches at every position, including the end.
    self.assertEqual(count, size + 1)
1455
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001456 def test_bug_16688(self):
1457 # Issue 16688: Backreferences make case-insensitive regex fail on
1458 # non-ASCII strings.
1459 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1460 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001461
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001462 def test_repeat_minmax_overflow(self):
1463 # Issue #13169
1464 string = "x" * 100000
1465 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1466 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1467 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1468 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1469 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1470 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1471 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1472 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1473 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1474 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1475 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1476
@cpython_only
def test_repeat_minmax_overflow_maxrepeat(self):
    """Repeat counts just below ``MAXREPEAT`` compile; ``MAXREPEAT`` itself overflows."""
    try:
        from _sre import MAXREPEAT
    except ImportError:
        self.skipTest('requires _sre.MAXREPEAT constant')
    subject = "x" * 100000
    below_limit = MAXREPEAT - 1
    self.assertIsNone(re.match(r".{%d}" % below_limit, subject))
    upper_bounded = re.match(r".{,%d}" % below_limit, subject)
    self.assertEqual(upper_bounded.span(), (0, 100000))
    self.assertIsNone(re.match(r".{%d,}?" % below_limit, subject))
    for template in (r".{%d}", r".{,%d}", r".{%d,}?"):
        self.assertRaises(OverflowError, re.compile, template % MAXREPEAT)
1491
def test_backref_group_name_in_exception(self):
    """The error for a bad named backreference quotes the offending name."""
    # Issue 17341: Poor error message when compiling invalid regex
    pattern = '(?P=<foo>)'
    message = "bad character in group name '<foo>'"
    self.checkPatternError(pattern, message, 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001496
def test_group_name_in_exception(self):
    """The error for a bad group name quotes the offending name."""
    # Issue 17341: Poor error message when compiling invalid regex
    pattern = '(?P<?foo>)'
    message = "bad character in group name '?foo'"
    self.checkPatternError(pattern, message, 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001501
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001502 def test_issue17998(self):
1503 for reps in '*', '+', '?', '{1}':
1504 for mod in '', '?':
1505 pattern = '.' + reps + mod + 'yz'
1506 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1507 ['xyz'], msg=pattern)
1508 pattern = pattern.encode()
1509 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1510 [b'xyz'], msg=pattern)
1511
def test_match_repr(self):
    """``repr()`` of a match object shows its type, span and matched text.

    Covers str and bytes-like subjects, including the ``S`` and ``B``
    subclasses, plus two successive matches from ``finditer``.
    """
    for subject in '[abracadabra]', S('[abracadabra]'):
        m = re.search(r'(.+)(.*?)\1', subject)
        self.assertEqual(repr(m), "<%s.%s object; "
                         "span=(1, 12), match='abracadabra'>" %
                         (type(m).__module__, type(m).__qualname__))
    for subject in (b'[abracadabra]', B(b'[abracadabra]'),
                    bytearray(b'[abracadabra]'),
                    memoryview(b'[abracadabra]')):
        m = re.search(rb'(.+)(.*?)\1', subject)
        self.assertEqual(repr(m), "<%s.%s object; "
                         "span=(1, 12), match=b'abracadabra'>" %
                         (type(m).__module__, type(m).__qualname__))

    first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
    # Build each expected repr from the type of the object being checked
    # (the module name for ``first`` used to be taken from ``second``).
    self.assertEqual(repr(first), "<%s.%s object; "
                     "span=(0, 2), match='aa'>" %
                     (type(first).__module__, type(first).__qualname__))
    self.assertEqual(repr(second), "<%s.%s object; "
                     "span=(3, 5), match='bb'>" %
                     (type(second).__module__, type(second).__qualname__))
1533
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001534
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001535 def test_bug_2537(self):
1536 # issue 2537: empty submatches
1537 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1538 for inner_op in ('{0,}', '*', '?'):
1539 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1540 m = r.match("xyyzy")
1541 self.assertEqual(m.group(0), "xyy")
1542 self.assertEqual(m.group(1), "")
1543 self.assertEqual(m.group(2), "y")
1544
def test_debug_flag(self):
    """``re.DEBUG`` prints the parse tree on every compile call."""
    pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
    # NOTE(review): the nesting indentation inside the expected dump was
    # reconstructed (two spaces per level); confirm against the actual
    # re.DEBUG output of the targeted interpreter version.
    dump = '''\
SUBPATTERN 1
  LITERAL 46
SUBPATTERN None
  BRANCH
    IN
      LITERAL 99
      LITERAL 104
  OR
    LITERAL 112
    LITERAL 121
SUBPATTERN None
  GROUPREF_EXISTS 1
    AT AT_END
  ELSE
    LITERAL 58
    LITERAL 32
'''
    with captured_stdout() as first_out:
        re.compile(pat, re.DEBUG)
    self.assertEqual(first_out.getvalue(), dump)
    # Debug output is output again even a second time (bypassing
    # the cache -- issue #20426).
    with captured_stdout() as second_out:
        re.compile(pat, re.DEBUG)
    self.assertEqual(second_out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001573
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001574 def test_keyword_parameters(self):
1575 # Issue #20283: Accepting the string keyword parameter.
1576 pat = re.compile(r'(ab)')
1577 self.assertEqual(
1578 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1579 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001580 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1581 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001582 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1583 self.assertEqual(
1584 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1585 self.assertEqual(
1586 pat.split(string='abracadabra', maxsplit=1),
1587 ['', 'ab', 'racadabra'])
1588 self.assertEqual(
1589 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1590 (7, 9))
1591
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001592 def test_bug_20998(self):
1593 # Issue #20998: Fullmatch of repeated single character pattern
1594 # with ignore case.
1595 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1596
def test_locale_caching(self):
    """Run the locale-specific checks in both orders, purging the cache in between.

    The test is skipped unless both locales can be set; the original
    LC_CTYPE setting is restored by a cleanup.
    """
    # Issue #22410
    saved_locale = locale.setlocale(locale.LC_CTYPE)
    self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_locale)
    for required in ('en_US.iso88591', 'en_US.utf8'):
        try:
            locale.setlocale(locale.LC_CTYPE, required)
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs %s locale' % required)

    re.purge()
    self.check_en_US_iso88591()
    self.check_en_US_utf8()
    re.purge()
    self.check_en_US_utf8()
    self.check_en_US_iso88591()
1614
def check_en_US_iso88591(self):
    """Set LC_CTYPE to en_US.iso88591 and check LOCALE|IGNORECASE matching.

    Bytes 0xc5 and 0xe5 are expected to match each other in either
    direction, with flags passed as arguments and with inline flags.
    """
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    # (pattern body, subject) pairs, checked in this order for each form.
    pairs = (
        (b'\xc5\xe5', b'\xc5\xe5'),
        (b'\xc5', b'\xe5'),
        (b'\xe5', b'\xc5'),
    )
    for body, subject in pairs:
        self.assertTrue(re.match(body, subject, re.L|re.I))
    for body, subject in pairs:
        self.assertTrue(re.match(b'(?Li)' + body, subject))
1623
def check_en_US_utf8(self):
    """Set LC_CTYPE to en_US.utf8 and check LOCALE|IGNORECASE matching.

    The two-byte sequence must match itself, while the single bytes 0xc5
    and 0xe5 must not match each other.
    """
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    cross_pairs = ((b'\xc5', b'\xe5'), (b'\xe5', b'\xc5'))

    self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
    for body, subject in cross_pairs:
        self.assertIsNone(re.match(body, subject, re.L|re.I))

    self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
    for body, subject in cross_pairs:
        self.assertIsNone(re.match(b'(?Li)' + body, subject))
1632
Serhiy Storchakaad446d52014-11-10 13:49:00 +02001633 def test_error(self):
1634 with self.assertRaises(re.error) as cm:
1635 re.compile('(\u20ac))')
1636 err = cm.exception
1637 self.assertIsInstance(err.pattern, str)
1638 self.assertEqual(err.pattern, '(\u20ac))')
1639 self.assertEqual(err.pos, 3)
1640 self.assertEqual(err.lineno, 1)
1641 self.assertEqual(err.colno, 4)
1642 self.assertIn(err.msg, str(err))
1643 self.assertIn(' at position 3', str(err))
1644 self.assertNotIn(' at position 3', err.msg)
1645 # Bytes pattern
1646 with self.assertRaises(re.error) as cm:
1647 re.compile(b'(\xa4))')
1648 err = cm.exception
1649 self.assertIsInstance(err.pattern, bytes)
1650 self.assertEqual(err.pattern, b'(\xa4))')
1651 self.assertEqual(err.pos, 3)
1652 # Multiline pattern
1653 with self.assertRaises(re.error) as cm:
1654 re.compile("""
1655 (
1656 abc
1657 )
1658 )
1659 (
1660 """, re.VERBOSE)
1661 err = cm.exception
1662 self.assertEqual(err.pos, 77)
1663 self.assertEqual(err.lineno, 5)
1664 self.assertEqual(err.colno, 17)
1665 self.assertIn(err.msg, str(err))
1666 self.assertIn(' at position 77', str(err))
1667 self.assertIn('(line 5, column 17)', str(err))
1668
def test_misc_errors(self):
    """Assorted malformed patterns and their expected message and position."""
    cases = [
        # (pattern, expected message, expected position)
        (r'(', 'missing ), unterminated subpattern', 0),
        (r'((a|b)', 'missing ), unterminated subpattern', 0),
        (r'(a|b))', 'unbalanced parenthesis', 5),
        (r'(?P', 'unexpected end of pattern', 3),
        (r'(?z)', 'unknown extension ?z', 1),
        (r'(?iz)', 'unknown flag', 3),
        (r'(?i', 'missing )', 3),
        (r'(?#abc', 'missing ), unterminated comment', 0),
        (r'(?<', 'unexpected end of pattern', 3),
        (r'(?<>)', 'unknown extension ?<>', 1),
        (r'(?', 'unexpected end of pattern', 2),
    ]
    for pattern, message, position in cases:
        self.checkPatternError(pattern, message, position)
1681
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001682
class PatternReprTests(unittest.TestCase):
    """Checks of ``repr()`` for compiled pattern objects."""

    def check(self, pattern, expected):
        """Assert that compiling *pattern* gives an object whose repr is *expected*."""
        compiled = re.compile(pattern)
        self.assertEqual(repr(compiled), expected)

    def check_flags(self, pattern, flags, expected):
        """Like ``check()``, but compile *pattern* with explicit *flags*."""
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        expected = "re.compile('random pattern')"
        self.check('random pattern', expected)

    def test_single_flag(self):
        expected = "re.compile('random pattern', re.IGNORECASE)"
        self.check_flags('random pattern', re.IGNORECASE, expected)

    def test_multiple_flags(self):
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.check_flags('random pattern', re.I|re.S|re.X, expected)

    def test_unicode_flag(self):
        # The expected reprs contain no re.UNICODE entry.
        self.check_flags('random pattern', re.U,
                         "re.compile('random pattern')")
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL)")
        self.check_flags('random pattern', re.I|re.S|re.U, expected)

    def test_inline_flags(self):
        expected = "re.compile('(?i)pattern', re.IGNORECASE)"
        self.check('(?i)pattern', expected)

    def test_unknown_flags(self):
        # Flag bits without a name are expected as a hexadecimal number.
        self.check_flags('random pattern', 0x123000,
                         "re.compile('random pattern', 0x123000)")
        self.check_flags('random pattern', 0x123000|re.I,
                         "re.compile('random pattern', re.IGNORECASE|0x123000)")

    def test_bytes(self):
        self.check(b'bytes pattern',
                   "re.compile(b'bytes pattern')")
        self.check_flags(b'bytes pattern', re.A,
                         "re.compile(b'bytes pattern', re.ASCII)")

    def test_locale(self):
        expected = "re.compile(b'bytes pattern', re.LOCALE)"
        self.check_flags(b'bytes pattern', re.L, expected)

    def test_quotes(self):
        cases = (
            ('random "double quoted" pattern',
             '''re.compile('random "double quoted" pattern')'''),
            ("random 'single quoted' pattern",
             '''re.compile("random 'single quoted' pattern")'''),
            ('''both 'single' and "double" quotes''',
             '''re.compile('both \\'single\\' and "double" quotes')'''),
        )
        for pattern, expected in cases:
            self.check(pattern, expected)

    def test_long_pattern(self):
        # The repr of a very long pattern is expected to stay short while
        # keeping the start of the pattern and any trailing flags.
        pattern = 'Very %spattern' % ('long ' * 1000)
        expected_head = "re.compile('Very long long lon"

        plain = repr(re.compile(pattern))
        self.assertLess(len(plain), 300)
        self.assertEqual(plain[:30], expected_head)

        flagged = repr(re.compile(pattern, re.I))
        self.assertLess(len(flagged), 300)
        self.assertEqual(flagged[:30], expected_head)
        self.assertEqual(flagged[-16:], ", re.IGNORECASE)")
1747
1748
class ImplementationTest(unittest.TestCase):
    """
    Test implementation details of the re module.
    """

    def test_overlap_table(self):
        """Check ``sre_compile._generate_overlap_table`` on sample prefixes."""
        build_table = sre_compile._generate_overlap_table
        cases = [
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        ]
        for prefix, expected in cases:
            self.assertEqual(build_table(prefix), expected)
1762
1763
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001764class ExternalTests(unittest.TestCase):
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001765
def test_re_benchmarks(self):
    're_tests benchmarks'
    from test.re_tests import benchmarks
    padding = ' ' * 10000
    start = 10000
    for pattern, s in benchmarks:
        with self.subTest(pattern=pattern, string=s):
            p = re.compile(pattern)
            # The benchmark string on its own must match in full.
            self.assertTrue(p.search(s))
            self.assertTrue(p.match(s))
            self.assertTrue(p.fullmatch(s))
            # Embed the string between two runs of spaces and address
            # it through the pos/endpos arguments.
            s2 = padding + s + padding
            stop = start + len(s)
            self.assertTrue(p.search(s2))
            self.assertTrue(p.match(s2, start))
            self.assertTrue(p.match(s2, start, stop))
            self.assertTrue(p.fullmatch(s2, start, stop))
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001780
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001781 def test_re_tests(self):
1782 're_tests test suite'
1783 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
1784 for t in tests:
1785 pattern = s = outcome = repl = expected = None
1786 if len(t) == 5:
1787 pattern, s, outcome, repl, expected = t
1788 elif len(t) == 3:
1789 pattern, s, outcome = t
Guido van Rossum41360a41998-03-26 19:42:58 +00001790 else:
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001791 raise ValueError('Test tuples should have 3 or 5 fields', t)
1792
1793 with self.subTest(pattern=pattern, string=s):
1794 if outcome == SYNTAX_ERROR: # Expected a syntax error
1795 with self.assertRaises(re.error):
1796 re.compile(pattern)
1797 continue
1798
1799 obj = re.compile(pattern)
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001800 result = obj.search(s)
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001801 if outcome == FAIL:
1802 self.assertIsNone(result, 'Succeeded incorrectly')
1803 continue
1804
1805 with self.subTest():
1806 self.assertTrue(result, 'Failed incorrectly')
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001807 # Matched, as expected, so now we compute the
1808 # result string and compare it to our expected result.
1809 start, end = result.span(0)
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001810 vardict = {'found': result.group(0),
1811 'groups': result.group(),
1812 'flags': result.re.flags}
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001813 for i in range(1, 100):
1814 try:
1815 gi = result.group(i)
1816 # Special hack because else the string concat fails:
1817 if gi is None:
1818 gi = "None"
1819 except IndexError:
1820 gi = "Error"
1821 vardict['g%d' % i] = gi
1822 for i in result.re.groupindex.keys():
1823 try:
1824 gi = result.group(i)
1825 if gi is None:
1826 gi = "None"
1827 except IndexError:
1828 gi = "Error"
1829 vardict[i] = gi
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001830 self.assertEqual(eval(repl, vardict), expected,
1831 'grouping error')
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001832
Antoine Pitrou22628c42008-07-22 17:53:22 +00001833 # Try the match with both pattern and string converted to
1834 # bytes, and check that it still succeeds.
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001835 try:
Antoine Pitrou22628c42008-07-22 17:53:22 +00001836 bpat = bytes(pattern, "ascii")
1837 bs = bytes(s, "ascii")
1838 except UnicodeEncodeError:
1839 # skip non-ascii tests
1840 pass
1841 else:
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001842 with self.subTest('bytes pattern match'):
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001843 obj = re.compile(bpat)
1844 self.assertTrue(obj.search(bs))
1845
1846 # Try the match with LOCALE enabled, and check that it
1847 # still succeeds.
1848 with self.subTest('locale-sensitive match'):
1849 obj = re.compile(bpat, re.LOCALE)
1850 result = obj.search(bs)
1851 if result is None:
1852 print('=== Fails on locale-sensitive match', t)
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001853
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001854 # Try the match with the search area limited to the extent
1855 # of the match and see if it still succeeds. \B will
1856 # break (because it won't match at the end or start of a
1857 # string), so we'll ignore patterns that feature it.
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001858 if (pattern[:2] != r'\B' and pattern[-2:] != r'\B'
1859 and result is not None):
1860 with self.subTest('range-limited match'):
1861 obj = re.compile(pattern)
1862 self.assertTrue(obj.search(s, start, end + 1))
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001863
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001864 # Try the match with IGNORECASE enabled, and check that it
1865 # still succeeds.
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001866 with self.subTest('case-insensitive match'):
1867 obj = re.compile(pattern, re.IGNORECASE)
1868 self.assertTrue(obj.search(s))
Guido van Rossumdfa67901997-12-08 17:12:06 +00001869
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001870 # Try the match with UNICODE locale enabled, and check
1871 # that it still succeeds.
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001872 with self.subTest('unicode-sensitive match'):
1873 obj = re.compile(pattern, re.UNICODE)
1874 self.assertTrue(obj.search(s))
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001875
Gregory P. Smith5a631832010-07-27 05:31:29 +00001876
# When this file is executed as a script, hand control to unittest's
# command-line runner.
if __name__ == "__main__":
    unittest.main()