blob: 24a0604948e0aa3b6440a11eb54fcdb15844480c [file] [log] [blame]
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
Antoine Pitroud2cc7432014-02-03 20:59:59 +01002 cpython_only, captured_stdout
Benjamin Petersone48944b2012-03-07 14:50:25 -06003import io
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02004import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00005import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00006from re import Scanner
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02007import sre_compile
R David Murray26dfaac92013-04-14 13:00:54 -04008import sre_constants
Ezio Melottid2114eb2011-03-25 14:08:44 +02009import sys
10import string
11import traceback
Antoine Pitrou79aa68d2013-10-25 21:36:10 +020012import unittest
Raymond Hettinger027bb632004-05-31 03:09:25 +000013from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000014
# Misc tests from Tim Peters' re.doc

# WARNING: Don't change details in these tests if you don't know
# what you're doing. Some of these tests were carefully modeled to
# cover most of the code.
20
class S(str):
    """str subclass whose item/slice access returns S instances again."""

    def __getitem__(self, index):
        # Re-wrap whatever str.__getitem__ produced so the subclass survives.
        piece = super().__getitem__(index)
        return S(piece)
24
class B(bytes):
    """bytes subclass whose item/slice access result is wrapped in B again."""

    def __getitem__(self, index):
        # Re-wrap whatever bytes.__getitem__ produced in the subclass.
        piece = super().__getitem__(index)
        return B(piece)
28
Skip Montanaro8ed06da2003-04-24 19:43:18 +000029class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000030
Serhiy Storchaka25324972013-10-16 12:46:28 +030031 def assertTypedEqual(self, actual, expect, msg=None):
32 self.assertEqual(actual, expect, msg)
33 def recurse(actual, expect):
34 if isinstance(expect, (tuple, list)):
35 for x, y in zip(actual, expect):
36 recurse(x, y)
37 else:
38 self.assertIs(type(actual), type(expect), msg)
39 recurse(actual, expect)
40
Serhiy Storchaka632a77e2015-03-25 21:03:47 +020041 def checkPatternError(self, pattern, errmsg, pos=None):
42 with self.assertRaises(re.error) as cm:
43 re.compile(pattern)
44 with self.subTest(pattern=pattern):
45 err = cm.exception
46 self.assertEqual(err.msg, errmsg)
47 if pos is not None:
48 self.assertEqual(err.pos, pos)
49
50 def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
51 with self.assertRaises(re.error) as cm:
52 re.sub(pattern, repl, string)
53 with self.subTest(pattern=pattern, repl=repl):
54 err = cm.exception
55 self.assertEqual(err.msg, errmsg)
56 if pos is not None:
57 self.assertEqual(err.pos, pos)
58
Benjamin Petersone48944b2012-03-07 14:50:25 -060059 def test_keep_buffer(self):
60 # See bug 14212
61 b = bytearray(b'x')
62 it = re.finditer(b'a', b)
63 with self.assertRaises(BufferError):
64 b.extend(b'x'*400)
65 list(it)
66 del it
67 gc_collect()
68 b.extend(b'x'*400)
69
Raymond Hettinger027bb632004-05-31 03:09:25 +000070 def test_weakref(self):
71 s = 'QabbbcR'
72 x = re.compile('ab+c')
73 y = proxy(x)
74 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
75
Skip Montanaro8ed06da2003-04-24 19:43:18 +000076 def test_search_star_plus(self):
77 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
78 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
79 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
80 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030081 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000082 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
83 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
84 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
85 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030086 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000087
Skip Montanaro8ed06da2003-04-24 19:43:18 +000088 def bump_num(self, matchobj):
Guido van Rossum41360a41998-03-26 19:42:58 +000089 int_value = int(matchobj.group(0))
90 return str(int_value + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000091
Skip Montanaro8ed06da2003-04-24 19:43:18 +000092 def test_basic_re_sub(self):
Serhiy Storchaka25324972013-10-16 12:46:28 +030093 self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
94 self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
95 self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
96 self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
97 self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
98 self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
Serhiy Storchaka9eabac62013-10-26 10:45:48 +030099 for y in ("\xe0", "\u0430", "\U0001d49c"):
100 self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')
Serhiy Storchaka25324972013-10-16 12:46:28 +0300101
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000102 self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
103 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
104 '9.3 -3 24x100y')
Victor Stinner55e614a2014-10-29 16:58:59 +0100105 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000106 '9.3 -3 23x99y')
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000107
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000108 self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
109 self.assertEqual(re.sub('.', r"\n", 'x'), '\n')
Guido van Rossumdfa67901997-12-08 17:12:06 +0000110
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000111 s = r"\1\1"
112 self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
113 self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
114 self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)
Guido van Rossum23b22571997-07-17 22:36:14 +0000115
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000116 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
117 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
118 self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
119 self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')
Guido van Rossum49946571997-07-18 04:26:25 +0000120
Serhiy Storchakaa54aae02015-03-24 22:58:14 +0200121 self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
122 self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
123 self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
124 (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
125 for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
126 with self.subTest(c):
Serhiy Storchaka9bd85b82016-06-11 19:15:00 +0300127 with self.assertRaises(re.error):
Serhiy Storchakaa54aae02015-03-24 22:58:14 +0200128 self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)
Guido van Rossum95e80531997-08-13 22:34:14 +0000129
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000130 self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000131
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000132 def test_bug_449964(self):
133 # fails for group followed by other escape
134 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
135 'xx\bxx\b')
136
137 def test_bug_449000(self):
138 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000139 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
140 'abc\ndef\n')
141 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
142 'abc\ndef\n')
143 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
144 'abc\ndef\n')
145 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
146 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000147
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000148 def test_bug_1661(self):
149 # Verify that flags do not get silently ignored with compiled patterns
150 pattern = re.compile('.')
151 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
152 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
153 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
154 self.assertRaises(ValueError, re.compile, pattern, re.I)
155
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000156 def test_bug_3629(self):
157 # A regex that triggered a bug in the sre-code validator
158 re.compile("(?P<quote>)(?(quote))")
159
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000160 def test_sub_template_numeric_escape(self):
161 # bug 776311 and friends
162 self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
163 self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
164 self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
165 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
166 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
167 self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
168 self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200169 self.assertEqual(re.sub('x', r'\377', 'x'), '\377')
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000170
171 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
172 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
173
174 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
175 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
176 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
177 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
178 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
179
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200180 self.checkTemplateError('x', r'\400', 'x',
181 r'octal escape value \400 outside of '
182 r'range 0-0o377', 0)
183 self.checkTemplateError('x', r'\777', 'x',
184 r'octal escape value \777 outside of '
185 r'range 0-0o377', 0)
Tim Peters0e9980f2004-09-12 03:49:31 +0000186
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200187 self.checkTemplateError('x', r'\1', 'x', 'invalid group reference')
188 self.checkTemplateError('x', r'\8', 'x', 'invalid group reference')
189 self.checkTemplateError('x', r'\9', 'x', 'invalid group reference')
190 self.checkTemplateError('x', r'\11', 'x', 'invalid group reference')
191 self.checkTemplateError('x', r'\18', 'x', 'invalid group reference')
192 self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference')
193 self.checkTemplateError('x', r'\90', 'x', 'invalid group reference')
194 self.checkTemplateError('x', r'\99', 'x', 'invalid group reference')
195 self.checkTemplateError('x', r'\118', 'x', 'invalid group reference') # r'\11' + '8'
196 self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference')
197 self.checkTemplateError('x', r'\181', 'x', 'invalid group reference') # r'\18' + '1'
198 self.checkTemplateError('x', r'\800', 'x', 'invalid group reference') # r'\80' + '0'
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000199
200 # in python2.3 (etc), these loop endlessly in sre_parser.py
201 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
202 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
203 'xz8')
204 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
205 'xza')
206
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000207 def test_qualified_re_sub(self):
208 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
Victor Stinner55e614a2014-10-29 16:58:59 +0100209 self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000210
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000211 def test_bug_114660(self):
212 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
213 'hello there')
214
215 def test_bug_462270(self):
216 # Test for empty sub() behaviour, see SF bug #462270
217 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
218 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
219
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200220 def test_symbolic_groups(self):
221 re.compile('(?P<a>x)(?P=a)(?(a)y)')
222 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300223 re.compile('(?P<a1>x)\1(?(1)y)')
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200224 self.checkPatternError('(?P<a>)(?P<a>)',
225 "redefinition of group name 'a' as group 2; "
226 "was group 1")
Serhiy Storchaka485407c2015-07-18 23:27:00 +0300227 self.checkPatternError('(?P<a>(?P=a))',
228 "cannot refer to an open group", 10)
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200229 self.checkPatternError('(?Pxy)', 'unknown extension ?Px')
230 self.checkPatternError('(?P<a>)(?P=a', 'missing ), unterminated name', 11)
231 self.checkPatternError('(?P=', 'missing group name', 4)
232 self.checkPatternError('(?P=)', 'missing group name', 4)
233 self.checkPatternError('(?P=1)', "bad character in group name '1'", 4)
234 self.checkPatternError('(?P=a)', "unknown group name 'a'")
235 self.checkPatternError('(?P=a1)', "unknown group name 'a1'")
236 self.checkPatternError('(?P=a.)', "bad character in group name 'a.'", 4)
237 self.checkPatternError('(?P<)', 'missing >, unterminated name', 4)
238 self.checkPatternError('(?P<a', 'missing >, unterminated name', 4)
239 self.checkPatternError('(?P<', 'missing group name', 4)
240 self.checkPatternError('(?P<>)', 'missing group name', 4)
241 self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
242 self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
243 self.checkPatternError(r'(?(', 'missing group name', 3)
244 self.checkPatternError(r'(?())', 'missing group name', 3)
245 self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
246 self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
247 self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
248 self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
Georg Brandl1d472b72013-04-14 11:40:00 +0200249 # New valid/invalid identifiers in Python 3
250 re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
251 re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200252 self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300253 # Support > 100 groups.
254 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
255 pat = '(?:%s)(?(200)z|t)' % pat
256 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200257
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000258 def test_symbolic_refs(self):
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200259 self.checkTemplateError('(?P<a>x)', '\g<a', 'xx',
260 'missing >, unterminated name', 3)
261 self.checkTemplateError('(?P<a>x)', '\g<', 'xx',
262 'missing group name', 3)
263 self.checkTemplateError('(?P<a>x)', '\g', 'xx', 'missing <', 2)
264 self.checkTemplateError('(?P<a>x)', '\g<a a>', 'xx',
265 "bad character in group name 'a a'", 3)
266 self.checkTemplateError('(?P<a>x)', '\g<>', 'xx',
267 'missing group name', 3)
268 self.checkTemplateError('(?P<a>x)', '\g<1a1>', 'xx',
269 "bad character in group name '1a1'", 3)
270 self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
271 'invalid group reference')
272 self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
273 'invalid group reference')
274 with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
275 re.sub('(?P<a>x)', '\g<ab>', 'xx')
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300276 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
277 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200278 self.checkTemplateError('(?P<a>x)', '\g<-1>', 'xx',
279 "bad character in group name '-1'", 3)
Georg Brandl1d472b72013-04-14 11:40:00 +0200280 # New valid/invalid identifiers in Python 3
281 self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
282 self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200283 self.checkTemplateError('(?P<a>x)', '\g<©>', 'xx',
284 "bad character in group name '©'", 3)
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300285 # Support > 100 groups.
286 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
287 self.assertEqual(re.sub(pat, '\g<200>', 'xc8yzxc8y'), 'c8zc8')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000288
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000289 def test_re_subn(self):
290 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
291 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
292 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
293 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
Victor Stinner55e614a2014-10-29 16:58:59 +0100294 self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000295
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000296 def test_re_split(self):
Serhiy Storchaka25324972013-10-16 12:46:28 +0300297 for string in ":a:b::c", S(":a:b::c"):
298 self.assertTypedEqual(re.split(":", string),
299 ['', 'a', 'b', '', 'c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200300 self.assertTypedEqual(re.split(":+", string),
Serhiy Storchaka25324972013-10-16 12:46:28 +0300301 ['', 'a', 'b', 'c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200302 self.assertTypedEqual(re.split("(:+)", string),
Serhiy Storchaka25324972013-10-16 12:46:28 +0300303 ['', ':', 'a', ':', 'b', '::', 'c'])
304 for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
305 memoryview(b":a:b::c")):
306 self.assertTypedEqual(re.split(b":", string),
307 [b'', b'a', b'b', b'', b'c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200308 self.assertTypedEqual(re.split(b":+", string),
Serhiy Storchaka25324972013-10-16 12:46:28 +0300309 [b'', b'a', b'b', b'c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200310 self.assertTypedEqual(re.split(b"(:+)", string),
Serhiy Storchaka25324972013-10-16 12:46:28 +0300311 [b'', b':', b'a', b':', b'b', b'::', b'c'])
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300312 for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
313 "\U0001d49c\U0001d49e\U0001d4b5"):
314 string = ":%s:%s::%s" % (a, b, c)
315 self.assertEqual(re.split(":", string), ['', a, b, '', c])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200316 self.assertEqual(re.split(":+", string), ['', a, b, c])
317 self.assertEqual(re.split("(:+)", string),
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300318 ['', ':', a, ':', b, '::', c])
Serhiy Storchaka25324972013-10-16 12:46:28 +0300319
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200320 self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
321 self.assertEqual(re.split("(:)+", ":a:b::c"),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000322 ['', ':', 'a', ':', 'b', ':', 'c'])
323 self.assertEqual(re.split("([b:]+)", ":a:b::c"),
324 ['', ':', 'a', ':b::', 'c'])
325 self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
326 ['', None, ':', 'a', None, ':', '', 'b', None, '',
327 None, '::', 'c'])
328 self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
329 ['', 'a', '', '', 'c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000330
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200331 for sep, expected in [
332 (':*', ['', 'a', 'b', 'c']),
333 ('(?::*)', ['', 'a', 'b', 'c']),
334 ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
335 ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
336 ]:
337 with self.subTest(sep=sep), self.assertWarns(FutureWarning):
338 self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
339
340 for sep, expected in [
341 ('', [':a:b::c']),
342 (r'\b', [':a:b::c']),
343 (r'(?=:)', [':a:b::c']),
344 (r'(?<=:)', [':a:b::c']),
345 ]:
346 with self.subTest(sep=sep), self.assertRaises(ValueError):
347 self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
348
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000349 def test_qualified_re_split(self):
Victor Stinner55e614a2014-10-29 16:58:59 +0100350 self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
351 self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
352 self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000353 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200354 self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000355 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200356 with self.assertWarns(FutureWarning):
357 self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
358 ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000359
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000360 def test_re_findall(self):
361 self.assertEqual(re.findall(":+", "abc"), [])
Serhiy Storchaka25324972013-10-16 12:46:28 +0300362 for string in "a:b::c:::d", S("a:b::c:::d"):
363 self.assertTypedEqual(re.findall(":+", string),
364 [":", "::", ":::"])
365 self.assertTypedEqual(re.findall("(:+)", string),
366 [":", "::", ":::"])
367 self.assertTypedEqual(re.findall("(:)(:*)", string),
368 [(":", ""), (":", ":"), (":", "::")])
369 for string in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
370 memoryview(b"a:b::c:::d")):
371 self.assertTypedEqual(re.findall(b":+", string),
372 [b":", b"::", b":::"])
373 self.assertTypedEqual(re.findall(b"(:+)", string),
374 [b":", b"::", b":::"])
375 self.assertTypedEqual(re.findall(b"(:)(:*)", string),
376 [(b":", b""), (b":", b":"), (b":", b"::")])
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300377 for x in ("\xe0", "\u0430", "\U0001d49c"):
378 xx = x * 2
379 xxx = x * 3
380 string = "a%sb%sc%sd" % (x, xx, xxx)
381 self.assertEqual(re.findall("%s+" % x, string), [x, xx, xxx])
382 self.assertEqual(re.findall("(%s+)" % x, string), [x, xx, xxx])
383 self.assertEqual(re.findall("(%s)(%s*)" % (x, x), string),
384 [(x, ""), (x, x), (x, xx)])
Guido van Rossum49946571997-07-18 04:26:25 +0000385
Skip Montanaro5ba00542003-04-25 16:00:14 +0000386 def test_bug_117612(self):
387 self.assertEqual(re.findall(r"(a|(b))", "aba"),
388 [("a", ""),("b", "b"),("a", "")])
389
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000390 def test_re_match(self):
Serhiy Storchaka25324972013-10-16 12:46:28 +0300391 for string in 'a', S('a'):
392 self.assertEqual(re.match('a', string).groups(), ())
393 self.assertEqual(re.match('(a)', string).groups(), ('a',))
394 self.assertEqual(re.match('(a)', string).group(0), 'a')
395 self.assertEqual(re.match('(a)', string).group(1), 'a')
396 self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a'))
397 for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
398 self.assertEqual(re.match(b'a', string).groups(), ())
399 self.assertEqual(re.match(b'(a)', string).groups(), (b'a',))
400 self.assertEqual(re.match(b'(a)', string).group(0), b'a')
401 self.assertEqual(re.match(b'(a)', string).group(1), b'a')
402 self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300403 for a in ("\xe0", "\u0430", "\U0001d49c"):
404 self.assertEqual(re.match(a, a).groups(), ())
405 self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
406 self.assertEqual(re.match('(%s)' % a, a).group(0), a)
407 self.assertEqual(re.match('(%s)' % a, a).group(1), a)
408 self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))
Guido van Rossum49946571997-07-18 04:26:25 +0000409
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000410 pat = re.compile('((a)|(b))(c)?')
411 self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
412 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
413 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
414 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
415 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
Guido van Rossum8430c581998-04-03 21:47:12 +0000416
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000417 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
418 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
419 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
420 (None, 'b', None))
421 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000422
Serhiy Storchaka977b3ac2016-06-18 16:48:07 +0300423 def test_group(self):
424 class Index:
425 def __init__(self, value):
426 self.value = value
427 def __index__(self):
428 return self.value
429 # A single group
430 m = re.match('(a)(b)', 'ab')
431 self.assertEqual(m.group(), 'ab')
432 self.assertEqual(m.group(0), 'ab')
433 self.assertEqual(m.group(1), 'a')
434 self.assertEqual(m.group(Index(1)), 'a')
435 self.assertRaises(IndexError, m.group, -1)
436 self.assertRaises(IndexError, m.group, 3)
437 self.assertRaises(IndexError, m.group, 1<<1000)
438 self.assertRaises(IndexError, m.group, Index(1<<1000))
439 self.assertRaises(IndexError, m.group, 'x')
440 # Multiple groups
441 self.assertEqual(m.group(2, 1), ('b', 'a'))
442 self.assertEqual(m.group(Index(2), Index(1)), ('b', 'a'))
443
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200444 def test_re_fullmatch(self):
445 # Issue 16203: Proposal: add re.fullmatch() method.
446 self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
447 for string in "ab", S("ab"):
448 self.assertEqual(re.fullmatch(r"a|ab", string).span(), (0, 2))
449 for string in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
450 self.assertEqual(re.fullmatch(br"a|ab", string).span(), (0, 2))
451 for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
452 r = r"%s|%s" % (a, a + b)
453 self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2))
454 self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3))
455 self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3))
456 self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2))
457 self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3))
458 self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
459 self.assertIsNone(re.fullmatch(r"a+", "ab"))
460 self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
461 self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
462 self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
463 self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
464 self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4))
465 self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2))
466
467 self.assertEqual(
468 re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
469 self.assertEqual(
470 re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
471 self.assertEqual(
472 re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
473
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000474 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000475 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
476 ('(', 'a'))
477 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
478 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300479 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
480 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000481 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
482 ('a', 'b'))
483 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
484 (None, 'd'))
485 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
486 (None, 'd'))
487 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
488 ('a', ''))
489
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000490 # Tests for bug #1177831: exercise groups other than the first group
491 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
492 self.assertEqual(p.match('abc').groups(),
493 ('a', 'b', 'c'))
494 self.assertEqual(p.match('ad').groups(),
495 ('a', None, 'd'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300496 self.assertIsNone(p.match('abd'))
497 self.assertIsNone(p.match('ac'))
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000498
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300499 # Support > 100 groups.
500 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
501 pat = '(?:%s)(?(200)z)' % pat
502 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000503
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200504 self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
505 self.checkPatternError(r'()(?(1)a|b',
506 'missing ), unterminated subpattern', 2)
507 self.checkPatternError(r'()(?(1)a|b|c)',
508 'conditional backref with more than '
509 'two branches', 10)
510
511 def test_re_groupref_overflow(self):
512 self.checkTemplateError('()', '\g<%s>' % sre_constants.MAXGROUPS, 'xx',
513 'invalid group reference', 3)
514 self.checkPatternError(r'(?P<a>)(?(%d))' % sre_constants.MAXGROUPS,
515 'invalid group reference', 10)
516
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000517 def test_re_groupref(self):
518 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
519 ('|', 'a'))
520 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
521 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300522 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
523 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000524 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
525 ('a', 'a'))
526 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
527 (None, None))
528
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200529 self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
530
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000531 def test_groupdict(self):
532 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
533 'first second').groupdict(),
534 {'first':'first', 'second':'second'})
535
536 def test_expand(self):
537 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
538 "first second")
539 .expand(r"\2 \1 \g<second> \g<first>"),
540 "second first second first")
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300541 self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
542 "first")
543 .expand(r"\2 \g<second>"),
544 " ")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000545
546 def test_repeat_minmax(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300547 self.assertIsNone(re.match("^(\w){1}$", "abc"))
548 self.assertIsNone(re.match("^(\w){1}?$", "abc"))
549 self.assertIsNone(re.match("^(\w){1,2}$", "abc"))
550 self.assertIsNone(re.match("^(\w){1,2}?$", "abc"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000551
552 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
553 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
554 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
555 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
556 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
557 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
558 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
559 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
560
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300561 self.assertIsNone(re.match("^x{1}$", "xxx"))
562 self.assertIsNone(re.match("^x{1}?$", "xxx"))
563 self.assertIsNone(re.match("^x{1,2}$", "xxx"))
564 self.assertIsNone(re.match("^x{1,2}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000565
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300566 self.assertTrue(re.match("^x{3}$", "xxx"))
567 self.assertTrue(re.match("^x{1,3}$", "xxx"))
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200568 self.assertTrue(re.match("^x{3,3}$", "xxx"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300569 self.assertTrue(re.match("^x{1,4}$", "xxx"))
570 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
571 self.assertTrue(re.match("^x{3}?$", "xxx"))
572 self.assertTrue(re.match("^x{1,3}?$", "xxx"))
573 self.assertTrue(re.match("^x{1,4}?$", "xxx"))
574 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000575
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300576 self.assertIsNone(re.match("^x{}$", "xxx"))
577 self.assertTrue(re.match("^x{}$", "x{}"))
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000578
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200579 self.checkPatternError(r'x{2,1}',
580 'min repeat greater than max repeat', 2)
581
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000582 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000583 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000584 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000585 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
586 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
587 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
588 {'first': 1, 'other': 2})
589
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000590 self.assertEqual(re.match("(a)", "a").pos, 0)
591 self.assertEqual(re.match("(a)", "a").endpos, 1)
592 self.assertEqual(re.match("(a)", "a").string, "a")
593 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300594 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000595
Serhiy Storchaka07360df2015-03-30 01:01:48 +0300596 # Issue 14260. groupindex should be non-modifiable mapping.
597 p = re.compile(r'(?i)(?P<first>a)(?P<other>b)')
598 self.assertEqual(sorted(p.groupindex), ['first', 'other'])
599 self.assertEqual(p.groupindex['other'], 2)
600 with self.assertRaises(TypeError):
601 p.groupindex['other'] = 0
602 self.assertEqual(p.groupindex['other'], 2)
603
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000604 def test_special_escapes(self):
605 self.assertEqual(re.search(r"\b(b.)\b",
606 "abcd abc bcd bx").group(1), "bx")
607 self.assertEqual(re.search(r"\B(b.)\B",
608 "abc bcd bc abxd").group(1), "bx")
609 self.assertEqual(re.search(r"\b(b.)\b",
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300610 "abcd abc bcd bx", re.ASCII).group(1), "bx")
611 self.assertEqual(re.search(r"\B(b.)\B",
612 "abc bcd bc abxd", re.ASCII).group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000613 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
614 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300615 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300616 self.assertEqual(re.search(br"\b(b.)\b",
617 b"abcd abc bcd bx").group(1), b"bx")
618 self.assertEqual(re.search(br"\B(b.)\B",
619 b"abc bcd bc abxd").group(1), b"bx")
620 self.assertEqual(re.search(br"\b(b.)\b",
621 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
622 self.assertEqual(re.search(br"\B(b.)\B",
623 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
624 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
625 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300626 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000627 self.assertEqual(re.search(r"\d\D\w\W\s\S",
628 "1aa! a").group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300629 self.assertEqual(re.search(br"\d\D\w\W\s\S",
630 b"1aa! a").group(0), b"1aa! a")
631 self.assertEqual(re.search(r"\d\D\w\W\s\S",
632 "1aa! a", re.ASCII).group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300633 self.assertEqual(re.search(br"\d\D\w\W\s\S",
634 b"1aa! a", re.LOCALE).group(0), b"1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000635
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200636 def test_other_escapes(self):
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200637 self.checkPatternError("\\", 'bad escape (end of pattern)', 0)
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200638 self.assertEqual(re.match(r"\(", '(').group(), '(')
639 self.assertIsNone(re.match(r"\(", ')'))
640 self.assertEqual(re.match(r"\\", '\\').group(), '\\')
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200641 self.assertEqual(re.match(r"[\]]", ']').group(), ']')
642 self.assertIsNone(re.match(r"[\]]", '['))
643 self.assertEqual(re.match(r"[a\-c]", '-').group(), '-')
644 self.assertIsNone(re.match(r"[a\-c]", 'b'))
645 self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
646 self.assertIsNone(re.match(r"[\^a]+", 'b'))
Serhiy Storchakaa54aae02015-03-24 22:58:14 +0200647 re.purge() # for warnings
648 for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY':
649 with self.subTest(c):
Serhiy Storchaka9bd85b82016-06-11 19:15:00 +0300650 self.assertRaises(re.error, re.compile, '\\%c' % c)
Serhiy Storchakaa54aae02015-03-24 22:58:14 +0200651 for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
652 with self.subTest(c):
Serhiy Storchaka9bd85b82016-06-11 19:15:00 +0300653 self.assertRaises(re.error, re.compile, '[\\%c]' % c)
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200654
Ezio Melotti5a045b92012-02-29 11:48:44 +0200655 def test_string_boundaries(self):
656 # See http://bugs.python.org/issue10713
657 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
658 "abc")
659 # There's a word boundary at the start of a string.
660 self.assertTrue(re.match(r"\b", "abc"))
661 # A non-empty string includes a non-boundary zero-length match.
662 self.assertTrue(re.search(r"\B", "abc"))
663 # There is no non-boundary match at the start of a string.
664 self.assertFalse(re.match(r"\B", "abc"))
665 # However, an empty string contains no word boundaries, and also no
666 # non-boundaries.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300667 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200668 # This one is questionable and different from the perlre behaviour,
669 # but describes current behavior.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300670 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200671 # A single word-character string has two boundaries, but no
672 # non-boundary gaps.
673 self.assertEqual(len(re.findall(r"\b", "a")), 2)
674 self.assertEqual(len(re.findall(r"\B", "a")), 0)
675 # If there are no words, there are no boundaries
676 self.assertEqual(len(re.findall(r"\b", " ")), 0)
677 self.assertEqual(len(re.findall(r"\b", " ")), 0)
678 # Can match around the whitespace.
679 self.assertEqual(len(re.findall(r"\B", " ")), 2)
680
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000681 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000682 self.assertEqual(re.match("([\u2222\u2223])",
683 "\u2222").group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300684 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300685 self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000686
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100687 def test_big_codesize(self):
688 # Issue #1160
689 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300690 self.assertTrue(r.match('1000'))
691 self.assertTrue(r.match('9999'))
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100692
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000693 def test_anyall(self):
694 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
695 "a\nb")
696 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
697 "a\n\nb")
698
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200699 def test_lookahead(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000700 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
701 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
702 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
703 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
704 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
705 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
706 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
707
708 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
709 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
710 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
711 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
712
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200713 # Group reference.
714 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
715 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
716 # Conditional group reference.
717 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
718 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
719 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
720 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
721 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
722 # Group used before defined.
723 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
724 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
725 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
726
727 def test_lookbehind(self):
728 self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
729 self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
730 self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
731 self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
732 # Group reference.
733 self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
734 self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
735 self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
736 self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
737 # Conditional group reference.
738 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
739 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
740 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
741 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
742 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
743 # Group used before defined.
744 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
745 self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc'))
746 self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc'))
747 # Group defined in the same lookbehind pattern
748 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)')
749 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)')
750 self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)')
751 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)')
752
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000753 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000754 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300755 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000756 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
757 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
758 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
759 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
760 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
761 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
762 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
763 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
764
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200765 assert '\u212a'.lower() == 'k' # 'K'
766 self.assertTrue(re.match(r'K', '\u212a', re.I))
767 self.assertTrue(re.match(r'k', '\u212a', re.I))
768 self.assertTrue(re.match(r'\u212a', 'K', re.I))
769 self.assertTrue(re.match(r'\u212a', 'k', re.I))
770 assert '\u017f'.upper() == 'S' # 'ſ'
771 self.assertTrue(re.match(r'S', '\u017f', re.I))
772 self.assertTrue(re.match(r's', '\u017f', re.I))
773 self.assertTrue(re.match(r'\u017f', 'S', re.I))
774 self.assertTrue(re.match(r'\u017f', 's', re.I))
775 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
776 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
777 self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
778
779 def test_ignore_case_set(self):
780 self.assertTrue(re.match(r'[19A]', 'A', re.I))
781 self.assertTrue(re.match(r'[19a]', 'a', re.I))
782 self.assertTrue(re.match(r'[19a]', 'A', re.I))
783 self.assertTrue(re.match(r'[19A]', 'a', re.I))
784 self.assertTrue(re.match(br'[19A]', b'A', re.I))
785 self.assertTrue(re.match(br'[19a]', b'a', re.I))
786 self.assertTrue(re.match(br'[19a]', b'A', re.I))
787 self.assertTrue(re.match(br'[19A]', b'a', re.I))
788 assert '\u212a'.lower() == 'k' # 'K'
789 self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
790 self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
791 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
792 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
793 assert '\u017f'.upper() == 'S' # 'ſ'
794 self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
795 self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
796 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
797 self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
798 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
799 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
800 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
801
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200802 def test_ignore_case_range(self):
803 # Issues #3511, #17381.
804 self.assertTrue(re.match(r'[9-a]', '_', re.I))
805 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
806 self.assertTrue(re.match(br'[9-a]', b'_', re.I))
807 self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
808 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
809 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
810 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
811 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
812 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
813 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
814 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
815 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
816 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
817 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
818 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
819 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
820
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200821 assert '\u212a'.lower() == 'k' # 'K'
822 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
823 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
824 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
825 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
826 assert '\u017f'.upper() == 'S' # 'ſ'
827 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
828 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
829 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
830 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
831 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
832 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
833 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
834
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000835 def test_category(self):
836 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
837
838 def test_getlower(self):
839 import _sre
840 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
841 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
842 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200843 self.assertEqual(_sre.getlower(ord('A'), re.ASCII), ord('a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000844
845 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300846 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200847 self.assertEqual(re.match("abc", "ABC", re.I|re.A).group(0), "ABC")
848 self.assertEqual(re.match(b"abc", b"ABC", re.I|re.L).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000849
850 def test_not_literal(self):
851 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
852 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
853
854 def test_search_coverage(self):
855 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
856 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
857
Ezio Melottid2114eb2011-03-25 14:08:44 +0200858 def assertMatch(self, pattern, text, match=None, span=None,
859 matcher=re.match):
860 if match is None and span is None:
861 # the pattern matches the whole text
862 match = text
863 span = (0, len(text))
864 elif match is None or span is None:
865 raise ValueError('If match is not None, span should be specified '
866 '(and vice versa).')
867 m = matcher(pattern, text)
868 self.assertTrue(m)
869 self.assertEqual(m.group(), match)
870 self.assertEqual(m.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000871
Ezio Melottid2114eb2011-03-25 14:08:44 +0200872 def test_re_escape(self):
Ezio Melotti88fdeb42011-04-10 12:59:16 +0300873 alnum_chars = string.ascii_letters + string.digits + '_'
Ezio Melottid2114eb2011-03-25 14:08:44 +0200874 p = ''.join(chr(i) for i in range(256))
875 for c in p:
876 if c in alnum_chars:
877 self.assertEqual(re.escape(c), c)
878 elif c == '\x00':
879 self.assertEqual(re.escape(c), '\\000')
880 else:
881 self.assertEqual(re.escape(c), '\\' + c)
882 self.assertMatch(re.escape(c), c)
883 self.assertMatch(re.escape(p), p)
Guido van Rossum49946571997-07-18 04:26:25 +0000884
Guido van Rossum698280d2008-09-10 17:44:35 +0000885 def test_re_escape_byte(self):
Ezio Melotti88fdeb42011-04-10 12:59:16 +0300886 alnum_chars = (string.ascii_letters + string.digits + '_').encode('ascii')
Ezio Melottid2114eb2011-03-25 14:08:44 +0200887 p = bytes(range(256))
888 for i in p:
Guido van Rossum698280d2008-09-10 17:44:35 +0000889 b = bytes([i])
Ezio Melottid2114eb2011-03-25 14:08:44 +0200890 if b in alnum_chars:
891 self.assertEqual(re.escape(b), b)
892 elif i == 0:
893 self.assertEqual(re.escape(b), b'\\000')
894 else:
895 self.assertEqual(re.escape(b), b'\\' + b)
896 self.assertMatch(re.escape(b), b)
897 self.assertMatch(re.escape(p), p)
Guido van Rossum698280d2008-09-10 17:44:35 +0000898
Ezio Melotti7b9e97b2011-03-25 14:09:33 +0200899 def test_re_escape_non_ascii(self):
900 s = 'xxx\u2620\u2620\u2620xxx'
901 s_escaped = re.escape(s)
902 self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
903 self.assertMatch(s_escaped, s)
904 self.assertMatch('.%s+.' % re.escape('\u2620'), s,
905 'x\u2620\u2620\u2620x', (2, 7), re.search)
906
907 def test_re_escape_non_ascii_bytes(self):
908 b = 'y\u2620y\u2620y'.encode('utf-8')
909 b_escaped = re.escape(b)
910 self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
911 self.assertMatch(b_escaped, b)
912 res = re.findall(re.escape('\u2620'.encode('utf-8')), b)
913 self.assertEqual(len(res), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000914
Serhiy Storchakab85a9762014-09-15 11:33:19 +0300915 def test_pickling(self):
916 import pickle
917 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
918 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
919 pickled = pickle.dumps(oldpat, proto)
920 newpat = pickle.loads(pickled)
921 self.assertEqual(newpat, oldpat)
922 # current pickle expects the _compile() reconstructor in re module
923 from re import _compile
Guido van Rossum23b22571997-07-17 22:36:14 +0000924
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000925 def test_constants(self):
926 self.assertEqual(re.I, re.IGNORECASE)
927 self.assertEqual(re.L, re.LOCALE)
928 self.assertEqual(re.M, re.MULTILINE)
929 self.assertEqual(re.S, re.DOTALL)
930 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000931
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000932 def test_flags(self):
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200933 for flag in [re.I, re.M, re.X, re.S, re.A, re.U]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300934 self.assertTrue(re.compile('^pattern$', flag))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200935 for flag in [re.I, re.M, re.X, re.S, re.A, re.L]:
936 self.assertTrue(re.compile(b'^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +0000937
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000938 def test_sre_character_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200939 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
940 if i < 256:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300941 self.assertTrue(re.match(r"\%03o" % i, chr(i)))
942 self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
943 self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
944 self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
945 self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
946 self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200947 if i < 0x10000:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300948 self.assertTrue(re.match(r"\u%04x" % i, chr(i)))
949 self.assertTrue(re.match(r"\u%04x0" % i, chr(i)+"0"))
950 self.assertTrue(re.match(r"\u%04xz" % i, chr(i)+"z"))
951 self.assertTrue(re.match(r"\U%08x" % i, chr(i)))
952 self.assertTrue(re.match(r"\U%08x0" % i, chr(i)+"0"))
953 self.assertTrue(re.match(r"\U%08xz" % i, chr(i)+"z"))
954 self.assertTrue(re.match(r"\0", "\000"))
955 self.assertTrue(re.match(r"\08", "\0008"))
956 self.assertTrue(re.match(r"\01", "\001"))
957 self.assertTrue(re.match(r"\018", "\0018"))
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200958 self.checkPatternError(r"\567",
959 r'octal escape value \567 outside of '
960 r'range 0-0o377', 0)
961 self.checkPatternError(r"\911", 'invalid group reference', 0)
962 self.checkPatternError(r"\x1", r'incomplete escape \x1', 0)
963 self.checkPatternError(r"\x1z", r'incomplete escape \x1', 0)
964 self.checkPatternError(r"\u123", r'incomplete escape \u123', 0)
965 self.checkPatternError(r"\u123z", r'incomplete escape \u123', 0)
966 self.checkPatternError(r"\U0001234", r'incomplete escape \U0001234', 0)
967 self.checkPatternError(r"\U0001234z", r'incomplete escape \U0001234', 0)
968 self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0)
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000969
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000970 def test_sre_character_class_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200971 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
972 if i < 256:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300973 self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
974 self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
975 self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
976 self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
977 self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
978 self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
979 self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
980 self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200981 if i < 0x10000:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300982 self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
983 self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
984 self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
985 self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
986 self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
987 self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200988 self.checkPatternError(r"[\567]",
989 r'octal escape value \567 outside of '
990 r'range 0-0o377', 1)
991 self.checkPatternError(r"[\911]", r'bad escape \9', 1)
992 self.checkPatternError(r"[\x1z]", r'incomplete escape \x1', 1)
993 self.checkPatternError(r"[\u123z]", r'incomplete escape \u123', 1)
994 self.checkPatternError(r"[\U0001234z]", r'incomplete escape \U0001234', 1)
995 self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1)
Serhiy Storchakac563caf2014-09-23 23:22:41 +0300996 self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200997
998 def test_sre_byte_literals(self):
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000999 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001000 self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
1001 self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
1002 self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
1003 self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
1004 self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
1005 self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
Serhiy Storchaka9bd85b82016-06-11 19:15:00 +03001006 self.assertRaises(re.error, re.compile, br"\u1234")
1007 self.assertRaises(re.error, re.compile, br"\U00012345")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001008 self.assertTrue(re.match(br"\0", b"\000"))
1009 self.assertTrue(re.match(br"\08", b"\0008"))
1010 self.assertTrue(re.match(br"\01", b"\001"))
1011 self.assertTrue(re.match(br"\018", b"\0018"))
Serhiy Storchaka632a77e2015-03-25 21:03:47 +02001012 self.checkPatternError(br"\567",
1013 r'octal escape value \567 outside of '
1014 r'range 0-0o377', 0)
1015 self.checkPatternError(br"\911", 'invalid group reference', 0)
1016 self.checkPatternError(br"\x1", r'incomplete escape \x1', 0)
1017 self.checkPatternError(br"\x1z", r'incomplete escape \x1', 0)
Antoine Pitrou463badf2012-06-23 13:29:19 +02001018
1019 def test_sre_byte_class_literals(self):
1020 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001021 self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i])))
1022 self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i])))
1023 self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i])))
1024 self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
1025 self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
1026 self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
1027 self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
1028 self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
Serhiy Storchaka9bd85b82016-06-11 19:15:00 +03001029 self.assertRaises(re.error, re.compile, br"[\u1234]")
1030 self.assertRaises(re.error, re.compile, br"[\U00012345]")
Serhiy Storchaka632a77e2015-03-25 21:03:47 +02001031 self.checkPatternError(br"[\567]",
1032 r'octal escape value \567 outside of '
1033 r'range 0-0o377', 1)
1034 self.checkPatternError(br"[\911]", r'bad escape \9', 1)
1035 self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1)
1036
1037 def test_character_set_errors(self):
1038 self.checkPatternError(r'[', 'unterminated character set', 0)
1039 self.checkPatternError(r'[^', 'unterminated character set', 0)
1040 self.checkPatternError(r'[a', 'unterminated character set', 0)
1041 # bug 545855 -- This pattern failed to cause a compile error as it
1042 # should, instead provoking a TypeError.
1043 self.checkPatternError(r"[a-", 'unterminated character set', 0)
1044 self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1)
1045 self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1)
1046 self.checkPatternError(r"[b-a]", 'bad character range b-a', 1)
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +00001047
Skip Montanaro7d9963f2003-04-25 14:12:40 +00001048 def test_bug_113254(self):
1049 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
1050 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
1051 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
1052
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001053 def test_bug_527371(self):
1054 # bug described in patches 527371/672491
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001055 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001056 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
1057 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
1058 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
1059 self.assertEqual(re.match("((a))", "a").lastindex, 1)
1060
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001061 def test_bug_418626(self):
1062 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
1063 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
1064 # pattern '*?' on a long string.
1065 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
1066 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
1067 20003)
1068 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001069 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +00001070 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001071 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001072
1073 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001074 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001075 self.assertEqual(re.compile(pat) and 1, 1)
1076
Skip Montanaro1e703c62003-04-25 15:40:28 +00001077 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001078 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +00001079 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001080 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
1081 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
1082 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +00001083
Serhiy Storchaka632a77e2015-03-25 21:03:47 +02001084 def test_nothing_to_repeat(self):
1085 for reps in '*', '+', '?', '{1,2}':
1086 for mod in '', '?':
1087 self.checkPatternError('%s%s' % (reps, mod),
1088 'nothing to repeat', 0)
1089 self.checkPatternError('(?:%s%s)' % (reps, mod),
1090 'nothing to repeat', 3)
1091
1092 def test_multiple_repeat(self):
1093 for outer_reps in '*', '+', '{1,2}':
1094 for outer_mod in '', '?':
1095 outer_op = outer_reps + outer_mod
1096 for inner_reps in '*', '+', '?', '{1,2}':
1097 for inner_mod in '', '?':
1098 inner_op = inner_reps + inner_mod
1099 self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
1100 'multiple repeat', 1 + len(inner_op))
1101
Serhiy Storchakafa468162013-02-16 21:23:53 +02001102 def test_unlimited_zero_width_repeat(self):
1103 # Issue #9669
1104 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
1105 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
1106 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
1107 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
1108 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
1109 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
1110
Skip Montanaro1e703c62003-04-25 15:40:28 +00001111 def test_scanner(self):
1112 def s_ident(scanner, token): return token
1113 def s_operator(scanner, token): return "op%s" % token
1114 def s_float(scanner, token): return float(token)
1115 def s_int(scanner, token): return int(token)
1116
1117 scanner = Scanner([
1118 (r"[a-zA-Z_]\w*", s_ident),
1119 (r"\d+\.\d*", s_float),
1120 (r"\d+", s_int),
1121 (r"=|\+|-|\*|/", s_operator),
1122 (r"\s+", None),
1123 ])
1124
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001125 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +00001126
Skip Montanaro1e703c62003-04-25 15:40:28 +00001127 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
1128 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
1129 'op+', 'bar'], ''))
1130
Skip Montanaro5ba00542003-04-25 16:00:14 +00001131 def test_bug_448951(self):
1132 # bug 448951 (similar to 429357, but with single char match)
1133 # (Also test greedy matches.)
1134 for op in '','?','*':
1135 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
1136 (None, None))
1137 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
1138 ('a:', 'a'))
1139
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001140 def test_bug_725106(self):
1141 # capturing groups in alternatives in repeats
1142 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
1143 ('b', 'a'))
1144 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
1145 ('c', 'b'))
1146 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
1147 ('b', None))
1148 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
1149 ('b', None))
1150 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
1151 ('b', 'a'))
1152 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
1153 ('c', 'b'))
1154 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
1155 ('b', None))
1156 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
1157 ('b', None))
1158
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +00001159 def test_bug_725149(self):
1160 # mark_stack_base restoring before restoring marks
1161 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
1162 ('a', None))
1163 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
1164 ('a', None, None))
1165
Just van Rossum12723ba2003-07-02 20:03:04 +00001166 def test_bug_764548(self):
1167 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001168 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +00001169 pat = re.compile(my_unicode("abc"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001170 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +00001171
Skip Montanaro5ba00542003-04-25 16:00:14 +00001172 def test_finditer(self):
1173 iter = re.finditer(r":+", "a:b::c:::d")
1174 self.assertEqual([item.group(0) for item in iter],
1175 [":", "::", ":::"])
1176
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001177 pat = re.compile(r":+")
1178 iter = pat.finditer("a:b::c:::d", 1, 10)
1179 self.assertEqual([item.group(0) for item in iter],
1180 [":", "::", ":::"])
1181
1182 pat = re.compile(r":+")
1183 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
1184 self.assertEqual([item.group(0) for item in iter],
1185 [":", "::", ":::"])
1186
1187 pat = re.compile(r":+")
1188 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1189 self.assertEqual([item.group(0) for item in iter],
1190 [":", "::", ":::"])
1191
1192 pat = re.compile(r":+")
1193 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1194 self.assertEqual([item.group(0) for item in iter],
1195 ["::", "::"])
1196
Thomas Wouters40a088d2008-03-18 20:19:54 +00001197 def test_bug_926075(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001198 self.assertIsNot(re.compile('bug_926075'),
1199 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +00001200
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001201 def test_bug_931848(self):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001202 pattern = "[\u002E\u3002\uFF0E\uFF61]"
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001203 self.assertEqual(re.compile(pattern).split("a.b.c"),
1204 ['a','b','c'])
1205
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001206 def test_bug_581080(self):
1207 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +00001208 self.assertEqual(next(iter).span(), (1,2))
1209 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001210
1211 scanner = re.compile(r"\s").scanner("a b")
1212 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001213 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001214
1215 def test_bug_817234(self):
1216 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +00001217 self.assertEqual(next(iter).span(), (0, 4))
1218 self.assertEqual(next(iter).span(), (4, 4))
1219 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001220
Mark Dickinson1f268282009-07-28 17:22:36 +00001221 def test_bug_6561(self):
1222 # '\d' should match characters in Unicode category 'Nd'
1223 # (Number, Decimal Digit), but not those in 'Nl' (Number,
1224 # Letter) or 'No' (Number, Other).
1225 decimal_digits = [
1226 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1227 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1228 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1229 ]
1230 for x in decimal_digits:
1231 self.assertEqual(re.match('^\d$', x).group(0), x)
1232
1233 not_decimal_digits = [
1234 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1235 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1236 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1237 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1238 ]
1239 for x in not_decimal_digits:
1240 self.assertIsNone(re.match('^\d$', x))
1241
Guido van Rossumd8faa362007-04-27 19:54:29 +00001242 def test_empty_array(self):
1243 # SF buf 1647541
1244 import array
Guido van Rossum166746c2007-07-03 15:39:16 +00001245 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +00001246 a = array.array(typecode)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001247 self.assertIsNone(re.compile(b"bla").match(a))
Antoine Pitroufd036452008-08-19 17:56:33 +00001248 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001249
Christian Heimes072c0f12008-01-03 23:01:04 +00001250 def test_inline_flags(self):
1251 # Bug #1700
Serhiy Storchakaab140882014-11-11 21:13:28 +02001252 upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
1253 lower_char = '\u1ea1' # Latin Small Letter A with Dot Below
Christian Heimes072c0f12008-01-03 23:01:04 +00001254
1255 p = re.compile(upper_char, re.I | re.U)
1256 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001257 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001258
1259 p = re.compile(lower_char, re.I | re.U)
1260 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001261 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001262
1263 p = re.compile('(?i)' + upper_char, re.U)
1264 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001265 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001266
1267 p = re.compile('(?i)' + lower_char, re.U)
1268 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001269 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001270
1271 p = re.compile('(?iu)' + upper_char)
1272 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001273 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001274
1275 p = re.compile('(?iu)' + lower_char)
1276 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001277 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001278
Christian Heimes25bb7832008-01-11 16:17:00 +00001279 def test_dollar_matches_twice(self):
1280 "$ matches the end of string, and just before the terminating \n"
1281 pattern = re.compile('$')
1282 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1283 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1284 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1285
1286 pattern = re.compile('$', re.MULTILINE)
1287 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1288 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1289 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1290
Antoine Pitroufd036452008-08-19 17:56:33 +00001291 def test_bytes_str_mixing(self):
1292 # Mixing str and bytes is disallowed
1293 pat = re.compile('.')
1294 bpat = re.compile(b'.')
1295 self.assertRaises(TypeError, pat.match, b'b')
1296 self.assertRaises(TypeError, bpat.match, 'b')
1297 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1298 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1299 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1300 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1301 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1302 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1303
def test_ascii_and_unicode_flag(self):
    """Check ASCII vs UNICODE matching and their incompatible combinations.

    Uses the pair U+00C0 / U+00E0 for ignore-case checks and U+00E0 for
    the \\w checks, on both str and bytes patterns.
    """
    # String patterns
    for flags in (0, re.UNICODE):
        pat = re.compile('\xc0', flags | re.IGNORECASE)
        self.assertTrue(pat.match('\xe0'))
        pat = re.compile(r'\w', flags)
        self.assertTrue(pat.match('\xe0'))
    pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
    self.assertIsNone(pat.match('\xe0'))
    pat = re.compile('(?a)\xc0', re.IGNORECASE)
    self.assertIsNone(pat.match('\xe0'))
    pat = re.compile(r'\w', re.ASCII)
    self.assertIsNone(pat.match('\xe0'))
    pat = re.compile(r'(?a)\w')
    self.assertIsNone(pat.match('\xe0'))
    # Bytes patterns
    for flags in (0, re.ASCII):
        pat = re.compile(b'\xc0', flags | re.IGNORECASE)
        self.assertIsNone(pat.match(b'\xe0'))
        pat = re.compile(br'\w', flags)
        self.assertIsNone(pat.match(b'\xe0'))
    # Incompatibilities
    self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE)
    self.assertRaises(ValueError, re.compile, br'(?u)\w')
    self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII)
    self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII)
    self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE)
    self.assertRaises(ValueError, re.compile, r'(?au)\w')
1332
def test_locale_flag(self):
    """Check LOCALE matching on bytes patterns and its incompatibilities.

    If the current LC_CTYPE encoding offers a non-ASCII single-byte
    letter whose lowercase form is a different single byte, that pair
    is used for the match assertions; otherwise the patterns are only
    compiled.
    """
    _, enc = locale.getlocale(locale.LC_CTYPE)
    # Search non-ASCII letter
    for i in range(128, 256):
        try:
            c = bytes([i]).decode(enc)
            sletter = c.lower()
            if sletter == c:
                continue
            bletter = sletter.encode(enc)
            if len(bletter) != 1:
                continue
            if bletter.decode(enc) != sletter:
                continue
            bpat = re.escape(bytes([i]))
            break
        except (UnicodeError, TypeError):
            # This byte cannot be converted with `enc`; try the next.
            pass
    else:
        # No usable letter was found in the whole range.
        bletter = None
        bpat = b'A'
    # Bytes patterns: (pattern, flags, whether bletter should match).
    cases = [
        (bpat, re.LOCALE | re.IGNORECASE, True),
        (b'(?L)' + bpat, re.IGNORECASE, True),
        (bpat, re.IGNORECASE, False),
        (br'\w', re.LOCALE, True),
        (br'(?L)\w', 0, True),
        (br'\w', 0, False),
    ]
    for pattern, flags, should_match in cases:
        # Always compile, even when there is nothing to match against.
        pat = re.compile(pattern, flags)
        if bletter:
            if should_match:
                self.assertTrue(pat.match(bletter))
            else:
                self.assertIsNone(pat.match(bletter))
    # Incompatibilities
    self.assertRaises(ValueError, re.compile, '', re.LOCALE)
    self.assertRaises(ValueError, re.compile, '(?L)')
    self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII)
    self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII)
    self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE)
    self.assertRaises(ValueError, re.compile, b'(?aL)')
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001378
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001379 def test_bug_6509(self):
1380 # Replacement strings of both types must parse properly.
1381 # all strings
1382 pat = re.compile('a(\w)')
1383 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1384 pat = re.compile('a(.)')
1385 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1386 pat = re.compile('..')
1387 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1388
1389 # all bytes
1390 pat = re.compile(b'a(\w)')
1391 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1392 pat = re.compile(b'a(.)')
1393 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1394 pat = re.compile(b'..')
1395 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1396
def test_dealloc(self):
    """Issue 3299: check for segfault in debug build."""
    import _sre
    # The overflow limit is different on wide and narrow builds and it
    # depends on the definition of SRE_CODE (see sre.h).
    # 2**128 should be big enough to overflow on both. For smaller values
    # a RuntimeError is raised instead of OverflowError.
    long_overflow = 2**128
    with self.assertRaises(TypeError):
        re.finditer("a", {})
    self.assertRaises(OverflowError, _sre.compile,
                      "abc", 0, [long_overflow], 0, [], [])
    self.assertRaises(TypeError, _sre.compile,
                      {}, 0, [], 0, [], [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 def test_search_dot_unicode(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001412 self.assertTrue(re.search("123.*-", '123abc-'))
1413 self.assertTrue(re.search("123.*-", '123\xe9-'))
1414 self.assertTrue(re.search("123.*-", '123\u20ac-'))
1415 self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1416 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417
def test_compile(self):
    """re.compile() accepts strings and patterns and rejects other types."""
    # Return value when given a string and a pattern as parameter.
    compiled = re.compile('random pattern')
    self.assertIsInstance(compiled, re._pattern_type)
    recompiled = re.compile(compiled)
    self.assertIsInstance(recompiled, re._pattern_type)
    # Compiling a pattern object hands back that very object.
    self.assertIs(recompiled, compiled)
    # Behaviour when not given a string or pattern as parameter.
    with self.assertRaises(TypeError):
        re.compile(0)
1427
@bigmemtest(size=_2G, memuse=1)
def test_large_search(self, size):
    """Issue #10182: indices were 32-bit-truncated."""
    subject = 'a' * size
    found = re.search('$', subject)
    self.assertIsNotNone(found)
    # '$' is found at the very end of the subject.
    self.assertEqual(found.start(), size)
    self.assertEqual(found.end(), size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001436
# The huge memuse is because re.sub() builds the replacement result
# with a list and a join().
@bigmemtest(size=_2G, memuse=16 + 2)
def test_large_subn(self, size):
    """Issue #10182: indices were 32-bit-truncated."""
    subject = 'a' * size
    result, count = re.subn('', '', subject)
    self.assertEqual(result, subject)
    # One empty match per character plus one at the end.
    self.assertEqual(count, size + 1)
1446
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001447 def test_bug_16688(self):
1448 # Issue 16688: Backreferences make case-insensitive regex fail on
1449 # non-ASCII strings.
1450 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1451 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001452
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001453 def test_repeat_minmax_overflow(self):
1454 # Issue #13169
1455 string = "x" * 100000
1456 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1457 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1458 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1459 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1460 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1461 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1462 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1463 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1464 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1465 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1466 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1467
@cpython_only
def test_repeat_minmax_overflow_maxrepeat(self):
    """Repeat counts just below MAXREPEAT compile; MAXREPEAT overflows."""
    try:
        from _sre import MAXREPEAT
    except ImportError:
        self.skipTest('requires _sre.MAXREPEAT constant')
    subject = "x" * 100000
    almost = MAXREPEAT - 1
    self.assertIsNone(re.match(r".{%d}" % almost, subject))
    # An upper bound that large simply consumes the whole subject.
    self.assertEqual(re.match(r".{,%d}" % almost, subject).span(),
                     (0, 100000))
    self.assertIsNone(re.match(r".{%d,}?" % almost, subject))
    for template in (r".{%d}", r".{,%d}", r".{%d,}?"):
        self.assertRaises(OverflowError, re.compile, template % MAXREPEAT)
1482
def test_backref_group_name_in_exception(self):
    """Issue 17341: a bad named-backreference error quotes the group name."""
    # Poor error message when compiling invalid regex.
    pattern = '(?P=<foo>)'
    message = "bad character in group name '<foo>'"
    self.checkPatternError(pattern, message, 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001487
def test_group_name_in_exception(self):
    """Issue 17341: a bad group-definition error quotes the group name."""
    # Poor error message when compiling invalid regex.
    pattern = '(?P<?foo>)'
    message = "bad character in group name '?foo'"
    self.checkPatternError(pattern, message, 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001492
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001493 def test_issue17998(self):
1494 for reps in '*', '+', '?', '{1}':
1495 for mod in '', '?':
1496 pattern = '.' + reps + mod + 'yz'
1497 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1498 ['xyz'], msg=pattern)
1499 pattern = pattern.encode()
1500 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1501 [b'xyz'], msg=pattern)
1502
def test_match_repr(self):
    """repr() of a match object shows its type, span and matched text.

    Checked for str and bytes-like subjects (including the S and B
    subclasses) and for two consecutive finditer() matches.
    """
    def expected_repr(match, span, text):
        # Build the expected repr from the match object's own type so
        # each assertion is self-consistent.
        kind = type(match)
        return "<%s.%s object; span=%s, match=%s>" % (
            kind.__module__, kind.__qualname__, span, text)

    for string in '[abracadabra]', S('[abracadabra]'):
        m = re.search(r'(.+)(.*?)\1', string)
        self.assertEqual(repr(m),
                         expected_repr(m, "(1, 12)", "'abracadabra'"))
    for string in (b'[abracadabra]', B(b'[abracadabra]'),
                   bytearray(b'[abracadabra]'),
                   memoryview(b'[abracadabra]')):
        m = re.search(rb'(.+)(.*?)\1', string)
        self.assertEqual(repr(m),
                         expected_repr(m, "(1, 12)", "b'abracadabra'"))

    first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
    self.assertEqual(repr(first), expected_repr(first, "(0, 2)", "'aa'"))
    self.assertEqual(repr(second),
                     expected_repr(second, "(3, 5)", "'bb'"))
1524
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001525
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001526 def test_bug_2537(self):
1527 # issue 2537: empty submatches
1528 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1529 for inner_op in ('{0,}', '*', '?'):
1530 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1531 m = r.match("xyyzy")
1532 self.assertEqual(m.group(0), "xyy")
1533 self.assertEqual(m.group(1), "")
1534 self.assertEqual(m.group(2), "y")
1535
def test_debug_flag(self):
    """Compiling with re.DEBUG prints the same dump on every call."""
    pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
    with captured_stdout() as out:
        re.compile(pat, re.DEBUG)
    # NOTE(review): the comparison below is exact, so the nesting
    # indentation inside this literal is significant -- confirm it
    # against the interpreter's actual DEBUG output.
    dump = '''\
SUBPATTERN 1
  LITERAL 46
SUBPATTERN None
  BRANCH
    IN
      LITERAL 99
      LITERAL 104
  OR
    LITERAL 112
    LITERAL 121
SUBPATTERN None
  GROUPREF_EXISTS 1
    AT AT_END
  ELSE
    LITERAL 58
    LITERAL 32
'''
    self.assertEqual(out.getvalue(), dump)
    # Debug output is output again even a second time (bypassing
    # the cache -- issue #20426).
    with captured_stdout() as out:
        re.compile(pat, re.DEBUG)
    self.assertEqual(out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001564
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001565 def test_keyword_parameters(self):
1566 # Issue #20283: Accepting the string keyword parameter.
1567 pat = re.compile(r'(ab)')
1568 self.assertEqual(
1569 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1570 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001571 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1572 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001573 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1574 self.assertEqual(
1575 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1576 self.assertEqual(
1577 pat.split(string='abracadabra', maxsplit=1),
1578 ['', 'ab', 'racadabra'])
1579 self.assertEqual(
1580 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1581 (7, 9))
1582
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001583 def test_bug_20998(self):
1584 # Issue #20998: Fullmatch of repeated single character pattern
1585 # with ignore case.
1586 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1587
def test_locale_caching(self):
    """Issue #22410: run the locale checks in both orders after a purge."""
    saved = locale.setlocale(locale.LC_CTYPE)
    self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved)
    for loc in ('en_US.iso88591', 'en_US.utf8'):
        try:
            locale.setlocale(locale.LC_CTYPE, loc)
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs %s locale' % loc)

    orders = [
        (self.check_en_US_iso88591, self.check_en_US_utf8),
        (self.check_en_US_utf8, self.check_en_US_iso88591),
    ]
    for first_check, second_check in orders:
        # Start each ordering from an empty pattern cache.
        re.purge()
        first_check()
        second_check()
1605
def check_en_US_iso88591(self):
    """Under en_US.iso88591, bytes 0xC5 and 0xE5 match ignoring case."""
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    pairs = [
        (b'\xc5\xe5', b'\xc5\xe5'),
        (b'\xc5', b'\xe5'),
        (b'\xe5', b'\xc5'),
    ]
    # Flags passed as arguments.
    for pattern, subject in pairs:
        self.assertTrue(re.match(pattern, subject, re.L|re.I))
    # Same flags given inline.
    for pattern, subject in pairs:
        self.assertTrue(re.match(b'(?Li)' + pattern, subject))
1614
def check_en_US_utf8(self):
    """Under en_US.utf8, bytes 0xC5 and 0xE5 only match themselves."""
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    identical = b'\xc5\xe5'
    swapped = [(b'\xc5', b'\xe5'), (b'\xe5', b'\xc5')]
    # Flags passed as arguments.
    self.assertTrue(re.match(identical, identical, re.L|re.I))
    for pattern, subject in swapped:
        self.assertIsNone(re.match(pattern, subject, re.L|re.I))
    # Same flags given inline.
    self.assertTrue(re.match(b'(?Li)' + identical, identical))
    for pattern, subject in swapped:
        self.assertIsNone(re.match(b'(?Li)' + pattern, subject))
1623
def test_error(self):
    """Check the attributes and message of re.error for bad patterns.

    Covers a str pattern, a bytes pattern and a multiline verbose
    pattern, each failing on an unbalanced closing parenthesis.
    """
    with self.assertRaises(re.error) as cm:
        re.compile('(\u20ac))')
    err = cm.exception
    self.assertIsInstance(err.pattern, str)
    self.assertEqual(err.pattern, '(\u20ac))')
    self.assertEqual(err.pos, 3)
    self.assertEqual(err.lineno, 1)
    self.assertEqual(err.colno, 4)
    self.assertIn(err.msg, str(err))
    # The position is part of str(err) but not of err.msg itself.
    self.assertIn(' at position 3', str(err))
    self.assertNotIn(' at position 3', err.msg)
    # Bytes pattern
    with self.assertRaises(re.error) as cm:
        re.compile(b'(\xa4))')
    err = cm.exception
    self.assertIsInstance(err.pattern, bytes)
    self.assertEqual(err.pattern, b'(\xa4))')
    self.assertEqual(err.pos, 3)
    # Multiline pattern
    # NOTE(review): pos 77 / column 17 asserted below depend on the exact
    # leading whitespace inside this literal (16 spaces before each
    # parenthesis, 20 before 'abc') -- confirm against the upstream file.
    with self.assertRaises(re.error) as cm:
        re.compile("""
                (
                    abc
                )
                )
                (
                """, re.VERBOSE)
    err = cm.exception
    self.assertEqual(err.pos, 77)
    self.assertEqual(err.lineno, 5)
    self.assertEqual(err.colno, 17)
    self.assertIn(err.msg, str(err))
    self.assertIn(' at position 77', str(err))
    self.assertIn('(line 5, column 17)', str(err))
1659
def test_misc_errors(self):
    """Assorted malformed patterns and their expected error message/position."""
    cases = [
        (r'(', 'missing ), unterminated subpattern', 0),
        (r'((a|b)', 'missing ), unterminated subpattern', 0),
        (r'(a|b))', 'unbalanced parenthesis', 5),
        (r'(?P', 'unexpected end of pattern', 3),
        (r'(?z)', 'unknown extension ?z', 1),
        (r'(?iz)', 'unknown flag', 3),
        (r'(?i', 'missing )', 3),
        (r'(?#abc', 'missing ), unterminated comment', 0),
        (r'(?<', 'unexpected end of pattern', 3),
        (r'(?<>)', 'unknown extension ?<>', 1),
        (r'(?', 'unexpected end of pattern', 2),
    ]
    for pattern, message, position in cases:
        self.checkPatternError(pattern, message, position)
1672
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001673
class PatternReprTests(unittest.TestCase):
    """Check the repr() of compiled pattern objects."""

    def check(self, pattern, expected):
        """Assert the repr of `pattern` compiled without flags."""
        compiled = re.compile(pattern)
        self.assertEqual(repr(compiled), expected)

    def check_flags(self, pattern, flags, expected):
        """Assert the repr of `pattern` compiled with `flags`."""
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        expected = "re.compile('random pattern')"
        self.check('random pattern', expected)

    def test_single_flag(self):
        expected = "re.compile('random pattern', re.IGNORECASE)"
        self.check_flags('random pattern', re.IGNORECASE, expected)

    def test_multiple_flags(self):
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.check_flags('random pattern', re.I|re.S|re.X, expected)

    def test_unicode_flag(self):
        # re.U does not show up in the repr of a str pattern.
        self.check_flags('random pattern', re.U,
                         "re.compile('random pattern')")
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL)")
        self.check_flags('random pattern', re.I|re.S|re.U, expected)

    def test_inline_flags(self):
        expected = "re.compile('(?i)pattern', re.IGNORECASE)"
        self.check('(?i)pattern', expected)

    def test_unknown_flags(self):
        # Unnamed flag bits are shown in hexadecimal, after named ones.
        self.check_flags('random pattern', 0x123000,
                         "re.compile('random pattern', 0x123000)")
        self.check_flags('random pattern', 0x123000|re.I,
                         "re.compile('random pattern', re.IGNORECASE|0x123000)")

    def test_bytes(self):
        self.check(b'bytes pattern',
                   "re.compile(b'bytes pattern')")
        expected = "re.compile(b'bytes pattern', re.ASCII)"
        self.check_flags(b'bytes pattern', re.A, expected)

    def test_locale(self):
        expected = "re.compile(b'bytes pattern', re.LOCALE)"
        self.check_flags(b'bytes pattern', re.L, expected)

    def test_quotes(self):
        cases = [
            ('random "double quoted" pattern',
             '''re.compile('random "double quoted" pattern')'''),
            ("random 'single quoted' pattern",
             '''re.compile("random 'single quoted' pattern")'''),
            ('''both 'single' and "double" quotes''',
             '''re.compile('both \\'single\\' and "double" quotes')'''),
        ]
        for pattern, expected in cases:
            self.check(pattern, expected)

    def test_long_pattern(self):
        # The repr of a very long pattern is truncated, but the flags
        # are still shown at the end.
        pattern = 'Very %spattern' % ('long ' * 1000)
        head = "re.compile('Very long long lon"

        plain = repr(re.compile(pattern))
        self.assertLess(len(plain), 300)
        self.assertEqual(plain[:30], head)

        flagged = repr(re.compile(pattern, re.I))
        self.assertLess(len(flagged), 300)
        self.assertEqual(flagged[:30], head)
        self.assertEqual(flagged[-16:], ", re.IGNORECASE)")
1738
1739
class ImplementationTest(unittest.TestCase):
    """
    Test implementation details of the re module.
    """

    def test_overlap_table(self):
        """Check sre_compile._generate_overlap_table on sample prefixes."""
        build = sre_compile._generate_overlap_table
        expectations = [
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        ]
        for prefix, table in expectations:
            self.assertEqual(build(prefix), table)
1753
1754
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001755class ExternalTests(unittest.TestCase):
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001756
def test_re_benchmarks(self):
    're_tests benchmarks'
    from test.re_tests import benchmarks
    padding = ' ' * 10000
    for pattern, s in benchmarks:
        with self.subTest(pattern=pattern, string=s):
            p = re.compile(pattern)
            # The bare string must be found by every entry point.
            for matcher in (p.search, p.match, p.fullmatch):
                self.assertTrue(matcher(s))
            # Same string embedded in padding, addressed via pos/endpos.
            s2 = padding + s + padding
            start = 10000
            stop = 10000 + len(s)
            self.assertTrue(p.search(s2))
            self.assertTrue(p.match(s2, start))
            self.assertTrue(p.match(s2, start, stop))
            self.assertTrue(p.fullmatch(s2, start, stop))
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001771
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001772 def test_re_tests(self):
1773 're_tests test suite'
1774 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
1775 for t in tests:
1776 pattern = s = outcome = repl = expected = None
1777 if len(t) == 5:
1778 pattern, s, outcome, repl, expected = t
1779 elif len(t) == 3:
1780 pattern, s, outcome = t
Guido van Rossum41360a41998-03-26 19:42:58 +00001781 else:
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001782 raise ValueError('Test tuples should have 3 or 5 fields', t)
1783
1784 with self.subTest(pattern=pattern, string=s):
1785 if outcome == SYNTAX_ERROR: # Expected a syntax error
1786 with self.assertRaises(re.error):
1787 re.compile(pattern)
1788 continue
1789
1790 obj = re.compile(pattern)
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001791 result = obj.search(s)
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001792 if outcome == FAIL:
1793 self.assertIsNone(result, 'Succeeded incorrectly')
1794 continue
1795
1796 with self.subTest():
1797 self.assertTrue(result, 'Failed incorrectly')
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001798 # Matched, as expected, so now we compute the
1799 # result string and compare it to our expected result.
1800 start, end = result.span(0)
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001801 vardict = {'found': result.group(0),
1802 'groups': result.group(),
1803 'flags': result.re.flags}
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001804 for i in range(1, 100):
1805 try:
1806 gi = result.group(i)
1807 # Special hack because else the string concat fails:
1808 if gi is None:
1809 gi = "None"
1810 except IndexError:
1811 gi = "Error"
1812 vardict['g%d' % i] = gi
1813 for i in result.re.groupindex.keys():
1814 try:
1815 gi = result.group(i)
1816 if gi is None:
1817 gi = "None"
1818 except IndexError:
1819 gi = "Error"
1820 vardict[i] = gi
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001821 self.assertEqual(eval(repl, vardict), expected,
1822 'grouping error')
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001823
Antoine Pitrou22628c42008-07-22 17:53:22 +00001824 # Try the match with both pattern and string converted to
1825 # bytes, and check that it still succeeds.
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001826 try:
Antoine Pitrou22628c42008-07-22 17:53:22 +00001827 bpat = bytes(pattern, "ascii")
1828 bs = bytes(s, "ascii")
1829 except UnicodeEncodeError:
1830 # skip non-ascii tests
1831 pass
1832 else:
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001833 with self.subTest('bytes pattern match'):
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001834 obj = re.compile(bpat)
1835 self.assertTrue(obj.search(bs))
1836
1837 # Try the match with LOCALE enabled, and check that it
1838 # still succeeds.
1839 with self.subTest('locale-sensitive match'):
1840 obj = re.compile(bpat, re.LOCALE)
1841 result = obj.search(bs)
1842 if result is None:
1843 print('=== Fails on locale-sensitive match', t)
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001844
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001845 # Try the match with the search area limited to the extent
1846 # of the match and see if it still succeeds. \B will
1847 # break (because it won't match at the end or start of a
1848 # string), so we'll ignore patterns that feature it.
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001849 if (pattern[:2] != r'\B' and pattern[-2:] != r'\B'
1850 and result is not None):
1851 with self.subTest('range-limited match'):
1852 obj = re.compile(pattern)
1853 self.assertTrue(obj.search(s, start, end + 1))
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001854
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001855 # Try the match with IGNORECASE enabled, and check that it
1856 # still succeeds.
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001857 with self.subTest('case-insensitive match'):
1858 obj = re.compile(pattern, re.IGNORECASE)
1859 self.assertTrue(obj.search(s))
Guido van Rossumdfa67901997-12-08 17:12:06 +00001860
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001861 # Try the match with UNICODE locale enabled, and check
1862 # that it still succeeds.
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001863 with self.subTest('unicode-sensitive match'):
1864 obj = re.compile(pattern, re.UNICODE)
1865 self.assertTrue(obj.search(s))
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001866
Gregory P. Smith5a631832010-07-27 05:31:29 +00001867
# Hand control to unittest's command-line runner when this file is
# executed directly as a script.
if __name__ == "__main__":
    unittest.main()