blob: 6803e02c0e2c83dd62bc839d5b9a9247577174c3 [file] [log] [blame]
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
Antoine Pitroud2cc7432014-02-03 20:59:59 +01002 cpython_only, captured_stdout
Benjamin Petersone48944b2012-03-07 14:50:25 -06003import io
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02004import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00005import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00006from re import Scanner
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02007import sre_compile
R David Murray26dfaac92013-04-14 13:00:54 -04008import sre_constants
Ezio Melottid2114eb2011-03-25 14:08:44 +02009import sys
10import string
11import traceback
Antoine Pitrou79aa68d2013-10-25 21:36:10 +020012import unittest
Raymond Hettinger027bb632004-05-31 03:09:25 +000013from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000014
# Misc tests from Tim Peters' re.doc

# WARNING: Don't change details in these tests if you don't know
# what you're doing. Some of these tests were carefully modeled to
# cover most of the code.
20
class S(str):
    """str subclass whose item and slice access returns S instances."""

    def __getitem__(self, index):
        # Re-wrap whatever str.__getitem__ produced so the subclass survives.
        piece = super().__getitem__(index)
        return S(piece)
24
class B(bytes):
    """bytes subclass that wraps every __getitem__ result in B."""

    def __getitem__(self, index):
        # Pass the result of bytes.__getitem__ straight to the B constructor.
        piece = super().__getitem__(index)
        return B(piece)
28
Skip Montanaro8ed06da2003-04-24 19:43:18 +000029class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000030
def assertTypedEqual(self, actual, expect, msg=None):
    """Assert that actual equals expect and that the leaf types match too.

    After the ordinary equality check, tuples and lists in *expect* are
    walked pairwise with *actual*; every non-container element must have
    exactly the same type as its expected counterpart.
    """
    def check_types(got, wanted):
        if not isinstance(wanted, (tuple, list)):
            self.assertIs(type(got), type(wanted), msg)
            return
        for got_item, wanted_item in zip(got, wanted):
            check_types(got_item, wanted_item)

    self.assertEqual(actual, expect, msg)
    check_types(actual, expect)
40
def checkPatternError(self, pattern, errmsg, pos=None):
    """Assert that compiling *pattern* raises re.error with message *errmsg*.

    When *pos* is given, the error's ``pos`` attribute must equal it as well.
    """
    with self.assertRaises(re.error) as raised:
        re.compile(pattern)
    failure = raised.exception
    # Label any mismatch below with the offending pattern.
    with self.subTest(pattern=pattern):
        self.assertEqual(failure.msg, errmsg)
        if pos is not None:
            self.assertEqual(failure.pos, pos)
49
def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
    """Assert that re.sub(pattern, repl, string) raises re.error.

    The error message must equal *errmsg*; when *pos* is given, the
    error's ``pos`` attribute must equal it as well.
    """
    with self.assertRaises(re.error) as raised:
        re.sub(pattern, repl, string)
    failure = raised.exception
    # Label any mismatch below with the pattern and template involved.
    with self.subTest(pattern=pattern, repl=repl):
        self.assertEqual(failure.msg, errmsg)
        if pos is not None:
            self.assertEqual(failure.pos, pos)
58
Benjamin Petersone48944b2012-03-07 14:50:25 -060059 def test_keep_buffer(self):
60 # See bug 14212
61 b = bytearray(b'x')
62 it = re.finditer(b'a', b)
63 with self.assertRaises(BufferError):
64 b.extend(b'x'*400)
65 list(it)
66 del it
67 gc_collect()
68 b.extend(b'x'*400)
69
Raymond Hettinger027bb632004-05-31 03:09:25 +000070 def test_weakref(self):
71 s = 'QabbbcR'
72 x = re.compile('ab+c')
73 y = proxy(x)
74 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
75
Skip Montanaro8ed06da2003-04-24 19:43:18 +000076 def test_search_star_plus(self):
77 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
78 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
79 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
80 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030081 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000082 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
83 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
84 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
85 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030086 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000087
def bump_num(self, matchobj):
    """Return the whole match, read as an integer and incremented, as a str."""
    return str(int(matchobj.group(0)) + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000091
def test_basic_re_sub(self):
    # Basic re.sub() checks: result type, callable and template
    # replacements, group references and escape sequences in templates.

    # The result must have exactly the type of the expected literal
    # (str or bytes) for every accepted subject/replacement type.
    typed_cases = [
        ('y', 'a', 'xyz', 'xaz'),
        ('y', S('a'), S('xyz'), 'xaz'),
        (b'y', b'a', b'xyz', b'xaz'),
        (b'y', B(b'a'), B(b'xyz'), b'xaz'),
        (b'y', bytearray(b'a'), bytearray(b'xyz'), b'xaz'),
        (b'y', memoryview(b'a'), memoryview(b'xyz'), b'xaz'),
    ]
    for pattern, repl, subject, expected in typed_cases:
        self.assertTypedEqual(re.sub(pattern, repl, subject), expected)
    for y in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')

    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    numbers = '08.2 -2 23x99y'
    self.assertEqual(re.sub(r'\d+', self.bump_num, numbers),
                     '9.3 -3 24x100y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, numbers, count=3),
                     '9.3 -3 23x99y')

    # A callable's return value is inserted verbatim; a template string
    # has its escapes processed.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    group_ref_cases = [
        ('(?P<a>x)', r'\g<a>\g<a>'),
        ('(?P<a>x)', r'\g<a>\g<1>'),
        ('(?P<unk>x)', r'\g<unk>\g<unk>'),
        ('(?P<unk>x)', r'\g<1>\g<1>'),
    ]
    for pattern, template in group_ref_cases:
        self.assertEqual(re.sub(pattern, template, 'xx'), 'xxxx')

    self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
    for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
        with self.subTest(c):
            # Only the exception matters here; a comparison wrapped around
            # the call could never be evaluated once re.error is raised.
            with self.assertRaises(re.error):
                re.sub('a', '\\' + c, 'a')

    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000131
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000132 def test_bug_449964(self):
133 # fails for group followed by other escape
R David Murray44b548d2016-09-08 13:59:53 -0400134 self.assertEqual(re.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx'),
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000135 'xx\bxx\b')
136
137 def test_bug_449000(self):
138 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000139 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
140 'abc\ndef\n')
141 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
142 'abc\ndef\n')
143 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
144 'abc\ndef\n')
145 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
146 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000147
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000148 def test_bug_1661(self):
149 # Verify that flags do not get silently ignored with compiled patterns
150 pattern = re.compile('.')
151 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
152 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
153 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
154 self.assertRaises(ValueError, re.compile, pattern, re.I)
155
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000156 def test_bug_3629(self):
157 # A regex that triggered a bug in the sre-code validator
158 re.compile("(?P<quote>)(?(quote))")
159
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000160 def test_sub_template_numeric_escape(self):
161 # bug 776311 and friends
162 self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
163 self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
164 self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
165 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
166 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
167 self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
168 self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200169 self.assertEqual(re.sub('x', r'\377', 'x'), '\377')
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000170
171 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
172 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
173
174 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
175 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
176 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
177 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
178 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
179
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200180 self.checkTemplateError('x', r'\400', 'x',
181 r'octal escape value \400 outside of '
182 r'range 0-0o377', 0)
183 self.checkTemplateError('x', r'\777', 'x',
184 r'octal escape value \777 outside of '
185 r'range 0-0o377', 0)
Tim Peters0e9980f2004-09-12 03:49:31 +0000186
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200187 self.checkTemplateError('x', r'\1', 'x', 'invalid group reference')
188 self.checkTemplateError('x', r'\8', 'x', 'invalid group reference')
189 self.checkTemplateError('x', r'\9', 'x', 'invalid group reference')
190 self.checkTemplateError('x', r'\11', 'x', 'invalid group reference')
191 self.checkTemplateError('x', r'\18', 'x', 'invalid group reference')
192 self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference')
193 self.checkTemplateError('x', r'\90', 'x', 'invalid group reference')
194 self.checkTemplateError('x', r'\99', 'x', 'invalid group reference')
195 self.checkTemplateError('x', r'\118', 'x', 'invalid group reference') # r'\11' + '8'
196 self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference')
197 self.checkTemplateError('x', r'\181', 'x', 'invalid group reference') # r'\18' + '1'
198 self.checkTemplateError('x', r'\800', 'x', 'invalid group reference') # r'\80' + '0'
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000199
200 # in python2.3 (etc), these loop endlessly in sre_parser.py
201 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
202 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
203 'xz8')
204 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
205 'xza')
206
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000207 def test_qualified_re_sub(self):
208 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
Victor Stinner55e614a2014-10-29 16:58:59 +0100209 self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000210
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000211 def test_bug_114660(self):
212 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
213 'hello there')
214
215 def test_bug_462270(self):
216 # Test for empty sub() behaviour, see SF bug #462270
217 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
218 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
219
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200220 def test_symbolic_groups(self):
R David Murray44b548d2016-09-08 13:59:53 -0400221 re.compile(r'(?P<a>x)(?P=a)(?(a)y)')
222 re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)')
223 re.compile(r'(?P<a1>x)\1(?(1)y)')
224 self.checkPatternError(r'(?P<a>)(?P<a>)',
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200225 "redefinition of group name 'a' as group 2; "
226 "was group 1")
R David Murray44b548d2016-09-08 13:59:53 -0400227 self.checkPatternError(r'(?P<a>(?P=a))',
Serhiy Storchaka485407c2015-07-18 23:27:00 +0300228 "cannot refer to an open group", 10)
R David Murray44b548d2016-09-08 13:59:53 -0400229 self.checkPatternError(r'(?Pxy)', 'unknown extension ?Px')
230 self.checkPatternError(r'(?P<a>)(?P=a', 'missing ), unterminated name', 11)
231 self.checkPatternError(r'(?P=', 'missing group name', 4)
232 self.checkPatternError(r'(?P=)', 'missing group name', 4)
233 self.checkPatternError(r'(?P=1)', "bad character in group name '1'", 4)
234 self.checkPatternError(r'(?P=a)', "unknown group name 'a'")
235 self.checkPatternError(r'(?P=a1)', "unknown group name 'a1'")
236 self.checkPatternError(r'(?P=a.)', "bad character in group name 'a.'", 4)
237 self.checkPatternError(r'(?P<)', 'missing >, unterminated name', 4)
238 self.checkPatternError(r'(?P<a', 'missing >, unterminated name', 4)
239 self.checkPatternError(r'(?P<', 'missing group name', 4)
240 self.checkPatternError(r'(?P<>)', 'missing group name', 4)
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200241 self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
242 self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
243 self.checkPatternError(r'(?(', 'missing group name', 3)
244 self.checkPatternError(r'(?())', 'missing group name', 3)
245 self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
246 self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
247 self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
248 self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
Georg Brandl1d472b72013-04-14 11:40:00 +0200249 # New valid/invalid identifiers in Python 3
250 re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
251 re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200252 self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300253 # Support > 100 groups.
254 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
255 pat = '(?:%s)(?(200)z|t)' % pat
256 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200257
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000258 def test_symbolic_refs(self):
R David Murray44b548d2016-09-08 13:59:53 -0400259 self.checkTemplateError('(?P<a>x)', r'\g<a', 'xx',
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200260 'missing >, unterminated name', 3)
R David Murray44b548d2016-09-08 13:59:53 -0400261 self.checkTemplateError('(?P<a>x)', r'\g<', 'xx',
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200262 'missing group name', 3)
R David Murray44b548d2016-09-08 13:59:53 -0400263 self.checkTemplateError('(?P<a>x)', r'\g', 'xx', 'missing <', 2)
264 self.checkTemplateError('(?P<a>x)', r'\g<a a>', 'xx',
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200265 "bad character in group name 'a a'", 3)
R David Murray44b548d2016-09-08 13:59:53 -0400266 self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx',
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200267 'missing group name', 3)
R David Murray44b548d2016-09-08 13:59:53 -0400268 self.checkTemplateError('(?P<a>x)', r'\g<1a1>', 'xx',
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200269 "bad character in group name '1a1'", 3)
270 self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
271 'invalid group reference')
272 self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
273 'invalid group reference')
274 with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
R David Murray44b548d2016-09-08 13:59:53 -0400275 re.sub('(?P<a>x)', r'\g<ab>', 'xx')
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300276 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
277 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
R David Murray44b548d2016-09-08 13:59:53 -0400278 self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200279 "bad character in group name '-1'", 3)
Georg Brandl1d472b72013-04-14 11:40:00 +0200280 # New valid/invalid identifiers in Python 3
281 self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
282 self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
R David Murray44b548d2016-09-08 13:59:53 -0400283 self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200284 "bad character in group name '©'", 3)
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300285 # Support > 100 groups.
286 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
R David Murray44b548d2016-09-08 13:59:53 -0400287 self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000288
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000289 def test_re_subn(self):
290 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
291 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
292 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
293 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
Victor Stinner55e614a2014-10-29 16:58:59 +0100294 self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000295
def test_re_split(self):
    # re.split() over str, bytes and related subject types, with and
    # without capturing groups, plus separators that can match the empty
    # string.
    for subject in ":a:b::c", S(":a:b::c"):
        self.assertTypedEqual(re.split(":", subject),
                              ['', 'a', 'b', '', 'c'])
        self.assertTypedEqual(re.split(":+", subject),
                              ['', 'a', 'b', 'c'])
        self.assertTypedEqual(re.split("(:+)", subject),
                              ['', ':', 'a', ':', 'b', '::', 'c'])
    for subject in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                    memoryview(b":a:b::c")):
        self.assertTypedEqual(re.split(b":", subject),
                              [b'', b'a', b'b', b'', b'c'])
        self.assertTypedEqual(re.split(b":+", subject),
                              [b'', b'a', b'b', b'c'])
        self.assertTypedEqual(re.split(b"(:+)", subject),
                              [b'', b':', b'a', b':', b'b', b'::', b'c'])
    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                    "\U0001d49c\U0001d49e\U0001d4b5"):
        subject = ":%s:%s::%s" % (a, b, c)
        self.assertEqual(re.split(":", subject), ['', a, b, '', c])
        self.assertEqual(re.split(":+", subject), ['', a, b, c])
        self.assertEqual(re.split("(:+)", subject),
                         ['', ':', a, ':', b, '::', c])

    group_cases = [
        ("(?::+)", ['', 'a', 'b', 'c']),
        ("(:)+", ['', ':', 'a', ':', 'b', ':', 'c']),
        ("([b:]+)", ['', ':', 'a', ':b::', 'c']),
        ("(b)|(:+)", ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c']),
        ("(?:b)|(?::+)", ['', 'a', '', '', 'c']),
    ]
    for sep, expected in group_cases:
        self.assertEqual(re.split(sep, ":a:b::c"), expected)

    # Separators that may match the empty string: the split result is
    # checked and a FutureWarning must be emitted.
    for sep, expected in [
        (':*', ['', 'a', 'b', 'c']),
        ('(?::*)', ['', 'a', 'b', 'c']),
        ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
        ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
    ]:
        with self.subTest(sep=sep), self.assertWarns(FutureWarning):
            self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)

    # These separators must be rejected with ValueError.  Only the exception
    # is checked: a result comparison wrapped around the call could never
    # run once the exception is raised.
    for sep in ('', r'\b', r'(?=:)', r'(?<=:)'):
        with self.subTest(sep=sep), self.assertRaises(ValueError):
            re.split(sep, ':a:b::c')
348
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000349 def test_qualified_re_split(self):
Victor Stinner55e614a2014-10-29 16:58:59 +0100350 self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
351 self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
352 self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000353 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200354 self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000355 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200356 with self.assertWarns(FutureWarning):
357 self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
358 ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000359
def test_re_findall(self):
    # findall() with zero, one and two capturing groups, over str, bytes
    # and related subject types.
    self.assertEqual(re.findall(":+", "abc"), [])
    for subject in "a:b::c:::d", S("a:b::c:::d"):
        self.assertTypedEqual(re.findall(":+", subject),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:+)", subject),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:)(:*)", subject),
                              [(":", ""), (":", ":"), (":", "::")])
    for subject in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
                    memoryview(b"a:b::c:::d")):
        self.assertTypedEqual(re.findall(b":+", subject),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:+)", subject),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:)(:*)", subject),
                              [(b":", b""), (b":", b":"), (b":", b"::")])
    for x in ("\xe0", "\u0430", "\U0001d49c"):
        # Runs of one, two and three copies of x take the role of the colons.
        xx, xxx = x * 2, x * 3
        subject = "a%sb%sc%sd" % (x, xx, xxx)
        runs = [x, xx, xxx]
        self.assertEqual(re.findall("%s+" % x, subject), runs)
        self.assertEqual(re.findall("(%s+)" % x, subject), runs)
        self.assertEqual(re.findall("(%s)(%s*)" % (x, x), subject),
                         [(x, ""), (x, x), (x, xx)])
Guido van Rossum49946571997-07-18 04:26:25 +0000385
Skip Montanaro5ba00542003-04-25 16:00:14 +0000386 def test_bug_117612(self):
387 self.assertEqual(re.findall(r"(a|(b))", "aba"),
388 [("a", ""),("b", "b"),("a", "")])
389
def test_re_match(self):
    # groups() and group() on match objects for str, bytes and related
    # subject types, then optional and named groups.
    for subject in 'a', S('a'):
        self.assertEqual(re.match('a', subject).groups(), ())
        self.assertEqual(re.match('(a)', subject).groups(), ('a',))
        self.assertEqual(re.match('(a)', subject).group(0), 'a')
        self.assertEqual(re.match('(a)', subject).group(1), 'a')
        self.assertEqual(re.match('(a)', subject).group(1, 1), ('a', 'a'))
    for subject in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
        self.assertEqual(re.match(b'a', subject).groups(), ())
        self.assertEqual(re.match(b'(a)', subject).groups(), (b'a',))
        self.assertEqual(re.match(b'(a)', subject).group(0), b'a')
        self.assertEqual(re.match(b'(a)', subject).group(1), b'a')
        self.assertEqual(re.match(b'(a)', subject).group(1, 1), (b'a', b'a'))
    for a in ("\xe0", "\u0430", "\U0001d49c"):
        grouped = '(%s)' % a
        self.assertEqual(re.match(a, a).groups(), ())
        self.assertEqual(re.match(grouped, a).groups(), (a,))
        self.assertEqual(re.match(grouped, a).group(0), a)
        self.assertEqual(re.match(grouped, a).group(1), a)
        self.assertEqual(re.match(grouped, a).group(1, 1), (a, a))

    # Groups that did not participate are None unless groups() is given a
    # default.
    pat = re.compile('((a)|(b))(c)?')
    for subject, expected in [
        ('a', ('a', 'a', None, None)),
        ('b', ('b', None, 'b', None)),
        ('ac', ('a', 'a', None, 'c')),
        ('bc', ('b', None, 'b', 'c')),
    ]:
        self.assertEqual(pat.match(subject).groups(), expected)
    self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

    # group() accepts indexes and names, also mixed in a single call.
    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                     (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000422
Serhiy Storchaka977b3ac2016-06-18 16:48:07 +0300423 def test_group(self):
424 class Index:
425 def __init__(self, value):
426 self.value = value
427 def __index__(self):
428 return self.value
429 # A single group
430 m = re.match('(a)(b)', 'ab')
431 self.assertEqual(m.group(), 'ab')
432 self.assertEqual(m.group(0), 'ab')
433 self.assertEqual(m.group(1), 'a')
434 self.assertEqual(m.group(Index(1)), 'a')
435 self.assertRaises(IndexError, m.group, -1)
436 self.assertRaises(IndexError, m.group, 3)
437 self.assertRaises(IndexError, m.group, 1<<1000)
438 self.assertRaises(IndexError, m.group, Index(1<<1000))
439 self.assertRaises(IndexError, m.group, 'x')
440 # Multiple groups
441 self.assertEqual(m.group(2, 1), ('b', 'a'))
442 self.assertEqual(m.group(Index(2), Index(1)), ('b', 'a'))
443
Eric V. Smith605bdae2016-09-11 08:55:43 -0400444 def test_match_getitem(self):
445 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
446
447 m = pat.match('a')
448 self.assertEqual(m['a1'], 'a')
449 self.assertEqual(m['b2'], None)
450 self.assertEqual(m['c3'], None)
451 self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=None')
452 self.assertEqual(m[0], 'a')
453 self.assertEqual(m[1], 'a')
454 self.assertEqual(m[2], None)
455 self.assertEqual(m[3], None)
456 with self.assertRaisesRegex(IndexError, 'no such group'):
457 m['X']
458 with self.assertRaisesRegex(IndexError, 'no such group'):
459 m[-1]
460 with self.assertRaisesRegex(IndexError, 'no such group'):
461 m[4]
462 with self.assertRaisesRegex(IndexError, 'no such group'):
463 m[0, 1]
464 with self.assertRaisesRegex(IndexError, 'no such group'):
465 m[(0,)]
466 with self.assertRaisesRegex(IndexError, 'no such group'):
467 m[(0, 1)]
468 with self.assertRaisesRegex(KeyError, 'a2'):
469 'a1={a2}'.format_map(m)
470
471 m = pat.match('ac')
472 self.assertEqual(m['a1'], 'a')
473 self.assertEqual(m['b2'], None)
474 self.assertEqual(m['c3'], 'c')
475 self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=c')
476 self.assertEqual(m[0], 'ac')
477 self.assertEqual(m[1], 'a')
478 self.assertEqual(m[2], None)
479 self.assertEqual(m[3], 'c')
480
481 # Cannot assign.
482 with self.assertRaises(TypeError):
483 m[0] = 1
484
485 # No len().
486 self.assertRaises(TypeError, len, m)
487
def test_re_fullmatch(self):
    # Issue 16203: Proposal: add re.fullmatch() method.
    def full_span(pattern, subject):
        return re.fullmatch(pattern, subject).span()

    self.assertEqual(full_span(r"a", "a"), (0, 1))
    for subject in "ab", S("ab"):
        self.assertEqual(full_span(r"a|ab", subject), (0, 2))
    for subject in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
        self.assertEqual(full_span(br"a|ab", subject), (0, 2))
    for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
        r = r"%s|%s" % (a, a + b)
        self.assertEqual(full_span(r, a + b), (0, 2))

    for pattern, subject, span in [
        (r".*?$", "abc", (0, 3)),
        (r".*?", "abc", (0, 3)),
        (r"a.*?b", "ab", (0, 2)),
        (r"a.*?b", "abb", (0, 3)),
        (r"a.*?b", "axxb", (0, 4)),
    ]:
        self.assertEqual(full_span(pattern, subject), span)
    for pattern, subject in [
        (r"a+", "ab"),
        (r"abc$", "abc\n"),
        (r"abc\Z", "abc\n"),
        (r"(?m)abc$", "abc\n"),
    ]:
        self.assertIsNone(re.fullmatch(pattern, subject))
    # Lookahead and lookbehind assertions inside a full match.
    self.assertEqual(full_span(r"ab(?=c)cd", "abcd"), (0, 4))
    self.assertEqual(full_span(r"ab(?<=b)cd", "abcd"), (0, 4))
    self.assertEqual(full_span(r"(?=a|ab)ab", "ab"), (0, 2))

    # With pos/endpos the match must cover exactly that window.
    for pattern in (r"bc", r".*?$", r".*?"):
        self.assertEqual(
            re.compile(pattern).fullmatch("abcd", pos=1, endpos=3).span(),
            (1, 3))
517
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000518 def test_re_groupref_exists(self):
R David Murray44b548d2016-09-08 13:59:53 -0400519 self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000520 ('(', 'a'))
R David Murray44b548d2016-09-08 13:59:53 -0400521 self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a').groups(),
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000522 (None, 'a'))
R David Murray44b548d2016-09-08 13:59:53 -0400523 self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a)'))
524 self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000525 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
526 ('a', 'b'))
R David Murray44b548d2016-09-08 13:59:53 -0400527 self.assertEqual(re.match(r'^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000528 (None, 'd'))
R David Murray44b548d2016-09-08 13:59:53 -0400529 self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000530 (None, 'd'))
R David Murray44b548d2016-09-08 13:59:53 -0400531 self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'a').groups(),
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000532 ('a', ''))
533
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000534 # Tests for bug #1177831: exercise groups other than the first group
535 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
536 self.assertEqual(p.match('abc').groups(),
537 ('a', 'b', 'c'))
538 self.assertEqual(p.match('ad').groups(),
539 ('a', None, 'd'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300540 self.assertIsNone(p.match('abd'))
541 self.assertIsNone(p.match('ac'))
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000542
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300543 # Support > 100 groups.
544 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
545 pat = '(?:%s)(?(200)z)' % pat
546 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000547
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200548 self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
549 self.checkPatternError(r'()(?(1)a|b',
550 'missing ), unterminated subpattern', 2)
551 self.checkPatternError(r'()(?(1)a|b|c)',
552 'conditional backref with more than '
553 'two branches', 10)
554
555 def test_re_groupref_overflow(self):
R David Murray44b548d2016-09-08 13:59:53 -0400556 self.checkTemplateError('()', r'\g<%s>' % sre_constants.MAXGROUPS, 'xx',
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200557 'invalid group reference', 3)
558 self.checkPatternError(r'(?P<a>)(?(%d))' % sre_constants.MAXGROUPS,
559 'invalid group reference', 10)
560
def test_re_groupref(self):
    # Numeric backreferences to earlier groups.
    # Each entry is (pattern, subject, expected groups); an expectation
    # of None means the pattern must not match the subject at all.
    cases = (
        (r'^(\|)?([^()]+)\1$', '|a|', ('|', 'a')),
        (r'^(\|)?([^()]+)\1?$', 'a', (None, 'a')),
        (r'^(\|)?([^()]+)\1$', 'a|', None),
        (r'^(\|)?([^()]+)\1$', '|a', None),
        (r'^(?:(a)|c)(\1)$', 'aa', ('a', 'a')),
        (r'^(?:(a)|c)(\1)?$', 'c', (None, None)),
    )
    for pattern, subject, groups in cases:
        found = re.match(pattern, subject)
        if groups is None:
            self.assertIsNone(found)
        else:
            self.assertEqual(found.groups(), groups)

    # Referring to a group from inside that same group.
    self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
574
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000575 def test_groupdict(self):
576 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
577 'first second').groupdict(),
578 {'first':'first', 'second':'second'})
579
580 def test_expand(self):
581 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
582 "first second")
583 .expand(r"\2 \1 \g<second> \g<first>"),
584 "second first second first")
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300585 self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
586 "first")
587 .expand(r"\2 \g<second>"),
588 " ")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000589
def test_repeat_minmax(self):
    # Bounded repetition {m} and {m,n}, greedy and lazy, applied to a
    # capturing group and to a plain literal.

    # An upper bound below three cannot consume the whole subject.
    for pattern in (r"^(\w){1}$", r"^(\w){1}?$",
                    r"^(\w){1,2}$", r"^(\w){1,2}?$"):
        self.assertIsNone(re.match(pattern, "abc"))

    # When three repetitions are allowed, the group holds the text of
    # its last iteration.  Greedy and lazy forms are checked for the
    # same four bounds.
    for pattern in (r"^(\w){3}$", r"^(\w){1,3}$",
                    r"^(\w){1,4}$", r"^(\w){3,4}$",
                    r"^(\w){3}?$", r"^(\w){1,3}?$",
                    r"^(\w){1,4}?$", r"^(\w){3,4}?$"):
        self.assertEqual(re.match(pattern, "abc").group(1), "c")

    # The same checks with a repeated literal instead of a group.
    for pattern in (r"^x{1}$", r"^x{1}?$", r"^x{1,2}$", r"^x{1,2}?$"):
        self.assertIsNone(re.match(pattern, "xxx"))

    for pattern in (r"^x{3}$", r"^x{1,3}$", r"^x{3,3}$",
                    r"^x{1,4}$", r"^x{3,4}$",
                    r"^x{3}?$", r"^x{1,3}?$",
                    r"^x{1,4}?$", r"^x{3,4}?$"):
        self.assertTrue(re.match(pattern, "xxx"))

    # Empty braces are not a repeat count: they only match themselves.
    self.assertIsNone(re.match(r"^x{}$", "xxx"))
    self.assertTrue(re.match(r"^x{}$", "x{}"))

    self.checkPatternError(r'x{2,1}',
                           'min repeat greater than max repeat', 2)
625
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000626 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000627 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000628 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000629 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
630 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
631 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
632 {'first': 1, 'other': 2})
633
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000634 self.assertEqual(re.match("(a)", "a").pos, 0)
635 self.assertEqual(re.match("(a)", "a").endpos, 1)
636 self.assertEqual(re.match("(a)", "a").string, "a")
637 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300638 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000639
Serhiy Storchaka07360df2015-03-30 01:01:48 +0300640 # Issue 14260. groupindex should be non-modifiable mapping.
641 p = re.compile(r'(?i)(?P<first>a)(?P<other>b)')
642 self.assertEqual(sorted(p.groupindex), ['first', 'other'])
643 self.assertEqual(p.groupindex['other'], 2)
644 with self.assertRaises(TypeError):
645 p.groupindex['other'] = 0
646 self.assertEqual(p.groupindex['other'], 2)
647
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000648 def test_special_escapes(self):
649 self.assertEqual(re.search(r"\b(b.)\b",
650 "abcd abc bcd bx").group(1), "bx")
651 self.assertEqual(re.search(r"\B(b.)\B",
652 "abc bcd bc abxd").group(1), "bx")
653 self.assertEqual(re.search(r"\b(b.)\b",
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300654 "abcd abc bcd bx", re.ASCII).group(1), "bx")
655 self.assertEqual(re.search(r"\B(b.)\B",
656 "abc bcd bc abxd", re.ASCII).group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000657 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
658 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300659 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300660 self.assertEqual(re.search(br"\b(b.)\b",
661 b"abcd abc bcd bx").group(1), b"bx")
662 self.assertEqual(re.search(br"\B(b.)\B",
663 b"abc bcd bc abxd").group(1), b"bx")
664 self.assertEqual(re.search(br"\b(b.)\b",
665 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
666 self.assertEqual(re.search(br"\B(b.)\B",
667 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
668 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
669 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300670 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000671 self.assertEqual(re.search(r"\d\D\w\W\s\S",
672 "1aa! a").group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300673 self.assertEqual(re.search(br"\d\D\w\W\s\S",
674 b"1aa! a").group(0), b"1aa! a")
675 self.assertEqual(re.search(r"\d\D\w\W\s\S",
676 "1aa! a", re.ASCII).group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300677 self.assertEqual(re.search(br"\d\D\w\W\s\S",
678 b"1aa! a", re.LOCALE).group(0), b"1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000679
def test_other_escapes(self):
    # Escaped punctuation matches literally; the listed ASCII letters
    # are not accepted as escapes.
    self.checkPatternError("\\", 'bad escape (end of pattern)', 0)
    # Each entry is (pattern, subject matched in full, subject that
    # must not match); None in the last slot means no negative check.
    literal_cases = (
        (r"\(", '(', ')'),
        (r"\\", '\\', None),
        (r"[\]]", ']', '['),
        (r"[a\-c]", '-', 'b'),
        (r"[\^a]+", 'a^', 'b'),
    )
    for pattern, accepted, rejected in literal_cases:
        self.assertEqual(re.match(pattern, accepted).group(), accepted)
        if rejected is not None:
            self.assertIsNone(re.match(pattern, rejected))
    re.purge()  # for warnings
    for letter in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY':
        with self.subTest(letter):
            with self.assertRaises(re.error):
                re.compile('\\' + letter)
    for letter in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
        with self.subTest(letter):
            with self.assertRaises(re.error):
                re.compile('[\\' + letter + ']')
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200698
Ezio Melotti5a045b92012-02-29 11:48:44 +0200699 def test_string_boundaries(self):
700 # See http://bugs.python.org/issue10713
701 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
702 "abc")
703 # There's a word boundary at the start of a string.
704 self.assertTrue(re.match(r"\b", "abc"))
705 # A non-empty string includes a non-boundary zero-length match.
706 self.assertTrue(re.search(r"\B", "abc"))
707 # There is no non-boundary match at the start of a string.
708 self.assertFalse(re.match(r"\B", "abc"))
709 # However, an empty string contains no word boundaries, and also no
710 # non-boundaries.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300711 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200712 # This one is questionable and different from the perlre behaviour,
713 # but describes current behavior.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300714 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200715 # A single word-character string has two boundaries, but no
716 # non-boundary gaps.
717 self.assertEqual(len(re.findall(r"\b", "a")), 2)
718 self.assertEqual(len(re.findall(r"\B", "a")), 0)
719 # If there are no words, there are no boundaries
720 self.assertEqual(len(re.findall(r"\b", " ")), 0)
721 self.assertEqual(len(re.findall(r"\b", " ")), 0)
722 # Can match around the whitespace.
723 self.assertEqual(len(re.findall(r"\B", " ")), 2)
724
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000725 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000726 self.assertEqual(re.match("([\u2222\u2223])",
727 "\u2222").group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300728 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300729 self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000730
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100731 def test_big_codesize(self):
732 # Issue #1160
733 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300734 self.assertTrue(r.match('1000'))
735 self.assertTrue(r.match('9999'))
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100736
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000737 def test_anyall(self):
738 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
739 "a\nb")
740 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
741 "a\n\nb")
742
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200743 def test_lookahead(self):
R David Murray44b548d2016-09-08 13:59:53 -0400744 self.assertEqual(re.match(r"(a(?=\s[^a]))", "a b").group(1), "a")
745 self.assertEqual(re.match(r"(a(?=\s[^a]*))", "a b").group(1), "a")
746 self.assertEqual(re.match(r"(a(?=\s[abc]))", "a b").group(1), "a")
747 self.assertEqual(re.match(r"(a(?=\s[abc]*))", "a bc").group(1), "a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000748 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
749 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
750 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
751
752 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
753 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
754 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
755 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
756
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200757 # Group reference.
758 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
759 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
760 # Conditional group reference.
761 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
762 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
763 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
764 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
765 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
766 # Group used before defined.
767 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
768 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
769 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
770
771 def test_lookbehind(self):
772 self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
773 self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
774 self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
775 self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
776 # Group reference.
777 self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
778 self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
779 self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
780 self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
781 # Conditional group reference.
782 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
783 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
784 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
785 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
786 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
787 # Group used before defined.
788 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
789 self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc'))
790 self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc'))
791 # Group defined in the same lookbehind pattern
792 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)')
793 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)')
794 self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)')
795 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)')
796
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000797 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000798 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300799 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000800 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
801 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
802 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
803 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
804 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
805 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
806 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
807 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
808
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200809 assert '\u212a'.lower() == 'k' # 'K'
810 self.assertTrue(re.match(r'K', '\u212a', re.I))
811 self.assertTrue(re.match(r'k', '\u212a', re.I))
812 self.assertTrue(re.match(r'\u212a', 'K', re.I))
813 self.assertTrue(re.match(r'\u212a', 'k', re.I))
814 assert '\u017f'.upper() == 'S' # 'ſ'
815 self.assertTrue(re.match(r'S', '\u017f', re.I))
816 self.assertTrue(re.match(r's', '\u017f', re.I))
817 self.assertTrue(re.match(r'\u017f', 'S', re.I))
818 self.assertTrue(re.match(r'\u017f', 's', re.I))
819 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
820 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
821 self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
822
823 def test_ignore_case_set(self):
824 self.assertTrue(re.match(r'[19A]', 'A', re.I))
825 self.assertTrue(re.match(r'[19a]', 'a', re.I))
826 self.assertTrue(re.match(r'[19a]', 'A', re.I))
827 self.assertTrue(re.match(r'[19A]', 'a', re.I))
828 self.assertTrue(re.match(br'[19A]', b'A', re.I))
829 self.assertTrue(re.match(br'[19a]', b'a', re.I))
830 self.assertTrue(re.match(br'[19a]', b'A', re.I))
831 self.assertTrue(re.match(br'[19A]', b'a', re.I))
832 assert '\u212a'.lower() == 'k' # 'K'
833 self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
834 self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
835 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
836 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
837 assert '\u017f'.upper() == 'S' # 'ſ'
838 self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
839 self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
840 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
841 self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
842 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
843 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
844 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
845
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200846 def test_ignore_case_range(self):
847 # Issues #3511, #17381.
848 self.assertTrue(re.match(r'[9-a]', '_', re.I))
849 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
850 self.assertTrue(re.match(br'[9-a]', b'_', re.I))
851 self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
852 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
853 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
854 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
855 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
856 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
857 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
858 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
859 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
860 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
861 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
862 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
863 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
864
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200865 assert '\u212a'.lower() == 'k' # 'K'
866 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
867 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
868 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
869 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
870 assert '\u017f'.upper() == 'S' # 'ſ'
871 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
872 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
873 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
874 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
875 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
876 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
877 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
878
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000879 def test_category(self):
880 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
881
882 def test_getlower(self):
883 import _sre
884 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
885 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
886 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200887 self.assertEqual(_sre.getlower(ord('A'), re.ASCII), ord('a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000888
889 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300890 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200891 self.assertEqual(re.match("abc", "ABC", re.I|re.A).group(0), "ABC")
892 self.assertEqual(re.match(b"abc", b"ABC", re.I|re.L).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000893
894 def test_not_literal(self):
R David Murray44b548d2016-09-08 13:59:53 -0400895 self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
896 self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000897
898 def test_search_coverage(self):
R David Murray44b548d2016-09-08 13:59:53 -0400899 self.assertEqual(re.search(r"\s(b)", " b").group(1), "b")
900 self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000901
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    """Assert that matcher(pattern, text) finds the expected match.

    match and span must be given together.  When both are omitted the
    pattern is expected to match the whole of text.  Raises ValueError
    when only one of the two is supplied.
    """
    if (match is None) != (span is None):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if match is None:
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    m = matcher(pattern, text)
    self.assertTrue(m)
    self.assertEqual(m.group(), match)
    self.assertEqual(m.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000915
def test_re_escape(self):
    # re.escape() on each of the first 256 code points, then on all of
    # them at once.
    untouched = string.ascii_letters + string.digits + '_'
    everything = ''.join(map(chr, range(256)))
    for char in everything:
        if char in untouched:
            expected = char
        elif char == '\x00':
            expected = '\\000'
        else:
            expected = '\\' + char
        self.assertEqual(re.escape(char), expected)
        # The escaped form must match the original character.
        self.assertMatch(re.escape(char), char)
    self.assertMatch(re.escape(everything), everything)
Guido van Rossum49946571997-07-18 04:26:25 +0000928
def test_re_escape_byte(self):
    # re.escape() on each single byte value, then on all 256 at once.
    untouched = (string.ascii_letters + string.digits + '_').encode('ascii')
    everything = bytes(range(256))
    for value in everything:
        single = bytes([value])
        if single in untouched:
            expected = single
        elif value == 0:
            expected = b'\\000'
        else:
            expected = b'\\' + single
        self.assertEqual(re.escape(single), expected)
        # The escaped form must match the original byte.
        self.assertMatch(re.escape(single), single)
    self.assertMatch(re.escape(everything), everything)
Guido van Rossum698280d2008-09-10 17:44:35 +0000942
def test_re_escape_non_ascii(self):
    # re.escape() on text containing U+2620.
    skull = '\u2620'
    text = 'xxx' + skull * 3 + 'xxx'
    escaped = re.escape(text)
    self.assertEqual(escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
    self.assertMatch(escaped, text)
    # An escaped character can still take a quantifier.
    repeated = '.%s+.' % re.escape(skull)
    self.assertMatch(repeated, text,
                     match='x\u2620\u2620\u2620x', span=(2, 7),
                     matcher=re.search)
950
def test_re_escape_non_ascii_bytes(self):
    # re.escape() on the UTF-8 encoding of text containing U+2620.
    skull = '\u2620'.encode('utf-8')
    data = 'y\u2620y\u2620y'.encode('utf-8')
    escaped = re.escape(data)
    self.assertEqual(escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
    self.assertMatch(escaped, data)
    occurrences = re.findall(re.escape(skull), data)
    self.assertEqual(len(occurrences), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000958
Serhiy Storchakab85a9762014-09-15 11:33:19 +0300959 def test_pickling(self):
960 import pickle
961 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
962 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
963 pickled = pickle.dumps(oldpat, proto)
964 newpat = pickle.loads(pickled)
965 self.assertEqual(newpat, oldpat)
966 # current pickle expects the _compile() reconstructor in re module
967 from re import _compile
Guido van Rossum23b22571997-07-17 22:36:14 +0000968
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000969 def test_constants(self):
970 self.assertEqual(re.I, re.IGNORECASE)
971 self.assertEqual(re.L, re.LOCALE)
972 self.assertEqual(re.M, re.MULTILINE)
973 self.assertEqual(re.S, re.DOTALL)
974 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000975
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000976 def test_flags(self):
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200977 for flag in [re.I, re.M, re.X, re.S, re.A, re.U]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300978 self.assertTrue(re.compile('^pattern$', flag))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200979 for flag in [re.I, re.M, re.X, re.S, re.A, re.L]:
980 self.assertTrue(re.compile(b'^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +0000981
def test_sre_character_literals(self):
    # Octal, \x, \u and \U character escapes outside of a set.
    for code in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        char = chr(code)
        # (escape template, trailing characters) usable for this code
        # point.  Each trailing character is appended to both pattern
        # and subject, so the escape has to end before it.
        applicable = []
        if code < 256:
            applicable.append((r"\%03o", "08"))
            applicable.append((r"\x%02x", "0z"))
        if code < 0x10000:
            applicable.append((r"\u%04x", "0z"))
        applicable.append((r"\U%08x", "0z"))
        for template, tails in applicable:
            escape = template % code
            for tail in [""] + list(tails):
                self.assertTrue(re.match(escape + tail, char + tail))
    short_octal = (
        (r"\0", "\000"),
        (r"\08", "\0008"),
        (r"\01", "\001"),
        (r"\018", "\0018"),
    )
    for pattern, subject in short_octal:
        self.assertTrue(re.match(pattern, subject))
    self.checkPatternError(r"\567",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 0)
    self.checkPatternError(r"\911", 'invalid group reference', 0)
    incomplete = (
        (r"\x1", r'incomplete escape \x1'),
        (r"\x1z", r'incomplete escape \x1'),
        (r"\u123", r'incomplete escape \u123'),
        (r"\u123z", r'incomplete escape \u123'),
        (r"\U0001234", r'incomplete escape \U0001234'),
        (r"\U0001234z", r'incomplete escape \U0001234'),
    )
    for pattern, message in incomplete:
        self.checkPatternError(pattern, message, 0)
    self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0)
Skip Montanaro7d9963f2003-04-25 14:12:40 +00001013
def test_sre_character_class_literals(self):
    # Octal, \x, \u and \U character escapes inside a set.
    for code in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        char = chr(code)
        # Set templates that must match the bare character.
        templates = []
        if code < 256:
            templates += [r"[\%o]", r"[\%o8]",
                          r"[\%03o]", r"[\%03o0]", r"[\%03o8]",
                          r"[\x%02x]", r"[\x%02x0]", r"[\x%02xz]"]
        if code < 0x10000:
            templates += [r"[\u%04x]", r"[\u%04x0]", r"[\u%04xz]"]
        templates.append(r"[\U%08x]")
        for template in templates:
            self.assertTrue(re.match(template % code, char))
        # These two are matched against the character plus the extra
        # set member.
        self.assertTrue(re.match(r"[\U%08x0]" % code, char + "0"))
        self.assertTrue(re.match(r"[\U%08xz]" % code, char + "z"))
    self.checkPatternError(r"[\567]",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 1)
    failures = (
        (r"[\911]", r'bad escape \9'),
        (r"[\x1z]", r'incomplete escape \x1'),
        (r"[\u123z]", r'incomplete escape \u123'),
        (r"[\U0001234z]", r'incomplete escape \U0001234'),
        (r"[\U00110000]", r'bad escape \U00110000'),
    )
    for pattern, message in failures:
        self.checkPatternError(pattern, message, 1)
    self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +02001041
def test_sre_byte_literals(self):
    # Octal and \x escapes in bytes patterns; \u and \U are not
    # accepted there.
    for value in [0, 8, 16, 32, 64, 127, 128, 255]:
        byte = bytes([value])
        # Each trailing byte is appended to both pattern and subject,
        # so the escape has to end before it.
        for template, tails in ((r"\%03o", (b"", b"0", b"8")),
                                (r"\x%02x", (b"", b"0", b"z"))):
            escape = (template % value).encode()
            for tail in tails:
                self.assertTrue(re.match(escape + tail, byte + tail))
    for pattern in (br"\u1234", br"\U00012345"):
        with self.assertRaises(re.error):
            re.compile(pattern)
    short_octal = (
        (br"\0", b"\000"),
        (br"\08", b"\0008"),
        (br"\01", b"\001"),
        (br"\018", b"\0018"),
    )
    for pattern, subject in short_octal:
        self.assertTrue(re.match(pattern, subject))
    self.checkPatternError(br"\567",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 0)
    self.checkPatternError(br"\911", 'invalid group reference', 0)
    for pattern in (br"\x1", br"\x1z"):
        self.checkPatternError(pattern, r'incomplete escape \x1', 0)
Antoine Pitrou463badf2012-06-23 13:29:19 +02001062
def test_sre_byte_class_literals(self):
    # Octal and \x escapes inside a set in bytes patterns; \u and \U
    # are not accepted there.
    templates = (
        r"[\%o]", r"[\%o8]",
        r"[\%03o]", r"[\%03o0]", r"[\%03o8]",
        r"[\x%02x]", r"[\x%02x0]", r"[\x%02xz]",
    )
    for value in [0, 8, 16, 32, 64, 127, 128, 255]:
        byte = bytes([value])
        for template in templates:
            pattern = (template % value).encode()
            self.assertTrue(re.match(pattern, byte))
    for pattern in (br"[\u1234]", br"[\U00012345]"):
        with self.assertRaises(re.error):
            re.compile(pattern)
    self.checkPatternError(br"[\567]",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 1)
    self.checkPatternError(br"[\911]", r'bad escape \9', 1)
    self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1)
1080
def test_character_set_errors(self):
    # Malformed character sets: missing ']' and invalid ranges.
    unterminated = 'unterminated character set'
    for pattern in (r'[', r'[^', r'[a'):
        self.checkPatternError(pattern, unterminated, 0)
    # bug 545855 -- This pattern failed to cause a compile error as it
    # should, instead provoking a TypeError.
    self.checkPatternError(r"[a-", unterminated, 0)
    bad_ranges = (
        (r"[\w-b]", r'bad character range \w-b'),
        (r"[a-\w]", r'bad character range a-\w'),
        (r"[b-a]", 'bad character range b-a'),
    )
    for pattern, message in bad_ranges:
        self.checkPatternError(pattern, message, 1)
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +00001091
Skip Montanaro7d9963f2003-04-25 14:12:40 +00001092 def test_bug_113254(self):
1093 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
1094 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
1095 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
1096
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001097 def test_bug_527371(self):
1098 # bug described in patches 527371/672491
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001099 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001100 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
1101 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
R David Murray44b548d2016-09-08 13:59:53 -04001102 self.assertEqual(re.match(r"(?P<a>a(b))", "ab").lastgroup, 'a')
1103 self.assertEqual(re.match(r"((a))", "a").lastindex, 1)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001104
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001105 def test_bug_418626(self):
1106 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
1107 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
1108 # pattern '*?' on a long string.
1109 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
1110 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
1111 20003)
1112 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001113 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +00001114 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001115 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001116
1117 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001118 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001119 self.assertEqual(re.compile(pat) and 1, 1)
1120
Skip Montanaro1e703c62003-04-25 15:40:28 +00001121 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001122 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +00001123 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001124 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
1125 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
1126 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +00001127
def test_nothing_to_repeat(self):
    # Every quantifier, greedy and lazy, is checked both at the very start
    # of a pattern and at the start of a non-capturing group.
    quantifiers = [reps + mod
                   for reps in ('*', '+', '?', '{1,2}')
                   for mod in ('', '?')]
    for quantifier in quantifiers:
        self.checkPatternError(quantifier, 'nothing to repeat', 0)
        self.checkPatternError('(?:' + quantifier + ')',
                               'nothing to repeat', 3)
1135
def test_multiple_repeat(self):
    # Stacking one quantifier directly on another is checked for every
    # inner/outer combination; the third argument passed to
    # checkPatternError() is the offset just past 'x' plus the inner
    # quantifier.
    laziness = ('', '?')
    outer_ops = [reps + mod
                 for reps in ('*', '+', '{1,2}') for mod in laziness]
    inner_ops = [reps + mod
                 for reps in ('*', '+', '?', '{1,2}') for mod in laziness]
    for outer_op in outer_ops:
        for inner_op in inner_ops:
            self.checkPatternError(r'x' + inner_op + outer_op,
                                   'multiple repeat', 1 + len(inner_op))
1145
Serhiy Storchakafa468162013-02-16 21:23:53 +02001146 def test_unlimited_zero_width_repeat(self):
1147 # Issue #9669
1148 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
1149 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
1150 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
1151 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
1152 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
1153 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
1154
Skip Montanaro1e703c62003-04-25 15:40:28 +00001155 def test_scanner(self):
1156 def s_ident(scanner, token): return token
1157 def s_operator(scanner, token): return "op%s" % token
1158 def s_float(scanner, token): return float(token)
1159 def s_int(scanner, token): return int(token)
1160
1161 scanner = Scanner([
1162 (r"[a-zA-Z_]\w*", s_ident),
1163 (r"\d+\.\d*", s_float),
1164 (r"\d+", s_int),
1165 (r"=|\+|-|\*|/", s_operator),
1166 (r"\s+", None),
1167 ])
1168
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001169 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +00001170
Skip Montanaro1e703c62003-04-25 15:40:28 +00001171 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
1172 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
1173 'op+', 'bar'], ''))
1174
Skip Montanaro5ba00542003-04-25 16:00:14 +00001175 def test_bug_448951(self):
1176 # bug 448951 (similar to 429357, but with single char match)
1177 # (Also test greedy matches.)
1178 for op in '','?','*':
1179 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
1180 (None, None))
1181 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
1182 ('a:', 'a'))
1183
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001184 def test_bug_725106(self):
1185 # capturing groups in alternatives in repeats
1186 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
1187 ('b', 'a'))
1188 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
1189 ('c', 'b'))
1190 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
1191 ('b', None))
1192 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
1193 ('b', None))
1194 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
1195 ('b', 'a'))
1196 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
1197 ('c', 'b'))
1198 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
1199 ('b', None))
1200 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
1201 ('b', None))
1202
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +00001203 def test_bug_725149(self):
1204 # mark_stack_base restoring before restoring marks
1205 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
1206 ('a', None))
1207 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
1208 ('a', None, None))
1209
Just van Rossum12723ba2003-07-02 20:03:04 +00001210 def test_bug_764548(self):
1211 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001212 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +00001213 pat = re.compile(my_unicode("abc"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001214 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +00001215
Skip Montanaro5ba00542003-04-25 16:00:14 +00001216 def test_finditer(self):
1217 iter = re.finditer(r":+", "a:b::c:::d")
1218 self.assertEqual([item.group(0) for item in iter],
1219 [":", "::", ":::"])
1220
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001221 pat = re.compile(r":+")
1222 iter = pat.finditer("a:b::c:::d", 1, 10)
1223 self.assertEqual([item.group(0) for item in iter],
1224 [":", "::", ":::"])
1225
1226 pat = re.compile(r":+")
1227 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
1228 self.assertEqual([item.group(0) for item in iter],
1229 [":", "::", ":::"])
1230
1231 pat = re.compile(r":+")
1232 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1233 self.assertEqual([item.group(0) for item in iter],
1234 [":", "::", ":::"])
1235
1236 pat = re.compile(r":+")
1237 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1238 self.assertEqual([item.group(0) for item in iter],
1239 ["::", "::"])
1240
Thomas Wouters40a088d2008-03-18 20:19:54 +00001241 def test_bug_926075(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001242 self.assertIsNot(re.compile('bug_926075'),
1243 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +00001244
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001245 def test_bug_931848(self):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001246 pattern = "[\u002E\u3002\uFF0E\uFF61]"
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001247 self.assertEqual(re.compile(pattern).split("a.b.c"),
1248 ['a','b','c'])
1249
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001250 def test_bug_581080(self):
1251 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +00001252 self.assertEqual(next(iter).span(), (1,2))
1253 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001254
1255 scanner = re.compile(r"\s").scanner("a b")
1256 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001257 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001258
1259 def test_bug_817234(self):
1260 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +00001261 self.assertEqual(next(iter).span(), (0, 4))
1262 self.assertEqual(next(iter).span(), (4, 4))
1263 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001264
Mark Dickinson1f268282009-07-28 17:22:36 +00001265 def test_bug_6561(self):
1266 # '\d' should match characters in Unicode category 'Nd'
1267 # (Number, Decimal Digit), but not those in 'Nl' (Number,
1268 # Letter) or 'No' (Number, Other).
1269 decimal_digits = [
1270 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1271 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1272 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1273 ]
1274 for x in decimal_digits:
R David Murray44b548d2016-09-08 13:59:53 -04001275 self.assertEqual(re.match(r'^\d$', x).group(0), x)
Mark Dickinson1f268282009-07-28 17:22:36 +00001276
1277 not_decimal_digits = [
1278 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1279 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1280 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1281 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1282 ]
1283 for x in not_decimal_digits:
R David Murray44b548d2016-09-08 13:59:53 -04001284 self.assertIsNone(re.match(r'^\d$', x))
Mark Dickinson1f268282009-07-28 17:22:36 +00001285
Guido van Rossumd8faa362007-04-27 19:54:29 +00001286 def test_empty_array(self):
1287 # SF buf 1647541
1288 import array
Guido van Rossum166746c2007-07-03 15:39:16 +00001289 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +00001290 a = array.array(typecode)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001291 self.assertIsNone(re.compile(b"bla").match(a))
Antoine Pitroufd036452008-08-19 17:56:33 +00001292 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001293
Christian Heimes072c0f12008-01-03 23:01:04 +00001294 def test_inline_flags(self):
1295 # Bug #1700
Serhiy Storchakaab140882014-11-11 21:13:28 +02001296 upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
1297 lower_char = '\u1ea1' # Latin Small Letter A with Dot Below
Christian Heimes072c0f12008-01-03 23:01:04 +00001298
1299 p = re.compile(upper_char, re.I | re.U)
1300 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001301 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001302
1303 p = re.compile(lower_char, re.I | re.U)
1304 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001305 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001306
1307 p = re.compile('(?i)' + upper_char, re.U)
1308 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001309 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001310
1311 p = re.compile('(?i)' + lower_char, re.U)
1312 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001313 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001314
1315 p = re.compile('(?iu)' + upper_char)
1316 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001317 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001318
1319 p = re.compile('(?iu)' + lower_char)
1320 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001321 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001322
Serhiy Storchakad65cd092016-09-11 01:39:01 +03001323 self.assertTrue(re.match('(?ixu) ' + upper_char, lower_char))
1324 self.assertTrue(re.match('(?ixu) ' + lower_char, upper_char))
1325
Serhiy Storchakaabf275a2016-09-17 01:29:58 +03001326 p = upper_char + '(?i)'
1327 with self.assertWarns(DeprecationWarning) as warns:
1328 self.assertTrue(re.match(p, lower_char))
1329 self.assertEqual(
1330 str(warns.warnings[0].message),
1331 'Flags not at the start of the expression %s' % p
1332 )
1333
1334 p = upper_char + '(?i)%s' % ('.?' * 100)
1335 with self.assertWarns(DeprecationWarning) as warns:
1336 self.assertTrue(re.match(p, lower_char))
1337 self.assertEqual(
1338 str(warns.warnings[0].message),
1339 'Flags not at the start of the expression %s (truncated)' % p[:20]
1340 )
Serhiy Storchakabd48d272016-09-11 12:50:02 +03001341
Christian Heimes25bb7832008-01-11 16:17:00 +00001342 def test_dollar_matches_twice(self):
1343 "$ matches the end of string, and just before the terminating \n"
1344 pattern = re.compile('$')
1345 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1346 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1347 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1348
1349 pattern = re.compile('$', re.MULTILINE)
1350 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1351 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1352 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1353
Antoine Pitroufd036452008-08-19 17:56:33 +00001354 def test_bytes_str_mixing(self):
1355 # Mixing str and bytes is disallowed
1356 pat = re.compile('.')
1357 bpat = re.compile(b'.')
1358 self.assertRaises(TypeError, pat.match, b'b')
1359 self.assertRaises(TypeError, bpat.match, 'b')
1360 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1361 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1362 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1363 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1364 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1365 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1366
1367 def test_ascii_and_unicode_flag(self):
1368 # String patterns
1369 for flags in (0, re.UNICODE):
1370 pat = re.compile('\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001371 self.assertTrue(pat.match('\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001372 pat = re.compile(r'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001373 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001374 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001375 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001376 pat = re.compile('(?a)\xc0', re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001377 self.assertIsNone(pat.match('\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001378 pat = re.compile(r'\w', re.ASCII)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001379 self.assertIsNone(pat.match('\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001380 pat = re.compile(r'(?a)\w')
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001381 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001382 # Bytes patterns
1383 for flags in (0, re.ASCII):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001384 pat = re.compile(b'\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001385 self.assertIsNone(pat.match(b'\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001386 pat = re.compile(br'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001387 self.assertIsNone(pat.match(b'\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001388 # Incompatibilities
R David Murray44b548d2016-09-08 13:59:53 -04001389 self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE)
1390 self.assertRaises(ValueError, re.compile, br'(?u)\w')
1391 self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII)
1392 self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII)
1393 self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE)
1394 self.assertRaises(ValueError, re.compile, r'(?au)\w')
Antoine Pitroufd036452008-08-19 17:56:33 +00001395
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001396 def test_locale_flag(self):
1397 import locale
1398 _, enc = locale.getlocale(locale.LC_CTYPE)
1399 # Search non-ASCII letter
1400 for i in range(128, 256):
1401 try:
1402 c = bytes([i]).decode(enc)
1403 sletter = c.lower()
1404 if sletter == c: continue
1405 bletter = sletter.encode(enc)
1406 if len(bletter) != 1: continue
1407 if bletter.decode(enc) != sletter: continue
1408 bpat = re.escape(bytes([i]))
1409 break
1410 except (UnicodeError, TypeError):
1411 pass
1412 else:
1413 bletter = None
1414 bpat = b'A'
1415 # Bytes patterns
1416 pat = re.compile(bpat, re.LOCALE | re.IGNORECASE)
1417 if bletter:
1418 self.assertTrue(pat.match(bletter))
1419 pat = re.compile(b'(?L)' + bpat, re.IGNORECASE)
1420 if bletter:
1421 self.assertTrue(pat.match(bletter))
1422 pat = re.compile(bpat, re.IGNORECASE)
1423 if bletter:
1424 self.assertIsNone(pat.match(bletter))
R David Murray44b548d2016-09-08 13:59:53 -04001425 pat = re.compile(br'\w', re.LOCALE)
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001426 if bletter:
1427 self.assertTrue(pat.match(bletter))
R David Murray44b548d2016-09-08 13:59:53 -04001428 pat = re.compile(br'(?L)\w')
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001429 if bletter:
1430 self.assertTrue(pat.match(bletter))
R David Murray44b548d2016-09-08 13:59:53 -04001431 pat = re.compile(br'\w')
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001432 if bletter:
1433 self.assertIsNone(pat.match(bletter))
1434 # Incompatibilities
Serhiy Storchaka9bd85b82016-06-11 19:15:00 +03001435 self.assertRaises(ValueError, re.compile, '', re.LOCALE)
1436 self.assertRaises(ValueError, re.compile, '(?L)')
1437 self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII)
1438 self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII)
1439 self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE)
1440 self.assertRaises(ValueError, re.compile, b'(?aL)')
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001441
def test_scoped_flags(self):
    # Flags switched on or off for a single group only.
    # (pattern, subject, compile flags, whether a match is expected)
    match_cases = (
        (r'(?i:a)b', 'Ab', 0, True),
        (r'(?i:a)b', 'aB', 0, False),
        (r'(?-i:a)b', 'Ab', re.IGNORECASE, False),
        (r'(?-i:a)b', 'aB', re.IGNORECASE, True),
        (r'(?i:(?-i:a)b)', 'Ab', 0, False),
        (r'(?i:(?-i:a)b)', 'aB', 0, True),

        (r'(?x: a) b', 'a b', 0, True),
        (r'(?x: a) b', ' a b', 0, False),
        (r'(?-x: a) b', ' ab', re.VERBOSE, True),
        (r'(?-x: a) b', 'ab', re.VERBOSE, False),
    )
    for pattern, subject, flags, should_match in match_cases:
        m = re.match(pattern, subject, flags)
        if should_match:
            self.assertTrue(m)
        else:
            self.assertIsNone(m)

    # Malformed or contradictory scoped flags:
    # (pattern, error message, third checkPatternError() argument)
    error_cases = (
        (r'(?a:\w)',
         'bad inline flags: cannot turn on global flag', 3),
        (r'(?a)(?-a:\w)',
         'bad inline flags: cannot turn off global flag', 8),
        (r'(?i-i:a)',
         'bad inline flags: flag turned on and off', 5),

        (r'(?-', 'missing flag', 3),
        (r'(?-+', 'missing flag', 3),
        (r'(?-z', 'unknown flag', 3),
        (r'(?-i', 'missing :', 4),
        (r'(?-i)', 'missing :', 4),
        (r'(?-i+', 'missing :', 4),
        (r'(?-iz', 'unknown flag', 4),
        (r'(?i:', 'missing ), unterminated subpattern', 0),
        (r'(?i', 'missing -, : or )', 3),
        (r'(?i+', 'missing -, : or )', 3),
        (r'(?iz', 'unknown flag', 3),
    )
    for pattern, errmsg, pos in error_cases:
        self.checkPatternError(pattern, errmsg, pos)
1473
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001474 def test_bug_6509(self):
1475 # Replacement strings of both types must parse properly.
1476 # all strings
R David Murray44b548d2016-09-08 13:59:53 -04001477 pat = re.compile(r'a(\w)')
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001478 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1479 pat = re.compile('a(.)')
1480 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1481 pat = re.compile('..')
1482 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1483
1484 # all bytes
R David Murray44b548d2016-09-08 13:59:53 -04001485 pat = re.compile(br'a(\w)')
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001486 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1487 pat = re.compile(b'a(.)')
1488 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1489 pat = re.compile(b'..')
1490 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1491
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001492 def test_dealloc(self):
1493 # issue 3299: check for segfault in debug build
1494 import _sre
Ezio Melotti0f77f462010-01-23 10:49:39 +00001495 # the overflow limit is different on wide and narrow builds and it
1496 # depends on the definition of SRE_CODE (see sre.h).
1497 # 2**128 should be big enough to overflow on both. For smaller values
1498 # a RuntimeError is raised instead of OverflowError.
1499 long_overflow = 2**128
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001500 self.assertRaises(TypeError, re.finditer, "a", {})
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001501 with self.assertRaises(OverflowError):
1502 _sre.compile("abc", 0, [long_overflow], 0, [], [])
1503 with self.assertRaises(TypeError):
1504 _sre.compile({}, 0, [], 0, [], [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001505
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001506 def test_search_dot_unicode(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001507 self.assertTrue(re.search("123.*-", '123abc-'))
1508 self.assertTrue(re.search("123.*-", '123\xe9-'))
1509 self.assertTrue(re.search("123.*-", '123\u20ac-'))
1510 self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1511 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001512
Ezio Melottidf723e12012-03-13 01:29:48 +02001513 def test_compile(self):
1514 # Test return value when given string and pattern as parameter
1515 pattern = re.compile('random pattern')
1516 self.assertIsInstance(pattern, re._pattern_type)
1517 same_pattern = re.compile(pattern)
1518 self.assertIsInstance(same_pattern, re._pattern_type)
1519 self.assertIs(same_pattern, pattern)
1520 # Test behaviour when not given a string or pattern as parameter
1521 self.assertRaises(TypeError, re.compile, 0)
1522
@bigmemtest(size=_2G, memuse=1)
def test_large_search(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    haystack = 'a' * size
    m = re.search('$', haystack)
    self.assertIsNotNone(m)
    # '$' matches at the very end, so both offsets equal the length.
    for offset in (m.start(), m.end()):
        self.assertEqual(offset, size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001531
# The huge memuse is because of re.sub() using a list and a join()
# to create the replacement result.
@bigmemtest(size=_2G, memuse=16 + 2)
def test_large_subn(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    s = 'a' * size
    outcome = re.subn('', '', s)
    replaced, count = outcome
    self.assertEqual(replaced, s)
    # The empty pattern matches around every character: size + 1 times.
    self.assertEqual(count, size + 1)
1541
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001542 def test_bug_16688(self):
1543 # Issue 16688: Backreferences make case-insensitive regex fail on
1544 # non-ASCII strings.
1545 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1546 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001547
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001548 def test_repeat_minmax_overflow(self):
1549 # Issue #13169
1550 string = "x" * 100000
1551 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1552 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1553 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1554 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1555 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1556 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1557 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1558 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1559 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1560 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1561 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1562
@cpython_only
def test_repeat_minmax_overflow_maxrepeat(self):
    try:
        from _sre import MAXREPEAT
    except ImportError:
        self.skipTest('requires _sre.MAXREPEAT constant')
    string = "x" * 100000
    # One below MAXREPEAT is still accepted as a repeat count.
    largest_ok = MAXREPEAT - 1
    self.assertIsNone(re.match(r".{%d}" % largest_ok, string))
    upper_bounded = re.match(r".{,%d}" % largest_ok, string)
    self.assertEqual(upper_bounded.span(), (0, 100000))
    self.assertIsNone(re.match(r".{%d,}?" % largest_ok, string))
    # MAXREPEAT itself must be rejected when the pattern is compiled.
    for template in (r".{%d}", r".{,%d}", r".{%d,}?"):
        self.assertRaises(OverflowError, re.compile, template % MAXREPEAT)
1577
def test_backref_group_name_in_exception(self):
    # Issue 17341: Poor error message when compiling invalid regex
    pattern = '(?P=<foo>)'
    expected_message = "bad character in group name '<foo>'"
    self.checkPatternError(pattern, expected_message, 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001582
def test_group_name_in_exception(self):
    # Issue 17341: Poor error message when compiling invalid regex
    pattern = '(?P<?foo>)'
    expected_message = "bad character in group name '?foo'"
    self.checkPatternError(pattern, expected_message, 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001587
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001588 def test_issue17998(self):
1589 for reps in '*', '+', '?', '{1}':
1590 for mod in '', '?':
1591 pattern = '.' + reps + mod + 'yz'
1592 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1593 ['xyz'], msg=pattern)
1594 pattern = pattern.encode()
1595 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1596 [b'xyz'], msg=pattern)
1597
def test_match_repr(self):
    # repr() of a match object names its type and shows the span and the
    # matched text.  S and B are the str/bytes subclasses defined at the
    # top of this file.
    for string in '[abracadabra]', S('[abracadabra]'):
        m = re.search(r'(.+)(.*?)\1', string)
        self.assertEqual(repr(m), "<%s.%s object; "
                                  "span=(1, 12), match='abracadabra'>" %
                         (type(m).__module__, type(m).__qualname__))
    for string in (b'[abracadabra]', B(b'[abracadabra]'),
                   bytearray(b'[abracadabra]'),
                   memoryview(b'[abracadabra]')):
        m = re.search(br'(.+)(.*?)\1', string)
        self.assertEqual(repr(m), "<%s.%s object; "
                                  "span=(1, 12), match=b'abracadabra'>" %
                         (type(m).__module__, type(m).__qualname__))

    first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
    # Each expected repr is built from the type of the match it describes
    # (the module for `first` was previously taken from `second`).
    self.assertEqual(repr(first), "<%s.%s object; "
                                  "span=(0, 2), match='aa'>" %
                     (type(first).__module__, type(first).__qualname__))
    self.assertEqual(repr(second), "<%s.%s object; "
                                   "span=(3, 5), match='bb'>" %
                     (type(second).__module__, type(second).__qualname__))
1619
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001620
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001621 def test_bug_2537(self):
1622 # issue 2537: empty submatches
1623 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1624 for inner_op in ('{0,}', '*', '?'):
1625 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1626 m = r.match("xyyzy")
1627 self.assertEqual(m.group(0), "xyy")
1628 self.assertEqual(m.group(1), "")
1629 self.assertEqual(m.group(2), "y")
1630
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001631 def test_debug_flag(self):
        # Compile `pat` with re.DEBUG while stdout is captured and compare
        # what was printed with the expected dump below.  The comparison is
        # exact, so the whitespace inside the `dump` literal is significant.
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001632 pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001633 with captured_stdout() as out:
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001634 re.compile(pat, re.DEBUG)
1635 dump = '''\
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +03001636SUBPATTERN 1 0 0
Serhiy Storchakac7f7d382014-11-09 20:48:36 +02001637 LITERAL 46
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +03001638SUBPATTERN None 0 0
Serhiy Storchakac7f7d382014-11-09 20:48:36 +02001639 BRANCH
1640 IN
1641 LITERAL 99
1642 LITERAL 104
1643 OR
1644 LITERAL 112
1645 LITERAL 121
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +03001646SUBPATTERN None 0 0
Serhiy Storchakac7f7d382014-11-09 20:48:36 +02001647 GROUPREF_EXISTS 1
1648 AT AT_END
1649 ELSE
1650 LITERAL 58
1651 LITERAL 32
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001652'''
1653 self.assertEqual(out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001654 # Debug output is output again even a second time (bypassing
1655 # the cache -- issue #20426).
1656 with captured_stdout() as out:
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001657 re.compile(pat, re.DEBUG)
1658 self.assertEqual(out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001659
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001660 def test_keyword_parameters(self):
1661 # Issue #20283: Accepting the string keyword parameter.
1662 pat = re.compile(r'(ab)')
1663 self.assertEqual(
1664 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1665 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001666 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1667 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001668 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1669 self.assertEqual(
1670 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1671 self.assertEqual(
1672 pat.split(string='abracadabra', maxsplit=1),
1673 ['', 'ab', 'racadabra'])
1674 self.assertEqual(
1675 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1676 (7, 9))
1677
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001678 def test_bug_20998(self):
1679 # Issue #20998: Fullmatch of repeated single character pattern
1680 # with ignore case.
1681 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1682
def test_locale_caching(self):
    # Issue #22410
    saved_locale = locale.setlocale(locale.LC_CTYPE)
    self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_locale)
    required_locales = ('en_US.iso88591', 'en_US.utf8')
    for loc in required_locales:
        try:
            locale.setlocale(locale.LC_CTYPE, loc)
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs %s locale' % loc)

    # Run the two locale checks in both orders, each time starting from
    # an empty pattern cache.
    forward = (self.check_en_US_iso88591, self.check_en_US_utf8)
    for checks in (forward, forward[::-1]):
        re.purge()
        for check in checks:
            check()
1700
def check_en_US_iso88591(self):
    # Under this locale the bytes 0xC5 and 0xE5 must match each other
    # case-insensitively.
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    # (pattern, subject)
    pairs = (
        (b'\xc5\xe5', b'\xc5\xe5'),
        (b'\xc5', b'\xe5'),
        (b'\xe5', b'\xc5'),
    )
    # Flags passed as arguments first, then as an inline prefix.
    for pattern, subject in pairs:
        self.assertTrue(re.match(pattern, subject, re.L|re.I))
    for pattern, subject in pairs:
        self.assertTrue(re.match(b'(?Li)' + pattern, subject))
1709
def check_en_US_utf8(self):
    # Under this locale the bytes 0xC5 and 0xE5 must not match each other
    # case-insensitively; only the identical byte string matches.
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    # (pattern, subject, assertion applied to the match result)
    cases = (
        (b'\xc5\xe5', b'\xc5\xe5', self.assertTrue),
        (b'\xc5', b'\xe5', self.assertIsNone),
        (b'\xe5', b'\xc5', self.assertIsNone),
    )
    # Flags passed as arguments first, then as an inline prefix.
    for pattern, subject, check in cases:
        check(re.match(pattern, subject, re.L|re.I))
    for pattern, subject, check in cases:
        check(re.match(b'(?Li)' + pattern, subject))
1718
Serhiy Storchakaad446d52014-11-10 13:49:00 +02001719 def test_error(self):
1720 with self.assertRaises(re.error) as cm:
1721 re.compile('(\u20ac))')
1722 err = cm.exception
1723 self.assertIsInstance(err.pattern, str)
1724 self.assertEqual(err.pattern, '(\u20ac))')
1725 self.assertEqual(err.pos, 3)
1726 self.assertEqual(err.lineno, 1)
1727 self.assertEqual(err.colno, 4)
1728 self.assertIn(err.msg, str(err))
1729 self.assertIn(' at position 3', str(err))
1730 self.assertNotIn(' at position 3', err.msg)
1731 # Bytes pattern
1732 with self.assertRaises(re.error) as cm:
1733 re.compile(b'(\xa4))')
1734 err = cm.exception
1735 self.assertIsInstance(err.pattern, bytes)
1736 self.assertEqual(err.pattern, b'(\xa4))')
1737 self.assertEqual(err.pos, 3)
1738 # Multiline pattern
1739 with self.assertRaises(re.error) as cm:
1740 re.compile("""
1741 (
1742 abc
1743 )
1744 )
1745 (
1746 """, re.VERBOSE)
1747 err = cm.exception
1748 self.assertEqual(err.pos, 77)
1749 self.assertEqual(err.lineno, 5)
1750 self.assertEqual(err.colno, 17)
1751 self.assertIn(err.msg, str(err))
1752 self.assertIn(' at position 77', str(err))
1753 self.assertIn('(line 5, column 17)', str(err))
1754
def test_misc_errors(self):
    # Malformed group / extension syntax.  Each row is handed to
    # self.checkPatternError (defined elsewhere in this class) as
    # (pattern, expected message, number) -- the number is presumably
    # the expected error position; confirm against that helper.
    cases = (
        (r'(', 'missing ), unterminated subpattern', 0),
        (r'((a|b)', 'missing ), unterminated subpattern', 0),
        (r'(a|b))', 'unbalanced parenthesis', 5),
        (r'(?P', 'unexpected end of pattern', 3),
        (r'(?z)', 'unknown extension ?z', 1),
        (r'(?iz)', 'unknown flag', 3),
        (r'(?i', 'missing -, : or )', 3),
        (r'(?#abc', 'missing ), unterminated comment', 0),
        (r'(?<', 'unexpected end of pattern', 3),
        (r'(?<>)', 'unknown extension ?<>', 1),
        (r'(?', 'unexpected end of pattern', 2),
    )
    for pattern, errmsg, pos in cases:
        self.checkPatternError(pattern, errmsg, pos)
1767
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001768
class PatternReprTests(unittest.TestCase):
    """Tests for the ``repr()`` of compiled pattern objects."""

    def check(self, pattern, expected):
        # Compile with default flags and compare the repr text.
        compiled = re.compile(pattern)
        self.assertEqual(repr(compiled), expected)

    def check_flags(self, pattern, flags, expected):
        # Compile with explicit flags and compare the repr text.
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        expected = "re.compile('random pattern')"
        self.check('random pattern', expected)

    def test_single_flag(self):
        expected = "re.compile('random pattern', re.IGNORECASE)"
        self.check_flags('random pattern', re.IGNORECASE, expected)

    def test_multiple_flags(self):
        flags = re.I|re.S|re.X
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.check_flags('random pattern', flags, expected)

    def test_unicode_flag(self):
        # re.U is expected to be left out of the repr of a str pattern,
        # whether it is the only flag or combined with others.
        self.check_flags('random pattern', re.U,
                         "re.compile('random pattern')")
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL)")
        self.check_flags('random pattern', re.I|re.S|re.U, expected)

    def test_inline_flags(self):
        # A flag set inline in the pattern shows up in the repr.
        expected = "re.compile('(?i)pattern', re.IGNORECASE)"
        self.check('(?i)pattern', expected)

    def test_unknown_flags(self):
        # Flag bits without a name are expected to be rendered in hex,
        # after any named flags.
        unknown = 0x123000
        self.check_flags('random pattern', unknown,
                         "re.compile('random pattern', 0x123000)")
        self.check_flags('random pattern', unknown|re.I,
                         "re.compile('random pattern', re.IGNORECASE|0x123000)")

    def test_bytes(self):
        self.check(b'bytes pattern',
                   "re.compile(b'bytes pattern')")
        self.check_flags(b'bytes pattern', re.A,
                         "re.compile(b'bytes pattern', re.ASCII)")

    def test_locale(self):
        expected = "re.compile(b'bytes pattern', re.LOCALE)"
        self.check_flags(b'bytes pattern', re.L, expected)

    def test_quotes(self):
        # The repr of the pattern string picks its quoting the way
        # repr() of a str does, as the expected strings below spell out.
        cases = (
            ('random "double quoted" pattern',
             '''re.compile('random "double quoted" pattern')'''),
            ("random 'single quoted' pattern",
             '''re.compile("random 'single quoted' pattern")'''),
            ('''both 'single' and "double" quotes''',
             '''re.compile('both \\'single\\' and "double" quotes')'''),
        )
        for pattern, expected in cases:
            self.check(pattern, expected)

    def test_long_pattern(self):
        # A very long pattern must produce a repr shorter than 300
        # characters that still starts with the pattern and, when
        # flags are set, still ends with them.
        pattern = 'Very %spattern' % ('long ' * 1000)
        head = "re.compile('Very long long lon"
        tail = ", re.IGNORECASE)"

        text = repr(re.compile(pattern))
        self.assertLess(len(text), 300)
        self.assertEqual(text[:len(head)], head)

        text = repr(re.compile(pattern, re.I))
        self.assertLess(len(text), 300)
        self.assertEqual(text[:len(head)], head)
        self.assertEqual(text[-len(tail):], tail)
1833
1834
class ImplementationTest(unittest.TestCase):
    """
    Tests that poke at internals of the re implementation.
    """

    def test_overlap_table(self):
        # Each row pairs an input string with the list that
        # sre_compile._generate_overlap_table must return for it.
        overlap = sre_compile._generate_overlap_table
        expectations = (
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        )
        for prefix, table in expectations:
            self.assertEqual(overlap(prefix), table)
1848
1849
class ExternalTests(unittest.TestCase):
    """Data-driven tests fed from the tables in ``test.re_tests``."""

    def test_re_benchmarks(self):
        're_tests benchmarks'
        from test.re_tests import benchmarks
        pad = 10000
        for pattern, s in benchmarks:
            with self.subTest(pattern=pattern, string=s):
                p = re.compile(pattern)
                self.assertTrue(p.search(s))
                self.assertTrue(p.match(s))
                self.assertTrue(p.fullmatch(s))
                # Same subject surrounded by `pad` spaces on each side,
                # matched through explicit pos/endpos arguments.
                s2 = ' '*pad + s + ' '*pad
                self.assertTrue(p.search(s2))
                self.assertTrue(p.match(s2, pad))
                self.assertTrue(p.match(s2, pad, pad + len(s)))
                self.assertTrue(p.fullmatch(s2, pad, pad + len(s)))

    @staticmethod
    def _repl_namespace(match):
        # Build the names a test's `repl` expression may refer to:
        # 'found', 'groups', 'flags', g1..g99 and every named group.
        namespace = {'found': match.group(0),
                     'groups': match.group(),
                     'flags': match.re.flags}
        for i in range(1, 100):
            try:
                gi = match.group(i)
                # Special hack because else the string concat fails:
                if gi is None:
                    gi = "None"
            except IndexError:
                # The pattern has fewer than i groups.
                gi = "Error"
            namespace['g%d' % i] = gi
        for name in match.re.groupindex:
            try:
                gi = match.group(name)
                if gi is None:
                    gi = "None"
            except IndexError:
                gi = "Error"
            namespace[name] = gi
        return namespace

    def test_re_tests(self):
        're_tests test suite'
        from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
        for t in tests:
            pattern = s = outcome = repl = expected = None
            if len(t) == 5:
                pattern, s, outcome, repl, expected = t
            elif len(t) == 3:
                pattern, s, outcome = t
            else:
                raise ValueError('Test tuples should have 3 or 5 fields', t)

            with self.subTest(pattern=pattern, string=s):
                if outcome == SYNTAX_ERROR:  # Expected a syntax error
                    with self.assertRaises(re.error):
                        re.compile(pattern)
                    continue

                obj = re.compile(pattern)
                result = obj.search(s)
                if outcome == FAIL:
                    self.assertIsNone(result, 'Succeeded incorrectly')
                    continue

                # A failure inside a subTest is recorded and execution
                # carries on after the `with` block, so everything
                # below must cope with `result` being None.
                with self.subTest():
                    self.assertTrue(result, 'Failed incorrectly')
                    # Matched, as expected, so now we compute the
                    # result string and compare it to our expected result.
                    start, end = result.span(0)
                    vardict = self._repl_namespace(result)
                    # NOTE: eval() of `repl`; it comes from the
                    # test.re_tests table imported above, not from
                    # external input.
                    self.assertEqual(eval(repl, vardict), expected,
                                     'grouping error')

                # Try the match with both pattern and string converted to
                # bytes, and check that it still succeeds.
                try:
                    bpat = bytes(pattern, "ascii")
                    bs = bytes(s, "ascii")
                except UnicodeEncodeError:
                    # skip non-ascii tests
                    pass
                else:
                    with self.subTest('bytes pattern match'):
                        obj = re.compile(bpat)
                        self.assertTrue(obj.search(bs))

                    # Try the match with LOCALE enabled, and check that it
                    # still succeeds.  A miss is only reported on stdout,
                    # not counted as a failure.  The outcome is kept in
                    # its own variable so it cannot replace `result`,
                    # which the range-limited check below relies on.
                    with self.subTest('locale-sensitive match'):
                        obj = re.compile(bpat, re.LOCALE)
                        locale_result = obj.search(bs)
                        if locale_result is None:
                            print('=== Fails on locale-sensitive match', t)

                # Try the match with the search area limited to the extent
                # of the match and see if it still succeeds.  \B will
                # break (because it won't match at the end or start of a
                # string), so we'll ignore patterns that feature it.
                # `result is not None` also guarantees that start/end
                # were assigned for this very test tuple.
                if (pattern[:2] != r'\B' and pattern[-2:] != r'\B'
                        and result is not None):
                    with self.subTest('range-limited match'):
                        obj = re.compile(pattern)
                        self.assertTrue(obj.search(s, start, end + 1))

                # Try the match with IGNORECASE enabled, and check that it
                # still succeeds.
                with self.subTest('case-insensitive match'):
                    obj = re.compile(pattern, re.IGNORECASE)
                    self.assertTrue(obj.search(s))

                # Try the match with UNICODE locale enabled, and check
                # that it still succeeds.
                with self.subTest('unicode-sensitive match'):
                    obj = re.compile(pattern, re.UNICODE)
                    self.assertTrue(obj.search(s))
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001961
Gregory P. Smith5a631832010-07-27 05:31:29 +00001962
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()