blob: 5b716125f97400884386a2464c7fe044a1f06746 [file] [log] [blame]
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
Antoine Pitroud2cc7432014-02-03 20:59:59 +01002 cpython_only, captured_stdout
Benjamin Petersone48944b2012-03-07 14:50:25 -06003import io
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02004import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00005import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00006from re import Scanner
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02007import sre_compile
R David Murray26dfaac92013-04-14 13:00:54 -04008import sre_constants
Ezio Melottid2114eb2011-03-25 14:08:44 +02009import sys
10import string
11import traceback
Antoine Pitrou79aa68d2013-10-25 21:36:10 +020012import unittest
Raymond Hettinger027bb632004-05-31 03:09:25 +000013from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000014
# Misc tests from Tim Peters' re.doc
16
# WARNING: Don't change details in these tests if you don't know
# what you're doing. Some of these tests were carefully modeled to
# cover most of the code.
20
class S(str):
    """str subclass whose item access and slicing hand back S instances."""

    def __getitem__(self, index):
        piece = str.__getitem__(self, index)
        return S(piece)
24
class B(bytes):
    """bytes subclass that wraps whatever bytes.__getitem__ returns in B."""

    def __getitem__(self, index):
        piece = bytes.__getitem__(self, index)
        return B(piece)
28
Skip Montanaro8ed06da2003-04-24 19:43:18 +000029class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000030
def assertTypedEqual(self, actual, expect, msg=None):
    """Assert that *actual* equals *expect* and has the same exact types.

    After the ordinary equality check, tuples and lists in *expect* are
    walked element by element (in step with *actual*); every other value
    must have exactly the same type as its counterpart.
    """
    self.assertEqual(actual, expect, msg)

    def check_types(got, wanted):
        if not isinstance(wanted, (tuple, list)):
            self.assertIs(type(got), type(wanted), msg)
            return
        for got_item, wanted_item in zip(got, wanted):
            check_types(got_item, wanted_item)

    check_types(actual, expect)
40
def checkPatternError(self, pattern, errmsg, pos=None):
    """Assert that compiling *pattern* raises re.error with message *errmsg*.

    When *pos* is given, the error's reported position must equal it too.
    """
    with self.assertRaises(re.error) as caught:
        re.compile(pattern)
    with self.subTest(pattern=pattern):
        error = caught.exception
        self.assertEqual(error.msg, errmsg)
        if pos is None:
            return
        self.assertEqual(error.pos, pos)
49
def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
    """Assert that re.sub(pattern, repl, string) raises re.error *errmsg*.

    When *pos* is given, the error's reported position must equal it too.
    """
    with self.assertRaises(re.error) as caught:
        re.sub(pattern, repl, string)
    with self.subTest(pattern=pattern, repl=repl):
        error = caught.exception
        self.assertEqual(error.msg, errmsg)
        if pos is None:
            return
        self.assertEqual(error.pos, pos)
58
Benjamin Petersone48944b2012-03-07 14:50:25 -060059 def test_keep_buffer(self):
60 # See bug 14212
61 b = bytearray(b'x')
62 it = re.finditer(b'a', b)
63 with self.assertRaises(BufferError):
64 b.extend(b'x'*400)
65 list(it)
66 del it
67 gc_collect()
68 b.extend(b'x'*400)
69
Raymond Hettinger027bb632004-05-31 03:09:25 +000070 def test_weakref(self):
71 s = 'QabbbcR'
72 x = re.compile('ab+c')
73 y = proxy(x)
74 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
75
Skip Montanaro8ed06da2003-04-24 19:43:18 +000076 def test_search_star_plus(self):
77 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
78 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
79 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
80 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030081 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000082 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
83 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
84 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
85 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030086 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000087
def bump_num(self, matchobj):
    """Return the whole match, read as an integer, plus one, as a string."""
    return str(int(matchobj.group(0)) + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000091
def test_basic_re_sub(self):
    # sub() with str and bytes-like arguments, callable replacements,
    # group references and escape handling in templates.
    self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
    self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
    self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
    self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
    for y in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')

    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3),
                     '9.3 -3 23x99y')

    # A callable's return value is inserted verbatim; a template string
    # has its escapes processed.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    # Raw strings: '\g' is not a valid Python string escape.
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    # The non-raw templates below deliberately contain the literal
    # control characters rather than backslash escapes.
    self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
    for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
        with self.subTest(c):
            with self.assertWarns(DeprecationWarning):
                self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)

    # Raw string: '\s' is not a valid Python string escape.
    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000131
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000132 def test_bug_449964(self):
133 # fails for group followed by other escape
134 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
135 'xx\bxx\b')
136
137 def test_bug_449000(self):
138 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000139 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
140 'abc\ndef\n')
141 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
142 'abc\ndef\n')
143 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
144 'abc\ndef\n')
145 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
146 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000147
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000148 def test_bug_1661(self):
149 # Verify that flags do not get silently ignored with compiled patterns
150 pattern = re.compile('.')
151 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
152 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
153 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
154 self.assertRaises(ValueError, re.compile, pattern, re.I)
155
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000156 def test_bug_3629(self):
157 # A regex that triggered a bug in the sre-code validator
158 re.compile("(?P<quote>)(?(quote))")
159
def test_sub_template_numeric_escape(self):
    # bug 776311 and friends
    # Numeric escapes in a template are either octal character codes or
    # group references, depending on their digits.
    octal_cases = [
        (r'\0', '\0'),
        (r'\000', '\000'),
        (r'\001', '\001'),
        (r'\008', '\0' + '8'),
        (r'\009', '\0' + '9'),
        (r'\111', '\111'),
        (r'\117', '\117'),
        (r'\377', '\377'),

        (r'\1111', '\1111'),
        (r'\1111', '\111' + '1'),

        (r'\00', '\x00'),
        (r'\07', '\x07'),
        (r'\08', '\0' + '8'),
        (r'\09', '\0' + '9'),
        (r'\0a', '\0' + 'a'),
    ]
    for template, expected in octal_cases:
        self.assertEqual(re.sub('x', template, 'x'), expected)

    self.checkTemplateError('x', r'\400', 'x',
                            r'octal escape value \400 outside of '
                            r'range 0-0o377', 0)
    self.checkTemplateError('x', r'\777', 'x',
                            r'octal escape value \777 outside of '
                            r'range 0-0o377', 0)

    bad_group_refs = (
        r'\1', r'\8', r'\9', r'\11', r'\18', r'\1a', r'\90', r'\99',
        r'\118',  # r'\11' + '8'
        r'\11a',
        r'\181',  # r'\18' + '1'
        r'\800',  # r'\80' + '0'
    )
    for template in bad_group_refs:
        self.checkTemplateError('x', template, 'x', 'invalid group reference')

    # in python2.3 (etc), these loop endlessly in sre_parser.py
    eleven_nested = '(' * 11 + 'x' + ')' * 11
    self.assertEqual(re.sub(eleven_nested, r'\11', 'x'), 'x')
    # Ten nested groups around 'y', so '(.)' is group 11.
    ten_nested_then_any = '(' * 10 + 'y' + ')' * 10 + '(.)'
    self.assertEqual(re.sub(ten_nested_then_any, r'\118', 'xyz'), 'xz8')
    self.assertEqual(re.sub(ten_nested_then_any, r'\11a', 'xyz'), 'xza')
206
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000207 def test_qualified_re_sub(self):
208 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
Victor Stinner55e614a2014-10-29 16:58:59 +0100209 self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000210
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000211 def test_bug_114660(self):
212 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
213 'hello there')
214
215 def test_bug_462270(self):
216 # Test for empty sub() behaviour, see SF bug #462270
217 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
218 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
219
def test_symbolic_groups(self):
    # Named groups: well-formed definitions and references compile;
    # malformed ones raise re.error with the expected message (and
    # position, where one is given).
    re.compile('(?P<a>x)(?P=a)(?(a)y)')
    re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
    # Raw string, so that \1 reaches the regex parser as a backreference
    # instead of being turned into chr(1) by the string literal.
    re.compile(r'(?P<a1>x)\1(?(1)y)')
    self.checkPatternError('(?P<a>)(?P<a>)',
                           "redefinition of group name 'a' as group 2; "
                           "was group 1")
    self.checkPatternError('(?Pxy)', 'unknown extension ?Px')
    self.checkPatternError('(?P<a>)(?P=a', 'missing ), unterminated name', 11)
    self.checkPatternError('(?P=', 'missing group name', 4)
    self.checkPatternError('(?P=)', 'missing group name', 4)
    self.checkPatternError('(?P=1)', "bad character in group name '1'", 4)
    self.checkPatternError('(?P=a)', "unknown group name 'a'")
    self.checkPatternError('(?P=a1)', "unknown group name 'a1'")
    self.checkPatternError('(?P=a.)', "bad character in group name 'a.'", 4)
    self.checkPatternError('(?P<)', 'missing >, unterminated name', 4)
    self.checkPatternError('(?P<a', 'missing >, unterminated name', 4)
    self.checkPatternError('(?P<', 'missing group name', 4)
    self.checkPatternError('(?P<>)', 'missing group name', 4)
    self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
    self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
    self.checkPatternError(r'(?(', 'missing group name', 3)
    self.checkPatternError(r'(?())', 'missing group name', 3)
    self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
    self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
    self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
    self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
    # New valid/invalid identifiers in Python 3
    re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
    re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
    self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
    # Support > 100 groups.
    pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    pat = '(?:%s)(?(200)z|t)' % pat
    self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200255
def test_symbolic_refs(self):
    # \g<...> references in sub() templates: malformed or unknown
    # references raise, references to unmatched groups expand to ''.
    # All templates are raw strings; '\g' is not a valid Python escape.
    self.checkTemplateError('(?P<a>x)', r'\g<a', 'xx',
                            'missing >, unterminated name', 3)
    self.checkTemplateError('(?P<a>x)', r'\g<', 'xx',
                            'missing group name', 3)
    self.checkTemplateError('(?P<a>x)', r'\g', 'xx', 'missing <', 2)
    self.checkTemplateError('(?P<a>x)', r'\g<a a>', 'xx',
                            "bad character in group name 'a a'", 3)
    self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx',
                            'missing group name', 3)
    self.checkTemplateError('(?P<a>x)', r'\g<1a1>', 'xx',
                            "bad character in group name '1a1'", 3)
    self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
                            'invalid group reference')
    self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
                            'invalid group reference')
    with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
        re.sub('(?P<a>x)', r'\g<ab>', 'xx')
    self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
    self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
    self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
                            "bad character in group name '-1'", 3)
    # New valid/invalid identifiers in Python 3
    self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
    self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
    self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
                            "bad character in group name '©'", 3)
    # Support > 100 groups.
    pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000286
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000287 def test_re_subn(self):
288 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
289 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
290 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
291 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
Victor Stinner55e614a2014-10-29 16:58:59 +0100292 self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000293
def test_re_split(self):
    # split() on str and bytes-like subjects: element types of the
    # result, captured separators, and separators that can match the
    # empty string.
    for text in ":a:b::c", S(":a:b::c"):
        self.assertTypedEqual(re.split(":", text),
                              ['', 'a', 'b', '', 'c'])
        self.assertTypedEqual(re.split(":+", text),
                              ['', 'a', 'b', 'c'])
        self.assertTypedEqual(re.split("(:+)", text),
                              ['', ':', 'a', ':', 'b', '::', 'c'])
    bytes_subjects = (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                      memoryview(b":a:b::c"))
    for data in bytes_subjects:
        self.assertTypedEqual(re.split(b":", data),
                              [b'', b'a', b'b', b'', b'c'])
        self.assertTypedEqual(re.split(b":+", data),
                              [b'', b'a', b'b', b'c'])
        self.assertTypedEqual(re.split(b"(:+)", data),
                              [b'', b':', b'a', b':', b'b', b'::', b'c'])
    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                    "\U0001d49c\U0001d49e\U0001d4b5"):
        text = ":%s:%s::%s" % (a, b, c)
        self.assertEqual(re.split(":", text), ['', a, b, '', c])
        self.assertEqual(re.split(":+", text), ['', a, b, c])
        self.assertEqual(re.split("(:+)", text),
                         ['', ':', a, ':', b, '::', c])

    grouping_cases = [
        ("(?::+)", ['', 'a', 'b', 'c']),
        ("(:)+", ['', ':', 'a', ':', 'b', ':', 'c']),
        ("([b:]+)", ['', ':', 'a', ':b::', 'c']),
        ("(b)|(:+)", ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c']),
        ("(?:b)|(?::+)", ['', 'a', '', '', 'c']),
    ]
    for sep, expected in grouping_cases:
        self.assertEqual(re.split(sep, ":a:b::c"), expected)

    # Separators that may match the empty string must warn.
    may_match_empty = [
        (':*', ['', 'a', 'b', 'c']),
        ('(?::*)', ['', 'a', 'b', 'c']),
        ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
        ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
    ]
    for sep, expected in may_match_empty:
        with self.subTest(sep=sep), self.assertWarns(FutureWarning):
            self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)

    # Separators that only ever match the empty string must raise.
    only_match_empty = [
        ('', [':a:b::c']),
        (r'\b', [':a:b::c']),
        (r'(?=:)', [':a:b::c']),
        (r'(?<=:)', [':a:b::c']),
    ]
    for sep, expected in only_match_empty:
        with self.subTest(sep=sep), self.assertRaises(ValueError):
            self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
346
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000347 def test_qualified_re_split(self):
Victor Stinner55e614a2014-10-29 16:58:59 +0100348 self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
349 self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
350 self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000351 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200352 self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000353 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200354 with self.assertWarns(FutureWarning):
355 self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
356 ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000357
def test_re_findall(self):
    # findall() on str and bytes-like subjects, with zero, one and two
    # capturing groups.
    self.assertEqual(re.findall(":+", "abc"), [])
    for text in "a:b::c:::d", S("a:b::c:::d"):
        self.assertTypedEqual(re.findall(":+", text),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:+)", text),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:)(:*)", text),
                              [(":", ""), (":", ":"), (":", "::")])
    bytes_subjects = (b"a:b::c:::d", B(b"a:b::c:::d"),
                      bytearray(b"a:b::c:::d"), memoryview(b"a:b::c:::d"))
    for data in bytes_subjects:
        self.assertTypedEqual(re.findall(b":+", data),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:+)", data),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:)(:*)", data),
                              [(b":", b""), (b":", b":"), (b":", b"::")])
    for x in ("\xe0", "\u0430", "\U0001d49c"):
        xx, xxx = x * 2, x * 3
        text = "a%sb%sc%sd" % (x, xx, xxx)
        self.assertEqual(re.findall("%s+" % x, text), [x, xx, xxx])
        self.assertEqual(re.findall("(%s+)" % x, text), [x, xx, xxx])
        self.assertEqual(re.findall("(%s)(%s*)" % (x, x), text),
                         [(x, ""), (x, x), (x, xx)])
Guido van Rossum49946571997-07-18 04:26:25 +0000383
Skip Montanaro5ba00542003-04-25 16:00:14 +0000384 def test_bug_117612(self):
385 self.assertEqual(re.findall(r"(a|(b))", "aba"),
386 [("a", ""),("b", "b"),("a", "")])
387
def test_re_match(self):
    # match() on str and bytes-like subjects; groups() and group() with
    # numeric and named arguments.
    for string in 'a', S('a'):
        self.assertEqual(re.match('a', string).groups(), ())
        self.assertEqual(re.match('(a)', string).groups(), ('a',))
        self.assertEqual(re.match('(a)', string).group(0), 'a')
        self.assertEqual(re.match('(a)', string).group(1), 'a')
        self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a'))
    for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
        self.assertEqual(re.match(b'a', string).groups(), ())
        self.assertEqual(re.match(b'(a)', string).groups(), (b'a',))
        self.assertEqual(re.match(b'(a)', string).group(0), b'a')
        self.assertEqual(re.match(b'(a)', string).group(1), b'a')
        self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
    for a in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.match(a, a).groups(), ())
        self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
        self.assertEqual(re.match('(%s)' % a, a).group(0), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))

    pat = re.compile('((a)|(b))(c)?')
    self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
    self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
    self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
    self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
    self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

    # A single group
    m = re.match('(a)', 'a')
    # group() with no argument, then the explicit group(0) form.
    self.assertEqual(m.group(), 'a')
    self.assertEqual(m.group(0), 'a')
    self.assertEqual(m.group(1), 'a')
    self.assertEqual(m.group(1, 1), ('a', 'a'))

    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                     (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200428 def test_re_fullmatch(self):
429 # Issue 16203: Proposal: add re.fullmatch() method.
430 self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
431 for string in "ab", S("ab"):
432 self.assertEqual(re.fullmatch(r"a|ab", string).span(), (0, 2))
433 for string in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
434 self.assertEqual(re.fullmatch(br"a|ab", string).span(), (0, 2))
435 for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
436 r = r"%s|%s" % (a, a + b)
437 self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2))
438 self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3))
439 self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3))
440 self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2))
441 self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3))
442 self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
443 self.assertIsNone(re.fullmatch(r"a+", "ab"))
444 self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
445 self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
446 self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
447 self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
448 self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4))
449 self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2))
450
451 self.assertEqual(
452 re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
453 self.assertEqual(
454 re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
455 self.assertEqual(
456 re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
457
def test_re_groupref_exists(self):
    # Conditional subpatterns (?(group)yes|no), selected by group number
    # or by group name.
    # Raw string: '\(' and '\)' are not valid Python string escapes.
    paren_pat = r'^(\()?([^()]+)(?(1)\))$'
    self.assertEqual(re.match(paren_pat, '(a)').groups(),
                     ('(', 'a'))
    self.assertEqual(re.match(paren_pat, 'a').groups(),
                     (None, 'a'))
    self.assertIsNone(re.match(paren_pat, 'a)'))
    self.assertIsNone(re.match(paren_pat, '(a'))
    self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
                     ('a', 'b'))
    self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
                     (None, 'd'))
    self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
                     (None, 'd'))
    self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
                     ('a', ''))

    # Tests for bug #1177831: exercise groups other than the first group
    p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
    self.assertEqual(p.match('abc').groups(),
                     ('a', 'b', 'c'))
    self.assertEqual(p.match('ad').groups(),
                     ('a', None, 'd'))
    self.assertIsNone(p.match('abd'))
    self.assertIsNone(p.match('ac'))

    # Support > 100 groups.
    pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    pat = '(?:%s)(?(200)z)' % pat
    self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))

    self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
    self.checkPatternError(r'()(?(1)a|b',
                           'missing ), unterminated subpattern', 2)
    self.checkPatternError(r'()(?(1)a|b|c)',
                           'conditional backref with more than '
                           'two branches', 10)
494
def test_re_groupref_overflow(self):
    # A group number equal to sre_constants.MAXGROUPS is rejected both in
    # a sub() template and in a conditional pattern.
    # Raw template: '\g' is not a valid Python string escape.
    self.checkTemplateError('()', r'\g<%s>' % sre_constants.MAXGROUPS, 'xx',
                            'invalid group reference', 3)
    self.checkPatternError(r'(?P<a>)(?(%d))' % sre_constants.MAXGROUPS,
                           'invalid group reference', 10)
500
def test_re_groupref(self):
    # Numeric backreferences, including references to groups that did
    # not take part in the match.
    pipe_pat = r'^(\|)?([^()]+)\1$'
    self.assertEqual(re.match(pipe_pat, '|a|').groups(), ('|', 'a'))
    self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
                     (None, 'a'))
    for text in ('a|', '|a'):
        self.assertIsNone(re.match(pipe_pat, text))
    self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
                     ('a', 'a'))
    self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
                     (None, None))

    self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
514
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000515 def test_groupdict(self):
516 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
517 'first second').groupdict(),
518 {'first':'first', 'second':'second'})
519
520 def test_expand(self):
521 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
522 "first second")
523 .expand(r"\2 \1 \g<second> \g<first>"),
524 "second first second first")
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300525 self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
526 "first")
527 .expand(r"\2 \g<second>"),
528 " ")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000529
def test_repeat_minmax(self):
    # {m}, {m,n} and their lazy forms, on a capturing group and on a
    # plain character.  Patterns with '\w' are raw strings because '\w'
    # is not a valid Python string escape.
    self.assertIsNone(re.match(r"^(\w){1}$", "abc"))
    self.assertIsNone(re.match(r"^(\w){1}?$", "abc"))
    self.assertIsNone(re.match(r"^(\w){1,2}$", "abc"))
    self.assertIsNone(re.match(r"^(\w){1,2}?$", "abc"))

    # Four greedy forms, then the same four as lazy repeats.
    self.assertEqual(re.match(r"^(\w){3}$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){1,3}$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){1,4}$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){3,4}$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){3}?$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){1,3}?$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){1,4}?$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c")

    self.assertIsNone(re.match("^x{1}$", "xxx"))
    self.assertIsNone(re.match("^x{1}?$", "xxx"))
    self.assertIsNone(re.match("^x{1,2}$", "xxx"))
    self.assertIsNone(re.match("^x{1,2}?$", "xxx"))

    self.assertTrue(re.match("^x{3}$", "xxx"))
    self.assertTrue(re.match("^x{1,3}$", "xxx"))
    self.assertTrue(re.match("^x{3,3}$", "xxx"))
    self.assertTrue(re.match("^x{1,4}$", "xxx"))
    self.assertTrue(re.match("^x{3,4}$", "xxx"))
    self.assertTrue(re.match("^x{3}?$", "xxx"))
    self.assertTrue(re.match("^x{1,3}?$", "xxx"))
    self.assertTrue(re.match("^x{1,4}?$", "xxx"))
    self.assertTrue(re.match("^x{3,4}?$", "xxx"))

    # An empty '{}' is not a repeat; it matches the two characters
    # literally.
    self.assertIsNone(re.match("^x{}$", "xxx"))
    self.assertTrue(re.match("^x{}$", "x{}"))

    self.checkPatternError(r'x{2,1}',
                           'min repeat greater than max repeat', 2)
565
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000566 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000567 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000568 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000569 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
570 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
571 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
572 {'first': 1, 'other': 2})
573
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000574 self.assertEqual(re.match("(a)", "a").pos, 0)
575 self.assertEqual(re.match("(a)", "a").endpos, 1)
576 self.assertEqual(re.match("(a)", "a").string, "a")
577 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300578 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000579
Serhiy Storchaka07360df2015-03-30 01:01:48 +0300580 # Issue 14260. groupindex should be non-modifiable mapping.
581 p = re.compile(r'(?i)(?P<first>a)(?P<other>b)')
582 self.assertEqual(sorted(p.groupindex), ['first', 'other'])
583 self.assertEqual(p.groupindex['other'], 2)
584 with self.assertRaises(TypeError):
585 p.groupindex['other'] = 0
586 self.assertEqual(p.groupindex['other'], 2)
587
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000588 def test_special_escapes(self):
589 self.assertEqual(re.search(r"\b(b.)\b",
590 "abcd abc bcd bx").group(1), "bx")
591 self.assertEqual(re.search(r"\B(b.)\B",
592 "abc bcd bc abxd").group(1), "bx")
593 self.assertEqual(re.search(r"\b(b.)\b",
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300594 "abcd abc bcd bx", re.ASCII).group(1), "bx")
595 self.assertEqual(re.search(r"\B(b.)\B",
596 "abc bcd bc abxd", re.ASCII).group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000597 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
598 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300599 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300600 self.assertEqual(re.search(br"\b(b.)\b",
601 b"abcd abc bcd bx").group(1), b"bx")
602 self.assertEqual(re.search(br"\B(b.)\B",
603 b"abc bcd bc abxd").group(1), b"bx")
604 self.assertEqual(re.search(br"\b(b.)\b",
605 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
606 self.assertEqual(re.search(br"\B(b.)\B",
607 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
608 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
609 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300610 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000611 self.assertEqual(re.search(r"\d\D\w\W\s\S",
612 "1aa! a").group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300613 self.assertEqual(re.search(br"\d\D\w\W\s\S",
614 b"1aa! a").group(0), b"1aa! a")
615 self.assertEqual(re.search(r"\d\D\w\W\s\S",
616 "1aa! a", re.ASCII).group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300617 self.assertEqual(re.search(br"\d\D\w\W\s\S",
618 b"1aa! a", re.LOCALE).group(0), b"1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000619
    def test_other_escapes(self):
        # Backslash escapes of punctuation, inside and outside character
        # sets, plus escapes of ASCII letters that are expected to trigger
        # a DeprecationWarning.
        # NOTE(review): checkPatternError is defined elsewhere in this class;
        # presumably it asserts that compiling the pattern raises re.error
        # with the given message and position -- confirm against the helper.
        self.checkPatternError("\\", 'bad escape (end of pattern)', 0)
        # Escaped punctuation matches itself and nothing else.
        self.assertEqual(re.match(r"\(", '(').group(), '(')
        self.assertIsNone(re.match(r"\(", ')'))
        self.assertEqual(re.match(r"\\", '\\').group(), '\\')
        # The same holds inside a character set.
        self.assertEqual(re.match(r"[\]]", ']').group(), ']')
        self.assertIsNone(re.match(r"[\]]", '['))
        # An escaped '-' is a literal member, not a range operator.
        self.assertEqual(re.match(r"[a\-c]", '-').group(), '-')
        self.assertIsNone(re.match(r"[a\-c]", 'b'))
        # An escaped '^' is a literal member, not a negation.
        self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
        self.assertIsNone(re.match(r"[\^a]+", 'b'))
        # Clear the regex cache; presumably so the patterns below are really
        # compiled again and the warning is emitted rather than skipped.
        re.purge()  # for warnings
        # Letters escaped outside a character set: each must warn, match
        # the letter itself, and not match 'a'.
        for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY':
            with self.subTest(c):
                with self.assertWarns(DeprecationWarning):
                    self.assertEqual(re.fullmatch('\\%c' % c, c).group(), c)
                    self.assertIsNone(re.match('\\%c' % c, 'a'))
        # Same check inside a character set; this list additionally
        # contains 'A', 'B' and 'Z'.
        for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
            with self.subTest(c):
                with self.assertWarns(DeprecationWarning):
                    self.assertEqual(re.fullmatch('[\\%c]' % c, c).group(), c)
                    self.assertIsNone(re.match('[\\%c]' % c, 'a'))
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200642
Ezio Melotti5a045b92012-02-29 11:48:44 +0200643 def test_string_boundaries(self):
644 # See http://bugs.python.org/issue10713
645 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
646 "abc")
647 # There's a word boundary at the start of a string.
648 self.assertTrue(re.match(r"\b", "abc"))
649 # A non-empty string includes a non-boundary zero-length match.
650 self.assertTrue(re.search(r"\B", "abc"))
651 # There is no non-boundary match at the start of a string.
652 self.assertFalse(re.match(r"\B", "abc"))
653 # However, an empty string contains no word boundaries, and also no
654 # non-boundaries.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300655 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200656 # This one is questionable and different from the perlre behaviour,
657 # but describes current behavior.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300658 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200659 # A single word-character string has two boundaries, but no
660 # non-boundary gaps.
661 self.assertEqual(len(re.findall(r"\b", "a")), 2)
662 self.assertEqual(len(re.findall(r"\B", "a")), 0)
663 # If there are no words, there are no boundaries
664 self.assertEqual(len(re.findall(r"\b", " ")), 0)
665 self.assertEqual(len(re.findall(r"\b", " ")), 0)
666 # Can match around the whitespace.
667 self.assertEqual(len(re.findall(r"\B", " ")), 2)
668
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000669 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000670 self.assertEqual(re.match("([\u2222\u2223])",
671 "\u2222").group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300672 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300673 self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000674
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100675 def test_big_codesize(self):
676 # Issue #1160
677 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300678 self.assertTrue(r.match('1000'))
679 self.assertTrue(r.match('9999'))
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100680
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000681 def test_anyall(self):
682 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
683 "a\nb")
684 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
685 "a\n\nb")
686
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200687 def test_lookahead(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000688 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
689 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
690 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
691 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
692 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
693 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
694 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
695
696 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
697 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
698 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
699 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
700
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200701 # Group reference.
702 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
703 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
704 # Conditional group reference.
705 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
706 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
707 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
708 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
709 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
710 # Group used before defined.
711 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
712 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
713 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
714
715 def test_lookbehind(self):
716 self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
717 self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
718 self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
719 self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
720 # Group reference.
721 self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
722 self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
723 self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
724 self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
725 # Conditional group reference.
726 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
727 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
728 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
729 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
730 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
731 # Group used before defined.
732 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
733 self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc'))
734 self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc'))
735 # Group defined in the same lookbehind pattern
736 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)')
737 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)')
738 self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)')
739 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)')
740
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000741 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000742 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300743 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000744 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
745 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
746 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
747 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
748 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
749 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
750 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
751 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
752
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200753 assert '\u212a'.lower() == 'k' # 'K'
754 self.assertTrue(re.match(r'K', '\u212a', re.I))
755 self.assertTrue(re.match(r'k', '\u212a', re.I))
756 self.assertTrue(re.match(r'\u212a', 'K', re.I))
757 self.assertTrue(re.match(r'\u212a', 'k', re.I))
758 assert '\u017f'.upper() == 'S' # 'ſ'
759 self.assertTrue(re.match(r'S', '\u017f', re.I))
760 self.assertTrue(re.match(r's', '\u017f', re.I))
761 self.assertTrue(re.match(r'\u017f', 'S', re.I))
762 self.assertTrue(re.match(r'\u017f', 's', re.I))
763 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
764 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
765 self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
766
767 def test_ignore_case_set(self):
768 self.assertTrue(re.match(r'[19A]', 'A', re.I))
769 self.assertTrue(re.match(r'[19a]', 'a', re.I))
770 self.assertTrue(re.match(r'[19a]', 'A', re.I))
771 self.assertTrue(re.match(r'[19A]', 'a', re.I))
772 self.assertTrue(re.match(br'[19A]', b'A', re.I))
773 self.assertTrue(re.match(br'[19a]', b'a', re.I))
774 self.assertTrue(re.match(br'[19a]', b'A', re.I))
775 self.assertTrue(re.match(br'[19A]', b'a', re.I))
776 assert '\u212a'.lower() == 'k' # 'K'
777 self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
778 self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
779 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
780 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
781 assert '\u017f'.upper() == 'S' # 'ſ'
782 self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
783 self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
784 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
785 self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
786 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
787 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
788 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
789
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200790 def test_ignore_case_range(self):
791 # Issues #3511, #17381.
792 self.assertTrue(re.match(r'[9-a]', '_', re.I))
793 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
794 self.assertTrue(re.match(br'[9-a]', b'_', re.I))
795 self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
796 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
797 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
798 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
799 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
800 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
801 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
802 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
803 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
804 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
805 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
806 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
807 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
808
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200809 assert '\u212a'.lower() == 'k' # 'K'
810 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
811 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
812 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
813 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
814 assert '\u017f'.upper() == 'S' # 'ſ'
815 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
816 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
817 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
818 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
819 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
820 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
821 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
822
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000823 def test_category(self):
824 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
825
826 def test_getlower(self):
827 import _sre
828 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
829 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
830 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200831 self.assertEqual(_sre.getlower(ord('A'), re.ASCII), ord('a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000832
833 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300834 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200835 self.assertEqual(re.match("abc", "ABC", re.I|re.A).group(0), "ABC")
836 self.assertEqual(re.match(b"abc", b"ABC", re.I|re.L).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000837
838 def test_not_literal(self):
839 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
840 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
841
842 def test_search_coverage(self):
843 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
844 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
845
Ezio Melottid2114eb2011-03-25 14:08:44 +0200846 def assertMatch(self, pattern, text, match=None, span=None,
847 matcher=re.match):
848 if match is None and span is None:
849 # the pattern matches the whole text
850 match = text
851 span = (0, len(text))
852 elif match is None or span is None:
853 raise ValueError('If match is not None, span should be specified '
854 '(and vice versa).')
855 m = matcher(pattern, text)
856 self.assertTrue(m)
857 self.assertEqual(m.group(), match)
858 self.assertEqual(m.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000859
Ezio Melottid2114eb2011-03-25 14:08:44 +0200860 def test_re_escape(self):
Ezio Melotti88fdeb42011-04-10 12:59:16 +0300861 alnum_chars = string.ascii_letters + string.digits + '_'
Ezio Melottid2114eb2011-03-25 14:08:44 +0200862 p = ''.join(chr(i) for i in range(256))
863 for c in p:
864 if c in alnum_chars:
865 self.assertEqual(re.escape(c), c)
866 elif c == '\x00':
867 self.assertEqual(re.escape(c), '\\000')
868 else:
869 self.assertEqual(re.escape(c), '\\' + c)
870 self.assertMatch(re.escape(c), c)
871 self.assertMatch(re.escape(p), p)
Guido van Rossum49946571997-07-18 04:26:25 +0000872
Guido van Rossum698280d2008-09-10 17:44:35 +0000873 def test_re_escape_byte(self):
Ezio Melotti88fdeb42011-04-10 12:59:16 +0300874 alnum_chars = (string.ascii_letters + string.digits + '_').encode('ascii')
Ezio Melottid2114eb2011-03-25 14:08:44 +0200875 p = bytes(range(256))
876 for i in p:
Guido van Rossum698280d2008-09-10 17:44:35 +0000877 b = bytes([i])
Ezio Melottid2114eb2011-03-25 14:08:44 +0200878 if b in alnum_chars:
879 self.assertEqual(re.escape(b), b)
880 elif i == 0:
881 self.assertEqual(re.escape(b), b'\\000')
882 else:
883 self.assertEqual(re.escape(b), b'\\' + b)
884 self.assertMatch(re.escape(b), b)
885 self.assertMatch(re.escape(p), p)
Guido van Rossum698280d2008-09-10 17:44:35 +0000886
Ezio Melotti7b9e97b2011-03-25 14:09:33 +0200887 def test_re_escape_non_ascii(self):
888 s = 'xxx\u2620\u2620\u2620xxx'
889 s_escaped = re.escape(s)
890 self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
891 self.assertMatch(s_escaped, s)
892 self.assertMatch('.%s+.' % re.escape('\u2620'), s,
893 'x\u2620\u2620\u2620x', (2, 7), re.search)
894
895 def test_re_escape_non_ascii_bytes(self):
896 b = 'y\u2620y\u2620y'.encode('utf-8')
897 b_escaped = re.escape(b)
898 self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
899 self.assertMatch(b_escaped, b)
900 res = re.findall(re.escape('\u2620'.encode('utf-8')), b)
901 self.assertEqual(len(res), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000902
    def test_pickling(self):
        # A compiled pattern must survive a pickle round trip under every
        # protocol from 0 up to pickle.HIGHEST_PROTOCOL.
        import pickle
        oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
        for proto in range(pickle.HIGHEST_PROTOCOL + 1):
            pickled = pickle.dumps(oldpat, proto)
            newpat = pickle.loads(pickled)
            # The unpickled pattern must compare equal to the original.
            self.assertEqual(newpat, oldpat)
        # current pickle expects the _compile() reconstructor in re module
        # (the import itself is the check: it fails if re._compile is gone).
        from re import _compile
Guido van Rossum23b22571997-07-17 22:36:14 +0000912
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000913 def test_constants(self):
914 self.assertEqual(re.I, re.IGNORECASE)
915 self.assertEqual(re.L, re.LOCALE)
916 self.assertEqual(re.M, re.MULTILINE)
917 self.assertEqual(re.S, re.DOTALL)
918 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000919
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000920 def test_flags(self):
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200921 for flag in [re.I, re.M, re.X, re.S, re.A, re.U]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300922 self.assertTrue(re.compile('^pattern$', flag))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200923 for flag in [re.I, re.M, re.X, re.S, re.A, re.L]:
924 self.assertTrue(re.compile(b'^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +0000925
    def test_sre_character_literals(self):
        # Numeric character escapes in str patterns: octal (\ooo), \xhh,
        # \uhhhh and \Uhhhhhhhh.  Each escape is tried alone and followed by
        # one extra character that must not be absorbed into the escape.
        for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
            if i < 256:
                # Three-digit octal and two-digit hex are only generated
                # for code points below 256.
                self.assertTrue(re.match(r"\%03o" % i, chr(i)))
                self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
                self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
                self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
                self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
                self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
            if i < 0x10000:
                # \u takes exactly four hex digits.
                self.assertTrue(re.match(r"\u%04x" % i, chr(i)))
                self.assertTrue(re.match(r"\u%04x0" % i, chr(i)+"0"))
                self.assertTrue(re.match(r"\u%04xz" % i, chr(i)+"z"))
            # \U takes eight hex digits and is tried for every value.
            self.assertTrue(re.match(r"\U%08x" % i, chr(i)))
            self.assertTrue(re.match(r"\U%08x0" % i, chr(i)+"0"))
            self.assertTrue(re.match(r"\U%08xz" % i, chr(i)+"z"))
        # Escapes starting with \0 are octal; a following '8' is a literal.
        self.assertTrue(re.match(r"\0", "\000"))
        self.assertTrue(re.match(r"\08", "\0008"))
        self.assertTrue(re.match(r"\01", "\001"))
        self.assertTrue(re.match(r"\018", "\0018"))
        # Malformed or out-of-range escapes.
        # NOTE(review): checkPatternError is defined elsewhere in this class;
        # presumably it asserts that compiling raises re.error with the given
        # message and position -- confirm against the helper.
        self.checkPatternError(r"\567",
                               r'octal escape value \567 outside of '
                               r'range 0-0o377', 0)
        self.checkPatternError(r"\911", 'invalid group reference', 0)
        self.checkPatternError(r"\x1", r'incomplete escape \x1', 0)
        self.checkPatternError(r"\x1z", r'incomplete escape \x1', 0)
        self.checkPatternError(r"\u123", r'incomplete escape \u123', 0)
        self.checkPatternError(r"\u123z", r'incomplete escape \u123', 0)
        self.checkPatternError(r"\U0001234", r'incomplete escape \U0001234', 0)
        self.checkPatternError(r"\U0001234z", r'incomplete escape \U0001234', 0)
        self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0)
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000957
    def test_sre_character_class_literals(self):
        # Numeric character escapes inside a character set in str patterns.
        # An extra member after the escape ('0', '8' or 'z') must not stop
        # the set from matching chr(i).
        for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
            if i < 256:
                # Octal (unpadded and zero-padded to three digits) and
                # two-digit hex are only generated for values below 256.
                self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
                self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
                self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
                self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
            if i < 0x10000:
                # \u takes exactly four hex digits.
                self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
                self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
                self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
            # \U takes eight hex digits and is tried for every value.
            self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
            self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
            self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
        # Malformed or out-of-range escapes inside a set.
        # NOTE(review): checkPatternError is defined elsewhere in this class;
        # presumably it asserts that compiling raises re.error with the given
        # message and position -- confirm against the helper.
        self.checkPatternError(r"[\567]",
                               r'octal escape value \567 outside of '
                               r'range 0-0o377', 1)
        self.checkPatternError(r"[\911]", r'bad escape \9', 1)
        self.checkPatternError(r"[\x1z]", r'incomplete escape \x1', 1)
        self.checkPatternError(r"[\u123z]", r'incomplete escape \u123', 1)
        self.checkPatternError(r"[\U0001234z]", r'incomplete escape \U0001234', 1)
        self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1)
        # A range whose endpoints are both \U escapes.
        self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200985
    def test_sre_byte_literals(self):
        # Numeric escapes in bytes patterns: three-digit octal and two-digit
        # hex, alone and followed by one extra byte.
        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
            self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
            self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
            self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
            self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
            self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
            self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
        # In a bytes pattern \u and \U are expected to emit a
        # DeprecationWarning and to match the letter and digits literally.
        with self.assertWarns(DeprecationWarning):
            self.assertTrue(re.match(br"\u1234", b'u1234'))
        with self.assertWarns(DeprecationWarning):
            self.assertTrue(re.match(br"\U00012345", b'U00012345'))
        # Escapes starting with \0 are octal; a following '8' is a literal.
        self.assertTrue(re.match(br"\0", b"\000"))
        self.assertTrue(re.match(br"\08", b"\0008"))
        self.assertTrue(re.match(br"\01", b"\001"))
        self.assertTrue(re.match(br"\018", b"\0018"))
        # Malformed or out-of-range escapes.
        # NOTE(review): checkPatternError is defined elsewhere in this class;
        # presumably it asserts that compiling raises re.error with the given
        # message and position -- confirm against the helper.
        self.checkPatternError(br"\567",
                               r'octal escape value \567 outside of '
                               r'range 0-0o377', 0)
        self.checkPatternError(br"\911", 'invalid group reference', 0)
        self.checkPatternError(br"\x1", r'incomplete escape \x1', 0)
        self.checkPatternError(br"\x1z", r'incomplete escape \x1', 0)
Antoine Pitrou463badf2012-06-23 13:29:19 +02001008
    def test_sre_byte_class_literals(self):
        # Numeric escapes inside a character set in bytes patterns.  An extra
        # member after the escape must not stop the set matching the byte.
        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
            self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i])))
            self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i])))
            self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i])))
            self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
            self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
            self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
            self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
            self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
        # In a bytes set \u and \U are expected to emit a DeprecationWarning
        # and to make the letter itself a member of the set.
        with self.assertWarns(DeprecationWarning):
            self.assertTrue(re.match(br"[\u1234]", b'u'))
        with self.assertWarns(DeprecationWarning):
            self.assertTrue(re.match(br"[\U00012345]", b'U'))
        # Malformed or out-of-range escapes inside a set.
        # NOTE(review): checkPatternError is defined elsewhere in this class;
        # presumably it asserts that compiling raises re.error with the given
        # message and position -- confirm against the helper.
        self.checkPatternError(br"[\567]",
                               r'octal escape value \567 outside of '
                               r'range 0-0o377', 1)
        self.checkPatternError(br"[\911]", r'bad escape \9', 1)
        self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1)
1028
1029 def test_character_set_errors(self):
1030 self.checkPatternError(r'[', 'unterminated character set', 0)
1031 self.checkPatternError(r'[^', 'unterminated character set', 0)
1032 self.checkPatternError(r'[a', 'unterminated character set', 0)
1033 # bug 545855 -- This pattern failed to cause a compile error as it
1034 # should, instead provoking a TypeError.
1035 self.checkPatternError(r"[a-", 'unterminated character set', 0)
1036 self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1)
1037 self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1)
1038 self.checkPatternError(r"[b-a]", 'bad character range b-a', 1)
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +00001039
Skip Montanaro7d9963f2003-04-25 14:12:40 +00001040 def test_bug_113254(self):
1041 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
1042 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
1043 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
1044
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001045 def test_bug_527371(self):
1046 # bug described in patches 527371/672491
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001047 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001048 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
1049 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
1050 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
1051 self.assertEqual(re.match("((a))", "a").lastindex, 1)
1052
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001053 def test_bug_418626(self):
1054 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
1055 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
1056 # pattern '*?' on a long string.
1057 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
1058 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
1059 20003)
1060 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001061 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +00001062 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001063 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001064
1065 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001066 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001067 self.assertEqual(re.compile(pat) and 1, 1)
1068
Skip Montanaro1e703c62003-04-25 15:40:28 +00001069 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001070 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +00001071 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001072 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
1073 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
1074 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +00001075
Serhiy Storchaka632a77e2015-03-25 21:03:47 +02001076 def test_nothing_to_repeat(self):
1077 for reps in '*', '+', '?', '{1,2}':
1078 for mod in '', '?':
1079 self.checkPatternError('%s%s' % (reps, mod),
1080 'nothing to repeat', 0)
1081 self.checkPatternError('(?:%s%s)' % (reps, mod),
1082 'nothing to repeat', 3)
1083
1084 def test_multiple_repeat(self):
1085 for outer_reps in '*', '+', '{1,2}':
1086 for outer_mod in '', '?':
1087 outer_op = outer_reps + outer_mod
1088 for inner_reps in '*', '+', '?', '{1,2}':
1089 for inner_mod in '', '?':
1090 inner_op = inner_reps + inner_mod
1091 self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
1092 'multiple repeat', 1 + len(inner_op))
1093
Serhiy Storchakafa468162013-02-16 21:23:53 +02001094 def test_unlimited_zero_width_repeat(self):
1095 # Issue #9669
1096 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
1097 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
1098 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
1099 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
1100 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
1101 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
1102
Skip Montanaro1e703c62003-04-25 15:40:28 +00001103 def test_scanner(self):
1104 def s_ident(scanner, token): return token
1105 def s_operator(scanner, token): return "op%s" % token
1106 def s_float(scanner, token): return float(token)
1107 def s_int(scanner, token): return int(token)
1108
1109 scanner = Scanner([
1110 (r"[a-zA-Z_]\w*", s_ident),
1111 (r"\d+\.\d*", s_float),
1112 (r"\d+", s_int),
1113 (r"=|\+|-|\*|/", s_operator),
1114 (r"\s+", None),
1115 ])
1116
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001117 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +00001118
Skip Montanaro1e703c62003-04-25 15:40:28 +00001119 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
1120 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
1121 'op+', 'bar'], ''))
1122
Skip Montanaro5ba00542003-04-25 16:00:14 +00001123 def test_bug_448951(self):
1124 # bug 448951 (similar to 429357, but with single char match)
1125 # (Also test greedy matches.)
1126 for op in '','?','*':
1127 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
1128 (None, None))
1129 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
1130 ('a:', 'a'))
1131
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001132 def test_bug_725106(self):
1133 # capturing groups in alternatives in repeats
1134 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
1135 ('b', 'a'))
1136 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
1137 ('c', 'b'))
1138 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
1139 ('b', None))
1140 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
1141 ('b', None))
1142 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
1143 ('b', 'a'))
1144 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
1145 ('c', 'b'))
1146 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
1147 ('b', None))
1148 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
1149 ('b', None))
1150
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +00001151 def test_bug_725149(self):
1152 # mark_stack_base restoring before restoring marks
1153 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
1154 ('a', None))
1155 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
1156 ('a', None, None))
1157
Just van Rossum12723ba2003-07-02 20:03:04 +00001158 def test_bug_764548(self):
1159 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001160 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +00001161 pat = re.compile(my_unicode("abc"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001162 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +00001163
Skip Montanaro5ba00542003-04-25 16:00:14 +00001164 def test_finditer(self):
1165 iter = re.finditer(r":+", "a:b::c:::d")
1166 self.assertEqual([item.group(0) for item in iter],
1167 [":", "::", ":::"])
1168
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001169 pat = re.compile(r":+")
1170 iter = pat.finditer("a:b::c:::d", 1, 10)
1171 self.assertEqual([item.group(0) for item in iter],
1172 [":", "::", ":::"])
1173
1174 pat = re.compile(r":+")
1175 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
1176 self.assertEqual([item.group(0) for item in iter],
1177 [":", "::", ":::"])
1178
1179 pat = re.compile(r":+")
1180 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1181 self.assertEqual([item.group(0) for item in iter],
1182 [":", "::", ":::"])
1183
1184 pat = re.compile(r":+")
1185 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1186 self.assertEqual([item.group(0) for item in iter],
1187 ["::", "::"])
1188
Thomas Wouters40a088d2008-03-18 20:19:54 +00001189 def test_bug_926075(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001190 self.assertIsNot(re.compile('bug_926075'),
1191 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +00001192
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001193 def test_bug_931848(self):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001194 pattern = "[\u002E\u3002\uFF0E\uFF61]"
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001195 self.assertEqual(re.compile(pattern).split("a.b.c"),
1196 ['a','b','c'])
1197
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001198 def test_bug_581080(self):
1199 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +00001200 self.assertEqual(next(iter).span(), (1,2))
1201 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001202
1203 scanner = re.compile(r"\s").scanner("a b")
1204 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001205 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001206
1207 def test_bug_817234(self):
1208 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +00001209 self.assertEqual(next(iter).span(), (0, 4))
1210 self.assertEqual(next(iter).span(), (4, 4))
1211 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001212
Mark Dickinson1f268282009-07-28 17:22:36 +00001213 def test_bug_6561(self):
1214 # '\d' should match characters in Unicode category 'Nd'
1215 # (Number, Decimal Digit), but not those in 'Nl' (Number,
1216 # Letter) or 'No' (Number, Other).
1217 decimal_digits = [
1218 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1219 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1220 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1221 ]
1222 for x in decimal_digits:
1223 self.assertEqual(re.match('^\d$', x).group(0), x)
1224
1225 not_decimal_digits = [
1226 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1227 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1228 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1229 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1230 ]
1231 for x in not_decimal_digits:
1232 self.assertIsNone(re.match('^\d$', x))
1233
Guido van Rossumd8faa362007-04-27 19:54:29 +00001234 def test_empty_array(self):
1235 # SF buf 1647541
1236 import array
Guido van Rossum166746c2007-07-03 15:39:16 +00001237 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +00001238 a = array.array(typecode)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001239 self.assertIsNone(re.compile(b"bla").match(a))
Antoine Pitroufd036452008-08-19 17:56:33 +00001240 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001241
Christian Heimes072c0f12008-01-03 23:01:04 +00001242 def test_inline_flags(self):
1243 # Bug #1700
Serhiy Storchakaab140882014-11-11 21:13:28 +02001244 upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
1245 lower_char = '\u1ea1' # Latin Small Letter A with Dot Below
Christian Heimes072c0f12008-01-03 23:01:04 +00001246
1247 p = re.compile(upper_char, re.I | re.U)
1248 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001249 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001250
1251 p = re.compile(lower_char, re.I | re.U)
1252 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001253 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001254
1255 p = re.compile('(?i)' + upper_char, re.U)
1256 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001257 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001258
1259 p = re.compile('(?i)' + lower_char, re.U)
1260 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001261 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001262
1263 p = re.compile('(?iu)' + upper_char)
1264 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001265 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001266
1267 p = re.compile('(?iu)' + lower_char)
1268 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001269 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001270
Christian Heimes25bb7832008-01-11 16:17:00 +00001271 def test_dollar_matches_twice(self):
1272 "$ matches the end of string, and just before the terminating \n"
1273 pattern = re.compile('$')
1274 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1275 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1276 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1277
1278 pattern = re.compile('$', re.MULTILINE)
1279 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1280 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1281 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1282
Antoine Pitroufd036452008-08-19 17:56:33 +00001283 def test_bytes_str_mixing(self):
1284 # Mixing str and bytes is disallowed
1285 pat = re.compile('.')
1286 bpat = re.compile(b'.')
1287 self.assertRaises(TypeError, pat.match, b'b')
1288 self.assertRaises(TypeError, bpat.match, 'b')
1289 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1290 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1291 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1292 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1293 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1294 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1295
1296 def test_ascii_and_unicode_flag(self):
1297 # String patterns
1298 for flags in (0, re.UNICODE):
1299 pat = re.compile('\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001300 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001301 pat = re.compile('\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001302 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001303 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001304 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001305 pat = re.compile('(?a)\xc0', re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001306 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001307 pat = re.compile('\w', re.ASCII)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001308 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001309 pat = re.compile('(?a)\w')
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001310 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001311 # Bytes patterns
1312 for flags in (0, re.ASCII):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001313 pat = re.compile(b'\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001314 self.assertIsNone(pat.match(b'\xe0'))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001315 pat = re.compile(b'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001316 self.assertIsNone(pat.match(b'\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001317 # Incompatibilities
1318 self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
1319 self.assertRaises(ValueError, re.compile, b'(?u)\w')
1320 self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
1321 self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
1322 self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
1323 self.assertRaises(ValueError, re.compile, '(?au)\w')
1324
def test_locale_flag(self):
    # Uses the module-level ``locale`` import; the function-level
    # re-import was redundant.
    _, enc = locale.getlocale(locale.LC_CTYPE)
    # Search non-ASCII letter
    # Look for a byte >= 128 that decodes to a letter whose lowercase form
    # is a different character that round-trips to exactly one byte.
    for i in range(128, 256):
        try:
            c = bytes([i]).decode(enc)
            sletter = c.lower()
            if sletter == c: continue
            bletter = sletter.encode(enc)
            if len(bletter) != 1: continue
            if bletter.decode(enc) != sletter: continue
            bpat = re.escape(bytes([i]))
            break
        except (UnicodeError, TypeError):
            # NOTE(review): TypeError presumably covers enc being None
            # when no locale encoding is set -- confirm.
            pass
    else:
        # No suitable letter: only compilation is exercised below.
        bletter = None
        bpat = b'A'
    # Bytes patterns
    # Each row: (pattern, flags, whether bletter is expected to match).
    # Every pattern is compiled even when there is no letter to match.
    checks = [
        (bpat, re.LOCALE | re.IGNORECASE, True),
        (b'(?L)' + bpat, re.IGNORECASE, True),
        (bpat, re.IGNORECASE, False),
        (rb'\w', re.LOCALE, True),
        (rb'(?L)\w', 0, True),
        (rb'\w', 0, False),
    ]
    for pattern, flags, should_match in checks:
        pat = re.compile(pattern, flags)
        if bletter:
            if should_match:
                self.assertTrue(pat.match(bletter))
            else:
                self.assertIsNone(pat.match(bletter))
    # Incompatibilities
    self.assertWarns(DeprecationWarning, re.compile, '', re.LOCALE)
    self.assertWarns(DeprecationWarning, re.compile, '(?L)')
    self.assertWarns(DeprecationWarning, re.compile, b'', re.LOCALE | re.ASCII)
    self.assertWarns(DeprecationWarning, re.compile, b'(?L)', re.ASCII)
    self.assertWarns(DeprecationWarning, re.compile, b'(?a)', re.LOCALE)
    self.assertWarns(DeprecationWarning, re.compile, b'(?aL)')
1370
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001371 def test_bug_6509(self):
1372 # Replacement strings of both types must parse properly.
1373 # all strings
1374 pat = re.compile('a(\w)')
1375 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1376 pat = re.compile('a(.)')
1377 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1378 pat = re.compile('..')
1379 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1380
1381 # all bytes
1382 pat = re.compile(b'a(\w)')
1383 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1384 pat = re.compile(b'a(.)')
1385 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1386 pat = re.compile(b'..')
1387 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1388
def test_dealloc(self):
    # issue 3299: check for segfault in debug build
    import _sre
    # the overflow limit is different on wide and narrow builds and it
    # depends on the definition of SRE_CODE (see sre.h).
    # 2**128 should be big enough to overflow on both. For smaller values
    # a RuntimeError is raised instead of OverflowError.
    oversized_code = 2**128
    with self.assertRaises(TypeError):
        re.finditer("a", {})
    with self.assertRaises(OverflowError):
        _sre.compile("abc", 0, [oversized_code], 0, [], [])
    with self.assertRaises(TypeError):
        _sre.compile({}, 0, [], 0, [], [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 def test_search_dot_unicode(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001404 self.assertTrue(re.search("123.*-", '123abc-'))
1405 self.assertTrue(re.search("123.*-", '123\xe9-'))
1406 self.assertTrue(re.search("123.*-", '123\u20ac-'))
1407 self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1408 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409
def test_compile(self):
    # Test return value when given string and pattern as parameter
    compiled = re.compile('random pattern')
    self.assertIsInstance(compiled, re._pattern_type)
    recompiled = re.compile(compiled)
    self.assertIsInstance(recompiled, re._pattern_type)
    # Compiling an already compiled pattern hands back the same object.
    self.assertIs(recompiled, compiled)
    # Test behaviour when not given a string or pattern as parameter
    with self.assertRaises(TypeError):
        re.compile(0)
1419
def test_bug_13899(self):
    # Issue #13899: re pattern r"[\A]" should work like "A" but matches
    # nothing. Ditto B and Z.
    with self.assertWarns(DeprecationWarning):
        found = re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ')
        self.assertEqual(found, ['A', 'B', '\b', 'C', 'Z'])
Ezio Melottife8e6e72013-01-11 08:32:01 +02001426
@bigmemtest(size=_2G, memuse=1)
def test_large_search(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    haystack = 'a' * size
    at_end = re.search('$', haystack)
    self.assertIsNotNone(at_end)
    # The empty match at the end must report the full length.
    self.assertEqual(at_end.start(), size)
    self.assertEqual(at_end.end(), size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001435
# The huge memuse is because of re.sub() using a list and a join()
# to create the replacement result.
@bigmemtest(size=_2G, memuse=16 + 2)
def test_large_subn(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    text = 'a' * size
    replaced, count = re.subn('', '', text)
    self.assertEqual(replaced, text)
    # One empty match before each character plus one at the end.
    self.assertEqual(count, size + 1)
1445
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001446 def test_bug_16688(self):
1447 # Issue 16688: Backreferences make case-insensitive regex fail on
1448 # non-ASCII strings.
1449 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1450 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001451
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001452 def test_repeat_minmax_overflow(self):
1453 # Issue #13169
1454 string = "x" * 100000
1455 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1456 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1457 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1458 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1459 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1460 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1461 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1462 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1463 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1464 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1465 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1466
@cpython_only
def test_repeat_minmax_overflow_maxrepeat(self):
    try:
        from _sre import MAXREPEAT
    except ImportError:
        self.skipTest('requires _sre.MAXREPEAT constant')
    string = "x" * 100000
    # One below the limit still compiles; the 100000-character subject is
    # simply too short for the exact and lower-bounded forms.
    largest_ok = MAXREPEAT - 1
    self.assertIsNone(re.match(r".{%d}" % largest_ok, string))
    capped = re.match(r".{,%d}" % largest_ok, string)
    self.assertEqual(capped.span(), (0, 100000))
    self.assertIsNone(re.match(r".{%d,}?" % largest_ok, string))
    # MAXREPEAT itself is rejected at compile time.
    for template in (r".{%d}", r".{,%d}", r".{%d,}?"):
        self.assertRaises(OverflowError, re.compile, template % MAXREPEAT)
1481
def test_backref_group_name_in_exception(self):
    # Issue 17341: Poor error message when compiling invalid regex
    pattern = '(?P=<foo>)'
    message = "bad character in group name '<foo>'"
    self.checkPatternError(pattern, message, 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001486
def test_group_name_in_exception(self):
    # Issue 17341: Poor error message when compiling invalid regex
    pattern = '(?P<?foo>)'
    message = "bad character in group name '?foo'"
    self.checkPatternError(pattern, message, 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001491
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001492 def test_issue17998(self):
1493 for reps in '*', '+', '?', '{1}':
1494 for mod in '', '?':
1495 pattern = '.' + reps + mod + 'yz'
1496 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1497 ['xyz'], msg=pattern)
1498 pattern = pattern.encode()
1499 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1500 [b'xyz'], msg=pattern)
1501
def test_match_repr(self):
    # repr() of a match object shows its type, its span and the matched
    # text.
    def expected_repr(match, details):
        # Build the expected repr from the type of the match actually
        # being checked; ``details`` is the "span=..., match=..." part.
        match_type = type(match)
        return "<%s.%s object; %s>" % (match_type.__module__,
                                       match_type.__qualname__, details)

    # str subjects, including a str subclass.
    for string in '[abracadabra]', S('[abracadabra]'):
        m = re.search(r'(.+)(.*?)\1', string)
        self.assertEqual(
            repr(m),
            expected_repr(m, "span=(1, 12), match='abracadabra'"))

    # bytes-like subjects: the matched text is shown as a bytes literal.
    for string in (b'[abracadabra]', B(b'[abracadabra]'),
                   bytearray(b'[abracadabra]'),
                   memoryview(b'[abracadabra]')):
        m = re.search(rb'(.+)(.*?)\1', string)
        self.assertEqual(
            repr(m),
            expected_repr(m, "span=(1, 12), match=b'abracadabra'"))

    first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
    self.assertEqual(repr(first),
                     expected_repr(first, "span=(0, 2), match='aa'"))
    self.assertEqual(repr(second),
                     expected_repr(second, "span=(3, 5), match='bb'"))
1523
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001524
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001525 def test_bug_2537(self):
1526 # issue 2537: empty submatches
1527 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1528 for inner_op in ('{0,}', '*', '?'):
1529 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1530 m = r.match("xyyzy")
1531 self.assertEqual(m.group(0), "xyy")
1532 self.assertEqual(m.group(1), "")
1533 self.assertEqual(m.group(2), "y")
1534
def test_debug_flag(self):
    pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
    # NOTE(review): the nesting whitespace of the expected dump below is
    # assumed to be two spaces per level -- confirm against the parser's
    # actual debug output before relying on it.
    expected_dump = '''\
SUBPATTERN 1
  LITERAL 46
SUBPATTERN None
  BRANCH
    IN
      LITERAL 99
      LITERAL 104
  OR
    LITERAL 112
    LITERAL 121
SUBPATTERN None
  GROUPREF_EXISTS 1
    AT AT_END
  ELSE
    LITERAL 58
    LITERAL 32
'''
    with captured_stdout() as first_run:
        re.compile(pat, re.DEBUG)
    self.assertEqual(first_run.getvalue(), expected_dump)
    # Debug output is output again even a second time (bypassing
    # the cache -- issue #20426).
    with captured_stdout() as second_run:
        re.compile(pat, re.DEBUG)
    self.assertEqual(second_run.getvalue(), expected_dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001563
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001564 def test_keyword_parameters(self):
1565 # Issue #20283: Accepting the string keyword parameter.
1566 pat = re.compile(r'(ab)')
1567 self.assertEqual(
1568 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1569 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001570 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1571 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001572 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1573 self.assertEqual(
1574 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1575 self.assertEqual(
1576 pat.split(string='abracadabra', maxsplit=1),
1577 ['', 'ab', 'racadabra'])
1578 self.assertEqual(
1579 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1580 (7, 9))
1581
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001582 def test_bug_20998(self):
1583 # Issue #20998: Fullmatch of repeated single character pattern
1584 # with ignore case.
1585 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1586
def test_locale_caching(self):
    # Issue #22410
    saved_ctype = locale.setlocale(locale.LC_CTYPE)
    self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
    # Both locales must be available, otherwise the test is skipped.
    for loc in ('en_US.iso88591', 'en_US.utf8'):
        try:
            locale.setlocale(locale.LC_CTYPE, loc)
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs %s locale' % loc)

    # Run the two locale checks in both orders, starting each round from
    # an empty pattern cache.
    rounds = [
        (self.check_en_US_iso88591, self.check_en_US_utf8),
        (self.check_en_US_utf8, self.check_en_US_iso88591),
    ]
    for first_check, second_check in rounds:
        re.purge()
        first_check()
        second_check()
1604
def check_en_US_iso88591(self):
    # Under this locale the bytes 0xC5 and 0xE5 are expected to match
    # each other case-insensitively.
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    pairs = [
        (b'\xc5\xe5', b'\xc5\xe5'),
        (b'\xc5', b'\xe5'),
        (b'\xe5', b'\xc5'),
    ]
    # Flags passed as an argument ...
    for pattern, subject in pairs:
        self.assertTrue(re.match(pattern, subject, re.L|re.I))
    # ... and the same flags given inline.
    for pattern, subject in pairs:
        self.assertTrue(re.match(b'(?Li)' + pattern, subject))
1613
def check_en_US_utf8(self):
    # Under this locale the identical byte pair still matches itself, but
    # 0xC5 and 0xE5 are not expected to match each other.
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    same = (b'\xc5\xe5', b'\xc5\xe5')
    swapped = [(b'\xc5', b'\xe5'), (b'\xe5', b'\xc5')]

    # Flags passed as an argument.
    self.assertTrue(re.match(same[0], same[1], re.L|re.I))
    for pattern, subject in swapped:
        self.assertIsNone(re.match(pattern, subject, re.L|re.I))

    # The same flags given inline.
    self.assertTrue(re.match(b'(?Li)' + same[0], same[1]))
    for pattern, subject in swapped:
        self.assertIsNone(re.match(b'(?Li)' + pattern, subject))
1622
Serhiy Storchakaad446d52014-11-10 13:49:00 +02001623 def test_error(self):
1624 with self.assertRaises(re.error) as cm:
1625 re.compile('(\u20ac))')
1626 err = cm.exception
1627 self.assertIsInstance(err.pattern, str)
1628 self.assertEqual(err.pattern, '(\u20ac))')
1629 self.assertEqual(err.pos, 3)
1630 self.assertEqual(err.lineno, 1)
1631 self.assertEqual(err.colno, 4)
1632 self.assertIn(err.msg, str(err))
1633 self.assertIn(' at position 3', str(err))
1634 self.assertNotIn(' at position 3', err.msg)
1635 # Bytes pattern
1636 with self.assertRaises(re.error) as cm:
1637 re.compile(b'(\xa4))')
1638 err = cm.exception
1639 self.assertIsInstance(err.pattern, bytes)
1640 self.assertEqual(err.pattern, b'(\xa4))')
1641 self.assertEqual(err.pos, 3)
1642 # Multiline pattern
1643 with self.assertRaises(re.error) as cm:
1644 re.compile("""
1645 (
1646 abc
1647 )
1648 )
1649 (
1650 """, re.VERBOSE)
1651 err = cm.exception
1652 self.assertEqual(err.pos, 77)
1653 self.assertEqual(err.lineno, 5)
1654 self.assertEqual(err.colno, 17)
1655 self.assertIn(err.msg, str(err))
1656 self.assertIn(' at position 77', str(err))
1657 self.assertIn('(line 5, column 17)', str(err))
1658
def test_misc_errors(self):
    # Each row: (malformed pattern, expected message, expected position).
    cases = [
        (r'(', 'missing ), unterminated subpattern', 0),
        (r'((a|b)', 'missing ), unterminated subpattern', 0),
        (r'(a|b))', 'unbalanced parenthesis', 5),
        (r'(?P', 'unexpected end of pattern', 3),
        (r'(?z)', 'unknown extension ?z', 1),
        (r'(?iz)', 'unknown flag', 3),
        (r'(?i', 'missing )', 3),
        (r'(?#abc', 'missing ), unterminated comment', 0),
        (r'(?<', 'unexpected end of pattern', 3),
        (r'(?<>)', 'unknown extension ?<>', 1),
        (r'(?', 'unexpected end of pattern', 2),
    ]
    for pattern, message, position in cases:
        self.checkPatternError(pattern, message, position)
1671
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001672
class PatternReprTests(unittest.TestCase):
    # Checks how repr() renders compiled patterns and their flags.

    def check(self, pattern, expected):
        # Compile without explicit flags and compare the repr.
        compiled = re.compile(pattern)
        self.assertEqual(repr(compiled), expected)

    def check_flags(self, pattern, flags, expected):
        # Compile with explicit flags and compare the repr.
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        self.check('random pattern', "re.compile('random pattern')")

    def test_single_flag(self):
        expected = "re.compile('random pattern', re.IGNORECASE)"
        self.check_flags('random pattern', re.IGNORECASE, expected)

    def test_multiple_flags(self):
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.check_flags('random pattern', re.I|re.S|re.X, expected)

    def test_unicode_flag(self):
        # re.U does not show up in the expected repr of a str pattern.
        self.check_flags('random pattern', re.U,
                         "re.compile('random pattern')")
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL)")
        self.check_flags('random pattern', re.I|re.S|re.U, expected)

    def test_inline_flags(self):
        expected = "re.compile('(?i)pattern', re.IGNORECASE)"
        self.check('(?i)pattern', expected)

    def test_unknown_flags(self):
        # Bits without a name are rendered in hexadecimal.
        self.check_flags('random pattern', 0x123000,
                         "re.compile('random pattern', 0x123000)")
        expected = "re.compile('random pattern', re.IGNORECASE|0x123000)"
        self.check_flags('random pattern', 0x123000|re.I, expected)

    def test_bytes(self):
        self.check(b'bytes pattern', "re.compile(b'bytes pattern')")
        expected = "re.compile(b'bytes pattern', re.ASCII)"
        self.check_flags(b'bytes pattern', re.A, expected)

    def test_locale(self):
        expected = "re.compile(b'bytes pattern', re.LOCALE)"
        self.check_flags(b'bytes pattern', re.L, expected)

    def test_quotes(self):
        cases = [
            ('random "double quoted" pattern',
             '''re.compile('random "double quoted" pattern')'''),
            ("random 'single quoted' pattern",
             '''re.compile("random 'single quoted' pattern")'''),
            ('''both 'single' and "double" quotes''',
             '''re.compile('both \\'single\\' and "double" quotes')'''),
        ]
        for pattern, expected in cases:
            self.check(pattern, expected)

    def test_long_pattern(self):
        # The repr of a very long pattern is truncated but keeps its
        # beginning and the trailing flags.
        pattern = 'Very %spattern' % ('long ' * 1000)
        head = "re.compile('Very long long lon"

        plain = repr(re.compile(pattern))
        self.assertLess(len(plain), 300)
        self.assertEqual(plain[:30], head)

        flagged = repr(re.compile(pattern, re.I))
        self.assertLess(len(flagged), 300)
        self.assertEqual(flagged[:30], head)
        self.assertEqual(flagged[-16:], ", re.IGNORECASE)")
1737
1738
class ImplementationTest(unittest.TestCase):
    """
    Test implementation details of the re module.
    """

    def test_overlap_table(self):
        build_table = sre_compile._generate_overlap_table
        # Each row: (input string, expected overlap table).
        expectations = [
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        ]
        for prefix, table in expectations:
            self.assertEqual(build_table(prefix), table)
1752
1753
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001754class ExternalTests(unittest.TestCase):
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001755
def test_re_benchmarks(self):
    're_tests benchmarks'
    from test.re_tests import benchmarks
    pad_width = 10000
    padding = ' ' * pad_width
    for pattern, s in benchmarks:
        with self.subTest(pattern=pattern, string=s):
            p = re.compile(pattern)
            # The bare string must match in every mode.
            self.assertTrue(p.search(s))
            self.assertTrue(p.match(s))
            self.assertTrue(p.fullmatch(s))
            # The same string embedded in padding must still be found,
            # both by searching and by matching inside the window.
            s2 = padding + s + padding
            window_end = pad_width + len(s)
            self.assertTrue(p.search(s2))
            self.assertTrue(p.match(s2, pad_width))
            self.assertTrue(p.match(s2, pad_width, window_end))
            self.assertTrue(p.fullmatch(s2, pad_width, window_end))
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001770
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001771 def test_re_tests(self):
1772 're_tests test suite'
1773 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
1774 for t in tests:
1775 pattern = s = outcome = repl = expected = None
1776 if len(t) == 5:
1777 pattern, s, outcome, repl, expected = t
1778 elif len(t) == 3:
1779 pattern, s, outcome = t
Guido van Rossum41360a41998-03-26 19:42:58 +00001780 else:
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001781 raise ValueError('Test tuples should have 3 or 5 fields', t)
1782
1783 with self.subTest(pattern=pattern, string=s):
1784 if outcome == SYNTAX_ERROR: # Expected a syntax error
1785 with self.assertRaises(re.error):
1786 re.compile(pattern)
1787 continue
1788
1789 obj = re.compile(pattern)
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001790 result = obj.search(s)
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001791 if outcome == FAIL:
1792 self.assertIsNone(result, 'Succeeded incorrectly')
1793 continue
1794
1795 with self.subTest():
1796 self.assertTrue(result, 'Failed incorrectly')
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001797 # Matched, as expected, so now we compute the
1798 # result string and compare it to our expected result.
1799 start, end = result.span(0)
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001800 vardict = {'found': result.group(0),
1801 'groups': result.group(),
1802 'flags': result.re.flags}
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001803 for i in range(1, 100):
1804 try:
1805 gi = result.group(i)
1806 # Special hack because else the string concat fails:
1807 if gi is None:
1808 gi = "None"
1809 except IndexError:
1810 gi = "Error"
1811 vardict['g%d' % i] = gi
1812 for i in result.re.groupindex.keys():
1813 try:
1814 gi = result.group(i)
1815 if gi is None:
1816 gi = "None"
1817 except IndexError:
1818 gi = "Error"
1819 vardict[i] = gi
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001820 self.assertEqual(eval(repl, vardict), expected,
1821 'grouping error')
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001822
Antoine Pitrou22628c42008-07-22 17:53:22 +00001823 # Try the match with both pattern and string converted to
1824 # bytes, and check that it still succeeds.
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001825 try:
Antoine Pitrou22628c42008-07-22 17:53:22 +00001826 bpat = bytes(pattern, "ascii")
1827 bs = bytes(s, "ascii")
1828 except UnicodeEncodeError:
1829 # skip non-ascii tests
1830 pass
1831 else:
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001832 with self.subTest('bytes pattern match'):
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001833 obj = re.compile(bpat)
1834 self.assertTrue(obj.search(bs))
1835
1836 # Try the match with LOCALE enabled, and check that it
1837 # still succeeds.
1838 with self.subTest('locale-sensitive match'):
1839 obj = re.compile(bpat, re.LOCALE)
1840 result = obj.search(bs)
1841 if result is None:
1842 print('=== Fails on locale-sensitive match', t)
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001843
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001844 # Try the match with the search area limited to the extent
1845 # of the match and see if it still succeeds. \B will
1846 # break (because it won't match at the end or start of a
1847 # string), so we'll ignore patterns that feature it.
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001848 if (pattern[:2] != r'\B' and pattern[-2:] != r'\B'
1849 and result is not None):
1850 with self.subTest('range-limited match'):
1851 obj = re.compile(pattern)
1852 self.assertTrue(obj.search(s, start, end + 1))
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001853
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001854 # Try the match with IGNORECASE enabled, and check that it
1855 # still succeeds.
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001856 with self.subTest('case-insensitive match'):
1857 obj = re.compile(pattern, re.IGNORECASE)
1858 self.assertTrue(obj.search(s))
Guido van Rossumdfa67901997-12-08 17:12:06 +00001859
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001860 # Try the match with UNICODE locale enabled, and check
1861 # that it still succeeds.
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001862 with self.subTest('unicode-sensitive match'):
1863 obj = re.compile(pattern, re.UNICODE)
1864 self.assertTrue(obj.search(s))
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001865
Gregory P. Smith5a631832010-07-27 05:31:29 +00001866
# Run this module's tests through unittest's command-line runner when the
# file is executed directly as a script rather than imported.
if __name__ == "__main__":
    unittest.main()