blob: f415f3f8c4b988d5c4f5e1dd98783d86542745ec [file] [log] [blame]
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
Antoine Pitroud2cc7432014-02-03 20:59:59 +01002 cpython_only, captured_stdout
Benjamin Petersone48944b2012-03-07 14:50:25 -06003import io
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02004import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00005import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00006from re import Scanner
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02007import sre_compile
R David Murray26dfaac92013-04-14 13:00:54 -04008import sre_constants
Ezio Melottid2114eb2011-03-25 14:08:44 +02009import sys
10import string
11import traceback
Antoine Pitrou79aa68d2013-10-25 21:36:10 +020012import unittest
Raymond Hettinger027bb632004-05-31 03:09:25 +000013from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000014
Guido van Rossum23b22571997-07-17 22:36:14 +000015# Misc tests from Tim Peters' re.doc
16
Just van Rossum6802c6e2003-07-02 14:36:59 +000017# WARNING: Don't change details in these tests if you don't know
Ezio Melotti42da6632011-03-15 05:18:48 +020018# what you're doing. Some of these tests were carefully modeled to
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +000019# cover most of the code.
20
class S(str):
    """str subclass whose indexing and slicing results are wrapped in S."""

    def __getitem__(self, index):
        item = str.__getitem__(self, index)
        return S(item)
24
class B(bytes):
    """bytes subclass whose indexing and slicing results are wrapped in B."""

    def __getitem__(self, index):
        item = bytes.__getitem__(self, index)
        return B(item)
28
Skip Montanaro8ed06da2003-04-24 19:43:18 +000029class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000030
def assertTypedEqual(self, actual, expect, msg=None):
    """Assert that actual == expect and that the types match exactly.

    Tuples and lists in *expect* are walked element by element (paired
    with *actual* via zip); every other value must have the very same
    type as its counterpart.
    """
    self.assertEqual(actual, expect, msg)

    def check_types(got, wanted):
        if not isinstance(wanted, (tuple, list)):
            self.assertIs(type(got), type(wanted), msg)
            return
        for got_item, wanted_item in zip(got, wanted):
            check_types(got_item, wanted_item)

    check_types(actual, expect)
40
def checkPatternError(self, pattern, errmsg, pos=None):
    """Assert that compiling *pattern* raises re.error with message *errmsg*.

    When *pos* is given, the error's ``pos`` attribute must equal it too.
    """
    with self.assertRaises(re.error) as caught:
        re.compile(pattern)
    failure = caught.exception
    with self.subTest(pattern=pattern):
        self.assertEqual(failure.msg, errmsg)
        if pos is None:
            return
        self.assertEqual(failure.pos, pos)
49
def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
    """Assert that re.sub(pattern, repl, string) raises re.error.

    The error message must equal *errmsg*; when *pos* is given, the
    error's ``pos`` attribute must equal it as well.
    """
    with self.assertRaises(re.error) as caught:
        re.sub(pattern, repl, string)
    failure = caught.exception
    with self.subTest(pattern=pattern, repl=repl):
        self.assertEqual(failure.msg, errmsg)
        if pos is None:
            return
        self.assertEqual(failure.pos, pos)
58
Benjamin Petersone48944b2012-03-07 14:50:25 -060059 def test_keep_buffer(self):
60 # See bug 14212
61 b = bytearray(b'x')
62 it = re.finditer(b'a', b)
63 with self.assertRaises(BufferError):
64 b.extend(b'x'*400)
65 list(it)
66 del it
67 gc_collect()
68 b.extend(b'x'*400)
69
Raymond Hettinger027bb632004-05-31 03:09:25 +000070 def test_weakref(self):
71 s = 'QabbbcR'
72 x = re.compile('ab+c')
73 y = proxy(x)
74 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
75
Skip Montanaro8ed06da2003-04-24 19:43:18 +000076 def test_search_star_plus(self):
77 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
78 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
79 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
80 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030081 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000082 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
83 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
84 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
85 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030086 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000087
def bump_num(self, matchobj):
    """Return the whole match, read as an integer, plus one, as a string."""
    return str(int(matchobj.group(0)) + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000091
def test_basic_re_sub(self):
    # Basic re.sub() behaviour. assertTypedEqual also checks the result
    # type: plain str for str-like subjects, plain bytes for bytes-like.
    self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
    self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
    self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
    self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
    for y in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')

    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    # Callable replacement, without and with a count limit.
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3),
                     '9.3 -3 23x99y')

    # A callable's return value is inserted verbatim; a template string
    # has its escapes processed.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    # Raw strings: "\g" is not a valid Python string escape, so the
    # templates are spelled as raw literals (same runtime value).
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    # The first template is raw (escapes handled by re); the next two are
    # ordinary literals that already contain the control characters.
    self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
    # Each of these backslash-letter templates must emit a
    # DeprecationWarning and be left in the output unchanged.
    for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
        with self.subTest(c):
            with self.assertWarns(DeprecationWarning):
                self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)

    # Raw string: "\s" is not a valid Python string escape.
    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000131
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000132 def test_bug_449964(self):
133 # fails for group followed by other escape
134 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
135 'xx\bxx\b')
136
137 def test_bug_449000(self):
138 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000139 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
140 'abc\ndef\n')
141 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
142 'abc\ndef\n')
143 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
144 'abc\ndef\n')
145 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
146 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000147
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000148 def test_bug_1661(self):
149 # Verify that flags do not get silently ignored with compiled patterns
150 pattern = re.compile('.')
151 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
152 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
153 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
154 self.assertRaises(ValueError, re.compile, pattern, re.I)
155
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000156 def test_bug_3629(self):
157 # A regex that triggered a bug in the sre-code validator
158 re.compile("(?P<quote>)(?(quote))")
159
def test_sub_template_numeric_escape(self):
    # bug 776311 and friends
    # (template, expected output) pairs for numeric escapes in templates.
    escape_cases = [
        (r'\0', '\0'),
        (r'\000', '\000'),
        (r'\001', '\001'),
        (r'\008', '\0' + '8'),
        (r'\009', '\0' + '9'),
        (r'\111', '\111'),
        (r'\117', '\117'),
        (r'\377', '\377'),

        (r'\1111', '\1111'),
        (r'\1111', '\111' + '1'),

        (r'\00', '\x00'),
        (r'\07', '\x07'),
        (r'\08', '\0' + '8'),
        (r'\09', '\0' + '9'),
        (r'\0a', '\0' + 'a'),
    ]
    for template, expected in escape_cases:
        self.assertEqual(re.sub('x', template, 'x'), expected)

    # Three-digit escapes above 0o377 must be rejected at position 0.
    out_of_range = [
        (r'\400', r'octal escape value \400 outside of range 0-0o377'),
        (r'\777', r'octal escape value \777 outside of range 0-0o377'),
    ]
    for template, message in out_of_range:
        self.checkTemplateError('x', template, 'x', message, 0)

    # The pattern 'x' defines no groups, so each of these is an error.
    bad_references = (
        r'\1', r'\8', r'\9', r'\11', r'\18', r'\1a', r'\90', r'\99',
        r'\118',  # r'\11' + '8'
        r'\11a',
        r'\181',  # r'\18' + '1'
        r'\800',  # r'\80' + '0'
    )
    for template in bad_references:
        self.checkTemplateError('x', template, 'x', 'invalid group reference')

    # in python2.3 (etc), these loop endlessly in sre_parser.py
    nested_x = '(((((((((((x)))))))))))'
    nested_y_then_any = '((((((((((y))))))))))(.)'
    self.assertEqual(re.sub(nested_x, r'\11', 'x'), 'x')
    self.assertEqual(re.sub(nested_y_then_any, r'\118', 'xyz'), 'xz8')
    self.assertEqual(re.sub(nested_y_then_any, r'\11a', 'xyz'), 'xza')
206
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000207 def test_qualified_re_sub(self):
208 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
Victor Stinner55e614a2014-10-29 16:58:59 +0100209 self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000210
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000211 def test_bug_114660(self):
212 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
213 'hello there')
214
215 def test_bug_462270(self):
216 # Test for empty sub() behaviour, see SF bug #462270
217 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
218 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
219
def test_symbolic_groups(self):
    # Named groups: definition (?P<name>...), backreference (?P=name) and
    # conditional (?(name)...), plus the errors raised for bad names.
    re.compile('(?P<a>x)(?P=a)(?(a)y)')
    re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
    # Raw string: in a non-raw literal '\1' is the character chr(1), so the
    # numeric backreference was never actually compiled.
    re.compile(r'(?P<a1>x)\1(?(1)y)')
    self.checkPatternError('(?P<a>)(?P<a>)',
                           "redefinition of group name 'a' as group 2; "
                           "was group 1")
    self.checkPatternError('(?P<a>(?P=a))',
                           "cannot refer to an open group", 10)
    self.checkPatternError('(?Pxy)', 'unknown extension ?Px')
    self.checkPatternError('(?P<a>)(?P=a', 'missing ), unterminated name', 11)
    self.checkPatternError('(?P=', 'missing group name', 4)
    self.checkPatternError('(?P=)', 'missing group name', 4)
    self.checkPatternError('(?P=1)', "bad character in group name '1'", 4)
    self.checkPatternError('(?P=a)', "unknown group name 'a'")
    self.checkPatternError('(?P=a1)', "unknown group name 'a1'")
    self.checkPatternError('(?P=a.)', "bad character in group name 'a.'", 4)
    self.checkPatternError('(?P<)', 'missing >, unterminated name', 4)
    self.checkPatternError('(?P<a', 'missing >, unterminated name', 4)
    self.checkPatternError('(?P<', 'missing group name', 4)
    self.checkPatternError('(?P<>)', 'missing group name', 4)
    self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
    self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
    self.checkPatternError(r'(?(', 'missing group name', 3)
    self.checkPatternError(r'(?())', 'missing group name', 3)
    self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
    self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
    self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
    self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
    # New valid/invalid identifiers in Python 3
    re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
    re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
    self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
    # Support > 100 groups.
    # Alternative i is 'x(?P<a{i}>{i in hex})y'; 'c8' is 200 in hex, so the
    # subject below makes group 200 participate in the match.
    pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    pat = '(?:%s)(?(200)z|t)' % pat
    self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200257
def test_symbolic_refs(self):
    # \g<name> / \g<number> references in replacement templates.
    # All templates are raw strings: "\g" is not a valid Python string
    # escape, so the non-raw spelling relied on a deprecated fallback.
    self.checkTemplateError('(?P<a>x)', r'\g<a', 'xx',
                            'missing >, unterminated name', 3)
    self.checkTemplateError('(?P<a>x)', r'\g<', 'xx',
                            'missing group name', 3)
    self.checkTemplateError('(?P<a>x)', r'\g', 'xx', 'missing <', 2)
    self.checkTemplateError('(?P<a>x)', r'\g<a a>', 'xx',
                            "bad character in group name 'a a'", 3)
    self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx',
                            'missing group name', 3)
    self.checkTemplateError('(?P<a>x)', r'\g<1a1>', 'xx',
                            "bad character in group name '1a1'", 3)
    self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
                            'invalid group reference')
    self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
                            'invalid group reference')
    # An unknown (but well-formed) name is reported as IndexError.
    with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
        re.sub('(?P<a>x)', r'\g<ab>', 'xx')
    # A group that did not participate in the match expands to ''.
    self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
    self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
    self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
                            "bad character in group name '-1'", 3)
    # New valid/invalid identifiers in Python 3
    self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
    self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
    self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
                            "bad character in group name '©'", 3)
    # Support > 100 groups.
    pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000288
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000289 def test_re_subn(self):
290 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
291 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
292 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
293 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
Victor Stinner55e614a2014-10-29 16:58:59 +0100294 self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000295
def test_re_split(self):
    # split() on str-like subjects; assertTypedEqual also checks that every
    # piece has exactly the type of the expected piece.
    text_cases = [
        (":", ['', 'a', 'b', '', 'c']),
        (":+", ['', 'a', 'b', 'c']),
        ("(:+)", ['', ':', 'a', ':', 'b', '::', 'c']),
    ]
    for subject in ":a:b::c", S(":a:b::c"):
        for sep, expected in text_cases:
            self.assertTypedEqual(re.split(sep, subject), expected)

    # The same separators on bytes-like subjects.
    bytes_cases = [
        (b":", [b'', b'a', b'b', b'', b'c']),
        (b":+", [b'', b'a', b'b', b'c']),
        (b"(:+)", [b'', b':', b'a', b':', b'b', b'::', b'c']),
    ]
    for subject in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                    memoryview(b":a:b::c")):
        for sep, expected in bytes_cases:
            self.assertTypedEqual(re.split(sep, subject), expected)

    # Non-ASCII fields; each 3-character string unpacks into a, b, c.
    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                    "\U0001d49c\U0001d49e\U0001d4b5"):
        subject = ":%s:%s::%s" % (a, b, c)
        self.assertEqual(re.split(":", subject), ['', a, b, '', c])
        self.assertEqual(re.split(":+", subject), ['', a, b, c])
        self.assertEqual(re.split("(:+)", subject),
                         ['', ':', a, ':', b, '::', c])

    # Capturing versus non-capturing separator groups.
    grouping_cases = [
        ("(?::+)", ['', 'a', 'b', 'c']),
        ("(:)+", ['', ':', 'a', ':', 'b', ':', 'c']),
        ("([b:]+)", ['', ':', 'a', ':b::', 'c']),
        ("(b)|(:+)", ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c']),
        ("(?:b)|(?::+)", ['', 'a', '', '', 'c']),
    ]
    for sep, expected in grouping_cases:
        self.assertEqual(re.split(sep, ":a:b::c"), expected)

    # Separators that can match the empty string must emit FutureWarning.
    may_match_empty = [
        (':*', ['', 'a', 'b', 'c']),
        ('(?::*)', ['', 'a', 'b', 'c']),
        ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
        ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
    ]
    for sep, expected in may_match_empty:
        with self.subTest(sep=sep), self.assertWarns(FutureWarning):
            self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)

    # Separators that only ever match the empty string must raise
    # ValueError.
    only_match_empty = [
        ('', [':a:b::c']),
        (r'\b', [':a:b::c']),
        (r'(?=:)', [':a:b::c']),
        (r'(?<=:)', [':a:b::c']),
    ]
    for sep, expected in only_match_empty:
        with self.subTest(sep=sep), self.assertRaises(ValueError):
            self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
348
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000349 def test_qualified_re_split(self):
Victor Stinner55e614a2014-10-29 16:58:59 +0100350 self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
351 self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
352 self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000353 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200354 self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000355 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200356 with self.assertWarns(FutureWarning):
357 self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
358 ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000359
def test_re_findall(self):
    # findall() with zero, one and two groups on str-like, bytes-like and
    # non-ASCII subjects.
    self.assertEqual(re.findall(":+", "abc"), [])

    text_cases = [
        (":+", [":", "::", ":::"]),
        ("(:+)", [":", "::", ":::"]),
        ("(:)(:*)", [(":", ""), (":", ":"), (":", "::")]),
    ]
    for subject in "a:b::c:::d", S("a:b::c:::d"):
        for pattern, expected in text_cases:
            self.assertTypedEqual(re.findall(pattern, subject), expected)

    bytes_cases = [
        (b":+", [b":", b"::", b":::"]),
        (b"(:+)", [b":", b"::", b":::"]),
        (b"(:)(:*)", [(b":", b""), (b":", b":"), (b":", b"::")]),
    ]
    for subject in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
                    memoryview(b"a:b::c:::d")):
        for pattern, expected in bytes_cases:
            self.assertTypedEqual(re.findall(pattern, subject), expected)

    for x in ("\xe0", "\u0430", "\U0001d49c"):
        doubled = x * 2
        tripled = x * 3
        subject = "a%sb%sc%sd" % (x, doubled, tripled)
        self.assertEqual(re.findall("%s+" % x, subject),
                         [x, doubled, tripled])
        self.assertEqual(re.findall("(%s+)" % x, subject),
                         [x, doubled, tripled])
        self.assertEqual(re.findall("(%s)(%s*)" % (x, x), subject),
                         [(x, ""), (x, x), (x, doubled)])
Guido van Rossum49946571997-07-18 04:26:25 +0000385
Skip Montanaro5ba00542003-04-25 16:00:14 +0000386 def test_bug_117612(self):
387 self.assertEqual(re.findall(r"(a|(b))", "aba"),
388 [("a", ""),("b", "b"),("a", "")])
389
def test_re_match(self):
    # match() results seen through groups() and group() for str-like,
    # bytes-like and non-ASCII subjects.
    for string in 'a', S('a'):
        self.assertEqual(re.match('a', string).groups(), ())
        self.assertEqual(re.match('(a)', string).groups(), ('a',))
        self.assertEqual(re.match('(a)', string).group(0), 'a')
        self.assertEqual(re.match('(a)', string).group(1), 'a')
        self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a'))
    for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
        self.assertEqual(re.match(b'a', string).groups(), ())
        self.assertEqual(re.match(b'(a)', string).groups(), (b'a',))
        self.assertEqual(re.match(b'(a)', string).group(0), b'a')
        self.assertEqual(re.match(b'(a)', string).group(1), b'a')
        self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
    for a in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.match(a, a).groups(), ())
        self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
        self.assertEqual(re.match('(%s)' % a, a).group(0), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))

    # Groups that did not participate are None, or the default that is
    # passed to groups().
    pat = re.compile('((a)|(b))(c)?')
    self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
    self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
    self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
    self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
    self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

    # A single group
    m = re.match('(a)', 'a')
    # group() with no argument must equal group(0); previously group(0)
    # was simply asserted twice and the no-argument form went untested.
    self.assertEqual(m.group(), 'a')
    self.assertEqual(m.group(0), 'a')
    self.assertEqual(m.group(1), 'a')
    self.assertEqual(m.group(1, 1), ('a', 'a'))

    # Groups can be addressed by number, by name, or by a mix of both.
    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                     (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000429
def test_re_fullmatch(self):
    # Issue 16203: Proposal: add re.fullmatch() method.
    self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
    for subject in "ab", S("ab"):
        self.assertEqual(re.fullmatch(r"a|ab", subject).span(), (0, 2))
    for subject in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
        self.assertEqual(re.fullmatch(br"a|ab", subject).span(), (0, 2))
    for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
        alternation = r"%s|%s" % (a, a + b)
        self.assertEqual(re.fullmatch(alternation, a + b).span(), (0, 2))

    # Lazy quantifiers must still stretch to cover the whole subject.
    for pattern, subject, expected in [
        (r".*?$", "abc", (0, 3)),
        (r".*?", "abc", (0, 3)),
        (r"a.*?b", "ab", (0, 2)),
        (r"a.*?b", "abb", (0, 3)),
        (r"a.*?b", "axxb", (0, 4)),
    ]:
        self.assertEqual(re.fullmatch(pattern, subject).span(), expected)

    # These patterns leave part of the subject unmatched, so no full match.
    for pattern, subject in [
        (r"a+", "ab"),
        (r"abc$", "abc\n"),
        (r"abc\Z", "abc\n"),
        (r"(?m)abc$", "abc\n"),
    ]:
        self.assertIsNone(re.fullmatch(pattern, subject))

    # Lookahead / lookbehind assertions inside a full match.
    for pattern, subject, expected in [
        (r"ab(?=c)cd", "abcd", (0, 4)),
        (r"ab(?<=b)cd", "abcd", (0, 4)),
        (r"(?=a|ab)ab", "ab", (0, 2)),
    ]:
        self.assertEqual(re.fullmatch(pattern, subject).span(), expected)

    # With pos/endpos, the pattern must cover exactly that window.
    for pattern in (r"bc", r".*?$", r".*?"):
        windowed = re.compile(pattern).fullmatch("abcd", pos=1, endpos=3)
        self.assertEqual(windowed.span(), (1, 3))
459
def test_re_groupref_exists(self):
    # Conditional patterns (?(group)yes|no).
    # Patterns containing "\(" / "\)" are raw strings: those are not valid
    # Python string escapes, so the non-raw spelling relied on a deprecated
    # fallback. The compiled regex is unchanged.
    self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
                     ('(', 'a'))
    self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a').groups(),
                     (None, 'a'))
    self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a)'))
    self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a'))
    self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
                     ('a', 'b'))
    self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
                     (None, 'd'))
    self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
                     (None, 'd'))
    self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
                     ('a', ''))

    # Tests for bug #1177831: exercise groups other than the first group
    p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
    self.assertEqual(p.match('abc').groups(),
                     ('a', 'b', 'c'))
    self.assertEqual(p.match('ad').groups(),
                     ('a', None, 'd'))
    self.assertIsNone(p.match('abd'))
    self.assertIsNone(p.match('ac'))

    # Support > 100 groups.
    pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    pat = '(?:%s)(?(200)z)' % pat
    self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))

    # Malformed conditionals must be rejected at compile time.
    self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
    self.checkPatternError(r'()(?(1)a|b',
                           'missing ), unterminated subpattern', 2)
    self.checkPatternError(r'()(?(1)a|b|c)',
                           'conditional backref with more than '
                           'two branches', 10)
496
def test_re_groupref_overflow(self):
    # A group number equal to sre_constants.MAXGROUPS must be rejected as
    # an invalid reference, both in a replacement template and in a
    # conditional pattern.
    # The template is a raw string: "\g" is not a valid Python string
    # escape.
    self.checkTemplateError('()', r'\g<%s>' % sre_constants.MAXGROUPS, 'xx',
                            'invalid group reference', 3)
    self.checkPatternError(r'(?P<a>)(?(%d))' % sre_constants.MAXGROUPS,
                           'invalid group reference', 10)
502
def test_re_groupref(self):
    # Numeric backreferences (\1) to optional and alternated groups.
    matching = [
        (r'^(\|)?([^()]+)\1$', '|a|', ('|', 'a')),
        (r'^(\|)?([^()]+)\1?$', 'a', (None, 'a')),
        (r'^(?:(a)|c)(\1)$', 'aa', ('a', 'a')),
        (r'^(?:(a)|c)(\1)?$', 'c', (None, None)),
    ]
    for pattern, subject, expected in matching:
        self.assertEqual(re.match(pattern, subject).groups(), expected)

    for subject in ('a|', '|a'):
        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', subject))

    self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
516
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000517 def test_groupdict(self):
518 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
519 'first second').groupdict(),
520 {'first':'first', 'second':'second'})
521
522 def test_expand(self):
523 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
524 "first second")
525 .expand(r"\2 \1 \g<second> \g<first>"),
526 "second first second first")
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300527 self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
528 "first")
529 .expand(r"\2 \g<second>"),
530 " ")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000531
def test_repeat_minmax(self):
    # Bounded repeats {m} and {m,n}, in greedy and lazy ("?"-suffixed) form.
    # Patterns containing "\w" are raw strings: "\w" is not a valid Python
    # string escape. The compiled regex is unchanged.
    self.assertIsNone(re.match(r"^(\w){1}$", "abc"))
    self.assertIsNone(re.match(r"^(\w){1}?$", "abc"))
    self.assertIsNone(re.match(r"^(\w){1,2}$", "abc"))
    self.assertIsNone(re.match(r"^(\w){1,2}?$", "abc"))

    self.assertEqual(re.match(r"^(\w){3}$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){1,3}$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){1,4}$", "abc").group(1), "c")
    # Greedy {3,4}: this line used to repeat the lazy {3,4}? case that is
    # already asserted at the end of the lazy run below.
    self.assertEqual(re.match(r"^(\w){3,4}$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){3}?$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){1,3}?$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){1,4}?$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c")

    self.assertIsNone(re.match("^x{1}$", "xxx"))
    self.assertIsNone(re.match("^x{1}?$", "xxx"))
    self.assertIsNone(re.match("^x{1,2}$", "xxx"))
    self.assertIsNone(re.match("^x{1,2}?$", "xxx"))

    self.assertTrue(re.match("^x{3}$", "xxx"))
    self.assertTrue(re.match("^x{1,3}$", "xxx"))
    self.assertTrue(re.match("^x{3,3}$", "xxx"))
    self.assertTrue(re.match("^x{1,4}$", "xxx"))
    # Greedy {3,4}: same duplicated-lazy-case fix as above.
    self.assertTrue(re.match("^x{3,4}$", "xxx"))
    self.assertTrue(re.match("^x{3}?$", "xxx"))
    self.assertTrue(re.match("^x{1,3}?$", "xxx"))
    self.assertTrue(re.match("^x{1,4}?$", "xxx"))
    self.assertTrue(re.match("^x{3,4}?$", "xxx"))

    # "{}" with no bounds is literal text, not a repeat.
    self.assertIsNone(re.match("^x{}$", "xxx"))
    self.assertTrue(re.match("^x{}$", "x{}"))

    self.checkPatternError(r'x{2,1}',
                           'min repeat greater than max repeat', 2)
567
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000568 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000569 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000570 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000571 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
572 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
573 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
574 {'first': 1, 'other': 2})
575
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000576 self.assertEqual(re.match("(a)", "a").pos, 0)
577 self.assertEqual(re.match("(a)", "a").endpos, 1)
578 self.assertEqual(re.match("(a)", "a").string, "a")
579 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300580 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000581
Serhiy Storchaka07360df2015-03-30 01:01:48 +0300582 # Issue 14260. groupindex should be non-modifiable mapping.
583 p = re.compile(r'(?i)(?P<first>a)(?P<other>b)')
584 self.assertEqual(sorted(p.groupindex), ['first', 'other'])
585 self.assertEqual(p.groupindex['other'], 2)
586 with self.assertRaises(TypeError):
587 p.groupindex['other'] = 0
588 self.assertEqual(p.groupindex['other'], 2)
589
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000590 def test_special_escapes(self):
591 self.assertEqual(re.search(r"\b(b.)\b",
592 "abcd abc bcd bx").group(1), "bx")
593 self.assertEqual(re.search(r"\B(b.)\B",
594 "abc bcd bc abxd").group(1), "bx")
595 self.assertEqual(re.search(r"\b(b.)\b",
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300596 "abcd abc bcd bx", re.ASCII).group(1), "bx")
597 self.assertEqual(re.search(r"\B(b.)\B",
598 "abc bcd bc abxd", re.ASCII).group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000599 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
600 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300601 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300602 self.assertEqual(re.search(br"\b(b.)\b",
603 b"abcd abc bcd bx").group(1), b"bx")
604 self.assertEqual(re.search(br"\B(b.)\B",
605 b"abc bcd bc abxd").group(1), b"bx")
606 self.assertEqual(re.search(br"\b(b.)\b",
607 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
608 self.assertEqual(re.search(br"\B(b.)\B",
609 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
610 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
611 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300612 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000613 self.assertEqual(re.search(r"\d\D\w\W\s\S",
614 "1aa! a").group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300615 self.assertEqual(re.search(br"\d\D\w\W\s\S",
616 b"1aa! a").group(0), b"1aa! a")
617 self.assertEqual(re.search(r"\d\D\w\W\s\S",
618 "1aa! a", re.ASCII).group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300619 self.assertEqual(re.search(br"\d\D\w\W\s\S",
620 b"1aa! a", re.LOCALE).group(0), b"1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000621
def test_other_escapes(self):
    self.checkPatternError("\\", 'bad escape (end of pattern)', 0)

    # Escaped punctuation matches itself: (pattern, subject, matched text).
    accepted = (
        (r"\(", '(', '('),
        (r"\\", '\\', '\\'),
        (r"[\]]", ']', ']'),
        (r"[a\-c]", '-', '-'),
        (r"[\^a]+", 'a^', 'a^'),
    )
    for pattern, subject, expected in accepted:
        self.assertEqual(re.match(pattern, subject).group(), expected)

    # ... and nothing else.
    rejected = (
        (r"\(", ')'),
        (r"[\]]", '['),
        (r"[a\-c]", 'b'),
        (r"[\^a]+", 'b'),
    )
    for pattern, subject in rejected:
        self.assertIsNone(re.match(pattern, subject))

    re.purge()  # for warnings
    # Unknown letter escapes outside a character set.
    for letter in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY':
        with self.subTest(letter), self.assertWarns(DeprecationWarning):
            escaped = '\\%c' % letter
            self.assertEqual(re.fullmatch(escaped, letter).group(), letter)
            self.assertIsNone(re.match('\\%c' % letter, 'a'))
    # Unknown letter escapes inside a character set.
    for letter in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
        with self.subTest(letter), self.assertWarns(DeprecationWarning):
            escaped_set = '[\\%c]' % letter
            self.assertEqual(re.fullmatch(escaped_set, letter).group(), letter)
            self.assertIsNone(re.match('[\\%c]' % letter, 'a'))
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200644
Ezio Melotti5a045b92012-02-29 11:48:44 +0200645 def test_string_boundaries(self):
646 # See http://bugs.python.org/issue10713
647 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
648 "abc")
649 # There's a word boundary at the start of a string.
650 self.assertTrue(re.match(r"\b", "abc"))
651 # A non-empty string includes a non-boundary zero-length match.
652 self.assertTrue(re.search(r"\B", "abc"))
653 # There is no non-boundary match at the start of a string.
654 self.assertFalse(re.match(r"\B", "abc"))
655 # However, an empty string contains no word boundaries, and also no
656 # non-boundaries.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300657 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200658 # This one is questionable and different from the perlre behaviour,
659 # but describes current behavior.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300660 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200661 # A single word-character string has two boundaries, but no
662 # non-boundary gaps.
663 self.assertEqual(len(re.findall(r"\b", "a")), 2)
664 self.assertEqual(len(re.findall(r"\B", "a")), 0)
665 # If there are no words, there are no boundaries
666 self.assertEqual(len(re.findall(r"\b", " ")), 0)
667 self.assertEqual(len(re.findall(r"\b", " ")), 0)
668 # Can match around the whitespace.
669 self.assertEqual(len(re.findall(r"\B", " ")), 2)
670
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000671 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000672 self.assertEqual(re.match("([\u2222\u2223])",
673 "\u2222").group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300674 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300675 self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000676
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100677 def test_big_codesize(self):
678 # Issue #1160
679 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300680 self.assertTrue(r.match('1000'))
681 self.assertTrue(r.match('9999'))
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100682
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000683 def test_anyall(self):
684 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
685 "a\nb")
686 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
687 "a\n\nb")
688
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200689 def test_lookahead(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000690 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
691 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
692 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
693 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
694 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
695 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
696 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
697
698 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
699 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
700 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
701 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
702
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200703 # Group reference.
704 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
705 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
706 # Conditional group reference.
707 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
708 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
709 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
710 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
711 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
712 # Group used before defined.
713 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
714 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
715 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
716
717 def test_lookbehind(self):
718 self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
719 self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
720 self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
721 self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
722 # Group reference.
723 self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
724 self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
725 self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
726 self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
727 # Conditional group reference.
728 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
729 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
730 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
731 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
732 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
733 # Group used before defined.
734 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
735 self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc'))
736 self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc'))
737 # Group defined in the same lookbehind pattern
738 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)')
739 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)')
740 self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)')
741 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)')
742
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000743 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000744 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300745 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000746 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
747 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
748 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
749 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
750 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
751 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
752 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
753 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
754
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200755 assert '\u212a'.lower() == 'k' # 'K'
756 self.assertTrue(re.match(r'K', '\u212a', re.I))
757 self.assertTrue(re.match(r'k', '\u212a', re.I))
758 self.assertTrue(re.match(r'\u212a', 'K', re.I))
759 self.assertTrue(re.match(r'\u212a', 'k', re.I))
760 assert '\u017f'.upper() == 'S' # 'ſ'
761 self.assertTrue(re.match(r'S', '\u017f', re.I))
762 self.assertTrue(re.match(r's', '\u017f', re.I))
763 self.assertTrue(re.match(r'\u017f', 'S', re.I))
764 self.assertTrue(re.match(r'\u017f', 's', re.I))
765 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
766 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
767 self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
768
769 def test_ignore_case_set(self):
770 self.assertTrue(re.match(r'[19A]', 'A', re.I))
771 self.assertTrue(re.match(r'[19a]', 'a', re.I))
772 self.assertTrue(re.match(r'[19a]', 'A', re.I))
773 self.assertTrue(re.match(r'[19A]', 'a', re.I))
774 self.assertTrue(re.match(br'[19A]', b'A', re.I))
775 self.assertTrue(re.match(br'[19a]', b'a', re.I))
776 self.assertTrue(re.match(br'[19a]', b'A', re.I))
777 self.assertTrue(re.match(br'[19A]', b'a', re.I))
778 assert '\u212a'.lower() == 'k' # 'K'
779 self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
780 self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
781 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
782 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
783 assert '\u017f'.upper() == 'S' # 'ſ'
784 self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
785 self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
786 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
787 self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
788 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
789 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
790 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
791
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200792 def test_ignore_case_range(self):
793 # Issues #3511, #17381.
794 self.assertTrue(re.match(r'[9-a]', '_', re.I))
795 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
796 self.assertTrue(re.match(br'[9-a]', b'_', re.I))
797 self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
798 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
799 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
800 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
801 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
802 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
803 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
804 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
805 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
806 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
807 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
808 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
809 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
810
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200811 assert '\u212a'.lower() == 'k' # 'K'
812 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
813 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
814 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
815 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
816 assert '\u017f'.upper() == 'S' # 'ſ'
817 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
818 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
819 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
820 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
821 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
822 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
823 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
824
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000825 def test_category(self):
826 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
827
828 def test_getlower(self):
829 import _sre
830 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
831 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
832 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200833 self.assertEqual(_sre.getlower(ord('A'), re.ASCII), ord('a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000834
835 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300836 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200837 self.assertEqual(re.match("abc", "ABC", re.I|re.A).group(0), "ABC")
838 self.assertEqual(re.match(b"abc", b"ABC", re.I|re.L).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000839
840 def test_not_literal(self):
841 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
842 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
843
844 def test_search_coverage(self):
845 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
846 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
847
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    # Assert that matcher(pattern, text) succeeds and that the matched text
    # and span equal `match` and `span`.  When both are omitted the pattern
    # is expected to match the whole of `text`.  Giving only one of them
    # raises ValueError.
    omitted = (match is None, span is None)
    if all(omitted):
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    elif any(omitted):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    result = matcher(pattern, text)
    self.assertTrue(result)
    self.assertEqual(result.group(), match)
    self.assertEqual(result.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000861
def test_re_escape(self):
    # Expected escaping for each of the first 256 code points: ASCII
    # letters, digits and '_' are untouched, NUL becomes '\000', and
    # everything else gets a leading backslash.
    word_chars = string.ascii_letters + string.digits + '_'
    all_chars = ''.join(map(chr, range(256)))
    for char in all_chars:
        if char in word_chars:
            expected = char
        elif char == '\x00':
            expected = '\\000'
        else:
            expected = '\\' + char
        self.assertEqual(re.escape(char), expected)
        # The escaped form must match the original character.
        self.assertMatch(re.escape(char), char)
    self.assertMatch(re.escape(all_chars), all_chars)
Guido van Rossum49946571997-07-18 04:26:25 +0000874
def test_re_escape_byte(self):
    # Bytes counterpart of test_re_escape: ASCII letters, digits and '_'
    # are untouched, NUL becomes b'\000', and every other byte gets a
    # leading backslash.
    word_bytes = (string.ascii_letters + string.digits + '_').encode('ascii')
    every_byte = bytes(range(256))
    for value in every_byte:
        single = bytes([value])
        if single in word_bytes:
            expected = single
        elif value == 0:
            expected = b'\\000'
        else:
            expected = b'\\' + single
        self.assertEqual(re.escape(single), expected)
        # The escaped form must match the original byte.
        self.assertMatch(re.escape(single), single)
    self.assertMatch(re.escape(every_byte), every_byte)
Guido van Rossum698280d2008-09-10 17:44:35 +0000888
def test_re_escape_non_ascii(self):
    # Non-ASCII characters are escaped with a backslash and the escaped
    # pattern still matches the original text.
    text = 'xxx\u2620\u2620\u2620xxx'
    escaped = re.escape(text)
    self.assertEqual(escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
    self.assertMatch(escaped, text)
    # An escaped character can be embedded in a larger pattern.
    embedded = '.%s+.' % re.escape('\u2620')
    self.assertMatch(embedded, text,
                     'x\u2620\u2620\u2620x', (2, 7), re.search)
896
def test_re_escape_non_ascii_bytes(self):
    # Each byte of a UTF-8 encoded non-ASCII character is escaped
    # separately.
    encoded = 'y\u2620y\u2620y'.encode('utf-8')
    escaped = re.escape(encoded)
    self.assertEqual(escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
    self.assertMatch(escaped, encoded)
    # The escaped character occurs twice in the encoded text.
    needle = re.escape('\u2620'.encode('utf-8'))
    occurrences = re.findall(needle, encoded)
    self.assertEqual(len(occurrences), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000904
Serhiy Storchakab85a9762014-09-15 11:33:19 +0300905 def test_pickling(self):
906 import pickle
907 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
908 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
909 pickled = pickle.dumps(oldpat, proto)
910 newpat = pickle.loads(pickled)
911 self.assertEqual(newpat, oldpat)
912 # current pickle expects the _compile() reconstructor in re module
913 from re import _compile
Guido van Rossum23b22571997-07-17 22:36:14 +0000914
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000915 def test_constants(self):
916 self.assertEqual(re.I, re.IGNORECASE)
917 self.assertEqual(re.L, re.LOCALE)
918 self.assertEqual(re.M, re.MULTILINE)
919 self.assertEqual(re.S, re.DOTALL)
920 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000921
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000922 def test_flags(self):
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200923 for flag in [re.I, re.M, re.X, re.S, re.A, re.U]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300924 self.assertTrue(re.compile('^pattern$', flag))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200925 for flag in [re.I, re.M, re.X, re.S, re.A, re.L]:
926 self.assertTrue(re.compile(b'^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +0000927
def test_sre_character_literals(self):
    # Each row: (exclusive upper bound on the code points the escape form
    # is tried with, pattern template, literal text expected after the
    # character).
    templates = (
        (256, r"\%03o", ""),
        (256, r"\%03o0", "0"),
        (256, r"\%03o8", "8"),
        (256, r"\x%02x", ""),
        (256, r"\x%02x0", "0"),
        (256, r"\x%02xz", "z"),
        (0x10000, r"\u%04x", ""),
        (0x10000, r"\u%04x0", "0"),
        (0x10000, r"\u%04xz", "z"),
        (0x110000, r"\U%08x", ""),
        (0x110000, r"\U%08x0", "0"),
        (0x110000, r"\U%08xz", "z"),
    )
    for code in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        for limit, template, suffix in templates:
            if code < limit:
                self.assertTrue(re.match(template % code, chr(code) + suffix))

    # Short octal escapes followed by a non-octal digit.
    octal_cases = (
        (r"\0", "\000"),
        (r"\08", "\0008"),
        (r"\01", "\001"),
        (r"\018", "\0018"),
    )
    for pattern, subject in octal_cases:
        self.assertTrue(re.match(pattern, subject))

    # Malformed escapes: (pattern, expected message, error position).
    malformed = (
        (r"\567",
         r'octal escape value \567 outside of '
         r'range 0-0o377', 0),
        (r"\911", 'invalid group reference', 0),
        (r"\x1", r'incomplete escape \x1', 0),
        (r"\x1z", r'incomplete escape \x1', 0),
        (r"\u123", r'incomplete escape \u123', 0),
        (r"\u123z", r'incomplete escape \u123', 0),
        (r"\U0001234", r'incomplete escape \U0001234', 0),
        (r"\U0001234z", r'incomplete escape \U0001234', 0),
        (r"\U00110000", r'bad escape \U00110000', 0),
    )
    for pattern, message, position in malformed:
        self.checkPatternError(pattern, message, position)
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000959
def test_sre_character_class_literals(self):
    # Each row: (exclusive upper bound on the code points the escape form
    # is tried with, set template, literal text expected after the
    # character).
    templates = (
        (256, r"[\%o]", ""),
        (256, r"[\%o8]", ""),
        (256, r"[\%03o]", ""),
        (256, r"[\%03o0]", ""),
        (256, r"[\%03o8]", ""),
        (256, r"[\x%02x]", ""),
        (256, r"[\x%02x0]", ""),
        (256, r"[\x%02xz]", ""),
        (0x10000, r"[\u%04x]", ""),
        (0x10000, r"[\u%04x0]", ""),
        (0x10000, r"[\u%04xz]", ""),
        (0x110000, r"[\U%08x]", ""),
        (0x110000, r"[\U%08x0]", "0"),
        (0x110000, r"[\U%08xz]", "z"),
    )
    for code in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        for limit, template, suffix in templates:
            if code < limit:
                self.assertTrue(re.match(template % code, chr(code) + suffix))

    # Malformed escapes: (pattern, expected message, error position).
    malformed = (
        (r"[\567]",
         r'octal escape value \567 outside of '
         r'range 0-0o377', 1),
        (r"[\911]", r'bad escape \9', 1),
        (r"[\x1z]", r'incomplete escape \x1', 1),
        (r"[\u123z]", r'incomplete escape \u123', 1),
        (r"[\U0001234z]", r'incomplete escape \U0001234', 1),
        (r"[\U00110000]", r'bad escape \U00110000', 1),
    )
    for pattern, message, position in malformed:
        self.checkPatternError(pattern, message, position)

    # A range whose endpoints are both written as \U escapes.
    self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200987
def test_sre_byte_literals(self):
    # Each row: (pattern template, literal bytes expected after the byte).
    templates = (
        (r"\%03o", b""),
        (r"\%03o0", b"0"),
        (r"\%03o8", b"8"),
        (r"\x%02x", b""),
        (r"\x%02x0", b"0"),
        (r"\x%02xz", b"z"),
    )
    for value in [0, 8, 16, 32, 64, 127, 128, 255]:
        for template, suffix in templates:
            pattern = (template % value).encode()
            self.assertTrue(re.match(pattern, bytes([value]) + suffix))

    # \u and \U are not escapes in bytes patterns; they match literally
    # and emit a DeprecationWarning.
    with self.assertWarns(DeprecationWarning):
        self.assertTrue(re.match(br"\u1234", b'u1234'))
    with self.assertWarns(DeprecationWarning):
        self.assertTrue(re.match(br"\U00012345", b'U00012345'))

    # Short octal escapes followed by a non-octal digit.
    octal_cases = (
        (br"\0", b"\000"),
        (br"\08", b"\0008"),
        (br"\01", b"\001"),
        (br"\018", b"\0018"),
    )
    for pattern, subject in octal_cases:
        self.assertTrue(re.match(pattern, subject))

    # Malformed escapes: (pattern, expected message, error position).
    malformed = (
        (br"\567",
         r'octal escape value \567 outside of '
         r'range 0-0o377', 0),
        (br"\911", 'invalid group reference', 0),
        (br"\x1", r'incomplete escape \x1', 0),
        (br"\x1z", r'incomplete escape \x1', 0),
    )
    for pattern, message, position in malformed:
        self.checkPatternError(pattern, message, position)
Antoine Pitrou463badf2012-06-23 13:29:19 +02001010
def test_sre_byte_class_literals(self):
    # Set templates that must all match the single byte they describe.
    templates = (
        r"[\%o]",
        r"[\%o8]",
        r"[\%03o]",
        r"[\%03o0]",
        r"[\%03o8]",
        r"[\x%02x]",
        r"[\x%02x0]",
        r"[\x%02xz]",
    )
    for value in [0, 8, 16, 32, 64, 127, 128, 255]:
        for template in templates:
            pattern = (template % value).encode()
            self.assertTrue(re.match(pattern, bytes([value])))

    # \u and \U are not escapes in bytes patterns; inside a set they match
    # the letter itself and emit a DeprecationWarning.
    with self.assertWarns(DeprecationWarning):
        self.assertTrue(re.match(br"[\u1234]", b'u'))
    with self.assertWarns(DeprecationWarning):
        self.assertTrue(re.match(br"[\U00012345]", b'U'))

    # Malformed escapes: (pattern, expected message, error position).
    malformed = (
        (br"[\567]",
         r'octal escape value \567 outside of '
         r'range 0-0o377', 1),
        (br"[\911]", r'bad escape \9', 1),
        (br"[\x1z]", r'incomplete escape \x1', 1),
    )
    for pattern, message, position in malformed:
        self.checkPatternError(pattern, message, position)
1030
def test_character_set_errors(self):
    # Each row: (pattern, expected message, error position).
    unterminated = (
        (r'[', 'unterminated character set', 0),
        (r'[^', 'unterminated character set', 0),
        (r'[a', 'unterminated character set', 0),
        # bug 545855 -- This pattern failed to cause a compile error as it
        # should, instead provoking a TypeError.
        (r"[a-", 'unterminated character set', 0),
    )
    bad_ranges = (
        (r"[\w-b]", r'bad character range \w-b', 1),
        (r"[a-\w]", r'bad character range a-\w', 1),
        (r"[b-a]", 'bad character range b-a', 1),
    )
    for pattern, message, position in unterminated + bad_ranges:
        self.checkPatternError(pattern, message, position)
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +00001041
Skip Montanaro7d9963f2003-04-25 14:12:40 +00001042 def test_bug_113254(self):
1043 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
1044 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
1045 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
1046
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001047 def test_bug_527371(self):
1048 # bug described in patches 527371/672491
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001049 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001050 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
1051 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
1052 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
1053 self.assertEqual(re.match("((a))", "a").lastindex, 1)
1054
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001055 def test_bug_418626(self):
1056 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
1057 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
1058 # pattern '*?' on a long string.
1059 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
1060 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
1061 20003)
1062 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001063 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +00001064 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001065 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001066
1067 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001068 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001069 self.assertEqual(re.compile(pat) and 1, 1)
1070
Skip Montanaro1e703c62003-04-25 15:40:28 +00001071 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001072 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +00001073 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001074 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
1075 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
1076 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +00001077
def test_nothing_to_repeat(self):
    # A repeat operator with nothing before it is an error, both at the
    # start of the pattern and at the start of a group.
    quantifiers = ('*', '+', '?', '{1,2}')
    modifiers = ('', '?')
    pairs = [(reps, mod) for reps in quantifiers for mod in modifiers]
    for reps, mod in pairs:
        self.checkPatternError('%s%s' % (reps, mod),
                               'nothing to repeat', 0)
        self.checkPatternError('(?:%s%s)' % (reps, mod),
                               'nothing to repeat', 3)
1085
def test_multiple_repeat(self):
    # Applying a repeat operator directly to another one is an error,
    # reported at the position just after the inner operator.
    modifiers = ('', '?')
    outer_ops = [reps + mod
                 for reps in ('*', '+', '{1,2}')
                 for mod in modifiers]
    inner_ops = [reps + mod
                 for reps in ('*', '+', '?', '{1,2}')
                 for mod in modifiers]
    for outer_op in outer_ops:
        for inner_op in inner_ops:
            self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
                                   'multiple repeat', 1 + len(inner_op))
1095
Serhiy Storchakafa468162013-02-16 21:23:53 +02001096 def test_unlimited_zero_width_repeat(self):
1097 # Issue #9669
1098 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
1099 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
1100 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
1101 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
1102 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
1103 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
1104
Skip Montanaro1e703c62003-04-25 15:40:28 +00001105 def test_scanner(self):
1106 def s_ident(scanner, token): return token
1107 def s_operator(scanner, token): return "op%s" % token
1108 def s_float(scanner, token): return float(token)
1109 def s_int(scanner, token): return int(token)
1110
1111 scanner = Scanner([
1112 (r"[a-zA-Z_]\w*", s_ident),
1113 (r"\d+\.\d*", s_float),
1114 (r"\d+", s_int),
1115 (r"=|\+|-|\*|/", s_operator),
1116 (r"\s+", None),
1117 ])
1118
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001119 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +00001120
Skip Montanaro1e703c62003-04-25 15:40:28 +00001121 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
1122 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
1123 'op+', 'bar'], ''))
1124
Skip Montanaro5ba00542003-04-25 16:00:14 +00001125 def test_bug_448951(self):
1126 # bug 448951 (similar to 429357, but with single char match)
1127 # (Also test greedy matches.)
1128 for op in '','?','*':
1129 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
1130 (None, None))
1131 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
1132 ('a:', 'a'))
1133
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001134 def test_bug_725106(self):
1135 # capturing groups in alternatives in repeats
1136 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
1137 ('b', 'a'))
1138 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
1139 ('c', 'b'))
1140 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
1141 ('b', None))
1142 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
1143 ('b', None))
1144 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
1145 ('b', 'a'))
1146 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
1147 ('c', 'b'))
1148 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
1149 ('b', None))
1150 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
1151 ('b', None))
1152
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +00001153 def test_bug_725149(self):
1154 # mark_stack_base restoring before restoring marks
1155 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
1156 ('a', None))
1157 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
1158 ('a', None, None))
1159
Just van Rossum12723ba2003-07-02 20:03:04 +00001160 def test_bug_764548(self):
1161 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001162 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +00001163 pat = re.compile(my_unicode("abc"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001164 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +00001165
Skip Montanaro5ba00542003-04-25 16:00:14 +00001166 def test_finditer(self):
1167 iter = re.finditer(r":+", "a:b::c:::d")
1168 self.assertEqual([item.group(0) for item in iter],
1169 [":", "::", ":::"])
1170
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001171 pat = re.compile(r":+")
1172 iter = pat.finditer("a:b::c:::d", 1, 10)
1173 self.assertEqual([item.group(0) for item in iter],
1174 [":", "::", ":::"])
1175
1176 pat = re.compile(r":+")
1177 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
1178 self.assertEqual([item.group(0) for item in iter],
1179 [":", "::", ":::"])
1180
1181 pat = re.compile(r":+")
1182 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1183 self.assertEqual([item.group(0) for item in iter],
1184 [":", "::", ":::"])
1185
1186 pat = re.compile(r":+")
1187 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1188 self.assertEqual([item.group(0) for item in iter],
1189 ["::", "::"])
1190
Thomas Wouters40a088d2008-03-18 20:19:54 +00001191 def test_bug_926075(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001192 self.assertIsNot(re.compile('bug_926075'),
1193 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +00001194
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001195 def test_bug_931848(self):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001196 pattern = "[\u002E\u3002\uFF0E\uFF61]"
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001197 self.assertEqual(re.compile(pattern).split("a.b.c"),
1198 ['a','b','c'])
1199
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001200 def test_bug_581080(self):
1201 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +00001202 self.assertEqual(next(iter).span(), (1,2))
1203 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001204
1205 scanner = re.compile(r"\s").scanner("a b")
1206 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001207 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001208
1209 def test_bug_817234(self):
1210 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +00001211 self.assertEqual(next(iter).span(), (0, 4))
1212 self.assertEqual(next(iter).span(), (4, 4))
1213 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001214
Mark Dickinson1f268282009-07-28 17:22:36 +00001215 def test_bug_6561(self):
1216 # '\d' should match characters in Unicode category 'Nd'
1217 # (Number, Decimal Digit), but not those in 'Nl' (Number,
1218 # Letter) or 'No' (Number, Other).
1219 decimal_digits = [
1220 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1221 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1222 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1223 ]
1224 for x in decimal_digits:
1225 self.assertEqual(re.match('^\d$', x).group(0), x)
1226
1227 not_decimal_digits = [
1228 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1229 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1230 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1231 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1232 ]
1233 for x in not_decimal_digits:
1234 self.assertIsNone(re.match('^\d$', x))
1235
Guido van Rossumd8faa362007-04-27 19:54:29 +00001236 def test_empty_array(self):
1237 # SF buf 1647541
1238 import array
Guido van Rossum166746c2007-07-03 15:39:16 +00001239 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +00001240 a = array.array(typecode)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001241 self.assertIsNone(re.compile(b"bla").match(a))
Antoine Pitroufd036452008-08-19 17:56:33 +00001242 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001243
Christian Heimes072c0f12008-01-03 23:01:04 +00001244 def test_inline_flags(self):
1245 # Bug #1700
Serhiy Storchakaab140882014-11-11 21:13:28 +02001246 upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
1247 lower_char = '\u1ea1' # Latin Small Letter A with Dot Below
Christian Heimes072c0f12008-01-03 23:01:04 +00001248
1249 p = re.compile(upper_char, re.I | re.U)
1250 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001251 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001252
1253 p = re.compile(lower_char, re.I | re.U)
1254 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001255 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001256
1257 p = re.compile('(?i)' + upper_char, re.U)
1258 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001259 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001260
1261 p = re.compile('(?i)' + lower_char, re.U)
1262 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001263 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001264
1265 p = re.compile('(?iu)' + upper_char)
1266 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001267 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001268
1269 p = re.compile('(?iu)' + lower_char)
1270 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001271 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001272
Serhiy Storchakacc66a652016-09-11 01:39:51 +03001273 self.assertTrue(re.match('(?ixu) ' + upper_char, lower_char))
1274 self.assertTrue(re.match('(?ixu) ' + lower_char, upper_char))
1275
Christian Heimes25bb7832008-01-11 16:17:00 +00001276 def test_dollar_matches_twice(self):
1277 "$ matches the end of string, and just before the terminating \n"
1278 pattern = re.compile('$')
1279 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1280 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1281 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1282
1283 pattern = re.compile('$', re.MULTILINE)
1284 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1285 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1286 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1287
Antoine Pitroufd036452008-08-19 17:56:33 +00001288 def test_bytes_str_mixing(self):
1289 # Mixing str and bytes is disallowed
1290 pat = re.compile('.')
1291 bpat = re.compile(b'.')
1292 self.assertRaises(TypeError, pat.match, b'b')
1293 self.assertRaises(TypeError, bpat.match, 'b')
1294 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1295 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1296 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1297 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1298 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1299 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1300
1301 def test_ascii_and_unicode_flag(self):
1302 # String patterns
1303 for flags in (0, re.UNICODE):
1304 pat = re.compile('\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001305 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001306 pat = re.compile('\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001307 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001308 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001309 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001310 pat = re.compile('(?a)\xc0', re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001311 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001312 pat = re.compile('\w', re.ASCII)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001313 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001314 pat = re.compile('(?a)\w')
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001315 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001316 # Bytes patterns
1317 for flags in (0, re.ASCII):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001318 pat = re.compile(b'\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001319 self.assertIsNone(pat.match(b'\xe0'))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001320 pat = re.compile(b'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001321 self.assertIsNone(pat.match(b'\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001322 # Incompatibilities
1323 self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
1324 self.assertRaises(ValueError, re.compile, b'(?u)\w')
1325 self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
1326 self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
1327 self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
1328 self.assertRaises(ValueError, re.compile, '(?au)\w')
1329
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001330 def test_locale_flag(self):
1331 import locale
1332 _, enc = locale.getlocale(locale.LC_CTYPE)
1333 # Search non-ASCII letter
1334 for i in range(128, 256):
1335 try:
1336 c = bytes([i]).decode(enc)
1337 sletter = c.lower()
1338 if sletter == c: continue
1339 bletter = sletter.encode(enc)
1340 if len(bletter) != 1: continue
1341 if bletter.decode(enc) != sletter: continue
1342 bpat = re.escape(bytes([i]))
1343 break
1344 except (UnicodeError, TypeError):
1345 pass
1346 else:
1347 bletter = None
1348 bpat = b'A'
1349 # Bytes patterns
1350 pat = re.compile(bpat, re.LOCALE | re.IGNORECASE)
1351 if bletter:
1352 self.assertTrue(pat.match(bletter))
1353 pat = re.compile(b'(?L)' + bpat, re.IGNORECASE)
1354 if bletter:
1355 self.assertTrue(pat.match(bletter))
1356 pat = re.compile(bpat, re.IGNORECASE)
1357 if bletter:
1358 self.assertIsNone(pat.match(bletter))
1359 pat = re.compile(b'\w', re.LOCALE)
1360 if bletter:
1361 self.assertTrue(pat.match(bletter))
1362 pat = re.compile(b'(?L)\w')
1363 if bletter:
1364 self.assertTrue(pat.match(bletter))
1365 pat = re.compile(b'\w')
1366 if bletter:
1367 self.assertIsNone(pat.match(bletter))
1368 # Incompatibilities
1369 self.assertWarns(DeprecationWarning, re.compile, '', re.LOCALE)
1370 self.assertWarns(DeprecationWarning, re.compile, '(?L)')
1371 self.assertWarns(DeprecationWarning, re.compile, b'', re.LOCALE | re.ASCII)
1372 self.assertWarns(DeprecationWarning, re.compile, b'(?L)', re.ASCII)
1373 self.assertWarns(DeprecationWarning, re.compile, b'(?a)', re.LOCALE)
1374 self.assertWarns(DeprecationWarning, re.compile, b'(?aL)')
1375
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001376 def test_bug_6509(self):
1377 # Replacement strings of both types must parse properly.
1378 # all strings
1379 pat = re.compile('a(\w)')
1380 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1381 pat = re.compile('a(.)')
1382 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1383 pat = re.compile('..')
1384 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1385
1386 # all bytes
1387 pat = re.compile(b'a(\w)')
1388 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1389 pat = re.compile(b'a(.)')
1390 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1391 pat = re.compile(b'..')
1392 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1393
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001394 def test_dealloc(self):
1395 # issue 3299: check for segfault in debug build
1396 import _sre
Ezio Melotti0f77f462010-01-23 10:49:39 +00001397 # the overflow limit is different on wide and narrow builds and it
1398 # depends on the definition of SRE_CODE (see sre.h).
1399 # 2**128 should be big enough to overflow on both. For smaller values
1400 # a RuntimeError is raised instead of OverflowError.
1401 long_overflow = 2**128
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001402 self.assertRaises(TypeError, re.finditer, "a", {})
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001403 with self.assertRaises(OverflowError):
1404 _sre.compile("abc", 0, [long_overflow], 0, [], [])
1405 with self.assertRaises(TypeError):
1406 _sre.compile({}, 0, [], 0, [], [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 def test_search_dot_unicode(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001409 self.assertTrue(re.search("123.*-", '123abc-'))
1410 self.assertTrue(re.search("123.*-", '123\xe9-'))
1411 self.assertTrue(re.search("123.*-", '123\u20ac-'))
1412 self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1413 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414
Ezio Melottidf723e12012-03-13 01:29:48 +02001415 def test_compile(self):
1416 # Test return value when given string and pattern as parameter
1417 pattern = re.compile('random pattern')
1418 self.assertIsInstance(pattern, re._pattern_type)
1419 same_pattern = re.compile(pattern)
1420 self.assertIsInstance(same_pattern, re._pattern_type)
1421 self.assertIs(same_pattern, pattern)
1422 # Test behaviour when not given a string or pattern as parameter
1423 self.assertRaises(TypeError, re.compile, 0)
1424
Ezio Melottife8e6e72013-01-11 08:32:01 +02001425 def test_bug_13899(self):
1426 # Issue #13899: re pattern r"[\A]" should work like "A" but matches
1427 # nothing. Ditto B and Z.
Serhiy Storchakaa54aae02015-03-24 22:58:14 +02001428 with self.assertWarns(DeprecationWarning):
1429 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
1430 ['A', 'B', '\b', 'C', 'Z'])
Ezio Melottife8e6e72013-01-11 08:32:01 +02001431
Antoine Pitroub33941a2012-12-03 20:55:56 +01001432 @bigmemtest(size=_2G, memuse=1)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001433 def test_large_search(self, size):
1434 # Issue #10182: indices were 32-bit-truncated.
1435 s = 'a' * size
1436 m = re.search('$', s)
1437 self.assertIsNotNone(m)
Antoine Pitrou86067c22012-12-03 21:08:43 +01001438 self.assertEqual(m.start(), size)
1439 self.assertEqual(m.end(), size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001440
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001441 # The huge memuse is because of re.sub() using a list and a join()
1442 # to create the replacement result.
Antoine Pitroub33941a2012-12-03 20:55:56 +01001443 @bigmemtest(size=_2G, memuse=16 + 2)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001444 def test_large_subn(self, size):
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001445 # Issue #10182: indices were 32-bit-truncated.
1446 s = 'a' * size
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001447 r, n = re.subn('', '', s)
1448 self.assertEqual(r, s)
1449 self.assertEqual(n, size + 1)
1450
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001451 def test_bug_16688(self):
1452 # Issue 16688: Backreferences make case-insensitive regex fail on
1453 # non-ASCII strings.
1454 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1455 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001456
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001457 def test_repeat_minmax_overflow(self):
1458 # Issue #13169
1459 string = "x" * 100000
1460 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1461 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1462 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1463 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1464 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1465 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1466 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1467 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1468 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1469 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1470 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1471
1472 @cpython_only
1473 def test_repeat_minmax_overflow_maxrepeat(self):
1474 try:
1475 from _sre import MAXREPEAT
1476 except ImportError:
1477 self.skipTest('requires _sre.MAXREPEAT constant')
1478 string = "x" * 100000
1479 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1480 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1481 (0, 100000))
1482 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1483 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1484 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1485 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1486
def test_backref_group_name_in_exception(self):
    # Issue 17341: Poor error message when compiling invalid regex
    # The offending name must be quoted in the message.
    expected_message = "bad character in group name '<foo>'"
    self.checkPatternError('(?P=<foo>)', expected_message, 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001491
def test_group_name_in_exception(self):
    # Issue 17341: Poor error message when compiling invalid regex
    # The offending name must be quoted in the message.
    expected_message = "bad character in group name '?foo'"
    self.checkPatternError('(?P<?foo>)', expected_message, 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001496
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001497 def test_issue17998(self):
1498 for reps in '*', '+', '?', '{1}':
1499 for mod in '', '?':
1500 pattern = '.' + reps + mod + 'yz'
1501 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1502 ['xyz'], msg=pattern)
1503 pattern = pattern.encode()
1504 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1505 [b'xyz'], msg=pattern)
1506
def test_match_repr(self):
    # The repr of a match object shows its type, its span and the
    # matched text.

    # str subjects, including a str subclass.
    for string in '[abracadabra]', S('[abracadabra]'):
        m = re.search(r'(.+)(.*?)\1', string)
        self.assertEqual(repr(m), "<%s.%s object; "
                         "span=(1, 12), match='abracadabra'>" %
                         (type(m).__module__, type(m).__qualname__))
    # bytes-like subjects: the matched text is shown as bytes.
    for string in (b'[abracadabra]', B(b'[abracadabra]'),
                   bytearray(b'[abracadabra]'),
                   memoryview(b'[abracadabra]')):
        m = re.search(rb'(.+)(.*?)\1', string)
        self.assertEqual(repr(m), "<%s.%s object; "
                         "span=(1, 12), match=b'abracadabra'>" %
                         (type(m).__module__, type(m).__qualname__))

    first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
    # Each expectation is built from the type of the object under test
    # (the original took the module for `first` from `second`).
    self.assertEqual(repr(first), "<%s.%s object; "
                     "span=(0, 2), match='aa'>" %
                     (type(first).__module__, type(first).__qualname__))
    self.assertEqual(repr(second), "<%s.%s object; "
                     "span=(3, 5), match='bb'>" %
                     (type(second).__module__, type(second).__qualname__))
1528
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001529
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001530 def test_bug_2537(self):
1531 # issue 2537: empty submatches
1532 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1533 for inner_op in ('{0,}', '*', '?'):
1534 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1535 m = r.match("xyyzy")
1536 self.assertEqual(m.group(0), "xyy")
1537 self.assertEqual(m.group(1), "")
1538 self.assertEqual(m.group(2), "y")
1539
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001540 def test_debug_flag(self):
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001541 pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001542 with captured_stdout() as out:
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001543 re.compile(pat, re.DEBUG)
1544 dump = '''\
Serhiy Storchakac7f7d382014-11-09 20:48:36 +02001545SUBPATTERN 1
1546 LITERAL 46
1547SUBPATTERN None
1548 BRANCH
1549 IN
1550 LITERAL 99
1551 LITERAL 104
1552 OR
1553 LITERAL 112
1554 LITERAL 121
1555SUBPATTERN None
1556 GROUPREF_EXISTS 1
1557 AT AT_END
1558 ELSE
1559 LITERAL 58
1560 LITERAL 32
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001561'''
1562 self.assertEqual(out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001563 # Debug output is output again even a second time (bypassing
1564 # the cache -- issue #20426).
1565 with captured_stdout() as out:
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001566 re.compile(pat, re.DEBUG)
1567 self.assertEqual(out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001568
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001569 def test_keyword_parameters(self):
1570 # Issue #20283: Accepting the string keyword parameter.
1571 pat = re.compile(r'(ab)')
1572 self.assertEqual(
1573 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1574 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001575 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1576 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001577 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1578 self.assertEqual(
1579 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1580 self.assertEqual(
1581 pat.split(string='abracadabra', maxsplit=1),
1582 ['', 'ab', 'racadabra'])
1583 self.assertEqual(
1584 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1585 (7, 9))
1586
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001587 def test_bug_20998(self):
1588 # Issue #20998: Fullmatch of repeated single character pattern
1589 # with ignore case.
1590 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1591
def test_locale_caching(self):
    # Issue #22410
    # Remember the current LC_CTYPE and restore it when the test ends.
    saved_locale = locale.setlocale(locale.LC_CTYPE)
    self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_locale)

    # Both locales must be available, otherwise the test is skipped.
    for loc in ('en_US.iso88591', 'en_US.utf8'):
        try:
            locale.setlocale(locale.LC_CTYPE, loc)
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs %s locale' % loc)

    # Run the two locale checks in both orders, starting each round with
    # an empty pattern cache.
    rounds = (
        (self.check_en_US_iso88591, self.check_en_US_utf8),
        (self.check_en_US_utf8, self.check_en_US_iso88591),
    )
    for first_check, second_check in rounds:
        re.purge()
        first_check()
        second_check()
1609
def check_en_US_iso88591(self):
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    # (pattern, subject) pairs; under this locale all of them are
    # expected to match case-insensitively.
    cases = (
        (b'\xc5\xe5', b'\xc5\xe5'),
        (b'\xc5', b'\xe5'),
        (b'\xe5', b'\xc5'),
    )
    # Flags passed as arguments ...
    for pattern, subject in cases:
        self.assertTrue(re.match(pattern, subject, re.L|re.I))
    # ... and the same flags given inline.
    for pattern, subject in cases:
        self.assertTrue(re.match(b'(?Li)' + pattern, subject))
1618
def check_en_US_utf8(self):
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    identical = (b'\xc5\xe5', b'\xc5\xe5')
    # (pattern, subject) pairs that differ; under this locale they are
    # expected not to match.
    swapped = ((b'\xc5', b'\xe5'), (b'\xe5', b'\xc5'))

    # Flags passed as arguments.
    self.assertTrue(re.match(identical[0], identical[1], re.L|re.I))
    for pattern, subject in swapped:
        self.assertIsNone(re.match(pattern, subject, re.L|re.I))

    # The same flags given inline.
    self.assertTrue(re.match(b'(?Li)' + identical[0], identical[1]))
    for pattern, subject in swapped:
        self.assertIsNone(re.match(b'(?Li)' + pattern, subject))
1627
Serhiy Storchakaad446d52014-11-10 13:49:00 +02001628 def test_error(self):
1629 with self.assertRaises(re.error) as cm:
1630 re.compile('(\u20ac))')
1631 err = cm.exception
1632 self.assertIsInstance(err.pattern, str)
1633 self.assertEqual(err.pattern, '(\u20ac))')
1634 self.assertEqual(err.pos, 3)
1635 self.assertEqual(err.lineno, 1)
1636 self.assertEqual(err.colno, 4)
1637 self.assertIn(err.msg, str(err))
1638 self.assertIn(' at position 3', str(err))
1639 self.assertNotIn(' at position 3', err.msg)
1640 # Bytes pattern
1641 with self.assertRaises(re.error) as cm:
1642 re.compile(b'(\xa4))')
1643 err = cm.exception
1644 self.assertIsInstance(err.pattern, bytes)
1645 self.assertEqual(err.pattern, b'(\xa4))')
1646 self.assertEqual(err.pos, 3)
1647 # Multiline pattern
1648 with self.assertRaises(re.error) as cm:
1649 re.compile("""
1650 (
1651 abc
1652 )
1653 )
1654 (
1655 """, re.VERBOSE)
1656 err = cm.exception
1657 self.assertEqual(err.pos, 77)
1658 self.assertEqual(err.lineno, 5)
1659 self.assertEqual(err.colno, 17)
1660 self.assertIn(err.msg, str(err))
1661 self.assertIn(' at position 77', str(err))
1662 self.assertIn('(line 5, column 17)', str(err))
1663
def test_misc_errors(self):
    # Each row is one malformed pattern with the message and the numeric
    # argument handed to checkPatternError for it.
    cases = (
        (r'(', 'missing ), unterminated subpattern', 0),
        (r'((a|b)', 'missing ), unterminated subpattern', 0),
        (r'(a|b))', 'unbalanced parenthesis', 5),
        (r'(?P', 'unexpected end of pattern', 3),
        (r'(?z)', 'unknown extension ?z', 1),
        (r'(?iz)', 'unknown flag', 3),
        (r'(?i', 'missing )', 3),
        (r'(?#abc', 'missing ), unterminated comment', 0),
        (r'(?<', 'unexpected end of pattern', 3),
        (r'(?<>)', 'unknown extension ?<>', 1),
        (r'(?', 'unexpected end of pattern', 2),
    )
    for pattern, message, position in cases:
        self.checkPatternError(pattern, message, position)
1676
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001677
class PatternReprTests(unittest.TestCase):
    # Checks of repr() for compiled pattern objects.

    def check(self, pattern, expected):
        # Compile without explicit flags and compare the repr.
        self.check_flags(pattern, 0, expected)

    def check_flags(self, pattern, flags, expected):
        # Compile with the given flags and compare the repr.
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        self.check('random pattern', "re.compile('random pattern')")

    def test_single_flag(self):
        expected = "re.compile('random pattern', re.IGNORECASE)"
        self.check_flags('random pattern', re.IGNORECASE, expected)

    def test_multiple_flags(self):
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.check_flags('random pattern', re.I|re.S|re.X, expected)

    def test_unicode_flag(self):
        # re.UNICODE is not shown in the repr of a str pattern.
        self.check_flags('random pattern', re.U,
                         "re.compile('random pattern')")
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL)")
        self.check_flags('random pattern', re.I|re.S|re.U, expected)

    def test_inline_flags(self):
        # A flag set inline shows up like one passed explicitly.
        expected = "re.compile('(?i)pattern', re.IGNORECASE)"
        self.check('(?i)pattern', expected)

    def test_unknown_flags(self):
        # Bits without a name are rendered in hexadecimal.
        self.check_flags('random pattern', 0x123000,
                         "re.compile('random pattern', 0x123000)")
        expected = "re.compile('random pattern', re.IGNORECASE|0x123000)"
        self.check_flags('random pattern', 0x123000|re.I, expected)

    def test_bytes(self):
        self.check(b'bytes pattern', "re.compile(b'bytes pattern')")
        expected = "re.compile(b'bytes pattern', re.ASCII)"
        self.check_flags(b'bytes pattern', re.A, expected)

    def test_locale(self):
        expected = "re.compile(b'bytes pattern', re.LOCALE)"
        self.check_flags(b'bytes pattern', re.L, expected)

    def test_quotes(self):
        # (pattern, expected repr) for each mix of quote characters.
        cases = (
            ('random "double quoted" pattern',
             '''re.compile('random "double quoted" pattern')'''),
            ("random 'single quoted' pattern",
             '''re.compile("random 'single quoted' pattern")'''),
            ('''both 'single' and "double" quotes''',
             '''re.compile('both \\'single\\' and "double" quotes')'''),
        )
        for pattern, expected in cases:
            self.check(pattern, expected)

    def test_long_pattern(self):
        # Very long patterns are abbreviated, but the start of the
        # pattern and the flags suffix stay visible.
        pattern = 'Very %spattern' % ('long ' * 1000)
        prefix = "re.compile('Very long long lon"

        plain_repr = repr(re.compile(pattern))
        self.assertLess(len(plain_repr), 300)
        self.assertEqual(plain_repr[:30], prefix)

        flagged_repr = repr(re.compile(pattern, re.I))
        self.assertLess(len(flagged_repr), 300)
        self.assertEqual(flagged_repr[:30], prefix)
        self.assertEqual(flagged_repr[-16:], ", re.IGNORECASE)")
1742
1743
class ImplementationTest(unittest.TestCase):
    """
    Test implementation details of the re module.
    """

    def test_overlap_table(self):
        # (input string, expected overlap table) pairs for
        # sre_compile._generate_overlap_table.
        cases = (
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        )
        generate = sre_compile._generate_overlap_table
        for prefix, expected in cases:
            self.assertEqual(generate(prefix), expected)
1757
1758
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001759class ExternalTests(unittest.TestCase):
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001760
def test_re_benchmarks(self):
    're_tests benchmarks'
    from test.re_tests import benchmarks
    padding = ' '*10000
    for pattern, s in benchmarks:
        with self.subTest(pattern=pattern, string=s):
            p = re.compile(pattern)
            # The pattern must match the bare string in every mode.
            for matcher in (p.search, p.match, p.fullmatch):
                self.assertTrue(matcher(s))
            # Embed the string between two blocks of 10000 spaces and
            # address it through pos/endpos.
            padded = padding + s + padding
            start = 10000
            stop = 10000 + len(s)
            self.assertTrue(p.search(padded))
            self.assertTrue(p.match(padded, start))
            self.assertTrue(p.match(padded, start, stop))
            self.assertTrue(p.fullmatch(padded, start, stop))
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001775
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001776 def test_re_tests(self):
1777 're_tests test suite'
1778 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
1779 for t in tests:
1780 pattern = s = outcome = repl = expected = None
1781 if len(t) == 5:
1782 pattern, s, outcome, repl, expected = t
1783 elif len(t) == 3:
1784 pattern, s, outcome = t
Guido van Rossum41360a41998-03-26 19:42:58 +00001785 else:
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001786 raise ValueError('Test tuples should have 3 or 5 fields', t)
1787
1788 with self.subTest(pattern=pattern, string=s):
1789 if outcome == SYNTAX_ERROR: # Expected a syntax error
1790 with self.assertRaises(re.error):
1791 re.compile(pattern)
1792 continue
1793
1794 obj = re.compile(pattern)
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001795 result = obj.search(s)
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001796 if outcome == FAIL:
1797 self.assertIsNone(result, 'Succeeded incorrectly')
1798 continue
1799
1800 with self.subTest():
1801 self.assertTrue(result, 'Failed incorrectly')
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001802 # Matched, as expected, so now we compute the
1803 # result string and compare it to our expected result.
1804 start, end = result.span(0)
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001805 vardict = {'found': result.group(0),
1806 'groups': result.group(),
1807 'flags': result.re.flags}
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001808 for i in range(1, 100):
1809 try:
1810 gi = result.group(i)
1811 # Special hack because else the string concat fails:
1812 if gi is None:
1813 gi = "None"
1814 except IndexError:
1815 gi = "Error"
1816 vardict['g%d' % i] = gi
1817 for i in result.re.groupindex.keys():
1818 try:
1819 gi = result.group(i)
1820 if gi is None:
1821 gi = "None"
1822 except IndexError:
1823 gi = "Error"
1824 vardict[i] = gi
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001825 self.assertEqual(eval(repl, vardict), expected,
1826 'grouping error')
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001827
Antoine Pitrou22628c42008-07-22 17:53:22 +00001828 # Try the match with both pattern and string converted to
1829 # bytes, and check that it still succeeds.
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001830 try:
Antoine Pitrou22628c42008-07-22 17:53:22 +00001831 bpat = bytes(pattern, "ascii")
1832 bs = bytes(s, "ascii")
1833 except UnicodeEncodeError:
1834 # skip non-ascii tests
1835 pass
1836 else:
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001837 with self.subTest('bytes pattern match'):
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001838 obj = re.compile(bpat)
1839 self.assertTrue(obj.search(bs))
1840
1841 # Try the match with LOCALE enabled, and check that it
1842 # still succeeds.
1843 with self.subTest('locale-sensitive match'):
1844 obj = re.compile(bpat, re.LOCALE)
1845 result = obj.search(bs)
1846 if result is None:
1847 print('=== Fails on locale-sensitive match', t)
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001848
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001849 # Try the match with the search area limited to the extent
1850 # of the match and see if it still succeeds. \B will
1851 # break (because it won't match at the end or start of a
1852 # string), so we'll ignore patterns that feature it.
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001853 if (pattern[:2] != r'\B' and pattern[-2:] != r'\B'
1854 and result is not None):
1855 with self.subTest('range-limited match'):
1856 obj = re.compile(pattern)
1857 self.assertTrue(obj.search(s, start, end + 1))
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001858
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001859 # Try the match with IGNORECASE enabled, and check that it
1860 # still succeeds.
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001861 with self.subTest('case-insensitive match'):
1862 obj = re.compile(pattern, re.IGNORECASE)
1863 self.assertTrue(obj.search(s))
Guido van Rossumdfa67901997-12-08 17:12:06 +00001864
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001865 # Try the match with UNICODE locale enabled, and check
1866 # that it still succeeds.
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001867 with self.subTest('unicode-sensitive match'):
1868 obj = re.compile(pattern, re.UNICODE)
1869 self.assertTrue(obj.search(s))
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001870
Gregory P. Smith5a631832010-07-27 05:31:29 +00001871
# Run this module's test cases when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()