blob: 7a741416b48c74dad593d699e6a65c2fbdd6e7e3 [file] [log] [blame]
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
Antoine Pitroud2cc7432014-02-03 20:59:59 +01002 cpython_only, captured_stdout
Benjamin Petersone48944b2012-03-07 14:50:25 -06003import io
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02004import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00005import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00006from re import Scanner
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02007import sre_compile
R David Murray26dfaac92013-04-14 13:00:54 -04008import sre_constants
Ezio Melottid2114eb2011-03-25 14:08:44 +02009import sys
10import string
11import traceback
Antoine Pitrou79aa68d2013-10-25 21:36:10 +020012import unittest
Raymond Hettinger027bb632004-05-31 03:09:25 +000013from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000014
Guido van Rossum23b22571997-07-17 22:36:14 +000015# Misc tests from Tim Peters' re.doc
16
Just van Rossum6802c6e2003-07-02 14:36:59 +000017# WARNING: Don't change details in these tests if you don't know
Ezio Melotti42da6632011-03-15 05:18:48 +020018# what you're doing. Some of these tests were carefully modeled to
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +000019# cover most of the code.
20
class S(str):
    # str subclass whose indexing/slicing hands back S instances again, so
    # the tests can check which type a function returns for subclass input.
    def __getitem__(self, index):
        return S(super().__getitem__(index))
24
class B(bytes):
    # bytes subclass whose indexing/slicing result is wrapped in B again, so
    # the tests can check which type a function returns for subclass input.
    def __getitem__(self, index):
        return B(super().__getitem__(index))
28
Skip Montanaro8ed06da2003-04-24 19:43:18 +000029class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000030
Serhiy Storchaka25324972013-10-16 12:46:28 +030031 def assertTypedEqual(self, actual, expect, msg=None):
32 self.assertEqual(actual, expect, msg)
33 def recurse(actual, expect):
34 if isinstance(expect, (tuple, list)):
35 for x, y in zip(actual, expect):
36 recurse(x, y)
37 else:
38 self.assertIs(type(actual), type(expect), msg)
39 recurse(actual, expect)
40
Serhiy Storchaka632a77e2015-03-25 21:03:47 +020041 def checkPatternError(self, pattern, errmsg, pos=None):
42 with self.assertRaises(re.error) as cm:
43 re.compile(pattern)
44 with self.subTest(pattern=pattern):
45 err = cm.exception
46 self.assertEqual(err.msg, errmsg)
47 if pos is not None:
48 self.assertEqual(err.pos, pos)
49
50 def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
51 with self.assertRaises(re.error) as cm:
52 re.sub(pattern, repl, string)
53 with self.subTest(pattern=pattern, repl=repl):
54 err = cm.exception
55 self.assertEqual(err.msg, errmsg)
56 if pos is not None:
57 self.assertEqual(err.pos, pos)
58
Benjamin Petersone48944b2012-03-07 14:50:25 -060059 def test_keep_buffer(self):
60 # See bug 14212
61 b = bytearray(b'x')
62 it = re.finditer(b'a', b)
63 with self.assertRaises(BufferError):
64 b.extend(b'x'*400)
65 list(it)
66 del it
67 gc_collect()
68 b.extend(b'x'*400)
69
Raymond Hettinger027bb632004-05-31 03:09:25 +000070 def test_weakref(self):
71 s = 'QabbbcR'
72 x = re.compile('ab+c')
73 y = proxy(x)
74 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
75
Skip Montanaro8ed06da2003-04-24 19:43:18 +000076 def test_search_star_plus(self):
77 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
78 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
79 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
80 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030081 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000082 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
83 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
84 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
85 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030086 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000087
Skip Montanaro8ed06da2003-04-24 19:43:18 +000088 def bump_num(self, matchobj):
Guido van Rossum41360a41998-03-26 19:42:58 +000089 int_value = int(matchobj.group(0))
90 return str(int_value + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000091
Skip Montanaro8ed06da2003-04-24 19:43:18 +000092 def test_basic_re_sub(self):
Serhiy Storchaka25324972013-10-16 12:46:28 +030093 self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
94 self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
95 self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
96 self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
97 self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
98 self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
Serhiy Storchaka9eabac62013-10-26 10:45:48 +030099 for y in ("\xe0", "\u0430", "\U0001d49c"):
100 self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')
Serhiy Storchaka25324972013-10-16 12:46:28 +0300101
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000102 self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
103 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
104 '9.3 -3 24x100y')
Victor Stinner55e614a2014-10-29 16:58:59 +0100105 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000106 '9.3 -3 23x99y')
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000107
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000108 self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
109 self.assertEqual(re.sub('.', r"\n", 'x'), '\n')
Guido van Rossumdfa67901997-12-08 17:12:06 +0000110
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000111 s = r"\1\1"
112 self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
113 self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
114 self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)
Guido van Rossum23b22571997-07-17 22:36:14 +0000115
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000116 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
117 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
118 self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
119 self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')
Guido van Rossum49946571997-07-18 04:26:25 +0000120
Serhiy Storchakaa54aae02015-03-24 22:58:14 +0200121 self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
122 self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
123 self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
124 (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
125 for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
126 with self.subTest(c):
127 with self.assertWarns(DeprecationWarning):
128 self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)
Guido van Rossum95e80531997-08-13 22:34:14 +0000129
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000130 self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000131
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000132 def test_bug_449964(self):
133 # fails for group followed by other escape
134 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
135 'xx\bxx\b')
136
137 def test_bug_449000(self):
138 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000139 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
140 'abc\ndef\n')
141 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
142 'abc\ndef\n')
143 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
144 'abc\ndef\n')
145 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
146 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000147
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000148 def test_bug_1661(self):
149 # Verify that flags do not get silently ignored with compiled patterns
150 pattern = re.compile('.')
151 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
152 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
153 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
154 self.assertRaises(ValueError, re.compile, pattern, re.I)
155
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000156 def test_bug_3629(self):
157 # A regex that triggered a bug in the sre-code validator
158 re.compile("(?P<quote>)(?(quote))")
159
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000160 def test_sub_template_numeric_escape(self):
161 # bug 776311 and friends
162 self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
163 self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
164 self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
165 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
166 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
167 self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
168 self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200169 self.assertEqual(re.sub('x', r'\377', 'x'), '\377')
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000170
171 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
172 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
173
174 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
175 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
176 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
177 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
178 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
179
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200180 self.checkTemplateError('x', r'\400', 'x',
181 r'octal escape value \400 outside of '
182 r'range 0-0o377', 0)
183 self.checkTemplateError('x', r'\777', 'x',
184 r'octal escape value \777 outside of '
185 r'range 0-0o377', 0)
Tim Peters0e9980f2004-09-12 03:49:31 +0000186
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200187 self.checkTemplateError('x', r'\1', 'x', 'invalid group reference')
188 self.checkTemplateError('x', r'\8', 'x', 'invalid group reference')
189 self.checkTemplateError('x', r'\9', 'x', 'invalid group reference')
190 self.checkTemplateError('x', r'\11', 'x', 'invalid group reference')
191 self.checkTemplateError('x', r'\18', 'x', 'invalid group reference')
192 self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference')
193 self.checkTemplateError('x', r'\90', 'x', 'invalid group reference')
194 self.checkTemplateError('x', r'\99', 'x', 'invalid group reference')
195 self.checkTemplateError('x', r'\118', 'x', 'invalid group reference') # r'\11' + '8'
196 self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference')
197 self.checkTemplateError('x', r'\181', 'x', 'invalid group reference') # r'\18' + '1'
198 self.checkTemplateError('x', r'\800', 'x', 'invalid group reference') # r'\80' + '0'
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000199
200 # in python2.3 (etc), these loop endlessly in sre_parser.py
201 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
202 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
203 'xz8')
204 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
205 'xza')
206
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000207 def test_qualified_re_sub(self):
208 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
Victor Stinner55e614a2014-10-29 16:58:59 +0100209 self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000210
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000211 def test_bug_114660(self):
212 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
213 'hello there')
214
215 def test_bug_462270(self):
216 # Test for empty sub() behaviour, see SF bug #462270
217 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
218 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
219
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200220 def test_symbolic_groups(self):
221 re.compile('(?P<a>x)(?P=a)(?(a)y)')
222 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300223 re.compile('(?P<a1>x)\1(?(1)y)')
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200224 self.checkPatternError('(?P<a>)(?P<a>)',
225 "redefinition of group name 'a' as group 2; "
226 "was group 1")
Serhiy Storchaka485407c2015-07-18 23:27:00 +0300227 self.checkPatternError('(?P<a>(?P=a))',
228 "cannot refer to an open group", 10)
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200229 self.checkPatternError('(?Pxy)', 'unknown extension ?Px')
230 self.checkPatternError('(?P<a>)(?P=a', 'missing ), unterminated name', 11)
231 self.checkPatternError('(?P=', 'missing group name', 4)
232 self.checkPatternError('(?P=)', 'missing group name', 4)
233 self.checkPatternError('(?P=1)', "bad character in group name '1'", 4)
234 self.checkPatternError('(?P=a)', "unknown group name 'a'")
235 self.checkPatternError('(?P=a1)', "unknown group name 'a1'")
236 self.checkPatternError('(?P=a.)', "bad character in group name 'a.'", 4)
237 self.checkPatternError('(?P<)', 'missing >, unterminated name', 4)
238 self.checkPatternError('(?P<a', 'missing >, unterminated name', 4)
239 self.checkPatternError('(?P<', 'missing group name', 4)
240 self.checkPatternError('(?P<>)', 'missing group name', 4)
241 self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
242 self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
243 self.checkPatternError(r'(?(', 'missing group name', 3)
244 self.checkPatternError(r'(?())', 'missing group name', 3)
245 self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
246 self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
247 self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
248 self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
Georg Brandl1d472b72013-04-14 11:40:00 +0200249 # New valid/invalid identifiers in Python 3
250 re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
251 re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200252 self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300253 # Support > 100 groups.
254 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
255 pat = '(?:%s)(?(200)z|t)' % pat
256 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200257
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000258 def test_symbolic_refs(self):
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200259 self.checkTemplateError('(?P<a>x)', '\g<a', 'xx',
260 'missing >, unterminated name', 3)
261 self.checkTemplateError('(?P<a>x)', '\g<', 'xx',
262 'missing group name', 3)
263 self.checkTemplateError('(?P<a>x)', '\g', 'xx', 'missing <', 2)
264 self.checkTemplateError('(?P<a>x)', '\g<a a>', 'xx',
265 "bad character in group name 'a a'", 3)
266 self.checkTemplateError('(?P<a>x)', '\g<>', 'xx',
267 'missing group name', 3)
268 self.checkTemplateError('(?P<a>x)', '\g<1a1>', 'xx',
269 "bad character in group name '1a1'", 3)
270 self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
271 'invalid group reference')
272 self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
273 'invalid group reference')
274 with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
275 re.sub('(?P<a>x)', '\g<ab>', 'xx')
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300276 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
277 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200278 self.checkTemplateError('(?P<a>x)', '\g<-1>', 'xx',
279 "bad character in group name '-1'", 3)
Georg Brandl1d472b72013-04-14 11:40:00 +0200280 # New valid/invalid identifiers in Python 3
281 self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
282 self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200283 self.checkTemplateError('(?P<a>x)', '\g<©>', 'xx',
284 "bad character in group name '©'", 3)
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300285 # Support > 100 groups.
286 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
287 self.assertEqual(re.sub(pat, '\g<200>', 'xc8yzxc8y'), 'c8zc8')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000288
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000289 def test_re_subn(self):
290 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
291 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
292 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
293 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
Victor Stinner55e614a2014-10-29 16:58:59 +0100294 self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000295
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000296 def test_re_split(self):
Serhiy Storchaka25324972013-10-16 12:46:28 +0300297 for string in ":a:b::c", S(":a:b::c"):
298 self.assertTypedEqual(re.split(":", string),
299 ['', 'a', 'b', '', 'c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200300 self.assertTypedEqual(re.split(":+", string),
Serhiy Storchaka25324972013-10-16 12:46:28 +0300301 ['', 'a', 'b', 'c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200302 self.assertTypedEqual(re.split("(:+)", string),
Serhiy Storchaka25324972013-10-16 12:46:28 +0300303 ['', ':', 'a', ':', 'b', '::', 'c'])
304 for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
305 memoryview(b":a:b::c")):
306 self.assertTypedEqual(re.split(b":", string),
307 [b'', b'a', b'b', b'', b'c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200308 self.assertTypedEqual(re.split(b":+", string),
Serhiy Storchaka25324972013-10-16 12:46:28 +0300309 [b'', b'a', b'b', b'c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200310 self.assertTypedEqual(re.split(b"(:+)", string),
Serhiy Storchaka25324972013-10-16 12:46:28 +0300311 [b'', b':', b'a', b':', b'b', b'::', b'c'])
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300312 for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
313 "\U0001d49c\U0001d49e\U0001d4b5"):
314 string = ":%s:%s::%s" % (a, b, c)
315 self.assertEqual(re.split(":", string), ['', a, b, '', c])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200316 self.assertEqual(re.split(":+", string), ['', a, b, c])
317 self.assertEqual(re.split("(:+)", string),
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300318 ['', ':', a, ':', b, '::', c])
Serhiy Storchaka25324972013-10-16 12:46:28 +0300319
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200320 self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
321 self.assertEqual(re.split("(:)+", ":a:b::c"),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000322 ['', ':', 'a', ':', 'b', ':', 'c'])
323 self.assertEqual(re.split("([b:]+)", ":a:b::c"),
324 ['', ':', 'a', ':b::', 'c'])
325 self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
326 ['', None, ':', 'a', None, ':', '', 'b', None, '',
327 None, '::', 'c'])
328 self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
329 ['', 'a', '', '', 'c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000330
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200331 for sep, expected in [
332 (':*', ['', 'a', 'b', 'c']),
333 ('(?::*)', ['', 'a', 'b', 'c']),
334 ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
335 ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
336 ]:
337 with self.subTest(sep=sep), self.assertWarns(FutureWarning):
338 self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
339
340 for sep, expected in [
341 ('', [':a:b::c']),
342 (r'\b', [':a:b::c']),
343 (r'(?=:)', [':a:b::c']),
344 (r'(?<=:)', [':a:b::c']),
345 ]:
346 with self.subTest(sep=sep), self.assertRaises(ValueError):
347 self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
348
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000349 def test_qualified_re_split(self):
Victor Stinner55e614a2014-10-29 16:58:59 +0100350 self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
351 self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
352 self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000353 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200354 self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000355 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200356 with self.assertWarns(FutureWarning):
357 self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
358 ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000359
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000360 def test_re_findall(self):
361 self.assertEqual(re.findall(":+", "abc"), [])
Serhiy Storchaka25324972013-10-16 12:46:28 +0300362 for string in "a:b::c:::d", S("a:b::c:::d"):
363 self.assertTypedEqual(re.findall(":+", string),
364 [":", "::", ":::"])
365 self.assertTypedEqual(re.findall("(:+)", string),
366 [":", "::", ":::"])
367 self.assertTypedEqual(re.findall("(:)(:*)", string),
368 [(":", ""), (":", ":"), (":", "::")])
369 for string in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
370 memoryview(b"a:b::c:::d")):
371 self.assertTypedEqual(re.findall(b":+", string),
372 [b":", b"::", b":::"])
373 self.assertTypedEqual(re.findall(b"(:+)", string),
374 [b":", b"::", b":::"])
375 self.assertTypedEqual(re.findall(b"(:)(:*)", string),
376 [(b":", b""), (b":", b":"), (b":", b"::")])
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300377 for x in ("\xe0", "\u0430", "\U0001d49c"):
378 xx = x * 2
379 xxx = x * 3
380 string = "a%sb%sc%sd" % (x, xx, xxx)
381 self.assertEqual(re.findall("%s+" % x, string), [x, xx, xxx])
382 self.assertEqual(re.findall("(%s+)" % x, string), [x, xx, xxx])
383 self.assertEqual(re.findall("(%s)(%s*)" % (x, x), string),
384 [(x, ""), (x, x), (x, xx)])
Guido van Rossum49946571997-07-18 04:26:25 +0000385
Skip Montanaro5ba00542003-04-25 16:00:14 +0000386 def test_bug_117612(self):
387 self.assertEqual(re.findall(r"(a|(b))", "aba"),
388 [("a", ""),("b", "b"),("a", "")])
389
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000390 def test_re_match(self):
Serhiy Storchaka25324972013-10-16 12:46:28 +0300391 for string in 'a', S('a'):
392 self.assertEqual(re.match('a', string).groups(), ())
393 self.assertEqual(re.match('(a)', string).groups(), ('a',))
394 self.assertEqual(re.match('(a)', string).group(0), 'a')
395 self.assertEqual(re.match('(a)', string).group(1), 'a')
396 self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a'))
397 for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
398 self.assertEqual(re.match(b'a', string).groups(), ())
399 self.assertEqual(re.match(b'(a)', string).groups(), (b'a',))
400 self.assertEqual(re.match(b'(a)', string).group(0), b'a')
401 self.assertEqual(re.match(b'(a)', string).group(1), b'a')
402 self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300403 for a in ("\xe0", "\u0430", "\U0001d49c"):
404 self.assertEqual(re.match(a, a).groups(), ())
405 self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
406 self.assertEqual(re.match('(%s)' % a, a).group(0), a)
407 self.assertEqual(re.match('(%s)' % a, a).group(1), a)
408 self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))
Guido van Rossum49946571997-07-18 04:26:25 +0000409
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000410 pat = re.compile('((a)|(b))(c)?')
411 self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
412 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
413 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
414 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
415 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
Guido van Rossum8430c581998-04-03 21:47:12 +0000416
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000417 # A single group
418 m = re.match('(a)', 'a')
419 self.assertEqual(m.group(0), 'a')
420 self.assertEqual(m.group(0), 'a')
421 self.assertEqual(m.group(1), 'a')
422 self.assertEqual(m.group(1, 1), ('a', 'a'))
Guido van Rossum49946571997-07-18 04:26:25 +0000423
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000424 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
425 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
426 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
427 (None, 'b', None))
428 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000429
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200430 def test_re_fullmatch(self):
431 # Issue 16203: Proposal: add re.fullmatch() method.
432 self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
433 for string in "ab", S("ab"):
434 self.assertEqual(re.fullmatch(r"a|ab", string).span(), (0, 2))
435 for string in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
436 self.assertEqual(re.fullmatch(br"a|ab", string).span(), (0, 2))
437 for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
438 r = r"%s|%s" % (a, a + b)
439 self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2))
440 self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3))
441 self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3))
442 self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2))
443 self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3))
444 self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
445 self.assertIsNone(re.fullmatch(r"a+", "ab"))
446 self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
447 self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
448 self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
449 self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
450 self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4))
451 self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2))
452
453 self.assertEqual(
454 re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
455 self.assertEqual(
456 re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
457 self.assertEqual(
458 re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
459
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000460 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000461 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
462 ('(', 'a'))
463 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
464 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300465 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
466 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000467 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
468 ('a', 'b'))
469 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
470 (None, 'd'))
471 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
472 (None, 'd'))
473 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
474 ('a', ''))
475
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000476 # Tests for bug #1177831: exercise groups other than the first group
477 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
478 self.assertEqual(p.match('abc').groups(),
479 ('a', 'b', 'c'))
480 self.assertEqual(p.match('ad').groups(),
481 ('a', None, 'd'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300482 self.assertIsNone(p.match('abd'))
483 self.assertIsNone(p.match('ac'))
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000484
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300485 # Support > 100 groups.
486 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
487 pat = '(?:%s)(?(200)z)' % pat
488 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000489
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200490 self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
491 self.checkPatternError(r'()(?(1)a|b',
492 'missing ), unterminated subpattern', 2)
493 self.checkPatternError(r'()(?(1)a|b|c)',
494 'conditional backref with more than '
495 'two branches', 10)
496
497 def test_re_groupref_overflow(self):
498 self.checkTemplateError('()', '\g<%s>' % sre_constants.MAXGROUPS, 'xx',
499 'invalid group reference', 3)
500 self.checkPatternError(r'(?P<a>)(?(%d))' % sre_constants.MAXGROUPS,
501 'invalid group reference', 10)
502
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000503 def test_re_groupref(self):
504 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
505 ('|', 'a'))
506 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
507 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300508 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
509 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000510 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
511 ('a', 'a'))
512 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
513 (None, None))
514
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200515 self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
516
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000517 def test_groupdict(self):
518 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
519 'first second').groupdict(),
520 {'first':'first', 'second':'second'})
521
522 def test_expand(self):
523 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
524 "first second")
525 .expand(r"\2 \1 \g<second> \g<first>"),
526 "second first second first")
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300527 self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
528 "first")
529 .expand(r"\2 \g<second>"),
530 " ")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000531
532 def test_repeat_minmax(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300533 self.assertIsNone(re.match("^(\w){1}$", "abc"))
534 self.assertIsNone(re.match("^(\w){1}?$", "abc"))
535 self.assertIsNone(re.match("^(\w){1,2}$", "abc"))
536 self.assertIsNone(re.match("^(\w){1,2}?$", "abc"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000537
538 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
539 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
540 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
541 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
542 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
543 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
544 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
545 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
546
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300547 self.assertIsNone(re.match("^x{1}$", "xxx"))
548 self.assertIsNone(re.match("^x{1}?$", "xxx"))
549 self.assertIsNone(re.match("^x{1,2}$", "xxx"))
550 self.assertIsNone(re.match("^x{1,2}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000551
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300552 self.assertTrue(re.match("^x{3}$", "xxx"))
553 self.assertTrue(re.match("^x{1,3}$", "xxx"))
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200554 self.assertTrue(re.match("^x{3,3}$", "xxx"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300555 self.assertTrue(re.match("^x{1,4}$", "xxx"))
556 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
557 self.assertTrue(re.match("^x{3}?$", "xxx"))
558 self.assertTrue(re.match("^x{1,3}?$", "xxx"))
559 self.assertTrue(re.match("^x{1,4}?$", "xxx"))
560 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000561
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300562 self.assertIsNone(re.match("^x{}$", "xxx"))
563 self.assertTrue(re.match("^x{}$", "x{}"))
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000564
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200565 self.checkPatternError(r'x{2,1}',
566 'min repeat greater than max repeat', 2)
567
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000568 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000569 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000570 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000571 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
572 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
573 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
574 {'first': 1, 'other': 2})
575
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000576 self.assertEqual(re.match("(a)", "a").pos, 0)
577 self.assertEqual(re.match("(a)", "a").endpos, 1)
578 self.assertEqual(re.match("(a)", "a").string, "a")
579 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300580 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000581
Serhiy Storchaka07360df2015-03-30 01:01:48 +0300582 # Issue 14260. groupindex should be non-modifiable mapping.
583 p = re.compile(r'(?i)(?P<first>a)(?P<other>b)')
584 self.assertEqual(sorted(p.groupindex), ['first', 'other'])
585 self.assertEqual(p.groupindex['other'], 2)
586 with self.assertRaises(TypeError):
587 p.groupindex['other'] = 0
588 self.assertEqual(p.groupindex['other'], 2)
589
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000590 def test_special_escapes(self):
591 self.assertEqual(re.search(r"\b(b.)\b",
592 "abcd abc bcd bx").group(1), "bx")
593 self.assertEqual(re.search(r"\B(b.)\B",
594 "abc bcd bc abxd").group(1), "bx")
595 self.assertEqual(re.search(r"\b(b.)\b",
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300596 "abcd abc bcd bx", re.ASCII).group(1), "bx")
597 self.assertEqual(re.search(r"\B(b.)\B",
598 "abc bcd bc abxd", re.ASCII).group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000599 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
600 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300601 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300602 self.assertEqual(re.search(br"\b(b.)\b",
603 b"abcd abc bcd bx").group(1), b"bx")
604 self.assertEqual(re.search(br"\B(b.)\B",
605 b"abc bcd bc abxd").group(1), b"bx")
606 self.assertEqual(re.search(br"\b(b.)\b",
607 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
608 self.assertEqual(re.search(br"\B(b.)\B",
609 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
610 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
611 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300612 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000613 self.assertEqual(re.search(r"\d\D\w\W\s\S",
614 "1aa! a").group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300615 self.assertEqual(re.search(br"\d\D\w\W\s\S",
616 b"1aa! a").group(0), b"1aa! a")
617 self.assertEqual(re.search(r"\d\D\w\W\s\S",
618 "1aa! a", re.ASCII).group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300619 self.assertEqual(re.search(br"\d\D\w\W\s\S",
620 b"1aa! a", re.LOCALE).group(0), b"1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000621
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200622 def test_other_escapes(self):
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200623 self.checkPatternError("\\", 'bad escape (end of pattern)', 0)
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200624 self.assertEqual(re.match(r"\(", '(').group(), '(')
625 self.assertIsNone(re.match(r"\(", ')'))
626 self.assertEqual(re.match(r"\\", '\\').group(), '\\')
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200627 self.assertEqual(re.match(r"[\]]", ']').group(), ']')
628 self.assertIsNone(re.match(r"[\]]", '['))
629 self.assertEqual(re.match(r"[a\-c]", '-').group(), '-')
630 self.assertIsNone(re.match(r"[a\-c]", 'b'))
631 self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
632 self.assertIsNone(re.match(r"[\^a]+", 'b'))
Serhiy Storchakaa54aae02015-03-24 22:58:14 +0200633 re.purge() # for warnings
634 for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY':
635 with self.subTest(c):
636 with self.assertWarns(DeprecationWarning):
637 self.assertEqual(re.fullmatch('\\%c' % c, c).group(), c)
638 self.assertIsNone(re.match('\\%c' % c, 'a'))
639 for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
640 with self.subTest(c):
641 with self.assertWarns(DeprecationWarning):
642 self.assertEqual(re.fullmatch('[\\%c]' % c, c).group(), c)
643 self.assertIsNone(re.match('[\\%c]' % c, 'a'))
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200644
Ezio Melotti5a045b92012-02-29 11:48:44 +0200645 def test_string_boundaries(self):
646 # See http://bugs.python.org/issue10713
647 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
648 "abc")
649 # There's a word boundary at the start of a string.
650 self.assertTrue(re.match(r"\b", "abc"))
651 # A non-empty string includes a non-boundary zero-length match.
652 self.assertTrue(re.search(r"\B", "abc"))
653 # There is no non-boundary match at the start of a string.
654 self.assertFalse(re.match(r"\B", "abc"))
655 # However, an empty string contains no word boundaries, and also no
656 # non-boundaries.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300657 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200658 # This one is questionable and different from the perlre behaviour,
659 # but describes current behavior.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300660 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200661 # A single word-character string has two boundaries, but no
662 # non-boundary gaps.
663 self.assertEqual(len(re.findall(r"\b", "a")), 2)
664 self.assertEqual(len(re.findall(r"\B", "a")), 0)
665 # If there are no words, there are no boundaries
666 self.assertEqual(len(re.findall(r"\b", " ")), 0)
667 self.assertEqual(len(re.findall(r"\b", " ")), 0)
668 # Can match around the whitespace.
669 self.assertEqual(len(re.findall(r"\B", " ")), 2)
670
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000671 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000672 self.assertEqual(re.match("([\u2222\u2223])",
673 "\u2222").group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300674 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300675 self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000676
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100677 def test_big_codesize(self):
678 # Issue #1160
679 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300680 self.assertTrue(r.match('1000'))
681 self.assertTrue(r.match('9999'))
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100682
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000683 def test_anyall(self):
684 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
685 "a\nb")
686 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
687 "a\n\nb")
688
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200689 def test_lookahead(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000690 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
691 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
692 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
693 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
694 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
695 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
696 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
697
698 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
699 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
700 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
701 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
702
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200703 # Group reference.
704 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
705 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
706 # Conditional group reference.
707 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
708 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
709 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
710 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
711 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
712 # Group used before defined.
713 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
714 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
715 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
716
717 def test_lookbehind(self):
718 self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
719 self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
720 self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
721 self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
722 # Group reference.
723 self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
724 self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
725 self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
726 self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
727 # Conditional group reference.
728 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
729 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
730 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
731 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
732 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
733 # Group used before defined.
734 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
735 self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc'))
736 self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc'))
737 # Group defined in the same lookbehind pattern
738 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)')
739 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)')
740 self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)')
741 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)')
742
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000743 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000744 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300745 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000746 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
747 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
748 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
749 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
750 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
751 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
752 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
753 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
754
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200755 assert '\u212a'.lower() == 'k' # 'K'
756 self.assertTrue(re.match(r'K', '\u212a', re.I))
757 self.assertTrue(re.match(r'k', '\u212a', re.I))
758 self.assertTrue(re.match(r'\u212a', 'K', re.I))
759 self.assertTrue(re.match(r'\u212a', 'k', re.I))
760 assert '\u017f'.upper() == 'S' # 'ſ'
761 self.assertTrue(re.match(r'S', '\u017f', re.I))
762 self.assertTrue(re.match(r's', '\u017f', re.I))
763 self.assertTrue(re.match(r'\u017f', 'S', re.I))
764 self.assertTrue(re.match(r'\u017f', 's', re.I))
765 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
766 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
767 self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
768
769 def test_ignore_case_set(self):
770 self.assertTrue(re.match(r'[19A]', 'A', re.I))
771 self.assertTrue(re.match(r'[19a]', 'a', re.I))
772 self.assertTrue(re.match(r'[19a]', 'A', re.I))
773 self.assertTrue(re.match(r'[19A]', 'a', re.I))
774 self.assertTrue(re.match(br'[19A]', b'A', re.I))
775 self.assertTrue(re.match(br'[19a]', b'a', re.I))
776 self.assertTrue(re.match(br'[19a]', b'A', re.I))
777 self.assertTrue(re.match(br'[19A]', b'a', re.I))
778 assert '\u212a'.lower() == 'k' # 'K'
779 self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
780 self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
781 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
782 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
783 assert '\u017f'.upper() == 'S' # 'ſ'
784 self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
785 self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
786 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
787 self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
788 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
789 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
790 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
791
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200792 def test_ignore_case_range(self):
793 # Issues #3511, #17381.
794 self.assertTrue(re.match(r'[9-a]', '_', re.I))
795 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
796 self.assertTrue(re.match(br'[9-a]', b'_', re.I))
797 self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
798 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
799 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
800 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
801 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
802 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
803 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
804 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
805 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
806 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
807 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
808 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
809 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
810
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200811 assert '\u212a'.lower() == 'k' # 'K'
812 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
813 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
814 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
815 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
816 assert '\u017f'.upper() == 'S' # 'ſ'
817 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
818 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
819 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
820 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
821 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
822 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
823 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
824
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000825 def test_category(self):
826 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
827
828 def test_getlower(self):
829 import _sre
830 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
831 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
832 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200833 self.assertEqual(_sre.getlower(ord('A'), re.ASCII), ord('a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000834
835 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300836 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200837 self.assertEqual(re.match("abc", "ABC", re.I|re.A).group(0), "ABC")
838 self.assertEqual(re.match(b"abc", b"ABC", re.I|re.L).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000839
840 def test_not_literal(self):
841 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
842 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
843
844 def test_search_coverage(self):
845 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
846 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
847
Ezio Melottid2114eb2011-03-25 14:08:44 +0200848 def assertMatch(self, pattern, text, match=None, span=None,
849 matcher=re.match):
850 if match is None and span is None:
851 # the pattern matches the whole text
852 match = text
853 span = (0, len(text))
854 elif match is None or span is None:
855 raise ValueError('If match is not None, span should be specified '
856 '(and vice versa).')
857 m = matcher(pattern, text)
858 self.assertTrue(m)
859 self.assertEqual(m.group(), match)
860 self.assertEqual(m.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000861
Ezio Melottid2114eb2011-03-25 14:08:44 +0200862 def test_re_escape(self):
Ezio Melotti88fdeb42011-04-10 12:59:16 +0300863 alnum_chars = string.ascii_letters + string.digits + '_'
Ezio Melottid2114eb2011-03-25 14:08:44 +0200864 p = ''.join(chr(i) for i in range(256))
865 for c in p:
866 if c in alnum_chars:
867 self.assertEqual(re.escape(c), c)
868 elif c == '\x00':
869 self.assertEqual(re.escape(c), '\\000')
870 else:
871 self.assertEqual(re.escape(c), '\\' + c)
872 self.assertMatch(re.escape(c), c)
873 self.assertMatch(re.escape(p), p)
Guido van Rossum49946571997-07-18 04:26:25 +0000874
Guido van Rossum698280d2008-09-10 17:44:35 +0000875 def test_re_escape_byte(self):
Ezio Melotti88fdeb42011-04-10 12:59:16 +0300876 alnum_chars = (string.ascii_letters + string.digits + '_').encode('ascii')
Ezio Melottid2114eb2011-03-25 14:08:44 +0200877 p = bytes(range(256))
878 for i in p:
Guido van Rossum698280d2008-09-10 17:44:35 +0000879 b = bytes([i])
Ezio Melottid2114eb2011-03-25 14:08:44 +0200880 if b in alnum_chars:
881 self.assertEqual(re.escape(b), b)
882 elif i == 0:
883 self.assertEqual(re.escape(b), b'\\000')
884 else:
885 self.assertEqual(re.escape(b), b'\\' + b)
886 self.assertMatch(re.escape(b), b)
887 self.assertMatch(re.escape(p), p)
Guido van Rossum698280d2008-09-10 17:44:35 +0000888
Ezio Melotti7b9e97b2011-03-25 14:09:33 +0200889 def test_re_escape_non_ascii(self):
890 s = 'xxx\u2620\u2620\u2620xxx'
891 s_escaped = re.escape(s)
892 self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
893 self.assertMatch(s_escaped, s)
894 self.assertMatch('.%s+.' % re.escape('\u2620'), s,
895 'x\u2620\u2620\u2620x', (2, 7), re.search)
896
897 def test_re_escape_non_ascii_bytes(self):
898 b = 'y\u2620y\u2620y'.encode('utf-8')
899 b_escaped = re.escape(b)
900 self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
901 self.assertMatch(b_escaped, b)
902 res = re.findall(re.escape('\u2620'.encode('utf-8')), b)
903 self.assertEqual(len(res), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000904
Serhiy Storchakab85a9762014-09-15 11:33:19 +0300905 def test_pickling(self):
906 import pickle
907 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
908 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
909 pickled = pickle.dumps(oldpat, proto)
910 newpat = pickle.loads(pickled)
911 self.assertEqual(newpat, oldpat)
912 # current pickle expects the _compile() reconstructor in re module
913 from re import _compile
Guido van Rossum23b22571997-07-17 22:36:14 +0000914
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000915 def test_constants(self):
916 self.assertEqual(re.I, re.IGNORECASE)
917 self.assertEqual(re.L, re.LOCALE)
918 self.assertEqual(re.M, re.MULTILINE)
919 self.assertEqual(re.S, re.DOTALL)
920 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000921
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000922 def test_flags(self):
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200923 for flag in [re.I, re.M, re.X, re.S, re.A, re.U]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300924 self.assertTrue(re.compile('^pattern$', flag))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200925 for flag in [re.I, re.M, re.X, re.S, re.A, re.L]:
926 self.assertTrue(re.compile(b'^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +0000927
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000928 def test_sre_character_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200929 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
930 if i < 256:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300931 self.assertTrue(re.match(r"\%03o" % i, chr(i)))
932 self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
933 self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
934 self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
935 self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
936 self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200937 if i < 0x10000:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300938 self.assertTrue(re.match(r"\u%04x" % i, chr(i)))
939 self.assertTrue(re.match(r"\u%04x0" % i, chr(i)+"0"))
940 self.assertTrue(re.match(r"\u%04xz" % i, chr(i)+"z"))
941 self.assertTrue(re.match(r"\U%08x" % i, chr(i)))
942 self.assertTrue(re.match(r"\U%08x0" % i, chr(i)+"0"))
943 self.assertTrue(re.match(r"\U%08xz" % i, chr(i)+"z"))
944 self.assertTrue(re.match(r"\0", "\000"))
945 self.assertTrue(re.match(r"\08", "\0008"))
946 self.assertTrue(re.match(r"\01", "\001"))
947 self.assertTrue(re.match(r"\018", "\0018"))
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200948 self.checkPatternError(r"\567",
949 r'octal escape value \567 outside of '
950 r'range 0-0o377', 0)
951 self.checkPatternError(r"\911", 'invalid group reference', 0)
952 self.checkPatternError(r"\x1", r'incomplete escape \x1', 0)
953 self.checkPatternError(r"\x1z", r'incomplete escape \x1', 0)
954 self.checkPatternError(r"\u123", r'incomplete escape \u123', 0)
955 self.checkPatternError(r"\u123z", r'incomplete escape \u123', 0)
956 self.checkPatternError(r"\U0001234", r'incomplete escape \U0001234', 0)
957 self.checkPatternError(r"\U0001234z", r'incomplete escape \U0001234', 0)
958 self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0)
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000959
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000960 def test_sre_character_class_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200961 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
962 if i < 256:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300963 self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
964 self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
965 self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
966 self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
967 self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
968 self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
969 self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
970 self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200971 if i < 0x10000:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300972 self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
973 self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
974 self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
975 self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
976 self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
977 self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200978 self.checkPatternError(r"[\567]",
979 r'octal escape value \567 outside of '
980 r'range 0-0o377', 1)
981 self.checkPatternError(r"[\911]", r'bad escape \9', 1)
982 self.checkPatternError(r"[\x1z]", r'incomplete escape \x1', 1)
983 self.checkPatternError(r"[\u123z]", r'incomplete escape \u123', 1)
984 self.checkPatternError(r"[\U0001234z]", r'incomplete escape \U0001234', 1)
985 self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1)
Serhiy Storchakac563caf2014-09-23 23:22:41 +0300986 self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200987
988 def test_sre_byte_literals(self):
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000989 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300990 self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
991 self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
992 self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
993 self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
994 self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
995 self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
Serhiy Storchakaa54aae02015-03-24 22:58:14 +0200996 with self.assertWarns(DeprecationWarning):
997 self.assertTrue(re.match(br"\u1234", b'u1234'))
998 with self.assertWarns(DeprecationWarning):
999 self.assertTrue(re.match(br"\U00012345", b'U00012345'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001000 self.assertTrue(re.match(br"\0", b"\000"))
1001 self.assertTrue(re.match(br"\08", b"\0008"))
1002 self.assertTrue(re.match(br"\01", b"\001"))
1003 self.assertTrue(re.match(br"\018", b"\0018"))
Serhiy Storchaka632a77e2015-03-25 21:03:47 +02001004 self.checkPatternError(br"\567",
1005 r'octal escape value \567 outside of '
1006 r'range 0-0o377', 0)
1007 self.checkPatternError(br"\911", 'invalid group reference', 0)
1008 self.checkPatternError(br"\x1", r'incomplete escape \x1', 0)
1009 self.checkPatternError(br"\x1z", r'incomplete escape \x1', 0)
Antoine Pitrou463badf2012-06-23 13:29:19 +02001010
1011 def test_sre_byte_class_literals(self):
1012 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001013 self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i])))
1014 self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i])))
1015 self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i])))
1016 self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
1017 self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
1018 self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
1019 self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
1020 self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
Serhiy Storchakaa54aae02015-03-24 22:58:14 +02001021 with self.assertWarns(DeprecationWarning):
1022 self.assertTrue(re.match(br"[\u1234]", b'u'))
1023 with self.assertWarns(DeprecationWarning):
1024 self.assertTrue(re.match(br"[\U00012345]", b'U'))
Serhiy Storchaka632a77e2015-03-25 21:03:47 +02001025 self.checkPatternError(br"[\567]",
1026 r'octal escape value \567 outside of '
1027 r'range 0-0o377', 1)
1028 self.checkPatternError(br"[\911]", r'bad escape \9', 1)
1029 self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1)
1030
1031 def test_character_set_errors(self):
1032 self.checkPatternError(r'[', 'unterminated character set', 0)
1033 self.checkPatternError(r'[^', 'unterminated character set', 0)
1034 self.checkPatternError(r'[a', 'unterminated character set', 0)
1035 # bug 545855 -- This pattern failed to cause a compile error as it
1036 # should, instead provoking a TypeError.
1037 self.checkPatternError(r"[a-", 'unterminated character set', 0)
1038 self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1)
1039 self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1)
1040 self.checkPatternError(r"[b-a]", 'bad character range b-a', 1)
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +00001041
Skip Montanaro7d9963f2003-04-25 14:12:40 +00001042 def test_bug_113254(self):
1043 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
1044 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
1045 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
1046
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001047 def test_bug_527371(self):
1048 # bug described in patches 527371/672491
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001049 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001050 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
1051 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
1052 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
1053 self.assertEqual(re.match("((a))", "a").lastindex, 1)
1054
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001055 def test_bug_418626(self):
1056 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
1057 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
1058 # pattern '*?' on a long string.
1059 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
1060 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
1061 20003)
1062 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001063 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +00001064 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001065 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001066
1067 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001068 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001069 self.assertEqual(re.compile(pat) and 1, 1)
1070
Skip Montanaro1e703c62003-04-25 15:40:28 +00001071 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001072 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +00001073 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001074 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
1075 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
1076 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +00001077
Serhiy Storchaka632a77e2015-03-25 21:03:47 +02001078 def test_nothing_to_repeat(self):
1079 for reps in '*', '+', '?', '{1,2}':
1080 for mod in '', '?':
1081 self.checkPatternError('%s%s' % (reps, mod),
1082 'nothing to repeat', 0)
1083 self.checkPatternError('(?:%s%s)' % (reps, mod),
1084 'nothing to repeat', 3)
1085
1086 def test_multiple_repeat(self):
1087 for outer_reps in '*', '+', '{1,2}':
1088 for outer_mod in '', '?':
1089 outer_op = outer_reps + outer_mod
1090 for inner_reps in '*', '+', '?', '{1,2}':
1091 for inner_mod in '', '?':
1092 inner_op = inner_reps + inner_mod
1093 self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
1094 'multiple repeat', 1 + len(inner_op))
1095
Serhiy Storchakafa468162013-02-16 21:23:53 +02001096 def test_unlimited_zero_width_repeat(self):
1097 # Issue #9669
1098 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
1099 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
1100 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
1101 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
1102 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
1103 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
1104
Skip Montanaro1e703c62003-04-25 15:40:28 +00001105 def test_scanner(self):
1106 def s_ident(scanner, token): return token
1107 def s_operator(scanner, token): return "op%s" % token
1108 def s_float(scanner, token): return float(token)
1109 def s_int(scanner, token): return int(token)
1110
1111 scanner = Scanner([
1112 (r"[a-zA-Z_]\w*", s_ident),
1113 (r"\d+\.\d*", s_float),
1114 (r"\d+", s_int),
1115 (r"=|\+|-|\*|/", s_operator),
1116 (r"\s+", None),
1117 ])
1118
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001119 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +00001120
Skip Montanaro1e703c62003-04-25 15:40:28 +00001121 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
1122 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
1123 'op+', 'bar'], ''))
1124
Skip Montanaro5ba00542003-04-25 16:00:14 +00001125 def test_bug_448951(self):
1126 # bug 448951 (similar to 429357, but with single char match)
1127 # (Also test greedy matches.)
1128 for op in '','?','*':
1129 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
1130 (None, None))
1131 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
1132 ('a:', 'a'))
1133
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001134 def test_bug_725106(self):
1135 # capturing groups in alternatives in repeats
1136 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
1137 ('b', 'a'))
1138 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
1139 ('c', 'b'))
1140 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
1141 ('b', None))
1142 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
1143 ('b', None))
1144 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
1145 ('b', 'a'))
1146 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
1147 ('c', 'b'))
1148 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
1149 ('b', None))
1150 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
1151 ('b', None))
1152
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +00001153 def test_bug_725149(self):
1154 # mark_stack_base restoring before restoring marks
1155 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
1156 ('a', None))
1157 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
1158 ('a', None, None))
1159
Just van Rossum12723ba2003-07-02 20:03:04 +00001160 def test_bug_764548(self):
1161 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001162 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +00001163 pat = re.compile(my_unicode("abc"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001164 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +00001165
Skip Montanaro5ba00542003-04-25 16:00:14 +00001166 def test_finditer(self):
1167 iter = re.finditer(r":+", "a:b::c:::d")
1168 self.assertEqual([item.group(0) for item in iter],
1169 [":", "::", ":::"])
1170
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001171 pat = re.compile(r":+")
1172 iter = pat.finditer("a:b::c:::d", 1, 10)
1173 self.assertEqual([item.group(0) for item in iter],
1174 [":", "::", ":::"])
1175
1176 pat = re.compile(r":+")
1177 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
1178 self.assertEqual([item.group(0) for item in iter],
1179 [":", "::", ":::"])
1180
1181 pat = re.compile(r":+")
1182 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1183 self.assertEqual([item.group(0) for item in iter],
1184 [":", "::", ":::"])
1185
1186 pat = re.compile(r":+")
1187 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1188 self.assertEqual([item.group(0) for item in iter],
1189 ["::", "::"])
1190
Thomas Wouters40a088d2008-03-18 20:19:54 +00001191 def test_bug_926075(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001192 self.assertIsNot(re.compile('bug_926075'),
1193 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +00001194
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001195 def test_bug_931848(self):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001196 pattern = "[\u002E\u3002\uFF0E\uFF61]"
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001197 self.assertEqual(re.compile(pattern).split("a.b.c"),
1198 ['a','b','c'])
1199
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001200 def test_bug_581080(self):
1201 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +00001202 self.assertEqual(next(iter).span(), (1,2))
1203 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001204
1205 scanner = re.compile(r"\s").scanner("a b")
1206 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001207 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001208
1209 def test_bug_817234(self):
1210 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +00001211 self.assertEqual(next(iter).span(), (0, 4))
1212 self.assertEqual(next(iter).span(), (4, 4))
1213 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001214
Mark Dickinson1f268282009-07-28 17:22:36 +00001215 def test_bug_6561(self):
1216 # '\d' should match characters in Unicode category 'Nd'
1217 # (Number, Decimal Digit), but not those in 'Nl' (Number,
1218 # Letter) or 'No' (Number, Other).
1219 decimal_digits = [
1220 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1221 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1222 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1223 ]
1224 for x in decimal_digits:
1225 self.assertEqual(re.match('^\d$', x).group(0), x)
1226
1227 not_decimal_digits = [
1228 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1229 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1230 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1231 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1232 ]
1233 for x in not_decimal_digits:
1234 self.assertIsNone(re.match('^\d$', x))
1235
Guido van Rossumd8faa362007-04-27 19:54:29 +00001236 def test_empty_array(self):
1237 # SF buf 1647541
1238 import array
Guido van Rossum166746c2007-07-03 15:39:16 +00001239 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +00001240 a = array.array(typecode)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001241 self.assertIsNone(re.compile(b"bla").match(a))
Antoine Pitroufd036452008-08-19 17:56:33 +00001242 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001243
Christian Heimes072c0f12008-01-03 23:01:04 +00001244 def test_inline_flags(self):
1245 # Bug #1700
Serhiy Storchakaab140882014-11-11 21:13:28 +02001246 upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
1247 lower_char = '\u1ea1' # Latin Small Letter A with Dot Below
Christian Heimes072c0f12008-01-03 23:01:04 +00001248
1249 p = re.compile(upper_char, re.I | re.U)
1250 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001251 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001252
1253 p = re.compile(lower_char, re.I | re.U)
1254 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001255 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001256
1257 p = re.compile('(?i)' + upper_char, re.U)
1258 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001259 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001260
1261 p = re.compile('(?i)' + lower_char, re.U)
1262 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001263 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001264
1265 p = re.compile('(?iu)' + upper_char)
1266 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001267 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001268
1269 p = re.compile('(?iu)' + lower_char)
1270 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001271 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001272
Christian Heimes25bb7832008-01-11 16:17:00 +00001273 def test_dollar_matches_twice(self):
1274 "$ matches the end of string, and just before the terminating \n"
1275 pattern = re.compile('$')
1276 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1277 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1278 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1279
1280 pattern = re.compile('$', re.MULTILINE)
1281 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1282 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1283 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1284
Antoine Pitroufd036452008-08-19 17:56:33 +00001285 def test_bytes_str_mixing(self):
1286 # Mixing str and bytes is disallowed
1287 pat = re.compile('.')
1288 bpat = re.compile(b'.')
1289 self.assertRaises(TypeError, pat.match, b'b')
1290 self.assertRaises(TypeError, bpat.match, 'b')
1291 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1292 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1293 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1294 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1295 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1296 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1297
1298 def test_ascii_and_unicode_flag(self):
1299 # String patterns
1300 for flags in (0, re.UNICODE):
1301 pat = re.compile('\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001302 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001303 pat = re.compile('\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001304 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001305 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001306 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001307 pat = re.compile('(?a)\xc0', re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001308 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001309 pat = re.compile('\w', re.ASCII)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001310 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001311 pat = re.compile('(?a)\w')
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001312 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001313 # Bytes patterns
1314 for flags in (0, re.ASCII):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001315 pat = re.compile(b'\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001316 self.assertIsNone(pat.match(b'\xe0'))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001317 pat = re.compile(b'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001318 self.assertIsNone(pat.match(b'\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001319 # Incompatibilities
1320 self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
1321 self.assertRaises(ValueError, re.compile, b'(?u)\w')
1322 self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
1323 self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
1324 self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
1325 self.assertRaises(ValueError, re.compile, '(?au)\w')
1326
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001327 def test_locale_flag(self):
1328 import locale
1329 _, enc = locale.getlocale(locale.LC_CTYPE)
1330 # Search non-ASCII letter
1331 for i in range(128, 256):
1332 try:
1333 c = bytes([i]).decode(enc)
1334 sletter = c.lower()
1335 if sletter == c: continue
1336 bletter = sletter.encode(enc)
1337 if len(bletter) != 1: continue
1338 if bletter.decode(enc) != sletter: continue
1339 bpat = re.escape(bytes([i]))
1340 break
1341 except (UnicodeError, TypeError):
1342 pass
1343 else:
1344 bletter = None
1345 bpat = b'A'
1346 # Bytes patterns
1347 pat = re.compile(bpat, re.LOCALE | re.IGNORECASE)
1348 if bletter:
1349 self.assertTrue(pat.match(bletter))
1350 pat = re.compile(b'(?L)' + bpat, re.IGNORECASE)
1351 if bletter:
1352 self.assertTrue(pat.match(bletter))
1353 pat = re.compile(bpat, re.IGNORECASE)
1354 if bletter:
1355 self.assertIsNone(pat.match(bletter))
1356 pat = re.compile(b'\w', re.LOCALE)
1357 if bletter:
1358 self.assertTrue(pat.match(bletter))
1359 pat = re.compile(b'(?L)\w')
1360 if bletter:
1361 self.assertTrue(pat.match(bletter))
1362 pat = re.compile(b'\w')
1363 if bletter:
1364 self.assertIsNone(pat.match(bletter))
1365 # Incompatibilities
1366 self.assertWarns(DeprecationWarning, re.compile, '', re.LOCALE)
1367 self.assertWarns(DeprecationWarning, re.compile, '(?L)')
1368 self.assertWarns(DeprecationWarning, re.compile, b'', re.LOCALE | re.ASCII)
1369 self.assertWarns(DeprecationWarning, re.compile, b'(?L)', re.ASCII)
1370 self.assertWarns(DeprecationWarning, re.compile, b'(?a)', re.LOCALE)
1371 self.assertWarns(DeprecationWarning, re.compile, b'(?aL)')
1372
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001373 def test_bug_6509(self):
1374 # Replacement strings of both types must parse properly.
1375 # all strings
1376 pat = re.compile('a(\w)')
1377 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1378 pat = re.compile('a(.)')
1379 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1380 pat = re.compile('..')
1381 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1382
1383 # all bytes
1384 pat = re.compile(b'a(\w)')
1385 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1386 pat = re.compile(b'a(.)')
1387 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1388 pat = re.compile(b'..')
1389 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1390
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001391 def test_dealloc(self):
1392 # issue 3299: check for segfault in debug build
1393 import _sre
Ezio Melotti0f77f462010-01-23 10:49:39 +00001394 # the overflow limit is different on wide and narrow builds and it
1395 # depends on the definition of SRE_CODE (see sre.h).
1396 # 2**128 should be big enough to overflow on both. For smaller values
1397 # a RuntimeError is raised instead of OverflowError.
1398 long_overflow = 2**128
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001399 self.assertRaises(TypeError, re.finditer, "a", {})
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001400 with self.assertRaises(OverflowError):
1401 _sre.compile("abc", 0, [long_overflow], 0, [], [])
1402 with self.assertRaises(TypeError):
1403 _sre.compile({}, 0, [], 0, [], [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 def test_search_dot_unicode(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001406 self.assertTrue(re.search("123.*-", '123abc-'))
1407 self.assertTrue(re.search("123.*-", '123\xe9-'))
1408 self.assertTrue(re.search("123.*-", '123\u20ac-'))
1409 self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1410 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411
Ezio Melottidf723e12012-03-13 01:29:48 +02001412 def test_compile(self):
1413 # Test return value when given string and pattern as parameter
1414 pattern = re.compile('random pattern')
1415 self.assertIsInstance(pattern, re._pattern_type)
1416 same_pattern = re.compile(pattern)
1417 self.assertIsInstance(same_pattern, re._pattern_type)
1418 self.assertIs(same_pattern, pattern)
1419 # Test behaviour when not given a string or pattern as parameter
1420 self.assertRaises(TypeError, re.compile, 0)
1421
Ezio Melottife8e6e72013-01-11 08:32:01 +02001422 def test_bug_13899(self):
1423 # Issue #13899: re pattern r"[\A]" should work like "A" but matches
1424 # nothing. Ditto B and Z.
Serhiy Storchakaa54aae02015-03-24 22:58:14 +02001425 with self.assertWarns(DeprecationWarning):
1426 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
1427 ['A', 'B', '\b', 'C', 'Z'])
Ezio Melottife8e6e72013-01-11 08:32:01 +02001428
Antoine Pitroub33941a2012-12-03 20:55:56 +01001429 @bigmemtest(size=_2G, memuse=1)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001430 def test_large_search(self, size):
1431 # Issue #10182: indices were 32-bit-truncated.
1432 s = 'a' * size
1433 m = re.search('$', s)
1434 self.assertIsNotNone(m)
Antoine Pitrou86067c22012-12-03 21:08:43 +01001435 self.assertEqual(m.start(), size)
1436 self.assertEqual(m.end(), size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001437
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001438 # The huge memuse is because of re.sub() using a list and a join()
1439 # to create the replacement result.
Antoine Pitroub33941a2012-12-03 20:55:56 +01001440 @bigmemtest(size=_2G, memuse=16 + 2)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001441 def test_large_subn(self, size):
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001442 # Issue #10182: indices were 32-bit-truncated.
1443 s = 'a' * size
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001444 r, n = re.subn('', '', s)
1445 self.assertEqual(r, s)
1446 self.assertEqual(n, size + 1)
1447
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001448 def test_bug_16688(self):
1449 # Issue 16688: Backreferences make case-insensitive regex fail on
1450 # non-ASCII strings.
1451 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1452 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001453
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001454 def test_repeat_minmax_overflow(self):
1455 # Issue #13169
1456 string = "x" * 100000
1457 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1458 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1459 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1460 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1461 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1462 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1463 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1464 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1465 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1466 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1467 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1468
1469 @cpython_only
1470 def test_repeat_minmax_overflow_maxrepeat(self):
1471 try:
1472 from _sre import MAXREPEAT
1473 except ImportError:
1474 self.skipTest('requires _sre.MAXREPEAT constant')
1475 string = "x" * 100000
1476 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1477 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1478 (0, 100000))
1479 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1480 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1481 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1482 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1483
R David Murray26dfaac92013-04-14 13:00:54 -04001484 def test_backref_group_name_in_exception(self):
1485 # Issue 17341: Poor error message when compiling invalid regex
Serhiy Storchaka632a77e2015-03-25 21:03:47 +02001486 self.checkPatternError('(?P=<foo>)',
1487 "bad character in group name '<foo>'", 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001488
1489 def test_group_name_in_exception(self):
1490 # Issue 17341: Poor error message when compiling invalid regex
Serhiy Storchaka632a77e2015-03-25 21:03:47 +02001491 self.checkPatternError('(?P<?foo>)',
1492 "bad character in group name '?foo'", 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001493
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001494 def test_issue17998(self):
1495 for reps in '*', '+', '?', '{1}':
1496 for mod in '', '?':
1497 pattern = '.' + reps + mod + 'yz'
1498 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1499 ['xyz'], msg=pattern)
1500 pattern = pattern.encode()
1501 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1502 [b'xyz'], msg=pattern)
1503
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03001504 def test_match_repr(self):
1505 for string in '[abracadabra]', S('[abracadabra]'):
1506 m = re.search(r'(.+)(.*?)\1', string)
1507 self.assertEqual(repr(m), "<%s.%s object; "
1508 "span=(1, 12), match='abracadabra'>" %
1509 (type(m).__module__, type(m).__qualname__))
1510 for string in (b'[abracadabra]', B(b'[abracadabra]'),
1511 bytearray(b'[abracadabra]'),
1512 memoryview(b'[abracadabra]')):
1513 m = re.search(rb'(.+)(.*?)\1', string)
1514 self.assertEqual(repr(m), "<%s.%s object; "
1515 "span=(1, 12), match=b'abracadabra'>" %
1516 (type(m).__module__, type(m).__qualname__))
1517
1518 first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
1519 self.assertEqual(repr(first), "<%s.%s object; "
1520 "span=(0, 2), match='aa'>" %
1521 (type(second).__module__, type(first).__qualname__))
1522 self.assertEqual(repr(second), "<%s.%s object; "
1523 "span=(3, 5), match='bb'>" %
1524 (type(second).__module__, type(second).__qualname__))
1525
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001526
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001527 def test_bug_2537(self):
1528 # issue 2537: empty submatches
1529 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1530 for inner_op in ('{0,}', '*', '?'):
1531 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1532 m = r.match("xyyzy")
1533 self.assertEqual(m.group(0), "xyy")
1534 self.assertEqual(m.group(1), "")
1535 self.assertEqual(m.group(2), "y")
1536
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001537 def test_debug_flag(self):
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001538 pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001539 with captured_stdout() as out:
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001540 re.compile(pat, re.DEBUG)
1541 dump = '''\
Serhiy Storchakac7f7d382014-11-09 20:48:36 +02001542SUBPATTERN 1
1543 LITERAL 46
1544SUBPATTERN None
1545 BRANCH
1546 IN
1547 LITERAL 99
1548 LITERAL 104
1549 OR
1550 LITERAL 112
1551 LITERAL 121
1552SUBPATTERN None
1553 GROUPREF_EXISTS 1
1554 AT AT_END
1555 ELSE
1556 LITERAL 58
1557 LITERAL 32
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001558'''
1559 self.assertEqual(out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001560 # Debug output is output again even a second time (bypassing
1561 # the cache -- issue #20426).
1562 with captured_stdout() as out:
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001563 re.compile(pat, re.DEBUG)
1564 self.assertEqual(out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001565
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001566 def test_keyword_parameters(self):
1567 # Issue #20283: Accepting the string keyword parameter.
1568 pat = re.compile(r'(ab)')
1569 self.assertEqual(
1570 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1571 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001572 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1573 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001574 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1575 self.assertEqual(
1576 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1577 self.assertEqual(
1578 pat.split(string='abracadabra', maxsplit=1),
1579 ['', 'ab', 'racadabra'])
1580 self.assertEqual(
1581 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1582 (7, 9))
1583
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001584 def test_bug_20998(self):
1585 # Issue #20998: Fullmatch of repeated single character pattern
1586 # with ignore case.
1587 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1588
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02001589 def test_locale_caching(self):
1590 # Issue #22410
1591 oldlocale = locale.setlocale(locale.LC_CTYPE)
1592 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1593 for loc in 'en_US.iso88591', 'en_US.utf8':
1594 try:
1595 locale.setlocale(locale.LC_CTYPE, loc)
1596 except locale.Error:
1597 # Unsupported locale on this system
1598 self.skipTest('test needs %s locale' % loc)
1599
1600 re.purge()
1601 self.check_en_US_iso88591()
1602 self.check_en_US_utf8()
1603 re.purge()
1604 self.check_en_US_utf8()
1605 self.check_en_US_iso88591()
1606
1607 def check_en_US_iso88591(self):
1608 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
1609 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1610 self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))
1611 self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I))
1612 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1613 self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5'))
1614 self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5'))
1615
1616 def check_en_US_utf8(self):
1617 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
1618 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1619 self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I))
1620 self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I))
1621 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1622 self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
1623 self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
1624
Serhiy Storchakaad446d52014-11-10 13:49:00 +02001625 def test_error(self):
1626 with self.assertRaises(re.error) as cm:
1627 re.compile('(\u20ac))')
1628 err = cm.exception
1629 self.assertIsInstance(err.pattern, str)
1630 self.assertEqual(err.pattern, '(\u20ac))')
1631 self.assertEqual(err.pos, 3)
1632 self.assertEqual(err.lineno, 1)
1633 self.assertEqual(err.colno, 4)
1634 self.assertIn(err.msg, str(err))
1635 self.assertIn(' at position 3', str(err))
1636 self.assertNotIn(' at position 3', err.msg)
1637 # Bytes pattern
1638 with self.assertRaises(re.error) as cm:
1639 re.compile(b'(\xa4))')
1640 err = cm.exception
1641 self.assertIsInstance(err.pattern, bytes)
1642 self.assertEqual(err.pattern, b'(\xa4))')
1643 self.assertEqual(err.pos, 3)
1644 # Multiline pattern
1645 with self.assertRaises(re.error) as cm:
1646 re.compile("""
1647 (
1648 abc
1649 )
1650 )
1651 (
1652 """, re.VERBOSE)
1653 err = cm.exception
1654 self.assertEqual(err.pos, 77)
1655 self.assertEqual(err.lineno, 5)
1656 self.assertEqual(err.colno, 17)
1657 self.assertIn(err.msg, str(err))
1658 self.assertIn(' at position 77', str(err))
1659 self.assertIn('(line 5, column 17)', str(err))
1660
Serhiy Storchaka632a77e2015-03-25 21:03:47 +02001661 def test_misc_errors(self):
1662 self.checkPatternError(r'(', 'missing ), unterminated subpattern', 0)
1663 self.checkPatternError(r'((a|b)', 'missing ), unterminated subpattern', 0)
1664 self.checkPatternError(r'(a|b))', 'unbalanced parenthesis', 5)
1665 self.checkPatternError(r'(?P', 'unexpected end of pattern', 3)
1666 self.checkPatternError(r'(?z)', 'unknown extension ?z', 1)
1667 self.checkPatternError(r'(?iz)', 'unknown flag', 3)
1668 self.checkPatternError(r'(?i', 'missing )', 3)
1669 self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0)
1670 self.checkPatternError(r'(?<', 'unexpected end of pattern', 3)
1671 self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1)
1672 self.checkPatternError(r'(?', 'unexpected end of pattern', 2)
1673
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001674
class PatternReprTests(unittest.TestCase):
    # Tests for repr() of compiled pattern objects.

    def check(self, pattern, expected):
        # Compile without explicit flags and compare the repr.
        self.assertEqual(repr(re.compile(pattern)), expected)

    def check_flags(self, pattern, flags, expected):
        # Compile with explicit flags and compare the repr.
        self.assertEqual(repr(re.compile(pattern, flags)), expected)

    def test_without_flags(self):
        self.check('random pattern',
                   "re.compile('random pattern')")

    def test_single_flag(self):
        self.check_flags('random pattern', re.IGNORECASE,
            "re.compile('random pattern', re.IGNORECASE)")

    def test_multiple_flags(self):
        self.check_flags('random pattern', re.I|re.S|re.X,
            "re.compile('random pattern', "
            "re.IGNORECASE|re.DOTALL|re.VERBOSE)")

    def test_unicode_flag(self):
        # re.U is not shown in the repr of a str pattern.
        self.check_flags('random pattern', re.U,
                         "re.compile('random pattern')")
        self.check_flags('random pattern', re.I|re.S|re.U,
                         "re.compile('random pattern', "
                         "re.IGNORECASE|re.DOTALL)")

    def test_inline_flags(self):
        # Flags set inline in the pattern also appear in the repr.
        self.check('(?i)pattern',
                   "re.compile('(?i)pattern', re.IGNORECASE)")

    def test_unknown_flags(self):
        # Bits without a flag name are shown in hex, after the named flags.
        self.check_flags('random pattern', 0x123000,
                         "re.compile('random pattern', 0x123000)")
        self.check_flags('random pattern', 0x123000|re.I,
            "re.compile('random pattern', re.IGNORECASE|0x123000)")

    def test_bytes(self):
        self.check(b'bytes pattern',
                   "re.compile(b'bytes pattern')")
        self.check_flags(b'bytes pattern', re.A,
                         "re.compile(b'bytes pattern', re.ASCII)")

    def test_locale(self):
        self.check_flags(b'bytes pattern', re.L,
                         "re.compile(b'bytes pattern', re.LOCALE)")

    def test_quotes(self):
        # The pattern is quoted the way repr() of the string quotes it.
        self.check('random "double quoted" pattern',
            '''re.compile('random "double quoted" pattern')''')
        self.check("random 'single quoted' pattern",
            '''re.compile("random 'single quoted' pattern")''')
        self.check('''both 'single' and "double" quotes''',
            '''re.compile('both \\'single\\' and "double" quotes')''')

    def test_long_pattern(self):
        # The repr of a very long pattern is truncated, but any flags are
        # still shown at the end.
        pattern = 'Very %spattern' % ('long ' * 1000)
        r = repr(re.compile(pattern))
        self.assertLess(len(r), 300)
        self.assertEqual(r[:30], "re.compile('Very long long lon")
        r = repr(re.compile(pattern, re.I))
        self.assertLess(len(r), 300)
        self.assertEqual(r[:30], "re.compile('Very long long lon")
        self.assertEqual(r[-16:], ", re.IGNORECASE)")
1739
1740
class ImplementationTest(unittest.TestCase):
    """
    Test implementation details of the re module.
    """

    def test_overlap_table(self):
        # Expected values: entry i is the length of the longest proper
        # prefix of the input that is also a suffix of input[:i + 1].
        f = sre_compile._generate_overlap_table
        self.assertEqual(f(""), [])
        self.assertEqual(f("a"), [0])
        self.assertEqual(f("abcd"), [0, 0, 0, 0])
        self.assertEqual(f("aaaa"), [0, 1, 2, 3])
        self.assertEqual(f("ababba"), [0, 0, 1, 2, 0, 1])
        self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0])
1754
1755
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001756class ExternalTests(unittest.TestCase):
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001757
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001758 def test_re_benchmarks(self):
1759 're_tests benchmarks'
1760 from test.re_tests import benchmarks
1761 for pattern, s in benchmarks:
1762 with self.subTest(pattern=pattern, string=s):
1763 p = re.compile(pattern)
1764 self.assertTrue(p.search(s))
1765 self.assertTrue(p.match(s))
1766 self.assertTrue(p.fullmatch(s))
1767 s2 = ' '*10000 + s + ' '*10000
1768 self.assertTrue(p.search(s2))
1769 self.assertTrue(p.match(s2, 10000))
1770 self.assertTrue(p.match(s2, 10000, 10000 + len(s)))
1771 self.assertTrue(p.fullmatch(s2, 10000, 10000 + len(s)))
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001772
def test_re_tests(self):
    """re_tests test suite

    Run every entry of ``test.re_tests.tests``.  An entry is either
    ``(pattern, string, outcome)`` or
    ``(pattern, string, outcome, repl, expected)``.

    * outcome == SYNTAX_ERROR: compiling the pattern must raise
      ``re.error``.
    * outcome == FAIL: searching the string must return None.
    * otherwise: the search must succeed, ``eval(repl)`` evaluated
      against the match's groups must equal ``expected``, and the
      search must also succeed as a bytes pattern, restricted to the
      span of the first match, and with IGNORECASE and UNICODE.
      A failing LOCALE search is only reported with print().

    Raises ValueError if an entry has neither 3 nor 5 fields.
    """
    from test.re_tests import tests, FAIL, SYNTAX_ERROR

    def group_text(match, key):
        # The repl expressions concatenate group values as strings, so
        # an unmatched group is exposed as "None" and a group that does
        # not exist as "Error".
        try:
            value = match.group(key)
        except IndexError:
            return "Error"
        return "None" if value is None else value

    for t in tests:
        pattern = s = outcome = repl = expected = None
        if len(t) == 5:
            pattern, s, outcome, repl, expected = t
        elif len(t) == 3:
            pattern, s, outcome = t
        else:
            raise ValueError('Test tuples should have 3 or 5 fields', t)

        with self.subTest(pattern=pattern, string=s):
            if outcome == SYNTAX_ERROR:  # Expected a syntax error
                with self.assertRaises(re.error):
                    re.compile(pattern)
                continue

            obj = re.compile(pattern)
            result = obj.search(s)
            if outcome == FAIL:
                self.assertIsNone(result, 'Succeeded incorrectly')
                continue

            with self.subTest():
                self.assertTrue(result, 'Failed incorrectly')
                # Matched, as expected, so now we compute the
                # result string and compare it to our expected result.
                start, end = result.span(0)
                # NOTE(review): 'groups' is bound to result.group() (the
                # whole match), not result.groups(); left unchanged in
                # case an expression in re_tests relies on it -- confirm.
                vardict = {'found': result.group(0),
                           'groups': result.group(),
                           'flags': result.re.flags}
                for i in range(1, 100):
                    vardict['g%d' % i] = group_text(result, i)
                for name in result.re.groupindex:
                    vardict[name] = group_text(result, name)
                # NOTE: eval() is tolerable here only because repl comes
                # from the in-tree test.re_tests data, never from
                # external input.
                self.assertEqual(eval(repl, vardict), expected,
                                 'grouping error')

            # Try the match with both pattern and string converted to
            # bytes, and check that it still succeeds.
            try:
                bpat = bytes(pattern, "ascii")
                bs = bytes(s, "ascii")
            except UnicodeEncodeError:
                # skip non-ascii tests
                pass
            else:
                with self.subTest('bytes pattern match'):
                    obj = re.compile(bpat)
                    self.assertTrue(obj.search(bs))

                # Try the match with LOCALE enabled, and check that it
                # still succeeds.  The outcome is kept in its own
                # variable so that `result` keeps referring to the str
                # match that `start`/`end` were taken from.
                with self.subTest('locale-sensitive match'):
                    obj = re.compile(bpat, re.LOCALE)
                    locale_result = obj.search(bs)
                    if locale_result is None:
                        print('=== Fails on locale-sensitive match', t)

            # Try the match with the search area limited to the extent
            # of the match and see if it still succeeds.  \B will
            # break (because it won't match at the end or start of a
            # string), so we'll ignore patterns that feature it.
            if (pattern[:2] != r'\B' and pattern[-2:] != r'\B'
                    and result is not None):
                with self.subTest('range-limited match'):
                    obj = re.compile(pattern)
                    self.assertTrue(obj.search(s, start, end + 1))

            # Try the match with IGNORECASE enabled, and check that it
            # still succeeds.
            with self.subTest('case-insensitive match'):
                obj = re.compile(pattern, re.IGNORECASE)
                self.assertTrue(obj.search(s))

            # Try the match with UNICODE locale enabled, and check
            # that it still succeeds.
            with self.subTest('unicode-sensitive match'):
                obj = re.compile(pattern, re.UNICODE)
                self.assertTrue(obj.search(s))
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001867
Gregory P. Smith5a631832010-07-27 05:31:29 +00001868
# Script entry point: hand control to unittest's command-line runner.
if "__main__" == __name__:
    unittest.main()