blob: 020656a62a4a2273a9318449fce1aa340fff9078 [file] [log] [blame]
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
Antoine Pitroud2cc7432014-02-03 20:59:59 +01002 cpython_only, captured_stdout
Benjamin Petersone48944b2012-03-07 14:50:25 -06003import io
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02004import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00005import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00006from re import Scanner
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02007import sre_compile
R David Murray26dfaac92013-04-14 13:00:54 -04008import sre_constants
Ezio Melottid2114eb2011-03-25 14:08:44 +02009import sys
10import string
11import traceback
Antoine Pitrou79aa68d2013-10-25 21:36:10 +020012import unittest
Raymond Hettinger027bb632004-05-31 03:09:25 +000013from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000014
Guido van Rossum23b22571997-07-17 22:36:14 +000015# Misc tests from Tim Peters' re.doc
16
Just van Rossum6802c6e2003-07-02 14:36:59 +000017# WARNING: Don't change details in these tests if you don't know
Ezio Melotti42da6632011-03-15 05:18:48 +020018# what you're doing. Some of these tests were carefully modeled to
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +000019# cover most of the code.
20
class S(str):
    """str subclass whose indexing and slicing results are again S objects."""

    def __getitem__(self, index):
        piece = str.__getitem__(self, index)
        return S(piece)
24
class B(bytes):
    """bytes subclass that wraps every __getitem__ result in B again."""

    def __getitem__(self, index):
        piece = bytes.__getitem__(self, index)
        return B(piece)
28
Skip Montanaro8ed06da2003-04-24 19:43:18 +000029class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000030
def assertTypedEqual(self, actual, expect, msg=None):
    # Like assertEqual(), but additionally require that every leaf value
    # has exactly the same type as its expected counterpart.  Tuples and
    # lists in *expect* are walked pairwise; anything else is a leaf.
    self.assertEqual(actual, expect, msg)

    def check_types(got, wanted):
        if not isinstance(wanted, (tuple, list)):
            self.assertIs(type(got), type(wanted), msg)
            return
        for got_item, wanted_item in zip(got, wanted):
            check_types(got_item, wanted_item)

    check_types(actual, expect)
40
def checkPatternError(self, pattern, errmsg, pos=None):
    # Compiling *pattern* must raise re.error whose .msg equals *errmsg*
    # and, when *pos* is given, whose .pos equals *pos*.
    with self.assertRaises(re.error) as caught:
        re.compile(pattern)
    with self.subTest(pattern=pattern):
        failure = caught.exception
        self.assertEqual(failure.msg, errmsg)
        if pos is None:
            return
        self.assertEqual(failure.pos, pos)
49
def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
    # re.sub(pattern, repl, string) must raise re.error whose .msg equals
    # *errmsg* and, when *pos* is given, whose .pos equals *pos*.
    with self.assertRaises(re.error) as caught:
        re.sub(pattern, repl, string)
    with self.subTest(pattern=pattern, repl=repl):
        failure = caught.exception
        self.assertEqual(failure.msg, errmsg)
        if pos is None:
            return
        self.assertEqual(failure.pos, pos)
58
Benjamin Petersone48944b2012-03-07 14:50:25 -060059 def test_keep_buffer(self):
60 # See bug 14212
61 b = bytearray(b'x')
62 it = re.finditer(b'a', b)
63 with self.assertRaises(BufferError):
64 b.extend(b'x'*400)
65 list(it)
66 del it
67 gc_collect()
68 b.extend(b'x'*400)
69
Raymond Hettinger027bb632004-05-31 03:09:25 +000070 def test_weakref(self):
71 s = 'QabbbcR'
72 x = re.compile('ab+c')
73 y = proxy(x)
74 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
75
Skip Montanaro8ed06da2003-04-24 19:43:18 +000076 def test_search_star_plus(self):
77 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
78 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
79 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
80 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030081 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000082 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
83 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
84 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
85 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030086 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000087
def bump_num(self, matchobj):
    # Replacement callback: parse the whole match as an int and return
    # the next integer as a string.
    return str(1 + int(matchobj.group(0)))
Guido van Rossum23b22571997-07-17 22:36:14 +000091
def test_basic_re_sub(self):
    # Basic sub() behaviour: str and bytes-like subjects, callable and
    # template replacements, group references and template escapes.
    self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
    self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
    self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
    self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
    for y in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')

    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3),
                     '9.3 -3 23x99y')

    # A callable's return value is inserted verbatim; a template string
    # has its escapes processed.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    # Raw strings: '\g' is not a valid Python string escape.
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
    # Deliberately non-raw: the template holds the literal control chars.
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
    for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
        with self.subTest(c):
            with self.assertWarns(DeprecationWarning):
                self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)

    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000131
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000132 def test_bug_449964(self):
133 # fails for group followed by other escape
134 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
135 'xx\bxx\b')
136
137 def test_bug_449000(self):
138 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000139 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
140 'abc\ndef\n')
141 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
142 'abc\ndef\n')
143 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
144 'abc\ndef\n')
145 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
146 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000147
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000148 def test_bug_1661(self):
149 # Verify that flags do not get silently ignored with compiled patterns
150 pattern = re.compile('.')
151 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
152 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
153 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
154 self.assertRaises(ValueError, re.compile, pattern, re.I)
155
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000156 def test_bug_3629(self):
157 # A regex that triggered a bug in the sre-code validator
158 re.compile("(?P<quote>)(?(quote))")
159
def test_sub_template_numeric_escape(self):
    # bug 776311 and friends
    # Octal escapes in the replacement template.
    for template, expected in [
        (r'\0', '\0'),
        (r'\000', '\000'),
        (r'\001', '\001'),
        (r'\008', '\0' + '8'),
        (r'\009', '\0' + '9'),
        (r'\111', '\111'),
        (r'\117', '\117'),
        (r'\377', '\377'),

        (r'\1111', '\1111'),
        (r'\1111', '\111' + '1'),

        (r'\00', '\x00'),
        (r'\07', '\x07'),
        (r'\08', '\0' + '8'),
        (r'\09', '\0' + '9'),
        (r'\0a', '\0' + 'a'),
    ]:
        self.assertEqual(re.sub('x', template, 'x'), expected)

    # Octal values above 0o377 are rejected.
    for template in (r'\400', r'\777'):
        self.checkTemplateError('x', template, 'x',
                                r'octal escape value %s outside of '
                                r'range 0-0o377' % template, 0)

    # The pattern 'x' has no groups, so every numeric reference is invalid.
    for template in (
        r'\1',
        r'\8',
        r'\9',
        r'\11',
        r'\18',
        r'\1a',
        r'\90',
        r'\99',
        r'\118',  # r'\11' + '8'
        r'\11a',
        r'\181',  # r'\18' + '1'
        r'\800',  # r'\80' + '0'
    ):
        self.checkTemplateError('x', template, 'x', 'invalid group reference')

    # in python2.3 (etc), these loop endlessly in sre_parser.py
    self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
    ten_groups_then_one = '((((((((((y))))))))))(.)'
    self.assertEqual(re.sub(ten_groups_then_one, r'\118', 'xyz'),
                     'xz8')
    self.assertEqual(re.sub(ten_groups_then_one, r'\11a', 'xyz'),
                     'xza')
206
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000207 def test_qualified_re_sub(self):
208 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
Victor Stinner55e614a2014-10-29 16:58:59 +0100209 self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000210
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000211 def test_bug_114660(self):
212 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
213 'hello there')
214
215 def test_bug_462270(self):
216 # Test for empty sub() behaviour, see SF bug #462270
217 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
218 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
219
def test_symbolic_groups(self):
    # Named groups: valid definitions/references compile, malformed ones
    # raise re.error with a specific message and position.
    re.compile('(?P<a>x)(?P=a)(?(a)y)')
    re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
    # Raw string so that \1 is a backreference rather than chr(1).
    re.compile(r'(?P<a1>x)\1(?(1)y)')
    self.checkPatternError('(?P<a>)(?P<a>)',
                           "redefinition of group name 'a' as group 2; "
                           "was group 1")
    self.checkPatternError('(?Pxy)', 'unknown extension ?Px')
    self.checkPatternError('(?P<a>)(?P=a', 'missing ), unterminated name', 11)
    self.checkPatternError('(?P=', 'missing group name', 4)
    self.checkPatternError('(?P=)', 'missing group name', 4)
    self.checkPatternError('(?P=1)', "bad character in group name '1'", 4)
    self.checkPatternError('(?P=a)', "unknown group name 'a'")
    self.checkPatternError('(?P=a1)', "unknown group name 'a1'")
    self.checkPatternError('(?P=a.)', "bad character in group name 'a.'", 4)
    self.checkPatternError('(?P<)', 'missing >, unterminated name', 4)
    self.checkPatternError('(?P<a', 'missing >, unterminated name', 4)
    self.checkPatternError('(?P<', 'missing group name', 4)
    self.checkPatternError('(?P<>)', 'missing group name', 4)
    self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
    self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
    self.checkPatternError(r'(?(', 'missing group name', 3)
    self.checkPatternError(r'(?())', 'missing group name', 3)
    self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
    self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
    self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
    self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
    # New valid/invalid identifiers in Python 3
    re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
    re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
    self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
    # Support > 100 groups.
    pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    pat = '(?:%s)(?(200)z|t)' % pat
    self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200255
def test_symbolic_refs(self):
    # \g<...> references in replacement templates.  All templates are
    # raw strings because '\g' is not a valid Python string escape.
    self.checkTemplateError('(?P<a>x)', r'\g<a', 'xx',
                            'missing >, unterminated name', 3)
    self.checkTemplateError('(?P<a>x)', r'\g<', 'xx',
                            'missing group name', 3)
    self.checkTemplateError('(?P<a>x)', r'\g', 'xx', 'missing <', 2)
    self.checkTemplateError('(?P<a>x)', r'\g<a a>', 'xx',
                            "bad character in group name 'a a'", 3)
    self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx',
                            'missing group name', 3)
    self.checkTemplateError('(?P<a>x)', r'\g<1a1>', 'xx',
                            "bad character in group name '1a1'", 3)
    self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
                            'invalid group reference')
    self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
                            'invalid group reference')
    with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
        re.sub('(?P<a>x)', r'\g<ab>', 'xx')
    # A reference to a group that did not participate expands to ''.
    self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
    self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
    self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
                            "bad character in group name '-1'", 3)
    # New valid/invalid identifiers in Python 3
    self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
    self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
    self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
                            "bad character in group name '©'", 3)
    # Support > 100 groups.
    pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000286
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000287 def test_re_subn(self):
288 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
289 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
290 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
291 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
Victor Stinner55e614a2014-10-29 16:58:59 +0100292 self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000293
def test_re_split(self):
    # str subjects (including a str subclass); assertTypedEqual also
    # compares the types of the returned pieces.
    for subject in ":a:b::c", S(":a:b::c"):
        for sep, pieces in [
            (":", ['', 'a', 'b', '', 'c']),
            (":+", ['', 'a', 'b', 'c']),
            ("(:+)", ['', ':', 'a', ':', 'b', '::', 'c']),
        ]:
            self.assertTypedEqual(re.split(sep, subject), pieces)

    # bytes-like subjects.
    for subject in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                    memoryview(b":a:b::c")):
        for sep, pieces in [
            (b":", [b'', b'a', b'b', b'', b'c']),
            (b":+", [b'', b'a', b'b', b'c']),
            (b"(:+)", [b'', b':', b'a', b':', b'b', b'::', b'c']),
        ]:
            self.assertTypedEqual(re.split(sep, subject), pieces)

    # Non-ASCII characters from different code point ranges.
    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                    "\U0001d49c\U0001d49e\U0001d4b5"):
        subject = ":%s:%s::%s" % (a, b, c)
        self.assertEqual(re.split(":", subject), ['', a, b, '', c])
        self.assertEqual(re.split(":+", subject), ['', a, b, c])
        self.assertEqual(re.split("(:+)", subject),
                         ['', ':', a, ':', b, '::', c])

    # Grouping variants of the separator.
    for sep, pieces in [
        ("(?::+)", ['', 'a', 'b', 'c']),
        ("(:)+", ['', ':', 'a', ':', 'b', ':', 'c']),
        ("([b:]+)", ['', ':', 'a', ':b::', 'c']),
        ("(b)|(:+)", ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c']),
        ("(?:b)|(?::+)", ['', 'a', '', '', 'c']),
    ]:
        self.assertEqual(re.split(sep, ":a:b::c"), pieces)

    # Separators that can match the empty string must emit FutureWarning.
    for sep, expected in [
        (':*', ['', 'a', 'b', 'c']),
        ('(?::*)', ['', 'a', 'b', 'c']),
        ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
        ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
    ]:
        with self.subTest(sep=sep):
            with self.assertWarns(FutureWarning):
                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)

    # Separators that only match the empty string must raise ValueError.
    for sep, expected in [
        ('', [':a:b::c']),
        (r'\b', [':a:b::c']),
        (r'(?=:)', [':a:b::c']),
        (r'(?<=:)', [':a:b::c']),
    ]:
        with self.subTest(sep=sep):
            with self.assertRaises(ValueError):
                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
346
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000347 def test_qualified_re_split(self):
Victor Stinner55e614a2014-10-29 16:58:59 +0100348 self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
349 self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
350 self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000351 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200352 self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000353 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200354 with self.assertWarns(FutureWarning):
355 self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
356 ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000357
def test_re_findall(self):
    self.assertEqual(re.findall(":+", "abc"), [])
    # str subjects (including a str subclass); assertTypedEqual also
    # compares the types of the returned items.
    for subject in "a:b::c:::d", S("a:b::c:::d"):
        for pattern, found in [
            (":+", [":", "::", ":::"]),
            ("(:+)", [":", "::", ":::"]),
            ("(:)(:*)", [(":", ""), (":", ":"), (":", "::")]),
        ]:
            self.assertTypedEqual(re.findall(pattern, subject), found)
    # bytes-like subjects.
    for subject in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
                    memoryview(b"a:b::c:::d")):
        for pattern, found in [
            (b":+", [b":", b"::", b":::"]),
            (b"(:+)", [b":", b"::", b":::"]),
            (b"(:)(:*)", [(b":", b""), (b":", b":"), (b":", b"::")]),
        ]:
            self.assertTypedEqual(re.findall(pattern, subject), found)
    # Non-ASCII characters from different code point ranges.
    for ch in ("\xe0", "\u0430", "\U0001d49c"):
        twice = ch * 2
        thrice = ch * 3
        subject = "a%sb%sc%sd" % (ch, twice, thrice)
        self.assertEqual(re.findall("%s+" % ch, subject), [ch, twice, thrice])
        self.assertEqual(re.findall("(%s+)" % ch, subject), [ch, twice, thrice])
        self.assertEqual(re.findall("(%s)(%s*)" % (ch, ch), subject),
                         [(ch, ""), (ch, ch), (ch, twice)])
Guido van Rossum49946571997-07-18 04:26:25 +0000383
Skip Montanaro5ba00542003-04-25 16:00:14 +0000384 def test_bug_117612(self):
385 self.assertEqual(re.findall(r"(a|(b))", "aba"),
386 [("a", ""),("b", "b"),("a", "")])
387
def test_re_match(self):
    # groups() and group() on match objects for str, bytes-like and
    # non-ASCII subjects.
    for string in 'a', S('a'):
        self.assertEqual(re.match('a', string).groups(), ())
        self.assertEqual(re.match('(a)', string).groups(), ('a',))
        self.assertEqual(re.match('(a)', string).group(0), 'a')
        self.assertEqual(re.match('(a)', string).group(1), 'a')
        self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a'))
    for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
        self.assertEqual(re.match(b'a', string).groups(), ())
        self.assertEqual(re.match(b'(a)', string).groups(), (b'a',))
        self.assertEqual(re.match(b'(a)', string).group(0), b'a')
        self.assertEqual(re.match(b'(a)', string).group(1), b'a')
        self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
    for a in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.match(a, a).groups(), ())
        self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
        self.assertEqual(re.match('(%s)' % a, a).group(0), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))

    pat = re.compile('((a)|(b))(c)?')
    self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
    self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
    self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
    self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
    self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

    # A single group
    m = re.match('(a)', 'a')
    # group() without arguments must behave like group(0).
    self.assertEqual(m.group(), 'a')
    self.assertEqual(m.group(0), 'a')
    self.assertEqual(m.group(1), 'a')
    self.assertEqual(m.group(1, 1), ('a', 'a'))

    # Groups may be addressed by number, by name, or a mix of both.
    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                     (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000427
def test_re_fullmatch(self):
    # Issue 16203: Proposal: add re.fullmatch() method.
    self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
    for text in "ab", S("ab"):
        self.assertEqual(re.fullmatch(r"a|ab", text).span(), (0, 2))
    for data in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
        self.assertEqual(re.fullmatch(br"a|ab", data).span(), (0, 2))
    for first, second in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
        both = first + second
        alternation = r"%s|%s" % (first, both)
        self.assertEqual(re.fullmatch(alternation, both).span(), (0, 2))

    # Non-greedy repeats must still extend to the end of the string.
    for pattern, text, span in [
        (r".*?$", "abc", (0, 3)),
        (r".*?", "abc", (0, 3)),
        (r"a.*?b", "ab", (0, 2)),
        (r"a.*?b", "abb", (0, 3)),
        (r"a.*?b", "axxb", (0, 4)),
    ]:
        self.assertEqual(re.fullmatch(pattern, text).span(), span)

    # Patterns that cannot cover the whole string.
    for pattern, text in [
        (r"a+", "ab"),
        (r"abc$", "abc\n"),
        (r"abc\Z", "abc\n"),
        (r"(?m)abc$", "abc\n"),
    ]:
        self.assertIsNone(re.fullmatch(pattern, text))

    # Lookahead / lookbehind assertions.
    for pattern, text, span in [
        (r"ab(?=c)cd", "abcd", (0, 4)),
        (r"ab(?<=b)cd", "abcd", (0, 4)),
        (r"(?=a|ab)ab", "ab", (0, 2)),
    ]:
        self.assertEqual(re.fullmatch(pattern, text).span(), span)

    # pos/endpos restrict the region that has to be matched in full.
    for pattern in (r"bc", r".*?$", r".*?"):
        m = re.compile(pattern).fullmatch("abcd", pos=1, endpos=3)
        self.assertEqual(m.span(), (1, 3))
457
def test_re_groupref_exists(self):
    # Conditional subpatterns: (?(group)yes|no).  Patterns are raw strings
    # because '\(' and '\)' are not valid Python string escapes.
    self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
                     ('(', 'a'))
    self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a').groups(),
                     (None, 'a'))
    self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a)'))
    self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a'))
    self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
                     ('a', 'b'))
    self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
                     (None, 'd'))
    self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
                     (None, 'd'))
    self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
                     ('a', ''))

    # Tests for bug #1177831: exercise groups other than the first group
    p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
    self.assertEqual(p.match('abc').groups(),
                     ('a', 'b', 'c'))
    self.assertEqual(p.match('ad').groups(),
                     ('a', None, 'd'))
    self.assertIsNone(p.match('abd'))
    self.assertIsNone(p.match('ac'))

    # Support > 100 groups.
    pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    pat = '(?:%s)(?(200)z)' % pat
    self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))

    self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
    self.checkPatternError(r'()(?(1)a|b',
                           'missing ), unterminated subpattern', 2)
    self.checkPatternError(r'()(?(1)a|b|c)',
                           'conditional backref with more than '
                           'two branches', 10)
494
def test_re_groupref_overflow(self):
    # Group numbers equal to sre_constants.MAXGROUPS must be rejected both
    # in replacement templates and in conditional patterns.
    # Raw template: '\g' is not a valid Python string escape.
    self.checkTemplateError('()', r'\g<%s>' % sre_constants.MAXGROUPS, 'xx',
                            'invalid group reference', 3)
    self.checkPatternError(r'(?P<a>)(?(%d))' % sre_constants.MAXGROUPS,
                           'invalid group reference', 10)
500
def test_re_groupref(self):
    # Numeric backreferences, including references to optional groups.
    for pattern, text, groups in [
        (r'^(\|)?([^()]+)\1$', '|a|', ('|', 'a')),
        (r'^(\|)?([^()]+)\1?$', 'a', (None, 'a')),
    ]:
        self.assertEqual(re.match(pattern, text).groups(), groups)
    for text in ('a|', '|a'):
        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', text))
    for pattern, text, groups in [
        (r'^(?:(a)|c)(\1)$', 'aa', ('a', 'a')),
        (r'^(?:(a)|c)(\1)?$', 'c', (None, None)),
    ]:
        self.assertEqual(re.match(pattern, text).groups(), groups)

    self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
514
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000515 def test_groupdict(self):
516 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
517 'first second').groupdict(),
518 {'first':'first', 'second':'second'})
519
520 def test_expand(self):
521 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
522 "first second")
523 .expand(r"\2 \1 \g<second> \g<first>"),
524 "second first second first")
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300525 self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
526 "first")
527 .expand(r"\2 \g<second>"),
528 " ")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000529
def test_repeat_minmax(self):
    # {m}, {m,n} and their non-greedy forms.  Patterns containing '\w'
    # are raw strings because '\w' is not a valid Python string escape.
    self.assertIsNone(re.match(r"^(\w){1}$", "abc"))
    self.assertIsNone(re.match(r"^(\w){1}?$", "abc"))
    self.assertIsNone(re.match(r"^(\w){1,2}$", "abc"))
    self.assertIsNone(re.match(r"^(\w){1,2}?$", "abc"))

    # Greedy forms.
    self.assertEqual(re.match(r"^(\w){3}$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){1,3}$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){1,4}$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){3,4}$", "abc").group(1), "c")
    # Non-greedy forms.
    self.assertEqual(re.match(r"^(\w){3}?$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){1,3}?$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){1,4}?$", "abc").group(1), "c")
    self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c")

    self.assertIsNone(re.match("^x{1}$", "xxx"))
    self.assertIsNone(re.match("^x{1}?$", "xxx"))
    self.assertIsNone(re.match("^x{1,2}$", "xxx"))
    self.assertIsNone(re.match("^x{1,2}?$", "xxx"))

    # Greedy forms.
    self.assertTrue(re.match("^x{3}$", "xxx"))
    self.assertTrue(re.match("^x{1,3}$", "xxx"))
    self.assertTrue(re.match("^x{3,3}$", "xxx"))
    self.assertTrue(re.match("^x{1,4}$", "xxx"))
    self.assertTrue(re.match("^x{3,4}$", "xxx"))
    # Non-greedy forms.
    self.assertTrue(re.match("^x{3}?$", "xxx"))
    self.assertTrue(re.match("^x{1,3}?$", "xxx"))
    self.assertTrue(re.match("^x{1,4}?$", "xxx"))
    self.assertTrue(re.match("^x{3,4}?$", "xxx"))

    # '{}' without numbers is matched literally.
    self.assertIsNone(re.match("^x{}$", "xxx"))
    self.assertTrue(re.match("^x{}$", "x{}"))

    self.checkPatternError(r'x{2,1}',
                           'min repeat greater than max repeat', 2)
565
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000566 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000567 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000568 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000569 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
570 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
571 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
572 {'first': 1, 'other': 2})
573
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000574 self.assertEqual(re.match("(a)", "a").pos, 0)
575 self.assertEqual(re.match("(a)", "a").endpos, 1)
576 self.assertEqual(re.match("(a)", "a").string, "a")
577 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300578 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000579
580 def test_special_escapes(self):
581 self.assertEqual(re.search(r"\b(b.)\b",
582 "abcd abc bcd bx").group(1), "bx")
583 self.assertEqual(re.search(r"\B(b.)\B",
584 "abc bcd bc abxd").group(1), "bx")
585 self.assertEqual(re.search(r"\b(b.)\b",
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300586 "abcd abc bcd bx", re.ASCII).group(1), "bx")
587 self.assertEqual(re.search(r"\B(b.)\B",
588 "abc bcd bc abxd", re.ASCII).group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000589 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
590 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300591 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300592 self.assertEqual(re.search(br"\b(b.)\b",
593 b"abcd abc bcd bx").group(1), b"bx")
594 self.assertEqual(re.search(br"\B(b.)\B",
595 b"abc bcd bc abxd").group(1), b"bx")
596 self.assertEqual(re.search(br"\b(b.)\b",
597 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
598 self.assertEqual(re.search(br"\B(b.)\B",
599 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
600 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
601 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300602 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000603 self.assertEqual(re.search(r"\d\D\w\W\s\S",
604 "1aa! a").group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300605 self.assertEqual(re.search(br"\d\D\w\W\s\S",
606 b"1aa! a").group(0), b"1aa! a")
607 self.assertEqual(re.search(r"\d\D\w\W\s\S",
608 "1aa! a", re.ASCII).group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300609 self.assertEqual(re.search(br"\d\D\w\W\s\S",
610 b"1aa! a", re.LOCALE).group(0), b"1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000611
def test_other_escapes(self):
    # Escaped punctuation is taken literally; escaped ASCII letters with
    # no special meaning still match themselves but emit a warning.
    self.checkPatternError("\\", 'bad escape (end of pattern)', 0)

    # (pattern, text that matches in full, text that must not match)
    literal_cases = (
        (r"\(", '(', ')'),
        (r"\\", '\\', None),
        (r"[\]]", ']', '['),
        (r"[a\-c]", '-', 'b'),
        (r"[\^a]+", 'a^', 'b'),
    )
    for pattern, hit, miss in literal_cases:
        self.assertEqual(re.match(pattern, hit).group(), hit)
        if miss is not None:
            self.assertIsNone(re.match(pattern, miss))

    re.purge()  # for warnings
    # (pattern template, letters to try): outside and inside a set.
    deprecated_cases = (
        ('\\%c', 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY'),
        ('[\\%c]', 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ'),
    )
    for template, letters in deprecated_cases:
        for c in letters:
            with self.subTest(c):
                with self.assertWarns(DeprecationWarning):
                    escaped = template % c
                    self.assertEqual(re.fullmatch(escaped, c).group(), c)
                    self.assertIsNone(re.match(template % c, 'a'))
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200634
Ezio Melotti5a045b92012-02-29 11:48:44 +0200635 def test_string_boundaries(self):
636 # See http://bugs.python.org/issue10713
637 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
638 "abc")
639 # There's a word boundary at the start of a string.
640 self.assertTrue(re.match(r"\b", "abc"))
641 # A non-empty string includes a non-boundary zero-length match.
642 self.assertTrue(re.search(r"\B", "abc"))
643 # There is no non-boundary match at the start of a string.
644 self.assertFalse(re.match(r"\B", "abc"))
645 # However, an empty string contains no word boundaries, and also no
646 # non-boundaries.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300647 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200648 # This one is questionable and different from the perlre behaviour,
649 # but describes current behavior.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300650 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200651 # A single word-character string has two boundaries, but no
652 # non-boundary gaps.
653 self.assertEqual(len(re.findall(r"\b", "a")), 2)
654 self.assertEqual(len(re.findall(r"\B", "a")), 0)
655 # If there are no words, there are no boundaries
656 self.assertEqual(len(re.findall(r"\b", " ")), 0)
657 self.assertEqual(len(re.findall(r"\b", " ")), 0)
658 # Can match around the whitespace.
659 self.assertEqual(len(re.findall(r"\B", " ")), 2)
660
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000661 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000662 self.assertEqual(re.match("([\u2222\u2223])",
663 "\u2222").group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300664 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300665 self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000666
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100667 def test_big_codesize(self):
668 # Issue #1160
669 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300670 self.assertTrue(r.match('1000'))
671 self.assertTrue(r.match('9999'))
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100672
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000673 def test_anyall(self):
674 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
675 "a\nb")
676 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
677 "a\n\nb")
678
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200679 def test_lookahead(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000680 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
681 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
682 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
683 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
684 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
685 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
686 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
687
688 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
689 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
690 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
691 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
692
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200693 # Group reference.
694 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
695 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
696 # Conditional group reference.
697 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
698 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
699 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
700 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
701 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
702 # Group used before defined.
703 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
704 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
705 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
706
707 def test_lookbehind(self):
708 self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
709 self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
710 self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
711 self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
712 # Group reference.
713 self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
714 self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
715 self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
716 self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
717 # Conditional group reference.
718 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
719 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
720 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
721 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
722 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
723 # Group used before defined.
724 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
725 self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc'))
726 self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc'))
727 # Group defined in the same lookbehind pattern
728 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)')
729 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)')
730 self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)')
731 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)')
732
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000733 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000734 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300735 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000736 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
737 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
738 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
739 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
740 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
741 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
742 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
743 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
744
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200745 assert '\u212a'.lower() == 'k' # 'K'
746 self.assertTrue(re.match(r'K', '\u212a', re.I))
747 self.assertTrue(re.match(r'k', '\u212a', re.I))
748 self.assertTrue(re.match(r'\u212a', 'K', re.I))
749 self.assertTrue(re.match(r'\u212a', 'k', re.I))
750 assert '\u017f'.upper() == 'S' # 'ſ'
751 self.assertTrue(re.match(r'S', '\u017f', re.I))
752 self.assertTrue(re.match(r's', '\u017f', re.I))
753 self.assertTrue(re.match(r'\u017f', 'S', re.I))
754 self.assertTrue(re.match(r'\u017f', 's', re.I))
755 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
756 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
757 self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
758
759 def test_ignore_case_set(self):
760 self.assertTrue(re.match(r'[19A]', 'A', re.I))
761 self.assertTrue(re.match(r'[19a]', 'a', re.I))
762 self.assertTrue(re.match(r'[19a]', 'A', re.I))
763 self.assertTrue(re.match(r'[19A]', 'a', re.I))
764 self.assertTrue(re.match(br'[19A]', b'A', re.I))
765 self.assertTrue(re.match(br'[19a]', b'a', re.I))
766 self.assertTrue(re.match(br'[19a]', b'A', re.I))
767 self.assertTrue(re.match(br'[19A]', b'a', re.I))
768 assert '\u212a'.lower() == 'k' # 'K'
769 self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
770 self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
771 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
772 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
773 assert '\u017f'.upper() == 'S' # 'ſ'
774 self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
775 self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
776 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
777 self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
778 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
779 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
780 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
781
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200782 def test_ignore_case_range(self):
783 # Issues #3511, #17381.
784 self.assertTrue(re.match(r'[9-a]', '_', re.I))
785 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
786 self.assertTrue(re.match(br'[9-a]', b'_', re.I))
787 self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
788 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
789 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
790 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
791 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
792 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
793 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
794 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
795 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
796 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
797 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
798 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
799 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
800
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200801 assert '\u212a'.lower() == 'k' # 'K'
802 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
803 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
804 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
805 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
806 assert '\u017f'.upper() == 'S' # 'ſ'
807 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
808 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
809 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
810 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
811 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
812 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
813 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
814
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000815 def test_category(self):
816 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
817
818 def test_getlower(self):
819 import _sre
820 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
821 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
822 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200823 self.assertEqual(_sre.getlower(ord('A'), re.ASCII), ord('a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000824
825 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300826 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200827 self.assertEqual(re.match("abc", "ABC", re.I|re.A).group(0), "ABC")
828 self.assertEqual(re.match(b"abc", b"ABC", re.I|re.L).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000829
830 def test_not_literal(self):
831 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
832 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
833
834 def test_search_coverage(self):
835 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
836 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
837
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    # Assert that matcher(pattern, text) succeeds, that the matched text
    # equals `match` and that its span equals `span`.  When both are
    # omitted the pattern is expected to match the whole of `text`.
    # Supplying only one of them raises ValueError.
    if (match is None) != (span is None):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if match is None:
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    found = matcher(pattern, text)
    self.assertTrue(found)
    self.assertEqual(found.group(), match)
    self.assertEqual(found.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000851
def test_re_escape(self):
    # re.escape() on every character from U+0000 to U+00FF: ASCII
    # alphanumerics and '_' are left alone, NUL becomes \000, anything
    # else gets a backslash in front.
    alnum_chars = string.ascii_letters + string.digits + '_'
    all_chars = ''.join(map(chr, range(256)))
    for char in all_chars:
        if char in alnum_chars:
            expected = char
        elif char == '\x00':
            expected = '\\000'
        else:
            expected = '\\' + char
        self.assertEqual(re.escape(char), expected)
        self.assertMatch(re.escape(char), char)
    self.assertMatch(re.escape(all_chars), all_chars)
Guido van Rossum49946571997-07-18 04:26:25 +0000864
def test_re_escape_byte(self):
    # Bytes counterpart of the str escaping check: every byte value 0-255.
    alnum_chars = (string.ascii_letters + string.digits + '_').encode('ascii')
    all_bytes = bytes(range(256))
    for value in all_bytes:
        single = bytes([value])
        if single in alnum_chars:
            expected = single
        elif value == 0:
            expected = b'\\000'
        else:
            expected = b'\\' + single
        self.assertEqual(re.escape(single), expected)
        self.assertMatch(re.escape(single), single)
    self.assertMatch(re.escape(all_bytes), all_bytes)
Guido van Rossum698280d2008-09-10 17:44:35 +0000878
def test_re_escape_non_ascii(self):
    # Non-ASCII characters are backslash-escaped and still match.
    text = 'xxx\u2620\u2620\u2620xxx'
    escaped_text = re.escape(text)
    self.assertEqual(escaped_text, 'xxx\\\u2620\\\u2620\\\u2620xxx')
    self.assertMatch(escaped_text, text)
    # The escaped character also works under a quantifier.
    repeated = '.%s+.' % re.escape('\u2620')
    self.assertMatch(repeated, text,
                     'x\u2620\u2620\u2620x', (2, 7), re.search)
886
def test_re_escape_non_ascii_bytes(self):
    # Each byte of a UTF-8 encoded non-ASCII character is escaped
    # separately.
    data = 'y\u2620y\u2620y'.encode('utf-8')
    escaped_data = re.escape(data)
    self.assertEqual(escaped_data,
                     b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
    self.assertMatch(escaped_data, data)
    encoded_char = '\u2620'.encode('utf-8')
    occurrences = re.findall(re.escape(encoded_char), data)
    self.assertEqual(len(occurrences), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000894
Serhiy Storchakab85a9762014-09-15 11:33:19 +0300895 def test_pickling(self):
896 import pickle
897 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
898 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
899 pickled = pickle.dumps(oldpat, proto)
900 newpat = pickle.loads(pickled)
901 self.assertEqual(newpat, oldpat)
902 # current pickle expects the _compile() reconstructor in re module
903 from re import _compile
Guido van Rossum23b22571997-07-17 22:36:14 +0000904
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000905 def test_constants(self):
906 self.assertEqual(re.I, re.IGNORECASE)
907 self.assertEqual(re.L, re.LOCALE)
908 self.assertEqual(re.M, re.MULTILINE)
909 self.assertEqual(re.S, re.DOTALL)
910 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000911
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000912 def test_flags(self):
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200913 for flag in [re.I, re.M, re.X, re.S, re.A, re.U]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300914 self.assertTrue(re.compile('^pattern$', flag))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200915 for flag in [re.I, re.M, re.X, re.S, re.A, re.L]:
916 self.assertTrue(re.compile(b'^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +0000917
def test_sre_character_literals(self):
    # Octal, \x, \u and \U escapes in str patterns, each tried alone and
    # followed by one more literal character.
    octal_and_hex = (
        (r"\%03o", ""), (r"\%03o0", "0"), (r"\%03o8", "8"),
        (r"\x%02x", ""), (r"\x%02x0", "0"), (r"\x%02xz", "z"),
    )
    short_unicode = ((r"\u%04x", ""), (r"\u%04x0", "0"), (r"\u%04xz", "z"))
    long_unicode = ((r"\U%08x", ""), (r"\U%08x0", "0"), (r"\U%08xz", "z"))
    for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        # Pick the escape forms able to express this code point.
        templates = []
        if i < 256:
            templates.extend(octal_and_hex)
        if i < 0x10000:
            templates.extend(short_unicode)
        templates.extend(long_unicode)
        for template, suffix in templates:
            self.assertTrue(re.match(template % i, chr(i) + suffix))

    for pattern, text in ((r"\0", "\000"), (r"\08", "\0008"),
                          (r"\01", "\001"), (r"\018", "\0018")):
        self.assertTrue(re.match(pattern, text))

    self.checkPatternError(r"\567",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 0)
    self.checkPatternError(r"\911", 'invalid group reference', 0)
    # Truncated escapes are reported with or without a trailing letter.
    for escape in (r"\x1", r"\u123", r"\U0001234"):
        message = 'incomplete escape ' + escape
        self.checkPatternError(escape, message, 0)
        self.checkPatternError(escape + "z", message, 0)
    self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0)
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000949
def test_sre_character_class_literals(self):
    # The same numeric escapes, this time inside a character set.
    octal_and_hex = (
        r"[\%o]", r"[\%o8]", r"[\%03o]", r"[\%03o0]", r"[\%03o8]",
        r"[\x%02x]", r"[\x%02x0]", r"[\x%02xz]",
    )
    short_unicode = (r"[\u%04x]", r"[\u%04x0]", r"[\u%04xz]")
    for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        char = chr(i)
        if i < 256:
            for template in octal_and_hex:
                self.assertTrue(re.match(template % i, char))
        if i < 0x10000:
            for template in short_unicode:
                self.assertTrue(re.match(template % i, char))
        for template, suffix in ((r"[\U%08x]", ""), (r"[\U%08x0]", "0"),
                                 (r"[\U%08xz]", "z")):
            self.assertTrue(re.match(template % i, char + suffix))

    self.checkPatternError(r"[\567]",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 1)
    self.checkPatternError(r"[\911]", r'bad escape \9', 1)
    for escape in (r"\x1", r"\u123", r"\U0001234"):
        self.checkPatternError("[%sz]" % escape,
                               'incomplete escape ' + escape, 1)
    self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1)
    self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200977
def test_sre_byte_literals(self):
    # Octal and \x escapes in bytes patterns; \u and \U have no special
    # meaning there and only trigger a warning.
    templates = (
        (r"\%03o", b""), (r"\%03o0", b"0"), (r"\%03o8", b"8"),
        (r"\x%02x", b""), (r"\x%02x0", b"0"), (r"\x%02xz", b"z"),
    )
    for i in [0, 8, 16, 32, 64, 127, 128, 255]:
        byte = bytes([i])
        for template, suffix in templates:
            pattern = (template % i).encode()
            self.assertTrue(re.match(pattern, byte + suffix))

    for pattern, text in ((br"\u1234", b'u1234'),
                          (br"\U00012345", b'U00012345')):
        with self.assertWarns(DeprecationWarning):
            self.assertTrue(re.match(pattern, text))

    for pattern, text in ((br"\0", b"\000"), (br"\08", b"\0008"),
                          (br"\01", b"\001"), (br"\018", b"\0018")):
        self.assertTrue(re.match(pattern, text))

    self.checkPatternError(br"\567",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 0)
    self.checkPatternError(br"\911", 'invalid group reference', 0)
    for pattern in (br"\x1", br"\x1z"):
        self.checkPatternError(pattern, r'incomplete escape \x1', 0)
Antoine Pitrou463badf2012-06-23 13:29:19 +02001000
def test_sre_byte_class_literals(self):
    # Octal and \x escapes inside a character set of a bytes pattern.
    templates = (
        r"[\%o]", r"[\%o8]", r"[\%03o]", r"[\%03o0]", r"[\%03o8]",
        r"[\x%02x]", r"[\x%02x0]", r"[\x%02xz]",
    )
    for i in [0, 8, 16, 32, 64, 127, 128, 255]:
        byte = bytes([i])
        for template in templates:
            self.assertTrue(re.match((template % i).encode(), byte))

    # \u and \U are not escapes in bytes patterns: the set just contains
    # the letter, and compiling it warns.
    for pattern, text in ((br"[\u1234]", b'u'), (br"[\U00012345]", b'U')):
        with self.assertWarns(DeprecationWarning):
            self.assertTrue(re.match(pattern, text))

    self.checkPatternError(br"[\567]",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 1)
    self.checkPatternError(br"[\911]", r'bad escape \9', 1)
    self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1)
1020
def test_character_set_errors(self):
    # Malformed character sets and the error each one reports.
    for unterminated in (r'[', r'[^', r'[a'):
        self.checkPatternError(unterminated, 'unterminated character set', 0)
    # bug 545855 -- This pattern failed to cause a compile error as it
    # should, instead provoking a TypeError.
    self.checkPatternError(r"[a-", 'unterminated character set', 0)
    # The offending range is echoed back in the message.
    for bad_range in (r"\w-b", r"a-\w", "b-a"):
        self.checkPatternError("[%s]" % bad_range,
                               'bad character range ' + bad_range, 1)
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +00001031
Skip Montanaro7d9963f2003-04-25 14:12:40 +00001032 def test_bug_113254(self):
1033 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
1034 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
1035 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
1036
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001037 def test_bug_527371(self):
1038 # bug described in patches 527371/672491
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001039 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001040 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
1041 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
1042 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
1043 self.assertEqual(re.match("((a))", "a").lastindex, 1)
1044
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001045 def test_bug_418626(self):
1046 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
1047 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
1048 # pattern '*?' on a long string.
1049 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
1050 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
1051 20003)
1052 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001053 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +00001054 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001055 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001056
1057 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001058 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001059 self.assertEqual(re.compile(pat) and 1, 1)
1060
Skip Montanaro1e703c62003-04-25 15:40:28 +00001061 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001062 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +00001063 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001064 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
1065 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
1066 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +00001067
def test_nothing_to_repeat(self):
    # A repeat operator with nothing before it, at the very start of the
    # pattern and at the start of a non-capturing group.
    operators = [reps + mod
                 for reps in ('*', '+', '?', '{1,2}')
                 for mod in ('', '?')]
    for op in operators:
        self.checkPatternError(op, 'nothing to repeat', 0)
        self.checkPatternError('(?:%s)' % op, 'nothing to repeat', 3)
1075
def test_multiple_repeat(self):
    # One repeat operator applied directly to another; the reported
    # position is that of the outer operator.
    modifiers = ('', '?')
    outer_ops = [reps + mod
                 for reps in ('*', '+', '{1,2}')
                 for mod in modifiers]
    inner_ops = [reps + mod
                 for reps in ('*', '+', '?', '{1,2}')
                 for mod in modifiers]
    for outer_op in outer_ops:
        for inner_op in inner_ops:
            self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
                                   'multiple repeat', 1 + len(inner_op))
1085
Serhiy Storchakafa468162013-02-16 21:23:53 +02001086 def test_unlimited_zero_width_repeat(self):
1087 # Issue #9669
1088 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
1089 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
1090 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
1091 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
1092 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
1093 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
1094
Skip Montanaro1e703c62003-04-25 15:40:28 +00001095 def test_scanner(self):
1096 def s_ident(scanner, token): return token
1097 def s_operator(scanner, token): return "op%s" % token
1098 def s_float(scanner, token): return float(token)
1099 def s_int(scanner, token): return int(token)
1100
1101 scanner = Scanner([
1102 (r"[a-zA-Z_]\w*", s_ident),
1103 (r"\d+\.\d*", s_float),
1104 (r"\d+", s_int),
1105 (r"=|\+|-|\*|/", s_operator),
1106 (r"\s+", None),
1107 ])
1108
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001109 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +00001110
Skip Montanaro1e703c62003-04-25 15:40:28 +00001111 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
1112 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
1113 'op+', 'bar'], ''))
1114
Skip Montanaro5ba00542003-04-25 16:00:14 +00001115 def test_bug_448951(self):
1116 # bug 448951 (similar to 429357, but with single char match)
1117 # (Also test greedy matches.)
1118 for op in '','?','*':
1119 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
1120 (None, None))
1121 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
1122 ('a:', 'a'))
1123
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001124 def test_bug_725106(self):
1125 # capturing groups in alternatives in repeats
1126 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
1127 ('b', 'a'))
1128 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
1129 ('c', 'b'))
1130 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
1131 ('b', None))
1132 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
1133 ('b', None))
1134 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
1135 ('b', 'a'))
1136 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
1137 ('c', 'b'))
1138 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
1139 ('b', None))
1140 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
1141 ('b', None))
1142
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +00001143 def test_bug_725149(self):
1144 # mark_stack_base restoring before restoring marks
1145 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
1146 ('a', None))
1147 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
1148 ('a', None, None))
1149
Just van Rossum12723ba2003-07-02 20:03:04 +00001150 def test_bug_764548(self):
1151 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001152 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +00001153 pat = re.compile(my_unicode("abc"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001154 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +00001155
Skip Montanaro5ba00542003-04-25 16:00:14 +00001156 def test_finditer(self):
1157 iter = re.finditer(r":+", "a:b::c:::d")
1158 self.assertEqual([item.group(0) for item in iter],
1159 [":", "::", ":::"])
1160
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001161 pat = re.compile(r":+")
1162 iter = pat.finditer("a:b::c:::d", 1, 10)
1163 self.assertEqual([item.group(0) for item in iter],
1164 [":", "::", ":::"])
1165
1166 pat = re.compile(r":+")
1167 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
1168 self.assertEqual([item.group(0) for item in iter],
1169 [":", "::", ":::"])
1170
1171 pat = re.compile(r":+")
1172 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1173 self.assertEqual([item.group(0) for item in iter],
1174 [":", "::", ":::"])
1175
1176 pat = re.compile(r":+")
1177 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1178 self.assertEqual([item.group(0) for item in iter],
1179 ["::", "::"])
1180
Thomas Wouters40a088d2008-03-18 20:19:54 +00001181 def test_bug_926075(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001182 self.assertIsNot(re.compile('bug_926075'),
1183 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +00001184
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001185 def test_bug_931848(self):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001186 pattern = "[\u002E\u3002\uFF0E\uFF61]"
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001187 self.assertEqual(re.compile(pattern).split("a.b.c"),
1188 ['a','b','c'])
1189
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001190 def test_bug_581080(self):
1191 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +00001192 self.assertEqual(next(iter).span(), (1,2))
1193 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001194
1195 scanner = re.compile(r"\s").scanner("a b")
1196 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001197 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001198
1199 def test_bug_817234(self):
1200 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +00001201 self.assertEqual(next(iter).span(), (0, 4))
1202 self.assertEqual(next(iter).span(), (4, 4))
1203 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001204
Mark Dickinson1f268282009-07-28 17:22:36 +00001205 def test_bug_6561(self):
1206 # '\d' should match characters in Unicode category 'Nd'
1207 # (Number, Decimal Digit), but not those in 'Nl' (Number,
1208 # Letter) or 'No' (Number, Other).
1209 decimal_digits = [
1210 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1211 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1212 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1213 ]
1214 for x in decimal_digits:
1215 self.assertEqual(re.match('^\d$', x).group(0), x)
1216
1217 not_decimal_digits = [
1218 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1219 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1220 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1221 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1222 ]
1223 for x in not_decimal_digits:
1224 self.assertIsNone(re.match('^\d$', x))
1225
Guido van Rossumd8faa362007-04-27 19:54:29 +00001226 def test_empty_array(self):
1227 # SF buf 1647541
1228 import array
Guido van Rossum166746c2007-07-03 15:39:16 +00001229 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +00001230 a = array.array(typecode)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001231 self.assertIsNone(re.compile(b"bla").match(a))
Antoine Pitroufd036452008-08-19 17:56:33 +00001232 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001233
Christian Heimes072c0f12008-01-03 23:01:04 +00001234 def test_inline_flags(self):
1235 # Bug #1700
Serhiy Storchakaab140882014-11-11 21:13:28 +02001236 upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
1237 lower_char = '\u1ea1' # Latin Small Letter A with Dot Below
Christian Heimes072c0f12008-01-03 23:01:04 +00001238
1239 p = re.compile(upper_char, re.I | re.U)
1240 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001241 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001242
1243 p = re.compile(lower_char, re.I | re.U)
1244 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001245 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001246
1247 p = re.compile('(?i)' + upper_char, re.U)
1248 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001249 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001250
1251 p = re.compile('(?i)' + lower_char, re.U)
1252 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001253 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001254
1255 p = re.compile('(?iu)' + upper_char)
1256 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001257 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001258
1259 p = re.compile('(?iu)' + lower_char)
1260 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001261 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001262
Christian Heimes25bb7832008-01-11 16:17:00 +00001263 def test_dollar_matches_twice(self):
1264 "$ matches the end of string, and just before the terminating \n"
1265 pattern = re.compile('$')
1266 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1267 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1268 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1269
1270 pattern = re.compile('$', re.MULTILINE)
1271 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1272 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1273 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1274
Antoine Pitroufd036452008-08-19 17:56:33 +00001275 def test_bytes_str_mixing(self):
1276 # Mixing str and bytes is disallowed
1277 pat = re.compile('.')
1278 bpat = re.compile(b'.')
1279 self.assertRaises(TypeError, pat.match, b'b')
1280 self.assertRaises(TypeError, bpat.match, 'b')
1281 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1282 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1283 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1284 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1285 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1286 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1287
1288 def test_ascii_and_unicode_flag(self):
1289 # String patterns
1290 for flags in (0, re.UNICODE):
1291 pat = re.compile('\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001292 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001293 pat = re.compile('\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001294 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001295 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001296 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001297 pat = re.compile('(?a)\xc0', re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001298 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001299 pat = re.compile('\w', re.ASCII)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001300 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001301 pat = re.compile('(?a)\w')
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001302 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001303 # Bytes patterns
1304 for flags in (0, re.ASCII):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001305 pat = re.compile(b'\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001306 self.assertIsNone(pat.match(b'\xe0'))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001307 pat = re.compile(b'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001308 self.assertIsNone(pat.match(b'\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001309 # Incompatibilities
1310 self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
1311 self.assertRaises(ValueError, re.compile, b'(?u)\w')
1312 self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
1313 self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
1314 self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
1315 self.assertRaises(ValueError, re.compile, '(?au)\w')
1316
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001317 def test_locale_flag(self):
1318 import locale
1319 _, enc = locale.getlocale(locale.LC_CTYPE)
1320 # Search non-ASCII letter
1321 for i in range(128, 256):
1322 try:
1323 c = bytes([i]).decode(enc)
1324 sletter = c.lower()
1325 if sletter == c: continue
1326 bletter = sletter.encode(enc)
1327 if len(bletter) != 1: continue
1328 if bletter.decode(enc) != sletter: continue
1329 bpat = re.escape(bytes([i]))
1330 break
1331 except (UnicodeError, TypeError):
1332 pass
1333 else:
1334 bletter = None
1335 bpat = b'A'
1336 # Bytes patterns
1337 pat = re.compile(bpat, re.LOCALE | re.IGNORECASE)
1338 if bletter:
1339 self.assertTrue(pat.match(bletter))
1340 pat = re.compile(b'(?L)' + bpat, re.IGNORECASE)
1341 if bletter:
1342 self.assertTrue(pat.match(bletter))
1343 pat = re.compile(bpat, re.IGNORECASE)
1344 if bletter:
1345 self.assertIsNone(pat.match(bletter))
1346 pat = re.compile(b'\w', re.LOCALE)
1347 if bletter:
1348 self.assertTrue(pat.match(bletter))
1349 pat = re.compile(b'(?L)\w')
1350 if bletter:
1351 self.assertTrue(pat.match(bletter))
1352 pat = re.compile(b'\w')
1353 if bletter:
1354 self.assertIsNone(pat.match(bletter))
1355 # Incompatibilities
1356 self.assertWarns(DeprecationWarning, re.compile, '', re.LOCALE)
1357 self.assertWarns(DeprecationWarning, re.compile, '(?L)')
1358 self.assertWarns(DeprecationWarning, re.compile, b'', re.LOCALE | re.ASCII)
1359 self.assertWarns(DeprecationWarning, re.compile, b'(?L)', re.ASCII)
1360 self.assertWarns(DeprecationWarning, re.compile, b'(?a)', re.LOCALE)
1361 self.assertWarns(DeprecationWarning, re.compile, b'(?aL)')
1362
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001363 def test_bug_6509(self):
1364 # Replacement strings of both types must parse properly.
1365 # all strings
1366 pat = re.compile('a(\w)')
1367 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1368 pat = re.compile('a(.)')
1369 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1370 pat = re.compile('..')
1371 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1372
1373 # all bytes
1374 pat = re.compile(b'a(\w)')
1375 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1376 pat = re.compile(b'a(.)')
1377 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1378 pat = re.compile(b'..')
1379 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1380
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001381 def test_dealloc(self):
1382 # issue 3299: check for segfault in debug build
1383 import _sre
Ezio Melotti0f77f462010-01-23 10:49:39 +00001384 # the overflow limit is different on wide and narrow builds and it
1385 # depends on the definition of SRE_CODE (see sre.h).
1386 # 2**128 should be big enough to overflow on both. For smaller values
1387 # a RuntimeError is raised instead of OverflowError.
1388 long_overflow = 2**128
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001389 self.assertRaises(TypeError, re.finditer, "a", {})
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001390 with self.assertRaises(OverflowError):
1391 _sre.compile("abc", 0, [long_overflow], 0, [], [])
1392 with self.assertRaises(TypeError):
1393 _sre.compile({}, 0, [], 0, [], [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 def test_search_dot_unicode(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001396 self.assertTrue(re.search("123.*-", '123abc-'))
1397 self.assertTrue(re.search("123.*-", '123\xe9-'))
1398 self.assertTrue(re.search("123.*-", '123\u20ac-'))
1399 self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1400 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401
Ezio Melottidf723e12012-03-13 01:29:48 +02001402 def test_compile(self):
1403 # Test return value when given string and pattern as parameter
1404 pattern = re.compile('random pattern')
1405 self.assertIsInstance(pattern, re._pattern_type)
1406 same_pattern = re.compile(pattern)
1407 self.assertIsInstance(same_pattern, re._pattern_type)
1408 self.assertIs(same_pattern, pattern)
1409 # Test behaviour when not given a string or pattern as parameter
1410 self.assertRaises(TypeError, re.compile, 0)
1411
Ezio Melottife8e6e72013-01-11 08:32:01 +02001412 def test_bug_13899(self):
1413 # Issue #13899: re pattern r"[\A]" should work like "A" but matches
1414 # nothing. Ditto B and Z.
Serhiy Storchakaa54aae02015-03-24 22:58:14 +02001415 with self.assertWarns(DeprecationWarning):
1416 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
1417 ['A', 'B', '\b', 'C', 'Z'])
Ezio Melottife8e6e72013-01-11 08:32:01 +02001418
Antoine Pitroub33941a2012-12-03 20:55:56 +01001419 @bigmemtest(size=_2G, memuse=1)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001420 def test_large_search(self, size):
1421 # Issue #10182: indices were 32-bit-truncated.
1422 s = 'a' * size
1423 m = re.search('$', s)
1424 self.assertIsNotNone(m)
Antoine Pitrou86067c22012-12-03 21:08:43 +01001425 self.assertEqual(m.start(), size)
1426 self.assertEqual(m.end(), size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001427
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001428 # The huge memuse is because of re.sub() using a list and a join()
1429 # to create the replacement result.
Antoine Pitroub33941a2012-12-03 20:55:56 +01001430 @bigmemtest(size=_2G, memuse=16 + 2)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001431 def test_large_subn(self, size):
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001432 # Issue #10182: indices were 32-bit-truncated.
1433 s = 'a' * size
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001434 r, n = re.subn('', '', s)
1435 self.assertEqual(r, s)
1436 self.assertEqual(n, size + 1)
1437
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001438 def test_bug_16688(self):
1439 # Issue 16688: Backreferences make case-insensitive regex fail on
1440 # non-ASCII strings.
1441 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1442 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001443
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001444 def test_repeat_minmax_overflow(self):
1445 # Issue #13169
1446 string = "x" * 100000
1447 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1448 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1449 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1450 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1451 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1452 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1453 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1454 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1455 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1456 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1457 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1458
1459 @cpython_only
1460 def test_repeat_minmax_overflow_maxrepeat(self):
1461 try:
1462 from _sre import MAXREPEAT
1463 except ImportError:
1464 self.skipTest('requires _sre.MAXREPEAT constant')
1465 string = "x" * 100000
1466 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1467 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1468 (0, 100000))
1469 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1470 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1471 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1472 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1473
def test_backref_group_name_in_exception(self):
    """Issue 17341: the error for a bad backreference name quotes that name."""
    pattern = '(?P=<foo>)'
    message = "bad character in group name '<foo>'"
    self.checkPatternError(pattern, message, 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001478
def test_group_name_in_exception(self):
    """Issue 17341: the error for a bad group name quotes that name."""
    pattern = '(?P<?foo>)'
    message = "bad character in group name '?foo'"
    self.checkPatternError(pattern, message, 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001483
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001484 def test_issue17998(self):
1485 for reps in '*', '+', '?', '{1}':
1486 for mod in '', '?':
1487 pattern = '.' + reps + mod + 'yz'
1488 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1489 ['xyz'], msg=pattern)
1490 pattern = pattern.encode()
1491 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1492 [b'xyz'], msg=pattern)
1493
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03001494 def test_match_repr(self):
1495 for string in '[abracadabra]', S('[abracadabra]'):
1496 m = re.search(r'(.+)(.*?)\1', string)
1497 self.assertEqual(repr(m), "<%s.%s object; "
1498 "span=(1, 12), match='abracadabra'>" %
1499 (type(m).__module__, type(m).__qualname__))
1500 for string in (b'[abracadabra]', B(b'[abracadabra]'),
1501 bytearray(b'[abracadabra]'),
1502 memoryview(b'[abracadabra]')):
1503 m = re.search(rb'(.+)(.*?)\1', string)
1504 self.assertEqual(repr(m), "<%s.%s object; "
1505 "span=(1, 12), match=b'abracadabra'>" %
1506 (type(m).__module__, type(m).__qualname__))
1507
1508 first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
1509 self.assertEqual(repr(first), "<%s.%s object; "
1510 "span=(0, 2), match='aa'>" %
1511 (type(second).__module__, type(first).__qualname__))
1512 self.assertEqual(repr(second), "<%s.%s object; "
1513 "span=(3, 5), match='bb'>" %
1514 (type(second).__module__, type(second).__qualname__))
1515
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001516
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001517 def test_bug_2537(self):
1518 # issue 2537: empty submatches
1519 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1520 for inner_op in ('{0,}', '*', '?'):
1521 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1522 m = r.match("xyyzy")
1523 self.assertEqual(m.group(0), "xyy")
1524 self.assertEqual(m.group(1), "")
1525 self.assertEqual(m.group(2), "y")
1526
def test_debug_flag(self):
    """re.DEBUG prints the parse tree on every compile of the pattern."""
    pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
    expected_dump = '''\
SUBPATTERN 1
  LITERAL 46
SUBPATTERN None
  BRANCH
    IN
      LITERAL 99
      LITERAL 104
  OR
    LITERAL 112
    LITERAL 121
SUBPATTERN None
  GROUPREF_EXISTS 1
    AT AT_END
  ELSE
    LITERAL 58
    LITERAL 32
'''
    # Debug output is output again even a second time (bypassing
    # the cache -- issue #20426), so both passes must print the dump.
    for _ in range(2):
        with captured_stdout() as out:
            re.compile(pat, re.DEBUG)
        self.assertEqual(out.getvalue(), expected_dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001555
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001556 def test_keyword_parameters(self):
1557 # Issue #20283: Accepting the string keyword parameter.
1558 pat = re.compile(r'(ab)')
1559 self.assertEqual(
1560 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1561 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001562 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1563 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001564 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1565 self.assertEqual(
1566 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1567 self.assertEqual(
1568 pat.split(string='abracadabra', maxsplit=1),
1569 ['', 'ab', 'racadabra'])
1570 self.assertEqual(
1571 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1572 (7, 9))
1573
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001574 def test_bug_20998(self):
1575 # Issue #20998: Fullmatch of repeated single character pattern
1576 # with ignore case.
1577 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1578
def test_locale_caching(self):
    """Cached patterns must follow LC_CTYPE changes (Issue #22410)."""
    saved_locale = locale.setlocale(locale.LC_CTYPE)
    self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_locale)

    # Both locales are required; skip if either cannot be activated.
    for loc in ('en_US.iso88591', 'en_US.utf8'):
        try:
            locale.setlocale(locale.LC_CTYPE, loc)
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs %s locale' % loc)

    # Run the two locale checks in both orders, clearing the pattern
    # cache before each pass.
    orders = (
        (self.check_en_US_iso88591, self.check_en_US_utf8),
        (self.check_en_US_utf8, self.check_en_US_iso88591),
    )
    for checks in orders:
        re.purge()
        for check in checks:
            check()
1596
def check_en_US_iso88591(self):
    """Under en_US.iso88591, bytes 0xC5 and 0xE5 match each other case-insensitively."""
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    # (pattern, subject) pairs that must all match.
    pairs = (
        (b'\xc5\xe5', b'\xc5\xe5'),
        (b'\xc5', b'\xe5'),
        (b'\xe5', b'\xc5'),
    )
    # Flags passed as an argument ...
    for pattern, subject in pairs:
        self.assertTrue(re.match(pattern, subject, re.L|re.I))
    # ... and the same flags given inline.
    for pattern, subject in pairs:
        self.assertTrue(re.match(b'(?Li)' + pattern, subject))
1605
def check_en_US_utf8(self):
    """Under en_US.utf8, bytes 0xC5 and 0xE5 only match themselves."""
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    # (pattern, subject, whether a match is expected)
    cases = (
        (b'\xc5\xe5', b'\xc5\xe5', True),
        (b'\xc5', b'\xe5', False),
        (b'\xe5', b'\xc5', False),
    )

    def verify(result, should_match):
        if should_match:
            self.assertTrue(result)
        else:
            self.assertIsNone(result)

    # Flags passed as an argument ...
    for pattern, subject, should_match in cases:
        verify(re.match(pattern, subject, re.L|re.I), should_match)
    # ... and the same flags given inline.
    for pattern, subject, should_match in cases:
        verify(re.match(b'(?Li)' + pattern, subject), should_match)
1614
Serhiy Storchakaad446d52014-11-10 13:49:00 +02001615 def test_error(self):
1616 with self.assertRaises(re.error) as cm:
1617 re.compile('(\u20ac))')
1618 err = cm.exception
1619 self.assertIsInstance(err.pattern, str)
1620 self.assertEqual(err.pattern, '(\u20ac))')
1621 self.assertEqual(err.pos, 3)
1622 self.assertEqual(err.lineno, 1)
1623 self.assertEqual(err.colno, 4)
1624 self.assertIn(err.msg, str(err))
1625 self.assertIn(' at position 3', str(err))
1626 self.assertNotIn(' at position 3', err.msg)
1627 # Bytes pattern
1628 with self.assertRaises(re.error) as cm:
1629 re.compile(b'(\xa4))')
1630 err = cm.exception
1631 self.assertIsInstance(err.pattern, bytes)
1632 self.assertEqual(err.pattern, b'(\xa4))')
1633 self.assertEqual(err.pos, 3)
1634 # Multiline pattern
1635 with self.assertRaises(re.error) as cm:
1636 re.compile("""
1637 (
1638 abc
1639 )
1640 )
1641 (
1642 """, re.VERBOSE)
1643 err = cm.exception
1644 self.assertEqual(err.pos, 77)
1645 self.assertEqual(err.lineno, 5)
1646 self.assertEqual(err.colno, 17)
1647 self.assertIn(err.msg, str(err))
1648 self.assertIn(' at position 77', str(err))
1649 self.assertIn('(line 5, column 17)', str(err))
1650
def test_misc_errors(self):
    """Assorted malformed patterns: expected error message and position."""
    # (pattern, expected message, expected error position)
    cases = (
        (r'(', 'missing ), unterminated subpattern', 0),
        (r'((a|b)', 'missing ), unterminated subpattern', 0),
        (r'(a|b))', 'unbalanced parenthesis', 5),
        (r'(?P', 'unexpected end of pattern', 3),
        (r'(?z)', 'unknown extension ?z', 1),
        (r'(?iz)', 'unknown flag', 3),
        (r'(?i', 'missing )', 3),
        (r'(?#abc', 'missing ), unterminated comment', 0),
        (r'(?<', 'unexpected end of pattern', 3),
        (r'(?<>)', 'unknown extension ?<>', 1),
        (r'(?', 'unexpected end of pattern', 2),
    )
    for pattern, message, position in cases:
        self.checkPatternError(pattern, message, position)
1663
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001664
class PatternReprTests(unittest.TestCase):
    """Checks for the repr() of compiled pattern objects."""

    def check(self, pattern, expected):
        """Assert that *pattern* compiled without flags has repr *expected*."""
        compiled = re.compile(pattern)
        self.assertEqual(repr(compiled), expected)

    def check_flags(self, pattern, flags, expected):
        """Assert that *pattern* compiled with *flags* has repr *expected*."""
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        expected = "re.compile('random pattern')"
        self.check('random pattern', expected)

    def test_single_flag(self):
        expected = "re.compile('random pattern', re.IGNORECASE)"
        self.check_flags('random pattern', re.IGNORECASE, expected)

    def test_multiple_flags(self):
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.check_flags('random pattern', re.I|re.S|re.X, expected)

    def test_unicode_flag(self):
        # re.U is not shown in the repr of a str pattern.
        cases = (
            (re.U, "re.compile('random pattern')"),
            (re.I|re.S|re.U, "re.compile('random pattern', "
                             "re.IGNORECASE|re.DOTALL)"),
        )
        for flags, expected in cases:
            self.check_flags('random pattern', flags, expected)

    def test_inline_flags(self):
        expected = "re.compile('(?i)pattern', re.IGNORECASE)"
        self.check('(?i)pattern', expected)

    def test_unknown_flags(self):
        # Unrecognised flag bits are rendered in hexadecimal.
        cases = (
            (0x123000,
             "re.compile('random pattern', 0x123000)"),
            (0x123000|re.I,
             "re.compile('random pattern', re.IGNORECASE|0x123000)"),
        )
        for flags, expected in cases:
            self.check_flags('random pattern', flags, expected)

    def test_bytes(self):
        self.check(b'bytes pattern', "re.compile(b'bytes pattern')")
        self.check_flags(b'bytes pattern', re.A,
                         "re.compile(b'bytes pattern', re.ASCII)")

    def test_locale(self):
        expected = "re.compile(b'bytes pattern', re.LOCALE)"
        self.check_flags(b'bytes pattern', re.L, expected)

    def test_quotes(self):
        cases = (
            ('random "double quoted" pattern',
             '''re.compile('random "double quoted" pattern')'''),
            ("random 'single quoted' pattern",
             '''re.compile("random 'single quoted' pattern")'''),
            ('''both 'single' and "double" quotes''',
             '''re.compile('both \\'single\\' and "double" quotes')'''),
        )
        for pattern, expected in cases:
            self.check(pattern, expected)

    def test_long_pattern(self):
        # The repr of a very long pattern is truncated but keeps its
        # beginning and, when present, the trailing flags.
        pattern = 'Very %spattern' % ('long ' * 1000)
        prefix = "re.compile('Very long long lon"

        plain = repr(re.compile(pattern))
        self.assertLess(len(plain), 300)
        self.assertEqual(plain[:30], prefix)

        flagged = repr(re.compile(pattern, re.I))
        self.assertLess(len(flagged), 300)
        self.assertEqual(flagged[:30], prefix)
        self.assertEqual(flagged[-16:], ", re.IGNORECASE)")
1729
1730
class ImplementationTest(unittest.TestCase):
    """
    Test implementation details of the re module.
    """

    def test_overlap_table(self):
        # (prefix, expected overlap table) for the private helper in
        # sre_compile.
        build_table = sre_compile._generate_overlap_table
        cases = (
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        )
        for prefix, table in cases:
            self.assertEqual(build_table(prefix), table)
1744
1745
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02001746class ExternalTests(unittest.TestCase):
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001747
def test_re_benchmarks(self):
    're_tests benchmarks'
    from test.re_tests import benchmarks
    pad_width = 10000
    padding = ' ' * pad_width
    for pattern, s in benchmarks:
        with self.subTest(pattern=pattern, string=s):
            p = re.compile(pattern)
            # The bare subject must match with every entry point.
            self.assertTrue(p.search(s))
            self.assertTrue(p.match(s))
            self.assertTrue(p.fullmatch(s))
            # The same subject embedded in padding, addressed through
            # explicit start/end offsets.
            s2 = padding + s + padding
            start = pad_width
            stop = pad_width + len(s)
            self.assertTrue(p.search(s2))
            self.assertTrue(p.match(s2, start))
            self.assertTrue(p.match(s2, start, stop))
            self.assertTrue(p.fullmatch(s2, start, stop))
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001762
def test_re_tests(self):
    """Run the table-driven regression suite from ``test.re_tests``.

    Each table entry is either ``(pattern, string, outcome)`` or
    ``(pattern, string, outcome, repl, expected)``.  Depending on
    *outcome* the pattern must fail to compile (``SYNTAX_ERROR``), must
    not match (``FAIL``), or must match; in the last case ``repl`` is
    evaluated against the match's groups and compared to ``expected``.

    Every pattern that is expected to match is then re-checked as a
    bytes pattern, with ``re.LOCALE``, restricted to the span of the
    original match, with ``re.IGNORECASE`` and with ``re.UNICODE``.

    Raises:
        ValueError: if a table entry has neither 3 nor 5 fields.
    """
    from test.re_tests import tests, FAIL, SYNTAX_ERROR

    def group_text(match, key):
        # The ``repl`` expressions concatenate strings, so a group that
        # did not participate is exposed as "None" and a group that does
        # not exist as "Error".
        try:
            value = match.group(key)
        except IndexError:
            return "Error"
        return "None" if value is None else value

    for t in tests:
        repl = expected = None
        if len(t) == 5:
            pattern, s, outcome, repl, expected = t
        elif len(t) == 3:
            pattern, s, outcome = t
        else:
            raise ValueError('Test tuples should have 3 or 5 fields', t)

        with self.subTest(pattern=pattern, string=s):
            if outcome == SYNTAX_ERROR:  # Expected a syntax error
                with self.assertRaises(re.error):
                    re.compile(pattern)
                continue

            # Keep the primary match under its own name: later checks
            # must not overwrite it, because the range-limited check
            # below depends on its span.
            match = re.compile(pattern).search(s)
            if outcome == FAIL:
                self.assertIsNone(match, 'Succeeded incorrectly')
                continue

            with self.subTest():
                self.assertTrue(match, 'Failed incorrectly')
                # Matched, as expected, so now we compute the
                # result string and compare it to our expected result.
                vardict = {'found': match.group(0),
                           'groups': match.group(),
                           'flags': match.re.flags}
                for i in range(1, 100):
                    vardict['g%d' % i] = group_text(match, i)
                for name in match.re.groupindex:
                    vardict[name] = group_text(match, name)
                # NOTE: eval() is applied to expressions taken from the
                # imported test table, not to external input.
                self.assertEqual(eval(repl, vardict), expected,
                                 'grouping error')

            # Try the match with both pattern and string converted to
            # bytes, and check that it still succeeds.
            try:
                bpat = bytes(pattern, "ascii")
                bs = bytes(s, "ascii")
            except UnicodeEncodeError:
                # skip non-ascii tests
                pass
            else:
                with self.subTest('bytes pattern match'):
                    self.assertTrue(re.compile(bpat).search(bs))

                # Try the match with LOCALE enabled.  A miss is only
                # reported, not asserted.
                with self.subTest('locale-sensitive match'):
                    locale_match = re.compile(bpat, re.LOCALE).search(bs)
                    if locale_match is None:
                        print('=== Fails on locale-sensitive match', t)

            # Try the match with the search area limited to the extent
            # of the match and see if it still succeeds.  \B will
            # break (because it won't match at the end or start of a
            # string), so we'll ignore patterns that feature it.  The
            # span is taken from the primary match, so this check is
            # skipped when that match failed.
            if (match is not None
                    and pattern[:2] != r'\B' and pattern[-2:] != r'\B'):
                start, end = match.span(0)
                with self.subTest('range-limited match'):
                    self.assertTrue(
                        re.compile(pattern).search(s, start, end + 1))

            # Try the match with IGNORECASE enabled, and check that it
            # still succeeds.
            with self.subTest('case-insensitive match'):
                self.assertTrue(re.compile(pattern, re.IGNORECASE).search(s))

            # Try the match with UNICODE enabled, and check that it
            # still succeeds.
            with self.subTest('unicode-sensitive match'):
                self.assertTrue(re.compile(pattern, re.UNICODE).search(s))
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001857
Gregory P. Smith5a631832010-07-27 05:31:29 +00001858
# Running this file directly executes every test case defined in it.
if __name__ == "__main__":
    unittest.main()