blob: aaed3d893aaf94d25000ca5a455a6dd881da76c4 [file] [log] [blame]
Victor Stinnerd6debb22017-03-27 16:05:26 +02001from test.support import (gc_collect, bigmemtest, _2G,
2 cpython_only, captured_stdout)
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02003import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00004import re
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02005import sre_compile
Ezio Melottid2114eb2011-03-25 14:08:44 +02006import string
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02007import unittest
Victor Stinnerb44fb122016-11-21 16:35:08 +01008import warnings
9from re import Scanner
Raymond Hettinger027bb632004-05-31 03:09:25 +000010from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000011
# Misc tests from Tim Peters' re.doc

# WARNING: Don't change details in these tests if you don't know
# what you're doing. Some of these tests were carefully modeled to
# cover most of the code.
17
class S(str):
    """str subclass whose item/slice access returns S instead of plain str."""

    def __getitem__(self, index):
        # Re-wrap so that indexing and slicing preserve the subclass type.
        return S(super().__getitem__(index))
21
class B(bytes):
    """bytes subclass whose item/slice access is re-wrapped in B."""

    def __getitem__(self, index):
        # Re-wrap the result of bytes.__getitem__ in the subclass.
        return B(super().__getitem__(index))
25
Skip Montanaro8ed06da2003-04-24 19:43:18 +000026class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000027
def assertTypedEqual(self, actual, expect, msg=None):
    """Assert that *actual* equals *expect* and has the same types.

    Equality is checked first with assertEqual.  Tuples and lists in
    *expect* are then walked pairwise with *actual*; every other value
    must have exactly the same type as its expected counterpart.
    """
    self.assertEqual(actual, expect, msg)

    def recurse(actual, expect):
        if isinstance(expect, (tuple, list)):
            for x, y in zip(actual, expect):
                recurse(x, y)
        else:
            # Exact type identity: a subclass instance does not pass.
            self.assertIs(type(actual), type(expect), msg)

    recurse(actual, expect)
37
def checkPatternError(self, pattern, errmsg, pos=None):
    """Assert that compiling *pattern* raises re.error with message *errmsg*.

    When *pos* is given, the error's ``pos`` attribute must equal it too.
    """
    with self.assertRaises(re.error) as cm:
        re.compile(pattern)
    # Report message/position mismatches per pattern.
    with self.subTest(pattern=pattern):
        err = cm.exception
        self.assertEqual(err.msg, errmsg)
        if pos is not None:
            self.assertEqual(err.pos, pos)
46
def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
    """Assert that re.sub(pattern, repl, string) raises re.error.

    The error message must equal *errmsg*; when *pos* is given, the
    error's ``pos`` attribute must equal it as well.
    """
    with self.assertRaises(re.error) as cm:
        re.sub(pattern, repl, string)
    # Report message/position mismatches per pattern/template pair.
    with self.subTest(pattern=pattern, repl=repl):
        err = cm.exception
        self.assertEqual(err.msg, errmsg)
        if pos is not None:
            self.assertEqual(err.pos, pos)
55
def test_keep_buffer(self):
    # See bug 14212
    b = bytearray(b'x')
    it = re.finditer(b'a', b)
    # While the iterator is alive, resizing the bytearray must fail.
    with self.assertRaises(BufferError):
        b.extend(b'x'*400)
    list(it)
    del it
    gc_collect()
    # Once the iterator is gone the bytearray can be resized again.
    b.extend(b'x'*400)
66
Raymond Hettinger027bb632004-05-31 03:09:25 +000067 def test_weakref(self):
68 s = 'QabbbcR'
69 x = re.compile('ab+c')
70 y = proxy(x)
71 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
72
Skip Montanaro8ed06da2003-04-24 19:43:18 +000073 def test_search_star_plus(self):
74 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
75 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
76 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
77 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030078 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000079 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
80 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
81 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
82 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030083 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000084
def bump_num(self, matchobj):
    """Return the matched integer plus one, as a string (re.sub callback)."""
    int_value = int(matchobj.group(0))
    return str(int_value + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000088
def test_basic_re_sub(self):
    # The result type follows the string type: str for str-like input,
    # bytes for every bytes-like input.
    self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
    self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
    self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
    self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
    for y in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')

    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                     '9.3 -3 23x99y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3),
                     '9.3 -3 23x99y')

    # A callable's return value is used verbatim; a template is unescaped.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', s.replace('\\', r'\\'), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
    for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
        with self.subTest(c):
            # These letter escapes must be rejected in a template.  No
            # result comparison is needed: re.sub() is expected to raise.
            with self.assertRaises(re.error):
                re.sub('a', '\\' + c, 'a')

    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000130
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000131 def test_bug_449964(self):
132 # fails for group followed by other escape
R David Murray44b548d2016-09-08 13:59:53 -0400133 self.assertEqual(re.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx'),
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000134 'xx\bxx\b')
135
136 def test_bug_449000(self):
137 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000138 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
139 'abc\ndef\n')
140 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
141 'abc\ndef\n')
142 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
143 'abc\ndef\n')
144 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
145 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000146
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000147 def test_bug_1661(self):
148 # Verify that flags do not get silently ignored with compiled patterns
149 pattern = re.compile('.')
150 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
151 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
152 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
153 self.assertRaises(ValueError, re.compile, pattern, re.I)
154
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000155 def test_bug_3629(self):
156 # A regex that triggered a bug in the sre-code validator
157 re.compile("(?P<quote>)(?(quote))")
158
def test_sub_template_numeric_escape(self):
    # bug 776311 and friends
    self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
    self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
    self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
    self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
    self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
    self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
    self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
    self.assertEqual(re.sub('x', r'\377', 'x'), '\377')

    self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
    self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')

    self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
    self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
    self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
    self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
    self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')

    self.checkTemplateError('x', r'\400', 'x',
                            r'octal escape value \400 outside of '
                            r'range 0-0o377', 0)
    self.checkTemplateError('x', r'\777', 'x',
                            r'octal escape value \777 outside of '
                            r'range 0-0o377', 0)

    # (template, string, group number named in the error message)
    invalid_references = [
        (r'\1', 'x', 1),
        (r'\8', 'x', 8),
        (r'\9', 'x', 9),
        (r'\11', 'x', 11),
        (r'\18', 'x', 18),
        (r'\1a', 'x', 1),
        (r'\90', 'x', 90),
        (r'\99', 'x', 99),
        (r'\118', 'x', 11),
        (r'\11a', 'x', 11),
        (r'\181', 'x', 18),
        (r'\800', 'x', 80),
        (r'\8', '', 8),
    ]
    for repl, string, group in invalid_references:
        self.checkTemplateError('x', repl, string,
                                'invalid group reference %d' % group, 1)

    # in python2.3 (etc), these loop endlessly in sre_parser.py
    self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
    self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
                     'xz8')
    self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
                     'xza')
206
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000207 def test_qualified_re_sub(self):
208 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
Serhiy Storchakab02f8fc2016-09-25 20:36:23 +0300209 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Victor Stinner55e614a2014-10-29 16:58:59 +0100210 self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000211
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000212 def test_bug_114660(self):
213 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
214 'hello there')
215
216 def test_bug_462270(self):
217 # Test for empty sub() behaviour, see SF bug #462270
218 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
219 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
220
def test_symbolic_groups(self):
    # Valid uses of named groups, named backreferences and conditionals.
    re.compile(r'(?P<a>x)(?P=a)(?(a)y)')
    re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)')
    re.compile(r'(?P<a1>x)\1(?(1)y)')
    self.checkPatternError(r'(?P<a>)(?P<a>)',
                           "redefinition of group name 'a' as group 2; "
                           "was group 1")
    self.checkPatternError(r'(?P<a>(?P=a))',
                           "cannot refer to an open group", 10)
    self.checkPatternError(r'(?Pxy)', 'unknown extension ?Px')
    self.checkPatternError(r'(?P<a>)(?P=a', 'missing ), unterminated name', 11)
    self.checkPatternError(r'(?P=', 'missing group name', 4)
    self.checkPatternError(r'(?P=)', 'missing group name', 4)
    self.checkPatternError(r'(?P=1)', "bad character in group name '1'", 4)
    self.checkPatternError(r'(?P=a)', "unknown group name 'a'")
    self.checkPatternError(r'(?P=a1)', "unknown group name 'a1'")
    self.checkPatternError(r'(?P=a.)', "bad character in group name 'a.'", 4)
    self.checkPatternError(r'(?P<)', 'missing >, unterminated name', 4)
    self.checkPatternError(r'(?P<a', 'missing >, unterminated name', 4)
    self.checkPatternError(r'(?P<', 'missing group name', 4)
    self.checkPatternError(r'(?P<>)', 'missing group name', 4)
    self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
    self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
    self.checkPatternError(r'(?(', 'missing group name', 3)
    self.checkPatternError(r'(?())', 'missing group name', 3)
    self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
    self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
    self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
    self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
    # New valid/invalid identifiers in Python 3
    re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
    re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
    self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
    # Support > 100 groups.
    pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    pat = '(?:%s)(?(200)z|t)' % pat
    # 0xc8 == 200, so 'xc8y' matches the 200th alternative and sets a200.
    self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200258
def test_symbolic_refs(self):
    # Malformed or unknown group references in a replacement template.
    self.checkTemplateError('(?P<a>x)', r'\g<a', 'xx',
                            'missing >, unterminated name', 3)
    self.checkTemplateError('(?P<a>x)', r'\g<', 'xx',
                            'missing group name', 3)
    self.checkTemplateError('(?P<a>x)', r'\g', 'xx', 'missing <', 2)
    self.checkTemplateError('(?P<a>x)', r'\g<a a>', 'xx',
                            "bad character in group name 'a a'", 3)
    self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx',
                            'missing group name', 3)
    self.checkTemplateError('(?P<a>x)', r'\g<1a1>', 'xx',
                            "bad character in group name '1a1'", 3)
    self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
                            'invalid group reference 2', 3)
    self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
                            'invalid group reference 2', 1)
    # An unknown group *name* is reported as IndexError, not re.error.
    with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
        re.sub('(?P<a>x)', r'\g<ab>', 'xx')
    # A reference to a group that did not participate expands to ''.
    self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
    self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
    self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
                            "bad character in group name '-1'", 3)
    # New valid/invalid identifiers in Python 3
    self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
    self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
    self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
                            "bad character in group name '©'", 3)
    # Support > 100 groups.
    pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000289
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000290 def test_re_subn(self):
291 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
292 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
293 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
294 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
Serhiy Storchakab02f8fc2016-09-25 20:36:23 +0300295 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Victor Stinner55e614a2014-10-29 16:58:59 +0100296 self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000297
def test_re_split(self):
    # Pieces are str for str-like input and bytes for bytes-like input.
    for string in ":a:b::c", S(":a:b::c"):
        self.assertTypedEqual(re.split(":", string),
                              ['', 'a', 'b', '', 'c'])
        self.assertTypedEqual(re.split(":+", string),
                              ['', 'a', 'b', 'c'])
        self.assertTypedEqual(re.split("(:+)", string),
                              ['', ':', 'a', ':', 'b', '::', 'c'])
    for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                   memoryview(b":a:b::c")):
        self.assertTypedEqual(re.split(b":", string),
                              [b'', b'a', b'b', b'', b'c'])
        self.assertTypedEqual(re.split(b":+", string),
                              [b'', b'a', b'b', b'c'])
        self.assertTypedEqual(re.split(b"(:+)", string),
                              [b'', b':', b'a', b':', b'b', b'::', b'c'])
    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                    "\U0001d49c\U0001d49e\U0001d4b5"):
        string = ":%s:%s::%s" % (a, b, c)
        self.assertEqual(re.split(":", string), ['', a, b, '', c])
        self.assertEqual(re.split(":+", string), ['', a, b, c])
        self.assertEqual(re.split("(:+)", string),
                         ['', ':', a, ':', b, '::', c])

    self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
    self.assertEqual(re.split("(:)+", ":a:b::c"),
                     ['', ':', 'a', ':', 'b', ':', 'c'])
    self.assertEqual(re.split("([b:]+)", ":a:b::c"),
                     ['', ':', 'a', ':b::', 'c'])
    self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
                     ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c'])
    self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
                     ['', 'a', '', '', 'c'])

    # Separators that can match the empty string.  Since Python 3.7 an
    # empty match directly after a non-empty match also splits, which
    # produces the extra empty pieces below.
    for sep, expected in [
        (':*', ['', '', 'a', '', 'b', '', 'c', '']),
        ('(?::*)', ['', '', 'a', '', 'b', '', 'c', '']),
        ('(:*)', ['', ':', '', '', 'a', ':', '', '', 'b', '::', '', '',
                  'c', '', '']),
        ('(:)*', ['', ':', '', None, 'a', ':', '', None, 'b', ':', '', None,
                  'c', None, '']),
    ]:
        with self.subTest(sep=sep):
            self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)

    # Separators that only ever match the empty string.
    for sep, expected in [
        ('', ['', ':', 'a', ':', 'b', ':', ':', 'c', '']),
        (r'\b', [':', 'a', ':', 'b', '::', 'c', '']),
        (r'(?=:)', ['', ':a', ':b', ':', ':c']),
        (r'(?<=:)', [':', 'a:', 'b:', ':', 'c']),
    ]:
        with self.subTest(sep=sep):
            self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
350
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000351 def test_qualified_re_split(self):
Serhiy Storchakab02f8fc2016-09-25 20:36:23 +0300352 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
Victor Stinner55e614a2014-10-29 16:58:59 +0100353 self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
354 self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
355 self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000356 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200357 self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000358 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200359 self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
360 ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000361
def test_re_findall(self):
    self.assertEqual(re.findall(":+", "abc"), [])
    # Results are str for str-like input and bytes for bytes-like input;
    # with several groups each result is a tuple.
    for string in "a:b::c:::d", S("a:b::c:::d"):
        self.assertTypedEqual(re.findall(":+", string),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:+)", string),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:)(:*)", string),
                              [(":", ""), (":", ":"), (":", "::")])
    for string in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
                   memoryview(b"a:b::c:::d")):
        self.assertTypedEqual(re.findall(b":+", string),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:+)", string),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:)(:*)", string),
                              [(b":", b""), (b":", b":"), (b":", b"::")])
    for x in ("\xe0", "\u0430", "\U0001d49c"):
        xx = x * 2
        xxx = x * 3
        string = "a%sb%sc%sd" % (x, xx, xxx)
        self.assertEqual(re.findall("%s+" % x, string), [x, xx, xxx])
        self.assertEqual(re.findall("(%s+)" % x, string), [x, xx, xxx])
        self.assertEqual(re.findall("(%s)(%s*)" % (x, x), string),
                         [(x, ""), (x, x), (x, xx)])
Guido van Rossum49946571997-07-18 04:26:25 +0000387
Skip Montanaro5ba00542003-04-25 16:00:14 +0000388 def test_bug_117612(self):
389 self.assertEqual(re.findall(r"(a|(b))", "aba"),
390 [("a", ""),("b", "b"),("a", "")])
391
def test_re_match(self):
    # groups()/group() on str-like, bytes-like and non-ASCII subjects.
    for string in 'a', S('a'):
        self.assertEqual(re.match('a', string).groups(), ())
        self.assertEqual(re.match('(a)', string).groups(), ('a',))
        self.assertEqual(re.match('(a)', string).group(0), 'a')
        self.assertEqual(re.match('(a)', string).group(1), 'a')
        self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a'))
    for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
        self.assertEqual(re.match(b'a', string).groups(), ())
        self.assertEqual(re.match(b'(a)', string).groups(), (b'a',))
        self.assertEqual(re.match(b'(a)', string).group(0), b'a')
        self.assertEqual(re.match(b'(a)', string).group(1), b'a')
        self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
    for a in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.match(a, a).groups(), ())
        self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
        self.assertEqual(re.match('(%s)' % a, a).group(0), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))

    # Groups that did not participate are None (or the given default).
    pat = re.compile('((a)|(b))(c)?')
    self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
    self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
    self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
    self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
    self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

    # Named groups can be addressed by number, by name, or mixed.
    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                     (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000424
Serhiy Storchaka977b3ac2016-06-18 16:48:07 +0300425 def test_group(self):
426 class Index:
427 def __init__(self, value):
428 self.value = value
429 def __index__(self):
430 return self.value
431 # A single group
432 m = re.match('(a)(b)', 'ab')
433 self.assertEqual(m.group(), 'ab')
434 self.assertEqual(m.group(0), 'ab')
435 self.assertEqual(m.group(1), 'a')
436 self.assertEqual(m.group(Index(1)), 'a')
437 self.assertRaises(IndexError, m.group, -1)
438 self.assertRaises(IndexError, m.group, 3)
439 self.assertRaises(IndexError, m.group, 1<<1000)
440 self.assertRaises(IndexError, m.group, Index(1<<1000))
441 self.assertRaises(IndexError, m.group, 'x')
442 # Multiple groups
443 self.assertEqual(m.group(2, 1), ('b', 'a'))
444 self.assertEqual(m.group(Index(2), Index(1)), ('b', 'a'))
445
Eric V. Smith605bdae2016-09-11 08:55:43 -0400446 def test_match_getitem(self):
447 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
448
449 m = pat.match('a')
450 self.assertEqual(m['a1'], 'a')
451 self.assertEqual(m['b2'], None)
452 self.assertEqual(m['c3'], None)
453 self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=None')
454 self.assertEqual(m[0], 'a')
455 self.assertEqual(m[1], 'a')
456 self.assertEqual(m[2], None)
457 self.assertEqual(m[3], None)
458 with self.assertRaisesRegex(IndexError, 'no such group'):
459 m['X']
460 with self.assertRaisesRegex(IndexError, 'no such group'):
461 m[-1]
462 with self.assertRaisesRegex(IndexError, 'no such group'):
463 m[4]
464 with self.assertRaisesRegex(IndexError, 'no such group'):
465 m[0, 1]
466 with self.assertRaisesRegex(IndexError, 'no such group'):
467 m[(0,)]
468 with self.assertRaisesRegex(IndexError, 'no such group'):
469 m[(0, 1)]
Serhiy Storchaka50754162017-08-03 11:45:23 +0300470 with self.assertRaisesRegex(IndexError, 'no such group'):
Eric V. Smith605bdae2016-09-11 08:55:43 -0400471 'a1={a2}'.format_map(m)
472
473 m = pat.match('ac')
474 self.assertEqual(m['a1'], 'a')
475 self.assertEqual(m['b2'], None)
476 self.assertEqual(m['c3'], 'c')
477 self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=c')
478 self.assertEqual(m[0], 'ac')
479 self.assertEqual(m[1], 'a')
480 self.assertEqual(m[2], None)
481 self.assertEqual(m[3], 'c')
482
483 # Cannot assign.
484 with self.assertRaises(TypeError):
485 m[0] = 1
486
487 # No len().
488 self.assertRaises(TypeError, len, m)
489
def test_re_fullmatch(self):
    # Issue 16203: Proposal: add re.fullmatch() method.
    self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
    for string in "ab", S("ab"):
        self.assertEqual(re.fullmatch(r"a|ab", string).span(), (0, 2))
    for string in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
        self.assertEqual(re.fullmatch(br"a|ab", string).span(), (0, 2))
    for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
        r = r"%s|%s" % (a, a + b)
        self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2))
    # Non-greedy quantifiers must still extend to cover the whole string.
    self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3))
    self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3))
    self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2))
    self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3))
    self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
    self.assertIsNone(re.fullmatch(r"a+", "ab"))
    self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
    self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
    self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
    self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
    self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4))
    self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2))

    # With pos/endpos the match must cover exactly that window.
    self.assertEqual(
        re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
    self.assertEqual(
        re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
    self.assertEqual(
        re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
519
def test_re_groupref_exists(self):
    # Conditional patterns: (?(group)yes|no).
    self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
                     ('(', 'a'))
    self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a').groups(),
                     (None, 'a'))
    self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a)'))
    self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a'))
    self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
                     ('a', 'b'))
    self.assertEqual(re.match(r'^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
                     (None, 'd'))
    self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
                     (None, 'd'))
    self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'a').groups(),
                     ('a', ''))

    # Tests for bug #1177831: exercise groups other than the first group
    p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
    self.assertEqual(p.match('abc').groups(),
                     ('a', 'b', 'c'))
    self.assertEqual(p.match('ad').groups(),
                     ('a', None, 'd'))
    self.assertIsNone(p.match('abd'))
    self.assertIsNone(p.match('ac'))

    # Support > 100 groups.
    pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    pat = '(?:%s)(?(200)z)' % pat
    self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))

    self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
    self.checkPatternError(r'()(?(1)a|b',
                           'missing ), unterminated subpattern', 2)
    self.checkPatternError(r'()(?(1)a|b|c)',
                           'conditional backref with more than '
                           'two branches', 10)
556
def test_re_groupref_overflow(self):
    # A reference to group number MAXGROUPS is expected to be reported as
    # an invalid group reference, in a template and in a conditional.
    from sre_constants import MAXGROUPS as limit
    message = 'invalid group reference %d' % limit
    self.checkTemplateError('()', r'\g<%s>' % limit, 'xx', message, 3)
    self.checkPatternError(r'(?P<a>)(?(%d))' % limit, message, 10)
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200563
def test_re_groupref(self):
    # Numeric backreferences to optional and alternated groups.
    piped = r'^(\|)?([^()]+)\1$'
    self.assertEqual(re.match(piped, '|a|').groups(), ('|', 'a'))
    self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
                     (None, 'a'))
    for unbalanced in ('a|', '|a'):
        self.assertIsNone(re.match(piped, unbalanced))

    self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
                     ('a', 'a'))
    self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
                     (None, None))

    self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
577
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000578 def test_groupdict(self):
579 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
580 'first second').groupdict(),
581 {'first':'first', 'second':'second'})
582
583 def test_expand(self):
584 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
585 "first second")
586 .expand(r"\2 \1 \g<second> \g<first>"),
587 "second first second first")
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300588 self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
589 "first")
590 .expand(r"\2 \g<second>"),
591 " ")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000592
def test_repeat_minmax(self):
    # Bounded repeats {m} and {m,n}, applied first to a capturing group
    # and then to a plain literal.  Every bound is exercised in both its
    # greedy and its lazy ("?"-suffixed) form.
    too_few = ("{1}", "{1}?", "{1,2}", "{1,2}?")
    enough = ("{3}", "{1,3}", "{1,4}", "{3,4}",
              "{3}?", "{1,3}?", "{1,4}?", "{3,4}?")

    # A three-character subject cannot be covered by at most two repeats.
    for quant in too_few:
        self.assertIsNone(re.match(r"^(\w)%s$" % quant, "abc"))

    # When the repeat succeeds the group holds its last repetition.
    for quant in enough:
        self.assertEqual(re.match(r"^(\w)%s$" % quant, "abc").group(1), "c")

    for quant in too_few:
        self.assertIsNone(re.match(r"^x%s$" % quant, "xxx"))

    for quant in enough + ("{3,3}",):
        self.assertTrue(re.match(r"^x%s$" % quant, "xxx"))

    # Empty braces are not a repeat: they only match a literal "{}".
    self.assertIsNone(re.match(r"^x{}$", "xxx"))
    self.assertTrue(re.match(r"^x{}$", "x{}"))

    self.checkPatternError(r'x{2,1}',
                           'min repeat greater than max repeat', 2)
628
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000629 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000630 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000631 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000632 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
633 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
634 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
635 {'first': 1, 'other': 2})
636
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000637 self.assertEqual(re.match("(a)", "a").pos, 0)
638 self.assertEqual(re.match("(a)", "a").endpos, 1)
639 self.assertEqual(re.match("(a)", "a").string, "a")
640 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300641 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000642
Serhiy Storchaka07360df2015-03-30 01:01:48 +0300643 # Issue 14260. groupindex should be non-modifiable mapping.
644 p = re.compile(r'(?i)(?P<first>a)(?P<other>b)')
645 self.assertEqual(sorted(p.groupindex), ['first', 'other'])
646 self.assertEqual(p.groupindex['other'], 2)
647 with self.assertRaises(TypeError):
648 p.groupindex['other'] = 0
649 self.assertEqual(p.groupindex['other'], 2)
650
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000651 def test_special_escapes(self):
652 self.assertEqual(re.search(r"\b(b.)\b",
653 "abcd abc bcd bx").group(1), "bx")
654 self.assertEqual(re.search(r"\B(b.)\B",
655 "abc bcd bc abxd").group(1), "bx")
656 self.assertEqual(re.search(r"\b(b.)\b",
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300657 "abcd abc bcd bx", re.ASCII).group(1), "bx")
658 self.assertEqual(re.search(r"\B(b.)\B",
659 "abc bcd bc abxd", re.ASCII).group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000660 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
661 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300662 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300663 self.assertEqual(re.search(br"\b(b.)\b",
664 b"abcd abc bcd bx").group(1), b"bx")
665 self.assertEqual(re.search(br"\B(b.)\B",
666 b"abc bcd bc abxd").group(1), b"bx")
667 self.assertEqual(re.search(br"\b(b.)\b",
668 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
669 self.assertEqual(re.search(br"\B(b.)\B",
670 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
671 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
672 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300673 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000674 self.assertEqual(re.search(r"\d\D\w\W\s\S",
675 "1aa! a").group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300676 self.assertEqual(re.search(br"\d\D\w\W\s\S",
677 b"1aa! a").group(0), b"1aa! a")
678 self.assertEqual(re.search(r"\d\D\w\W\s\S",
679 "1aa! a", re.ASCII).group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300680 self.assertEqual(re.search(br"\d\D\w\W\s\S",
681 b"1aa! a", re.LOCALE).group(0), b"1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000682
def test_other_escapes(self):
    # Escaped punctuation is matched literally; the listed letter
    # escapes are rejected at compile time.
    self.checkPatternError("\\", 'bad escape (end of pattern)', 0)
    self.assertEqual(re.match(r"\(", '(').group(), '(')
    self.assertIsNone(re.match(r"\(", ')'))
    self.assertEqual(re.match(r"\\", '\\').group(), '\\')
    self.assertEqual(re.match(r"[\]]", ']').group(), ']')
    self.assertIsNone(re.match(r"[\]]", '['))
    self.assertEqual(re.match(r"[a\-c]", '-').group(), '-')
    self.assertIsNone(re.match(r"[a\-c]", 'b'))
    self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
    self.assertIsNone(re.match(r"[\^a]+", 'b'))
    re.purge()  # for warnings
    rejected = (
        # (letters, pattern template): outside and inside a set.
        ('ceghijklmopqyzCEFGHIJKLMNOPQRTVXY', '\\%c'),
        ('ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ', '[\\%c]'),
    )
    for letters, template in rejected:
        for letter in letters:
            with self.subTest(letter):
                with self.assertRaises(re.error):
                    re.compile(template % letter)
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200701
Ezio Melotti5a045b92012-02-29 11:48:44 +0200702 def test_string_boundaries(self):
703 # See http://bugs.python.org/issue10713
704 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
705 "abc")
706 # There's a word boundary at the start of a string.
707 self.assertTrue(re.match(r"\b", "abc"))
708 # A non-empty string includes a non-boundary zero-length match.
709 self.assertTrue(re.search(r"\B", "abc"))
710 # There is no non-boundary match at the start of a string.
711 self.assertFalse(re.match(r"\B", "abc"))
712 # However, an empty string contains no word boundaries, and also no
713 # non-boundaries.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300714 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200715 # This one is questionable and different from the perlre behaviour,
716 # but describes current behavior.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300717 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200718 # A single word-character string has two boundaries, but no
719 # non-boundary gaps.
720 self.assertEqual(len(re.findall(r"\b", "a")), 2)
721 self.assertEqual(len(re.findall(r"\B", "a")), 0)
722 # If there are no words, there are no boundaries
723 self.assertEqual(len(re.findall(r"\b", " ")), 0)
724 self.assertEqual(len(re.findall(r"\b", " ")), 0)
725 # Can match around the whitespace.
726 self.assertEqual(len(re.findall(r"\B", " ")), 2)
727
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000728 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000729 self.assertEqual(re.match("([\u2222\u2223])",
730 "\u2222").group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300731 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300732 self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000733
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100734 def test_big_codesize(self):
735 # Issue #1160
736 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300737 self.assertTrue(r.match('1000'))
738 self.assertTrue(r.match('9999'))
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100739
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000740 def test_anyall(self):
741 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
742 "a\nb")
743 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
744 "a\n\nb")
745
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200746 def test_lookahead(self):
R David Murray44b548d2016-09-08 13:59:53 -0400747 self.assertEqual(re.match(r"(a(?=\s[^a]))", "a b").group(1), "a")
748 self.assertEqual(re.match(r"(a(?=\s[^a]*))", "a b").group(1), "a")
749 self.assertEqual(re.match(r"(a(?=\s[abc]))", "a b").group(1), "a")
750 self.assertEqual(re.match(r"(a(?=\s[abc]*))", "a bc").group(1), "a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000751 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
752 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
753 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
754
755 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
756 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
757 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
758 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
759
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200760 # Group reference.
761 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
762 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
763 # Conditional group reference.
764 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
765 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
766 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
767 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
768 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
769 # Group used before defined.
770 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
771 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
772 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
773
774 def test_lookbehind(self):
775 self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
776 self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
777 self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
778 self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
779 # Group reference.
780 self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
781 self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
782 self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
783 self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
784 # Conditional group reference.
785 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
786 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
787 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
788 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
789 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
790 # Group used before defined.
791 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
792 self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc'))
793 self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc'))
794 # Group defined in the same lookbehind pattern
795 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)')
796 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)')
797 self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)')
798 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)')
799
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000800 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000801 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300802 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000803 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
804 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
805 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
806 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
807 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
808 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
809 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
810 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
811
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200812 assert '\u212a'.lower() == 'k' # 'K'
813 self.assertTrue(re.match(r'K', '\u212a', re.I))
814 self.assertTrue(re.match(r'k', '\u212a', re.I))
815 self.assertTrue(re.match(r'\u212a', 'K', re.I))
816 self.assertTrue(re.match(r'\u212a', 'k', re.I))
817 assert '\u017f'.upper() == 'S' # 'ſ'
818 self.assertTrue(re.match(r'S', '\u017f', re.I))
819 self.assertTrue(re.match(r's', '\u017f', re.I))
820 self.assertTrue(re.match(r'\u017f', 'S', re.I))
821 self.assertTrue(re.match(r'\u017f', 's', re.I))
822 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
823 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
824 self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
825
826 def test_ignore_case_set(self):
827 self.assertTrue(re.match(r'[19A]', 'A', re.I))
828 self.assertTrue(re.match(r'[19a]', 'a', re.I))
829 self.assertTrue(re.match(r'[19a]', 'A', re.I))
830 self.assertTrue(re.match(r'[19A]', 'a', re.I))
831 self.assertTrue(re.match(br'[19A]', b'A', re.I))
832 self.assertTrue(re.match(br'[19a]', b'a', re.I))
833 self.assertTrue(re.match(br'[19a]', b'A', re.I))
834 self.assertTrue(re.match(br'[19A]', b'a', re.I))
835 assert '\u212a'.lower() == 'k' # 'K'
836 self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
837 self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
838 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
839 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
840 assert '\u017f'.upper() == 'S' # 'ſ'
841 self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
842 self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
843 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
844 self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
845 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
846 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
847 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
848
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200849 def test_ignore_case_range(self):
850 # Issues #3511, #17381.
851 self.assertTrue(re.match(r'[9-a]', '_', re.I))
852 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
853 self.assertTrue(re.match(br'[9-a]', b'_', re.I))
854 self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
855 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
856 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
857 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
858 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
859 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
860 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
861 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
862 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
863 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
864 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
865 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
866 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
867
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200868 assert '\u212a'.lower() == 'k' # 'K'
869 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
870 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
871 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
872 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
873 assert '\u017f'.upper() == 'S' # 'ſ'
874 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
875 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
876 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
877 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
878 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
879 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
880 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
881
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000882 def test_category(self):
883 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
884
@cpython_only
def test_case_helpers(self):
    # Compare the _sre case helpers with str.lower() / str.upper().
    import _sre

    # ASCII code points: the ascii_* and unicode_* helpers must agree
    # with each other, and only the ASCII letters count as cased.
    for i in range(128):
        c = chr(i)
        lo = ord(c.lower())
        self.assertEqual(_sre.ascii_tolower(i), lo)
        self.assertEqual(_sre.unicode_tolower(i), lo)
        iscased = c in string.ascii_letters
        self.assertEqual(_sre.ascii_iscased(i), iscased)
        self.assertEqual(_sre.unicode_iscased(i), iscased)

    # Non-ASCII code points: the ascii_* helpers leave them alone.
    for i in list(range(128, 0x1000)) + [0x10400, 0x10428]:
        c = chr(i)
        self.assertEqual(_sre.ascii_tolower(i), i)
        # str.lower() of U+0130 is not a single code point, so ord()
        # cannot be applied to it; that character is checked below.
        if i != 0x0130:
            self.assertEqual(_sre.unicode_tolower(i), ord(c.lower()))
        iscased = c != c.lower() or c != c.upper()
        self.assertFalse(_sre.ascii_iscased(i))
        self.assertEqual(_sre.unicode_iscased(i), iscased)

    # U+0130 on its own.
    self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130)
    self.assertEqual(_sre.unicode_tolower(0x0130), ord('i'))
    self.assertFalse(_sre.ascii_iscased(0x0130))
    self.assertTrue(_sre.unicode_iscased(0x0130))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000911
912 def test_not_literal(self):
R David Murray44b548d2016-09-08 13:59:53 -0400913 self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
914 self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000915
Serhiy Storchaka05cb7282017-11-16 12:38:26 +0200916 def test_possible_set_operations(self):
917 s = bytes(range(128)).decode()
918 with self.assertWarns(FutureWarning):
919 p = re.compile(r'[0-9--1]')
920 self.assertEqual(p.findall(s), list('-./0123456789'))
921 self.assertEqual(re.findall(r'[--1]', s), list('-./01'))
922 with self.assertWarns(FutureWarning):
923 p = re.compile(r'[%--1]')
924 self.assertEqual(p.findall(s), list("%&'()*+,-1"))
925 with self.assertWarns(FutureWarning):
926 p = re.compile(r'[%--]')
927 self.assertEqual(p.findall(s), list("%&'()*+,-"))
928
929 with self.assertWarns(FutureWarning):
930 p = re.compile(r'[0-9&&1]')
931 self.assertEqual(p.findall(s), list('&0123456789'))
932 with self.assertWarns(FutureWarning):
933 p = re.compile(r'[\d&&1]')
934 self.assertEqual(p.findall(s), list('&0123456789'))
935 self.assertEqual(re.findall(r'[&&1]', s), list('&1'))
936
937 with self.assertWarns(FutureWarning):
938 p = re.compile(r'[0-9||a]')
939 self.assertEqual(p.findall(s), list('0123456789a|'))
940 with self.assertWarns(FutureWarning):
941 p = re.compile(r'[\d||a]')
942 self.assertEqual(p.findall(s), list('0123456789a|'))
943 self.assertEqual(re.findall(r'[||1]', s), list('1|'))
944
945 with self.assertWarns(FutureWarning):
946 p = re.compile(r'[0-9~~1]')
947 self.assertEqual(p.findall(s), list('0123456789~'))
948 with self.assertWarns(FutureWarning):
949 p = re.compile(r'[\d~~1]')
950 self.assertEqual(p.findall(s), list('0123456789~'))
951 self.assertEqual(re.findall(r'[~~1]', s), list('1~'))
952
953 with self.assertWarns(FutureWarning):
954 p = re.compile(r'[[0-9]|]')
955 self.assertEqual(p.findall(s), list('0123456789[]'))
956
957 with self.assertWarns(FutureWarning):
958 p = re.compile(r'[[:digit:]|]')
959 self.assertEqual(p.findall(s), list(':[]dgit'))
960
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000961 def test_search_coverage(self):
R David Murray44b548d2016-09-08 13:59:53 -0400962 self.assertEqual(re.search(r"\s(b)", " b").group(1), "b")
963 self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000964
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.fullmatch):
    # Assert that matcher(pattern, text) succeeds and that the resulting
    # match has the expected group() text and span().  When neither
    # `match` nor `span` is given, the whole text is expected to match.
    # Passing only one of the two raises ValueError.
    has_match = match is not None
    has_span = span is not None
    if has_match != has_span:
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if not has_match:
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    found = matcher(pattern, text)
    self.assertTrue(found)
    self.assertEqual(found.group(), match)
    self.assertEqual(found.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000978
# Characters that the escape tests expect re.escape() to return unchanged.
LITERAL_CHARS = ''.join((string.ascii_letters, string.digits,
                         '!"%\',/:;<=>@_`'))
Serhiy Storchaka59083002017-04-13 21:06:43 +0300980
def test_re_escape(self):
    # Every escaped Latin-1 character must match itself on its own,
    # inside a set, and in verbose mode.
    latin1 = ''.join(map(chr, range(256)))
    for char in latin1:
        escaped = re.escape(char)
        self.assertMatch(escaped, char)
        self.assertMatch('[' + escaped + ']', char)
        self.assertMatch('(?x)' + escaped, char)
    self.assertMatch(re.escape(latin1), latin1)
    # These characters must come out with a leading backslash.
    for char in '-.]{}':
        self.assertEqual(re.escape(char)[:1], '\\')
    unescaped = self.LITERAL_CHARS
    self.assertEqual(re.escape(unescaped), unescaped)
Guido van Rossum49946571997-07-18 04:26:25 +0000992
def test_re_escape_bytes(self):
    # Bytes counterpart of the str escape test: every escaped byte value
    # must match itself on its own, inside a set, and in verbose mode.
    every_byte = bytes(range(256))
    for value in every_byte:
        single = bytes((value,))
        escaped = re.escape(single)
        self.assertMatch(escaped, single)
        self.assertMatch(b'[' + escaped + b']', single)
        self.assertMatch(b'(?x)' + escaped, single)
    self.assertMatch(re.escape(every_byte), every_byte)
    # These bytes must come out with a leading backslash.
    for value in b'-.]{}':
        single = bytes((value,))
        self.assertEqual(re.escape(single)[:1], b'\\')
    unescaped = self.LITERAL_CHARS.encode('ascii')
    self.assertEqual(re.escape(unescaped), unescaped)
Guido van Rossum698280d2008-09-10 17:44:35 +00001006
def test_re_escape_non_ascii(self):
    # Non-ASCII characters are expected to pass through re.escape().
    text = 'xxx\u2620\u2620\u2620xxx'
    escaped = re.escape(text)
    self.assertEqual(escaped, text)
    self.assertMatch(escaped, text)
    # The escaped character still works as an atom under a quantifier.
    repeated = '.%s+.' % re.escape('\u2620')
    self.assertMatch(repeated, text,
                     'x\u2620\u2620\u2620x', (2, 7), re.search)
1014
def test_re_escape_non_ascii_bytes(self):
    # UTF-8 encoded non-ASCII text is expected to pass through
    # re.escape() unchanged.
    data = 'y\u2620y\u2620y'.encode('utf-8')
    escaped = re.escape(data)
    self.assertEqual(escaped, data)
    self.assertMatch(escaped, data)
    # The encoded character occurs twice in the data.
    needle = re.escape('\u2620'.encode('utf-8'))
    occurrences = re.findall(needle, data)
    self.assertEqual(len(occurrences), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +00001022
Serhiy Storchakab85a9762014-09-15 11:33:19 +03001023 def test_pickling(self):
1024 import pickle
1025 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
1026 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
1027 pickled = pickle.dumps(oldpat, proto)
1028 newpat = pickle.loads(pickled)
1029 self.assertEqual(newpat, oldpat)
1030 # current pickle expects the _compile() reconstructor in re module
1031 from re import _compile
Guido van Rossum23b22571997-07-17 22:36:14 +00001032
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001033 def test_copying(self):
1034 import copy
1035 p = re.compile(r'(?P<int>\d+)(?:\.(?P<frac>\d*))?')
1036 self.assertIs(copy.copy(p), p)
1037 self.assertIs(copy.deepcopy(p), p)
1038 m = p.match('12.34')
1039 self.assertIs(copy.copy(m), m)
1040 self.assertIs(copy.deepcopy(m), m)
1041
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001042 def test_constants(self):
1043 self.assertEqual(re.I, re.IGNORECASE)
1044 self.assertEqual(re.L, re.LOCALE)
1045 self.assertEqual(re.M, re.MULTILINE)
1046 self.assertEqual(re.S, re.DOTALL)
1047 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001048
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001049 def test_flags(self):
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001050 for flag in [re.I, re.M, re.X, re.S, re.A, re.U]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001051 self.assertTrue(re.compile('^pattern$', flag))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001052 for flag in [re.I, re.M, re.X, re.S, re.A, re.L]:
1053 self.assertTrue(re.compile(b'^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +00001054
def test_sre_character_literals(self):
    # Octal, \x, \u and \U escapes outside a set.  Each entry pairs a
    # pattern template with the text that follows the escaped character.
    octal_and_hex = (
        (r"\%03o", ""), (r"\%03o0", "0"), (r"\%03o8", "8"),
        (r"\x%02x", ""), (r"\x%02x0", "0"), (r"\x%02xz", "z"),
    )
    short_unicode = ((r"\u%04x", ""), (r"\u%04x0", "0"), (r"\u%04xz", "z"))
    long_unicode = ((r"\U%08x", ""), (r"\U%08x0", "0"), (r"\U%08xz", "z"))

    for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        # Use only the escape forms wide enough to express this value.
        templates = long_unicode
        if i < 0x10000:
            templates = short_unicode + templates
        if i < 256:
            templates = octal_and_hex + templates
        for template, tail in templates:
            self.assertTrue(re.match(template % i, chr(i) + tail))

    # \0 followed by further digits.
    self.assertTrue(re.match(r"\0", "\000"))
    self.assertTrue(re.match(r"\08", "\0008"))
    self.assertTrue(re.match(r"\01", "\001"))
    self.assertTrue(re.match(r"\018", "\0018"))

    # Malformed or out-of-range escapes.
    self.checkPatternError(r"\567",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 0)
    self.checkPatternError(r"\911", 'invalid group reference 91', 1)
    self.checkPatternError(r"\x1", r'incomplete escape \x1', 0)
    self.checkPatternError(r"\x1z", r'incomplete escape \x1', 0)
    self.checkPatternError(r"\u123", r'incomplete escape \u123', 0)
    self.checkPatternError(r"\u123z", r'incomplete escape \u123', 0)
    self.checkPatternError(r"\U0001234", r'incomplete escape \U0001234', 0)
    self.checkPatternError(r"\U0001234z", r'incomplete escape \U0001234', 0)
    self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0)
Skip Montanaro7d9963f2003-04-25 14:12:40 +00001086
def test_sre_character_class_literals(self):
    # Octal, \x, \u and \U escapes inside a character set.
    octal_and_hex = (
        r"[\%o]", r"[\%o8]", r"[\%03o]", r"[\%03o0]", r"[\%03o8]",
        r"[\x%02x]", r"[\x%02x0]", r"[\x%02xz]",
    )
    short_unicode = (r"[\u%04x]", r"[\u%04x0]", r"[\u%04xz]")

    for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        char = chr(i)
        # Use only the escape forms wide enough to express this value.
        templates = ()
        if i < 256:
            templates += octal_and_hex
        if i < 0x10000:
            templates += short_unicode
        for template in templates:
            self.assertTrue(re.match(template % i, char))
        self.assertTrue(re.match(r"[\U%08x]" % i, char))
        self.assertTrue(re.match(r"[\U%08x0]" % i, char + "0"))
        self.assertTrue(re.match(r"[\U%08xz]" % i, char + "z"))

    # Malformed or out-of-range escapes.
    self.checkPatternError(r"[\567]",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 1)
    self.checkPatternError(r"[\911]", r'bad escape \9', 1)
    self.checkPatternError(r"[\x1z]", r'incomplete escape \x1', 1)
    self.checkPatternError(r"[\u123z]", r'incomplete escape \u123', 1)
    self.checkPatternError(r"[\U0001234z]", r'incomplete escape \U0001234', 1)
    self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1)
    # A range whose two endpoints are written as \U escapes.
    self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +02001114
def test_sre_byte_literals(self):
    r"""Exercise octal and ``\x`` escapes in bytes patterns.

    Each escape is also tried with one extra literal character after it,
    which must then be matched as a separate byte.
    """
    templates = (
        (r"\%03o", b""), (r"\%03o0", b"0"), (r"\%03o8", b"8"),
        (r"\x%02x", b""), (r"\x%02x0", b"0"), (r"\x%02xz", b"z"),
    )
    for value in [0, 8, 16, 32, 64, 127, 128, 255]:
        for template, tail in templates:
            pattern = (template % value).encode()
            self.assertTrue(re.match(pattern, bytes([value]) + tail))

    # Unicode escapes must be refused when the pattern is bytes.
    for pattern in (br"\u1234", br"\U00012345"):
        self.assertRaises(re.error, re.compile, pattern)

    short_octals = (
        (br"\0", b"\000"),
        (br"\08", b"\0008"),
        (br"\01", b"\001"),
        (br"\018", b"\0018"),
    )
    for pattern, subject in short_octals:
        self.assertTrue(re.match(pattern, subject))

    self.checkPatternError(
        br"\567", r'octal escape value \567 outside of range 0-0o377', 0)
    self.checkPatternError(br"\911", 'invalid group reference 91', 1)
    for pattern in (br"\x1", br"\x1z"):
        self.checkPatternError(pattern, r'incomplete escape \x1', 0)
Antoine Pitrou463badf2012-06-23 13:29:19 +02001135
def test_sre_byte_class_literals(self):
    r"""Exercise octal and ``\x`` escapes inside a bytes character class."""
    templates = (
        r"[\%o]", r"[\%o8]", r"[\%03o]", r"[\%03o0]", r"[\%03o8]",
        r"[\x%02x]", r"[\x%02x0]", r"[\x%02xz]",
    )
    for value in [0, 8, 16, 32, 64, 127, 128, 255]:
        subject = bytes([value])
        for template in templates:
            pattern = (template % value).encode()
            self.assertTrue(re.match(pattern, subject))

    # Unicode escapes must be refused when the pattern is bytes.
    for pattern in (br"[\u1234]", br"[\U00012345]"):
        self.assertRaises(re.error, re.compile, pattern)

    rejected = (
        (br"[\567]", r'octal escape value \567 outside of range 0-0o377'),
        (br"[\911]", r'bad escape \9'),
        (br"[\x1z]", r'incomplete escape \x1'),
    )
    for pattern, message in rejected:
        self.checkPatternError(pattern, message, 1)
1153
def test_character_set_errors(self):
    """Check the errors reported for malformed character sets."""
    # bug 545855 -- "[a-" failed to cause a compile error as it should,
    # instead provoking a TypeError.
    for pattern in (r'[', r'[^', r'[a', r"[a-"):
        self.checkPatternError(pattern, 'unterminated character set', 0)

    bad_ranges = (
        (r"[\w-b]", r'bad character range \w-b'),
        (r"[a-\w]", r'bad character range a-\w'),
        (r"[b-a]", 'bad character range b-a'),
    )
    for pattern, message in bad_ranges:
        self.checkPatternError(pattern, message, 1)
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +00001164
Skip Montanaro7d9963f2003-04-25 14:12:40 +00001165 def test_bug_113254(self):
1166 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
1167 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
1168 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
1169
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001170 def test_bug_527371(self):
1171 # bug described in patches 527371/672491
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001172 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001173 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
1174 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
R David Murray44b548d2016-09-08 13:59:53 -04001175 self.assertEqual(re.match(r"(?P<a>a(b))", "ab").lastgroup, 'a')
1176 self.assertEqual(re.match(r"((a))", "a").lastindex, 1)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001177
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001178 def test_bug_418626(self):
1179 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
1180 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
1181 # pattern '*?' on a long string.
1182 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
1183 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
1184 20003)
1185 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001186 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +00001187 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001188 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001189
1190 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001191 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001192 self.assertEqual(re.compile(pat) and 1, 1)
1193
Skip Montanaro1e703c62003-04-25 15:40:28 +00001194 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001195 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +00001196 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001197 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
1198 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
1199 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +00001200
def test_nothing_to_repeat(self):
    """A repeat operator with nothing before it is a pattern error."""
    for reps in ('*', '+', '?', '{1,2}'):
        for mod in ('', '?'):
            operator = reps + mod
            # Bare operator: error reported at position 0.
            self.checkPatternError(operator, 'nothing to repeat', 0)
            # Inside a non-capturing group: position 3, just past "(?:".
            self.checkPatternError('(?:' + operator + ')',
                                   'nothing to repeat', 3)
1208
def test_multiple_repeat(self):
    """Stacking one repeat operator on another is a pattern error."""
    modifiers = ('', '?')
    outer_ops = [reps + mod
                 for reps in ('*', '+', '{1,2}')
                 for mod in modifiers]
    inner_ops = [reps + mod
                 for reps in ('*', '+', '?', '{1,2}')
                 for mod in modifiers]
    for outer_op in outer_ops:
        for inner_op in inner_ops:
            # The expected position is that of the outer operator, which
            # follows "x" and the inner operator.
            self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
                                   'multiple repeat', 1 + len(inner_op))
1218
Serhiy Storchakafa468162013-02-16 21:23:53 +02001219 def test_unlimited_zero_width_repeat(self):
1220 # Issue #9669
1221 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
1222 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
1223 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
1224 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
1225 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
1226 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
1227
Skip Montanaro1e703c62003-04-25 15:40:28 +00001228 def test_scanner(self):
1229 def s_ident(scanner, token): return token
1230 def s_operator(scanner, token): return "op%s" % token
1231 def s_float(scanner, token): return float(token)
1232 def s_int(scanner, token): return int(token)
1233
1234 scanner = Scanner([
1235 (r"[a-zA-Z_]\w*", s_ident),
1236 (r"\d+\.\d*", s_float),
1237 (r"\d+", s_int),
1238 (r"=|\+|-|\*|/", s_operator),
1239 (r"\s+", None),
1240 ])
1241
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001242 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +00001243
Skip Montanaro1e703c62003-04-25 15:40:28 +00001244 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
1245 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
1246 'op+', 'bar'], ''))
1247
Skip Montanaro5ba00542003-04-25 16:00:14 +00001248 def test_bug_448951(self):
1249 # bug 448951 (similar to 429357, but with single char match)
1250 # (Also test greedy matches.)
1251 for op in '','?','*':
1252 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
1253 (None, None))
1254 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
1255 ('a:', 'a'))
1256
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001257 def test_bug_725106(self):
1258 # capturing groups in alternatives in repeats
1259 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
1260 ('b', 'a'))
1261 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
1262 ('c', 'b'))
1263 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
1264 ('b', None))
1265 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
1266 ('b', None))
1267 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
1268 ('b', 'a'))
1269 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
1270 ('c', 'b'))
1271 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
1272 ('b', None))
1273 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
1274 ('b', None))
1275
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +00001276 def test_bug_725149(self):
1277 # mark_stack_base restoring before restoring marks
1278 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
1279 ('a', None))
1280 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
1281 ('a', None, None))
1282
Just van Rossum12723ba2003-07-02 20:03:04 +00001283 def test_bug_764548(self):
1284 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001285 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +00001286 pat = re.compile(my_unicode("abc"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001287 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +00001288
Skip Montanaro5ba00542003-04-25 16:00:14 +00001289 def test_finditer(self):
1290 iter = re.finditer(r":+", "a:b::c:::d")
1291 self.assertEqual([item.group(0) for item in iter],
1292 [":", "::", ":::"])
1293
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001294 pat = re.compile(r":+")
1295 iter = pat.finditer("a:b::c:::d", 1, 10)
1296 self.assertEqual([item.group(0) for item in iter],
1297 [":", "::", ":::"])
1298
1299 pat = re.compile(r":+")
1300 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
1301 self.assertEqual([item.group(0) for item in iter],
1302 [":", "::", ":::"])
1303
1304 pat = re.compile(r":+")
1305 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1306 self.assertEqual([item.group(0) for item in iter],
1307 [":", "::", ":::"])
1308
1309 pat = re.compile(r":+")
1310 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1311 self.assertEqual([item.group(0) for item in iter],
1312 ["::", "::"])
1313
Thomas Wouters40a088d2008-03-18 20:19:54 +00001314 def test_bug_926075(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001315 self.assertIsNot(re.compile('bug_926075'),
1316 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +00001317
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001318 def test_bug_931848(self):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001319 pattern = "[\u002E\u3002\uFF0E\uFF61]"
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001320 self.assertEqual(re.compile(pattern).split("a.b.c"),
1321 ['a','b','c'])
1322
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001323 def test_bug_581080(self):
1324 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +00001325 self.assertEqual(next(iter).span(), (1,2))
1326 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001327
1328 scanner = re.compile(r"\s").scanner("a b")
1329 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001330 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001331
1332 def test_bug_817234(self):
1333 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +00001334 self.assertEqual(next(iter).span(), (0, 4))
1335 self.assertEqual(next(iter).span(), (4, 4))
1336 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001337
Mark Dickinson1f268282009-07-28 17:22:36 +00001338 def test_bug_6561(self):
1339 # '\d' should match characters in Unicode category 'Nd'
1340 # (Number, Decimal Digit), but not those in 'Nl' (Number,
1341 # Letter) or 'No' (Number, Other).
1342 decimal_digits = [
1343 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1344 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1345 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1346 ]
1347 for x in decimal_digits:
R David Murray44b548d2016-09-08 13:59:53 -04001348 self.assertEqual(re.match(r'^\d$', x).group(0), x)
Mark Dickinson1f268282009-07-28 17:22:36 +00001349
1350 not_decimal_digits = [
1351 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1352 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1353 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1354 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1355 ]
1356 for x in not_decimal_digits:
R David Murray44b548d2016-09-08 13:59:53 -04001357 self.assertIsNone(re.match(r'^\d$', x))
Mark Dickinson1f268282009-07-28 17:22:36 +00001358
Guido van Rossumd8faa362007-04-27 19:54:29 +00001359 def test_empty_array(self):
1360 # SF buf 1647541
1361 import array
Guido van Rossum166746c2007-07-03 15:39:16 +00001362 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +00001363 a = array.array(typecode)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001364 self.assertIsNone(re.compile(b"bla").match(a))
Antoine Pitroufd036452008-08-19 17:56:33 +00001365 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001366
def test_inline_flags(self):
    """Check inline flags such as ``(?i)``, ``(?s)`` and ``(?x)``.

    The first part checks that case-insensitive matching of a non-ASCII
    letter works whether the flags are passed as arguments, written inline
    at the start of the pattern, or both.  The second part checks that
    inline flags placed anywhere else still match but emit a
    DeprecationWarning whose filename is this file.
    """
    # Bug #1700
    upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
    lower_char = '\u1ea1' # Latin Small Letter A with Dot Below

    # The subject starts with '\n', which '.' is expected to match here
    # because re.S / (?s) is in effect in every case below.
    p = re.compile('.' + upper_char, re.I | re.S)
    q = p.match('\n' + lower_char)
    self.assertTrue(q)

    p = re.compile('.' + lower_char, re.I | re.S)
    q = p.match('\n' + upper_char)
    self.assertTrue(q)

    p = re.compile('(?i).' + upper_char, re.S)
    q = p.match('\n' + lower_char)
    self.assertTrue(q)

    p = re.compile('(?i).' + lower_char, re.S)
    q = p.match('\n' + upper_char)
    self.assertTrue(q)

    p = re.compile('(?is).' + upper_char)
    q = p.match('\n' + lower_char)
    self.assertTrue(q)

    p = re.compile('(?is).' + lower_char)
    q = p.match('\n' + upper_char)
    self.assertTrue(q)

    p = re.compile('(?s)(?i).' + upper_char)
    q = p.match('\n' + lower_char)
    self.assertTrue(q)

    p = re.compile('(?s)(?i).' + lower_char)
    q = p.match('\n' + upper_char)
    self.assertTrue(q)

    # Combinations with verbose mode; the spaces in these patterns are
    # part of what is being tested.
    self.assertTrue(re.match('(?ix) ' + upper_char, lower_char))
    self.assertTrue(re.match('(?ix) ' + lower_char, upper_char))
    self.assertTrue(re.match(' (?i) ' + upper_char, lower_char, re.X))
    self.assertTrue(re.match('(?x) (?i) ' + upper_char, lower_char))
    self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char, re.X))

    # A flag group after the first character: the match still succeeds,
    # and the warning message quotes the whole pattern.
    p = upper_char + '(?i)'
    with self.assertWarns(DeprecationWarning) as warns:
        self.assertTrue(re.match(p, lower_char))
    self.assertEqual(
        str(warns.warnings[0].message),
        'Flags not at the start of the expression %r' % p
    )
    self.assertEqual(warns.warnings[0].filename, __file__)

    # With a long pattern the message quotes only the first 20 characters
    # and is marked "(truncated)".
    p = upper_char + '(?i)%s' % ('.?' * 100)
    with self.assertWarns(DeprecationWarning) as warns:
        self.assertTrue(re.match(p, lower_char))
    self.assertEqual(
        str(warns.warnings[0].message),
        'Flags not at the start of the expression %r (truncated)' % p[:20]
    )
    self.assertEqual(warns.warnings[0].filename, __file__)

    # bpo-30605: Compiling a bytes instance regex was throwing a BytesWarning
    with warnings.catch_warnings():
        # Turn BytesWarning into an error so that it cannot pass unnoticed.
        warnings.simplefilter('error', BytesWarning)
        p = b'A(?i)'
        with self.assertWarns(DeprecationWarning) as warns:
            self.assertTrue(re.match(p, b'a'))
            self.assertEqual(
                str(warns.warnings[0].message),
                'Flags not at the start of the expression %r' % p
            )
            self.assertEqual(warns.warnings[0].filename, __file__)

    # Other placements of a late flag group that must warn.
    with self.assertWarns(DeprecationWarning):
        self.assertTrue(re.match('(?s).(?i)' + upper_char, '\n' + lower_char))
    with self.assertWarns(DeprecationWarning):
        self.assertTrue(re.match('(?i) ' + upper_char + ' (?x)', lower_char))
    with self.assertWarns(DeprecationWarning):
        self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char))
    with self.assertWarns(DeprecationWarning):
        self.assertTrue(re.match('^(?i)' + upper_char, lower_char))
    with self.assertWarns(DeprecationWarning):
        self.assertTrue(re.match('$|(?i)' + upper_char, lower_char))
    # Flag group nested in a non-capturing group.
    with self.assertWarns(DeprecationWarning) as warns:
        self.assertTrue(re.match('(?:(?i)' + upper_char + ')', lower_char))
    self.assertRegex(str(warns.warnings[0].message),
                     'Flags not at the start')
    self.assertEqual(warns.warnings[0].filename, __file__)
    # Flag group in the "yes" branch of a conditional group.
    with self.assertWarns(DeprecationWarning) as warns:
        self.assertTrue(re.fullmatch('(^)?(?(1)(?i)' + upper_char + ')',
                                     lower_char))
    self.assertRegex(str(warns.warnings[0].message),
                     'Flags not at the start')
    self.assertEqual(warns.warnings[0].filename, __file__)
    # Flag group in the "no" branch of a conditional group.
    with self.assertWarns(DeprecationWarning) as warns:
        self.assertTrue(re.fullmatch('($)?(?(1)|(?i)' + upper_char + ')',
                                     lower_char))
    self.assertRegex(str(warns.warnings[0].message),
                     'Flags not at the start')
    self.assertEqual(warns.warnings[0].filename, __file__)
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001467
1468
Christian Heimes25bb7832008-01-11 16:17:00 +00001469 def test_dollar_matches_twice(self):
1470 "$ matches the end of string, and just before the terminating \n"
1471 pattern = re.compile('$')
1472 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1473 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1474 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1475
1476 pattern = re.compile('$', re.MULTILINE)
1477 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1478 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1479 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1480
Antoine Pitroufd036452008-08-19 17:56:33 +00001481 def test_bytes_str_mixing(self):
1482 # Mixing str and bytes is disallowed
1483 pat = re.compile('.')
1484 bpat = re.compile(b'.')
1485 self.assertRaises(TypeError, pat.match, b'b')
1486 self.assertRaises(TypeError, bpat.match, 'b')
1487 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1488 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1489 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1490 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1491 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1492 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1493
1494 def test_ascii_and_unicode_flag(self):
1495 # String patterns
1496 for flags in (0, re.UNICODE):
1497 pat = re.compile('\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001498 self.assertTrue(pat.match('\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001499 pat = re.compile(r'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001500 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001501 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001502 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001503 pat = re.compile('(?a)\xc0', re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001504 self.assertIsNone(pat.match('\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001505 pat = re.compile(r'\w', re.ASCII)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001506 self.assertIsNone(pat.match('\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001507 pat = re.compile(r'(?a)\w')
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001508 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001509 # Bytes patterns
1510 for flags in (0, re.ASCII):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001511 pat = re.compile(b'\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001512 self.assertIsNone(pat.match(b'\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001513 pat = re.compile(br'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001514 self.assertIsNone(pat.match(b'\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001515 # Incompatibilities
R David Murray44b548d2016-09-08 13:59:53 -04001516 self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE)
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001517 self.assertRaises(re.error, re.compile, br'(?u)\w')
R David Murray44b548d2016-09-08 13:59:53 -04001518 self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII)
1519 self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII)
1520 self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE)
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001521 self.assertRaises(re.error, re.compile, r'(?au)\w')
Antoine Pitroufd036452008-08-19 17:56:33 +00001522
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001523 def test_locale_flag(self):
1524 import locale
Benjamin Peterson21a74312017-03-07 22:48:09 -08001525 _, enc = locale.getlocale(locale.LC_CTYPE)
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001526 # Search non-ASCII letter
1527 for i in range(128, 256):
1528 try:
1529 c = bytes([i]).decode(enc)
1530 sletter = c.lower()
1531 if sletter == c: continue
1532 bletter = sletter.encode(enc)
1533 if len(bletter) != 1: continue
1534 if bletter.decode(enc) != sletter: continue
1535 bpat = re.escape(bytes([i]))
1536 break
1537 except (UnicodeError, TypeError):
1538 pass
Benjamin Peterson1e687162017-03-01 21:53:00 -08001539 else:
1540 bletter = None
1541 bpat = b'A'
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001542 # Bytes patterns
1543 pat = re.compile(bpat, re.LOCALE | re.IGNORECASE)
1544 if bletter:
1545 self.assertTrue(pat.match(bletter))
1546 pat = re.compile(b'(?L)' + bpat, re.IGNORECASE)
1547 if bletter:
1548 self.assertTrue(pat.match(bletter))
1549 pat = re.compile(bpat, re.IGNORECASE)
1550 if bletter:
1551 self.assertIsNone(pat.match(bletter))
R David Murray44b548d2016-09-08 13:59:53 -04001552 pat = re.compile(br'\w', re.LOCALE)
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001553 if bletter:
1554 self.assertTrue(pat.match(bletter))
R David Murray44b548d2016-09-08 13:59:53 -04001555 pat = re.compile(br'(?L)\w')
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001556 if bletter:
1557 self.assertTrue(pat.match(bletter))
R David Murray44b548d2016-09-08 13:59:53 -04001558 pat = re.compile(br'\w')
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001559 if bletter:
1560 self.assertIsNone(pat.match(bletter))
1561 # Incompatibilities
Serhiy Storchaka9bd85b82016-06-11 19:15:00 +03001562 self.assertRaises(ValueError, re.compile, '', re.LOCALE)
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001563 self.assertRaises(re.error, re.compile, '(?L)')
Serhiy Storchaka9bd85b82016-06-11 19:15:00 +03001564 self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII)
1565 self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII)
1566 self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE)
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001567 self.assertRaises(re.error, re.compile, b'(?aL)')
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001568
def test_scoped_flags(self):
    """Check flag groups of the form ``(?flags:...)`` and ``(?-flags:...)``."""
    # (pattern, subject, flags, whether a match is expected)
    match_cases = (
        (r'(?i:a)b', 'Ab', 0, True),
        (r'(?i:a)b', 'aB', 0, False),
        (r'(?-i:a)b', 'Ab', re.IGNORECASE, False),
        (r'(?-i:a)b', 'aB', re.IGNORECASE, True),
        (r'(?i:(?-i:a)b)', 'Ab', 0, False),
        (r'(?i:(?-i:a)b)', 'aB', 0, True),

        (r'(?x: a) b', 'a b', 0, True),
        (r'(?x: a) b', ' a b', 0, False),
        (r'(?-x: a) b', ' ab', re.VERBOSE, True),
        (r'(?-x: a) b', 'ab', re.VERBOSE, False),

        (r'\w(?a:\W)\w', '\xe0\xe0\xe0', 0, True),
        (r'(?a:\W(?u:\w)\W)', '\xe0\xe0\xe0', 0, True),
        (r'\W(?u:\w)\W', '\xe0\xe0\xe0', re.ASCII, True),
    )
    for pattern, subject, flags, should_match in match_cases:
        result = re.match(pattern, subject, flags)
        if should_match:
            self.assertTrue(result)
        else:
            self.assertIsNone(result)

    incompatible = "bad inline flags: flags 'a', 'u' and 'L' are incompatible"
    # (pattern, expected message, expected error position)
    error_cases = (
        (r'(?a)(?-a:\w)',
         "bad inline flags: cannot turn off flags 'a', 'u' and 'L'", 8),
        (r'(?i-i:a)', 'bad inline flags: flag turned on and off', 5),
        (r'(?au:a)', incompatible, 4),
        (br'(?aL:a)', incompatible, 4),

        (r'(?-', 'missing flag', 3),
        (r'(?-+', 'missing flag', 3),
        (r'(?-z', 'unknown flag', 3),
        (r'(?-i', 'missing :', 4),
        (r'(?-i)', 'missing :', 4),
        (r'(?-i+', 'missing :', 4),
        (r'(?-iz', 'unknown flag', 4),
        (r'(?i:', 'missing ), unterminated subpattern', 0),
        (r'(?i', 'missing -, : or )', 3),
        (r'(?i+', 'missing -, : or )', 3),
        (r'(?iz', 'unknown flag', 3),
    )
    for pattern, message, position in error_cases:
        self.checkPatternError(pattern, message, position)
1606
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001607 def test_bug_6509(self):
1608 # Replacement strings of both types must parse properly.
1609 # all strings
R David Murray44b548d2016-09-08 13:59:53 -04001610 pat = re.compile(r'a(\w)')
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001611 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1612 pat = re.compile('a(.)')
1613 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1614 pat = re.compile('..')
1615 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1616
1617 # all bytes
R David Murray44b548d2016-09-08 13:59:53 -04001618 pat = re.compile(br'a(\w)')
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001619 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1620 pat = re.compile(b'a(.)')
1621 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1622 pat = re.compile(b'..')
1623 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1624
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001625 def test_dealloc(self):
1626 # issue 3299: check for segfault in debug build
1627 import _sre
Ezio Melotti0f77f462010-01-23 10:49:39 +00001628 # the overflow limit is different on wide and narrow builds and it
1629 # depends on the definition of SRE_CODE (see sre.h).
1630 # 2**128 should be big enough to overflow on both. For smaller values
1631 # a RuntimeError is raised instead of OverflowError.
1632 long_overflow = 2**128
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001633 self.assertRaises(TypeError, re.finditer, "a", {})
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001634 with self.assertRaises(OverflowError):
Victor Stinner726a57d2016-11-22 23:04:39 +01001635 _sre.compile("abc", 0, [long_overflow], 0, {}, ())
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001636 with self.assertRaises(TypeError):
1637 _sre.compile({}, 0, [], 0, [], [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001639 def test_search_dot_unicode(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001640 self.assertTrue(re.search("123.*-", '123abc-'))
1641 self.assertTrue(re.search("123.*-", '123\xe9-'))
1642 self.assertTrue(re.search("123.*-", '123\u20ac-'))
1643 self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1644 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001645
Ezio Melottidf723e12012-03-13 01:29:48 +02001646 def test_compile(self):
1647 # Test return value when given string and pattern as parameter
1648 pattern = re.compile('random pattern')
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03001649 self.assertIsInstance(pattern, re.Pattern)
Ezio Melottidf723e12012-03-13 01:29:48 +02001650 same_pattern = re.compile(pattern)
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03001651 self.assertIsInstance(same_pattern, re.Pattern)
Ezio Melottidf723e12012-03-13 01:29:48 +02001652 self.assertIs(same_pattern, pattern)
1653 # Test behaviour when not given a string or pattern as parameter
1654 self.assertRaises(TypeError, re.compile, 0)
1655
Antoine Pitroub33941a2012-12-03 20:55:56 +01001656 @bigmemtest(size=_2G, memuse=1)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001657 def test_large_search(self, size):
1658 # Issue #10182: indices were 32-bit-truncated.
1659 s = 'a' * size
1660 m = re.search('$', s)
1661 self.assertIsNotNone(m)
Antoine Pitrou86067c22012-12-03 21:08:43 +01001662 self.assertEqual(m.start(), size)
1663 self.assertEqual(m.end(), size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001664
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001665 # The huge memuse is because of re.sub() using a list and a join()
1666 # to create the replacement result.
Antoine Pitroub33941a2012-12-03 20:55:56 +01001667 @bigmemtest(size=_2G, memuse=16 + 2)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001668 def test_large_subn(self, size):
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001669 # Issue #10182: indices were 32-bit-truncated.
1670 s = 'a' * size
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001671 r, n = re.subn('', '', s)
1672 self.assertEqual(r, s)
1673 self.assertEqual(n, size + 1)
1674
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001675 def test_bug_16688(self):
1676 # Issue 16688: Backreferences make case-insensitive regex fail on
1677 # non-ASCII strings.
1678 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1679 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001680
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001681 def test_repeat_minmax_overflow(self):
1682 # Issue #13169
1683 string = "x" * 100000
1684 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1685 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1686 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1687 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1688 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1689 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1690 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1691 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1692 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1693 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1694 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1695
1696 @cpython_only
1697 def test_repeat_minmax_overflow_maxrepeat(self):
1698 try:
1699 from _sre import MAXREPEAT
1700 except ImportError:
1701 self.skipTest('requires _sre.MAXREPEAT constant')
1702 string = "x" * 100000
1703 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1704 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1705 (0, 100000))
1706 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1707 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1708 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1709 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1710
R David Murray26dfaac92013-04-14 13:00:54 -04001711 def test_backref_group_name_in_exception(self):
1712 # Issue 17341: Poor error message when compiling invalid regex
Serhiy Storchaka632a77e2015-03-25 21:03:47 +02001713 self.checkPatternError('(?P=<foo>)',
1714 "bad character in group name '<foo>'", 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001715
1716 def test_group_name_in_exception(self):
1717 # Issue 17341: Poor error message when compiling invalid regex
Serhiy Storchaka632a77e2015-03-25 21:03:47 +02001718 self.checkPatternError('(?P<?foo>)',
1719 "bad character in group name '?foo'", 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001720
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001721 def test_issue17998(self):
1722 for reps in '*', '+', '?', '{1}':
1723 for mod in '', '?':
1724 pattern = '.' + reps + mod + 'yz'
1725 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1726 ['xyz'], msg=pattern)
1727 pattern = pattern.encode()
1728 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1729 [b'xyz'], msg=pattern)
1730
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03001731 def test_match_repr(self):
1732 for string in '[abracadabra]', S('[abracadabra]'):
1733 m = re.search(r'(.+)(.*?)\1', string)
1734 self.assertEqual(repr(m), "<%s.%s object; "
1735 "span=(1, 12), match='abracadabra'>" %
1736 (type(m).__module__, type(m).__qualname__))
1737 for string in (b'[abracadabra]', B(b'[abracadabra]'),
1738 bytearray(b'[abracadabra]'),
1739 memoryview(b'[abracadabra]')):
R David Murray44b548d2016-09-08 13:59:53 -04001740 m = re.search(br'(.+)(.*?)\1', string)
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03001741 self.assertEqual(repr(m), "<%s.%s object; "
1742 "span=(1, 12), match=b'abracadabra'>" %
1743 (type(m).__module__, type(m).__qualname__))
1744
1745 first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
1746 self.assertEqual(repr(first), "<%s.%s object; "
1747 "span=(0, 2), match='aa'>" %
1748 (type(second).__module__, type(first).__qualname__))
1749 self.assertEqual(repr(second), "<%s.%s object; "
1750 "span=(3, 5), match='bb'>" %
1751 (type(second).__module__, type(second).__qualname__))
1752
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001753 def test_zerowidth(self):
1754 # Issues 852532, 1647489, 3262, 25054.
1755 self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', ''])
1756 self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', 'bc', ''])
1757 self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', 'bc'])
1758 self.assertEqual(re.split(r"(?<=\w)(?!\w)|:+", "a::bc"), ['a', '', 'bc', ''])
1759
1760 self.assertEqual(re.sub(r"\b", "-", "a::bc"), '-a-::-bc-')
1761 self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a--bc-')
1762 self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::]bc[]')
1763
1764 self.assertEqual(re.findall(r"\b|:+", "a::bc"), ['', '', '::', '', ''])
1765 self.assertEqual(re.findall(r"\b|\w+", "a::bc"),
1766 ['', 'a', '', '', 'bc', ''])
1767
1768 self.assertEqual([m.span() for m in re.finditer(r"\b|:+", "a::bc")],
1769 [(0, 0), (1, 1), (1, 3), (3, 3), (5, 5)])
1770 self.assertEqual([m.span() for m in re.finditer(r"\b|\w+", "a::bc")],
1771 [(0, 0), (0, 1), (1, 1), (3, 3), (3, 5), (5, 5)])
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001772
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001773 def test_bug_2537(self):
1774 # issue 2537: empty submatches
1775 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1776 for inner_op in ('{0,}', '*', '?'):
1777 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1778 m = r.match("xyyzy")
1779 self.assertEqual(m.group(0), "xyy")
1780 self.assertEqual(m.group(1), "")
1781 self.assertEqual(m.group(2), "y")
1782
    @cpython_only
    def test_debug_flag(self):
        # Compile a pattern with re.DEBUG and compare everything written to
        # stdout against the expected dump, character for character.  The
        # dump has two parts: the parsed pattern tree, then a blank line,
        # then the listing of the compiled code.
        pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
        with captured_stdout() as out:
            re.compile(pat, re.DEBUG)
        # Show the complete diff if the comparison below fails.
        self.maxDiff = None
        dump = '''\
SUBPATTERN 1 0 0
  LITERAL 46
BRANCH
  IN
    LITERAL 99
    LITERAL 104
OR
  LITERAL 112
  LITERAL 121
GROUPREF_EXISTS 1
  AT AT_END
ELSE
  LITERAL 58
  LITERAL 32

 0. INFO 8 0b1 2 5 (to 9)
      prefix_skip 0
      prefix [0x2e] ('.')
      overlap [0]
 9: MARK 0
11. LITERAL 0x2e ('.')
13. MARK 1
15. BRANCH 10 (to 26)
17.   IN 6 (to 24)
19.     LITERAL 0x63 ('c')
21.     LITERAL 0x68 ('h')
23.     FAILURE
24:   JUMP 9 (to 34)
26: branch 7 (to 33)
27.   LITERAL 0x70 ('p')
29.   LITERAL 0x79 ('y')
31.   JUMP 2 (to 34)
33: FAILURE
34: GROUPREF_EXISTS 0 6 (to 41)
37. AT END
39. JUMP 5 (to 45)
41: LITERAL 0x3a (':')
43. LITERAL 0x20 (' ')
45: SUCCESS
'''
        self.assertEqual(out.getvalue(), dump)
        # Debug output is output again even a second time (bypassing
        # the cache -- issue #20426).
        with captured_stdout() as out:
            re.compile(pat, re.DEBUG)
        self.assertEqual(out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001836
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001837 def test_keyword_parameters(self):
1838 # Issue #20283: Accepting the string keyword parameter.
1839 pat = re.compile(r'(ab)')
1840 self.assertEqual(
1841 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1842 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001843 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1844 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001845 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1846 self.assertEqual(
1847 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1848 self.assertEqual(
1849 pat.split(string='abracadabra', maxsplit=1),
1850 ['', 'ab', 'racadabra'])
1851 self.assertEqual(
1852 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1853 (7, 9))
1854
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001855 def test_bug_20998(self):
1856 # Issue #20998: Fullmatch of repeated single character pattern
1857 # with ignore case.
1858 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1859
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02001860 def test_locale_caching(self):
1861 # Issue #22410
1862 oldlocale = locale.setlocale(locale.LC_CTYPE)
1863 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1864 for loc in 'en_US.iso88591', 'en_US.utf8':
1865 try:
1866 locale.setlocale(locale.LC_CTYPE, loc)
1867 except locale.Error:
1868 # Unsupported locale on this system
1869 self.skipTest('test needs %s locale' % loc)
1870
1871 re.purge()
1872 self.check_en_US_iso88591()
1873 self.check_en_US_utf8()
1874 re.purge()
1875 self.check_en_US_utf8()
1876 self.check_en_US_iso88591()
1877
1878 def check_en_US_iso88591(self):
1879 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
1880 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1881 self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))
1882 self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I))
1883 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1884 self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5'))
1885 self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5'))
1886
1887 def check_en_US_utf8(self):
1888 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
1889 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1890 self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I))
1891 self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I))
1892 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1893 self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
1894 self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
1895
Serhiy Storchaka898ff032017-05-05 08:53:40 +03001896 def test_locale_compiled(self):
1897 oldlocale = locale.setlocale(locale.LC_CTYPE)
1898 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1899 for loc in 'en_US.iso88591', 'en_US.utf8':
1900 try:
1901 locale.setlocale(locale.LC_CTYPE, loc)
1902 except locale.Error:
1903 # Unsupported locale on this system
1904 self.skipTest('test needs %s locale' % loc)
1905
1906 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
1907 p1 = re.compile(b'\xc5\xe5', re.L|re.I)
1908 p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
1909 p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
1910 p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
1911 for p in p1, p2, p3:
1912 self.assertTrue(p.match(b'\xc5\xe5'))
1913 self.assertTrue(p.match(b'\xe5\xe5'))
1914 self.assertTrue(p.match(b'\xc5\xc5'))
1915 self.assertIsNone(p4.match(b'\xe5\xc5'))
1916 self.assertIsNone(p4.match(b'\xe5\xe5'))
1917 self.assertIsNone(p4.match(b'\xc5\xc5'))
1918
1919 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
1920 for p in p1, p2, p3:
1921 self.assertTrue(p.match(b'\xc5\xe5'))
1922 self.assertIsNone(p.match(b'\xe5\xe5'))
1923 self.assertIsNone(p.match(b'\xc5\xc5'))
1924 self.assertTrue(p4.match(b'\xe5\xc5'))
1925 self.assertIsNone(p4.match(b'\xe5\xe5'))
1926 self.assertIsNone(p4.match(b'\xc5\xc5'))
1927
Serhiy Storchakaad446d52014-11-10 13:49:00 +02001928 def test_error(self):
1929 with self.assertRaises(re.error) as cm:
1930 re.compile('(\u20ac))')
1931 err = cm.exception
1932 self.assertIsInstance(err.pattern, str)
1933 self.assertEqual(err.pattern, '(\u20ac))')
1934 self.assertEqual(err.pos, 3)
1935 self.assertEqual(err.lineno, 1)
1936 self.assertEqual(err.colno, 4)
1937 self.assertIn(err.msg, str(err))
1938 self.assertIn(' at position 3', str(err))
1939 self.assertNotIn(' at position 3', err.msg)
1940 # Bytes pattern
1941 with self.assertRaises(re.error) as cm:
1942 re.compile(b'(\xa4))')
1943 err = cm.exception
1944 self.assertIsInstance(err.pattern, bytes)
1945 self.assertEqual(err.pattern, b'(\xa4))')
1946 self.assertEqual(err.pos, 3)
1947 # Multiline pattern
1948 with self.assertRaises(re.error) as cm:
1949 re.compile("""
1950 (
1951 abc
1952 )
1953 )
1954 (
1955 """, re.VERBOSE)
1956 err = cm.exception
1957 self.assertEqual(err.pos, 77)
1958 self.assertEqual(err.lineno, 5)
1959 self.assertEqual(err.colno, 17)
1960 self.assertIn(err.msg, str(err))
1961 self.assertIn(' at position 77', str(err))
1962 self.assertIn('(line 5, column 17)', str(err))
1963
Serhiy Storchaka632a77e2015-03-25 21:03:47 +02001964 def test_misc_errors(self):
1965 self.checkPatternError(r'(', 'missing ), unterminated subpattern', 0)
1966 self.checkPatternError(r'((a|b)', 'missing ), unterminated subpattern', 0)
1967 self.checkPatternError(r'(a|b))', 'unbalanced parenthesis', 5)
1968 self.checkPatternError(r'(?P', 'unexpected end of pattern', 3)
1969 self.checkPatternError(r'(?z)', 'unknown extension ?z', 1)
1970 self.checkPatternError(r'(?iz)', 'unknown flag', 3)
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +03001971 self.checkPatternError(r'(?i', 'missing -, : or )', 3)
Serhiy Storchaka632a77e2015-03-25 21:03:47 +02001972 self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0)
1973 self.checkPatternError(r'(?<', 'unexpected end of pattern', 3)
1974 self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1)
1975 self.checkPatternError(r'(?', 'unexpected end of pattern', 2)
1976
Victor Stinner8bf43e62016-11-14 12:38:43 +01001977 def test_enum(self):
1978 # Issue #28082: Check that str(flag) returns a human readable string
1979 # instead of an integer
1980 self.assertIn('ASCII', str(re.A))
1981 self.assertIn('DOTALL', str(re.S))
1982
Victor Stinnerb44fb122016-11-21 16:35:08 +01001983 def test_pattern_compare(self):
1984 pattern1 = re.compile('abc', re.IGNORECASE)
1985
Victor Stinnerbcf4dcc2016-11-22 15:30:38 +01001986 # equal to itself
1987 self.assertEqual(pattern1, pattern1)
1988 self.assertFalse(pattern1 != pattern1)
1989
Victor Stinnerb44fb122016-11-21 16:35:08 +01001990 # equal
1991 re.purge()
1992 pattern2 = re.compile('abc', re.IGNORECASE)
1993 self.assertEqual(hash(pattern2), hash(pattern1))
1994 self.assertEqual(pattern2, pattern1)
1995
1996 # not equal: different pattern
1997 re.purge()
1998 pattern3 = re.compile('XYZ', re.IGNORECASE)
1999 # Don't test hash(pattern3) != hash(pattern1) because there is no
2000 # warranty that hash values are different
2001 self.assertNotEqual(pattern3, pattern1)
2002
2003 # not equal: different flag (flags=0)
2004 re.purge()
2005 pattern4 = re.compile('abc')
2006 self.assertNotEqual(pattern4, pattern1)
2007
2008 # only == and != comparison operators are supported
2009 with self.assertRaises(TypeError):
2010 pattern1 < pattern2
2011
2012 def test_pattern_compare_bytes(self):
2013 pattern1 = re.compile(b'abc')
2014
2015 # equal: test bytes patterns
2016 re.purge()
2017 pattern2 = re.compile(b'abc')
2018 self.assertEqual(hash(pattern2), hash(pattern1))
2019 self.assertEqual(pattern2, pattern1)
2020
2021 # not equal: pattern of a different types (str vs bytes),
2022 # comparison must not raise a BytesWarning
2023 re.purge()
2024 pattern3 = re.compile('abc')
2025 with warnings.catch_warnings():
2026 warnings.simplefilter('error', BytesWarning)
2027 self.assertNotEqual(pattern3, pattern1)
2028
Serhiy Storchaka7e10dbb2017-02-04 22:53:57 +02002029 def test_bug_29444(self):
2030 s = bytearray(b'abcdefgh')
2031 m = re.search(b'[a-h]+', s)
2032 m2 = re.search(b'[e-h]+', s)
2033 self.assertEqual(m.group(), b'abcdefgh')
2034 self.assertEqual(m2.group(), b'efgh')
2035 s[:] = b'xyz'
2036 self.assertEqual(m.group(), b'xyz')
2037 self.assertEqual(m2.group(), b'')
2038
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02002039
class PatternReprTests(unittest.TestCase):
    # Checks of repr() for compiled pattern objects.

    def check(self, pattern, expected):
        # Compile without flags and compare the repr with `expected`.
        compiled = re.compile(pattern)
        self.assertEqual(repr(compiled), expected)

    def check_flags(self, pattern, flags, expected):
        # Compile with explicit flags and compare the repr with `expected`.
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        expected = "re.compile('random pattern')"
        self.check('random pattern', expected)

    def test_single_flag(self):
        expected = "re.compile('random pattern', re.IGNORECASE)"
        self.check_flags('random pattern', re.IGNORECASE, expected)

    def test_multiple_flags(self):
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.check_flags('random pattern', re.I|re.S|re.X, expected)

    def test_unicode_flag(self):
        # re.U is expected to be left out of the repr.
        self.check_flags('random pattern', re.U,
                         "re.compile('random pattern')")
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL)")
        self.check_flags('random pattern', re.I|re.S|re.U, expected)

    def test_inline_flags(self):
        # A flag given inline in the pattern shows up in the repr as well.
        expected = "re.compile('(?i)pattern', re.IGNORECASE)"
        self.check('(?i)pattern', expected)

    def test_unknown_flags(self):
        # Bits without a name are rendered in hexadecimal.
        self.check_flags('random pattern', 0x123000,
                         "re.compile('random pattern', 0x123000)")
        expected = "re.compile('random pattern', re.IGNORECASE|0x123000)"
        self.check_flags('random pattern', 0x123000|re.I, expected)

    def test_bytes(self):
        self.check(b'bytes pattern',
                   "re.compile(b'bytes pattern')")
        expected = "re.compile(b'bytes pattern', re.ASCII)"
        self.check_flags(b'bytes pattern', re.A, expected)

    def test_locale(self):
        expected = "re.compile(b'bytes pattern', re.LOCALE)"
        self.check_flags(b'bytes pattern', re.L, expected)

    def test_quotes(self):
        cases = (
            ('random "double quoted" pattern',
             '''re.compile('random "double quoted" pattern')'''),
            ("random 'single quoted' pattern",
             '''re.compile("random 'single quoted' pattern")'''),
            ('''both 'single' and "double" quotes''',
             '''re.compile('both \\'single\\' and "double" quotes')'''),
        )
        for pattern, expected in cases:
            self.check(pattern, expected)

    def test_long_pattern(self):
        # The repr of a very long pattern is truncated, while the start of
        # the pattern and any trailing flags stay visible.
        pattern = 'Very %spattern' % ('long ' * 1000)
        head = "re.compile('Very long long lon"

        plain = repr(re.compile(pattern))
        self.assertLess(len(plain), 300)
        self.assertEqual(plain[:30], head)

        flagged = repr(re.compile(pattern, re.I))
        self.assertLess(len(flagged), 300)
        self.assertEqual(flagged[:30], head)
        self.assertEqual(flagged[-16:], ", re.IGNORECASE)")
2104
2105
class ImplementationTest(unittest.TestCase):
    """
    Test implementation details of the re module.
    """

    def test_overlap_table(self):
        # Each row pairs a prefix with the overlap table expected for it.
        build = sre_compile._generate_overlap_table
        cases = (
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        )
        for prefix, table in cases:
            self.assertEqual(build(prefix), table)
2119
2120
class ExternalTests(unittest.TestCase):
    # Data-driven tests: the patterns and subjects come from the
    # test.re_tests module, which is imported inside each test.

    def test_re_benchmarks(self):
        're_tests benchmarks'
        from test.re_tests import benchmarks
        for pattern, s in benchmarks:
            with self.subTest(pattern=pattern, string=s):
                p = re.compile(pattern)
                self.assertTrue(p.search(s))
                self.assertTrue(p.match(s))
                self.assertTrue(p.fullmatch(s))
                # Same checks with the subject buried in 10000 blanks on
                # either side, addressed through pos/endpos.
                s2 = ' '*10000 + s + ' '*10000
                self.assertTrue(p.search(s2))
                self.assertTrue(p.match(s2, 10000))
                self.assertTrue(p.match(s2, 10000, 10000 + len(s)))
                self.assertTrue(p.fullmatch(s2, 10000, 10000 + len(s)))

    def test_re_tests(self):
        're_tests test suite'
        from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
        for t in tests:
            # A test tuple is either (pattern, string, outcome) or
            # (pattern, string, outcome, repl, expected).
            pattern = s = outcome = repl = expected = None
            if len(t) == 5:
                pattern, s, outcome, repl, expected = t
            elif len(t) == 3:
                pattern, s, outcome = t
            else:
                raise ValueError('Test tuples should have 3 or 5 fields', t)

            with self.subTest(pattern=pattern, string=s):
                if outcome == SYNTAX_ERROR:  # Expected a syntax error
                    with self.assertRaises(re.error):
                        re.compile(pattern)
                    continue

                obj = re.compile(pattern)
                result = obj.search(s)
                if outcome == FAIL:
                    self.assertIsNone(result, 'Succeeded incorrectly')
                    continue

                with self.subTest():
                    self.assertTrue(result, 'Failed incorrectly')
                    # Matched, as expected, so now we compute the
                    # result string and compare it to our expected result.
                    start, end = result.span(0)
                    # Names made available to the `repl` expression.
                    vardict = {'found': result.group(0),
                               'groups': result.group(),
                               'flags': result.re.flags}
                    for i in range(1, 100):
                        try:
                            gi = result.group(i)
                            # Special hack because else the string concat fails:
                            if gi is None:
                                gi = "None"
                        except IndexError:
                            gi = "Error"
                        vardict['g%d' % i] = gi
                    for i in result.re.groupindex.keys():
                        try:
                            gi = result.group(i)
                            if gi is None:
                                gi = "None"
                        except IndexError:
                            gi = "Error"
                        vardict[i] = gi
                    # NOTE(review): `repl` is evaluated with eval(); this
                    # presumes the test.re_tests data is trusted.
                    self.assertEqual(eval(repl, vardict), expected,
                                     'grouping error')

                # Try the match with both pattern and string converted to
                # bytes, and check that it still succeeds.
                try:
                    bpat = bytes(pattern, "ascii")
                    bs = bytes(s, "ascii")
                except UnicodeEncodeError:
                    # skip non-ascii tests
                    pass
                else:
                    with self.subTest('bytes pattern match'):
                        obj = re.compile(bpat)
                        self.assertTrue(obj.search(bs))

                    # Try the match with LOCALE enabled, and check that it
                    # still succeeds.
                    # A failure here is only printed, not asserted.  Note
                    # that `result` is rebound to the bytes match, and the
                    # range-limited check below tests that rebound value.
                    with self.subTest('locale-sensitive match'):
                        obj = re.compile(bpat, re.LOCALE)
                        result = obj.search(bs)
                        if result is None:
                            print('=== Fails on locale-sensitive match', t)

                # Try the match with the search area limited to the extent
                # of the match and see if it still succeeds.  \B will
                # break (because it won't match at the end or start of a
                # string), so we'll ignore patterns that feature it.
                if (pattern[:2] != r'\B' and pattern[-2:] != r'\B'
                            and result is not None):
                    with self.subTest('range-limited match'):
                        obj = re.compile(pattern)
                        self.assertTrue(obj.search(s, start, end + 1))

                # Try the match with IGNORECASE enabled, and check that it
                # still succeeds.
                with self.subTest('case-insensitive match'):
                    obj = re.compile(pattern, re.IGNORECASE)
                    self.assertTrue(obj.search(s))

                # Try the match with UNICODE locale enabled, and check
                # that it still succeeds.
                with self.subTest('unicode-sensitive match'):
                    obj = re.compile(pattern, re.UNICODE)
                    self.assertTrue(obj.search(s))
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00002232
Gregory P. Smith5a631832010-07-27 05:31:29 +00002233
# Run every test in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()