blob: 4817d761a22df91c1fdf4cc53a11ac7d1ba3938c [file] [log] [blame]
Victor Stinnerd6debb22017-03-27 16:05:26 +02001from test.support import (gc_collect, bigmemtest, _2G,
2 cpython_only, captured_stdout)
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02003import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00004import re
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02005import sre_compile
Ezio Melottid2114eb2011-03-25 14:08:44 +02006import string
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02007import unittest
Victor Stinnerb44fb122016-11-21 16:35:08 +01008import warnings
9from re import Scanner
Raymond Hettinger027bb632004-05-31 03:09:25 +000010from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000011
# Misc tests from Tim Peters' re.doc

# WARNING: Don't change details in these tests if you don't know
# what you're doing. Some of these tests were carefully modeled to
# cover most of the code.
17
class S(str):
    """str subclass whose item access returns S again instead of plain str."""

    def __getitem__(self, index):
        # Re-wrap whatever str.__getitem__ produced (single character or
        # slice) so the subclass type is preserved.
        return S(super().__getitem__(index))
21
class B(bytes):
    """bytes subclass whose item access returns B again instead of int/bytes."""

    def __getitem__(self, index):
        # Re-wrap the result of bytes.__getitem__ in B. For an integer
        # index the parent returns an int, so B(int) yields that many
        # zero bytes, exactly as the bytes constructor does.
        return B(super().__getitem__(index))
25
Skip Montanaro8ed06da2003-04-24 19:43:18 +000026class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000027
def assertTypedEqual(self, actual, expect, msg=None):
    # Assert that `actual` equals `expect` and, in addition, that every
    # leaf value has exactly the same type as its expected counterpart.
    # Tuples and lists in `expect` are walked recursively, pairing items
    # with zip(); any other value is treated as a leaf.
    self.assertEqual(actual, expect, msg)
    def recurse(actual, expect):
        if isinstance(expect, (tuple, list)):
            for x, y in zip(actual, expect):
                recurse(x, y)
        else:
            self.assertIs(type(actual), type(expect), msg)
    recurse(actual, expect)
37
def checkPatternError(self, pattern, errmsg, pos=None):
    # Assert that compiling `pattern` raises re.error whose .msg equals
    # `errmsg` and, when `pos` is given, whose .pos equals `pos`.
    with self.assertRaises(re.error) as cm:
        re.compile(pattern)
    # The attribute checks run inside a subTest parameterised with the
    # pattern.
    with self.subTest(pattern=pattern):
        err = cm.exception
        self.assertEqual(err.msg, errmsg)
        if pos is not None:
            self.assertEqual(err.pos, pos)
46
def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
    # Assert that re.sub(pattern, repl, string) raises re.error whose
    # .msg equals `errmsg` and, when `pos` is given, whose .pos equals
    # `pos`.
    with self.assertRaises(re.error) as cm:
        re.sub(pattern, repl, string)
    # The attribute checks run inside a subTest parameterised with the
    # pattern and the replacement template.
    with self.subTest(pattern=pattern, repl=repl):
        err = cm.exception
        self.assertEqual(err.msg, errmsg)
        if pos is not None:
            self.assertEqual(err.pos, pos)
55
Benjamin Petersone48944b2012-03-07 14:50:25 -060056 def test_keep_buffer(self):
57 # See bug 14212
58 b = bytearray(b'x')
59 it = re.finditer(b'a', b)
60 with self.assertRaises(BufferError):
61 b.extend(b'x'*400)
62 list(it)
63 del it
64 gc_collect()
65 b.extend(b'x'*400)
66
Raymond Hettinger027bb632004-05-31 03:09:25 +000067 def test_weakref(self):
68 s = 'QabbbcR'
69 x = re.compile('ab+c')
70 y = proxy(x)
71 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
72
Skip Montanaro8ed06da2003-04-24 19:43:18 +000073 def test_search_star_plus(self):
74 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
75 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
76 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
77 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030078 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000079 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
80 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
81 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
82 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030083 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000084
def bump_num(self, matchobj):
    # Replacement callback for re.sub(): parse the whole match as an int
    # and return the decimal string of that value plus one.
    int_value = int(matchobj.group(0))
    return str(int_value + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000088
def test_basic_re_sub(self):
    # Result type must be plain str/bytes whatever the argument types.
    self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
    self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
    self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
    self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
    for y in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')

    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    # Callable replacement, with count passed positionally and by keyword.
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                     '9.3 -3 23x99y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3),
                     '9.3 -3 23x99y')

    # A callable's return value is used verbatim; a template string has
    # its escapes processed.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', s.replace('\\', r'\\'), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
    for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
        with self.subTest(c):
            # The call is expected to raise, so there is no result to
            # compare; only the exception is asserted.
            with self.assertRaises(re.error):
                re.sub('a', '\\' + c, 'a')

    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000130
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000131 def test_bug_449964(self):
132 # fails for group followed by other escape
R David Murray44b548d2016-09-08 13:59:53 -0400133 self.assertEqual(re.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx'),
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000134 'xx\bxx\b')
135
136 def test_bug_449000(self):
137 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000138 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
139 'abc\ndef\n')
140 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
141 'abc\ndef\n')
142 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
143 'abc\ndef\n')
144 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
145 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000146
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000147 def test_bug_1661(self):
148 # Verify that flags do not get silently ignored with compiled patterns
149 pattern = re.compile('.')
150 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
151 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
152 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
153 self.assertRaises(ValueError, re.compile, pattern, re.I)
154
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000155 def test_bug_3629(self):
156 # A regex that triggered a bug in the sre-code validator
157 re.compile("(?P<quote>)(?(quote))")
158
def test_sub_template_numeric_escape(self):
    # bug 776311 and friends
    # Octal escapes in the replacement template.
    self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
    self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
    self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
    self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
    self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
    self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
    self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
    self.assertEqual(re.sub('x', r'\377', 'x'), '\377')

    self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
    self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')

    self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
    self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
    self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
    self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
    self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')

    # Octal values above 0o377 are rejected.
    self.checkTemplateError('x', r'\400', 'x',
                            r'octal escape value \400 outside of '
                            r'range 0-0o377', 0)
    self.checkTemplateError('x', r'\777', 'x',
                            r'octal escape value \777 outside of '
                            r'range 0-0o377', 0)

    # Numeric references to groups the pattern does not define.
    self.checkTemplateError('x', r'\1', 'x', 'invalid group reference 1', 1)
    self.checkTemplateError('x', r'\8', 'x', 'invalid group reference 8', 1)
    self.checkTemplateError('x', r'\9', 'x', 'invalid group reference 9', 1)
    self.checkTemplateError('x', r'\11', 'x', 'invalid group reference 11', 1)
    self.checkTemplateError('x', r'\18', 'x', 'invalid group reference 18', 1)
    self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference 1', 1)
    self.checkTemplateError('x', r'\90', 'x', 'invalid group reference 90', 1)
    self.checkTemplateError('x', r'\99', 'x', 'invalid group reference 99', 1)
    self.checkTemplateError('x', r'\118', 'x', 'invalid group reference 11', 1)
    self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference 11', 1)
    self.checkTemplateError('x', r'\181', 'x', 'invalid group reference 18', 1)
    self.checkTemplateError('x', r'\800', 'x', 'invalid group reference 80', 1)
    self.checkTemplateError('x', r'\8', '', 'invalid group reference 8', 1)

    # in python2.3 (etc), these loop endlessly in sre_parser.py
    self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
    self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
                     'xz8')
    self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
                     'xza')
206
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000207 def test_qualified_re_sub(self):
208 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
Serhiy Storchakab02f8fc2016-09-25 20:36:23 +0300209 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Victor Stinner55e614a2014-10-29 16:58:59 +0100210 self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000211
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000212 def test_bug_114660(self):
213 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
214 'hello there')
215
def test_symbolic_groups(self):
    # Valid uses of named groups, named backreferences and conditionals.
    re.compile(r'(?P<a>x)(?P=a)(?(a)y)')
    re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)')
    re.compile(r'(?P<a1>x)\1(?(1)y)')
    # Malformed group syntax: expected message and, where given, position.
    self.checkPatternError(r'(?P<a>)(?P<a>)',
                           "redefinition of group name 'a' as group 2; "
                           "was group 1")
    self.checkPatternError(r'(?P<a>(?P=a))',
                           "cannot refer to an open group", 10)
    self.checkPatternError(r'(?Pxy)', 'unknown extension ?Px')
    self.checkPatternError(r'(?P<a>)(?P=a', 'missing ), unterminated name', 11)
    self.checkPatternError(r'(?P=', 'missing group name', 4)
    self.checkPatternError(r'(?P=)', 'missing group name', 4)
    self.checkPatternError(r'(?P=1)', "bad character in group name '1'", 4)
    self.checkPatternError(r'(?P=a)', "unknown group name 'a'")
    self.checkPatternError(r'(?P=a1)', "unknown group name 'a1'")
    self.checkPatternError(r'(?P=a.)', "bad character in group name 'a.'", 4)
    self.checkPatternError(r'(?P<)', 'missing >, unterminated name', 4)
    self.checkPatternError(r'(?P<a', 'missing >, unterminated name', 4)
    self.checkPatternError(r'(?P<', 'missing group name', 4)
    self.checkPatternError(r'(?P<>)', 'missing group name', 4)
    self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
    self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
    self.checkPatternError(r'(?(', 'missing group name', 3)
    self.checkPatternError(r'(?())', 'missing group name', 3)
    self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
    self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
    self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
    self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
    # New valid/invalid identifiers in Python 3
    re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
    re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
    self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
    # Support > 100 groups.
    pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    pat = '(?:%s)(?(200)z|t)' % pat
    self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200253
def test_symbolic_refs(self):
    # Malformed \g<...> references in the replacement template.
    self.checkTemplateError('(?P<a>x)', r'\g<a', 'xx',
                            'missing >, unterminated name', 3)
    self.checkTemplateError('(?P<a>x)', r'\g<', 'xx',
                            'missing group name', 3)
    self.checkTemplateError('(?P<a>x)', r'\g', 'xx', 'missing <', 2)
    self.checkTemplateError('(?P<a>x)', r'\g<a a>', 'xx',
                            "bad character in group name 'a a'", 3)
    self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx',
                            'missing group name', 3)
    self.checkTemplateError('(?P<a>x)', r'\g<1a1>', 'xx',
                            "bad character in group name '1a1'", 3)
    self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
                            'invalid group reference 2', 3)
    self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
                            'invalid group reference 2', 1)
    # A well-formed but unknown name raises IndexError, not re.error.
    with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
        re.sub('(?P<a>x)', r'\g<ab>', 'xx')
    # A reference to a group that did not participate expands to ''.
    self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
    self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
    self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
                            "bad character in group name '-1'", 3)
    # New valid/invalid identifiers in Python 3
    self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
    self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
    self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
                            "bad character in group name '©'", 3)
    # Support > 100 groups.
    pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000284
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000285 def test_re_subn(self):
286 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
287 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
288 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
289 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
Serhiy Storchakab02f8fc2016-09-25 20:36:23 +0300290 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Victor Stinner55e614a2014-10-29 16:58:59 +0100291 self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000292
def test_re_split(self):
    # str subjects, including a str subclass: results are plain str.
    for string in ":a:b::c", S(":a:b::c"):
        self.assertTypedEqual(re.split(":", string),
                              ['', 'a', 'b', '', 'c'])
        self.assertTypedEqual(re.split(":+", string),
                              ['', 'a', 'b', 'c'])
        self.assertTypedEqual(re.split("(:+)", string),
                              ['', ':', 'a', ':', 'b', '::', 'c'])
    # bytes-like subjects: results are plain bytes.
    for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                   memoryview(b":a:b::c")):
        self.assertTypedEqual(re.split(b":", string),
                              [b'', b'a', b'b', b'', b'c'])
        self.assertTypedEqual(re.split(b":+", string),
                              [b'', b'a', b'b', b'c'])
        self.assertTypedEqual(re.split(b"(:+)", string),
                              [b'', b':', b'a', b':', b'b', b'::', b'c'])
    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                    "\U0001d49c\U0001d49e\U0001d4b5"):
        string = ":%s:%s::%s" % (a, b, c)
        self.assertEqual(re.split(":", string), ['', a, b, '', c])
        self.assertEqual(re.split(":+", string), ['', a, b, c])
        self.assertEqual(re.split("(:+)", string),
                         ['', ':', a, ':', b, '::', c])

    self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
    self.assertEqual(re.split("(:)+", ":a:b::c"),
                     ['', ':', 'a', ':', 'b', ':', 'c'])
    self.assertEqual(re.split("([b:]+)", ":a:b::c"),
                     ['', ':', 'a', ':b::', 'c'])
    self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
                     ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c'])
    self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
                     ['', 'a', '', '', 'c'])

    # Separators that can match the empty string.
    for sep, expected in [
        (':*', ['', '', 'a', '', 'b', '', 'c', '']),
        ('(?::*)', ['', '', 'a', '', 'b', '', 'c', '']),
        ('(:*)', ['', ':', '', '', 'a', ':', '', '', 'b', '::', '', '',
                  'c', '', '']),
        ('(:)*', ['', ':', '', None, 'a', ':', '', None, 'b', ':', '', None,
                  'c', None, '']),
    ]:
        with self.subTest(sep=sep):
            self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)

    # Separators that only ever match the empty string.
    for sep, expected in [
        ('', ['', ':', 'a', ':', 'b', ':', ':', 'c', '']),
        (r'\b', [':', 'a', ':', 'b', '::', 'c', '']),
        (r'(?=:)', ['', ':a', ':b', ':', ':c']),
        (r'(?<=:)', [':', 'a:', 'b:', ':', 'c']),
    ]:
        with self.subTest(sep=sep):
            self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
345
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000346 def test_qualified_re_split(self):
Serhiy Storchakab02f8fc2016-09-25 20:36:23 +0300347 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
Victor Stinner55e614a2014-10-29 16:58:59 +0100348 self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
349 self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
350 self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000351 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200352 self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000353 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200354 self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
Serhiy Storchakafbb490f2018-01-04 11:06:13 +0200355 ['', ':', '', '', 'a:b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000356
def test_re_findall(self):
    self.assertEqual(re.findall(":+", "abc"), [])
    # str subjects, including a str subclass: results are plain str.
    for string in "a:b::c:::d", S("a:b::c:::d"):
        self.assertTypedEqual(re.findall(":+", string),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:+)", string),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:)(:*)", string),
                              [(":", ""), (":", ":"), (":", "::")])
    # bytes-like subjects: results are plain bytes.
    for string in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
                   memoryview(b"a:b::c:::d")):
        self.assertTypedEqual(re.findall(b":+", string),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:+)", string),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:)(:*)", string),
                              [(b":", b""), (b":", b":"), (b":", b"::")])
    for x in ("\xe0", "\u0430", "\U0001d49c"):
        xx = x * 2
        xxx = x * 3
        string = "a%sb%sc%sd" % (x, xx, xxx)
        self.assertEqual(re.findall("%s+" % x, string), [x, xx, xxx])
        self.assertEqual(re.findall("(%s+)" % x, string), [x, xx, xxx])
        self.assertEqual(re.findall("(%s)(%s*)" % (x, x), string),
                         [(x, ""), (x, x), (x, xx)])
Guido van Rossum49946571997-07-18 04:26:25 +0000382
Skip Montanaro5ba00542003-04-25 16:00:14 +0000383 def test_bug_117612(self):
384 self.assertEqual(re.findall(r"(a|(b))", "aba"),
385 [("a", ""),("b", "b"),("a", "")])
386
def test_re_match(self):
    # groups()/group() on str subjects, including a str subclass.
    for string in 'a', S('a'):
        self.assertEqual(re.match('a', string).groups(), ())
        self.assertEqual(re.match('(a)', string).groups(), ('a',))
        self.assertEqual(re.match('(a)', string).group(0), 'a')
        self.assertEqual(re.match('(a)', string).group(1), 'a')
        self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a'))
    # The same checks on bytes-like subjects.
    for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
        self.assertEqual(re.match(b'a', string).groups(), ())
        self.assertEqual(re.match(b'(a)', string).groups(), (b'a',))
        self.assertEqual(re.match(b'(a)', string).group(0), b'a')
        self.assertEqual(re.match(b'(a)', string).group(1), b'a')
        self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
    for a in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.match(a, a).groups(), ())
        self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
        self.assertEqual(re.match('(%s)' % a, a).group(0), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))

    # Groups that did not participate are None, or the supplied default.
    pat = re.compile('((a)|(b))(c)?')
    self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
    self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
    self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
    self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
    self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

    # group() accepts numbers and names, also mixed in one call.
    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                     (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000419
Serhiy Storchaka977b3ac2016-06-18 16:48:07 +0300420 def test_group(self):
421 class Index:
422 def __init__(self, value):
423 self.value = value
424 def __index__(self):
425 return self.value
426 # A single group
427 m = re.match('(a)(b)', 'ab')
428 self.assertEqual(m.group(), 'ab')
429 self.assertEqual(m.group(0), 'ab')
430 self.assertEqual(m.group(1), 'a')
431 self.assertEqual(m.group(Index(1)), 'a')
432 self.assertRaises(IndexError, m.group, -1)
433 self.assertRaises(IndexError, m.group, 3)
434 self.assertRaises(IndexError, m.group, 1<<1000)
435 self.assertRaises(IndexError, m.group, Index(1<<1000))
436 self.assertRaises(IndexError, m.group, 'x')
437 # Multiple groups
438 self.assertEqual(m.group(2, 1), ('b', 'a'))
439 self.assertEqual(m.group(Index(2), Index(1)), ('b', 'a'))
440
Eric V. Smith605bdae2016-09-11 08:55:43 -0400441 def test_match_getitem(self):
442 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
443
444 m = pat.match('a')
445 self.assertEqual(m['a1'], 'a')
446 self.assertEqual(m['b2'], None)
447 self.assertEqual(m['c3'], None)
448 self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=None')
449 self.assertEqual(m[0], 'a')
450 self.assertEqual(m[1], 'a')
451 self.assertEqual(m[2], None)
452 self.assertEqual(m[3], None)
453 with self.assertRaisesRegex(IndexError, 'no such group'):
454 m['X']
455 with self.assertRaisesRegex(IndexError, 'no such group'):
456 m[-1]
457 with self.assertRaisesRegex(IndexError, 'no such group'):
458 m[4]
459 with self.assertRaisesRegex(IndexError, 'no such group'):
460 m[0, 1]
461 with self.assertRaisesRegex(IndexError, 'no such group'):
462 m[(0,)]
463 with self.assertRaisesRegex(IndexError, 'no such group'):
464 m[(0, 1)]
Serhiy Storchaka50754162017-08-03 11:45:23 +0300465 with self.assertRaisesRegex(IndexError, 'no such group'):
Eric V. Smith605bdae2016-09-11 08:55:43 -0400466 'a1={a2}'.format_map(m)
467
468 m = pat.match('ac')
469 self.assertEqual(m['a1'], 'a')
470 self.assertEqual(m['b2'], None)
471 self.assertEqual(m['c3'], 'c')
472 self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=c')
473 self.assertEqual(m[0], 'ac')
474 self.assertEqual(m[1], 'a')
475 self.assertEqual(m[2], None)
476 self.assertEqual(m[3], 'c')
477
478 # Cannot assign.
479 with self.assertRaises(TypeError):
480 m[0] = 1
481
482 # No len().
483 self.assertRaises(TypeError, len, m)
484
def test_re_fullmatch(self):
    # Issue 16203: Proposal: add re.fullmatch() method.
    self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
    # The alternation must pick the branch that consumes the whole subject.
    for string in "ab", S("ab"):
        self.assertEqual(re.fullmatch(r"a|ab", string).span(), (0, 2))
    for string in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
        self.assertEqual(re.fullmatch(br"a|ab", string).span(), (0, 2))
    for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
        r = r"%s|%s" % (a, a + b)
        self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2))
    # Non-greedy quantifiers must still extend to the end of the subject.
    self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3))
    self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3))
    self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2))
    self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3))
    self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
    self.assertIsNone(re.fullmatch(r"a+", "ab"))
    # A trailing newline left unconsumed means there is no full match.
    self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
    self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
    self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
    self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
    self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4))
    self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2))

    # With pos/endpos, the match must cover exactly that window.
    self.assertEqual(
        re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
    self.assertEqual(
        re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
    self.assertEqual(
        re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
514
def test_re_groupref_exists(self):
    # (?(1)...) : the closing parenthesis is required only when the
    # opening one was matched by group 1.
    self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
                     ('(', 'a'))
    self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a').groups(),
                     (None, 'a'))
    self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a)'))
    self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a'))
    # Conditionals with yes|no branches, including an empty yes branch.
    self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
                     ('a', 'b'))
    self.assertEqual(re.match(r'^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
                     (None, 'd'))
    self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
                     (None, 'd'))
    self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'a').groups(),
                     ('a', ''))

    # Tests for bug #1177831: exercise groups other than the first group
    p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
    self.assertEqual(p.match('abc').groups(),
                     ('a', 'b', 'c'))
    self.assertEqual(p.match('ad').groups(),
                     ('a', None, 'd'))
    self.assertIsNone(p.match('abd'))
    self.assertIsNone(p.match('ac'))

    # Support > 100 groups.
    pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    pat = '(?:%s)(?(200)z)' % pat
    self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))

    # Malformed conditionals.
    self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
    self.checkPatternError(r'()(?(1)a|b',
                           'missing ), unterminated subpattern', 2)
    self.checkPatternError(r'()(?(1)a|b|c)',
                           'conditional backref with more than '
                           'two branches', 10)
551
def test_re_groupref_overflow(self):
    # A group number equal to MAXGROUPS is reported as an invalid group
    # reference, both in a replacement template and in a conditional.
    from sre_constants import MAXGROUPS
    errmsg = 'invalid group reference %d' % MAXGROUPS
    template = r'\g<%s>' % MAXGROUPS
    conditional = r'(?P<a>)(?(%d))' % MAXGROUPS
    self.checkTemplateError('()', template, 'xx', errmsg, 3)
    self.checkPatternError(conditional, errmsg, 10)
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200558
def test_re_groupref(self):
    # Numeric backreferences (\1) to optional and alternated groups.
    delimited = r'^(\|)?([^()]+)\1$'
    self.assertEqual(re.match(delimited, '|a|').groups(), ('|', 'a'))
    self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
                     (None, 'a'))
    for text in ('a|', '|a'):
        self.assertIsNone(re.match(delimited, text))

    self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
                     ('a', 'a'))
    self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
                     (None, None))

    # A group cannot reference itself while it is still open.
    self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
572
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000573 def test_groupdict(self):
574 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
575 'first second').groupdict(),
576 {'first':'first', 'second':'second'})
577
578 def test_expand(self):
579 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
580 "first second")
581 .expand(r"\2 \1 \g<second> \g<first>"),
582 "second first second first")
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300583 self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
584 "first")
585 .expand(r"\2 \g<second>"),
586 " ")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000587
588 def test_repeat_minmax(self):
R David Murray44b548d2016-09-08 13:59:53 -0400589 self.assertIsNone(re.match(r"^(\w){1}$", "abc"))
590 self.assertIsNone(re.match(r"^(\w){1}?$", "abc"))
591 self.assertIsNone(re.match(r"^(\w){1,2}$", "abc"))
592 self.assertIsNone(re.match(r"^(\w){1,2}?$", "abc"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000593
R David Murray44b548d2016-09-08 13:59:53 -0400594 self.assertEqual(re.match(r"^(\w){3}$", "abc").group(1), "c")
595 self.assertEqual(re.match(r"^(\w){1,3}$", "abc").group(1), "c")
596 self.assertEqual(re.match(r"^(\w){1,4}$", "abc").group(1), "c")
597 self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c")
598 self.assertEqual(re.match(r"^(\w){3}?$", "abc").group(1), "c")
599 self.assertEqual(re.match(r"^(\w){1,3}?$", "abc").group(1), "c")
600 self.assertEqual(re.match(r"^(\w){1,4}?$", "abc").group(1), "c")
601 self.assertEqual(re.match(r"^(\w){3,4}?$", "abc").group(1), "c")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000602
R David Murray44b548d2016-09-08 13:59:53 -0400603 self.assertIsNone(re.match(r"^x{1}$", "xxx"))
604 self.assertIsNone(re.match(r"^x{1}?$", "xxx"))
605 self.assertIsNone(re.match(r"^x{1,2}$", "xxx"))
606 self.assertIsNone(re.match(r"^x{1,2}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000607
R David Murray44b548d2016-09-08 13:59:53 -0400608 self.assertTrue(re.match(r"^x{3}$", "xxx"))
609 self.assertTrue(re.match(r"^x{1,3}$", "xxx"))
610 self.assertTrue(re.match(r"^x{3,3}$", "xxx"))
611 self.assertTrue(re.match(r"^x{1,4}$", "xxx"))
612 self.assertTrue(re.match(r"^x{3,4}?$", "xxx"))
613 self.assertTrue(re.match(r"^x{3}?$", "xxx"))
614 self.assertTrue(re.match(r"^x{1,3}?$", "xxx"))
615 self.assertTrue(re.match(r"^x{1,4}?$", "xxx"))
616 self.assertTrue(re.match(r"^x{3,4}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000617
R David Murray44b548d2016-09-08 13:59:53 -0400618 self.assertIsNone(re.match(r"^x{}$", "xxx"))
619 self.assertTrue(re.match(r"^x{}$", "x{}"))
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000620
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200621 self.checkPatternError(r'x{2,1}',
622 'min repeat greater than max repeat', 2)
623
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000624 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000625 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000626 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000627 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
628 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
629 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
630 {'first': 1, 'other': 2})
631
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000632 self.assertEqual(re.match("(a)", "a").pos, 0)
633 self.assertEqual(re.match("(a)", "a").endpos, 1)
634 self.assertEqual(re.match("(a)", "a").string, "a")
635 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300636 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000637
Serhiy Storchaka07360df2015-03-30 01:01:48 +0300638 # Issue 14260. groupindex should be non-modifiable mapping.
639 p = re.compile(r'(?i)(?P<first>a)(?P<other>b)')
640 self.assertEqual(sorted(p.groupindex), ['first', 'other'])
641 self.assertEqual(p.groupindex['other'], 2)
642 with self.assertRaises(TypeError):
643 p.groupindex['other'] = 0
644 self.assertEqual(p.groupindex['other'], 2)
645
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000646 def test_special_escapes(self):
647 self.assertEqual(re.search(r"\b(b.)\b",
648 "abcd abc bcd bx").group(1), "bx")
649 self.assertEqual(re.search(r"\B(b.)\B",
650 "abc bcd bc abxd").group(1), "bx")
651 self.assertEqual(re.search(r"\b(b.)\b",
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300652 "abcd abc bcd bx", re.ASCII).group(1), "bx")
653 self.assertEqual(re.search(r"\B(b.)\B",
654 "abc bcd bc abxd", re.ASCII).group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000655 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
656 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300657 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300658 self.assertEqual(re.search(br"\b(b.)\b",
659 b"abcd abc bcd bx").group(1), b"bx")
660 self.assertEqual(re.search(br"\B(b.)\B",
661 b"abc bcd bc abxd").group(1), b"bx")
662 self.assertEqual(re.search(br"\b(b.)\b",
663 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
664 self.assertEqual(re.search(br"\B(b.)\B",
665 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
666 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
667 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300668 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000669 self.assertEqual(re.search(r"\d\D\w\W\s\S",
670 "1aa! a").group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300671 self.assertEqual(re.search(br"\d\D\w\W\s\S",
672 b"1aa! a").group(0), b"1aa! a")
673 self.assertEqual(re.search(r"\d\D\w\W\s\S",
674 "1aa! a", re.ASCII).group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300675 self.assertEqual(re.search(br"\d\D\w\W\s\S",
676 b"1aa! a", re.LOCALE).group(0), b"1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000677
def test_other_escapes(self):
    # Backslash-escaped punctuation, and rejection of unknown letter escapes.
    self.checkPatternError("\\", 'bad escape (end of pattern)', 0)

    # (pattern, text it matches in full, text it must not match or None)
    punctuation = (
        (r"\(", '(', ')'),
        (r"\\", '\\', None),
        (r"[\]]", ']', '['),
        (r"[a\-c]", '-', 'b'),
        (r"[\^a]+", 'a^', 'b'),
    )
    for pattern, hit, miss in punctuation:
        self.assertEqual(re.match(pattern, hit).group(), hit)
        if miss is not None:
            self.assertIsNone(re.match(pattern, miss))

    re.purge()  # for warnings
    # Letters that must raise re.error when escaped outside a set...
    for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY':
        with self.subTest(c):
            with self.assertRaises(re.error):
                re.compile('\\%c' % c)
    # ...and inside a set.
    for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
        with self.subTest(c):
            with self.assertRaises(re.error):
                re.compile('[\\%c]' % c)
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200696
def test_named_unicode_escapes(self):
    # \N{name} escapes in patterns.

    # test individual Unicode named escapes
    less_than = r'\N{LESS-THAN SIGN}'
    self.assertTrue(re.match(less_than, '<'))
    self.assertTrue(re.match(r'\N{less-than sign}', '<'))
    self.assertIsNone(re.match(less_than, '>'))
    self.assertTrue(re.match(r'\N{SNAKE}', '\U0001f40d'))
    long_name = (r'\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH '
                 r'HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}')
    self.assertTrue(re.match(long_name, '\ufbf9'))
    # Named escapes as the endpoints of a set range.
    between = r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]'
    self.assertTrue(re.match(between, '='))
    self.assertIsNone(re.match(between, ';'))

    # test errors in \N{name} handling - only valid names should pass
    invalid = (
        (r'\N', 'missing {', 2),
        (r'[\N]', 'missing {', 3),
        (r'\N{', 'missing character name', 3),
        (r'[\N{', 'missing character name', 4),
        (r'\N{}', 'missing character name', 3),
        (r'[\N{}]', 'missing character name', 4),
        (r'\NSNAKE}', 'missing {', 2),
        (r'[\NSNAKE}]', 'missing {', 3),
        (r'\N{SNAKE', 'missing }, unterminated name', 3),
        (r'[\N{SNAKE]', 'missing }, unterminated name', 4),
        (r'[\N{SNAKE]}', "undefined character name 'SNAKE]'", 1),
        (r'\N{SPAM}', "undefined character name 'SPAM'", 0),
        (r'[\N{SPAM}]', "undefined character name 'SPAM'", 1),
        # bytes patterns
        (br'\N{LESS-THAN SIGN}', r'bad escape \N', 0),
        (br'[\N{LESS-THAN SIGN}]', r'bad escape \N', 1),
    )
    for pattern, errmsg, pos in invalid:
        self.checkPatternError(pattern, errmsg, pos)
732
Ezio Melotti5a045b92012-02-29 11:48:44 +0200733 def test_string_boundaries(self):
734 # See http://bugs.python.org/issue10713
735 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
736 "abc")
737 # There's a word boundary at the start of a string.
738 self.assertTrue(re.match(r"\b", "abc"))
739 # A non-empty string includes a non-boundary zero-length match.
740 self.assertTrue(re.search(r"\B", "abc"))
741 # There is no non-boundary match at the start of a string.
742 self.assertFalse(re.match(r"\B", "abc"))
743 # However, an empty string contains no word boundaries, and also no
744 # non-boundaries.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300745 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200746 # This one is questionable and different from the perlre behaviour,
747 # but describes current behavior.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300748 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200749 # A single word-character string has two boundaries, but no
750 # non-boundary gaps.
751 self.assertEqual(len(re.findall(r"\b", "a")), 2)
752 self.assertEqual(len(re.findall(r"\B", "a")), 0)
753 # If there are no words, there are no boundaries
754 self.assertEqual(len(re.findall(r"\b", " ")), 0)
755 self.assertEqual(len(re.findall(r"\b", " ")), 0)
756 # Can match around the whitespace.
757 self.assertEqual(len(re.findall(r"\B", " ")), 2)
758
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000759 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000760 self.assertEqual(re.match("([\u2222\u2223])",
761 "\u2222").group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300762 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300763 self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000764
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100765 def test_big_codesize(self):
766 # Issue #1160
767 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300768 self.assertTrue(r.match('1000'))
769 self.assertTrue(r.match('9999'))
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100770
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000771 def test_anyall(self):
772 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
773 "a\nb")
774 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
775 "a\n\nb")
776
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200777 def test_lookahead(self):
R David Murray44b548d2016-09-08 13:59:53 -0400778 self.assertEqual(re.match(r"(a(?=\s[^a]))", "a b").group(1), "a")
779 self.assertEqual(re.match(r"(a(?=\s[^a]*))", "a b").group(1), "a")
780 self.assertEqual(re.match(r"(a(?=\s[abc]))", "a b").group(1), "a")
781 self.assertEqual(re.match(r"(a(?=\s[abc]*))", "a bc").group(1), "a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000782 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
783 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
784 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
785
786 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
787 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
788 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
789 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
790
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200791 # Group reference.
792 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
793 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
794 # Conditional group reference.
795 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
796 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
797 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
798 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
799 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
800 # Group used before defined.
801 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
802 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
803 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
804
805 def test_lookbehind(self):
806 self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
807 self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
808 self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
809 self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
810 # Group reference.
811 self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
812 self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
813 self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
814 self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
815 # Conditional group reference.
816 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
817 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
818 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
819 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
820 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
821 # Group used before defined.
822 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
823 self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc'))
824 self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc'))
825 # Group defined in the same lookbehind pattern
826 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)')
827 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)')
828 self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)')
829 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)')
830
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000831 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000832 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300833 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000834 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
835 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
836 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
837 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
838 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
839 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
840 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
841 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
842
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200843 assert '\u212a'.lower() == 'k' # 'K'
844 self.assertTrue(re.match(r'K', '\u212a', re.I))
845 self.assertTrue(re.match(r'k', '\u212a', re.I))
846 self.assertTrue(re.match(r'\u212a', 'K', re.I))
847 self.assertTrue(re.match(r'\u212a', 'k', re.I))
848 assert '\u017f'.upper() == 'S' # 'ſ'
849 self.assertTrue(re.match(r'S', '\u017f', re.I))
850 self.assertTrue(re.match(r's', '\u017f', re.I))
851 self.assertTrue(re.match(r'\u017f', 'S', re.I))
852 self.assertTrue(re.match(r'\u017f', 's', re.I))
853 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
854 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
855 self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
856
857 def test_ignore_case_set(self):
858 self.assertTrue(re.match(r'[19A]', 'A', re.I))
859 self.assertTrue(re.match(r'[19a]', 'a', re.I))
860 self.assertTrue(re.match(r'[19a]', 'A', re.I))
861 self.assertTrue(re.match(r'[19A]', 'a', re.I))
862 self.assertTrue(re.match(br'[19A]', b'A', re.I))
863 self.assertTrue(re.match(br'[19a]', b'a', re.I))
864 self.assertTrue(re.match(br'[19a]', b'A', re.I))
865 self.assertTrue(re.match(br'[19A]', b'a', re.I))
866 assert '\u212a'.lower() == 'k' # 'K'
867 self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
868 self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
869 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
870 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
871 assert '\u017f'.upper() == 'S' # 'ſ'
872 self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
873 self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
874 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
875 self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
876 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
877 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
878 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
879
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200880 def test_ignore_case_range(self):
881 # Issues #3511, #17381.
882 self.assertTrue(re.match(r'[9-a]', '_', re.I))
883 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
884 self.assertTrue(re.match(br'[9-a]', b'_', re.I))
885 self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
886 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
887 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
888 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
889 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
890 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
891 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
892 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
893 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
894 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
895 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
896 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
897 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
898
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200899 assert '\u212a'.lower() == 'k' # 'K'
900 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
901 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
902 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
903 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
904 assert '\u017f'.upper() == 'S' # 'ſ'
905 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
906 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
907 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
908 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
909 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
910 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
911 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
912
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000913 def test_category(self):
914 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
915
@cpython_only
def test_case_helpers(self):
    # Check the private _sre case helpers (ascii/unicode tolower and
    # iscased) against str.lower() / str.upper().
    import _sre

    # ASCII range: both helper families must agree with str.lower(), and
    # only the ASCII letters count as cased.
    for i in range(128):
        c = chr(i)
        lo = ord(c.lower())
        self.assertEqual(_sre.ascii_tolower(i), lo)
        self.assertEqual(_sre.unicode_tolower(i), lo)
        iscased = c in string.ascii_letters
        self.assertEqual(_sre.ascii_iscased(i), iscased)
        self.assertEqual(_sre.unicode_iscased(i), iscased)

    # Non-ASCII: the ascii_* helpers leave the code point alone and
    # report it as uncased; the unicode_* helpers follow str.
    for i in list(range(128, 0x1000)) + [0x10400, 0x10428]:
        c = chr(i)
        self.assertEqual(_sre.ascii_tolower(i), i)
        # U+0130 is excluded from the generic comparison and checked
        # explicitly after the loop.
        if i != 0x0130:
            self.assertEqual(_sre.unicode_tolower(i), ord(c.lower()))
        iscased = c != c.lower() or c != c.upper()
        self.assertFalse(_sre.ascii_iscased(i))
        # Use the value computed above; it was previously assigned but
        # the expression was then repeated inline.
        self.assertEqual(_sre.unicode_iscased(i), iscased)

    self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130)
    self.assertEqual(_sre.unicode_tolower(0x0130), ord('i'))
    self.assertFalse(_sre.ascii_iscased(0x0130))
    self.assertTrue(_sre.unicode_iscased(0x0130))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000942
943 def test_not_literal(self):
R David Murray44b548d2016-09-08 13:59:53 -0400944 self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
945 self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000946
Serhiy Storchaka05cb7282017-11-16 12:38:26 +0200947 def test_possible_set_operations(self):
948 s = bytes(range(128)).decode()
949 with self.assertWarns(FutureWarning):
950 p = re.compile(r'[0-9--1]')
951 self.assertEqual(p.findall(s), list('-./0123456789'))
952 self.assertEqual(re.findall(r'[--1]', s), list('-./01'))
953 with self.assertWarns(FutureWarning):
954 p = re.compile(r'[%--1]')
955 self.assertEqual(p.findall(s), list("%&'()*+,-1"))
956 with self.assertWarns(FutureWarning):
957 p = re.compile(r'[%--]')
958 self.assertEqual(p.findall(s), list("%&'()*+,-"))
959
960 with self.assertWarns(FutureWarning):
961 p = re.compile(r'[0-9&&1]')
962 self.assertEqual(p.findall(s), list('&0123456789'))
963 with self.assertWarns(FutureWarning):
964 p = re.compile(r'[\d&&1]')
965 self.assertEqual(p.findall(s), list('&0123456789'))
966 self.assertEqual(re.findall(r'[&&1]', s), list('&1'))
967
968 with self.assertWarns(FutureWarning):
969 p = re.compile(r'[0-9||a]')
970 self.assertEqual(p.findall(s), list('0123456789a|'))
971 with self.assertWarns(FutureWarning):
972 p = re.compile(r'[\d||a]')
973 self.assertEqual(p.findall(s), list('0123456789a|'))
974 self.assertEqual(re.findall(r'[||1]', s), list('1|'))
975
976 with self.assertWarns(FutureWarning):
977 p = re.compile(r'[0-9~~1]')
978 self.assertEqual(p.findall(s), list('0123456789~'))
979 with self.assertWarns(FutureWarning):
980 p = re.compile(r'[\d~~1]')
981 self.assertEqual(p.findall(s), list('0123456789~'))
982 self.assertEqual(re.findall(r'[~~1]', s), list('1~'))
983
984 with self.assertWarns(FutureWarning):
985 p = re.compile(r'[[0-9]|]')
986 self.assertEqual(p.findall(s), list('0123456789[]'))
987
988 with self.assertWarns(FutureWarning):
989 p = re.compile(r'[[:digit:]|]')
990 self.assertEqual(p.findall(s), list(':[]dgit'))
991
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000992 def test_search_coverage(self):
R David Murray44b548d2016-09-08 13:59:53 -0400993 self.assertEqual(re.search(r"\s(b)", " b").group(1), "b")
994 self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000995
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.fullmatch):
    # Assert that matcher(pattern, text) succeeds and that the match has
    # the expected text and span.
    #
    # With neither `match` nor `span` given, the pattern is expected to
    # match the whole of `text`.  Giving only one of them is an error
    # (ValueError); giving both checks m.group() and m.span() against them.
    supplied = (match is not None) + (span is not None)
    if supplied == 1:
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if supplied == 0:
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    found = matcher(pattern, text)
    self.assertTrue(found)
    self.assertEqual(found.group(), match)
    self.assertEqual(found.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +00001009
# Characters that the re.escape() tests expect to come back unchanged.
LITERAL_CHARS = ''.join([string.ascii_letters, string.digits,
                         '!"%\',/:;<=>@_`'])
Serhiy Storchaka59083002017-04-13 21:06:43 +03001011
def test_re_escape(self):
    # re.escape() on every code point below 256: the escaped form must
    # match the original character on its own, inside a set, and in
    # verbose mode.
    latin1 = ''.join(map(chr, range(256)))
    for ch in latin1:
        self.assertMatch(re.escape(ch), ch)
        self.assertMatch('[' + re.escape(ch) + ']', ch)
        self.assertMatch('(?x)' + re.escape(ch), ch)
    # ...and the whole run escaped at once must match itself.
    self.assertMatch(re.escape(latin1), latin1)

    # These characters must come back with a leading backslash.
    for ch in '-.]{}':
        self.assertEqual(re.escape(ch)[:1], '\\')

    # These must come back untouched.
    untouched = self.LITERAL_CHARS
    self.assertEqual(re.escape(untouched), untouched)
Guido van Rossum49946571997-07-18 04:26:25 +00001023
def test_re_escape_bytes(self):
    # bytes counterpart of the str escape test: every byte value, escaped,
    # must match itself on its own, inside a set, and in verbose mode.
    all_bytes = bytes(range(256))
    for value in all_bytes:
        single = bytes([value])
        self.assertMatch(re.escape(single), single)
        self.assertMatch(b'[' + re.escape(single) + b']', single)
        self.assertMatch(b'(?x)' + re.escape(single), single)
    self.assertMatch(re.escape(all_bytes), all_bytes)

    # These bytes must come back with a leading backslash.
    for value in b'-.]{}':
        single = bytes([value])
        self.assertEqual(re.escape(single)[:1], b'\\')

    # These must come back untouched.
    untouched = self.LITERAL_CHARS.encode('ascii')
    self.assertEqual(re.escape(untouched), untouched)
Guido van Rossum698280d2008-09-10 17:44:35 +00001037
def test_re_escape_non_ascii(self):
    # Non-ASCII characters are left as they are by re.escape().
    text = 'xxx\u2620\u2620\u2620xxx'
    escaped = re.escape(text)
    self.assertEqual(escaped, text)
    self.assertMatch(escaped, text)
    # An escaped non-ASCII character can still take a quantifier.
    repeated = '.%s+.' % re.escape('\u2620')
    self.assertMatch(repeated, text,
                     'x\u2620\u2620\u2620x', (2, 7), re.search)
1045
def test_re_escape_non_ascii_bytes(self):
    # UTF-8 encoded non-ASCII text is left as it is by re.escape().
    data = 'y\u2620y\u2620y'.encode('utf-8')
    escaped = re.escape(data)
    self.assertEqual(escaped, data)
    self.assertMatch(escaped, data)
    # The encoded character occurs twice in the subject.
    needle = re.escape('\u2620'.encode('utf-8'))
    occurrences = re.findall(needle, data)
    self.assertEqual(len(occurrences), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +00001053
Serhiy Storchakab85a9762014-09-15 11:33:19 +03001054 def test_pickling(self):
1055 import pickle
1056 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
1057 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
1058 pickled = pickle.dumps(oldpat, proto)
1059 newpat = pickle.loads(pickled)
1060 self.assertEqual(newpat, oldpat)
1061 # current pickle expects the _compile() reconstructor in re module
1062 from re import _compile
Guido van Rossum23b22571997-07-17 22:36:14 +00001063
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001064 def test_copying(self):
1065 import copy
1066 p = re.compile(r'(?P<int>\d+)(?:\.(?P<frac>\d*))?')
1067 self.assertIs(copy.copy(p), p)
1068 self.assertIs(copy.deepcopy(p), p)
1069 m = p.match('12.34')
1070 self.assertIs(copy.copy(m), m)
1071 self.assertIs(copy.deepcopy(m), m)
1072
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001073 def test_constants(self):
1074 self.assertEqual(re.I, re.IGNORECASE)
1075 self.assertEqual(re.L, re.LOCALE)
1076 self.assertEqual(re.M, re.MULTILINE)
1077 self.assertEqual(re.S, re.DOTALL)
1078 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001079
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001080 def test_flags(self):
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001081 for flag in [re.I, re.M, re.X, re.S, re.A, re.U]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001082 self.assertTrue(re.compile('^pattern$', flag))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001083 for flag in [re.I, re.M, re.X, re.S, re.A, re.L]:
1084 self.assertTrue(re.compile(b'^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +00001085
def test_sre_character_literals(self):
    # Octal, \x, \u and \U escapes outside a character set, each followed
    # by nothing, by a digit, and by a character that cannot extend the
    # escape.
    for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        ch = chr(i)
        if i < 256:
            for tail in ('', '0', '8'):
                self.assertTrue(re.match(r"\%03o" % i + tail, ch + tail))
            for tail in ('', '0', 'z'):
                self.assertTrue(re.match(r"\x%02x" % i + tail, ch + tail))
        if i < 0x10000:
            for tail in ('', '0', 'z'):
                self.assertTrue(re.match(r"\u%04x" % i + tail, ch + tail))
        for tail in ('', '0', 'z'):
            self.assertTrue(re.match(r"\U%08x" % i + tail, ch + tail))

    # Short octal escapes starting with a zero.
    for pattern, text in ((r"\0", "\000"), (r"\08", "\0008"),
                          (r"\01", "\001"), (r"\018", "\0018")):
        self.assertTrue(re.match(pattern, text))

    # Out-of-range, incomplete or otherwise invalid escapes.
    invalid = (
        (r"\567", r'octal escape value \567 outside of range 0-0o377', 0),
        (r"\911", 'invalid group reference 91', 1),
        (r"\x1", r'incomplete escape \x1', 0),
        (r"\x1z", r'incomplete escape \x1', 0),
        (r"\u123", r'incomplete escape \u123', 0),
        (r"\u123z", r'incomplete escape \u123', 0),
        (r"\U0001234", r'incomplete escape \U0001234', 0),
        (r"\U0001234z", r'incomplete escape \U0001234', 0),
        (r"\U00110000", r'bad escape \U00110000', 0),
    )
    for pattern, errmsg, pos in invalid:
        self.checkPatternError(pattern, errmsg, pos)
Skip Montanaro7d9963f2003-04-25 14:12:40 +00001117
def test_sre_character_class_literals(self):
    # Same escapes as outside a class, but written inside [...]; the
    # extra literal after the escape is just another class member.
    for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        if i < 256:
            self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
            self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
            self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
            self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
            self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
            self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
            self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
            self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
        if i < 0x10000:
            self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
            self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
            self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
        self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
        self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
        self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
    # Malformed or out-of-range escapes inside a class; the reported
    # position is 1 because of the leading '['.
    self.checkPatternError(r"[\567]",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 1)
    self.checkPatternError(r"[\911]", r'bad escape \9', 1)
    self.checkPatternError(r"[\x1z]", r'incomplete escape \x1', 1)
    self.checkPatternError(r"[\u123z]", r'incomplete escape \u123', 1)
    self.checkPatternError(r"[\U0001234z]", r'incomplete escape \U0001234', 1)
    self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1)
    # A range whose endpoints are both \U escapes.
    self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +02001145
def test_sre_byte_literals(self):
    # Octal and \x escapes in a bytes pattern must match the byte they
    # spell, also when directly followed by another byte.
    for i in [0, 8, 16, 32, 64, 127, 128, 255]:
        self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
        self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
        self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
        self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
    # \u and \U escapes are rejected in bytes patterns.
    self.assertRaises(re.error, re.compile, br"\u1234")
    self.assertRaises(re.error, re.compile, br"\U00012345")
    self.assertTrue(re.match(br"\0", b"\000"))
    self.assertTrue(re.match(br"\08", b"\0008"))
    self.assertTrue(re.match(br"\01", b"\001"))
    self.assertTrue(re.match(br"\018", b"\0018"))
    self.checkPatternError(br"\567",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 0)
    self.checkPatternError(br"\911", 'invalid group reference 91', 1)
    self.checkPatternError(br"\x1", r'incomplete escape \x1', 0)
    self.checkPatternError(br"\x1z", r'incomplete escape \x1', 0)
Antoine Pitrou463badf2012-06-23 13:29:19 +02001166
def test_sre_byte_class_literals(self):
    # Octal and \x escapes written inside [...] in a bytes pattern.
    for i in [0, 8, 16, 32, 64, 127, 128, 255]:
        self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
    # \u and \U escapes are rejected in bytes patterns.
    self.assertRaises(re.error, re.compile, br"[\u1234]")
    self.assertRaises(re.error, re.compile, br"[\U00012345]")
    self.checkPatternError(br"[\567]",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 1)
    self.checkPatternError(br"[\911]", r'bad escape \9', 1)
    self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1)
1184
def test_character_set_errors(self):
    # Unterminated sets and invalid ranges must raise re.error with the
    # expected message and position.
    self.checkPatternError(r'[', 'unterminated character set', 0)
    self.checkPatternError(r'[^', 'unterminated character set', 0)
    self.checkPatternError(r'[a', 'unterminated character set', 0)
    # bug 545855 -- This pattern failed to cause a compile error as it
    # should, instead provoking a TypeError.
    self.checkPatternError(r"[a-", 'unterminated character set', 0)
    self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1)
    self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1)
    self.checkPatternError(r"[b-a]", 'bad character range b-a', 1)
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +00001195
Skip Montanaro7d9963f2003-04-25 14:12:40 +00001196 def test_bug_113254(self):
1197 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
1198 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
1199 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
1200
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001201 def test_bug_527371(self):
1202 # bug described in patches 527371/672491
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001203 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001204 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
1205 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
R David Murray44b548d2016-09-08 13:59:53 -04001206 self.assertEqual(re.match(r"(?P<a>a(b))", "ab").lastgroup, 'a')
1207 self.assertEqual(re.match(r"((a))", "a").lastindex, 1)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001208
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001209 def test_bug_418626(self):
1210 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
1211 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
1212 # pattern '*?' on a long string.
1213 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
1214 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
1215 20003)
1216 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001217 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +00001218 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001219 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001220
1221 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001222 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001223 self.assertEqual(re.compile(pat) and 1, 1)
1224
Skip Montanaro1e703c62003-04-25 15:40:28 +00001225 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001226 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +00001227 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001228 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
1229 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
1230 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +00001231
def test_nothing_to_repeat(self):
    # A repeat operator with nothing before it is an error, both at the
    # start of the pattern and at the start of a non-capturing group
    # (where the error position is 3, just past '(?:').
    for reps in '*', '+', '?', '{1,2}':
        for mod in '', '?':
            self.checkPatternError('%s%s' % (reps, mod),
                                   'nothing to repeat', 0)
            self.checkPatternError('(?:%s%s)' % (reps, mod),
                                   'nothing to repeat', 3)
1239
def test_multiple_repeat(self):
    # Applying a second repeat operator directly to an already repeated
    # item must be reported at the position of the second operator,
    # i.e. after 'x' plus the inner operator.
    for outer_reps in '*', '+', '{1,2}':
        for outer_mod in '', '?':
            outer_op = outer_reps + outer_mod
            for inner_reps in '*', '+', '?', '{1,2}':
                for inner_mod in '', '?':
                    inner_op = inner_reps + inner_mod
                    self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
                                           'multiple repeat', 1 + len(inner_op))
1249
Serhiy Storchakafa468162013-02-16 21:23:53 +02001250 def test_unlimited_zero_width_repeat(self):
1251 # Issue #9669
1252 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
1253 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
1254 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
1255 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
1256 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
1257 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
1258
Skip Montanaro1e703c62003-04-25 15:40:28 +00001259 def test_scanner(self):
1260 def s_ident(scanner, token): return token
1261 def s_operator(scanner, token): return "op%s" % token
1262 def s_float(scanner, token): return float(token)
1263 def s_int(scanner, token): return int(token)
1264
1265 scanner = Scanner([
1266 (r"[a-zA-Z_]\w*", s_ident),
1267 (r"\d+\.\d*", s_float),
1268 (r"\d+", s_int),
1269 (r"=|\+|-|\*|/", s_operator),
1270 (r"\s+", None),
1271 ])
1272
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001273 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +00001274
Skip Montanaro1e703c62003-04-25 15:40:28 +00001275 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
1276 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
1277 'op+', 'bar'], ''))
1278
Skip Montanaro5ba00542003-04-25 16:00:14 +00001279 def test_bug_448951(self):
1280 # bug 448951 (similar to 429357, but with single char match)
1281 # (Also test greedy matches.)
1282 for op in '','?','*':
1283 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
1284 (None, None))
1285 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
1286 ('a:', 'a'))
1287
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001288 def test_bug_725106(self):
1289 # capturing groups in alternatives in repeats
1290 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
1291 ('b', 'a'))
1292 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
1293 ('c', 'b'))
1294 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
1295 ('b', None))
1296 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
1297 ('b', None))
1298 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
1299 ('b', 'a'))
1300 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
1301 ('c', 'b'))
1302 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
1303 ('b', None))
1304 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
1305 ('b', None))
1306
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +00001307 def test_bug_725149(self):
1308 # mark_stack_base restoring before restoring marks
1309 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
1310 ('a', None))
1311 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
1312 ('a', None, None))
1313
Just van Rossum12723ba2003-07-02 20:03:04 +00001314 def test_bug_764548(self):
1315 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001316 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +00001317 pat = re.compile(my_unicode("abc"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001318 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +00001319
Skip Montanaro5ba00542003-04-25 16:00:14 +00001320 def test_finditer(self):
1321 iter = re.finditer(r":+", "a:b::c:::d")
1322 self.assertEqual([item.group(0) for item in iter],
1323 [":", "::", ":::"])
1324
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001325 pat = re.compile(r":+")
1326 iter = pat.finditer("a:b::c:::d", 1, 10)
1327 self.assertEqual([item.group(0) for item in iter],
1328 [":", "::", ":::"])
1329
1330 pat = re.compile(r":+")
1331 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
1332 self.assertEqual([item.group(0) for item in iter],
1333 [":", "::", ":::"])
1334
1335 pat = re.compile(r":+")
1336 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1337 self.assertEqual([item.group(0) for item in iter],
1338 [":", "::", ":::"])
1339
1340 pat = re.compile(r":+")
1341 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1342 self.assertEqual([item.group(0) for item in iter],
1343 ["::", "::"])
1344
Thomas Wouters40a088d2008-03-18 20:19:54 +00001345 def test_bug_926075(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001346 self.assertIsNot(re.compile('bug_926075'),
1347 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +00001348
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001349 def test_bug_931848(self):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001350 pattern = "[\u002E\u3002\uFF0E\uFF61]"
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001351 self.assertEqual(re.compile(pattern).split("a.b.c"),
1352 ['a','b','c'])
1353
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001354 def test_bug_581080(self):
1355 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +00001356 self.assertEqual(next(iter).span(), (1,2))
1357 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001358
1359 scanner = re.compile(r"\s").scanner("a b")
1360 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001361 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001362
1363 def test_bug_817234(self):
1364 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +00001365 self.assertEqual(next(iter).span(), (0, 4))
1366 self.assertEqual(next(iter).span(), (4, 4))
1367 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001368
Mark Dickinson1f268282009-07-28 17:22:36 +00001369 def test_bug_6561(self):
1370 # '\d' should match characters in Unicode category 'Nd'
1371 # (Number, Decimal Digit), but not those in 'Nl' (Number,
1372 # Letter) or 'No' (Number, Other).
1373 decimal_digits = [
1374 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1375 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1376 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1377 ]
1378 for x in decimal_digits:
R David Murray44b548d2016-09-08 13:59:53 -04001379 self.assertEqual(re.match(r'^\d$', x).group(0), x)
Mark Dickinson1f268282009-07-28 17:22:36 +00001380
1381 not_decimal_digits = [
1382 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1383 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1384 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1385 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1386 ]
1387 for x in not_decimal_digits:
R David Murray44b548d2016-09-08 13:59:53 -04001388 self.assertIsNone(re.match(r'^\d$', x))
Mark Dickinson1f268282009-07-28 17:22:36 +00001389
Guido van Rossumd8faa362007-04-27 19:54:29 +00001390 def test_empty_array(self):
1391 # SF buf 1647541
1392 import array
Guido van Rossum166746c2007-07-03 15:39:16 +00001393 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +00001394 a = array.array(typecode)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001395 self.assertIsNone(re.compile(b"bla").match(a))
Antoine Pitroufd036452008-08-19 17:56:33 +00001396 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001397
Christian Heimes072c0f12008-01-03 23:01:04 +00001398 def test_inline_flags(self):
1399 # Bug #1700
Serhiy Storchakaab140882014-11-11 21:13:28 +02001400 upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
1401 lower_char = '\u1ea1' # Latin Small Letter A with Dot Below
Christian Heimes072c0f12008-01-03 23:01:04 +00001402
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001403 p = re.compile('.' + upper_char, re.I | re.S)
1404 q = p.match('\n' + lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001405 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001406
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001407 p = re.compile('.' + lower_char, re.I | re.S)
1408 q = p.match('\n' + upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001409 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001410
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001411 p = re.compile('(?i).' + upper_char, re.S)
1412 q = p.match('\n' + lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001413 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001414
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001415 p = re.compile('(?i).' + lower_char, re.S)
1416 q = p.match('\n' + upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001417 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001418
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001419 p = re.compile('(?is).' + upper_char)
1420 q = p.match('\n' + lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001421 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001422
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001423 p = re.compile('(?is).' + lower_char)
1424 q = p.match('\n' + upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001425 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001426
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001427 p = re.compile('(?s)(?i).' + upper_char)
1428 q = p.match('\n' + lower_char)
1429 self.assertTrue(q)
1430
1431 p = re.compile('(?s)(?i).' + lower_char)
1432 q = p.match('\n' + upper_char)
1433 self.assertTrue(q)
1434
1435 self.assertTrue(re.match('(?ix) ' + upper_char, lower_char))
1436 self.assertTrue(re.match('(?ix) ' + lower_char, upper_char))
1437 self.assertTrue(re.match(' (?i) ' + upper_char, lower_char, re.X))
1438 self.assertTrue(re.match('(?x) (?i) ' + upper_char, lower_char))
1439 self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char, re.X))
Serhiy Storchakad65cd092016-09-11 01:39:01 +03001440
Serhiy Storchakaabf275a2016-09-17 01:29:58 +03001441 p = upper_char + '(?i)'
1442 with self.assertWarns(DeprecationWarning) as warns:
1443 self.assertTrue(re.match(p, lower_char))
1444 self.assertEqual(
1445 str(warns.warnings[0].message),
Roy Williams171b9a32017-06-09 22:01:16 -07001446 'Flags not at the start of the expression %r' % p
Serhiy Storchakaabf275a2016-09-17 01:29:58 +03001447 )
Serhiy Storchakac7ac7282017-05-16 15:16:15 +03001448 self.assertEqual(warns.warnings[0].filename, __file__)
Serhiy Storchakaabf275a2016-09-17 01:29:58 +03001449
1450 p = upper_char + '(?i)%s' % ('.?' * 100)
1451 with self.assertWarns(DeprecationWarning) as warns:
1452 self.assertTrue(re.match(p, lower_char))
1453 self.assertEqual(
1454 str(warns.warnings[0].message),
Roy Williams171b9a32017-06-09 22:01:16 -07001455 'Flags not at the start of the expression %r (truncated)' % p[:20]
Serhiy Storchakaabf275a2016-09-17 01:29:58 +03001456 )
Serhiy Storchakac7ac7282017-05-16 15:16:15 +03001457 self.assertEqual(warns.warnings[0].filename, __file__)
Serhiy Storchakabd48d272016-09-11 12:50:02 +03001458
Roy Williams171b9a32017-06-09 22:01:16 -07001459 # bpo-30605: Compiling a bytes instance regex was throwing a BytesWarning
1460 with warnings.catch_warnings():
1461 warnings.simplefilter('error', BytesWarning)
1462 p = b'A(?i)'
1463 with self.assertWarns(DeprecationWarning) as warns:
1464 self.assertTrue(re.match(p, b'a'))
1465 self.assertEqual(
1466 str(warns.warnings[0].message),
1467 'Flags not at the start of the expression %r' % p
1468 )
1469 self.assertEqual(warns.warnings[0].filename, __file__)
1470
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001471 with self.assertWarns(DeprecationWarning):
1472 self.assertTrue(re.match('(?s).(?i)' + upper_char, '\n' + lower_char))
1473 with self.assertWarns(DeprecationWarning):
1474 self.assertTrue(re.match('(?i) ' + upper_char + ' (?x)', lower_char))
1475 with self.assertWarns(DeprecationWarning):
1476 self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char))
1477 with self.assertWarns(DeprecationWarning):
1478 self.assertTrue(re.match('^(?i)' + upper_char, lower_char))
1479 with self.assertWarns(DeprecationWarning):
1480 self.assertTrue(re.match('$|(?i)' + upper_char, lower_char))
Serhiy Storchakac7ac7282017-05-16 15:16:15 +03001481 with self.assertWarns(DeprecationWarning) as warns:
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001482 self.assertTrue(re.match('(?:(?i)' + upper_char + ')', lower_char))
Serhiy Storchakac7ac7282017-05-16 15:16:15 +03001483 self.assertRegex(str(warns.warnings[0].message),
1484 'Flags not at the start')
1485 self.assertEqual(warns.warnings[0].filename, __file__)
1486 with self.assertWarns(DeprecationWarning) as warns:
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001487 self.assertTrue(re.fullmatch('(^)?(?(1)(?i)' + upper_char + ')',
1488 lower_char))
Serhiy Storchakac7ac7282017-05-16 15:16:15 +03001489 self.assertRegex(str(warns.warnings[0].message),
1490 'Flags not at the start')
1491 self.assertEqual(warns.warnings[0].filename, __file__)
1492 with self.assertWarns(DeprecationWarning) as warns:
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001493 self.assertTrue(re.fullmatch('($)?(?(1)|(?i)' + upper_char + ')',
1494 lower_char))
Serhiy Storchakac7ac7282017-05-16 15:16:15 +03001495 self.assertRegex(str(warns.warnings[0].message),
1496 'Flags not at the start')
1497 self.assertEqual(warns.warnings[0].filename, __file__)
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001498
1499
Christian Heimes25bb7832008-01-11 16:17:00 +00001500 def test_dollar_matches_twice(self):
1501 "$ matches the end of string, and just before the terminating \n"
1502 pattern = re.compile('$')
1503 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1504 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1505 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1506
1507 pattern = re.compile('$', re.MULTILINE)
1508 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1509 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1510 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1511
Antoine Pitroufd036452008-08-19 17:56:33 +00001512 def test_bytes_str_mixing(self):
1513 # Mixing str and bytes is disallowed
1514 pat = re.compile('.')
1515 bpat = re.compile(b'.')
1516 self.assertRaises(TypeError, pat.match, b'b')
1517 self.assertRaises(TypeError, bpat.match, 'b')
1518 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1519 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1520 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1521 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1522 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1523 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1524
1525 def test_ascii_and_unicode_flag(self):
1526 # String patterns
1527 for flags in (0, re.UNICODE):
1528 pat = re.compile('\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001529 self.assertTrue(pat.match('\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001530 pat = re.compile(r'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001531 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001532 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001533 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001534 pat = re.compile('(?a)\xc0', re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001535 self.assertIsNone(pat.match('\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001536 pat = re.compile(r'\w', re.ASCII)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001537 self.assertIsNone(pat.match('\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001538 pat = re.compile(r'(?a)\w')
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001539 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001540 # Bytes patterns
1541 for flags in (0, re.ASCII):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001542 pat = re.compile(b'\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001543 self.assertIsNone(pat.match(b'\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001544 pat = re.compile(br'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001545 self.assertIsNone(pat.match(b'\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001546 # Incompatibilities
R David Murray44b548d2016-09-08 13:59:53 -04001547 self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE)
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001548 self.assertRaises(re.error, re.compile, br'(?u)\w')
R David Murray44b548d2016-09-08 13:59:53 -04001549 self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII)
1550 self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII)
1551 self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE)
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001552 self.assertRaises(re.error, re.compile, r'(?au)\w')
Antoine Pitroufd036452008-08-19 17:56:33 +00001553
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001554 def test_locale_flag(self):
Victor Stinnerab71f8b2019-03-01 00:08:03 +01001555 enc = locale.getpreferredencoding()
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001556 # Search non-ASCII letter
1557 for i in range(128, 256):
1558 try:
1559 c = bytes([i]).decode(enc)
1560 sletter = c.lower()
1561 if sletter == c: continue
1562 bletter = sletter.encode(enc)
1563 if len(bletter) != 1: continue
1564 if bletter.decode(enc) != sletter: continue
1565 bpat = re.escape(bytes([i]))
1566 break
1567 except (UnicodeError, TypeError):
1568 pass
Benjamin Peterson1e687162017-03-01 21:53:00 -08001569 else:
1570 bletter = None
1571 bpat = b'A'
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001572 # Bytes patterns
1573 pat = re.compile(bpat, re.LOCALE | re.IGNORECASE)
1574 if bletter:
1575 self.assertTrue(pat.match(bletter))
1576 pat = re.compile(b'(?L)' + bpat, re.IGNORECASE)
1577 if bletter:
1578 self.assertTrue(pat.match(bletter))
1579 pat = re.compile(bpat, re.IGNORECASE)
1580 if bletter:
1581 self.assertIsNone(pat.match(bletter))
R David Murray44b548d2016-09-08 13:59:53 -04001582 pat = re.compile(br'\w', re.LOCALE)
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001583 if bletter:
1584 self.assertTrue(pat.match(bletter))
R David Murray44b548d2016-09-08 13:59:53 -04001585 pat = re.compile(br'(?L)\w')
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001586 if bletter:
1587 self.assertTrue(pat.match(bletter))
R David Murray44b548d2016-09-08 13:59:53 -04001588 pat = re.compile(br'\w')
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001589 if bletter:
1590 self.assertIsNone(pat.match(bletter))
1591 # Incompatibilities
Serhiy Storchaka9bd85b82016-06-11 19:15:00 +03001592 self.assertRaises(ValueError, re.compile, '', re.LOCALE)
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001593 self.assertRaises(re.error, re.compile, '(?L)')
Serhiy Storchaka9bd85b82016-06-11 19:15:00 +03001594 self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII)
1595 self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII)
1596 self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE)
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001597 self.assertRaises(re.error, re.compile, b'(?aL)')
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001598
def test_scoped_flags(self):
    # Flags given as (?flags:...) or (?-flags:...) apply only inside
    # that group.
    self.assertTrue(re.match(r'(?i:a)b', 'Ab'))
    self.assertIsNone(re.match(r'(?i:a)b', 'aB'))
    self.assertIsNone(re.match(r'(?-i:a)b', 'Ab', re.IGNORECASE))
    self.assertTrue(re.match(r'(?-i:a)b', 'aB', re.IGNORECASE))
    self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab'))
    self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB'))

    self.assertTrue(re.match(r'(?x: a) b', 'a b'))
    self.assertIsNone(re.match(r'(?x: a) b', ' a b'))
    self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE))
    self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE))

    self.assertTrue(re.match(r'\w(?a:\W)\w', '\xe0\xe0\xe0'))
    self.assertTrue(re.match(r'(?a:\W(?u:\w)\W)', '\xe0\xe0\xe0'))
    self.assertTrue(re.match(r'\W(?u:\w)\W', '\xe0\xe0\xe0', re.ASCII))

    # Invalid flag combinations.
    self.checkPatternError(r'(?a)(?-a:\w)',
            "bad inline flags: cannot turn off flags 'a', 'u' and 'L'", 8)
    self.checkPatternError(r'(?i-i:a)',
            'bad inline flags: flag turned on and off', 5)
    self.checkPatternError(r'(?au:a)',
            "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4)
    self.checkPatternError(br'(?aL:a)',
            "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4)

    # Truncated or malformed flag groups.
    self.checkPatternError(r'(?-', 'missing flag', 3)
    self.checkPatternError(r'(?-+', 'missing flag', 3)
    self.checkPatternError(r'(?-z', 'unknown flag', 3)
    self.checkPatternError(r'(?-i', 'missing :', 4)
    self.checkPatternError(r'(?-i)', 'missing :', 4)
    self.checkPatternError(r'(?-i+', 'missing :', 4)
    self.checkPatternError(r'(?-iz', 'unknown flag', 4)
    self.checkPatternError(r'(?i:', 'missing ), unterminated subpattern', 0)
    self.checkPatternError(r'(?i', 'missing -, : or )', 3)
    self.checkPatternError(r'(?i+', 'missing -, : or )', 3)
    self.checkPatternError(r'(?iz', 'unknown flag', 3)
1636
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001637 def test_bug_6509(self):
1638 # Replacement strings of both types must parse properly.
1639 # all strings
R David Murray44b548d2016-09-08 13:59:53 -04001640 pat = re.compile(r'a(\w)')
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001641 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1642 pat = re.compile('a(.)')
1643 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1644 pat = re.compile('..')
1645 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1646
1647 # all bytes
R David Murray44b548d2016-09-08 13:59:53 -04001648 pat = re.compile(br'a(\w)')
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001649 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1650 pat = re.compile(b'a(.)')
1651 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1652 pat = re.compile(b'..')
1653 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1654
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001655 def test_dealloc(self):
1656 # issue 3299: check for segfault in debug build
1657 import _sre
Ezio Melotti0f77f462010-01-23 10:49:39 +00001658 # the overflow limit is different on wide and narrow builds and it
1659 # depends on the definition of SRE_CODE (see sre.h).
1660 # 2**128 should be big enough to overflow on both. For smaller values
1661 # a RuntimeError is raised instead of OverflowError.
1662 long_overflow = 2**128
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001663 self.assertRaises(TypeError, re.finditer, "a", {})
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001664 with self.assertRaises(OverflowError):
Victor Stinner726a57d2016-11-22 23:04:39 +01001665 _sre.compile("abc", 0, [long_overflow], 0, {}, ())
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001666 with self.assertRaises(TypeError):
1667 _sre.compile({}, 0, [], 0, [], [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001668
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001669 def test_search_dot_unicode(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001670 self.assertTrue(re.search("123.*-", '123abc-'))
1671 self.assertTrue(re.search("123.*-", '123\xe9-'))
1672 self.assertTrue(re.search("123.*-", '123\u20ac-'))
1673 self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1674 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675
Ezio Melottidf723e12012-03-13 01:29:48 +02001676 def test_compile(self):
1677 # Test return value when given string and pattern as parameter
1678 pattern = re.compile('random pattern')
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03001679 self.assertIsInstance(pattern, re.Pattern)
Ezio Melottidf723e12012-03-13 01:29:48 +02001680 same_pattern = re.compile(pattern)
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03001681 self.assertIsInstance(same_pattern, re.Pattern)
Ezio Melottidf723e12012-03-13 01:29:48 +02001682 self.assertIs(same_pattern, pattern)
1683 # Test behaviour when not given a string or pattern as parameter
1684 self.assertRaises(TypeError, re.compile, 0)
1685
Antoine Pitroub33941a2012-12-03 20:55:56 +01001686 @bigmemtest(size=_2G, memuse=1)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001687 def test_large_search(self, size):
1688 # Issue #10182: indices were 32-bit-truncated.
1689 s = 'a' * size
1690 m = re.search('$', s)
1691 self.assertIsNotNone(m)
Antoine Pitrou86067c22012-12-03 21:08:43 +01001692 self.assertEqual(m.start(), size)
1693 self.assertEqual(m.end(), size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001694
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001695 # The huge memuse is because of re.sub() using a list and a join()
1696 # to create the replacement result.
Antoine Pitroub33941a2012-12-03 20:55:56 +01001697 @bigmemtest(size=_2G, memuse=16 + 2)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001698 def test_large_subn(self, size):
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001699 # Issue #10182: indices were 32-bit-truncated.
1700 s = 'a' * size
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001701 r, n = re.subn('', '', s)
1702 self.assertEqual(r, s)
1703 self.assertEqual(n, size + 1)
1704
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001705 def test_bug_16688(self):
1706 # Issue 16688: Backreferences make case-insensitive regex fail on
1707 # non-ASCII strings.
1708 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1709 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001710
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001711 def test_repeat_minmax_overflow(self):
1712 # Issue #13169
1713 string = "x" * 100000
1714 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1715 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1716 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1717 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1718 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1719 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1720 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1721 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1722 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1723 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1724 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1725
1726 @cpython_only
1727 def test_repeat_minmax_overflow_maxrepeat(self):
1728 try:
1729 from _sre import MAXREPEAT
1730 except ImportError:
1731 self.skipTest('requires _sre.MAXREPEAT constant')
1732 string = "x" * 100000
1733 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1734 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1735 (0, 100000))
1736 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1737 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1738 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1739 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1740
def test_backref_group_name_in_exception(self):
    # Issue 17341: Poor error message when compiling invalid regex
    bad_backref = '(?P=<foo>)'
    expected_message = "bad character in group name '<foo>'"
    self.checkPatternError(bad_backref, expected_message, 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001745
def test_group_name_in_exception(self):
    # Issue 17341: Poor error message when compiling invalid regex
    bad_group = '(?P<?foo>)'
    expected_message = "bad character in group name '?foo'"
    self.checkPatternError(bad_group, expected_message, 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001750
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001751 def test_issue17998(self):
1752 for reps in '*', '+', '?', '{1}':
1753 for mod in '', '?':
1754 pattern = '.' + reps + mod + 'yz'
1755 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1756 ['xyz'], msg=pattern)
1757 pattern = pattern.encode()
1758 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1759 [b'xyz'], msg=pattern)
1760
def test_match_repr(self):
    # The repr of a match shows its span and the matched text; the type
    # name may or may not be prefixed with its module.

    # str subjects, including a str subclass
    for subject in ('[abracadabra]', S('[abracadabra]')):
        found = re.search(r'(.+)(.*?)\1', subject)
        match_type = type(found)
        expected = r"<(%s\.)?%s object; span=\(1, 12\), match='abracadabra'>" % (
            match_type.__module__, match_type.__qualname__
        )
        self.assertRegex(repr(found), expected)

    # bytes-like subjects
    bytes_like_subjects = (
        b'[abracadabra]',
        B(b'[abracadabra]'),
        bytearray(b'[abracadabra]'),
        memoryview(b'[abracadabra]'),
    )
    for subject in bytes_like_subjects:
        found = re.search(br'(.+)(.*?)\1', subject)
        match_type = type(found)
        expected = r"<(%s\.)?%s object; span=\(1, 12\), match=b'abracadabra'>" % (
            match_type.__module__, match_type.__qualname__
        )
        self.assertRegex(repr(found), expected)

    # Two successive matches from one iterator; both expected patterns
    # take the type name from the second match.
    first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
    match_type = type(second)
    expected = r"<(%s\.)?%s object; span=\(0, 2\), match='aa'>" % (
        match_type.__module__, match_type.__qualname__
    )
    self.assertRegex(repr(first), expected)
    expected = r"<(%s\.)?%s object; span=\(3, 5\), match='bb'>" % (
        match_type.__module__, match_type.__qualname__
    )
    self.assertRegex(repr(second), expected)
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03001786
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001787 def test_zerowidth(self):
1788 # Issues 852532, 1647489, 3262, 25054.
1789 self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', ''])
Serhiy Storchakafbb490f2018-01-04 11:06:13 +02001790 self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', '', 'bc', ''])
1791 self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', '', 'bc'])
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001792 self.assertEqual(re.split(r"(?<=\w)(?!\w)|:+", "a::bc"), ['a', '', 'bc', ''])
1793
1794 self.assertEqual(re.sub(r"\b", "-", "a::bc"), '-a-::-bc-')
Serhiy Storchakafbb490f2018-01-04 11:06:13 +02001795 self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a---bc-')
1796 self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::][]bc[]')
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001797
1798 self.assertEqual(re.findall(r"\b|:+", "a::bc"), ['', '', '::', '', ''])
1799 self.assertEqual(re.findall(r"\b|\w+", "a::bc"),
1800 ['', 'a', '', '', 'bc', ''])
1801
1802 self.assertEqual([m.span() for m in re.finditer(r"\b|:+", "a::bc")],
1803 [(0, 0), (1, 1), (1, 3), (3, 3), (5, 5)])
1804 self.assertEqual([m.span() for m in re.finditer(r"\b|\w+", "a::bc")],
1805 [(0, 0), (0, 1), (1, 1), (3, 3), (3, 5), (5, 5)])
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001806
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001807 def test_bug_2537(self):
1808 # issue 2537: empty submatches
1809 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1810 for inner_op in ('{0,}', '*', '?'):
1811 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1812 m = r.match("xyyzy")
1813 self.assertEqual(m.group(0), "xyy")
1814 self.assertEqual(m.group(1), "")
1815 self.assertEqual(m.group(2), "y")
1816
Serhiy Storchaka4ab6abf2017-05-14 09:05:13 +03001817 @cpython_only
def test_debug_flag(self):
    # Compiling with re.DEBUG prints the parsed pattern tree followed by
    # a disassembly of the compiled code; compare both against a
    # reference dump.
    pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
    with captured_stdout() as out:
        re.compile(pat, re.DEBUG)
    # Show the complete diff if the comparison below fails.
    self.maxDiff = None
    # NOTE: leading whitespace inside this literal is significant -- it
    # mirrors the nesting printed by the parser and by the disassembler
    # and must not be re-indented or stripped.
    dump = '''\
SUBPATTERN 1 0 0
  LITERAL 46
BRANCH
  IN
    LITERAL 99
    LITERAL 104
OR
  LITERAL 112
  LITERAL 121
GROUPREF_EXISTS 1
  AT AT_END
ELSE
  LITERAL 58
  LITERAL 32

 0. INFO 8 0b1 2 5 (to 9)
      prefix_skip 0
      prefix [0x2e] ('.')
      overlap [0]
 9: MARK 0
11. LITERAL 0x2e ('.')
13. MARK 1
15. BRANCH 10 (to 26)
17.   IN 6 (to 24)
19.     LITERAL 0x63 ('c')
21.     LITERAL 0x68 ('h')
23.     FAILURE
24:   JUMP 9 (to 34)
26: branch 7 (to 33)
27.   LITERAL 0x70 ('p')
29.   LITERAL 0x79 ('y')
31.   JUMP 2 (to 34)
33: FAILURE
34: GROUPREF_EXISTS 0 6 (to 41)
37. AT END
39. JUMP 5 (to 45)
41: LITERAL 0x3a (':')
43. LITERAL 0x20 (' ')
45: SUCCESS
'''
    self.assertEqual(out.getvalue(), dump)
    # Debug output is output again even a second time (bypassing
    # the cache -- issue #20426).
    with captured_stdout() as out:
        re.compile(pat, re.DEBUG)
    self.assertEqual(out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001870
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001871 def test_keyword_parameters(self):
1872 # Issue #20283: Accepting the string keyword parameter.
1873 pat = re.compile(r'(ab)')
1874 self.assertEqual(
1875 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1876 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001877 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1878 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001879 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1880 self.assertEqual(
1881 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1882 self.assertEqual(
1883 pat.split(string='abracadabra', maxsplit=1),
1884 ['', 'ab', 'racadabra'])
1885 self.assertEqual(
1886 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1887 (7, 9))
1888
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001889 def test_bug_20998(self):
1890 # Issue #20998: Fullmatch of repeated single character pattern
1891 # with ignore case.
1892 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1893
def test_locale_caching(self):
    # Issue #22410
    saved_ctype = locale.setlocale(locale.LC_CTYPE)
    self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
    # Both locales must be available, otherwise the test is skipped.
    for locale_name in ('en_US.iso88591', 'en_US.utf8'):
        try:
            locale.setlocale(locale.LC_CTYPE, locale_name)
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs %s locale' % locale_name)

    # Starting from an empty cache, run the per-locale checks in both
    # orders.
    check_orders = (
        (self.check_en_US_iso88591, self.check_en_US_utf8),
        (self.check_en_US_utf8, self.check_en_US_iso88591),
    )
    for first_check, second_check in check_orders:
        re.purge()
        first_check()
        second_check()
1911
def check_en_US_iso88591(self):
    # With LC_CTYPE set to en_US.iso88591, bytes 0xC5 and 0xE5 are
    # expected to match each other under LOCALE + IGNORECASE.
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    pairs = (
        (b'\xc5\xe5', b'\xc5\xe5'),
        (b'\xc5', b'\xe5'),
        (b'\xe5', b'\xc5'),
    )
    # Flags given as an argument ...
    for pattern, subject in pairs:
        self.assertTrue(re.match(pattern, subject, re.L|re.I))
    # ... and the same flags given inline.
    for pattern, subject in pairs:
        self.assertTrue(re.match(b'(?Li)' + pattern, subject))
1920
def check_en_US_utf8(self):
    # With LC_CTYPE set to en_US.utf8, bytes 0xC5 and 0xE5 still match
    # themselves but are not expected to match each other.
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    identical = (b'\xc5\xe5', b'\xc5\xe5')
    crossed = ((b'\xc5', b'\xe5'), (b'\xe5', b'\xc5'))

    # Flags given as an argument ...
    self.assertTrue(re.match(identical[0], identical[1], re.L|re.I))
    for pattern, subject in crossed:
        self.assertIsNone(re.match(pattern, subject, re.L|re.I))

    # ... and the same flags given inline.
    self.assertTrue(re.match(b'(?Li)' + identical[0], identical[1]))
    for pattern, subject in crossed:
        self.assertIsNone(re.match(b'(?Li)' + pattern, subject))
1929
def test_locale_compiled(self):
    # Patterns are compiled once under en_US.iso88591 and then matched
    # under both locales; the expected results differ per locale.
    saved_ctype = locale.setlocale(locale.LC_CTYPE)
    self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
    for locale_name in ('en_US.iso88591', 'en_US.utf8'):
        try:
            locale.setlocale(locale.LC_CTYPE, locale_name)
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs %s locale' % locale_name)

    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    literal = re.compile(b'\xc5\xe5', re.L|re.I)
    small_sets = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
    larger_sets = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
    negated_sets = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
    positive_patterns = (literal, small_sets, larger_sets)

    for compiled in positive_patterns:
        for subject in (b'\xc5\xe5', b'\xe5\xe5', b'\xc5\xc5'):
            self.assertTrue(compiled.match(subject))
    for subject in (b'\xe5\xc5', b'\xe5\xe5', b'\xc5\xc5'):
        self.assertIsNone(negated_sets.match(subject))

    # Same compiled objects, different locale.
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    for compiled in positive_patterns:
        self.assertTrue(compiled.match(b'\xc5\xe5'))
        self.assertIsNone(compiled.match(b'\xe5\xe5'))
        self.assertIsNone(compiled.match(b'\xc5\xc5'))
    self.assertTrue(negated_sets.match(b'\xe5\xc5'))
    self.assertIsNone(negated_sets.match(b'\xe5\xe5'))
    self.assertIsNone(negated_sets.match(b'\xc5\xc5'))
1961
Serhiy Storchakaad446d52014-11-10 13:49:00 +02001962 def test_error(self):
1963 with self.assertRaises(re.error) as cm:
1964 re.compile('(\u20ac))')
1965 err = cm.exception
1966 self.assertIsInstance(err.pattern, str)
1967 self.assertEqual(err.pattern, '(\u20ac))')
1968 self.assertEqual(err.pos, 3)
1969 self.assertEqual(err.lineno, 1)
1970 self.assertEqual(err.colno, 4)
1971 self.assertIn(err.msg, str(err))
1972 self.assertIn(' at position 3', str(err))
1973 self.assertNotIn(' at position 3', err.msg)
1974 # Bytes pattern
1975 with self.assertRaises(re.error) as cm:
1976 re.compile(b'(\xa4))')
1977 err = cm.exception
1978 self.assertIsInstance(err.pattern, bytes)
1979 self.assertEqual(err.pattern, b'(\xa4))')
1980 self.assertEqual(err.pos, 3)
1981 # Multiline pattern
1982 with self.assertRaises(re.error) as cm:
1983 re.compile("""
1984 (
1985 abc
1986 )
1987 )
1988 (
1989 """, re.VERBOSE)
1990 err = cm.exception
1991 self.assertEqual(err.pos, 77)
1992 self.assertEqual(err.lineno, 5)
1993 self.assertEqual(err.colno, 17)
1994 self.assertIn(err.msg, str(err))
1995 self.assertIn(' at position 77', str(err))
1996 self.assertIn('(line 5, column 17)', str(err))
1997
def test_misc_errors(self):
    # (pattern, expected error message, expected error position)
    malformed = (
        (r'(', 'missing ), unterminated subpattern', 0),
        (r'((a|b)', 'missing ), unterminated subpattern', 0),
        (r'(a|b))', 'unbalanced parenthesis', 5),
        (r'(?P', 'unexpected end of pattern', 3),
        (r'(?z)', 'unknown extension ?z', 1),
        (r'(?iz)', 'unknown flag', 3),
        (r'(?i', 'missing -, : or )', 3),
        (r'(?#abc', 'missing ), unterminated comment', 0),
        (r'(?<', 'unexpected end of pattern', 3),
        (r'(?<>)', 'unknown extension ?<>', 1),
        (r'(?', 'unexpected end of pattern', 2),
    )
    for pattern, message, position in malformed:
        self.checkPatternError(pattern, message, position)
2010
Victor Stinner8bf43e62016-11-14 12:38:43 +01002011 def test_enum(self):
2012 # Issue #28082: Check that str(flag) returns a human readable string
2013 # instead of an integer
2014 self.assertIn('ASCII', str(re.A))
2015 self.assertIn('DOTALL', str(re.S))
2016
Victor Stinnerb44fb122016-11-21 16:35:08 +01002017 def test_pattern_compare(self):
2018 pattern1 = re.compile('abc', re.IGNORECASE)
2019
Victor Stinnerbcf4dcc2016-11-22 15:30:38 +01002020 # equal to itself
2021 self.assertEqual(pattern1, pattern1)
2022 self.assertFalse(pattern1 != pattern1)
2023
Victor Stinnerb44fb122016-11-21 16:35:08 +01002024 # equal
2025 re.purge()
2026 pattern2 = re.compile('abc', re.IGNORECASE)
2027 self.assertEqual(hash(pattern2), hash(pattern1))
2028 self.assertEqual(pattern2, pattern1)
2029
2030 # not equal: different pattern
2031 re.purge()
2032 pattern3 = re.compile('XYZ', re.IGNORECASE)
2033 # Don't test hash(pattern3) != hash(pattern1) because there is no
2034 # warranty that hash values are different
2035 self.assertNotEqual(pattern3, pattern1)
2036
2037 # not equal: different flag (flags=0)
2038 re.purge()
2039 pattern4 = re.compile('abc')
2040 self.assertNotEqual(pattern4, pattern1)
2041
2042 # only == and != comparison operators are supported
2043 with self.assertRaises(TypeError):
2044 pattern1 < pattern2
2045
2046 def test_pattern_compare_bytes(self):
2047 pattern1 = re.compile(b'abc')
2048
2049 # equal: test bytes patterns
2050 re.purge()
2051 pattern2 = re.compile(b'abc')
2052 self.assertEqual(hash(pattern2), hash(pattern1))
2053 self.assertEqual(pattern2, pattern1)
2054
2055 # not equal: pattern of a different types (str vs bytes),
2056 # comparison must not raise a BytesWarning
2057 re.purge()
2058 pattern3 = re.compile('abc')
2059 with warnings.catch_warnings():
2060 warnings.simplefilter('error', BytesWarning)
2061 self.assertNotEqual(pattern3, pattern1)
2062
Serhiy Storchaka7e10dbb2017-02-04 22:53:57 +02002063 def test_bug_29444(self):
2064 s = bytearray(b'abcdefgh')
2065 m = re.search(b'[a-h]+', s)
2066 m2 = re.search(b'[e-h]+', s)
2067 self.assertEqual(m.group(), b'abcdefgh')
2068 self.assertEqual(m2.group(), b'efgh')
2069 s[:] = b'xyz'
2070 self.assertEqual(m.group(), b'xyz')
2071 self.assertEqual(m2.group(), b'')
2072
animalize4a7f44a2019-02-18 21:26:37 +08002073 def test_bug_34294(self):
2074 # Issue 34294: wrong capturing groups
2075
2076 # exists since Python 2
2077 s = "a\tx"
2078 p = r"\b(?=(\t)|(x))x"
2079 self.assertEqual(re.search(p, s).groups(), (None, 'x'))
2080
2081 # introduced in Python 3.7.0
2082 s = "ab"
2083 p = r"(?=(.)(.)?)"
2084 self.assertEqual(re.findall(p, s),
2085 [('a', 'b'), ('b', '')])
2086 self.assertEqual([m.groups() for m in re.finditer(p, s)],
2087 [('a', 'b'), ('b', None)])
2088
2089 # test-cases provided by issue34294, introduced in Python 3.7.0
2090 p = r"(?=<(?P<tag>\w+)/?>(?:(?P<text>.+?)</(?P=tag)>)?)"
2091 s = "<test><foo2/></test>"
2092 self.assertEqual(re.findall(p, s),
2093 [('test', '<foo2/>'), ('foo2', '')])
2094 self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
2095 [{'tag': 'test', 'text': '<foo2/>'},
2096 {'tag': 'foo2', 'text': None}])
2097 s = "<test>Hello</test><foo/>"
2098 self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
2099 [{'tag': 'test', 'text': 'Hello'},
2100 {'tag': 'foo', 'text': None}])
2101 s = "<test>Hello</test><foo/><foo/>"
2102 self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
2103 [{'tag': 'test', 'text': 'Hello'},
2104 {'tag': 'foo', 'text': None},
2105 {'tag': 'foo', 'text': None}])
2106
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02002107
class PatternReprTests(unittest.TestCase):
    # Tests for repr() of compiled patterns and of flag values.

    def check(self, pattern, expected):
        # Compile `pattern` without explicit flags and compare its repr.
        compiled = re.compile(pattern)
        self.assertEqual(repr(compiled), expected)

    def check_flags(self, pattern, flags, expected):
        # Compile `pattern` with `flags` and compare its repr.
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        self.check('random pattern', "re.compile('random pattern')")

    def test_single_flag(self):
        expected = "re.compile('random pattern', re.IGNORECASE)"
        self.check_flags('random pattern', re.IGNORECASE, expected)

    def test_multiple_flags(self):
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.check_flags('random pattern', re.I|re.S|re.X, expected)

    def test_unicode_flag(self):
        # re.U is not shown in the repr of a str pattern.
        self.check_flags('random pattern', re.U,
                         "re.compile('random pattern')")
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL)")
        self.check_flags('random pattern', re.I|re.S|re.U, expected)

    def test_inline_flags(self):
        # A flag set inside the pattern shows up in the repr as well.
        expected = "re.compile('(?i)pattern', re.IGNORECASE)"
        self.check('(?i)pattern', expected)

    def test_unknown_flags(self):
        # Bits that do not correspond to a named flag are shown in hex.
        self.check_flags('random pattern', 0x123000,
                         "re.compile('random pattern', 0x123000)")
        self.check_flags('random pattern', 0x123000|re.I,
                         "re.compile('random pattern', re.IGNORECASE|0x123000)")

    def test_bytes(self):
        self.check(b'bytes pattern', "re.compile(b'bytes pattern')")
        self.check_flags(b'bytes pattern', re.A,
                         "re.compile(b'bytes pattern', re.ASCII)")

    def test_locale(self):
        expected = "re.compile(b'bytes pattern', re.LOCALE)"
        self.check_flags(b'bytes pattern', re.L, expected)

    def test_quotes(self):
        # The repr quotes the pattern the way repr() of a str does.
        self.check('random "double quoted" pattern',
                   '''re.compile('random "double quoted" pattern')''')
        self.check("random 'single quoted' pattern",
                   '''re.compile("random 'single quoted' pattern")''')
        self.check('''both 'single' and "double" quotes''',
                   '''re.compile('both \\'single\\' and "double" quotes')''')

    def test_long_pattern(self):
        # The repr of a very long pattern is cut short, but its start
        # and the trailing flags remain visible.
        long_source = 'Very %spattern' % ('long ' * 1000)

        shown = repr(re.compile(long_source))
        self.assertLess(len(shown), 300)
        self.assertEqual(shown[:30], "re.compile('Very long long lon")

        shown = repr(re.compile(long_source, re.I))
        self.assertLess(len(shown), 300)
        self.assertEqual(shown[:30], "re.compile('Very long long lon")
        self.assertEqual(shown[-16:], ", re.IGNORECASE)")

    def test_flags_repr(self):
        # (flag expression, expected repr), plain flags first and then
        # their inverted counterparts.
        cases = (
            (re.I, "re.IGNORECASE"),
            (re.I|re.S|re.X, "re.IGNORECASE|re.DOTALL|re.VERBOSE"),
            (re.I|re.S|re.X|(1<<20),
             "re.IGNORECASE|re.DOTALL|re.VERBOSE|0x100000"),
            (~re.I, "~re.IGNORECASE"),
            (~(re.I|re.S|re.X), "~(re.IGNORECASE|re.DOTALL|re.VERBOSE)"),
            (~(re.I|re.S|re.X|(1<<20)),
             "~(re.IGNORECASE|re.DOTALL|re.VERBOSE|0x100000)"),
        )
        for flag_value, expected in cases:
            self.assertEqual(repr(flag_value), expected)
2184
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02002185
class ImplementationTest(unittest.TestCase):
    """
    Test implementation details of the re module.
    """

    def test_overlap_table(self):
        # (prefix, expected overlap table) pairs for the private helper
        # sre_compile._generate_overlap_table.
        build_table = sre_compile._generate_overlap_table
        expectations = (
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        )
        for prefix, table in expectations:
            self.assertEqual(build_table(prefix), table)
2199
2200
class ExternalTests(unittest.TestCase):
    # Data-driven tests fed by the tables in test.re_tests.

    def test_re_benchmarks(self):
        're_tests benchmarks'
        from test.re_tests import benchmarks
        # Each benchmark is a (pattern, string) pair in which the pattern
        # is expected to match the whole string.
        for pattern, s in benchmarks:
            with self.subTest(pattern=pattern, string=s):
                p = re.compile(pattern)
                self.assertTrue(p.search(s))
                self.assertTrue(p.match(s))
                self.assertTrue(p.fullmatch(s))
                # Same string embedded in a lot of padding: it must still
                # be found, and pos/endpos must isolate it again.
                s2 = ' '*10000 + s + ' '*10000
                self.assertTrue(p.search(s2))
                self.assertTrue(p.match(s2, 10000))
                self.assertTrue(p.match(s2, 10000, 10000 + len(s)))
                self.assertTrue(p.fullmatch(s2, 10000, 10000 + len(s)))

    def test_re_tests(self):
        're_tests test suite'
        from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR

        def group_text(match, key):
            # Value of group `key` for the eval() namespace.  Unmatched
            # groups become the string "None" (the fixture expressions
            # concatenate strings) and unknown groups become "Error".
            try:
                value = match.group(key)
            except IndexError:
                return "Error"
            return "None" if value is None else value

        for t in tests:
            pattern = s = outcome = repl = expected = None
            if len(t) == 5:
                pattern, s, outcome, repl, expected = t
            elif len(t) == 3:
                pattern, s, outcome = t
            else:
                raise ValueError('Test tuples should have 3 or 5 fields', t)

            with self.subTest(pattern=pattern, string=s):
                if outcome == SYNTAX_ERROR:  # Expected a syntax error
                    with self.assertRaises(re.error):
                        re.compile(pattern)
                    continue

                obj = re.compile(pattern)
                result = obj.search(s)
                if outcome == FAIL:
                    self.assertIsNone(result, 'Succeeded incorrectly')
                    continue

                with self.subTest():
                    self.assertTrue(result, 'Failed incorrectly')
                    # Matched, as expected, so now we compute the
                    # result string and compare it to our expected result.
                    start, end = result.span(0)
                    vardict = {'found': result.group(0),
                               'groups': result.group(),
                               'flags': result.re.flags}
                    for i in range(1, 100):
                        vardict['g%d' % i] = group_text(result, i)
                    for name in result.re.groupindex:
                        vardict[name] = group_text(result, name)
                    # NOTE: eval() is applied to expressions taken from
                    # the test.re_tests fixture table, not to external
                    # input.
                    self.assertEqual(eval(repl, vardict), expected,
                                     'grouping error')

                # Try the match with both pattern and string converted to
                # bytes, and check that it still succeeds.
                try:
                    bpat = bytes(pattern, "ascii")
                    bs = bytes(s, "ascii")
                except UnicodeEncodeError:
                    # skip non-ascii tests
                    pass
                else:
                    with self.subTest('bytes pattern match'):
                        obj = re.compile(bpat)
                        self.assertTrue(obj.search(bs))

                    # Try the match with LOCALE enabled, and check that it
                    # still succeeds.
                    with self.subTest('locale-sensitive match'):
                        obj = re.compile(bpat, re.LOCALE)
                        # Kept in its own variable: `result` (the str
                        # match that produced start/end) guards the
                        # range-limited check below and must not be
                        # replaced by the outcome of this search.
                        locale_result = obj.search(bs)
                        if locale_result is None:
                            print('=== Fails on locale-sensitive match', t)

                # Try the match with the search area limited to the extent
                # of the match and see if it still succeeds.  \B will
                # break (because it won't match at the end or start of a
                # string), so we'll ignore patterns that feature it.
                if (pattern[:2] != r'\B' and pattern[-2:] != r'\B'
                        and result is not None):
                    with self.subTest('range-limited match'):
                        obj = re.compile(pattern)
                        self.assertTrue(obj.search(s, start, end + 1))

                # Try the match with IGNORECASE enabled, and check that it
                # still succeeds.
                with self.subTest('case-insensitive match'):
                    obj = re.compile(pattern, re.IGNORECASE)
                    self.assertTrue(obj.search(s))

                # Try the match with UNICODE locale enabled, and check
                # that it still succeeds.
                with self.subTest('unicode-sensitive match'):
                    obj = re.compile(pattern, re.UNICODE)
                    self.assertTrue(obj.search(s))
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00002312
Gregory P. Smith5a631832010-07-27 05:31:29 +00002313
# Run every test case in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()