blob: 5ef6d7b12c50a3baf17e4a8375571a612f07e5d8 [file] [log] [blame]
Victor Stinnerd6debb22017-03-27 16:05:26 +02001from test.support import (gc_collect, bigmemtest, _2G,
2 cpython_only, captured_stdout)
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02003import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00004import re
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02005import sre_compile
Ezio Melottid2114eb2011-03-25 14:08:44 +02006import string
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02007import unittest
Victor Stinnerb44fb122016-11-21 16:35:08 +01008import warnings
9from re import Scanner
Raymond Hettinger027bb632004-05-31 03:09:25 +000010from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000011
# Misc tests from Tim Peters' re.doc

# WARNING: Don't change details in these tests if you don't know
# what you're doing. Some of these tests were carefully modeled to
# cover most of the code.
17
class S(str):
    """A str subclass whose item/slice access returns S instead of plain str.

    Used by the tests to check how the re functions treat str subclasses.
    """

    def __getitem__(self, index):
        # Re-wrap the result so indexing and slicing preserve the subclass.
        return S(super().__getitem__(index))
21
class B(bytes):
    """A bytes subclass whose item/slice access result is re-wrapped in B.

    Used by the tests to check how the re functions treat bytes subclasses.
    """

    def __getitem__(self, index):
        # Re-wrap the result so slicing preserves the subclass.
        return B(super().__getitem__(index))
25
Skip Montanaro8ed06da2003-04-24 19:43:18 +000026class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000027
def assertTypedEqual(self, actual, expect, msg=None):
    """Assert that *actual* equals *expect* and that the types match too.

    Equality is checked first with assertEqual().  Then, wherever *expect*
    is (or contains) a tuple or list, the corresponding elements are
    compared pairwise; every non-container leaf must have exactly the
    same type as its expected counterpart.

    *msg* is forwarded to the underlying assertions.
    """
    self.assertEqual(actual, expect, msg)

    def recurse(actual, expect):
        if isinstance(expect, (tuple, list)):
            # Descend element by element into nested sequences.
            for x, y in zip(actual, expect):
                recurse(x, y)
        else:
            # Leaf: equal values are not enough, the exact type must match
            # (e.g. a str subclass is not accepted where str is expected).
            self.assertIs(type(actual), type(expect), msg)

    recurse(actual, expect)
37
def checkPatternError(self, pattern, errmsg, pos=None):
    """Assert that compiling *pattern* raises re.error with the given details.

    *errmsg* must equal the exception's ``msg`` attribute.  If *pos* is
    not None it must equal the exception's ``pos`` attribute.
    """
    # The whole check runs inside the subTest so that a pattern which
    # unexpectedly compiles is also reported together with its pattern.
    with self.subTest(pattern=pattern):
        with self.assertRaises(re.error) as cm:
            re.compile(pattern)
        err = cm.exception
        self.assertEqual(err.msg, errmsg)
        if pos is not None:
            self.assertEqual(err.pos, pos)
46
def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
    """Assert that re.sub(pattern, repl, string) raises re.error.

    *errmsg* must equal the exception's ``msg`` attribute.  If *pos* is
    not None it must equal the exception's ``pos`` attribute.
    """
    # The whole check runs inside the subTest so that a template which
    # unexpectedly works is also reported with its pattern and repl.
    with self.subTest(pattern=pattern, repl=repl):
        with self.assertRaises(re.error) as cm:
            re.sub(pattern, repl, string)
        err = cm.exception
        self.assertEqual(err.msg, errmsg)
        if pos is not None:
            self.assertEqual(err.pos, pos)
55
Benjamin Petersone48944b2012-03-07 14:50:25 -060056 def test_keep_buffer(self):
57 # See bug 14212
58 b = bytearray(b'x')
59 it = re.finditer(b'a', b)
60 with self.assertRaises(BufferError):
61 b.extend(b'x'*400)
62 list(it)
63 del it
64 gc_collect()
65 b.extend(b'x'*400)
66
Raymond Hettinger027bb632004-05-31 03:09:25 +000067 def test_weakref(self):
68 s = 'QabbbcR'
69 x = re.compile('ab+c')
70 y = proxy(x)
71 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
72
Skip Montanaro8ed06da2003-04-24 19:43:18 +000073 def test_search_star_plus(self):
74 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
75 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
76 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
77 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030078 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000079 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
80 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
81 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
82 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030083 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000084
def bump_num(self, matchobj):
    """Replacement callback: return the matched integer plus one, as a str.

    The whole match (group 0) is parsed with int(), so leading zeros
    are dropped: '08' becomes '9'.
    """
    int_value = int(matchobj.group(0))
    return str(int_value + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000088
def test_basic_re_sub(self):
    # re.sub() on str, bytes and bytes-like inputs (including
    # subclasses), with callable and template replacements.
    self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
    self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
    self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
    self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
    for y in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')

    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    # count is accepted both positionally and as a keyword.
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                     '9.3 -3 23x99y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3),
                     '9.3 -3 23x99y')

    # A callable's return value is used literally; a template string
    # has its escapes processed.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', s.replace('\\', r'\\'), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
    # These letter escapes in a template must be rejected.
    for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
        with self.subTest(c):
            with self.assertRaises(re.error):
                # The call itself must raise, so there is nothing to
                # compare its result against.
                re.sub('a', '\\' + c, 'a')

    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000130
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000131 def test_bug_449964(self):
132 # fails for group followed by other escape
R David Murray44b548d2016-09-08 13:59:53 -0400133 self.assertEqual(re.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx'),
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000134 'xx\bxx\b')
135
136 def test_bug_449000(self):
137 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000138 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
139 'abc\ndef\n')
140 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
141 'abc\ndef\n')
142 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
143 'abc\ndef\n')
144 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
145 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000146
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000147 def test_bug_1661(self):
148 # Verify that flags do not get silently ignored with compiled patterns
149 pattern = re.compile('.')
150 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
151 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
152 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
153 self.assertRaises(ValueError, re.compile, pattern, re.I)
154
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000155 def test_bug_3629(self):
156 # A regex that triggered a bug in the sre-code validator
157 re.compile("(?P<quote>)(?(quote))")
158
def test_sub_template_numeric_escape(self):
    # bug 776311 and friends
    # Numeric escapes in a replacement template: octal escapes versus
    # group references.
    self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
    self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
    self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
    self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
    self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
    self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
    self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
    self.assertEqual(re.sub('x', r'\377', 'x'), '\377')

    self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
    self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')

    self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
    self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
    self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
    self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
    self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')

    # Three-digit octal escapes above 0o377 are rejected.
    self.checkTemplateError('x', r'\400', 'x',
                            r'octal escape value \400 outside of '
                            r'range 0-0o377', 0)
    self.checkTemplateError('x', r'\777', 'x',
                            r'octal escape value \777 outside of '
                            r'range 0-0o377', 0)

    # The pattern 'x' has no groups, so every group reference is invalid.
    self.checkTemplateError('x', r'\1', 'x', 'invalid group reference 1', 1)
    self.checkTemplateError('x', r'\8', 'x', 'invalid group reference 8', 1)
    self.checkTemplateError('x', r'\9', 'x', 'invalid group reference 9', 1)
    self.checkTemplateError('x', r'\11', 'x', 'invalid group reference 11', 1)
    self.checkTemplateError('x', r'\18', 'x', 'invalid group reference 18', 1)
    self.checkTemplateError('x', r'\1a', 'x', 'invalid group reference 1', 1)
    self.checkTemplateError('x', r'\90', 'x', 'invalid group reference 90', 1)
    self.checkTemplateError('x', r'\99', 'x', 'invalid group reference 99', 1)
    self.checkTemplateError('x', r'\118', 'x', 'invalid group reference 11', 1)
    self.checkTemplateError('x', r'\11a', 'x', 'invalid group reference 11', 1)
    self.checkTemplateError('x', r'\181', 'x', 'invalid group reference 18', 1)
    self.checkTemplateError('x', r'\800', 'x', 'invalid group reference 80', 1)
    self.checkTemplateError('x', r'\8', '', 'invalid group reference 8', 1)

    # in python2.3 (etc), these loop endlessly in sre_parser.py
    self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
    self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
                     'xz8')
    self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
                     'xza')
206
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000207 def test_qualified_re_sub(self):
208 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
Serhiy Storchakab02f8fc2016-09-25 20:36:23 +0300209 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Victor Stinner55e614a2014-10-29 16:58:59 +0100210 self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000211
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000212 def test_bug_114660(self):
213 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
214 'hello there')
215
def test_symbolic_groups(self):
    # Named groups in patterns: valid definitions and references
    # compile, malformed ones raise re.error with a specific message.
    re.compile(r'(?P<a>x)(?P=a)(?(a)y)')
    re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)')
    re.compile(r'(?P<a1>x)\1(?(1)y)')
    self.checkPatternError(r'(?P<a>)(?P<a>)',
                           "redefinition of group name 'a' as group 2; "
                           "was group 1")
    self.checkPatternError(r'(?P<a>(?P=a))',
                           "cannot refer to an open group", 10)
    self.checkPatternError(r'(?Pxy)', 'unknown extension ?Px')
    self.checkPatternError(r'(?P<a>)(?P=a', 'missing ), unterminated name', 11)
    self.checkPatternError(r'(?P=', 'missing group name', 4)
    self.checkPatternError(r'(?P=)', 'missing group name', 4)
    self.checkPatternError(r'(?P=1)', "bad character in group name '1'", 4)
    self.checkPatternError(r'(?P=a)', "unknown group name 'a'")
    self.checkPatternError(r'(?P=a1)', "unknown group name 'a1'")
    self.checkPatternError(r'(?P=a.)', "bad character in group name 'a.'", 4)
    self.checkPatternError(r'(?P<)', 'missing >, unterminated name', 4)
    self.checkPatternError(r'(?P<a', 'missing >, unterminated name', 4)
    self.checkPatternError(r'(?P<', 'missing group name', 4)
    self.checkPatternError(r'(?P<>)', 'missing group name', 4)
    self.checkPatternError(r'(?P<1>)', "bad character in group name '1'", 4)
    self.checkPatternError(r'(?P<a.>)', "bad character in group name 'a.'", 4)
    self.checkPatternError(r'(?(', 'missing group name', 3)
    self.checkPatternError(r'(?())', 'missing group name', 3)
    self.checkPatternError(r'(?(a))', "unknown group name 'a'", 3)
    self.checkPatternError(r'(?(-1))', "bad character in group name '-1'", 3)
    self.checkPatternError(r'(?(1a))', "bad character in group name '1a'", 3)
    self.checkPatternError(r'(?(a.))', "bad character in group name 'a.'", 3)
    # New valid/invalid identifiers in Python 3
    re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
    re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
    self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
    # Support > 100 groups.
    pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    pat = '(?:%s)(?(200)z|t)' % pat
    self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200253
def test_symbolic_refs(self):
    # \g<...> and numeric group references in replacement templates:
    # malformed or unknown references raise, valid ones substitute.
    self.checkTemplateError('(?P<a>x)', r'\g<a', 'xx',
                            'missing >, unterminated name', 3)
    self.checkTemplateError('(?P<a>x)', r'\g<', 'xx',
                            'missing group name', 3)
    self.checkTemplateError('(?P<a>x)', r'\g', 'xx', 'missing <', 2)
    self.checkTemplateError('(?P<a>x)', r'\g<a a>', 'xx',
                            "bad character in group name 'a a'", 3)
    self.checkTemplateError('(?P<a>x)', r'\g<>', 'xx',
                            'missing group name', 3)
    self.checkTemplateError('(?P<a>x)', r'\g<1a1>', 'xx',
                            "bad character in group name '1a1'", 3)
    self.checkTemplateError('(?P<a>x)', r'\g<2>', 'xx',
                            'invalid group reference 2', 3)
    self.checkTemplateError('(?P<a>x)', r'\2', 'xx',
                            'invalid group reference 2', 1)
    # An unknown (but well-formed) name is reported as IndexError.
    with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
        re.sub('(?P<a>x)', r'\g<ab>', 'xx')
    # A group that did not participate in the match substitutes as ''.
    self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
    self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
    self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
                            "bad character in group name '-1'", 3)
    # New valid/invalid identifiers in Python 3
    self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
    self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
    self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
                            "bad character in group name '©'", 3)
    # Support > 100 groups.
    pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000284
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000285 def test_re_subn(self):
286 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
287 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
288 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
289 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
Serhiy Storchakab02f8fc2016-09-25 20:36:23 +0300290 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Victor Stinner55e614a2014-10-29 16:58:59 +0100291 self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000292
def test_re_split(self):
    # re.split() on str, bytes and bytes-like inputs, with capturing
    # and non-capturing separators, including ones that can match empty.
    # (The loop variable is named 'text' so it does not shadow the
    # imported 'string' module.)
    for text in ":a:b::c", S(":a:b::c"):
        self.assertTypedEqual(re.split(":", text),
                              ['', 'a', 'b', '', 'c'])
        self.assertTypedEqual(re.split(":+", text),
                              ['', 'a', 'b', 'c'])
        self.assertTypedEqual(re.split("(:+)", text),
                              ['', ':', 'a', ':', 'b', '::', 'c'])
    for text in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                 memoryview(b":a:b::c")):
        self.assertTypedEqual(re.split(b":", text),
                              [b'', b'a', b'b', b'', b'c'])
        self.assertTypedEqual(re.split(b":+", text),
                              [b'', b'a', b'b', b'c'])
        self.assertTypedEqual(re.split(b"(:+)", text),
                              [b'', b':', b'a', b':', b'b', b'::', b'c'])
    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                    "\U0001d49c\U0001d49e\U0001d4b5"):
        text = ":%s:%s::%s" % (a, b, c)
        self.assertEqual(re.split(":", text), ['', a, b, '', c])
        self.assertEqual(re.split(":+", text), ['', a, b, c])
        self.assertEqual(re.split("(:+)", text),
                         ['', ':', a, ':', b, '::', c])

    self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
    self.assertEqual(re.split("(:)+", ":a:b::c"),
                     ['', ':', 'a', ':', 'b', ':', 'c'])
    self.assertEqual(re.split("([b:]+)", ":a:b::c"),
                     ['', ':', 'a', ':b::', 'c'])
    self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
                     ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c'])
    self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
                     ['', 'a', '', '', 'c'])

    # Separators that can match the empty string.
    for sep, expected in [
        (':*', ['', '', 'a', '', 'b', '', 'c', '']),
        ('(?::*)', ['', '', 'a', '', 'b', '', 'c', '']),
        ('(:*)', ['', ':', '', '', 'a', ':', '', '', 'b', '::', '', '', 'c', '', '']),
        ('(:)*', ['', ':', '', None, 'a', ':', '', None, 'b', ':', '', None, 'c', None, '']),
    ]:
        with self.subTest(sep=sep):
            self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)

    # Separators that only ever match the empty string.
    for sep, expected in [
        ('', ['', ':', 'a', ':', 'b', ':', ':', 'c', '']),
        (r'\b', [':', 'a', ':', 'b', '::', 'c', '']),
        (r'(?=:)', ['', ':a', ':b', ':', ':c']),
        (r'(?<=:)', [':', 'a:', 'b:', ':', 'c']),
    ]:
        with self.subTest(sep=sep):
            self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
345
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000346 def test_qualified_re_split(self):
Serhiy Storchakab02f8fc2016-09-25 20:36:23 +0300347 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
Victor Stinner55e614a2014-10-29 16:58:59 +0100348 self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
349 self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
350 self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000351 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200352 self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000353 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +0200354 self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
Serhiy Storchakafbb490f2018-01-04 11:06:13 +0200355 ['', ':', '', '', 'a:b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000356
def test_re_findall(self):
    # re.findall() on str, bytes and bytes-like inputs, with zero, one
    # and two capturing groups.
    # (The loop variable is named 'text' so it does not shadow the
    # imported 'string' module.)
    self.assertEqual(re.findall(":+", "abc"), [])
    for text in "a:b::c:::d", S("a:b::c:::d"):
        self.assertTypedEqual(re.findall(":+", text),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:+)", text),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:)(:*)", text),
                              [(":", ""), (":", ":"), (":", "::")])
    for text in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
                 memoryview(b"a:b::c:::d")):
        self.assertTypedEqual(re.findall(b":+", text),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:+)", text),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:)(:*)", text),
                              [(b":", b""), (b":", b":"), (b":", b"::")])
    for x in ("\xe0", "\u0430", "\U0001d49c"):
        xx = x * 2
        xxx = x * 3
        text = "a%sb%sc%sd" % (x, xx, xxx)
        self.assertEqual(re.findall("%s+" % x, text), [x, xx, xxx])
        self.assertEqual(re.findall("(%s+)" % x, text), [x, xx, xxx])
        self.assertEqual(re.findall("(%s)(%s*)" % (x, x), text),
                         [(x, ""), (x, x), (x, xx)])
Guido van Rossum49946571997-07-18 04:26:25 +0000382
Skip Montanaro5ba00542003-04-25 16:00:14 +0000383 def test_bug_117612(self):
384 self.assertEqual(re.findall(r"(a|(b))", "aba"),
385 [("a", ""),("b", "b"),("a", "")])
386
def test_re_match(self):
    # re.match() group access on str, bytes and bytes-like inputs.
    # (The loop variable is named 'text' so it does not shadow the
    # imported 'string' module.)
    for text in 'a', S('a'):
        self.assertEqual(re.match('a', text).groups(), ())
        self.assertEqual(re.match('(a)', text).groups(), ('a',))
        self.assertEqual(re.match('(a)', text).group(0), 'a')
        self.assertEqual(re.match('(a)', text).group(1), 'a')
        self.assertEqual(re.match('(a)', text).group(1, 1), ('a', 'a'))
    for text in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
        self.assertEqual(re.match(b'a', text).groups(), ())
        self.assertEqual(re.match(b'(a)', text).groups(), (b'a',))
        self.assertEqual(re.match(b'(a)', text).group(0), b'a')
        self.assertEqual(re.match(b'(a)', text).group(1), b'a')
        self.assertEqual(re.match(b'(a)', text).group(1, 1), (b'a', b'a'))
    for a in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.match(a, a).groups(), ())
        self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
        self.assertEqual(re.match('(%s)' % a, a).group(0), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))

    # Groups that did not participate are None, or the default passed
    # to groups().
    pat = re.compile('((a)|(b))(c)?')
    self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
    self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
    self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
    self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
    self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

    # group() accepts indices and names, also mixed in one call.
    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                     (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000419
Serhiy Storchaka977b3ac2016-06-18 16:48:07 +0300420 def test_group(self):
421 class Index:
422 def __init__(self, value):
423 self.value = value
424 def __index__(self):
425 return self.value
426 # A single group
427 m = re.match('(a)(b)', 'ab')
428 self.assertEqual(m.group(), 'ab')
429 self.assertEqual(m.group(0), 'ab')
430 self.assertEqual(m.group(1), 'a')
431 self.assertEqual(m.group(Index(1)), 'a')
432 self.assertRaises(IndexError, m.group, -1)
433 self.assertRaises(IndexError, m.group, 3)
434 self.assertRaises(IndexError, m.group, 1<<1000)
435 self.assertRaises(IndexError, m.group, Index(1<<1000))
436 self.assertRaises(IndexError, m.group, 'x')
437 # Multiple groups
438 self.assertEqual(m.group(2, 1), ('b', 'a'))
439 self.assertEqual(m.group(Index(2), Index(1)), ('b', 'a'))
440
Eric V. Smith605bdae2016-09-11 08:55:43 -0400441 def test_match_getitem(self):
442 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
443
444 m = pat.match('a')
445 self.assertEqual(m['a1'], 'a')
446 self.assertEqual(m['b2'], None)
447 self.assertEqual(m['c3'], None)
448 self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=None')
449 self.assertEqual(m[0], 'a')
450 self.assertEqual(m[1], 'a')
451 self.assertEqual(m[2], None)
452 self.assertEqual(m[3], None)
453 with self.assertRaisesRegex(IndexError, 'no such group'):
454 m['X']
455 with self.assertRaisesRegex(IndexError, 'no such group'):
456 m[-1]
457 with self.assertRaisesRegex(IndexError, 'no such group'):
458 m[4]
459 with self.assertRaisesRegex(IndexError, 'no such group'):
460 m[0, 1]
461 with self.assertRaisesRegex(IndexError, 'no such group'):
462 m[(0,)]
463 with self.assertRaisesRegex(IndexError, 'no such group'):
464 m[(0, 1)]
Serhiy Storchaka50754162017-08-03 11:45:23 +0300465 with self.assertRaisesRegex(IndexError, 'no such group'):
Eric V. Smith605bdae2016-09-11 08:55:43 -0400466 'a1={a2}'.format_map(m)
467
468 m = pat.match('ac')
469 self.assertEqual(m['a1'], 'a')
470 self.assertEqual(m['b2'], None)
471 self.assertEqual(m['c3'], 'c')
472 self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=c')
473 self.assertEqual(m[0], 'ac')
474 self.assertEqual(m[1], 'a')
475 self.assertEqual(m[2], None)
476 self.assertEqual(m[3], 'c')
477
478 # Cannot assign.
479 with self.assertRaises(TypeError):
480 m[0] = 1
481
482 # No len().
483 self.assertRaises(TypeError, len, m)
484
def test_re_fullmatch(self):
    # Issue 16203: Proposal: add re.fullmatch() method.
    # (The loop variable is named 'text' so it does not shadow the
    # imported 'string' module.)
    self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
    for text in "ab", S("ab"):
        self.assertEqual(re.fullmatch(r"a|ab", text).span(), (0, 2))
    for text in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
        self.assertEqual(re.fullmatch(br"a|ab", text).span(), (0, 2))
    for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
        r = r"%s|%s" % (a, a + b)
        self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2))
    # Non-greedy quantifiers must still extend to cover the whole string.
    self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3))
    self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3))
    self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2))
    self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3))
    self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
    self.assertIsNone(re.fullmatch(r"a+", "ab"))
    # A trailing newline is part of the string and prevents a full match.
    self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
    self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
    self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
    self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
    self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4))
    self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2))

    # With pos/endpos the match must cover exactly that window.
    self.assertEqual(
        re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
    self.assertEqual(
        re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
    self.assertEqual(
        re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
514
def test_re_groupref_exists(self):
    # Conditional patterns: (?(group)yes|no) and (?(group)yes).
    self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
                     ('(', 'a'))
    self.assertEqual(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a').groups(),
                     (None, 'a'))
    self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', 'a)'))
    self.assertIsNone(re.match(r'^(\()?([^()]+)(?(1)\))$', '(a'))
    self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
                     ('a', 'b'))
    self.assertEqual(re.match(r'^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
                     (None, 'd'))
    self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
                     (None, 'd'))
    self.assertEqual(re.match(r'^(?:(a)|c)((?(1)|d))$', 'a').groups(),
                     ('a', ''))

    # Tests for bug #1177831: exercise groups other than the first group
    p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
    self.assertEqual(p.match('abc').groups(),
                     ('a', 'b', 'c'))
    self.assertEqual(p.match('ad').groups(),
                     ('a', None, 'd'))
    self.assertIsNone(p.match('abd'))
    self.assertIsNone(p.match('ac'))

    # Support > 100 groups.
    pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
    pat = '(?:%s)(?(200)z)' % pat
    self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))

    # Malformed conditionals are rejected at compile time.
    self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
    self.checkPatternError(r'()(?(1)a|b',
                           'missing ), unterminated subpattern', 2)
    self.checkPatternError(r'()(?(1)a|b|c)',
                           'conditional backref with more than '
                           'two branches', 10)
551
def test_re_groupref_overflow(self):
    # A group reference equal to MAXGROUPS is reported as invalid, both
    # in a replacement template and in a conditional inside a pattern.
    from sre_constants import MAXGROUPS
    message = 'invalid group reference %d' % MAXGROUPS
    self.checkTemplateError('()', r'\g<%s>' % MAXGROUPS, 'xx', message, 3)
    self.checkPatternError(r'(?P<a>)(?(%d))' % MAXGROUPS, message, 10)
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200558
def test_re_groupref(self):
    # Numeric backreferences (\1) against an optional leading group.
    self.assertEqual(
        re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(), ('|', 'a'))
    self.assertEqual(
        re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(), (None, 'a'))
    for text in ('a|', '|a'):
        self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', text))

    # Backreference to a group that lives inside an alternation.
    self.assertEqual(
        re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(), ('a', 'a'))
    self.assertEqual(
        re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(), (None, None))

    self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
572
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000573 def test_groupdict(self):
574 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
575 'first second').groupdict(),
576 {'first':'first', 'second':'second'})
577
578 def test_expand(self):
579 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
580 "first second")
581 .expand(r"\2 \1 \g<second> \g<first>"),
582 "second first second first")
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300583 self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
584 "first")
585 .expand(r"\2 \g<second>"),
586 " ")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000587
def test_repeat_minmax(self):
    # Bounded repetition {m} / {m,n} and the lazy variants, applied first
    # to a capturing group and then to a plain literal.

    # The upper bound is too small to consume all of "abc".
    for pattern in (r"^(\w){1}$", r"^(\w){1}?$",
                    r"^(\w){1,2}$", r"^(\w){1,2}?$"):
        self.assertIsNone(re.match(pattern, "abc"))

    # A repeated group reports its last iteration.  The greedy {3,4}
    # case is listed explicitly; it used to be a second copy of the lazy
    # {3,4}? case, so the greedy form was never exercised.
    for pattern in (r"^(\w){3}$", r"^(\w){1,3}$",
                    r"^(\w){1,4}$", r"^(\w){3,4}$",
                    r"^(\w){3}?$", r"^(\w){1,3}?$",
                    r"^(\w){1,4}?$", r"^(\w){3,4}?$"):
        self.assertEqual(re.match(pattern, "abc").group(1), "c")

    # Same checks on a repeated literal.
    for pattern in (r"^x{1}$", r"^x{1}?$", r"^x{1,2}$", r"^x{1,2}?$"):
        self.assertIsNone(re.match(pattern, "xxx"))

    # As above, the first {3,4} entry is greedy rather than a duplicate
    # of the lazy one at the end of the list.
    for pattern in (r"^x{3}$", r"^x{1,3}$", r"^x{3,3}$",
                    r"^x{1,4}$", r"^x{3,4}$",
                    r"^x{3}?$", r"^x{1,3}?$",
                    r"^x{1,4}?$", r"^x{3,4}?$"):
        self.assertTrue(re.match(pattern, "xxx"))

    # An empty brace pair is not a repeat; it matches literally.
    self.assertIsNone(re.match(r"^x{}$", "xxx"))
    self.assertTrue(re.match(r"^x{}$", "x{}"))

    self.checkPatternError(r'x{2,1}',
                           'min repeat greater than max repeat', 2)
623
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000624 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000625 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000626 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000627 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
628 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
629 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
630 {'first': 1, 'other': 2})
631
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000632 self.assertEqual(re.match("(a)", "a").pos, 0)
633 self.assertEqual(re.match("(a)", "a").endpos, 1)
634 self.assertEqual(re.match("(a)", "a").string, "a")
635 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300636 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000637
Serhiy Storchaka07360df2015-03-30 01:01:48 +0300638 # Issue 14260. groupindex should be non-modifiable mapping.
639 p = re.compile(r'(?i)(?P<first>a)(?P<other>b)')
640 self.assertEqual(sorted(p.groupindex), ['first', 'other'])
641 self.assertEqual(p.groupindex['other'], 2)
642 with self.assertRaises(TypeError):
643 p.groupindex['other'] = 0
644 self.assertEqual(p.groupindex['other'], 2)
645
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000646 def test_special_escapes(self):
647 self.assertEqual(re.search(r"\b(b.)\b",
648 "abcd abc bcd bx").group(1), "bx")
649 self.assertEqual(re.search(r"\B(b.)\B",
650 "abc bcd bc abxd").group(1), "bx")
651 self.assertEqual(re.search(r"\b(b.)\b",
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300652 "abcd abc bcd bx", re.ASCII).group(1), "bx")
653 self.assertEqual(re.search(r"\B(b.)\B",
654 "abc bcd bc abxd", re.ASCII).group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000655 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
656 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300657 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300658 self.assertEqual(re.search(br"\b(b.)\b",
659 b"abcd abc bcd bx").group(1), b"bx")
660 self.assertEqual(re.search(br"\B(b.)\B",
661 b"abc bcd bc abxd").group(1), b"bx")
662 self.assertEqual(re.search(br"\b(b.)\b",
663 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
664 self.assertEqual(re.search(br"\B(b.)\B",
665 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
666 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
667 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300668 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000669 self.assertEqual(re.search(r"\d\D\w\W\s\S",
670 "1aa! a").group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300671 self.assertEqual(re.search(br"\d\D\w\W\s\S",
672 b"1aa! a").group(0), b"1aa! a")
673 self.assertEqual(re.search(r"\d\D\w\W\s\S",
674 "1aa! a", re.ASCII).group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300675 self.assertEqual(re.search(br"\d\D\w\W\s\S",
676 b"1aa! a", re.LOCALE).group(0), b"1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000677
def test_other_escapes(self):
    self.checkPatternError("\\", 'bad escape (end of pattern)', 0)

    # (pattern, text matched in full, text that must not match or None)
    cases = (
        (r"\(", '(', ')'),
        (r"\\", '\\', None),
        (r"[\]]", ']', '['),
        (r"[a\-c]", '-', 'b'),
        (r"[\^a]+", 'a^', 'b'),
    )
    for pattern, accepted, rejected in cases:
        self.assertEqual(re.match(pattern, accepted).group(), accepted)
        if rejected is not None:
            self.assertIsNone(re.match(pattern, rejected))

    re.purge()  # for warnings
    # Letters with no escape meaning are errors, outside a class ...
    for letter in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY':
        with self.subTest(letter):
            with self.assertRaises(re.error):
                re.compile('\\' + letter)
    # ... and inside one.
    for letter in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
        with self.subTest(letter):
            with self.assertRaises(re.error):
                re.compile('[\\' + letter + ']')
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200696
Ezio Melotti5a045b92012-02-29 11:48:44 +0200697 def test_string_boundaries(self):
698 # See http://bugs.python.org/issue10713
699 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
700 "abc")
701 # There's a word boundary at the start of a string.
702 self.assertTrue(re.match(r"\b", "abc"))
703 # A non-empty string includes a non-boundary zero-length match.
704 self.assertTrue(re.search(r"\B", "abc"))
705 # There is no non-boundary match at the start of a string.
706 self.assertFalse(re.match(r"\B", "abc"))
707 # However, an empty string contains no word boundaries, and also no
708 # non-boundaries.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300709 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200710 # This one is questionable and different from the perlre behaviour,
711 # but describes current behavior.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300712 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200713 # A single word-character string has two boundaries, but no
714 # non-boundary gaps.
715 self.assertEqual(len(re.findall(r"\b", "a")), 2)
716 self.assertEqual(len(re.findall(r"\B", "a")), 0)
717 # If there are no words, there are no boundaries
718 self.assertEqual(len(re.findall(r"\b", " ")), 0)
719 self.assertEqual(len(re.findall(r"\b", " ")), 0)
720 # Can match around the whitespace.
721 self.assertEqual(len(re.findall(r"\B", " ")), 2)
722
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000723 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000724 self.assertEqual(re.match("([\u2222\u2223])",
725 "\u2222").group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300726 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300727 self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000728
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100729 def test_big_codesize(self):
730 # Issue #1160
731 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300732 self.assertTrue(r.match('1000'))
733 self.assertTrue(r.match('9999'))
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100734
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000735 def test_anyall(self):
736 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
737 "a\nb")
738 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
739 "a\n\nb")
740
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200741 def test_lookahead(self):
R David Murray44b548d2016-09-08 13:59:53 -0400742 self.assertEqual(re.match(r"(a(?=\s[^a]))", "a b").group(1), "a")
743 self.assertEqual(re.match(r"(a(?=\s[^a]*))", "a b").group(1), "a")
744 self.assertEqual(re.match(r"(a(?=\s[abc]))", "a b").group(1), "a")
745 self.assertEqual(re.match(r"(a(?=\s[abc]*))", "a bc").group(1), "a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000746 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
747 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
748 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
749
750 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
751 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
752 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
753 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
754
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200755 # Group reference.
756 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
757 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
758 # Conditional group reference.
759 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
760 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
761 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
762 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
763 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
764 # Group used before defined.
765 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
766 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
767 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
768
769 def test_lookbehind(self):
770 self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
771 self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
772 self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
773 self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
774 # Group reference.
775 self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
776 self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
777 self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
778 self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
779 # Conditional group reference.
780 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
781 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
782 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
783 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
784 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
785 # Group used before defined.
786 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
787 self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc'))
788 self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc'))
789 # Group defined in the same lookbehind pattern
790 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)')
791 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)')
792 self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)')
793 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)')
794
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000795 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000796 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300797 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000798 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
799 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
800 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
801 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
802 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
803 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
804 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
805 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
806
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200807 assert '\u212a'.lower() == 'k' # 'K'
808 self.assertTrue(re.match(r'K', '\u212a', re.I))
809 self.assertTrue(re.match(r'k', '\u212a', re.I))
810 self.assertTrue(re.match(r'\u212a', 'K', re.I))
811 self.assertTrue(re.match(r'\u212a', 'k', re.I))
812 assert '\u017f'.upper() == 'S' # 'ſ'
813 self.assertTrue(re.match(r'S', '\u017f', re.I))
814 self.assertTrue(re.match(r's', '\u017f', re.I))
815 self.assertTrue(re.match(r'\u017f', 'S', re.I))
816 self.assertTrue(re.match(r'\u017f', 's', re.I))
817 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
818 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
819 self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
820
821 def test_ignore_case_set(self):
822 self.assertTrue(re.match(r'[19A]', 'A', re.I))
823 self.assertTrue(re.match(r'[19a]', 'a', re.I))
824 self.assertTrue(re.match(r'[19a]', 'A', re.I))
825 self.assertTrue(re.match(r'[19A]', 'a', re.I))
826 self.assertTrue(re.match(br'[19A]', b'A', re.I))
827 self.assertTrue(re.match(br'[19a]', b'a', re.I))
828 self.assertTrue(re.match(br'[19a]', b'A', re.I))
829 self.assertTrue(re.match(br'[19A]', b'a', re.I))
830 assert '\u212a'.lower() == 'k' # 'K'
831 self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
832 self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
833 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
834 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
835 assert '\u017f'.upper() == 'S' # 'ſ'
836 self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
837 self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
838 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
839 self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
840 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
841 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
842 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
843
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200844 def test_ignore_case_range(self):
845 # Issues #3511, #17381.
846 self.assertTrue(re.match(r'[9-a]', '_', re.I))
847 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
848 self.assertTrue(re.match(br'[9-a]', b'_', re.I))
849 self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
850 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
851 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
852 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
853 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
854 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
855 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
856 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
857 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
858 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
859 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
860 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
861 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
862
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200863 assert '\u212a'.lower() == 'k' # 'K'
864 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
865 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
866 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
867 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
868 assert '\u017f'.upper() == 'S' # 'ſ'
869 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
870 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
871 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
872 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
873 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
874 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
875 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
876
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000877 def test_category(self):
878 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
879
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300880 @cpython_only
881 def test_case_helpers(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000882 import _sre
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300883 for i in range(128):
884 c = chr(i)
885 lo = ord(c.lower())
886 self.assertEqual(_sre.ascii_tolower(i), lo)
887 self.assertEqual(_sre.unicode_tolower(i), lo)
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300888 iscased = c in string.ascii_letters
889 self.assertEqual(_sre.ascii_iscased(i), iscased)
890 self.assertEqual(_sre.unicode_iscased(i), iscased)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000891
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300892 for i in list(range(128, 0x1000)) + [0x10400, 0x10428]:
893 c = chr(i)
894 self.assertEqual(_sre.ascii_tolower(i), i)
895 if i != 0x0130:
896 self.assertEqual(_sre.unicode_tolower(i), ord(c.lower()))
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300897 iscased = c != c.lower() or c != c.upper()
898 self.assertFalse(_sre.ascii_iscased(i))
899 self.assertEqual(_sre.unicode_iscased(i),
900 c != c.lower() or c != c.upper())
Serhiy Storchaka7186cc22017-05-05 10:42:46 +0300901
902 self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130)
903 self.assertEqual(_sre.unicode_tolower(0x0130), ord('i'))
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300904 self.assertFalse(_sre.ascii_iscased(0x0130))
905 self.assertTrue(_sre.unicode_iscased(0x0130))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000906
907 def test_not_literal(self):
R David Murray44b548d2016-09-08 13:59:53 -0400908 self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
909 self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000910
Serhiy Storchaka05cb7282017-11-16 12:38:26 +0200911 def test_possible_set_operations(self):
912 s = bytes(range(128)).decode()
913 with self.assertWarns(FutureWarning):
914 p = re.compile(r'[0-9--1]')
915 self.assertEqual(p.findall(s), list('-./0123456789'))
916 self.assertEqual(re.findall(r'[--1]', s), list('-./01'))
917 with self.assertWarns(FutureWarning):
918 p = re.compile(r'[%--1]')
919 self.assertEqual(p.findall(s), list("%&'()*+,-1"))
920 with self.assertWarns(FutureWarning):
921 p = re.compile(r'[%--]')
922 self.assertEqual(p.findall(s), list("%&'()*+,-"))
923
924 with self.assertWarns(FutureWarning):
925 p = re.compile(r'[0-9&&1]')
926 self.assertEqual(p.findall(s), list('&0123456789'))
927 with self.assertWarns(FutureWarning):
928 p = re.compile(r'[\d&&1]')
929 self.assertEqual(p.findall(s), list('&0123456789'))
930 self.assertEqual(re.findall(r'[&&1]', s), list('&1'))
931
932 with self.assertWarns(FutureWarning):
933 p = re.compile(r'[0-9||a]')
934 self.assertEqual(p.findall(s), list('0123456789a|'))
935 with self.assertWarns(FutureWarning):
936 p = re.compile(r'[\d||a]')
937 self.assertEqual(p.findall(s), list('0123456789a|'))
938 self.assertEqual(re.findall(r'[||1]', s), list('1|'))
939
940 with self.assertWarns(FutureWarning):
941 p = re.compile(r'[0-9~~1]')
942 self.assertEqual(p.findall(s), list('0123456789~'))
943 with self.assertWarns(FutureWarning):
944 p = re.compile(r'[\d~~1]')
945 self.assertEqual(p.findall(s), list('0123456789~'))
946 self.assertEqual(re.findall(r'[~~1]', s), list('1~'))
947
948 with self.assertWarns(FutureWarning):
949 p = re.compile(r'[[0-9]|]')
950 self.assertEqual(p.findall(s), list('0123456789[]'))
951
952 with self.assertWarns(FutureWarning):
953 p = re.compile(r'[[:digit:]|]')
954 self.assertEqual(p.findall(s), list(':[]dgit'))
955
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000956 def test_search_coverage(self):
R David Murray44b548d2016-09-08 13:59:53 -0400957 self.assertEqual(re.search(r"\s(b)", " b").group(1), "b")
958 self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000959
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.fullmatch):
    """Assert that matcher(pattern, text) matches as expected.

    With neither *match* nor *span* given, the pattern must match the
    whole of *text*.  Otherwise both must be given: *match* is the
    expected matched text and *span* the expected (start, end) pair.
    Giving only one of them raises ValueError.
    """
    supplied = [arg is not None for arg in (match, span)]
    if not any(supplied):
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    elif not all(supplied):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    found = matcher(pattern, text)
    self.assertTrue(found)
    self.assertEqual(found.group(), match)
    self.assertEqual(found.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000973
# Characters that re.escape() is expected to return unchanged; the
# re.escape tests below compare its output against this string.
LITERAL_CHARS = string.ascii_letters + string.digits + '!"%\',/:;<=>@_`'
Serhiy Storchaka59083002017-04-13 21:06:43 +0300975
def test_re_escape(self):
    # Every Latin-1 character, once escaped, must match itself: on its
    # own, inside a character class, and in verbose mode.
    all_chars = ''.join(map(chr, range(256)))
    for char in all_chars:
        escaped = re.escape(char)
        self.assertMatch(escaped, char)
        self.assertMatch('[' + escaped + ']', char)
        self.assertMatch('(?x)' + escaped, char)
    self.assertMatch(re.escape(all_chars), all_chars)
    # These characters must come back with a leading backslash.
    for char in '-.]{}':
        self.assertEqual(re.escape(char)[:1], '\\')
    # Literal characters are returned unchanged.
    self.assertEqual(re.escape(self.LITERAL_CHARS), self.LITERAL_CHARS)
Guido van Rossum49946571997-07-18 04:26:25 +0000987
def test_re_escape_bytes(self):
    # Every byte value, once escaped, must match itself: on its own,
    # inside a character class, and in verbose mode.
    all_bytes = bytes(range(256))
    for value in all_bytes:
        single = bytes([value])
        escaped = re.escape(single)
        self.assertMatch(escaped, single)
        self.assertMatch(b'[' + escaped + b']', single)
        self.assertMatch(b'(?x)' + escaped, single)
    self.assertMatch(re.escape(all_bytes), all_bytes)
    # These bytes must come back with a leading backslash.
    for value in b'-.]{}':
        self.assertEqual(re.escape(bytes([value]))[:1], b'\\')
    # Literal characters are returned unchanged.
    expected = self.LITERAL_CHARS.encode('ascii')
    self.assertEqual(re.escape(expected), expected)
Guido van Rossum698280d2008-09-10 17:44:35 +00001001
def test_re_escape_non_ascii(self):
    # Non-ASCII characters are left unescaped and still match.
    text = 'xxx\u2620\u2620\u2620xxx'
    escaped = re.escape(text)
    self.assertEqual(escaped, text)
    self.assertMatch(escaped, text)
    # An escaped non-ASCII character can be used inside a larger pattern.
    pattern = '.%s+.' % re.escape('\u2620')
    self.assertMatch(pattern, text,
                     'x\u2620\u2620\u2620x', (2, 7), re.search)
1009
def test_re_escape_non_ascii_bytes(self):
    # UTF-8 encoded non-ASCII bytes are left unescaped and still match.
    data = 'y\u2620y\u2620y'.encode('utf-8')
    escaped = re.escape(data)
    self.assertEqual(escaped, data)
    self.assertMatch(escaped, data)
    # The encoded character occurs twice in the data.
    needle = re.escape('\u2620'.encode('utf-8'))
    occurrences = re.findall(needle, data)
    self.assertEqual(len(occurrences), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +00001017
Serhiy Storchakab85a9762014-09-15 11:33:19 +03001018 def test_pickling(self):
1019 import pickle
1020 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
1021 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
1022 pickled = pickle.dumps(oldpat, proto)
1023 newpat = pickle.loads(pickled)
1024 self.assertEqual(newpat, oldpat)
1025 # current pickle expects the _compile() reconstructor in re module
1026 from re import _compile
Guido van Rossum23b22571997-07-17 22:36:14 +00001027
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001028 def test_copying(self):
1029 import copy
1030 p = re.compile(r'(?P<int>\d+)(?:\.(?P<frac>\d*))?')
1031 self.assertIs(copy.copy(p), p)
1032 self.assertIs(copy.deepcopy(p), p)
1033 m = p.match('12.34')
1034 self.assertIs(copy.copy(m), m)
1035 self.assertIs(copy.deepcopy(m), m)
1036
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001037 def test_constants(self):
1038 self.assertEqual(re.I, re.IGNORECASE)
1039 self.assertEqual(re.L, re.LOCALE)
1040 self.assertEqual(re.M, re.MULTILINE)
1041 self.assertEqual(re.S, re.DOTALL)
1042 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001043
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001044 def test_flags(self):
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001045 for flag in [re.I, re.M, re.X, re.S, re.A, re.U]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001046 self.assertTrue(re.compile('^pattern$', flag))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001047 for flag in [re.I, re.M, re.X, re.S, re.A, re.L]:
1048 self.assertTrue(re.compile(b'^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +00001049
def test_sre_character_literals(self):
    # Numeric escapes outside a character class.  Each escape is tried
    # alone and followed by a character that must not be absorbed into
    # the escape.
    for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        char = chr(i)
        templates = []
        if i < 256:
            templates.append((r"\%03o", ("", "0", "8")))
            templates.append((r"\x%02x", ("", "0", "z")))
        if i < 0x10000:
            templates.append((r"\u%04x", ("", "0", "z")))
        templates.append((r"\U%08x", ("", "0", "z")))
        for template, suffixes in templates:
            for suffix in suffixes:
                pattern = (template + suffix) % i
                self.assertTrue(re.match(pattern, char + suffix))

    # Short octal escapes.
    for pattern, text in ((r"\0", "\000"), (r"\08", "\0008"),
                          (r"\01", "\001"), (r"\018", "\0018")):
        self.assertTrue(re.match(pattern, text))

    # Out-of-range and truncated escapes.
    self.checkPatternError(r"\567",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 0)
    self.checkPatternError(r"\911", 'invalid group reference 91', 1)
    self.checkPatternError(r"\x1", r'incomplete escape \x1', 0)
    self.checkPatternError(r"\x1z", r'incomplete escape \x1', 0)
    self.checkPatternError(r"\u123", r'incomplete escape \u123', 0)
    self.checkPatternError(r"\u123z", r'incomplete escape \u123', 0)
    self.checkPatternError(r"\U0001234", r'incomplete escape \U0001234', 0)
    self.checkPatternError(r"\U0001234z", r'incomplete escape \U0001234', 0)
    self.checkPatternError(r"\U00110000", r'bad escape \U00110000', 0)
Skip Montanaro7d9963f2003-04-25 14:12:40 +00001081
def test_sre_character_class_literals(self):
    # Numeric escapes inside a character class.
    for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        char = chr(i)
        templates = []
        if i < 256:
            templates += [r"[\%o]", r"[\%o8]",
                          r"[\%03o]", r"[\%03o0]", r"[\%03o8]",
                          r"[\x%02x]", r"[\x%02x0]", r"[\x%02xz]"]
        if i < 0x10000:
            templates += [r"[\u%04x]", r"[\u%04x0]", r"[\u%04xz]"]
        templates.append(r"[\U%08x]")
        for template in templates:
            self.assertTrue(re.match(template % i, char))
        # These two are matched against the character plus the extra one.
        self.assertTrue(re.match(r"[\U%08x0]" % i, char + "0"))
        self.assertTrue(re.match(r"[\U%08xz]" % i, char + "z"))

    # Out-of-range and truncated escapes.
    self.checkPatternError(r"[\567]",
                           r'octal escape value \567 outside of '
                           r'range 0-0o377', 1)
    self.checkPatternError(r"[\911]", r'bad escape \9', 1)
    self.checkPatternError(r"[\x1z]", r'incomplete escape \x1', 1)
    self.checkPatternError(r"[\u123z]", r'incomplete escape \u123', 1)
    self.checkPatternError(r"[\U0001234z]", r'incomplete escape \U0001234', 1)
    self.checkPatternError(r"[\U00110000]", r'bad escape \U00110000', 1)
    # A range whose endpoints are both outside the BMP.
    self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +02001109
def test_sre_byte_literals(self):
    # Octal and \x escapes in bytes patterns, each optionally followed by
    # a literal character that must also appear in the subject.
    templates = (
        (r"\%03o", b""), (r"\%03o0", b"0"), (r"\%03o8", b"8"),
        (r"\x%02x", b""), (r"\x%02x0", b"0"), (r"\x%02xz", b"z"),
    )
    for value in (0, 8, 16, 32, 64, 127, 128, 255):
        byte = bytes([value])
        for template, tail in templates:
            pattern = (template % value).encode()
            self.assertTrue(re.match(pattern, byte + tail))

    # \u and \U escapes are rejected in bytes patterns.
    for pattern in (br"\u1234", br"\U00012345"):
        with self.assertRaises(re.error):
            re.compile(pattern)

    # Short octal escapes followed by a non-octal or further digit.
    short_octal = (
        (br"\0", b"\000"),
        (br"\08", b"\0008"),
        (br"\01", b"\001"),
        (br"\018", b"\0018"),
    )
    for pattern, subject in short_octal:
        self.assertTrue(re.match(pattern, subject))

    self.checkPatternError(
        br"\567", r'octal escape value \567 outside of range 0-0o377', 0)
    self.checkPatternError(br"\911", 'invalid group reference 91', 1)
    for pattern in (br"\x1", br"\x1z"):
        self.checkPatternError(pattern, r'incomplete escape \x1', 0)
Antoine Pitrou463badf2012-06-23 13:29:19 +02001130
def test_sre_byte_class_literals(self):
    # Octal and \x escapes inside a character class of a bytes pattern;
    # the class must always match the single escaped byte.
    templates = (
        r"[\%o]", r"[\%o8]", r"[\%03o]", r"[\%03o0]", r"[\%03o8]",
        r"[\x%02x]", r"[\x%02x0]", r"[\x%02xz]",
    )
    for value in (0, 8, 16, 32, 64, 127, 128, 255):
        byte = bytes([value])
        for template in templates:
            pattern = (template % value).encode()
            self.assertTrue(re.match(pattern, byte))

    # \u and \U escapes are rejected in bytes patterns.
    for pattern in (br"[\u1234]", br"[\U00012345]"):
        with self.assertRaises(re.error):
            re.compile(pattern)

    # Malformed escapes; each error is expected at position 1.
    malformed = (
        (br"[\567]", r'octal escape value \567 outside of range 0-0o377'),
        (br"[\911]", r'bad escape \9'),
        (br"[\x1z]", r'incomplete escape \x1'),
    )
    for pattern, message in malformed:
        self.checkPatternError(pattern, message, 1)
1148
def test_character_set_errors(self):
    # Unterminated sets.  The last one, "[a-", is bug 545855: it used to
    # provoke a TypeError instead of a proper compile error.
    for pattern in (r'[', r'[^', r'[a', r"[a-"):
        self.checkPatternError(pattern, 'unterminated character set', 0)

    # Ranges with a class as an endpoint, or with reversed endpoints.
    bad_ranges = (
        (r"[\w-b]", r'bad character range \w-b'),
        (r"[a-\w]", r'bad character range a-\w'),
        (r"[b-a]", 'bad character range b-a'),
    )
    for pattern, message in bad_ranges:
        self.checkPatternError(pattern, message, 1)
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +00001159
Skip Montanaro7d9963f2003-04-25 14:12:40 +00001160 def test_bug_113254(self):
1161 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
1162 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
1163 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
1164
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001165 def test_bug_527371(self):
1166 # bug described in patches 527371/672491
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001167 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001168 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
1169 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
R David Murray44b548d2016-09-08 13:59:53 -04001170 self.assertEqual(re.match(r"(?P<a>a(b))", "ab").lastgroup, 'a')
1171 self.assertEqual(re.match(r"((a))", "a").lastindex, 1)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001172
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001173 def test_bug_418626(self):
1174 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
1175 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
1176 # pattern '*?' on a long string.
1177 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
1178 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
1179 20003)
1180 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001181 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +00001182 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001183 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001184
1185 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001186 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001187 self.assertEqual(re.compile(pat) and 1, 1)
1188
Skip Montanaro1e703c62003-04-25 15:40:28 +00001189 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001190 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +00001191 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001192 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
1193 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
1194 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +00001195
def test_nothing_to_repeat(self):
    # Every quantifier, greedy or lazy, with nothing in front of it:
    # first at the very start, then at the start of a non-capturing group.
    quantifiers = [reps + mod
                   for reps in ('*', '+', '?', '{1,2}')
                   for mod in ('', '?')]
    for quantifier in quantifiers:
        self.checkPatternError(quantifier, 'nothing to repeat', 0)
        self.checkPatternError('(?:' + quantifier + ')',
                               'nothing to repeat', 3)
1203
def test_multiple_repeat(self):
    # A quantifier applied directly to another quantifier; the error is
    # expected right after 'x' plus the inner operator.
    modifiers = ('', '?')
    outer_ops = [reps + mod
                 for reps in ('*', '+', '{1,2}')
                 for mod in modifiers]
    inner_ops = [reps + mod
                 for reps in ('*', '+', '?', '{1,2}')
                 for mod in modifiers]
    for outer_op in outer_ops:
        for inner_op in inner_ops:
            self.checkPatternError('x' + inner_op + outer_op,
                                   'multiple repeat', 1 + len(inner_op))
1213
Serhiy Storchakafa468162013-02-16 21:23:53 +02001214 def test_unlimited_zero_width_repeat(self):
1215 # Issue #9669
1216 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
1217 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
1218 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
1219 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
1220 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
1221 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
1222
Skip Montanaro1e703c62003-04-25 15:40:28 +00001223 def test_scanner(self):
1224 def s_ident(scanner, token): return token
1225 def s_operator(scanner, token): return "op%s" % token
1226 def s_float(scanner, token): return float(token)
1227 def s_int(scanner, token): return int(token)
1228
1229 scanner = Scanner([
1230 (r"[a-zA-Z_]\w*", s_ident),
1231 (r"\d+\.\d*", s_float),
1232 (r"\d+", s_int),
1233 (r"=|\+|-|\*|/", s_operator),
1234 (r"\s+", None),
1235 ])
1236
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001237 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +00001238
Skip Montanaro1e703c62003-04-25 15:40:28 +00001239 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
1240 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
1241 'op+', 'bar'], ''))
1242
Skip Montanaro5ba00542003-04-25 16:00:14 +00001243 def test_bug_448951(self):
1244 # bug 448951 (similar to 429357, but with single char match)
1245 # (Also test greedy matches.)
1246 for op in '','?','*':
1247 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
1248 (None, None))
1249 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
1250 ('a:', 'a'))
1251
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001252 def test_bug_725106(self):
1253 # capturing groups in alternatives in repeats
1254 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
1255 ('b', 'a'))
1256 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
1257 ('c', 'b'))
1258 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
1259 ('b', None))
1260 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
1261 ('b', None))
1262 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
1263 ('b', 'a'))
1264 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
1265 ('c', 'b'))
1266 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
1267 ('b', None))
1268 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
1269 ('b', None))
1270
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +00001271 def test_bug_725149(self):
1272 # mark_stack_base restoring before restoring marks
1273 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
1274 ('a', None))
1275 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
1276 ('a', None, None))
1277
Just van Rossum12723ba2003-07-02 20:03:04 +00001278 def test_bug_764548(self):
1279 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001280 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +00001281 pat = re.compile(my_unicode("abc"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001282 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +00001283
Skip Montanaro5ba00542003-04-25 16:00:14 +00001284 def test_finditer(self):
1285 iter = re.finditer(r":+", "a:b::c:::d")
1286 self.assertEqual([item.group(0) for item in iter],
1287 [":", "::", ":::"])
1288
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001289 pat = re.compile(r":+")
1290 iter = pat.finditer("a:b::c:::d", 1, 10)
1291 self.assertEqual([item.group(0) for item in iter],
1292 [":", "::", ":::"])
1293
1294 pat = re.compile(r":+")
1295 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
1296 self.assertEqual([item.group(0) for item in iter],
1297 [":", "::", ":::"])
1298
1299 pat = re.compile(r":+")
1300 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1301 self.assertEqual([item.group(0) for item in iter],
1302 [":", "::", ":::"])
1303
1304 pat = re.compile(r":+")
1305 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1306 self.assertEqual([item.group(0) for item in iter],
1307 ["::", "::"])
1308
Thomas Wouters40a088d2008-03-18 20:19:54 +00001309 def test_bug_926075(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001310 self.assertIsNot(re.compile('bug_926075'),
1311 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +00001312
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001313 def test_bug_931848(self):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001314 pattern = "[\u002E\u3002\uFF0E\uFF61]"
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001315 self.assertEqual(re.compile(pattern).split("a.b.c"),
1316 ['a','b','c'])
1317
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001318 def test_bug_581080(self):
1319 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +00001320 self.assertEqual(next(iter).span(), (1,2))
1321 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001322
1323 scanner = re.compile(r"\s").scanner("a b")
1324 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001325 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001326
1327 def test_bug_817234(self):
1328 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +00001329 self.assertEqual(next(iter).span(), (0, 4))
1330 self.assertEqual(next(iter).span(), (4, 4))
1331 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001332
Mark Dickinson1f268282009-07-28 17:22:36 +00001333 def test_bug_6561(self):
1334 # '\d' should match characters in Unicode category 'Nd'
1335 # (Number, Decimal Digit), but not those in 'Nl' (Number,
1336 # Letter) or 'No' (Number, Other).
1337 decimal_digits = [
1338 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1339 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1340 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1341 ]
1342 for x in decimal_digits:
R David Murray44b548d2016-09-08 13:59:53 -04001343 self.assertEqual(re.match(r'^\d$', x).group(0), x)
Mark Dickinson1f268282009-07-28 17:22:36 +00001344
1345 not_decimal_digits = [
1346 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1347 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1348 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1349 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1350 ]
1351 for x in not_decimal_digits:
R David Murray44b548d2016-09-08 13:59:53 -04001352 self.assertIsNone(re.match(r'^\d$', x))
Mark Dickinson1f268282009-07-28 17:22:36 +00001353
Guido van Rossumd8faa362007-04-27 19:54:29 +00001354 def test_empty_array(self):
1355 # SF buf 1647541
1356 import array
Guido van Rossum166746c2007-07-03 15:39:16 +00001357 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +00001358 a = array.array(typecode)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001359 self.assertIsNone(re.compile(b"bla").match(a))
Antoine Pitroufd036452008-08-19 17:56:33 +00001360 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001361
Christian Heimes072c0f12008-01-03 23:01:04 +00001362 def test_inline_flags(self):
1363 # Bug #1700
Serhiy Storchakaab140882014-11-11 21:13:28 +02001364 upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
1365 lower_char = '\u1ea1' # Latin Small Letter A with Dot Below
Christian Heimes072c0f12008-01-03 23:01:04 +00001366
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001367 p = re.compile('.' + upper_char, re.I | re.S)
1368 q = p.match('\n' + lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001369 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001370
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001371 p = re.compile('.' + lower_char, re.I | re.S)
1372 q = p.match('\n' + upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001373 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001374
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001375 p = re.compile('(?i).' + upper_char, re.S)
1376 q = p.match('\n' + lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001377 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001378
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001379 p = re.compile('(?i).' + lower_char, re.S)
1380 q = p.match('\n' + upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001381 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001382
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001383 p = re.compile('(?is).' + upper_char)
1384 q = p.match('\n' + lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001385 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001386
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001387 p = re.compile('(?is).' + lower_char)
1388 q = p.match('\n' + upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001389 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001390
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001391 p = re.compile('(?s)(?i).' + upper_char)
1392 q = p.match('\n' + lower_char)
1393 self.assertTrue(q)
1394
1395 p = re.compile('(?s)(?i).' + lower_char)
1396 q = p.match('\n' + upper_char)
1397 self.assertTrue(q)
1398
1399 self.assertTrue(re.match('(?ix) ' + upper_char, lower_char))
1400 self.assertTrue(re.match('(?ix) ' + lower_char, upper_char))
1401 self.assertTrue(re.match(' (?i) ' + upper_char, lower_char, re.X))
1402 self.assertTrue(re.match('(?x) (?i) ' + upper_char, lower_char))
1403 self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char, re.X))
Serhiy Storchakad65cd092016-09-11 01:39:01 +03001404
Serhiy Storchakaabf275a2016-09-17 01:29:58 +03001405 p = upper_char + '(?i)'
1406 with self.assertWarns(DeprecationWarning) as warns:
1407 self.assertTrue(re.match(p, lower_char))
1408 self.assertEqual(
1409 str(warns.warnings[0].message),
Roy Williams171b9a32017-06-09 22:01:16 -07001410 'Flags not at the start of the expression %r' % p
Serhiy Storchakaabf275a2016-09-17 01:29:58 +03001411 )
Serhiy Storchakac7ac7282017-05-16 15:16:15 +03001412 self.assertEqual(warns.warnings[0].filename, __file__)
Serhiy Storchakaabf275a2016-09-17 01:29:58 +03001413
1414 p = upper_char + '(?i)%s' % ('.?' * 100)
1415 with self.assertWarns(DeprecationWarning) as warns:
1416 self.assertTrue(re.match(p, lower_char))
1417 self.assertEqual(
1418 str(warns.warnings[0].message),
Roy Williams171b9a32017-06-09 22:01:16 -07001419 'Flags not at the start of the expression %r (truncated)' % p[:20]
Serhiy Storchakaabf275a2016-09-17 01:29:58 +03001420 )
Serhiy Storchakac7ac7282017-05-16 15:16:15 +03001421 self.assertEqual(warns.warnings[0].filename, __file__)
Serhiy Storchakabd48d272016-09-11 12:50:02 +03001422
Roy Williams171b9a32017-06-09 22:01:16 -07001423 # bpo-30605: Compiling a bytes instance regex was throwing a BytesWarning
1424 with warnings.catch_warnings():
1425 warnings.simplefilter('error', BytesWarning)
1426 p = b'A(?i)'
1427 with self.assertWarns(DeprecationWarning) as warns:
1428 self.assertTrue(re.match(p, b'a'))
1429 self.assertEqual(
1430 str(warns.warnings[0].message),
1431 'Flags not at the start of the expression %r' % p
1432 )
1433 self.assertEqual(warns.warnings[0].filename, __file__)
1434
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001435 with self.assertWarns(DeprecationWarning):
1436 self.assertTrue(re.match('(?s).(?i)' + upper_char, '\n' + lower_char))
1437 with self.assertWarns(DeprecationWarning):
1438 self.assertTrue(re.match('(?i) ' + upper_char + ' (?x)', lower_char))
1439 with self.assertWarns(DeprecationWarning):
1440 self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char))
1441 with self.assertWarns(DeprecationWarning):
1442 self.assertTrue(re.match('^(?i)' + upper_char, lower_char))
1443 with self.assertWarns(DeprecationWarning):
1444 self.assertTrue(re.match('$|(?i)' + upper_char, lower_char))
Serhiy Storchakac7ac7282017-05-16 15:16:15 +03001445 with self.assertWarns(DeprecationWarning) as warns:
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001446 self.assertTrue(re.match('(?:(?i)' + upper_char + ')', lower_char))
Serhiy Storchakac7ac7282017-05-16 15:16:15 +03001447 self.assertRegex(str(warns.warnings[0].message),
1448 'Flags not at the start')
1449 self.assertEqual(warns.warnings[0].filename, __file__)
1450 with self.assertWarns(DeprecationWarning) as warns:
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001451 self.assertTrue(re.fullmatch('(^)?(?(1)(?i)' + upper_char + ')',
1452 lower_char))
Serhiy Storchakac7ac7282017-05-16 15:16:15 +03001453 self.assertRegex(str(warns.warnings[0].message),
1454 'Flags not at the start')
1455 self.assertEqual(warns.warnings[0].filename, __file__)
1456 with self.assertWarns(DeprecationWarning) as warns:
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001457 self.assertTrue(re.fullmatch('($)?(?(1)|(?i)' + upper_char + ')',
1458 lower_char))
Serhiy Storchakac7ac7282017-05-16 15:16:15 +03001459 self.assertRegex(str(warns.warnings[0].message),
1460 'Flags not at the start')
1461 self.assertEqual(warns.warnings[0].filename, __file__)
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001462
1463
Christian Heimes25bb7832008-01-11 16:17:00 +00001464 def test_dollar_matches_twice(self):
1465 "$ matches the end of string, and just before the terminating \n"
1466 pattern = re.compile('$')
1467 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1468 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1469 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1470
1471 pattern = re.compile('$', re.MULTILINE)
1472 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1473 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1474 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1475
Antoine Pitroufd036452008-08-19 17:56:33 +00001476 def test_bytes_str_mixing(self):
1477 # Mixing str and bytes is disallowed
1478 pat = re.compile('.')
1479 bpat = re.compile(b'.')
1480 self.assertRaises(TypeError, pat.match, b'b')
1481 self.assertRaises(TypeError, bpat.match, 'b')
1482 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1483 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1484 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1485 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1486 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1487 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1488
1489 def test_ascii_and_unicode_flag(self):
1490 # String patterns
1491 for flags in (0, re.UNICODE):
1492 pat = re.compile('\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001493 self.assertTrue(pat.match('\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001494 pat = re.compile(r'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001495 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001496 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001497 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001498 pat = re.compile('(?a)\xc0', re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001499 self.assertIsNone(pat.match('\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001500 pat = re.compile(r'\w', re.ASCII)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001501 self.assertIsNone(pat.match('\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001502 pat = re.compile(r'(?a)\w')
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001503 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001504 # Bytes patterns
1505 for flags in (0, re.ASCII):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001506 pat = re.compile(b'\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001507 self.assertIsNone(pat.match(b'\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001508 pat = re.compile(br'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001509 self.assertIsNone(pat.match(b'\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001510 # Incompatibilities
R David Murray44b548d2016-09-08 13:59:53 -04001511 self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE)
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001512 self.assertRaises(re.error, re.compile, br'(?u)\w')
R David Murray44b548d2016-09-08 13:59:53 -04001513 self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII)
1514 self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII)
1515 self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE)
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001516 self.assertRaises(re.error, re.compile, r'(?au)\w')
Antoine Pitroufd036452008-08-19 17:56:33 +00001517
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001518 def test_locale_flag(self):
Victor Stinner279657b2019-03-05 16:17:43 +01001519 # On Windows, Python 3.7 doesn't call setlocale(LC_CTYPE, "") at
1520 # startup and so the LC_CTYPE locale uses Latin1 encoding by default,
1521 # whereas getpreferredencoding() returns the ANSI code page. Set
1522 # temporarily the LC_CTYPE locale to the user preferred encoding to
1523 # ensure that it uses the ANSI code page.
1524 oldloc = locale.setlocale(locale.LC_CTYPE, None)
1525 locale.setlocale(locale.LC_CTYPE, "")
1526 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldloc)
1527
1528 # Get the current locale encoding
1529 enc = locale.getpreferredencoding(False)
1530
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001531 # Search non-ASCII letter
1532 for i in range(128, 256):
1533 try:
1534 c = bytes([i]).decode(enc)
1535 sletter = c.lower()
1536 if sletter == c: continue
1537 bletter = sletter.encode(enc)
1538 if len(bletter) != 1: continue
1539 if bletter.decode(enc) != sletter: continue
1540 bpat = re.escape(bytes([i]))
1541 break
1542 except (UnicodeError, TypeError):
1543 pass
Benjamin Peterson1e687162017-03-01 21:53:00 -08001544 else:
1545 bletter = None
1546 bpat = b'A'
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001547 # Bytes patterns
1548 pat = re.compile(bpat, re.LOCALE | re.IGNORECASE)
1549 if bletter:
1550 self.assertTrue(pat.match(bletter))
1551 pat = re.compile(b'(?L)' + bpat, re.IGNORECASE)
1552 if bletter:
1553 self.assertTrue(pat.match(bletter))
1554 pat = re.compile(bpat, re.IGNORECASE)
1555 if bletter:
1556 self.assertIsNone(pat.match(bletter))
R David Murray44b548d2016-09-08 13:59:53 -04001557 pat = re.compile(br'\w', re.LOCALE)
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001558 if bletter:
1559 self.assertTrue(pat.match(bletter))
R David Murray44b548d2016-09-08 13:59:53 -04001560 pat = re.compile(br'(?L)\w')
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001561 if bletter:
1562 self.assertTrue(pat.match(bletter))
R David Murray44b548d2016-09-08 13:59:53 -04001563 pat = re.compile(br'\w')
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001564 if bletter:
1565 self.assertIsNone(pat.match(bletter))
1566 # Incompatibilities
Serhiy Storchaka9bd85b82016-06-11 19:15:00 +03001567 self.assertRaises(ValueError, re.compile, '', re.LOCALE)
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001568 self.assertRaises(re.error, re.compile, '(?L)')
Serhiy Storchaka9bd85b82016-06-11 19:15:00 +03001569 self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII)
1570 self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII)
1571 self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE)
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001572 self.assertRaises(re.error, re.compile, b'(?aL)')
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001573
def test_scoped_flags(self):
    def check(pattern, subject, flags, should_match):
        found = re.match(pattern, subject, flags)
        if should_match:
            self.assertTrue(found)
        else:
            self.assertIsNone(found)

    # IGNORECASE switched on or off for part of the pattern.
    check(r'(?i:a)b', 'Ab', 0, True)
    check(r'(?i:a)b', 'aB', 0, False)
    check(r'(?-i:a)b', 'Ab', re.IGNORECASE, False)
    check(r'(?-i:a)b', 'aB', re.IGNORECASE, True)
    check(r'(?i:(?-i:a)b)', 'Ab', 0, False)
    check(r'(?i:(?-i:a)b)', 'aB', 0, True)

    # VERBOSE switched on or off for part of the pattern.
    check(r'(?x: a) b', 'a b', 0, True)
    check(r'(?x: a) b', ' a b', 0, False)
    check(r'(?-x: a) b', ' ab', re.VERBOSE, True)
    check(r'(?-x: a) b', 'ab', re.VERBOSE, False)

    # ASCII / UNICODE scoped to a group.
    check(r'\w(?a:\W)\w', '\xe0\xe0\xe0', 0, True)
    check(r'(?a:\W(?u:\w)\W)', '\xe0\xe0\xe0', 0, True)
    check(r'\W(?u:\w)\W', '\xe0\xe0\xe0', re.ASCII, True)

    incompatible = "bad inline flags: flags 'a', 'u' and 'L' are incompatible"
    bad_patterns = (
        (r'(?a)(?-a:\w)',
         "bad inline flags: cannot turn off flags 'a', 'u' and 'L'", 8),
        (r'(?i-i:a)', 'bad inline flags: flag turned on and off', 5),
        (r'(?au:a)', incompatible, 4),
        (br'(?aL:a)', incompatible, 4),
        # Truncated or malformed flag groups.
        (r'(?-', 'missing flag', 3),
        (r'(?-+', 'missing flag', 3),
        (r'(?-z', 'unknown flag', 3),
        (r'(?-i', 'missing :', 4),
        (r'(?-i)', 'missing :', 4),
        (r'(?-i+', 'missing :', 4),
        (r'(?-iz', 'unknown flag', 4),
        (r'(?i:', 'missing ), unterminated subpattern', 0),
        (r'(?i', 'missing -, : or )', 3),
        (r'(?i+', 'missing -, : or )', 3),
        (r'(?iz', 'unknown flag', 3),
    )
    for pattern, message, position in bad_patterns:
        self.checkPatternError(pattern, message, position)
1611
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001612 def test_bug_6509(self):
1613 # Replacement strings of both types must parse properly.
1614 # all strings
R David Murray44b548d2016-09-08 13:59:53 -04001615 pat = re.compile(r'a(\w)')
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001616 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1617 pat = re.compile('a(.)')
1618 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1619 pat = re.compile('..')
1620 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1621
1622 # all bytes
R David Murray44b548d2016-09-08 13:59:53 -04001623 pat = re.compile(br'a(\w)')
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001624 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1625 pat = re.compile(b'a(.)')
1626 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1627 pat = re.compile(b'..')
1628 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1629
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001630 def test_dealloc(self):
1631 # issue 3299: check for segfault in debug build
1632 import _sre
Ezio Melotti0f77f462010-01-23 10:49:39 +00001633 # the overflow limit is different on wide and narrow builds and it
1634 # depends on the definition of SRE_CODE (see sre.h).
1635 # 2**128 should be big enough to overflow on both. For smaller values
1636 # a RuntimeError is raised instead of OverflowError.
1637 long_overflow = 2**128
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001638 self.assertRaises(TypeError, re.finditer, "a", {})
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001639 with self.assertRaises(OverflowError):
Victor Stinner726a57d2016-11-22 23:04:39 +01001640 _sre.compile("abc", 0, [long_overflow], 0, {}, ())
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001641 with self.assertRaises(TypeError):
1642 _sre.compile({}, 0, [], 0, [], [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 def test_search_dot_unicode(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001645 self.assertTrue(re.search("123.*-", '123abc-'))
1646 self.assertTrue(re.search("123.*-", '123\xe9-'))
1647 self.assertTrue(re.search("123.*-", '123\u20ac-'))
1648 self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1649 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650
Ezio Melottidf723e12012-03-13 01:29:48 +02001651 def test_compile(self):
1652 # Test return value when given string and pattern as parameter
1653 pattern = re.compile('random pattern')
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03001654 self.assertIsInstance(pattern, re.Pattern)
Ezio Melottidf723e12012-03-13 01:29:48 +02001655 same_pattern = re.compile(pattern)
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03001656 self.assertIsInstance(same_pattern, re.Pattern)
Ezio Melottidf723e12012-03-13 01:29:48 +02001657 self.assertIs(same_pattern, pattern)
1658 # Test behaviour when not given a string or pattern as parameter
1659 self.assertRaises(TypeError, re.compile, 0)
1660
Antoine Pitroub33941a2012-12-03 20:55:56 +01001661 @bigmemtest(size=_2G, memuse=1)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001662 def test_large_search(self, size):
1663 # Issue #10182: indices were 32-bit-truncated.
1664 s = 'a' * size
1665 m = re.search('$', s)
1666 self.assertIsNotNone(m)
Antoine Pitrou86067c22012-12-03 21:08:43 +01001667 self.assertEqual(m.start(), size)
1668 self.assertEqual(m.end(), size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001669
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001670 # The huge memuse is because of re.sub() using a list and a join()
1671 # to create the replacement result.
Antoine Pitroub33941a2012-12-03 20:55:56 +01001672 @bigmemtest(size=_2G, memuse=16 + 2)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001673 def test_large_subn(self, size):
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001674 # Issue #10182: indices were 32-bit-truncated.
1675 s = 'a' * size
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001676 r, n = re.subn('', '', s)
1677 self.assertEqual(r, s)
1678 self.assertEqual(n, size + 1)
1679
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001680 def test_bug_16688(self):
1681 # Issue 16688: Backreferences make case-insensitive regex fail on
1682 # non-ASCII strings.
1683 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1684 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001685
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001686 def test_repeat_minmax_overflow(self):
1687 # Issue #13169
1688 string = "x" * 100000
1689 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1690 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1691 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1692 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1693 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1694 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1695 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1696 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1697 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1698 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1699 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1700
1701 @cpython_only
1702 def test_repeat_minmax_overflow_maxrepeat(self):
1703 try:
1704 from _sre import MAXREPEAT
1705 except ImportError:
1706 self.skipTest('requires _sre.MAXREPEAT constant')
1707 string = "x" * 100000
1708 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1709 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1710 (0, 100000))
1711 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1712 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1713 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1714 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1715
def test_backref_group_name_in_exception(self):
    # Issue 17341: Poor error message when compiling invalid regex
    expected_message = "bad character in group name '<foo>'"
    self.checkPatternError('(?P=<foo>)', expected_message, 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001720
def test_group_name_in_exception(self):
    # Issue 17341: Poor error message when compiling invalid regex
    expected_message = "bad character in group name '?foo'"
    self.checkPatternError('(?P<?foo>)', expected_message, 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001725
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001726 def test_issue17998(self):
1727 for reps in '*', '+', '?', '{1}':
1728 for mod in '', '?':
1729 pattern = '.' + reps + mod + 'yz'
1730 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1731 ['xyz'], msg=pattern)
1732 pattern = pattern.encode()
1733 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1734 [b'xyz'], msg=pattern)
1735
def test_match_repr(self):
    # repr() of a match object is expected to show the match type's
    # module and qualified name, the span, and the matched text.
    def expected_repr(match, detail):
        # Build the expectation from the type of the match being checked.
        cls = type(match)
        return "<%s.%s object; %s>" % (
            cls.__module__, cls.__qualname__, detail)

    # str subjects, including a str subclass
    for subject in '[abracadabra]', S('[abracadabra]'):
        m = re.search(r'(.+)(.*?)\1', subject)
        self.assertEqual(
            repr(m),
            expected_repr(m, "span=(1, 12), match='abracadabra'"))

    # bytes-like subjects: the matched text is shown as a bytes literal
    for subject in (b'[abracadabra]', B(b'[abracadabra]'),
                    bytearray(b'[abracadabra]'),
                    memoryview(b'[abracadabra]')):
        m = re.search(br'(.+)(.*?)\1', subject)
        self.assertEqual(
            repr(m),
            expected_repr(m, "span=(1, 12), match=b'abracadabra'"))

    # Two matches from one finditer() call; each expectation is derived
    # from the match it describes (the original took the module name for
    # `first` from `second`).
    first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
    self.assertEqual(repr(first),
                     expected_repr(first, "span=(0, 2), match='aa'"))
    self.assertEqual(repr(second),
                     expected_repr(second, "span=(3, 5), match='bb'"))
1757
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001758 def test_zerowidth(self):
1759 # Issues 852532, 1647489, 3262, 25054.
1760 self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', ''])
Serhiy Storchakafbb490f2018-01-04 11:06:13 +02001761 self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', '', 'bc', ''])
1762 self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', '', 'bc'])
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001763 self.assertEqual(re.split(r"(?<=\w)(?!\w)|:+", "a::bc"), ['a', '', 'bc', ''])
1764
1765 self.assertEqual(re.sub(r"\b", "-", "a::bc"), '-a-::-bc-')
Serhiy Storchakafbb490f2018-01-04 11:06:13 +02001766 self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a---bc-')
1767 self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::][]bc[]')
Serhiy Storchaka70d56fb2017-12-04 14:29:05 +02001768
1769 self.assertEqual(re.findall(r"\b|:+", "a::bc"), ['', '', '::', '', ''])
1770 self.assertEqual(re.findall(r"\b|\w+", "a::bc"),
1771 ['', 'a', '', '', 'bc', ''])
1772
1773 self.assertEqual([m.span() for m in re.finditer(r"\b|:+", "a::bc")],
1774 [(0, 0), (1, 1), (1, 3), (3, 3), (5, 5)])
1775 self.assertEqual([m.span() for m in re.finditer(r"\b|\w+", "a::bc")],
1776 [(0, 0), (0, 1), (1, 1), (3, 3), (3, 5), (5, 5)])
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001777
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001778 def test_bug_2537(self):
1779 # issue 2537: empty submatches
1780 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1781 for inner_op in ('{0,}', '*', '?'):
1782 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1783 m = r.match("xyyzy")
1784 self.assertEqual(m.group(0), "xyy")
1785 self.assertEqual(m.group(1), "")
1786 self.assertEqual(m.group(2), "y")
1787
@cpython_only
def test_debug_flag(self):
    pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
    # Expected re.DEBUG output: the parse tree, a blank line, then the
    # compiled code listing.
    # NOTE(review): the column alignment inside this literal was rebuilt
    # assuming two-space nesting per level; diff it against real re.DEBUG
    # output before relying on it.
    expected_dump = '''\
SUBPATTERN 1 0 0
  LITERAL 46
BRANCH
  IN
    LITERAL 99
    LITERAL 104
OR
  LITERAL 112
  LITERAL 121
GROUPREF_EXISTS 1
  AT AT_END
ELSE
  LITERAL 58
  LITERAL 32

 0. INFO 8 0b1 2 5 (to 9)
      prefix_skip 0
      prefix [0x2e] ('.')
      overlap [0]
 9: MARK 0
11. LITERAL 0x2e ('.')
13. MARK 1
15. BRANCH 10 (to 26)
17.   IN 6 (to 24)
19.     LITERAL 0x63 ('c')
21.     LITERAL 0x68 ('h')
23.     FAILURE
24:   JUMP 9 (to 34)
26: branch 7 (to 33)
27.   LITERAL 0x70 ('p')
29.   LITERAL 0x79 ('y')
31.   JUMP 2 (to 34)
33: FAILURE
34: GROUPREF_EXISTS 0 6 (to 41)
37. AT END
39. JUMP 5 (to 45)
41: LITERAL 0x3a (':')
43. LITERAL 0x20 (' ')
45: SUCCESS
'''
    self.maxDiff = None
    # Compile twice: debug output must be printed again the second time
    # (bypassing the cache -- issue #20426).
    for _attempt in range(2):
        with captured_stdout() as out:
            re.compile(pat, re.DEBUG)
        self.assertEqual(out.getvalue(), expected_dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001841
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001842 def test_keyword_parameters(self):
1843 # Issue #20283: Accepting the string keyword parameter.
1844 pat = re.compile(r'(ab)')
1845 self.assertEqual(
1846 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1847 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001848 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1849 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001850 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1851 self.assertEqual(
1852 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1853 self.assertEqual(
1854 pat.split(string='abracadabra', maxsplit=1),
1855 ['', 'ab', 'racadabra'])
1856 self.assertEqual(
1857 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1858 (7, 9))
1859
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001860 def test_bug_20998(self):
1861 # Issue #20998: Fullmatch of repeated single character pattern
1862 # with ignore case.
1863 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1864
def test_locale_caching(self):
    # Issue #22410
    saved_ctype = locale.setlocale(locale.LC_CTYPE)
    self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)

    # Both locales must be installable, otherwise the test is skipped.
    for name in ('en_US.iso88591', 'en_US.utf8'):
        try:
            locale.setlocale(locale.LC_CTYPE, name)
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs %s locale' % name)

    # Run the two locale checks in both orders, starting each pass from an
    # empty pattern cache.
    orders = (
        (self.check_en_US_iso88591, self.check_en_US_utf8),
        (self.check_en_US_utf8, self.check_en_US_iso88591),
    )
    for checks in orders:
        re.purge()
        for check in checks:
            check()
1882
def check_en_US_iso88591(self):
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    # (pattern, subject) pairs; under this locale b'\xc5' and b'\xe5' are
    # expected to match each other case-insensitively.
    pairs = (
        (b'\xc5\xe5', b'\xc5\xe5'),
        (b'\xc5', b'\xe5'),
        (b'\xe5', b'\xc5'),
    )
    # Flags passed as an argument ...
    for pattern, subject in pairs:
        self.assertTrue(re.match(pattern, subject, re.L|re.I))
    # ... and the same flags given inline.
    for pattern, subject in pairs:
        self.assertTrue(re.match(b'(?Li)' + pattern, subject))
1891
def check_en_US_utf8(self):
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    identical = (b'\xc5\xe5', b'\xc5\xe5')
    # Under this locale b'\xc5' and b'\xe5' are expected NOT to match
    # each other case-insensitively.
    swapped = ((b'\xc5', b'\xe5'), (b'\xe5', b'\xc5'))

    # Flags passed as an argument
    self.assertTrue(re.match(identical[0], identical[1], re.L|re.I))
    for pattern, subject in swapped:
        self.assertIsNone(re.match(pattern, subject, re.L|re.I))

    # Same flags given inline
    self.assertTrue(re.match(b'(?Li)' + identical[0], identical[1]))
    for pattern, subject in swapped:
        self.assertIsNone(re.match(b'(?Li)' + pattern, subject))
1900
Serhiy Storchaka898ff032017-05-05 08:53:40 +03001901 def test_locale_compiled(self):
1902 oldlocale = locale.setlocale(locale.LC_CTYPE)
1903 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1904 for loc in 'en_US.iso88591', 'en_US.utf8':
1905 try:
1906 locale.setlocale(locale.LC_CTYPE, loc)
1907 except locale.Error:
1908 # Unsupported locale on this system
1909 self.skipTest('test needs %s locale' % loc)
1910
1911 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
1912 p1 = re.compile(b'\xc5\xe5', re.L|re.I)
1913 p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
1914 p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
1915 p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
1916 for p in p1, p2, p3:
1917 self.assertTrue(p.match(b'\xc5\xe5'))
1918 self.assertTrue(p.match(b'\xe5\xe5'))
1919 self.assertTrue(p.match(b'\xc5\xc5'))
1920 self.assertIsNone(p4.match(b'\xe5\xc5'))
1921 self.assertIsNone(p4.match(b'\xe5\xe5'))
1922 self.assertIsNone(p4.match(b'\xc5\xc5'))
1923
1924 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
1925 for p in p1, p2, p3:
1926 self.assertTrue(p.match(b'\xc5\xe5'))
1927 self.assertIsNone(p.match(b'\xe5\xe5'))
1928 self.assertIsNone(p.match(b'\xc5\xc5'))
1929 self.assertTrue(p4.match(b'\xe5\xc5'))
1930 self.assertIsNone(p4.match(b'\xe5\xe5'))
1931 self.assertIsNone(p4.match(b'\xc5\xc5'))
1932
Serhiy Storchakaad446d52014-11-10 13:49:00 +02001933 def test_error(self):
1934 with self.assertRaises(re.error) as cm:
1935 re.compile('(\u20ac))')
1936 err = cm.exception
1937 self.assertIsInstance(err.pattern, str)
1938 self.assertEqual(err.pattern, '(\u20ac))')
1939 self.assertEqual(err.pos, 3)
1940 self.assertEqual(err.lineno, 1)
1941 self.assertEqual(err.colno, 4)
1942 self.assertIn(err.msg, str(err))
1943 self.assertIn(' at position 3', str(err))
1944 self.assertNotIn(' at position 3', err.msg)
1945 # Bytes pattern
1946 with self.assertRaises(re.error) as cm:
1947 re.compile(b'(\xa4))')
1948 err = cm.exception
1949 self.assertIsInstance(err.pattern, bytes)
1950 self.assertEqual(err.pattern, b'(\xa4))')
1951 self.assertEqual(err.pos, 3)
1952 # Multiline pattern
1953 with self.assertRaises(re.error) as cm:
1954 re.compile("""
1955 (
1956 abc
1957 )
1958 )
1959 (
1960 """, re.VERBOSE)
1961 err = cm.exception
1962 self.assertEqual(err.pos, 77)
1963 self.assertEqual(err.lineno, 5)
1964 self.assertEqual(err.colno, 17)
1965 self.assertIn(err.msg, str(err))
1966 self.assertIn(' at position 77', str(err))
1967 self.assertIn('(line 5, column 17)', str(err))
1968
def test_misc_errors(self):
    # (pattern, expected message, expected position), checked in order
    cases = (
        (r'(', 'missing ), unterminated subpattern', 0),
        (r'((a|b)', 'missing ), unterminated subpattern', 0),
        (r'(a|b))', 'unbalanced parenthesis', 5),
        (r'(?P', 'unexpected end of pattern', 3),
        (r'(?z)', 'unknown extension ?z', 1),
        (r'(?iz)', 'unknown flag', 3),
        (r'(?i', 'missing -, : or )', 3),
        (r'(?#abc', 'missing ), unterminated comment', 0),
        (r'(?<', 'unexpected end of pattern', 3),
        (r'(?<>)', 'unknown extension ?<>', 1),
        (r'(?', 'unexpected end of pattern', 2),
    )
    for pattern, message, position in cases:
        self.checkPatternError(pattern, message, position)
1981
Victor Stinner8bf43e62016-11-14 12:38:43 +01001982 def test_enum(self):
1983 # Issue #28082: Check that str(flag) returns a human readable string
1984 # instead of an integer
1985 self.assertIn('ASCII', str(re.A))
1986 self.assertIn('DOTALL', str(re.S))
1987
Victor Stinnerb44fb122016-11-21 16:35:08 +01001988 def test_pattern_compare(self):
1989 pattern1 = re.compile('abc', re.IGNORECASE)
1990
Victor Stinnerbcf4dcc2016-11-22 15:30:38 +01001991 # equal to itself
1992 self.assertEqual(pattern1, pattern1)
1993 self.assertFalse(pattern1 != pattern1)
1994
Victor Stinnerb44fb122016-11-21 16:35:08 +01001995 # equal
1996 re.purge()
1997 pattern2 = re.compile('abc', re.IGNORECASE)
1998 self.assertEqual(hash(pattern2), hash(pattern1))
1999 self.assertEqual(pattern2, pattern1)
2000
2001 # not equal: different pattern
2002 re.purge()
2003 pattern3 = re.compile('XYZ', re.IGNORECASE)
2004 # Don't test hash(pattern3) != hash(pattern1) because there is no
2005 # warranty that hash values are different
2006 self.assertNotEqual(pattern3, pattern1)
2007
2008 # not equal: different flag (flags=0)
2009 re.purge()
2010 pattern4 = re.compile('abc')
2011 self.assertNotEqual(pattern4, pattern1)
2012
2013 # only == and != comparison operators are supported
2014 with self.assertRaises(TypeError):
2015 pattern1 < pattern2
2016
2017 def test_pattern_compare_bytes(self):
2018 pattern1 = re.compile(b'abc')
2019
2020 # equal: test bytes patterns
2021 re.purge()
2022 pattern2 = re.compile(b'abc')
2023 self.assertEqual(hash(pattern2), hash(pattern1))
2024 self.assertEqual(pattern2, pattern1)
2025
2026 # not equal: pattern of a different types (str vs bytes),
2027 # comparison must not raise a BytesWarning
2028 re.purge()
2029 pattern3 = re.compile('abc')
2030 with warnings.catch_warnings():
2031 warnings.simplefilter('error', BytesWarning)
2032 self.assertNotEqual(pattern3, pattern1)
2033
Serhiy Storchaka7e10dbb2017-02-04 22:53:57 +02002034 def test_bug_29444(self):
2035 s = bytearray(b'abcdefgh')
2036 m = re.search(b'[a-h]+', s)
2037 m2 = re.search(b'[e-h]+', s)
2038 self.assertEqual(m.group(), b'abcdefgh')
2039 self.assertEqual(m2.group(), b'efgh')
2040 s[:] = b'xyz'
2041 self.assertEqual(m.group(), b'xyz')
2042 self.assertEqual(m2.group(), b'')
2043
Miss Islington (bot)0e379d42019-02-18 05:48:23 -08002044 def test_bug_34294(self):
2045 # Issue 34294: wrong capturing groups
2046
2047 # exists since Python 2
2048 s = "a\tx"
2049 p = r"\b(?=(\t)|(x))x"
2050 self.assertEqual(re.search(p, s).groups(), (None, 'x'))
2051
2052 # introduced in Python 3.7.0
2053 s = "ab"
2054 p = r"(?=(.)(.)?)"
2055 self.assertEqual(re.findall(p, s),
2056 [('a', 'b'), ('b', '')])
2057 self.assertEqual([m.groups() for m in re.finditer(p, s)],
2058 [('a', 'b'), ('b', None)])
2059
2060 # test-cases provided by issue34294, introduced in Python 3.7.0
2061 p = r"(?=<(?P<tag>\w+)/?>(?:(?P<text>.+?)</(?P=tag)>)?)"
2062 s = "<test><foo2/></test>"
2063 self.assertEqual(re.findall(p, s),
2064 [('test', '<foo2/>'), ('foo2', '')])
2065 self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
2066 [{'tag': 'test', 'text': '<foo2/>'},
2067 {'tag': 'foo2', 'text': None}])
2068 s = "<test>Hello</test><foo/>"
2069 self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
2070 [{'tag': 'test', 'text': 'Hello'},
2071 {'tag': 'foo', 'text': None}])
2072 s = "<test>Hello</test><foo/><foo/>"
2073 self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
2074 [{'tag': 'test', 'text': 'Hello'},
2075 {'tag': 'foo', 'text': None},
2076 {'tag': 'foo', 'text': None}])
2077
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02002078
class PatternReprTests(unittest.TestCase):
    # Checks on the repr() of compiled pattern objects.

    def check(self, pattern, expected):
        # Compile without explicit flags and compare the repr.
        compiled = re.compile(pattern)
        self.assertEqual(repr(compiled), expected)

    def check_flags(self, pattern, flags, expected):
        # Compile with explicit flags and compare the repr.
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        self.check('random pattern', "re.compile('random pattern')")

    def test_single_flag(self):
        expected = "re.compile('random pattern', re.IGNORECASE)"
        self.check_flags('random pattern', re.IGNORECASE, expected)

    def test_multiple_flags(self):
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.check_flags('random pattern', re.I|re.S|re.X, expected)

    def test_unicode_flag(self):
        # re.U is expected to be left out of the repr of a str pattern.
        self.check_flags('random pattern', re.U,
                         "re.compile('random pattern')")
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL)")
        self.check_flags('random pattern', re.I|re.S|re.U, expected)

    def test_inline_flags(self):
        expected = "re.compile('(?i)pattern', re.IGNORECASE)"
        self.check('(?i)pattern', expected)

    def test_unknown_flags(self):
        # Bits without a flag name are expected to be shown in hex.
        self.check_flags('random pattern', 0x123000,
                         "re.compile('random pattern', 0x123000)")
        self.check_flags('random pattern', 0x123000|re.I,
                         "re.compile('random pattern', re.IGNORECASE|0x123000)")

    def test_bytes(self):
        self.check(b'bytes pattern', "re.compile(b'bytes pattern')")
        self.check_flags(b'bytes pattern', re.A,
                         "re.compile(b'bytes pattern', re.ASCII)")

    def test_locale(self):
        expected = "re.compile(b'bytes pattern', re.LOCALE)"
        self.check_flags(b'bytes pattern', re.L, expected)

    def test_quotes(self):
        # (pattern, expected repr) for each quoting situation
        cases = (
            ('random "double quoted" pattern',
             '''re.compile('random "double quoted" pattern')'''),
            ("random 'single quoted' pattern",
             '''re.compile("random 'single quoted' pattern")'''),
            ('''both 'single' and "double" quotes''',
             '''re.compile('both \\'single\\' and "double" quotes')'''),
        )
        for pattern, expected in cases:
            self.check(pattern, expected)

    def test_long_pattern(self):
        pattern = 'Very %spattern' % ('long ' * 1000)
        head = "re.compile('Very long long lon"

        # The repr of a very long pattern must stay short.
        plain = repr(re.compile(pattern))
        self.assertLess(len(plain), 300)
        self.assertEqual(plain[:30], head)

        # With a flag, the flag must still be visible at the end.
        flagged = repr(re.compile(pattern, re.I))
        self.assertLess(len(flagged), 300)
        self.assertEqual(flagged[:30], head)
        self.assertEqual(flagged[-16:], ", re.IGNORECASE)")
2143
2144
class ImplementationTest(unittest.TestCase):
    """
    Checks that exercise implementation details of the re module.
    """

    def test_overlap_table(self):
        build = sre_compile._generate_overlap_table
        # (prefix, expected overlap table)
        cases = (
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        )
        for prefix, table in cases:
            self.assertEqual(build(prefix), table)
2158
2159
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002160class ExternalTests(unittest.TestCase):
Guido van Rossum8e0ce301997-07-11 19:34:44 +00002161
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002162 def test_re_benchmarks(self):
2163 're_tests benchmarks'
2164 from test.re_tests import benchmarks
2165 for pattern, s in benchmarks:
2166 with self.subTest(pattern=pattern, string=s):
2167 p = re.compile(pattern)
2168 self.assertTrue(p.search(s))
2169 self.assertTrue(p.match(s))
2170 self.assertTrue(p.fullmatch(s))
2171 s2 = ' '*10000 + s + ' '*10000
2172 self.assertTrue(p.search(s2))
2173 self.assertTrue(p.match(s2, 10000))
2174 self.assertTrue(p.match(s2, 10000, 10000 + len(s)))
2175 self.assertTrue(p.fullmatch(s2, 10000, 10000 + len(s)))
Skip Montanaro8ed06da2003-04-24 19:43:18 +00002176
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002177 def test_re_tests(self):
2178 're_tests test suite'
2179 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
2180 for t in tests:
2181 pattern = s = outcome = repl = expected = None
2182 if len(t) == 5:
2183 pattern, s, outcome, repl, expected = t
2184 elif len(t) == 3:
2185 pattern, s, outcome = t
Guido van Rossum41360a41998-03-26 19:42:58 +00002186 else:
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002187 raise ValueError('Test tuples should have 3 or 5 fields', t)
2188
2189 with self.subTest(pattern=pattern, string=s):
2190 if outcome == SYNTAX_ERROR: # Expected a syntax error
2191 with self.assertRaises(re.error):
2192 re.compile(pattern)
2193 continue
2194
2195 obj = re.compile(pattern)
Skip Montanaro8ed06da2003-04-24 19:43:18 +00002196 result = obj.search(s)
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002197 if outcome == FAIL:
2198 self.assertIsNone(result, 'Succeeded incorrectly')
2199 continue
2200
2201 with self.subTest():
2202 self.assertTrue(result, 'Failed incorrectly')
Skip Montanaro8ed06da2003-04-24 19:43:18 +00002203 # Matched, as expected, so now we compute the
2204 # result string and compare it to our expected result.
2205 start, end = result.span(0)
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002206 vardict = {'found': result.group(0),
2207 'groups': result.group(),
2208 'flags': result.re.flags}
Skip Montanaro8ed06da2003-04-24 19:43:18 +00002209 for i in range(1, 100):
2210 try:
2211 gi = result.group(i)
2212 # Special hack because else the string concat fails:
2213 if gi is None:
2214 gi = "None"
2215 except IndexError:
2216 gi = "Error"
2217 vardict['g%d' % i] = gi
2218 for i in result.re.groupindex.keys():
2219 try:
2220 gi = result.group(i)
2221 if gi is None:
2222 gi = "None"
2223 except IndexError:
2224 gi = "Error"
2225 vardict[i] = gi
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002226 self.assertEqual(eval(repl, vardict), expected,
2227 'grouping error')
Skip Montanaro8ed06da2003-04-24 19:43:18 +00002228
Antoine Pitrou22628c42008-07-22 17:53:22 +00002229 # Try the match with both pattern and string converted to
2230 # bytes, and check that it still succeeds.
Skip Montanaro8ed06da2003-04-24 19:43:18 +00002231 try:
Antoine Pitrou22628c42008-07-22 17:53:22 +00002232 bpat = bytes(pattern, "ascii")
2233 bs = bytes(s, "ascii")
2234 except UnicodeEncodeError:
2235 # skip non-ascii tests
2236 pass
2237 else:
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002238 with self.subTest('bytes pattern match'):
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02002239 obj = re.compile(bpat)
2240 self.assertTrue(obj.search(bs))
2241
2242 # Try the match with LOCALE enabled, and check that it
2243 # still succeeds.
2244 with self.subTest('locale-sensitive match'):
2245 obj = re.compile(bpat, re.LOCALE)
2246 result = obj.search(bs)
2247 if result is None:
2248 print('=== Fails on locale-sensitive match', t)
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00002249
Skip Montanaro8ed06da2003-04-24 19:43:18 +00002250 # Try the match with the search area limited to the extent
2251 # of the match and see if it still succeeds. \B will
2252 # break (because it won't match at the end or start of a
2253 # string), so we'll ignore patterns that feature it.
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002254 if (pattern[:2] != r'\B' and pattern[-2:] != r'\B'
2255 and result is not None):
2256 with self.subTest('range-limited match'):
2257 obj = re.compile(pattern)
2258 self.assertTrue(obj.search(s, start, end + 1))
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00002259
Skip Montanaro8ed06da2003-04-24 19:43:18 +00002260 # Try the match with IGNORECASE enabled, and check that it
2261 # still succeeds.
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002262 with self.subTest('case-insensitive match'):
2263 obj = re.compile(pattern, re.IGNORECASE)
2264 self.assertTrue(obj.search(s))
Guido van Rossumdfa67901997-12-08 17:12:06 +00002265
Skip Montanaro8ed06da2003-04-24 19:43:18 +00002266 # Try the match with UNICODE locale enabled, and check
2267 # that it still succeeds.
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002268 with self.subTest('unicode-sensitive match'):
2269 obj = re.compile(pattern, re.UNICODE)
2270 self.assertTrue(obj.search(s))
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00002271
Gregory P. Smith5a631832010-07-27 05:31:29 +00002272
if __name__ == "__main__":
    # Executed as a script: hand control to unittest's command-line runner.
    unittest.main()