blob: ee87446b7924a92f8a306cd6cd66831ba0a6b3c9 [file] [log] [blame]
Victor Stinnerd6debb22017-03-27 16:05:26 +02001from test.support import (gc_collect, bigmemtest, _2G,
2 cpython_only, captured_stdout)
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02003import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00004import re
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02005import sre_compile
Ezio Melottid2114eb2011-03-25 14:08:44 +02006import string
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02007import unittest
Victor Stinnerb44fb122016-11-21 16:35:08 +01008import warnings
9from re import Scanner
Raymond Hettinger027bb632004-05-31 03:09:25 +000010from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000011
# Misc tests from Tim Peters' re.doc
13
# WARNING: Don't change details in these tests if you don't know
# what you're doing. Some of these tests were carefully modeled to
# cover most of the code.
17
class S(str):
    """A str subclass whose subscript results are wrapped back into S."""

    def __getitem__(self, index):
        # Delegate to the plain str lookup, then re-wrap so the subclass
        # type survives indexing and slicing.
        piece = str.__getitem__(self, index)
        return S(piece)
21
class B(bytes):
    """A bytes subclass whose subscript results are wrapped back into B."""

    def __getitem__(self, index):
        # Whatever bytes.__getitem__ returns is handed to B() unchanged.
        piece = bytes.__getitem__(self, index)
        return B(piece)
25
Skip Montanaro8ed06da2003-04-24 19:43:18 +000026class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000027
def assertTypedEqual(self, actual, expect, msg=None):
    """Assert that *actual* equals *expect* and has the same types.

    After the plain equality check, tuples and lists in *expect* are
    walked element by element (paired with *actual* via zip) and every
    non-container leaf must have exactly the same type as its
    counterpart.  *msg* is forwarded to the underlying assertions.
    """
    self.assertEqual(actual, expect, msg)

    def check_types(got, wanted):
        if not isinstance(wanted, (tuple, list)):
            self.assertIs(type(got), type(wanted), msg)
            return
        for got_item, wanted_item in zip(got, wanted):
            check_types(got_item, wanted_item)

    check_types(actual, expect)
37
def checkPatternError(self, pattern, errmsg, pos=None):
    """Assert that compiling *pattern* raises re.error with *errmsg*.

    The error's ``msg`` must equal *errmsg*; when *pos* is not None the
    error's ``pos`` must equal it as well.  The message and position
    checks run inside a subTest labelled with the pattern.
    """
    with self.assertRaises(re.error) as caught:
        re.compile(pattern)
    failure = caught.exception
    with self.subTest(pattern=pattern):
        self.assertEqual(failure.msg, errmsg)
        if pos is None:
            return
        self.assertEqual(failure.pos, pos)
46
def checkTemplateError(self, pattern, repl, string, errmsg, pos=None):
    """Assert that re.sub(pattern, repl, string) raises re.error.

    The error's ``msg`` must equal *errmsg*; when *pos* is not None the
    error's ``pos`` must equal it as well.  The message and position
    checks run inside a subTest labelled with the pattern and template.
    """
    with self.assertRaises(re.error) as caught:
        re.sub(pattern, repl, string)
    failure = caught.exception
    with self.subTest(pattern=pattern, repl=repl):
        self.assertEqual(failure.msg, errmsg)
        if pos is None:
            return
        self.assertEqual(failure.pos, pos)
55
def test_keep_buffer(self):
    # See bug 14212
    # While the finditer() iterator exists, resizing the bytearray it
    # scans must fail with BufferError; once the iterator has been
    # deleted and garbage collected the same resize must succeed.
    padding = b'x'*400
    buf = bytearray(b'x')
    matches = re.finditer(b'a', buf)
    self.assertRaises(BufferError, buf.extend, padding)
    list(matches)
    del matches
    gc_collect()
    buf.extend(padding)
66
Raymond Hettinger027bb632004-05-31 03:09:25 +000067 def test_weakref(self):
68 s = 'QabbbcR'
69 x = re.compile('ab+c')
70 y = proxy(x)
71 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
72
Skip Montanaro8ed06da2003-04-24 19:43:18 +000073 def test_search_star_plus(self):
74 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
75 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
76 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
77 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030078 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000079 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
80 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
81 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
82 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030083 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000084
def bump_num(self, matchobj):
    """Return the whole match, read as an int, plus one, as a string."""
    return str(int(matchobj.group(0)) + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000088
def test_basic_re_sub(self):
    # The result is an exact str for str-like subjects and an exact
    # bytes for bytes-like subjects.
    typed_cases = [
        ('y', 'a', 'xyz', 'xaz'),
        ('y', S('a'), S('xyz'), 'xaz'),
        (b'y', b'a', b'xyz', b'xaz'),
        (b'y', B(b'a'), B(b'xyz'), b'xaz'),
        (b'y', bytearray(b'a'), bytearray(b'xyz'), b'xaz'),
        (b'y', memoryview(b'a'), memoryview(b'xyz'), b'xaz'),
    ]
    for pattern, repl, subject, expected in typed_cases:
        self.assertTypedEqual(re.sub(pattern, repl, subject), expected)
    for y in ("\xe0", "\u0430", "\U0001d49c"):
        subject = 'x%sz' % y
        self.assertEqual(re.sub(y, 'a', subject), 'xaz')

    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')

    # Callable replacement; the count limit is accepted both
    # positionally and by keyword.
    digits = r'\d+'
    numbers = '08.2 -2 23x99y'
    self.assertEqual(re.sub(digits, self.bump_num, numbers),
                     '9.3 -3 24x100y')
    self.assertEqual(re.sub(digits, self.bump_num, numbers, 3),
                     '9.3 -3 23x99y')
    self.assertEqual(re.sub(digits, self.bump_num, numbers, count=3),
                     '9.3 -3 23x99y')

    # A callable's return value is inserted as is, while a template
    # string has its escapes processed.
    def raw_backslash_n(m):
        return r"\n"
    self.assertEqual(re.sub('.', raw_backslash_n, 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    template = r"\1\1"
    self.assertEqual(re.sub('(.)', template, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', re.escape(template), 'x'), template)
    self.assertEqual(re.sub('(.)', lambda m: template, 'x'), template)

    # Named and numbered references to the same group.
    for pattern, repl in [
        ('(?P<a>x)', r'\g<a>\g<a>'),
        ('(?P<a>x)', r'\g<a>\g<1>'),
        ('(?P<unk>x)', r'\g<unk>\g<unk>'),
        ('(?P<unk>x)', r'\g<1>\g<1>'),
    ]:
        self.assertEqual(re.sub(pattern, repl, 'xx'), 'xxxx')

    # Control-character escapes, written as template escapes and as
    # literal characters.
    control = '\t\n\v\r\f\a\b'
    self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), control)
    self.assertEqual(re.sub('a', control, 'a'), control)
    self.assertEqual(re.sub('a', control, 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))

    # Every other backslash + ASCII letter in a template is an error.
    for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
        bad_escape = '\\' + c
        with self.subTest(c), self.assertRaises(re.error):
            self.assertEqual(re.sub('a', bad_escape, 'a'), bad_escape)

    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000130
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000131 def test_bug_449964(self):
132 # fails for group followed by other escape
R David Murray44b548d2016-09-08 13:59:53 -0400133 self.assertEqual(re.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx'),
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000134 'xx\bxx\b')
135
136 def test_bug_449000(self):
137 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000138 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
139 'abc\ndef\n')
140 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
141 'abc\ndef\n')
142 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
143 'abc\ndef\n')
144 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
145 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000146
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000147 def test_bug_1661(self):
148 # Verify that flags do not get silently ignored with compiled patterns
149 pattern = re.compile('.')
150 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
151 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
152 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
153 self.assertRaises(ValueError, re.compile, pattern, re.I)
154
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000155 def test_bug_3629(self):
156 # A regex that triggered a bug in the sre-code validator
157 re.compile("(?P<quote>)(?(quote))")
158
def test_sub_template_numeric_escape(self):
    # bug 776311 and friends
    # Templates that must be read as octal escapes (possibly followed by
    # literal characters), paired with the expected replacement.
    octal_cases = [
        (r'\0', '\0'),
        (r'\000', '\000'),
        (r'\001', '\001'),
        (r'\008', '\0' + '8'),
        (r'\009', '\0' + '9'),
        (r'\111', '\111'),
        (r'\117', '\117'),
        (r'\377', '\377'),

        (r'\1111', '\1111'),
        (r'\1111', '\111' + '1'),

        (r'\00', '\x00'),
        (r'\07', '\x07'),
        (r'\08', '\0' + '8'),
        (r'\09', '\0' + '9'),
        (r'\0a', '\0' + 'a'),
    ]
    for template, expected in octal_cases:
        self.assertEqual(re.sub('x', template, 'x'), expected)

    # Octal values above 0o377 are rejected.
    out_of_range = [
        (r'\400', r'octal escape value \400 outside of '
                  r'range 0-0o377'),
        (r'\777', r'octal escape value \777 outside of '
                  r'range 0-0o377'),
    ]
    for template, message in out_of_range:
        self.checkTemplateError('x', template, 'x', message, 0)

    # The pattern has no groups, so each of these is an invalid group
    # reference reported at position 1.
    bad_references = [
        (r'\1', 'x', 'invalid group reference 1'),
        (r'\8', 'x', 'invalid group reference 8'),
        (r'\9', 'x', 'invalid group reference 9'),
        (r'\11', 'x', 'invalid group reference 11'),
        (r'\18', 'x', 'invalid group reference 18'),
        (r'\1a', 'x', 'invalid group reference 1'),
        (r'\90', 'x', 'invalid group reference 90'),
        (r'\99', 'x', 'invalid group reference 99'),
        (r'\118', 'x', 'invalid group reference 11'),
        (r'\11a', 'x', 'invalid group reference 11'),
        (r'\181', 'x', 'invalid group reference 18'),
        (r'\800', 'x', 'invalid group reference 80'),
        (r'\8', '', 'invalid group reference 8'),
    ]
    for template, subject, message in bad_references:
        self.checkTemplateError('x', template, subject, message, 1)

    # in python2.3 (etc), these loop endlessly in sre_parser.py
    eleven_groups = '(((((((((((x)))))))))))'
    ten_groups_then_one = '((((((((((y))))))))))(.)'
    self.assertEqual(re.sub(eleven_groups, r'\11', 'x'), 'x')
    self.assertEqual(re.sub(ten_groups_then_one, r'\118', 'xyz'), 'xz8')
    self.assertEqual(re.sub(ten_groups_then_one, r'\11a', 'xyz'), 'xza')
206
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000207 def test_qualified_re_sub(self):
208 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
Serhiy Storchakab02f8fc2016-09-25 20:36:23 +0300209 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Victor Stinner55e614a2014-10-29 16:58:59 +0100210 self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000211
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000212 def test_bug_114660(self):
213 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
214 'hello there')
215
216 def test_bug_462270(self):
217 # Test for empty sub() behaviour, see SF bug #462270
218 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
219 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
220
def test_symbolic_groups(self):
    # Well-formed named groups, named back-references and conditionals.
    for valid in (r'(?P<a>x)(?P=a)(?(a)y)',
                  r'(?P<a1>x)(?P=a1)(?(a1)y)',
                  r'(?P<a1>x)\1(?(1)y)'):
        re.compile(valid)

    # (pattern, message[, position]) for malformed group syntax.
    malformed = [
        (r'(?P<a>)(?P<a>)',
         "redefinition of group name 'a' as group 2; "
         "was group 1"),
        (r'(?P<a>(?P=a))', "cannot refer to an open group", 10),
        (r'(?Pxy)', 'unknown extension ?Px'),
        (r'(?P<a>)(?P=a', 'missing ), unterminated name', 11),
        (r'(?P=', 'missing group name', 4),
        (r'(?P=)', 'missing group name', 4),
        (r'(?P=1)', "bad character in group name '1'", 4),
        (r'(?P=a)', "unknown group name 'a'"),
        (r'(?P=a1)', "unknown group name 'a1'"),
        (r'(?P=a.)', "bad character in group name 'a.'", 4),
        (r'(?P<)', 'missing >, unterminated name', 4),
        (r'(?P<a', 'missing >, unterminated name', 4),
        (r'(?P<', 'missing group name', 4),
        (r'(?P<>)', 'missing group name', 4),
        (r'(?P<1>)', "bad character in group name '1'", 4),
        (r'(?P<a.>)', "bad character in group name 'a.'", 4),
        (r'(?(', 'missing group name', 3),
        (r'(?())', 'missing group name', 3),
        (r'(?(a))', "unknown group name 'a'", 3),
        (r'(?(-1))', "bad character in group name '-1'", 3),
        (r'(?(1a))', "bad character in group name '1a'", 3),
        (r'(?(a.))', "bad character in group name 'a.'", 3),
    ]
    for args in malformed:
        self.checkPatternError(*args)

    # New valid/invalid identifiers in Python 3
    for valid in ('(?P<µ>x)(?P=µ)(?(µ)y)',
                  '(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)'):
        re.compile(valid)
    self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)

    # Support > 100 groups.
    alternatives = ['x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)]
    pat = '(?:%s)(?(200)z|t)' % '|'.join(alternatives)
    self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200258
def test_symbolic_refs(self):
    one_group = '(?P<a>x)'

    # (template, message, position) for malformed or out-of-range
    # references in the replacement template.
    malformed = [
        (r'\g<a', 'missing >, unterminated name', 3),
        (r'\g<', 'missing group name', 3),
        (r'\g', 'missing <', 2),
        (r'\g<a a>', "bad character in group name 'a a'", 3),
        (r'\g<>', 'missing group name', 3),
        (r'\g<1a1>', "bad character in group name '1a1'", 3),
        (r'\g<2>', 'invalid group reference 2', 3),
        (r'\2', 'invalid group reference 2', 1),
    ]
    for template, message, position in malformed:
        self.checkTemplateError(one_group, template, 'xx',
                                message, position)

    # A well-formed but unknown name is reported as IndexError.
    with self.assertRaisesRegex(IndexError, "unknown group name 'ab'"):
        re.sub(one_group, r'\g<ab>', 'xx')

    # A reference to a group that did not take part in the match
    # expands to the empty string.
    two_alternatives = '(?P<a>x)|(?P<b>y)'
    for template in (r'\g<b>', r'\2'):
        self.assertEqual(re.sub(two_alternatives, template, 'xx'), '')

    self.checkTemplateError(one_group, r'\g<-1>', 'xx',
                            "bad character in group name '-1'", 3)

    # New valid/invalid identifiers in Python 3
    self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
    self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
    self.checkTemplateError(one_group, r'\g<©>', 'xx',
                            "bad character in group name '©'", 3)

    # Support > 100 groups.
    alternatives = ['x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)]
    pat = '|'.join(alternatives)
    self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000289
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000290 def test_re_subn(self):
291 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
292 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
293 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
294 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
Serhiy Storchakab02f8fc2016-09-25 20:36:23 +0300295 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Victor Stinner55e614a2014-10-29 16:58:59 +0100296 self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000297
def test_re_split(self):
    # str-like subjects: the pieces must be exact str objects.
    for text in ":a:b::c", S(":a:b::c"):
        for sep, pieces in [
            (":", ['', 'a', 'b', '', 'c']),
            (":+", ['', 'a', 'b', 'c']),
            ("(:+)", ['', ':', 'a', ':', 'b', '::', 'c']),
        ]:
            self.assertTypedEqual(re.split(sep, text), pieces)

    # bytes-like subjects: the pieces must be exact bytes objects.
    for data in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                 memoryview(b":a:b::c")):
        for sep, pieces in [
            (b":", [b'', b'a', b'b', b'', b'c']),
            (b":+", [b'', b'a', b'b', b'c']),
            (b"(:+)", [b'', b':', b'a', b':', b'b', b'::', b'c']),
        ]:
            self.assertTypedEqual(re.split(sep, data), pieces)

    # Non-ASCII characters of increasing code point width.
    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                    "\U0001d49c\U0001d49e\U0001d4b5"):
        text = ":%s:%s::%s" % (a, b, c)
        self.assertEqual(re.split(":", text), ['', a, b, '', c])
        self.assertEqual(re.split(":+", text), ['', a, b, c])
        self.assertEqual(re.split("(:+)", text),
                         ['', ':', a, ':', b, '::', c])

    # Capturing versus non-capturing separators.
    for sep, pieces in [
        ("(?::+)", ['', 'a', 'b', 'c']),
        ("(:)+", ['', ':', 'a', ':', 'b', ':', 'c']),
        ("([b:]+)", ['', ':', 'a', ':b::', 'c']),
        ("(b)|(:+)", ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c']),
        ("(?:b)|(?::+)", ['', 'a', '', '', 'c']),
    ]:
        self.assertEqual(re.split(sep, ":a:b::c"), pieces)

    # Separators for which split() is expected to emit FutureWarning.
    warning_cases = [
        (':*', ['', 'a', 'b', 'c']),
        ('(?::*)', ['', 'a', 'b', 'c']),
        ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
        ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
    ]
    for sep, expected in warning_cases:
        with self.subTest(sep=sep):
            with self.assertWarns(FutureWarning):
                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)

    # Separators for which split() is expected to raise ValueError.
    error_cases = [
        ('', [':a:b::c']),
        (r'\b', [':a:b::c']),
        (r'(?=:)', [':a:b::c']),
        (r'(?<=:)', [':a:b::c']),
    ]
    for sep, expected in error_cases:
        with self.subTest(sep=sep):
            with self.assertRaises(ValueError):
                self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
350
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000351 def test_qualified_re_split(self):
Serhiy Storchakab02f8fc2016-09-25 20:36:23 +0300352 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
Victor Stinner55e614a2014-10-29 16:58:59 +0100353 self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
354 self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
355 self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000356 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200357 self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000358 ['', ':', 'a', ':', 'b::c'])
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200359 with self.assertWarns(FutureWarning):
360 self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
361 ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000362
def test_re_findall(self):
    self.assertEqual(re.findall(":+", "abc"), [])

    # str-like subjects: results must be exact str objects.
    for text in "a:b::c:::d", S("a:b::c:::d"):
        for pattern, found in [
            (":+", [":", "::", ":::"]),
            ("(:+)", [":", "::", ":::"]),
            ("(:)(:*)", [(":", ""), (":", ":"), (":", "::")]),
        ]:
            self.assertTypedEqual(re.findall(pattern, text), found)

    # bytes-like subjects: results must be exact bytes objects.
    for data in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
                 memoryview(b"a:b::c:::d")):
        for pattern, found in [
            (b":+", [b":", b"::", b":::"]),
            (b"(:+)", [b":", b"::", b":::"]),
            (b"(:)(:*)", [(b":", b""), (b":", b":"), (b":", b"::")]),
        ]:
            self.assertTypedEqual(re.findall(pattern, data), found)

    # Non-ASCII characters of increasing code point width.
    for x in ("\xe0", "\u0430", "\U0001d49c"):
        xx, xxx = x * 2, x * 3
        text = "a%sb%sc%sd" % (x, xx, xxx)
        self.assertEqual(re.findall("%s+" % x, text), [x, xx, xxx])
        self.assertEqual(re.findall("(%s+)" % x, text), [x, xx, xxx])
        self.assertEqual(re.findall("(%s)(%s*)" % (x, x), text),
                         [(x, ""), (x, x), (x, xx)])
Guido van Rossum49946571997-07-18 04:26:25 +0000388
Skip Montanaro5ba00542003-04-25 16:00:14 +0000389 def test_bug_117612(self):
390 self.assertEqual(re.findall(r"(a|(b))", "aba"),
391 [("a", ""),("b", "b"),("a", "")])
392
def test_re_match(self):
    # str-like subjects.
    for text in 'a', S('a'):
        self.assertEqual(re.match('a', text).groups(), ())
        m = re.match('(a)', text)
        self.assertEqual(m.groups(), ('a',))
        self.assertEqual(m.group(0), 'a')
        self.assertEqual(m.group(1), 'a')
        self.assertEqual(m.group(1, 1), ('a', 'a'))

    # bytes-like subjects.
    for data in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
        self.assertEqual(re.match(b'a', data).groups(), ())
        m = re.match(b'(a)', data)
        self.assertEqual(m.groups(), (b'a',))
        self.assertEqual(m.group(0), b'a')
        self.assertEqual(m.group(1), b'a')
        self.assertEqual(m.group(1, 1), (b'a', b'a'))

    # Non-ASCII characters of increasing code point width.
    for a in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.match(a, a).groups(), ())
        m = re.match('(%s)' % a, a)
        self.assertEqual(m.groups(), (a,))
        self.assertEqual(m.group(0), a)
        self.assertEqual(m.group(1), a)
        self.assertEqual(m.group(1, 1), (a, a))

    # Groups that did not take part in the match are reported as None.
    pat = re.compile('((a)|(b))(c)?')
    for text, groups in [
        ('a', ('a', 'a', None, None)),
        ('b', ('b', None, 'b', None)),
        ('ac', ('a', 'a', None, 'c')),
        ('bc', ('b', None, 'b', 'c')),
    ]:
        self.assertEqual(pat.match(text).groups(), groups)
    # ... unless groups() is given a default to use instead.
    self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

    # Groups can be requested by number, by name, or a mix of both.
    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                     (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000425
Serhiy Storchaka977b3ac2016-06-18 16:48:07 +0300426 def test_group(self):
427 class Index:
428 def __init__(self, value):
429 self.value = value
430 def __index__(self):
431 return self.value
432 # A single group
433 m = re.match('(a)(b)', 'ab')
434 self.assertEqual(m.group(), 'ab')
435 self.assertEqual(m.group(0), 'ab')
436 self.assertEqual(m.group(1), 'a')
437 self.assertEqual(m.group(Index(1)), 'a')
438 self.assertRaises(IndexError, m.group, -1)
439 self.assertRaises(IndexError, m.group, 3)
440 self.assertRaises(IndexError, m.group, 1<<1000)
441 self.assertRaises(IndexError, m.group, Index(1<<1000))
442 self.assertRaises(IndexError, m.group, 'x')
443 # Multiple groups
444 self.assertEqual(m.group(2, 1), ('b', 'a'))
445 self.assertEqual(m.group(Index(2), Index(1)), ('b', 'a'))
446
Eric V. Smith605bdae2016-09-11 08:55:43 -0400447 def test_match_getitem(self):
448 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
449
450 m = pat.match('a')
451 self.assertEqual(m['a1'], 'a')
452 self.assertEqual(m['b2'], None)
453 self.assertEqual(m['c3'], None)
454 self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=None')
455 self.assertEqual(m[0], 'a')
456 self.assertEqual(m[1], 'a')
457 self.assertEqual(m[2], None)
458 self.assertEqual(m[3], None)
459 with self.assertRaisesRegex(IndexError, 'no such group'):
460 m['X']
461 with self.assertRaisesRegex(IndexError, 'no such group'):
462 m[-1]
463 with self.assertRaisesRegex(IndexError, 'no such group'):
464 m[4]
465 with self.assertRaisesRegex(IndexError, 'no such group'):
466 m[0, 1]
467 with self.assertRaisesRegex(IndexError, 'no such group'):
468 m[(0,)]
469 with self.assertRaisesRegex(IndexError, 'no such group'):
470 m[(0, 1)]
Serhiy Storchaka50754162017-08-03 11:45:23 +0300471 with self.assertRaisesRegex(IndexError, 'no such group'):
Eric V. Smith605bdae2016-09-11 08:55:43 -0400472 'a1={a2}'.format_map(m)
473
474 m = pat.match('ac')
475 self.assertEqual(m['a1'], 'a')
476 self.assertEqual(m['b2'], None)
477 self.assertEqual(m['c3'], 'c')
478 self.assertEqual('a1={a1} b2={b2} c3={c3}'.format_map(m), 'a1=a b2=None c3=c')
479 self.assertEqual(m[0], 'ac')
480 self.assertEqual(m[1], 'a')
481 self.assertEqual(m[2], None)
482 self.assertEqual(m[3], 'c')
483
484 # Cannot assign.
485 with self.assertRaises(TypeError):
486 m[0] = 1
487
488 # No len().
489 self.assertRaises(TypeError, len, m)
490
def test_re_fullmatch(self):
    # Issue 16203: Proposal: add re.fullmatch() method.
    self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))

    # The longer alternative must be chosen so the whole subject matches.
    for text in "ab", S("ab"):
        self.assertEqual(re.fullmatch(r"a|ab", text).span(), (0, 2))
    for data in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
        self.assertEqual(re.fullmatch(br"a|ab", data).span(), (0, 2))
    for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
        alternatives = r"%s|%s" % (a, a + b)
        self.assertEqual(re.fullmatch(alternatives, a + b).span(), (0, 2))

    # Non-greedy repeats still have to stretch to the end.
    for pattern, text, span in [
        (r".*?$", "abc", (0, 3)),
        (r".*?", "abc", (0, 3)),
        (r"a.*?b", "ab", (0, 2)),
        (r"a.*?b", "abb", (0, 3)),
        (r"a.*?b", "axxb", (0, 4)),
    ]:
        self.assertEqual(re.fullmatch(pattern, text).span(), span)

    # No match when part of the subject is left over.
    for pattern, text in [
        (r"a+", "ab"),
        (r"abc$", "abc\n"),
        (r"abc\Z", "abc\n"),
        (r"(?m)abc$", "abc\n"),
    ]:
        self.assertIsNone(re.fullmatch(pattern, text))

    # Lookahead and lookbehind assertions.
    for pattern, text, span in [
        (r"ab(?=c)cd", "abcd", (0, 4)),
        (r"ab(?<=b)cd", "abcd", (0, 4)),
        (r"(?=a|ab)ab", "ab", (0, 2)),
    ]:
        self.assertEqual(re.fullmatch(pattern, text).span(), span)

    # pos and endpos bound the region that has to be matched in full.
    for pattern in (r"bc", r".*?$", r".*?"):
        compiled = re.compile(pattern)
        self.assertEqual(
            compiled.fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
520
def test_re_groupref_exists(self):
    # The closing parenthesis is required exactly when the opening one
    # was matched.
    parens = r'^(\()?([^()]+)(?(1)\))$'
    self.assertEqual(re.match(parens, '(a)').groups(), ('(', 'a'))
    self.assertEqual(re.match(parens, 'a').groups(), (None, 'a'))
    self.assertIsNone(re.match(parens, 'a)'))
    self.assertIsNone(re.match(parens, '(a'))

    # Conditionals with a yes|no pair and with an empty yes branch.
    for pattern, text, groups in [
        ('^(?:(a)|c)((?(1)b|d))$', 'ab', ('a', 'b')),
        (r'^(?:(a)|c)((?(1)b|d))$', 'cd', (None, 'd')),
        (r'^(?:(a)|c)((?(1)|d))$', 'cd', (None, 'd')),
        (r'^(?:(a)|c)((?(1)|d))$', 'a', ('a', '')),
    ]:
        self.assertEqual(re.match(pattern, text).groups(), groups)

    # Tests for bug #1177831: exercise groups other than the first group
    p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
    self.assertEqual(p.match('abc').groups(), ('a', 'b', 'c'))
    self.assertEqual(p.match('ad').groups(), ('a', None, 'd'))
    for rejected in ('abd', 'ac'):
        self.assertIsNone(p.match(rejected))

    # Support > 100 groups.
    alternatives = ['x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1)]
    pat = '(?:%s)(?(200)z)' % '|'.join(alternatives)
    self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))

    # Malformed conditionals.
    self.checkPatternError(r'(?P<a>)(?(0))', 'bad group number', 10)
    self.checkPatternError(r'()(?(1)a|b',
                           'missing ), unterminated subpattern', 2)
    self.checkPatternError(r'()(?(1)a|b|c)',
                           'conditional backref with more than '
                           'two branches', 10)
557
def test_re_groupref_overflow(self):
    # A group number equal to MAXGROUPS must be reported as an invalid
    # group reference, both in a replacement template and in a
    # conditional inside a pattern.
    from sre_constants import MAXGROUPS
    template = r'\g<%s>' % MAXGROUPS
    expected = 'invalid group reference %d' % MAXGROUPS
    self.checkTemplateError('()', template, 'xx', expected, 3)
    pattern = r'(?P<a>)(?(%d))' % MAXGROUPS
    self.checkPatternError(pattern, expected, 10)
563 'invalid group reference %d' % MAXGROUPS, 10)
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200564
def test_re_groupref(self):
    # Each row is (pattern, subject, expected groups); an expectation of
    # None means the pattern must not match the subject at all.
    expectations = (
        (r'^(\|)?([^()]+)\1$', '|a|', ('|', 'a')),
        (r'^(\|)?([^()]+)\1?$', 'a', (None, 'a')),
        (r'^(\|)?([^()]+)\1$', 'a|', None),
        (r'^(\|)?([^()]+)\1$', '|a', None),
        (r'^(?:(a)|c)(\1)$', 'aa', ('a', 'a')),
        (r'^(?:(a)|c)(\1)?$', 'c', (None, None)),
    )
    for pattern, subject, groups in expectations:
        found = re.match(pattern, subject)
        if groups is None:
            self.assertIsNone(found)
        else:
            self.assertEqual(found.groups(), groups)

    # Referring to a group from inside that same group is a compile error.
    self.checkPatternError(r'(abc\1)', 'cannot refer to an open group', 4)
578
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000579 def test_groupdict(self):
580 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
581 'first second').groupdict(),
582 {'first':'first', 'second':'second'})
583
584 def test_expand(self):
585 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
586 "first second")
587 .expand(r"\2 \1 \g<second> \g<first>"),
588 "second first second first")
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300589 self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
590 "first")
591 .expand(r"\2 \g<second>"),
592 " ")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000593
def test_repeat_minmax(self):
    # Exercise {m}, {m,n} and their lazy variants, on a capturing group
    # and on a plain literal.
    #
    # The lazy {3,4}? form used to be asserted twice in each "matching"
    # list while the greedy {3,4} form was never checked; the first
    # occurrence now tests the greedy quantifier.

    # Upper bound too small to consume all three characters.
    for pattern in (r"^(\w){1}$", r"^(\w){1}?$",
                    r"^(\w){1,2}$", r"^(\w){1,2}?$"):
        self.assertIsNone(re.match(pattern, "abc"))

    # The whole subject is consumed; the group keeps its last iteration.
    for pattern in (r"^(\w){3}$", r"^(\w){1,3}$",
                    r"^(\w){1,4}$", r"^(\w){3,4}$",
                    r"^(\w){3}?$", r"^(\w){1,3}?$",
                    r"^(\w){1,4}?$", r"^(\w){3,4}?$"):
        self.assertEqual(re.match(pattern, "abc").group(1), "c")

    for pattern in (r"^x{1}$", r"^x{1}?$", r"^x{1,2}$", r"^x{1,2}?$"):
        self.assertIsNone(re.match(pattern, "xxx"))

    for pattern in (r"^x{3}$", r"^x{1,3}$", r"^x{3,3}$",
                    r"^x{1,4}$", r"^x{3,4}$",
                    r"^x{3}?$", r"^x{1,3}?$",
                    r"^x{1,4}?$", r"^x{3,4}?$"):
        self.assertTrue(re.match(pattern, "xxx"))

    # Empty braces are not a quantifier: they match themselves literally.
    self.assertIsNone(re.match(r"^x{}$", "xxx"))
    self.assertTrue(re.match(r"^x{}$", "x{}"))

    self.checkPatternError(r'x{2,1}',
                           'min repeat greater than max repeat', 2)
629
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000630 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000631 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000632 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000633 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
634 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
635 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
636 {'first': 1, 'other': 2})
637
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000638 self.assertEqual(re.match("(a)", "a").pos, 0)
639 self.assertEqual(re.match("(a)", "a").endpos, 1)
640 self.assertEqual(re.match("(a)", "a").string, "a")
641 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300642 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000643
Serhiy Storchaka07360df2015-03-30 01:01:48 +0300644 # Issue 14260. groupindex should be non-modifiable mapping.
645 p = re.compile(r'(?i)(?P<first>a)(?P<other>b)')
646 self.assertEqual(sorted(p.groupindex), ['first', 'other'])
647 self.assertEqual(p.groupindex['other'], 2)
648 with self.assertRaises(TypeError):
649 p.groupindex['other'] = 0
650 self.assertEqual(p.groupindex['other'], 2)
651
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000652 def test_special_escapes(self):
653 self.assertEqual(re.search(r"\b(b.)\b",
654 "abcd abc bcd bx").group(1), "bx")
655 self.assertEqual(re.search(r"\B(b.)\B",
656 "abc bcd bc abxd").group(1), "bx")
657 self.assertEqual(re.search(r"\b(b.)\b",
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300658 "abcd abc bcd bx", re.ASCII).group(1), "bx")
659 self.assertEqual(re.search(r"\B(b.)\B",
660 "abc bcd bc abxd", re.ASCII).group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000661 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
662 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300663 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300664 self.assertEqual(re.search(br"\b(b.)\b",
665 b"abcd abc bcd bx").group(1), b"bx")
666 self.assertEqual(re.search(br"\B(b.)\B",
667 b"abc bcd bc abxd").group(1), b"bx")
668 self.assertEqual(re.search(br"\b(b.)\b",
669 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
670 self.assertEqual(re.search(br"\B(b.)\B",
671 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
672 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
673 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300674 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000675 self.assertEqual(re.search(r"\d\D\w\W\s\S",
676 "1aa! a").group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300677 self.assertEqual(re.search(br"\d\D\w\W\s\S",
678 b"1aa! a").group(0), b"1aa! a")
679 self.assertEqual(re.search(r"\d\D\w\W\s\S",
680 "1aa! a", re.ASCII).group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300681 self.assertEqual(re.search(br"\d\D\w\W\s\S",
682 b"1aa! a", re.LOCALE).group(0), b"1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000683
def test_other_escapes(self):
    self.checkPatternError("\\", 'bad escape (end of pattern)', 0)

    # (pattern, subject, expected match text); None means "no match".
    rows = (
        (r"\(", '(', '('),
        (r"\(", ')', None),
        (r"\\", '\\', '\\'),
        (r"[\]]", ']', ']'),
        (r"[\]]", '[', None),
        (r"[a\-c]", '-', '-'),
        (r"[a\-c]", 'b', None),
        (r"[\^a]+", 'a^', 'a^'),
        (r"[\^a]+", 'b', None),
    )
    for pattern, subject, expected in rows:
        found = re.match(pattern, subject)
        if expected is None:
            self.assertIsNone(found)
        else:
            self.assertEqual(found.group(), expected)

    re.purge()  # for warnings
    # Letters that are rejected as escapes outside a character class ...
    for letter in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY':
        with self.subTest(letter):
            with self.assertRaises(re.error):
                re.compile('\\%c' % letter)
    # ... and letters that are rejected inside one.
    for letter in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
        with self.subTest(letter):
            with self.assertRaises(re.error):
                re.compile('[\\%c]' % letter)
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200702
Ezio Melotti5a045b92012-02-29 11:48:44 +0200703 def test_string_boundaries(self):
704 # See http://bugs.python.org/issue10713
705 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
706 "abc")
707 # There's a word boundary at the start of a string.
708 self.assertTrue(re.match(r"\b", "abc"))
709 # A non-empty string includes a non-boundary zero-length match.
710 self.assertTrue(re.search(r"\B", "abc"))
711 # There is no non-boundary match at the start of a string.
712 self.assertFalse(re.match(r"\B", "abc"))
713 # However, an empty string contains no word boundaries, and also no
714 # non-boundaries.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300715 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200716 # This one is questionable and different from the perlre behaviour,
717 # but describes current behavior.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300718 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200719 # A single word-character string has two boundaries, but no
720 # non-boundary gaps.
721 self.assertEqual(len(re.findall(r"\b", "a")), 2)
722 self.assertEqual(len(re.findall(r"\B", "a")), 0)
723 # If there are no words, there are no boundaries
724 self.assertEqual(len(re.findall(r"\b", " ")), 0)
725 self.assertEqual(len(re.findall(r"\b", " ")), 0)
726 # Can match around the whitespace.
727 self.assertEqual(len(re.findall(r"\B", " ")), 2)
728
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000729 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000730 self.assertEqual(re.match("([\u2222\u2223])",
731 "\u2222").group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300732 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300733 self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000734
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100735 def test_big_codesize(self):
736 # Issue #1160
737 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300738 self.assertTrue(r.match('1000'))
739 self.assertTrue(r.match('9999'))
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100740
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000741 def test_anyall(self):
742 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
743 "a\nb")
744 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
745 "a\n\nb")
746
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200747 def test_lookahead(self):
R David Murray44b548d2016-09-08 13:59:53 -0400748 self.assertEqual(re.match(r"(a(?=\s[^a]))", "a b").group(1), "a")
749 self.assertEqual(re.match(r"(a(?=\s[^a]*))", "a b").group(1), "a")
750 self.assertEqual(re.match(r"(a(?=\s[abc]))", "a b").group(1), "a")
751 self.assertEqual(re.match(r"(a(?=\s[abc]*))", "a bc").group(1), "a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000752 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
753 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
754 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
755
756 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
757 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
758 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
759 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
760
Serhiy Storchaka4eea62f2015-02-21 10:07:35 +0200761 # Group reference.
762 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
763 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
764 # Conditional group reference.
765 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
766 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
767 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
768 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
769 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
770 # Group used before defined.
771 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
772 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
773 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
774
775 def test_lookbehind(self):
776 self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
777 self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
778 self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
779 self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
780 # Group reference.
781 self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
782 self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
783 self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
784 self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
785 # Conditional group reference.
786 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
787 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
788 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
789 self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
790 self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
791 # Group used before defined.
792 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
793 self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc'))
794 self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc'))
795 # Group defined in the same lookbehind pattern
796 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)')
797 self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)')
798 self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)')
799 self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)')
800
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000801 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000802 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300803 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000804 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
805 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
806 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
807 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
808 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
809 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
810 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
811 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
812
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200813 assert '\u212a'.lower() == 'k' # 'K'
814 self.assertTrue(re.match(r'K', '\u212a', re.I))
815 self.assertTrue(re.match(r'k', '\u212a', re.I))
816 self.assertTrue(re.match(r'\u212a', 'K', re.I))
817 self.assertTrue(re.match(r'\u212a', 'k', re.I))
818 assert '\u017f'.upper() == 'S' # 'ſ'
819 self.assertTrue(re.match(r'S', '\u017f', re.I))
820 self.assertTrue(re.match(r's', '\u017f', re.I))
821 self.assertTrue(re.match(r'\u017f', 'S', re.I))
822 self.assertTrue(re.match(r'\u017f', 's', re.I))
823 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
824 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
825 self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
826
827 def test_ignore_case_set(self):
828 self.assertTrue(re.match(r'[19A]', 'A', re.I))
829 self.assertTrue(re.match(r'[19a]', 'a', re.I))
830 self.assertTrue(re.match(r'[19a]', 'A', re.I))
831 self.assertTrue(re.match(r'[19A]', 'a', re.I))
832 self.assertTrue(re.match(br'[19A]', b'A', re.I))
833 self.assertTrue(re.match(br'[19a]', b'a', re.I))
834 self.assertTrue(re.match(br'[19a]', b'A', re.I))
835 self.assertTrue(re.match(br'[19A]', b'a', re.I))
836 assert '\u212a'.lower() == 'k' # 'K'
837 self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
838 self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
839 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
840 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
841 assert '\u017f'.upper() == 'S' # 'ſ'
842 self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
843 self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
844 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
845 self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
846 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
847 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
848 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
849
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200850 def test_ignore_case_range(self):
851 # Issues #3511, #17381.
852 self.assertTrue(re.match(r'[9-a]', '_', re.I))
853 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
854 self.assertTrue(re.match(br'[9-a]', b'_', re.I))
855 self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
856 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
857 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
858 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
859 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
860 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
861 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
862 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
863 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
864 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
865 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
866 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
867 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
868
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200869 assert '\u212a'.lower() == 'k' # 'K'
870 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
871 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
872 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
873 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
874 assert '\u017f'.upper() == 'S' # 'ſ'
875 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
876 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
877 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
878 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
879 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
880 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
881 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
882
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000883 def test_category(self):
884 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
885
@cpython_only
def test_case_helpers(self):
    # Compare the _sre case helpers with the str methods.
    import _sre

    # ASCII code points: both helper families agree with str.lower(),
    # and only the ASCII letters count as cased.
    for code in range(128):
        char = chr(code)
        lowered = ord(char.lower())
        self.assertEqual(_sre.ascii_tolower(code), lowered)
        self.assertEqual(_sre.unicode_tolower(code), lowered)
        iscased = char in string.ascii_letters
        self.assertEqual(_sre.ascii_iscased(code), iscased)
        self.assertEqual(_sre.unicode_iscased(code), iscased)

    # Non-ASCII code points: the ascii_* helpers leave them alone.
    for code in list(range(128, 0x1000)) + [0x10400, 0x10428]:
        char = chr(code)
        self.assertEqual(_sre.ascii_tolower(code), code)
        # U+0130 is checked separately at the end of the test.
        if code != 0x0130:
            self.assertEqual(_sre.unicode_tolower(code), ord(char.lower()))
        # This value used to be computed and then ignored while the same
        # expression was rebuilt inside the assertion; use it directly.
        iscased = char != char.lower() or char != char.upper()
        self.assertFalse(_sre.ascii_iscased(code))
        self.assertEqual(_sre.unicode_iscased(code), iscased)

    self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130)
    self.assertEqual(_sre.unicode_tolower(0x0130), ord('i'))
    self.assertFalse(_sre.ascii_iscased(0x0130))
    self.assertTrue(_sre.unicode_iscased(0x0130))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000912
913 def test_not_literal(self):
R David Murray44b548d2016-09-08 13:59:53 -0400914 self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
915 self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000916
Serhiy Storchaka05cb7282017-11-16 12:38:26 +0200917 def test_possible_set_operations(self):
918 s = bytes(range(128)).decode()
919 with self.assertWarns(FutureWarning):
920 p = re.compile(r'[0-9--1]')
921 self.assertEqual(p.findall(s), list('-./0123456789'))
922 self.assertEqual(re.findall(r'[--1]', s), list('-./01'))
923 with self.assertWarns(FutureWarning):
924 p = re.compile(r'[%--1]')
925 self.assertEqual(p.findall(s), list("%&'()*+,-1"))
926 with self.assertWarns(FutureWarning):
927 p = re.compile(r'[%--]')
928 self.assertEqual(p.findall(s), list("%&'()*+,-"))
929
930 with self.assertWarns(FutureWarning):
931 p = re.compile(r'[0-9&&1]')
932 self.assertEqual(p.findall(s), list('&0123456789'))
933 with self.assertWarns(FutureWarning):
934 p = re.compile(r'[\d&&1]')
935 self.assertEqual(p.findall(s), list('&0123456789'))
936 self.assertEqual(re.findall(r'[&&1]', s), list('&1'))
937
938 with self.assertWarns(FutureWarning):
939 p = re.compile(r'[0-9||a]')
940 self.assertEqual(p.findall(s), list('0123456789a|'))
941 with self.assertWarns(FutureWarning):
942 p = re.compile(r'[\d||a]')
943 self.assertEqual(p.findall(s), list('0123456789a|'))
944 self.assertEqual(re.findall(r'[||1]', s), list('1|'))
945
946 with self.assertWarns(FutureWarning):
947 p = re.compile(r'[0-9~~1]')
948 self.assertEqual(p.findall(s), list('0123456789~'))
949 with self.assertWarns(FutureWarning):
950 p = re.compile(r'[\d~~1]')
951 self.assertEqual(p.findall(s), list('0123456789~'))
952 self.assertEqual(re.findall(r'[~~1]', s), list('1~'))
953
954 with self.assertWarns(FutureWarning):
955 p = re.compile(r'[[0-9]|]')
956 self.assertEqual(p.findall(s), list('0123456789[]'))
957
958 with self.assertWarns(FutureWarning):
959 p = re.compile(r'[[:digit:]|]')
960 self.assertEqual(p.findall(s), list(':[]dgit'))
961
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000962 def test_search_coverage(self):
R David Murray44b548d2016-09-08 13:59:53 -0400963 self.assertEqual(re.search(r"\s(b)", " b").group(1), "b")
964 self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000965
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.fullmatch):
    # Assert that matcher(pattern, text) succeeds and that the match has
    # the expected text and span.  When both match and span are omitted,
    # the pattern is expected to match the whole of text.  Supplying only
    # one of them raises ValueError.
    if (match is None) != (span is None):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if match is None:
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    found = matcher(pattern, text)
    self.assertTrue(found)
    self.assertEqual(found.group(), match)
    self.assertEqual(found.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000979
# ASCII letters, digits and a few punctuation characters; the escape tests
# assert that re.escape() returns this string unchanged.
LITERAL_CHARS = ''.join((string.ascii_letters, string.digits,
                         '!"%\',/:;<=>@_`'))
Serhiy Storchaka59083002017-04-13 21:06:43 +0300981
def test_re_escape(self):
    # Every character below U+0100 must match itself after escaping:
    # bare, inside a character class, and in verbose mode.
    latin1 = ''.join(map(chr, range(256)))
    for char in latin1:
        escaped = re.escape(char)
        self.assertMatch(escaped, char)
        self.assertMatch('[' + escaped + ']', char)
        self.assertMatch('(?x)' + escaped, char)
    self.assertMatch(re.escape(latin1), latin1)
    # These characters must come back with a leading backslash ...
    for char in '-.]{}':
        self.assertEqual(re.escape(char)[:1], '\\')
    # ... while these must come back unchanged.
    unescaped = self.LITERAL_CHARS
    self.assertEqual(re.escape(unescaped), unescaped)
Guido van Rossum49946571997-07-18 04:26:25 +0000993
def test_re_escape_bytes(self):
    # Every byte value must match itself after escaping: bare, inside a
    # character class, and in verbose mode.
    every_byte = bytes(range(256))
    for value in every_byte:
        single = bytes([value])
        escaped = re.escape(single)
        self.assertMatch(escaped, single)
        self.assertMatch(b'[' + escaped + b']', single)
        self.assertMatch(b'(?x)' + escaped, single)
    self.assertMatch(re.escape(every_byte), every_byte)
    # These bytes must come back with a leading backslash ...
    for value in b'-.]{}':
        single = bytes([value])
        self.assertEqual(re.escape(single)[:1], b'\\')
    # ... while these must come back unchanged.
    unescaped = self.LITERAL_CHARS.encode('ascii')
    self.assertEqual(re.escape(unescaped), unescaped)
Guido van Rossum698280d2008-09-10 17:44:35 +00001007
def test_re_escape_non_ascii(self):
    # Non-ASCII characters pass through re.escape() unchanged.
    text = 'xxx\u2620\u2620\u2620xxx'
    escaped = re.escape(text)
    self.assertEqual(escaped, text)
    self.assertMatch(escaped, text)
    # An escaped non-ASCII character can still take a quantifier.
    pattern = '.%s+.' % re.escape('\u2620')
    self.assertMatch(pattern, text, 'x\u2620\u2620\u2620x', (2, 7),
                     re.search)
1015
def test_re_escape_non_ascii_bytes(self):
    # UTF-8 encoded non-ASCII text passes through re.escape() unchanged.
    data = 'y\u2620y\u2620y'.encode('utf-8')
    escaped = re.escape(data)
    self.assertEqual(escaped, data)
    self.assertMatch(escaped, data)
    # The escaped multi-byte sequence is found at both occurrences.
    needle = re.escape('\u2620'.encode('utf-8'))
    hits = re.findall(needle, data)
    self.assertEqual(len(hits), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +00001023
Serhiy Storchakab85a9762014-09-15 11:33:19 +03001024 def test_pickling(self):
1025 import pickle
1026 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
1027 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
1028 pickled = pickle.dumps(oldpat, proto)
1029 newpat = pickle.loads(pickled)
1030 self.assertEqual(newpat, oldpat)
1031 # current pickle expects the _compile() reconstructor in re module
1032 from re import _compile
Guido van Rossum23b22571997-07-17 22:36:14 +00001033
Serhiy Storchakafdbd0112017-04-16 10:16:03 +03001034 def test_copying(self):
1035 import copy
1036 p = re.compile(r'(?P<int>\d+)(?:\.(?P<frac>\d*))?')
1037 self.assertIs(copy.copy(p), p)
1038 self.assertIs(copy.deepcopy(p), p)
1039 m = p.match('12.34')
1040 self.assertIs(copy.copy(m), m)
1041 self.assertIs(copy.deepcopy(m), m)
1042
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001043 def test_constants(self):
1044 self.assertEqual(re.I, re.IGNORECASE)
1045 self.assertEqual(re.L, re.LOCALE)
1046 self.assertEqual(re.M, re.MULTILINE)
1047 self.assertEqual(re.S, re.DOTALL)
1048 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001049
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001050 def test_flags(self):
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001051 for flag in [re.I, re.M, re.X, re.S, re.A, re.U]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001052 self.assertTrue(re.compile('^pattern$', flag))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001053 for flag in [re.I, re.M, re.X, re.S, re.A, re.L]:
1054 self.assertTrue(re.compile(b'^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +00001055
def test_sre_character_literals(self):
    # (escape template, text expected right after the escaped character)
    octal_and_hex = [(r"\%03o", ""), (r"\%03o0", "0"), (r"\%03o8", "8"),
                     (r"\x%02x", ""), (r"\x%02x0", "0"), (r"\x%02xz", "z")]
    short_unicode = [(r"\u%04x", ""), (r"\u%04x0", "0"), (r"\u%04xz", "z")]
    long_unicode = [(r"\U%08x", ""), (r"\U%08x0", "0"), (r"\U%08xz", "z")]

    for code in [0, 8, 16, 32, 64, 127, 128, 255, 256,
                 0xFFFF, 0x10000, 0x10FFFF]:
        # Use only the escape forms wide enough to express this code point.
        templates = []
        if code < 256:
            templates += octal_and_hex
        if code < 0x10000:
            templates += short_unicode
        templates += long_unicode
        for template, tail in templates:
            self.assertTrue(re.match(template % code, chr(code) + tail))

    # Short octal escapes.
    for pattern, subject in ((r"\0", "\000"), (r"\08", "\0008"),
                             (r"\01", "\001"), (r"\018", "\0018")):
        self.assertTrue(re.match(pattern, subject))

    # Malformed or out-of-range escapes.
    self.checkPatternError(
        r"\567", r'octal escape value \567 outside of ' r'range 0-0o377', 0)
    self.checkPatternError(r"\911", 'invalid group reference 91', 1)
    for pattern, message in (
        (r"\x1", r'incomplete escape \x1'),
        (r"\x1z", r'incomplete escape \x1'),
        (r"\u123", r'incomplete escape \u123'),
        (r"\u123z", r'incomplete escape \u123'),
        (r"\U0001234", r'incomplete escape \U0001234'),
        (r"\U0001234z", r'incomplete escape \U0001234'),
        (r"\U00110000", r'bad escape \U00110000'),
    ):
        self.checkPatternError(pattern, message, 0)
Skip Montanaro7d9963f2003-04-25 14:12:40 +00001087
def test_sre_character_class_literals(self):
    # (class template, text appended to the subject after the character)
    octal_and_hex = [(r"[\%o]", ""), (r"[\%o8]", ""),
                     (r"[\%03o]", ""), (r"[\%03o0]", ""), (r"[\%03o8]", ""),
                     (r"[\x%02x]", ""), (r"[\x%02x0]", ""),
                     (r"[\x%02xz]", "")]
    short_unicode = [(r"[\u%04x]", ""), (r"[\u%04x0]", ""),
                     (r"[\u%04xz]", "")]
    long_unicode = [(r"[\U%08x]", ""), (r"[\U%08x0]", "0"),
                    (r"[\U%08xz]", "z")]

    for code in [0, 8, 16, 32, 64, 127, 128, 255, 256,
                 0xFFFF, 0x10000, 0x10FFFF]:
        # Use only the escape forms wide enough to express this code point.
        templates = []
        if code < 256:
            templates += octal_and_hex
        if code < 0x10000:
            templates += short_unicode
        templates += long_unicode
        for template, tail in templates:
            self.assertTrue(re.match(template % code, chr(code) + tail))

    # Malformed or out-of-range escapes inside a class.
    self.checkPatternError(
        r"[\567]", r'octal escape value \567 outside of ' r'range 0-0o377',
        1)
    for pattern, message in (
        (r"[\911]", r'bad escape \9'),
        (r"[\x1z]", r'incomplete escape \x1'),
        (r"[\u123z]", r'incomplete escape \u123'),
        (r"[\U0001234z]", r'incomplete escape \U0001234'),
        (r"[\U00110000]", r'bad escape \U00110000'),
    ):
        self.checkPatternError(pattern, message, 1)

    # A range whose endpoints are both written as \U escapes.
    self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +02001115
1116 def test_sre_byte_literals(self):
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +00001117 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001118 self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
1119 self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
1120 self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
1121 self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
1122 self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
1123 self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
Serhiy Storchaka9bd85b82016-06-11 19:15:00 +03001124 self.assertRaises(re.error, re.compile, br"\u1234")
1125 self.assertRaises(re.error, re.compile, br"\U00012345")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001126 self.assertTrue(re.match(br"\0", b"\000"))
1127 self.assertTrue(re.match(br"\08", b"\0008"))
1128 self.assertTrue(re.match(br"\01", b"\001"))
1129 self.assertTrue(re.match(br"\018", b"\0018"))
Serhiy Storchaka632a77e2015-03-25 21:03:47 +02001130 self.checkPatternError(br"\567",
1131 r'octal escape value \567 outside of '
1132 r'range 0-0o377', 0)
Serhiy Storchaka662cef62016-10-23 12:11:19 +03001133 self.checkPatternError(br"\911", 'invalid group reference 91', 1)
Serhiy Storchaka632a77e2015-03-25 21:03:47 +02001134 self.checkPatternError(br"\x1", r'incomplete escape \x1', 0)
1135 self.checkPatternError(br"\x1z", r'incomplete escape \x1', 0)
Antoine Pitrou463badf2012-06-23 13:29:19 +02001136
1137 def test_sre_byte_class_literals(self):
1138 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001139 self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i])))
1140 self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i])))
1141 self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i])))
1142 self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
1143 self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
1144 self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
1145 self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
1146 self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
Serhiy Storchaka9bd85b82016-06-11 19:15:00 +03001147 self.assertRaises(re.error, re.compile, br"[\u1234]")
1148 self.assertRaises(re.error, re.compile, br"[\U00012345]")
Serhiy Storchaka632a77e2015-03-25 21:03:47 +02001149 self.checkPatternError(br"[\567]",
1150 r'octal escape value \567 outside of '
1151 r'range 0-0o377', 1)
1152 self.checkPatternError(br"[\911]", r'bad escape \9', 1)
1153 self.checkPatternError(br"[\x1z]", r'incomplete escape \x1', 1)
1154
1155 def test_character_set_errors(self):
1156 self.checkPatternError(r'[', 'unterminated character set', 0)
1157 self.checkPatternError(r'[^', 'unterminated character set', 0)
1158 self.checkPatternError(r'[a', 'unterminated character set', 0)
1159 # bug 545855 -- This pattern failed to cause a compile error as it
1160 # should, instead provoking a TypeError.
1161 self.checkPatternError(r"[a-", 'unterminated character set', 0)
1162 self.checkPatternError(r"[\w-b]", r'bad character range \w-b', 1)
1163 self.checkPatternError(r"[a-\w]", r'bad character range a-\w', 1)
1164 self.checkPatternError(r"[b-a]", 'bad character range b-a', 1)
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +00001165
Skip Montanaro7d9963f2003-04-25 14:12:40 +00001166 def test_bug_113254(self):
1167 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
1168 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
1169 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
1170
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001171 def test_bug_527371(self):
1172 # bug described in patches 527371/672491
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001173 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001174 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
1175 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
R David Murray44b548d2016-09-08 13:59:53 -04001176 self.assertEqual(re.match(r"(?P<a>a(b))", "ab").lastgroup, 'a')
1177 self.assertEqual(re.match(r"((a))", "a").lastindex, 1)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001178
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001179 def test_bug_418626(self):
1180 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
1181 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
1182 # pattern '*?' on a long string.
1183 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
1184 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
1185 20003)
1186 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001187 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +00001188 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001189 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001190
1191 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001192 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +00001193 self.assertEqual(re.compile(pat) and 1, 1)
1194
Skip Montanaro1e703c62003-04-25 15:40:28 +00001195 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001196 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +00001197 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +00001198 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
1199 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
1200 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +00001201
Serhiy Storchaka632a77e2015-03-25 21:03:47 +02001202 def test_nothing_to_repeat(self):
1203 for reps in '*', '+', '?', '{1,2}':
1204 for mod in '', '?':
1205 self.checkPatternError('%s%s' % (reps, mod),
1206 'nothing to repeat', 0)
1207 self.checkPatternError('(?:%s%s)' % (reps, mod),
1208 'nothing to repeat', 3)
1209
1210 def test_multiple_repeat(self):
1211 for outer_reps in '*', '+', '{1,2}':
1212 for outer_mod in '', '?':
1213 outer_op = outer_reps + outer_mod
1214 for inner_reps in '*', '+', '?', '{1,2}':
1215 for inner_mod in '', '?':
1216 inner_op = inner_reps + inner_mod
1217 self.checkPatternError(r'x%s%s' % (inner_op, outer_op),
1218 'multiple repeat', 1 + len(inner_op))
1219
Serhiy Storchakafa468162013-02-16 21:23:53 +02001220 def test_unlimited_zero_width_repeat(self):
1221 # Issue #9669
1222 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
1223 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
1224 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
1225 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
1226 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
1227 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
1228
Skip Montanaro1e703c62003-04-25 15:40:28 +00001229 def test_scanner(self):
1230 def s_ident(scanner, token): return token
1231 def s_operator(scanner, token): return "op%s" % token
1232 def s_float(scanner, token): return float(token)
1233 def s_int(scanner, token): return int(token)
1234
1235 scanner = Scanner([
1236 (r"[a-zA-Z_]\w*", s_ident),
1237 (r"\d+\.\d*", s_float),
1238 (r"\d+", s_int),
1239 (r"=|\+|-|\*|/", s_operator),
1240 (r"\s+", None),
1241 ])
1242
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001243 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +00001244
Skip Montanaro1e703c62003-04-25 15:40:28 +00001245 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
1246 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
1247 'op+', 'bar'], ''))
1248
Skip Montanaro5ba00542003-04-25 16:00:14 +00001249 def test_bug_448951(self):
1250 # bug 448951 (similar to 429357, but with single char match)
1251 # (Also test greedy matches.)
1252 for op in '','?','*':
1253 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
1254 (None, None))
1255 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
1256 ('a:', 'a'))
1257
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +00001258 def test_bug_725106(self):
1259 # capturing groups in alternatives in repeats
1260 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
1261 ('b', 'a'))
1262 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
1263 ('c', 'b'))
1264 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
1265 ('b', None))
1266 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
1267 ('b', None))
1268 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
1269 ('b', 'a'))
1270 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
1271 ('c', 'b'))
1272 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
1273 ('b', None))
1274 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
1275 ('b', None))
1276
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +00001277 def test_bug_725149(self):
1278 # mark_stack_base restoring before restoring marks
1279 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
1280 ('a', None))
1281 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
1282 ('a', None, None))
1283
Just van Rossum12723ba2003-07-02 20:03:04 +00001284 def test_bug_764548(self):
1285 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001286 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +00001287 pat = re.compile(my_unicode("abc"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001288 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +00001289
Skip Montanaro5ba00542003-04-25 16:00:14 +00001290 def test_finditer(self):
1291 iter = re.finditer(r":+", "a:b::c:::d")
1292 self.assertEqual([item.group(0) for item in iter],
1293 [":", "::", ":::"])
1294
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001295 pat = re.compile(r":+")
1296 iter = pat.finditer("a:b::c:::d", 1, 10)
1297 self.assertEqual([item.group(0) for item in iter],
1298 [":", "::", ":::"])
1299
1300 pat = re.compile(r":+")
1301 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
1302 self.assertEqual([item.group(0) for item in iter],
1303 [":", "::", ":::"])
1304
1305 pat = re.compile(r":+")
1306 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1307 self.assertEqual([item.group(0) for item in iter],
1308 [":", "::", ":::"])
1309
1310 pat = re.compile(r":+")
1311 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1312 self.assertEqual([item.group(0) for item in iter],
1313 ["::", "::"])
1314
Thomas Wouters40a088d2008-03-18 20:19:54 +00001315 def test_bug_926075(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001316 self.assertIsNot(re.compile('bug_926075'),
1317 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +00001318
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001319 def test_bug_931848(self):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001320 pattern = "[\u002E\u3002\uFF0E\uFF61]"
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001321 self.assertEqual(re.compile(pattern).split("a.b.c"),
1322 ['a','b','c'])
1323
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001324 def test_bug_581080(self):
1325 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +00001326 self.assertEqual(next(iter).span(), (1,2))
1327 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001328
1329 scanner = re.compile(r"\s").scanner("a b")
1330 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001331 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001332
1333 def test_bug_817234(self):
1334 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +00001335 self.assertEqual(next(iter).span(), (0, 4))
1336 self.assertEqual(next(iter).span(), (4, 4))
1337 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001338
Mark Dickinson1f268282009-07-28 17:22:36 +00001339 def test_bug_6561(self):
1340 # '\d' should match characters in Unicode category 'Nd'
1341 # (Number, Decimal Digit), but not those in 'Nl' (Number,
1342 # Letter) or 'No' (Number, Other).
1343 decimal_digits = [
1344 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1345 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1346 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1347 ]
1348 for x in decimal_digits:
R David Murray44b548d2016-09-08 13:59:53 -04001349 self.assertEqual(re.match(r'^\d$', x).group(0), x)
Mark Dickinson1f268282009-07-28 17:22:36 +00001350
1351 not_decimal_digits = [
1352 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1353 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1354 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1355 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1356 ]
1357 for x in not_decimal_digits:
R David Murray44b548d2016-09-08 13:59:53 -04001358 self.assertIsNone(re.match(r'^\d$', x))
Mark Dickinson1f268282009-07-28 17:22:36 +00001359
Guido van Rossumd8faa362007-04-27 19:54:29 +00001360 def test_empty_array(self):
1361 # SF buf 1647541
1362 import array
Guido van Rossum166746c2007-07-03 15:39:16 +00001363 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +00001364 a = array.array(typecode)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001365 self.assertIsNone(re.compile(b"bla").match(a))
Antoine Pitroufd036452008-08-19 17:56:33 +00001366 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001367
Christian Heimes072c0f12008-01-03 23:01:04 +00001368 def test_inline_flags(self):
1369 # Bug #1700
Serhiy Storchakaab140882014-11-11 21:13:28 +02001370 upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
1371 lower_char = '\u1ea1' # Latin Small Letter A with Dot Below
Christian Heimes072c0f12008-01-03 23:01:04 +00001372
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001373 p = re.compile('.' + upper_char, re.I | re.S)
1374 q = p.match('\n' + lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001375 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001376
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001377 p = re.compile('.' + lower_char, re.I | re.S)
1378 q = p.match('\n' + upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001379 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001380
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001381 p = re.compile('(?i).' + upper_char, re.S)
1382 q = p.match('\n' + lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001383 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001384
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001385 p = re.compile('(?i).' + lower_char, re.S)
1386 q = p.match('\n' + upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001387 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001388
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001389 p = re.compile('(?is).' + upper_char)
1390 q = p.match('\n' + lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001391 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001392
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001393 p = re.compile('(?is).' + lower_char)
1394 q = p.match('\n' + upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001395 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001396
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001397 p = re.compile('(?s)(?i).' + upper_char)
1398 q = p.match('\n' + lower_char)
1399 self.assertTrue(q)
1400
1401 p = re.compile('(?s)(?i).' + lower_char)
1402 q = p.match('\n' + upper_char)
1403 self.assertTrue(q)
1404
1405 self.assertTrue(re.match('(?ix) ' + upper_char, lower_char))
1406 self.assertTrue(re.match('(?ix) ' + lower_char, upper_char))
1407 self.assertTrue(re.match(' (?i) ' + upper_char, lower_char, re.X))
1408 self.assertTrue(re.match('(?x) (?i) ' + upper_char, lower_char))
1409 self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char, re.X))
Serhiy Storchakad65cd092016-09-11 01:39:01 +03001410
Serhiy Storchakaabf275a2016-09-17 01:29:58 +03001411 p = upper_char + '(?i)'
1412 with self.assertWarns(DeprecationWarning) as warns:
1413 self.assertTrue(re.match(p, lower_char))
1414 self.assertEqual(
1415 str(warns.warnings[0].message),
Roy Williams171b9a32017-06-09 22:01:16 -07001416 'Flags not at the start of the expression %r' % p
Serhiy Storchakaabf275a2016-09-17 01:29:58 +03001417 )
Serhiy Storchakac7ac7282017-05-16 15:16:15 +03001418 self.assertEqual(warns.warnings[0].filename, __file__)
Serhiy Storchakaabf275a2016-09-17 01:29:58 +03001419
1420 p = upper_char + '(?i)%s' % ('.?' * 100)
1421 with self.assertWarns(DeprecationWarning) as warns:
1422 self.assertTrue(re.match(p, lower_char))
1423 self.assertEqual(
1424 str(warns.warnings[0].message),
Roy Williams171b9a32017-06-09 22:01:16 -07001425 'Flags not at the start of the expression %r (truncated)' % p[:20]
Serhiy Storchakaabf275a2016-09-17 01:29:58 +03001426 )
Serhiy Storchakac7ac7282017-05-16 15:16:15 +03001427 self.assertEqual(warns.warnings[0].filename, __file__)
Serhiy Storchakabd48d272016-09-11 12:50:02 +03001428
Roy Williams171b9a32017-06-09 22:01:16 -07001429 # bpo-30605: Compiling a bytes instance regex was throwing a BytesWarning
1430 with warnings.catch_warnings():
1431 warnings.simplefilter('error', BytesWarning)
1432 p = b'A(?i)'
1433 with self.assertWarns(DeprecationWarning) as warns:
1434 self.assertTrue(re.match(p, b'a'))
1435 self.assertEqual(
1436 str(warns.warnings[0].message),
1437 'Flags not at the start of the expression %r' % p
1438 )
1439 self.assertEqual(warns.warnings[0].filename, __file__)
1440
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001441 with self.assertWarns(DeprecationWarning):
1442 self.assertTrue(re.match('(?s).(?i)' + upper_char, '\n' + lower_char))
1443 with self.assertWarns(DeprecationWarning):
1444 self.assertTrue(re.match('(?i) ' + upper_char + ' (?x)', lower_char))
1445 with self.assertWarns(DeprecationWarning):
1446 self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char))
1447 with self.assertWarns(DeprecationWarning):
1448 self.assertTrue(re.match('^(?i)' + upper_char, lower_char))
1449 with self.assertWarns(DeprecationWarning):
1450 self.assertTrue(re.match('$|(?i)' + upper_char, lower_char))
Serhiy Storchakac7ac7282017-05-16 15:16:15 +03001451 with self.assertWarns(DeprecationWarning) as warns:
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001452 self.assertTrue(re.match('(?:(?i)' + upper_char + ')', lower_char))
Serhiy Storchakac7ac7282017-05-16 15:16:15 +03001453 self.assertRegex(str(warns.warnings[0].message),
1454 'Flags not at the start')
1455 self.assertEqual(warns.warnings[0].filename, __file__)
1456 with self.assertWarns(DeprecationWarning) as warns:
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001457 self.assertTrue(re.fullmatch('(^)?(?(1)(?i)' + upper_char + ')',
1458 lower_char))
Serhiy Storchakac7ac7282017-05-16 15:16:15 +03001459 self.assertRegex(str(warns.warnings[0].message),
1460 'Flags not at the start')
1461 self.assertEqual(warns.warnings[0].filename, __file__)
1462 with self.assertWarns(DeprecationWarning) as warns:
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001463 self.assertTrue(re.fullmatch('($)?(?(1)|(?i)' + upper_char + ')',
1464 lower_char))
Serhiy Storchakac7ac7282017-05-16 15:16:15 +03001465 self.assertRegex(str(warns.warnings[0].message),
1466 'Flags not at the start')
1467 self.assertEqual(warns.warnings[0].filename, __file__)
Serhiy Storchaka305ccbe2017-05-10 06:05:20 +03001468
1469
Christian Heimes25bb7832008-01-11 16:17:00 +00001470 def test_dollar_matches_twice(self):
1471 "$ matches the end of string, and just before the terminating \n"
1472 pattern = re.compile('$')
1473 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1474 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1475 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1476
1477 pattern = re.compile('$', re.MULTILINE)
1478 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1479 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1480 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1481
Antoine Pitroufd036452008-08-19 17:56:33 +00001482 def test_bytes_str_mixing(self):
1483 # Mixing str and bytes is disallowed
1484 pat = re.compile('.')
1485 bpat = re.compile(b'.')
1486 self.assertRaises(TypeError, pat.match, b'b')
1487 self.assertRaises(TypeError, bpat.match, 'b')
1488 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1489 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1490 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1491 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1492 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1493 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1494
1495 def test_ascii_and_unicode_flag(self):
1496 # String patterns
1497 for flags in (0, re.UNICODE):
1498 pat = re.compile('\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001499 self.assertTrue(pat.match('\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001500 pat = re.compile(r'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001501 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001502 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001503 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001504 pat = re.compile('(?a)\xc0', re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001505 self.assertIsNone(pat.match('\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001506 pat = re.compile(r'\w', re.ASCII)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001507 self.assertIsNone(pat.match('\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001508 pat = re.compile(r'(?a)\w')
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001509 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001510 # Bytes patterns
1511 for flags in (0, re.ASCII):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001512 pat = re.compile(b'\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001513 self.assertIsNone(pat.match(b'\xe0'))
R David Murray44b548d2016-09-08 13:59:53 -04001514 pat = re.compile(br'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001515 self.assertIsNone(pat.match(b'\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001516 # Incompatibilities
R David Murray44b548d2016-09-08 13:59:53 -04001517 self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE)
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001518 self.assertRaises(re.error, re.compile, br'(?u)\w')
R David Murray44b548d2016-09-08 13:59:53 -04001519 self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII)
1520 self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII)
1521 self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE)
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001522 self.assertRaises(re.error, re.compile, r'(?au)\w')
Antoine Pitroufd036452008-08-19 17:56:33 +00001523
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001524 def test_locale_flag(self):
1525 import locale
Benjamin Peterson21a74312017-03-07 22:48:09 -08001526 _, enc = locale.getlocale(locale.LC_CTYPE)
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001527 # Search non-ASCII letter
1528 for i in range(128, 256):
1529 try:
1530 c = bytes([i]).decode(enc)
1531 sletter = c.lower()
1532 if sletter == c: continue
1533 bletter = sletter.encode(enc)
1534 if len(bletter) != 1: continue
1535 if bletter.decode(enc) != sletter: continue
1536 bpat = re.escape(bytes([i]))
1537 break
1538 except (UnicodeError, TypeError):
1539 pass
Benjamin Peterson1e687162017-03-01 21:53:00 -08001540 else:
1541 bletter = None
1542 bpat = b'A'
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001543 # Bytes patterns
1544 pat = re.compile(bpat, re.LOCALE | re.IGNORECASE)
1545 if bletter:
1546 self.assertTrue(pat.match(bletter))
1547 pat = re.compile(b'(?L)' + bpat, re.IGNORECASE)
1548 if bletter:
1549 self.assertTrue(pat.match(bletter))
1550 pat = re.compile(bpat, re.IGNORECASE)
1551 if bletter:
1552 self.assertIsNone(pat.match(bletter))
R David Murray44b548d2016-09-08 13:59:53 -04001553 pat = re.compile(br'\w', re.LOCALE)
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001554 if bletter:
1555 self.assertTrue(pat.match(bletter))
R David Murray44b548d2016-09-08 13:59:53 -04001556 pat = re.compile(br'(?L)\w')
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001557 if bletter:
1558 self.assertTrue(pat.match(bletter))
R David Murray44b548d2016-09-08 13:59:53 -04001559 pat = re.compile(br'\w')
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001560 if bletter:
1561 self.assertIsNone(pat.match(bletter))
1562 # Incompatibilities
Serhiy Storchaka9bd85b82016-06-11 19:15:00 +03001563 self.assertRaises(ValueError, re.compile, '', re.LOCALE)
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001564 self.assertRaises(re.error, re.compile, '(?L)')
Serhiy Storchaka9bd85b82016-06-11 19:15:00 +03001565 self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII)
1566 self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII)
1567 self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE)
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001568 self.assertRaises(re.error, re.compile, b'(?aL)')
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001569
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +03001570 def test_scoped_flags(self):
1571 self.assertTrue(re.match(r'(?i:a)b', 'Ab'))
1572 self.assertIsNone(re.match(r'(?i:a)b', 'aB'))
1573 self.assertIsNone(re.match(r'(?-i:a)b', 'Ab', re.IGNORECASE))
1574 self.assertTrue(re.match(r'(?-i:a)b', 'aB', re.IGNORECASE))
1575 self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab'))
1576 self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB'))
1577
1578 self.assertTrue(re.match(r'(?x: a) b', 'a b'))
1579 self.assertIsNone(re.match(r'(?x: a) b', ' a b'))
1580 self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE))
1581 self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE))
1582
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001583 self.assertTrue(re.match(r'\w(?a:\W)\w', '\xe0\xe0\xe0'))
1584 self.assertTrue(re.match(r'(?a:\W(?u:\w)\W)', '\xe0\xe0\xe0'))
1585 self.assertTrue(re.match(r'\W(?u:\w)\W', '\xe0\xe0\xe0', re.ASCII))
1586
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +03001587 self.checkPatternError(r'(?a)(?-a:\w)',
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001588 "bad inline flags: cannot turn off flags 'a', 'u' and 'L'", 8)
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +03001589 self.checkPatternError(r'(?i-i:a)',
Serhiy Storchaka3557b052017-10-24 23:31:42 +03001590 'bad inline flags: flag turned on and off', 5)
1591 self.checkPatternError(r'(?au:a)',
1592 "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4)
1593 self.checkPatternError(br'(?aL:a)',
1594 "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4)
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +03001595
1596 self.checkPatternError(r'(?-', 'missing flag', 3)
1597 self.checkPatternError(r'(?-+', 'missing flag', 3)
1598 self.checkPatternError(r'(?-z', 'unknown flag', 3)
1599 self.checkPatternError(r'(?-i', 'missing :', 4)
1600 self.checkPatternError(r'(?-i)', 'missing :', 4)
1601 self.checkPatternError(r'(?-i+', 'missing :', 4)
1602 self.checkPatternError(r'(?-iz', 'unknown flag', 4)
1603 self.checkPatternError(r'(?i:', 'missing ), unterminated subpattern', 0)
1604 self.checkPatternError(r'(?i', 'missing -, : or )', 3)
1605 self.checkPatternError(r'(?i+', 'missing -, : or )', 3)
1606 self.checkPatternError(r'(?iz', 'unknown flag', 3)
1607
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001608 def test_bug_6509(self):
1609 # Replacement strings of both types must parse properly.
1610 # all strings
R David Murray44b548d2016-09-08 13:59:53 -04001611 pat = re.compile(r'a(\w)')
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001612 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1613 pat = re.compile('a(.)')
1614 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1615 pat = re.compile('..')
1616 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1617
1618 # all bytes
R David Murray44b548d2016-09-08 13:59:53 -04001619 pat = re.compile(br'a(\w)')
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001620 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1621 pat = re.compile(b'a(.)')
1622 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1623 pat = re.compile(b'..')
1624 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1625
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001626 def test_dealloc(self):
1627 # issue 3299: check for segfault in debug build
1628 import _sre
Ezio Melotti0f77f462010-01-23 10:49:39 +00001629 # the overflow limit is different on wide and narrow builds and it
1630 # depends on the definition of SRE_CODE (see sre.h).
1631 # 2**128 should be big enough to overflow on both. For smaller values
1632 # a RuntimeError is raised instead of OverflowError.
1633 long_overflow = 2**128
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001634 self.assertRaises(TypeError, re.finditer, "a", {})
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001635 with self.assertRaises(OverflowError):
Victor Stinner726a57d2016-11-22 23:04:39 +01001636 _sre.compile("abc", 0, [long_overflow], 0, {}, ())
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001637 with self.assertRaises(TypeError):
1638 _sre.compile({}, 0, [], 0, [], [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640 def test_search_dot_unicode(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001641 self.assertTrue(re.search("123.*-", '123abc-'))
1642 self.assertTrue(re.search("123.*-", '123\xe9-'))
1643 self.assertTrue(re.search("123.*-", '123\u20ac-'))
1644 self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1645 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646
Ezio Melottidf723e12012-03-13 01:29:48 +02001647 def test_compile(self):
1648 # Test return value when given string and pattern as parameter
1649 pattern = re.compile('random pattern')
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03001650 self.assertIsInstance(pattern, re.Pattern)
Ezio Melottidf723e12012-03-13 01:29:48 +02001651 same_pattern = re.compile(pattern)
Serhiy Storchaka0b5e61d2017-10-04 20:09:49 +03001652 self.assertIsInstance(same_pattern, re.Pattern)
Ezio Melottidf723e12012-03-13 01:29:48 +02001653 self.assertIs(same_pattern, pattern)
1654 # Test behaviour when not given a string or pattern as parameter
1655 self.assertRaises(TypeError, re.compile, 0)
1656
Antoine Pitroub33941a2012-12-03 20:55:56 +01001657 @bigmemtest(size=_2G, memuse=1)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001658 def test_large_search(self, size):
1659 # Issue #10182: indices were 32-bit-truncated.
1660 s = 'a' * size
1661 m = re.search('$', s)
1662 self.assertIsNotNone(m)
Antoine Pitrou86067c22012-12-03 21:08:43 +01001663 self.assertEqual(m.start(), size)
1664 self.assertEqual(m.end(), size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001665
# The huge memuse is because of re.sub() using a list and a join()
# to create the replacement result.
@bigmemtest(size=_2G, memuse=16 + 2)
def test_large_subn(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    # The empty pattern matches before every character and once at the
    # end, hence size + 1 substitutions that leave the text unchanged.
    s = 'a' * size
    r, n = re.subn('', '', s)
    self.assertEqual(r, s)
    self.assertEqual(n, size + 1)
1675
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001676 def test_bug_16688(self):
1677 # Issue 16688: Backreferences make case-insensitive regex fail on
1678 # non-ASCII strings.
1679 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1680 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001681
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001682 def test_repeat_minmax_overflow(self):
1683 # Issue #13169
1684 string = "x" * 100000
1685 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1686 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1687 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1688 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1689 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1690 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1691 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1692 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1693 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1694 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1695 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1696
@cpython_only
def test_repeat_minmax_overflow_maxrepeat(self):
    # Check repeat counts just below and exactly at _sre.MAXREPEAT:
    # MAXREPEAT - 1 still compiles, MAXREPEAT itself raises OverflowError.
    try:
        from _sre import MAXREPEAT
    except ImportError:
        self.skipTest('requires _sre.MAXREPEAT constant')
    # Not called ``string`` so the ``string`` module imported at the top
    # of this file is not shadowed.
    subject = "x" * 100000
    self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), subject))
    self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), subject).span(),
                     (0, 100000))
    self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), subject))
    self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
    self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
    self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1711
def test_backref_group_name_in_exception(self):
    # Issue 17341: Poor error message when compiling invalid regex
    # The message must quote the offending group name of the
    # (?P=...) backreference and report position 4.
    self.checkPatternError('(?P=<foo>)',
                           "bad character in group name '<foo>'", 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001716
def test_group_name_in_exception(self):
    # Issue 17341: Poor error message when compiling invalid regex
    # The message must quote the offending name of the (?P<...>) group
    # definition and report position 4.
    self.checkPatternError('(?P<?foo>)',
                           "bad character in group name '?foo'", 4)
R David Murray26dfaac92013-04-14 13:00:54 -04001721
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001722 def test_issue17998(self):
1723 for reps in '*', '+', '?', '{1}':
1724 for mod in '', '?':
1725 pattern = '.' + reps + mod + 'yz'
1726 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1727 ['xyz'], msg=pattern)
1728 pattern = pattern.encode()
1729 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1730 [b'xyz'], msg=pattern)
1731
def test_match_repr(self):
    # Check repr() of match objects: it names the match type and shows
    # the span and the matched text, for str and for bytes-like subjects
    # (including the str/bytes subclasses S and B defined in this file).
    for subject in '[abracadabra]', S('[abracadabra]'):
        m = re.search(r'(.+)(.*?)\1', subject)
        self.assertEqual(repr(m), "<%s.%s object; "
                                  "span=(1, 12), match='abracadabra'>" %
                         (type(m).__module__, type(m).__qualname__))
    for subject in (b'[abracadabra]', B(b'[abracadabra]'),
                    bytearray(b'[abracadabra]'),
                    memoryview(b'[abracadabra]')):
        m = re.search(br'(.+)(.*?)\1', subject)
        self.assertEqual(repr(m), "<%s.%s object; "
                                  "span=(1, 12), match=b'abracadabra'>" %
                         (type(m).__module__, type(m).__qualname__))

    # Two matches produced by the same finditer() call; each expected
    # repr is built from the type of the match it describes.
    first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
    self.assertEqual(repr(first), "<%s.%s object; "
                                  "span=(0, 2), match='aa'>" %
                     (type(first).__module__, type(first).__qualname__))
    self.assertEqual(repr(second), "<%s.%s object; "
                                   "span=(3, 5), match='bb'>" %
                     (type(second).__module__, type(second).__qualname__))
1753
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001754
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001755 def test_bug_2537(self):
1756 # issue 2537: empty submatches
1757 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1758 for inner_op in ('{0,}', '*', '?'):
1759 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1760 m = r.match("xyyzy")
1761 self.assertEqual(m.group(0), "xyy")
1762 self.assertEqual(m.group(1), "")
1763 self.assertEqual(m.group(2), "y")
1764
@cpython_only
def test_debug_flag(self):
    # Check the text that re.compile(..., re.DEBUG) prints to stdout: the
    # parsed pattern tree, a blank line, then the compiled code listing.
    pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
    with captured_stdout() as out:
        re.compile(pat, re.DEBUG)
    # Show the complete diff if the (long) dump does not match.
    self.maxDiff = None
    # NOTE: the literal is compared verbatim, so the leading whitespace of
    # every line below is significant.
    dump = '''\
SUBPATTERN 1 0 0
  LITERAL 46
BRANCH
  IN
    LITERAL 99
    LITERAL 104
OR
  LITERAL 112
  LITERAL 121
GROUPREF_EXISTS 1
  AT AT_END
ELSE
  LITERAL 58
  LITERAL 32

 0. INFO 8 0b1 2 5 (to 9)
      prefix_skip 0
      prefix [0x2e] ('.')
      overlap [0]
 9: MARK 0
11. LITERAL 0x2e ('.')
13. MARK 1
15. BRANCH 10 (to 26)
17.   IN 6 (to 24)
19.     LITERAL 0x63 ('c')
21.     LITERAL 0x68 ('h')
23.     FAILURE
24:   JUMP 9 (to 34)
26: branch 7 (to 33)
27.   LITERAL 0x70 ('p')
29.   LITERAL 0x79 ('y')
31.   JUMP 2 (to 34)
33: FAILURE
34: GROUPREF_EXISTS 0 6 (to 41)
37. AT END
39. JUMP 5 (to 45)
41: LITERAL 0x3a (':')
43. LITERAL 0x20 (' ')
45: SUCCESS
'''
    self.assertEqual(out.getvalue(), dump)
    # Debug output is output again even a second time (bypassing
    # the cache -- issue #20426).
    with captured_stdout() as out:
        re.compile(pat, re.DEBUG)
    self.assertEqual(out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001818
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001819 def test_keyword_parameters(self):
1820 # Issue #20283: Accepting the string keyword parameter.
1821 pat = re.compile(r'(ab)')
1822 self.assertEqual(
1823 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1824 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001825 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1826 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001827 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1828 self.assertEqual(
1829 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1830 self.assertEqual(
1831 pat.split(string='abracadabra', maxsplit=1),
1832 ['', 'ab', 'racadabra'])
1833 self.assertEqual(
1834 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1835 (7, 9))
1836
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001837 def test_bug_20998(self):
1838 # Issue #20998: Fullmatch of repeated single character pattern
1839 # with ignore case.
1840 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1841
def test_locale_caching(self):
    # Issue #22410
    # Remember the current LC_CTYPE locale and restore it on cleanup.
    oldlocale = locale.setlocale(locale.LC_CTYPE)
    self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
    # Both locales are required; skip the test if either is missing.
    for loc in 'en_US.iso88591', 'en_US.utf8':
        try:
            locale.setlocale(locale.LC_CTYPE, loc)
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs %s locale' % loc)

    # re.purge() clears the regular expression cache.  Between two purges
    # the same patterns are matched under one locale and then the other,
    # in both orders; each check helper switches LC_CTYPE itself.
    re.purge()
    self.check_en_US_iso88591()
    self.check_en_US_utf8()
    re.purge()
    self.check_en_US_utf8()
    self.check_en_US_iso88591()
1859
def check_en_US_iso88591(self):
    # Switch LC_CTYPE to en_US.iso88591 and check that, with LOCALE and
    # IGNORECASE, the bytes 0xC5 and 0xE5 match each other.  The flags are
    # given both as arguments and inline as '(?Li)'.
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
    self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))
    self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I))
    self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
    self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5'))
    self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5'))
1868
def check_en_US_utf8(self):
    # Switch LC_CTYPE to en_US.utf8 and check that, with LOCALE and
    # IGNORECASE, the bytes 0xC5 and 0xE5 match themselves but not each
    # other.  The flags are given both as arguments and inline as '(?Li)'.
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
    self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I))
    self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I))
    self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
    self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
    self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
1877
Serhiy Storchaka898ff032017-05-05 08:53:40 +03001878 def test_locale_compiled(self):
1879 oldlocale = locale.setlocale(locale.LC_CTYPE)
1880 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1881 for loc in 'en_US.iso88591', 'en_US.utf8':
1882 try:
1883 locale.setlocale(locale.LC_CTYPE, loc)
1884 except locale.Error:
1885 # Unsupported locale on this system
1886 self.skipTest('test needs %s locale' % loc)
1887
1888 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
1889 p1 = re.compile(b'\xc5\xe5', re.L|re.I)
1890 p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
1891 p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
1892 p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
1893 for p in p1, p2, p3:
1894 self.assertTrue(p.match(b'\xc5\xe5'))
1895 self.assertTrue(p.match(b'\xe5\xe5'))
1896 self.assertTrue(p.match(b'\xc5\xc5'))
1897 self.assertIsNone(p4.match(b'\xe5\xc5'))
1898 self.assertIsNone(p4.match(b'\xe5\xe5'))
1899 self.assertIsNone(p4.match(b'\xc5\xc5'))
1900
1901 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
1902 for p in p1, p2, p3:
1903 self.assertTrue(p.match(b'\xc5\xe5'))
1904 self.assertIsNone(p.match(b'\xe5\xe5'))
1905 self.assertIsNone(p.match(b'\xc5\xc5'))
1906 self.assertTrue(p4.match(b'\xe5\xc5'))
1907 self.assertIsNone(p4.match(b'\xe5\xe5'))
1908 self.assertIsNone(p4.match(b'\xc5\xc5'))
1909
Serhiy Storchakaad446d52014-11-10 13:49:00 +02001910 def test_error(self):
1911 with self.assertRaises(re.error) as cm:
1912 re.compile('(\u20ac))')
1913 err = cm.exception
1914 self.assertIsInstance(err.pattern, str)
1915 self.assertEqual(err.pattern, '(\u20ac))')
1916 self.assertEqual(err.pos, 3)
1917 self.assertEqual(err.lineno, 1)
1918 self.assertEqual(err.colno, 4)
1919 self.assertIn(err.msg, str(err))
1920 self.assertIn(' at position 3', str(err))
1921 self.assertNotIn(' at position 3', err.msg)
1922 # Bytes pattern
1923 with self.assertRaises(re.error) as cm:
1924 re.compile(b'(\xa4))')
1925 err = cm.exception
1926 self.assertIsInstance(err.pattern, bytes)
1927 self.assertEqual(err.pattern, b'(\xa4))')
1928 self.assertEqual(err.pos, 3)
1929 # Multiline pattern
1930 with self.assertRaises(re.error) as cm:
1931 re.compile("""
1932 (
1933 abc
1934 )
1935 )
1936 (
1937 """, re.VERBOSE)
1938 err = cm.exception
1939 self.assertEqual(err.pos, 77)
1940 self.assertEqual(err.lineno, 5)
1941 self.assertEqual(err.colno, 17)
1942 self.assertIn(err.msg, str(err))
1943 self.assertIn(' at position 77', str(err))
1944 self.assertIn('(line 5, column 17)', str(err))
1945
def test_misc_errors(self):
    # Check the error message and the reported position for a range of
    # malformed patterns (unbalanced parentheses, truncated or unknown
    # extensions, unknown flags, unterminated comments).
    self.checkPatternError(r'(', 'missing ), unterminated subpattern', 0)
    self.checkPatternError(r'((a|b)', 'missing ), unterminated subpattern', 0)
    self.checkPatternError(r'(a|b))', 'unbalanced parenthesis', 5)
    self.checkPatternError(r'(?P', 'unexpected end of pattern', 3)
    self.checkPatternError(r'(?z)', 'unknown extension ?z', 1)
    self.checkPatternError(r'(?iz)', 'unknown flag', 3)
    self.checkPatternError(r'(?i', 'missing -, : or )', 3)
    self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0)
    self.checkPatternError(r'(?<', 'unexpected end of pattern', 3)
    self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1)
    self.checkPatternError(r'(?', 'unexpected end of pattern', 2)
1958
Victor Stinner8bf43e62016-11-14 12:38:43 +01001959 def test_enum(self):
1960 # Issue #28082: Check that str(flag) returns a human readable string
1961 # instead of an integer
1962 self.assertIn('ASCII', str(re.A))
1963 self.assertIn('DOTALL', str(re.S))
1964
Victor Stinnerb44fb122016-11-21 16:35:08 +01001965 def test_pattern_compare(self):
1966 pattern1 = re.compile('abc', re.IGNORECASE)
1967
Victor Stinnerbcf4dcc2016-11-22 15:30:38 +01001968 # equal to itself
1969 self.assertEqual(pattern1, pattern1)
1970 self.assertFalse(pattern1 != pattern1)
1971
Victor Stinnerb44fb122016-11-21 16:35:08 +01001972 # equal
1973 re.purge()
1974 pattern2 = re.compile('abc', re.IGNORECASE)
1975 self.assertEqual(hash(pattern2), hash(pattern1))
1976 self.assertEqual(pattern2, pattern1)
1977
1978 # not equal: different pattern
1979 re.purge()
1980 pattern3 = re.compile('XYZ', re.IGNORECASE)
1981 # Don't test hash(pattern3) != hash(pattern1) because there is no
1982 # warranty that hash values are different
1983 self.assertNotEqual(pattern3, pattern1)
1984
1985 # not equal: different flag (flags=0)
1986 re.purge()
1987 pattern4 = re.compile('abc')
1988 self.assertNotEqual(pattern4, pattern1)
1989
1990 # only == and != comparison operators are supported
1991 with self.assertRaises(TypeError):
1992 pattern1 < pattern2
1993
1994 def test_pattern_compare_bytes(self):
1995 pattern1 = re.compile(b'abc')
1996
1997 # equal: test bytes patterns
1998 re.purge()
1999 pattern2 = re.compile(b'abc')
2000 self.assertEqual(hash(pattern2), hash(pattern1))
2001 self.assertEqual(pattern2, pattern1)
2002
2003 # not equal: pattern of a different types (str vs bytes),
2004 # comparison must not raise a BytesWarning
2005 re.purge()
2006 pattern3 = re.compile('abc')
2007 with warnings.catch_warnings():
2008 warnings.simplefilter('error', BytesWarning)
2009 self.assertNotEqual(pattern3, pattern1)
2010
Serhiy Storchaka7e10dbb2017-02-04 22:53:57 +02002011 def test_bug_29444(self):
2012 s = bytearray(b'abcdefgh')
2013 m = re.search(b'[a-h]+', s)
2014 m2 = re.search(b'[e-h]+', s)
2015 self.assertEqual(m.group(), b'abcdefgh')
2016 self.assertEqual(m2.group(), b'efgh')
2017 s[:] = b'xyz'
2018 self.assertEqual(m.group(), b'xyz')
2019 self.assertEqual(m2.group(), b'')
2020
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02002021
class PatternReprTests(unittest.TestCase):
    """Tests for repr() of compiled pattern objects."""

    def check(self, pattern, expected):
        # Compile *pattern* without explicit flags and compare its repr.
        self.assertEqual(repr(re.compile(pattern)), expected)

    def check_flags(self, pattern, flags, expected):
        # Compile *pattern* with *flags* and compare its repr.
        self.assertEqual(repr(re.compile(pattern, flags)), expected)

    def test_without_flags(self):
        self.check('random pattern',
                   "re.compile('random pattern')")

    def test_single_flag(self):
        self.check_flags('random pattern', re.IGNORECASE,
                         "re.compile('random pattern', re.IGNORECASE)")

    def test_multiple_flags(self):
        self.check_flags('random pattern', re.I|re.S|re.X,
                         "re.compile('random pattern', "
                         "re.IGNORECASE|re.DOTALL|re.VERBOSE)")

    def test_unicode_flag(self):
        # re.UNICODE is not shown in the repr of a str pattern.
        self.check_flags('random pattern', re.U,
                         "re.compile('random pattern')")
        self.check_flags('random pattern', re.I|re.S|re.U,
                         "re.compile('random pattern', "
                         "re.IGNORECASE|re.DOTALL)")

    def test_inline_flags(self):
        # A flag set inside the pattern is reported like an explicit one.
        self.check('(?i)pattern',
                   "re.compile('(?i)pattern', re.IGNORECASE)")

    def test_unknown_flags(self):
        # Bits without a flag name are shown in hexadecimal, after the
        # named flags.
        self.check_flags('random pattern', 0x123000,
                         "re.compile('random pattern', 0x123000)")
        self.check_flags('random pattern', 0x123000|re.I,
                         "re.compile('random pattern', re.IGNORECASE|0x123000)")

    def test_bytes(self):
        self.check(b'bytes pattern',
                   "re.compile(b'bytes pattern')")
        self.check_flags(b'bytes pattern', re.A,
                         "re.compile(b'bytes pattern', re.ASCII)")

    def test_locale(self):
        self.check_flags(b'bytes pattern', re.L,
                         "re.compile(b'bytes pattern', re.LOCALE)")

    def test_quotes(self):
        self.check('random "double quoted" pattern',
                   '''re.compile('random "double quoted" pattern')''')
        self.check("random 'single quoted' pattern",
                   '''re.compile("random 'single quoted' pattern")''')
        self.check('''both 'single' and "double" quotes''',
                   '''re.compile('both \\'single\\' and "double" quotes')''')

    def test_long_pattern(self):
        # The repr of a very long pattern is shortened but keeps the start
        # of the pattern and the flags at the end.
        pattern = 'Very %spattern' % ('long ' * 1000)
        r = repr(re.compile(pattern))
        self.assertLess(len(r), 300)
        self.assertEqual(r[:30], "re.compile('Very long long lon")
        r = repr(re.compile(pattern, re.I))
        self.assertLess(len(r), 300)
        self.assertEqual(r[:30], "re.compile('Very long long lon")
        self.assertEqual(r[-16:], ", re.IGNORECASE)")
2086
2087
class ImplementationTest(unittest.TestCase):
    """
    Test implementation details of the re module.
    """

    def test_overlap_table(self):
        # Check the table built by sre_compile._generate_overlap_table for
        # the empty prefix, a single character, prefixes without repeats
        # and prefixes with repeated parts.
        f = sre_compile._generate_overlap_table
        self.assertEqual(f(""), [])
        self.assertEqual(f("a"), [0])
        self.assertEqual(f("abcd"), [0, 0, 0, 0])
        self.assertEqual(f("aaaa"), [0, 1, 2, 3])
        self.assertEqual(f("ababba"), [0, 0, 1, 2, 0, 1])
        self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0])
2101
2102
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002103class ExternalTests(unittest.TestCase):
Guido van Rossum8e0ce301997-07-11 19:34:44 +00002104
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002105 def test_re_benchmarks(self):
2106 're_tests benchmarks'
2107 from test.re_tests import benchmarks
2108 for pattern, s in benchmarks:
2109 with self.subTest(pattern=pattern, string=s):
2110 p = re.compile(pattern)
2111 self.assertTrue(p.search(s))
2112 self.assertTrue(p.match(s))
2113 self.assertTrue(p.fullmatch(s))
2114 s2 = ' '*10000 + s + ' '*10000
2115 self.assertTrue(p.search(s2))
2116 self.assertTrue(p.match(s2, 10000))
2117 self.assertTrue(p.match(s2, 10000, 10000 + len(s)))
2118 self.assertTrue(p.fullmatch(s2, 10000, 10000 + len(s)))
Skip Montanaro8ed06da2003-04-24 19:43:18 +00002119
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002120 def test_re_tests(self):
2121 're_tests test suite'
2122 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
2123 for t in tests:
2124 pattern = s = outcome = repl = expected = None
2125 if len(t) == 5:
2126 pattern, s, outcome, repl, expected = t
2127 elif len(t) == 3:
2128 pattern, s, outcome = t
Guido van Rossum41360a41998-03-26 19:42:58 +00002129 else:
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002130 raise ValueError('Test tuples should have 3 or 5 fields', t)
2131
2132 with self.subTest(pattern=pattern, string=s):
2133 if outcome == SYNTAX_ERROR: # Expected a syntax error
2134 with self.assertRaises(re.error):
2135 re.compile(pattern)
2136 continue
2137
2138 obj = re.compile(pattern)
Skip Montanaro8ed06da2003-04-24 19:43:18 +00002139 result = obj.search(s)
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002140 if outcome == FAIL:
2141 self.assertIsNone(result, 'Succeeded incorrectly')
2142 continue
2143
2144 with self.subTest():
2145 self.assertTrue(result, 'Failed incorrectly')
Skip Montanaro8ed06da2003-04-24 19:43:18 +00002146 # Matched, as expected, so now we compute the
2147 # result string and compare it to our expected result.
2148 start, end = result.span(0)
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002149 vardict = {'found': result.group(0),
2150 'groups': result.group(),
2151 'flags': result.re.flags}
Skip Montanaro8ed06da2003-04-24 19:43:18 +00002152 for i in range(1, 100):
2153 try:
2154 gi = result.group(i)
2155 # Special hack because else the string concat fails:
2156 if gi is None:
2157 gi = "None"
2158 except IndexError:
2159 gi = "Error"
2160 vardict['g%d' % i] = gi
2161 for i in result.re.groupindex.keys():
2162 try:
2163 gi = result.group(i)
2164 if gi is None:
2165 gi = "None"
2166 except IndexError:
2167 gi = "Error"
2168 vardict[i] = gi
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002169 self.assertEqual(eval(repl, vardict), expected,
2170 'grouping error')
Skip Montanaro8ed06da2003-04-24 19:43:18 +00002171
Antoine Pitrou22628c42008-07-22 17:53:22 +00002172 # Try the match with both pattern and string converted to
2173 # bytes, and check that it still succeeds.
Skip Montanaro8ed06da2003-04-24 19:43:18 +00002174 try:
Antoine Pitrou22628c42008-07-22 17:53:22 +00002175 bpat = bytes(pattern, "ascii")
2176 bs = bytes(s, "ascii")
2177 except UnicodeEncodeError:
2178 # skip non-ascii tests
2179 pass
2180 else:
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002181 with self.subTest('bytes pattern match'):
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02002182 obj = re.compile(bpat)
2183 self.assertTrue(obj.search(bs))
2184
2185 # Try the match with LOCALE enabled, and check that it
2186 # still succeeds.
2187 with self.subTest('locale-sensitive match'):
2188 obj = re.compile(bpat, re.LOCALE)
2189 result = obj.search(bs)
2190 if result is None:
2191 print('=== Fails on locale-sensitive match', t)
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00002192
Skip Montanaro8ed06da2003-04-24 19:43:18 +00002193 # Try the match with the search area limited to the extent
2194 # of the match and see if it still succeeds. \B will
2195 # break (because it won't match at the end or start of a
2196 # string), so we'll ignore patterns that feature it.
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002197 if (pattern[:2] != r'\B' and pattern[-2:] != r'\B'
2198 and result is not None):
2199 with self.subTest('range-limited match'):
2200 obj = re.compile(pattern)
2201 self.assertTrue(obj.search(s, start, end + 1))
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00002202
Skip Montanaro8ed06da2003-04-24 19:43:18 +00002203 # Try the match with IGNORECASE enabled, and check that it
2204 # still succeeds.
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002205 with self.subTest('case-insensitive match'):
2206 obj = re.compile(pattern, re.IGNORECASE)
2207 self.assertTrue(obj.search(s))
Guido van Rossumdfa67901997-12-08 17:12:06 +00002208
Skip Montanaro8ed06da2003-04-24 19:43:18 +00002209 # Try the match with UNICODE locale enabled, and check
2210 # that it still succeeds.
Serhiy Storchaka9cba9892014-12-01 11:06:45 +02002211 with self.subTest('unicode-sensitive match'):
2212 obj = re.compile(pattern, re.UNICODE)
2213 self.assertTrue(obj.search(s))
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00002214
Gregory P. Smith5a631832010-07-27 05:31:29 +00002215
# Run every test in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()