blob: ae314841c68422eb40a705fc3e7857968e622afb [file] [log] [blame]
Serhiy Storchakae9277572014-11-10 12:37:02 +02001# -*- coding: utf-8 -*-
Serhiy Storchaka4809d1f2015-02-21 12:08:36 +02002from test.test_support import (
3 verbose, run_unittest, import_module,
4 precisionbigmemtest, _2G, cpython_only,
5 captured_stdout, have_unicode, requires_unicode, u,
Serhiy Storchaka955b6762017-05-18 12:34:40 +03006 check_warnings, check_py3k_warnings)
Serhiy Storchakad4c72902014-10-31 00:53:19 +02007import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00008import re
Neal Norwitz94a9c092006-03-16 06:30:02 +00009from re import Scanner
R David Murray60773392013-04-14 13:08:50 -040010import sre_constants
Ezio Melotti46645632011-03-25 14:50:52 +020011import sys
12import string
13import traceback
Raymond Hettinger027bb632004-05-31 03:09:25 +000014from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000015
Antoine Pitrou735f36e2012-12-03 20:53:12 +010016
Guido van Rossum23b22571997-07-17 22:36:14 +000017# Misc tests from Tim Peters' re.doc
18
Just van Rossum6802c6e2003-07-02 14:36:59 +000019# WARNING: Don't change details in these tests if you don't know
Ezio Melotti24b07bc2011-03-15 18:55:01 +020020# what you're doing. Some of these tests were carefully modeled to
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +000021# cover most of the code.
22
Skip Montanaro8ed06da2003-04-24 19:43:18 +000023import unittest
Guido van Rossum8430c581998-04-03 21:47:12 +000024
Skip Montanaro8ed06da2003-04-24 19:43:18 +000025class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000026
27 def test_weakref(self):
28 s = 'QabbbcR'
29 x = re.compile('ab+c')
30 y = proxy(x)
31 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
32
Skip Montanaro8ed06da2003-04-24 19:43:18 +000033 def test_search_star_plus(self):
34 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
35 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
36 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
37 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +030038 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000039 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
40 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
41 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
42 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +030043 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000044
def bump_num(self, matchobj):
    """Return the whole match, read as an integer and incremented, as a str."""
    return str(int(matchobj.group(0)) + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000048
def test_basic_re_sub(self):
    """Exercise re.sub() with string, callable and escaped replacements."""
    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    # With a count of 3 only the first three numbers are bumped.
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                     '9.3 -3 23x99y')

    # A callable's return value is inserted verbatim, whereas a string
    # template has its backslash escapes processed.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', s.replace('\\', r'\\'), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    # Raw templates: the same characters as the former non-raw literals,
    # without depending on Python passing the unknown escape ``\g`` through.
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    # The first template is raw (escapes decoded by re); the next two are
    # deliberately NOT raw: they already contain the control characters.
    self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
    # Backslash + one of these letters is expected to come through
    # unchanged in the result.
    for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
        with check_py3k_warnings():
            self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)

    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +000078
Skip Montanaro2726fcd2003-04-25 14:31:54 +000079 def test_bug_449964(self):
80 # fails for group followed by other escape
81 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
82 'xx\bxx\b')
83
84 def test_bug_449000(self):
85 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +000086 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
87 'abc\ndef\n')
88 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
89 'abc\ndef\n')
90 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
91 'abc\ndef\n')
92 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
93 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +000094
@requires_unicode
def test_bug_1140(self):
    """The result type of re.sub() follows the types of its arguments.

    re.sub(x, y, u'') should return u'', not '', and
    re.sub(x, y, '') should return '', not u''.
    Also, re.sub(x, y, unicode(x)) should return unicode(y), and
    re.sub(x, y, str(x)) should return str(y) if isinstance(y, str)
    else unicode(y).
    """
    combos = [(x, y) for x in ('x', u'x') for y in ('y', u'y')]
    for x, y in combos:
        from_empty_unicode = re.sub(x, y, u'')
        self.assertEqual(from_empty_unicode, u'')
        self.assertEqual(type(from_empty_unicode), unicode)

        from_empty_str = re.sub(x, y, '')
        self.assertEqual(from_empty_str, '')
        self.assertEqual(type(from_empty_str), str)

        from_unicode = re.sub(x, y, unicode(x))
        self.assertEqual(from_unicode, y)
        self.assertEqual(type(from_unicode), unicode)

        from_str = re.sub(x, y, str(x))
        self.assertEqual(from_str, y)
        self.assertEqual(type(from_str), type(y))
120
Raymond Hettinger80016c92007-12-19 18:13:31 +0000121 def test_bug_1661(self):
122 # Verify that flags do not get silently ignored with compiled patterns
123 pattern = re.compile('.')
124 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
125 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
126 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
127 self.assertRaises(ValueError, re.compile, pattern, re.I)
128
Guido van Rossume3c4fd92008-09-10 14:27:00 +0000129 def test_bug_3629(self):
130 # A regex that triggered a bug in the sre-code validator
131 re.compile("(?P<quote>)(?(quote))")
132
def test_sub_template_numeric_escape(self):
    """Numeric escapes in sub() templates (bug 776311 and friends)."""
    # Templates expected to expand as octal escapes, paired with the
    # text they must produce.
    octal_cases = [
        (r'\0', '\0'),
        (r'\000', '\000'),
        (r'\001', '\001'),
        (r'\008', '\0' + '8'),
        (r'\009', '\0' + '9'),
        (r'\111', '\111'),
        (r'\117', '\117'),

        (r'\1111', '\1111'),
        (r'\1111', '\111' + '1'),

        (r'\00', '\x00'),
        (r'\07', '\x07'),
        (r'\08', '\0' + '8'),
        (r'\09', '\0' + '9'),
        (r'\0a', '\0' + 'a'),

        (r'\400', '\0'),
        (r'\777', '\377'),
    ]
    for template, expected in octal_cases:
        self.assertEqual(re.sub('x', template, 'x'), expected)

    # Templates expected to fail against the group-less pattern 'x'.
    rejected = [
        r'\1',
        r'\8',
        r'\9',
        r'\11',
        r'\18',
        r'\1a',
        r'\90',
        r'\99',
        r'\118',  # r'\11' + '8'
        r'\11a',
        r'\181',  # r'\18' + '1'
        r'\800',  # r'\80' + '0'
    ]
    for template in rejected:
        self.assertRaises(re.error, re.sub, 'x', template, 'x')

    # in python2.3 (etc), these loop endlessly in sre_parser.py
    eleven_groups = '(((((((((((x)))))))))))'
    ten_groups_then_one = '((((((((((y))))))))))(.)'
    self.assertEqual(re.sub(eleven_groups, r'\11', 'x'), 'x')
    self.assertEqual(re.sub(ten_groups_then_one, r'\118', 'xyz'), 'xz8')
    self.assertEqual(re.sub(ten_groups_then_one, r'\11a', 'xyz'), 'xza')
174
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000175 def test_qualified_re_sub(self):
176 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
177 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000178
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000179 def test_bug_114660(self):
180 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
181 'hello there')
182
def test_bug_462270(self):
    """Empty-match behaviour of sub(), see SF bug #462270."""
    # (pattern, expected result) for the subject 'abxd'
    for pattern, expected in (('x*', '-a-b-d-'), ('x+', 'ab-d')):
        self.assertEqual(re.sub(pattern, '-', 'abxd'), expected)
187
Ezio Melottief317382012-11-03 20:31:12 +0200188 def test_symbolic_groups(self):
189 re.compile('(?P<a>x)(?P=a)(?(a)y)')
190 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
191 self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
192 self.assertRaises(re.error, re.compile, '(?Px)')
193 self.assertRaises(re.error, re.compile, '(?P=)')
194 self.assertRaises(re.error, re.compile, '(?P=1)')
195 self.assertRaises(re.error, re.compile, '(?P=a)')
196 self.assertRaises(re.error, re.compile, '(?P=a1)')
197 self.assertRaises(re.error, re.compile, '(?P=a.)')
198 self.assertRaises(re.error, re.compile, '(?P<)')
199 self.assertRaises(re.error, re.compile, '(?P<>)')
200 self.assertRaises(re.error, re.compile, '(?P<1>)')
201 self.assertRaises(re.error, re.compile, '(?P<a.>)')
202 self.assertRaises(re.error, re.compile, '(?())')
203 self.assertRaises(re.error, re.compile, '(?(a))')
204 self.assertRaises(re.error, re.compile, '(?(1a))')
205 self.assertRaises(re.error, re.compile, '(?(a.))')
206
def test_symbolic_refs(self):
    """Malformed or unusable group references in a sub() template raise.

    The templates are raw strings: they contain the same characters as
    the former non-raw literals but no longer rely on Python leaving the
    invalid escape sequence ``\\g`` untouched.
    """
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<a', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<a a>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<1a1>', 'xx')
    # A well-formed but unknown name is expected to raise IndexError.
    self.assertRaises(IndexError, re.sub, '(?P<a>x)', r'\g<ab>', 'xx')
    # Group b exists in the pattern, but only the x branch can match 'xx'.
    self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', r'\2', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<-1>', 'xx')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000218
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000219 def test_re_subn(self):
220 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
221 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
222 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
223 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
224 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000225
def test_re_split(self):
    """split() with plain, grouping and possibly-empty separators."""
    subject = ":a:b::c"

    # Separators that always consume at least one character.
    plain_cases = [
        (":", ['', 'a', 'b', '', 'c']),
        (":+", ['', 'a', 'b', 'c']),
        ("(:+)", ['', ':', 'a', ':', 'b', '::', 'c']),
        ("(?::+)", ['', 'a', 'b', 'c']),
        ("(:)+", ['', ':', 'a', ':', 'b', ':', 'c']),
        ("([b:]+)", ['', ':', 'a', ':b::', 'c']),
        ("(b)|(:+)", ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c']),
        ("(?:b)|(?::+)", ['', 'a', '', '', 'c']),
    ]
    for sep, expected in plain_cases:
        self.assertEqual(re.split(sep, subject), expected)

    # Separators that can also match the empty string; split under a
    # FutureWarning filter.
    may_be_empty = [
        (':*', ['', 'a', 'b', 'c']),
        ('(?::*)', ['', 'a', 'b', 'c']),
        ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
        ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
    ]
    for sep, expected in may_be_empty:
        with check_py3k_warnings(('', FutureWarning)):
            self.assertEqual(re.split(sep, ':a:b::c'), expected)

    # Separators that only ever match the empty string leave the subject
    # as a single piece.
    always_empty = ['', r'\b', r'(?=:)', r'(?<=:)']
    for sep in always_empty:
        with check_py3k_warnings():
            self.assertEqual(re.split(sep, ':a:b::c'), [':a:b::c'])
259
def test_qualified_re_split(self):
    """split() stops after maxsplit splits and keeps the rest intact."""
    # (separator, subject, expected) with a positional maxsplit of 2
    limited = [
        (":", ":a:b::c", ['', 'a', 'b::c']),
        (':', 'a:b:c:d', ['a', 'b', 'c:d']),
        ("(:)", ":a:b::c", ['', ':', 'a', ':', 'b::c']),
        ("(:+)", ":a:b::c", ['', ':', 'a', ':', 'b::c']),
    ]
    for sep, subject, expected in limited:
        self.assertEqual(re.split(sep, subject, 2), expected)

    # Same limit passed by keyword, with a separator that may match the
    # empty string; split under a FutureWarning filter.
    with check_py3k_warnings(('', FutureWarning)):
        pieces = re.split("(:*)", ":a:b::c", maxsplit=2)
        self.assertEqual(pieces, ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000270
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000271 def test_re_findall(self):
272 self.assertEqual(re.findall(":+", "abc"), [])
273 self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
274 self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
275 self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""),
276 (":", ":"),
277 (":", "::")])
Guido van Rossum49946571997-07-18 04:26:25 +0000278
Skip Montanaro5ba00542003-04-25 16:00:14 +0000279 def test_bug_117612(self):
280 self.assertEqual(re.findall(r"(a|(b))", "aba"),
281 [("a", ""),("b", "b"),("a", "")])
282
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000283 def test_re_match(self):
Skip Montanaro5ba00542003-04-25 16:00:14 +0000284 self.assertEqual(re.match('a', 'a').groups(), ())
285 self.assertEqual(re.match('(a)', 'a').groups(), ('a',))
286 self.assertEqual(re.match(r'(a)', 'a').group(0), 'a')
287 self.assertEqual(re.match(r'(a)', 'a').group(1), 'a')
288 self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a'))
Guido van Rossum49946571997-07-18 04:26:25 +0000289
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000290 pat = re.compile('((a)|(b))(c)?')
291 self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
292 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
293 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
294 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
295 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
Guido van Rossum8430c581998-04-03 21:47:12 +0000296
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000297 # A single group
298 m = re.match('(a)', 'a')
299 self.assertEqual(m.group(0), 'a')
300 self.assertEqual(m.group(0), 'a')
301 self.assertEqual(m.group(1), 'a')
302 self.assertEqual(m.group(1, 1), ('a', 'a'))
Guido van Rossum49946571997-07-18 04:26:25 +0000303
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000304 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
305 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
306 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
307 (None, 'b', None))
308 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000309
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000310 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000311 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
312 ('(', 'a'))
313 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
314 (None, 'a'))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300315 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
316 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000317 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
318 ('a', 'b'))
319 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
320 (None, 'd'))
321 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
322 (None, 'd'))
323 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
324 ('a', ''))
325
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000326 # Tests for bug #1177831: exercise groups other than the first group
327 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
328 self.assertEqual(p.match('abc').groups(),
329 ('a', 'b', 'c'))
330 self.assertEqual(p.match('ad').groups(),
331 ('a', None, 'd'))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300332 self.assertIsNone(p.match('abd'))
333 self.assertIsNone(p.match('ac'))
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000334
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000335
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000336 def test_re_groupref(self):
337 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
338 ('|', 'a'))
339 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
340 (None, 'a'))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300341 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
342 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000343 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
344 ('a', 'a'))
345 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
346 (None, None))
347
348 def test_groupdict(self):
349 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
350 'first second').groupdict(),
351 {'first':'first', 'second':'second'})
352
353 def test_expand(self):
354 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
355 "first second")
356 .expand(r"\2 \1 \g<second> \g<first>"),
357 "second first second first")
358
359 def test_repeat_minmax(self):
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300360 self.assertIsNone(re.match("^(\w){1}$", "abc"))
361 self.assertIsNone(re.match("^(\w){1}?$", "abc"))
362 self.assertIsNone(re.match("^(\w){1,2}$", "abc"))
363 self.assertIsNone(re.match("^(\w){1,2}?$", "abc"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000364
365 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
366 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
367 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
368 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
369 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
370 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
371 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
372 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
373
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300374 self.assertIsNone(re.match("^x{1}$", "xxx"))
375 self.assertIsNone(re.match("^x{1}?$", "xxx"))
376 self.assertIsNone(re.match("^x{1,2}$", "xxx"))
377 self.assertIsNone(re.match("^x{1,2}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000378
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300379 self.assertTrue(re.match("^x{3}$", "xxx"))
380 self.assertTrue(re.match("^x{1,3}$", "xxx"))
381 self.assertTrue(re.match("^x{1,4}$", "xxx"))
382 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
383 self.assertTrue(re.match("^x{3}?$", "xxx"))
384 self.assertTrue(re.match("^x{1,3}?$", "xxx"))
385 self.assertTrue(re.match("^x{1,4}?$", "xxx"))
386 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000387
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300388 self.assertIsNone(re.match("^x{}$", "xxx"))
389 self.assertTrue(re.match("^x{}$", "x{}"))
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000390
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000391 def test_getattr(self):
392 self.assertEqual(re.match("(a)", "a").pos, 0)
393 self.assertEqual(re.match("(a)", "a").endpos, 1)
394 self.assertEqual(re.match("(a)", "a").string, "a")
395 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300396 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000397
def test_special_escapes(self):
    """Behaviour of the special escapes: word boundaries, string anchors
    and character classes, with the default, LOCALE and UNICODE flags."""
    boundary = r"\b(b.)\b"
    non_boundary = r"\B(b.)\B"

    # Default flags, then LOCALE, then UNICODE when available.
    flag_sets = [0, re.LOCALE]
    if have_unicode:
        flag_sets.append(re.UNICODE)

    for flags in flag_sets:
        self.assertEqual(
            re.search(boundary, "abcd abc bcd bx", flags).group(1), "bx")
        self.assertEqual(
            re.search(non_boundary, "abc bcd bc abxd", flags).group(1), "bx")

    # ^ and $ are affected by re.M, whereas \A and \Z still anchor to the
    # very start and end of the subject.
    self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))

    # The same checks against unicode subjects.
    self.assertEqual(re.search(boundary, u"abcd abc bcd bx").group(1), "bx")
    self.assertEqual(
        re.search(non_boundary, u"abc bcd bc abxd").group(1), "bx")
    self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M))

    # One of each character-class escape in sequence.
    for flags in flag_sets:
        self.assertEqual(
            re.search(r"\d\D\w\W\s\S", "1aa! a", flags).group(0), "1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000429
def test_other_escapes(self):
    """Escaped punctuation, and backslash-letter pairs with no special
    meaning, both outside and inside a character class."""
    # A lone backslash is not a valid pattern.
    self.assertRaises(re.error, re.compile, "\\")

    # (pattern, subject matched in full, subject rejected or None)
    punctuation = [
        (r"\(", '(', ')'),
        (r"\\", '\\', None),
        (r"[\]]", ']', '['),
        (r"[a\-c]", '-', 'b'),
        (r"[\^a]+", 'a^', 'b'),
    ]
    for pattern, accepted, refused in punctuation:
        self.assertEqual(re.match(pattern, accepted).group(), accepted)
        if refused is not None:
            self.assertIsNone(re.match(pattern, refused))

    re.purge()  # for warnings

    # Letters tried after a backslash: first bare, then inside [...].
    letter_sets = [
        ('ceghijklmopquyzCEFGHIJKLMNOPQRTUVXY', '\\%c'),
        ('ceghijklmopquyzABCEFGHIJKLMNOPQRTUVXYZ', '[\\%c]'),
    ]
    for letters, template in letter_sets:
        for c in letters:
            # 'U' and 'u' are checked against FutureWarning, every other
            # letter against DeprecationWarning.
            warn = FutureWarning if c in 'Uu' else DeprecationWarning
            with check_py3k_warnings(('', warn)):
                anchored = (template + '$') % c
                self.assertEqual(re.match(anchored, c).group(), c)
                self.assertIsNone(re.match(template % c, 'a'))
451 self.assertIsNone(re.match('[\\%c]' % c, 'a'))
452
Ezio Melotti38ae5b22012-02-29 11:40:00 +0200453 def test_string_boundaries(self):
454 # See http://bugs.python.org/issue10713
455 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
456 "abc")
457 # There's a word boundary at the start of a string.
458 self.assertTrue(re.match(r"\b", "abc"))
459 # A non-empty string includes a non-boundary zero-length match.
460 self.assertTrue(re.search(r"\B", "abc"))
461 # There is no non-boundary match at the start of a string.
462 self.assertFalse(re.match(r"\B", "abc"))
463 # However, an empty string contains no word boundaries, and also no
464 # non-boundaries.
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300465 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti38ae5b22012-02-29 11:40:00 +0200466 # This one is questionable and different from the perlre behaviour,
467 # but describes current behavior.
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300468 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti38ae5b22012-02-29 11:40:00 +0200469 # A single word-character string has two boundaries, but no
470 # non-boundary gaps.
471 self.assertEqual(len(re.findall(r"\b", "a")), 2)
472 self.assertEqual(len(re.findall(r"\B", "a")), 0)
473 # If there are no words, there are no boundaries
474 self.assertEqual(len(re.findall(r"\b", " ")), 0)
475 self.assertEqual(len(re.findall(r"\b", " ")), 0)
476 # Can match around the whitespace.
477 self.assertEqual(len(re.findall(r"\B", " ")), 2)
478
@requires_unicode
def test_bigcharset(self):
    """Character sets containing code points above the 8-bit range."""
    small_set = u(r"([\u2222\u2223])")
    target = unichr(0x2222)
    self.assertEqual(re.match(small_set, target).group(1), target)
    self.assertEqual(re.match(small_set, target, re.UNICODE).group(1),
                     target)

    # A class listing every 255th code point from 256 up to 2**16.
    members = u''.join(map(unichr, range(256, 2**16, 255)))
    r = u'[%s]' % members
    self.assertEqual(re.match(r, unichr(0xff01), re.UNICODE).group(),
                     unichr(0xff01))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000487
Antoine Pitroub83ea142012-11-20 22:30:42 +0100488 def test_big_codesize(self):
489 # Issue #1160
490 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300491 self.assertTrue(r.match('1000'))
492 self.assertTrue(r.match('9999'))
Antoine Pitroub83ea142012-11-20 22:30:42 +0100493
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000494 def test_anyall(self):
495 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
496 "a\nb")
497 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
498 "a\n\nb")
499
Serhiy Storchaka4809d1f2015-02-21 12:08:36 +0200500 def test_lookahead(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000501 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
502 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
503 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
504 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
505 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
506 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
507 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
508
509 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
510 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
511 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
512 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
513
Serhiy Storchaka4809d1f2015-02-21 12:08:36 +0200514 # Group reference.
515 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
516 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
517 # Named group reference.
518 self.assertTrue(re.match(r'(?P<g>a)b(?=(?P=g))a', 'aba'))
519 self.assertIsNone(re.match(r'(?P<g>a)b(?=(?P=g))c', 'abac'))
520 # Conditional group reference.
521 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
522 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
523 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
524 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
525 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
526 # Group used before defined.
527 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
528 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
529 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
530
def test_lookbehind(self):
    """Positive and negative lookbehind; group references inside a
    lookbehind are compiled under a RuntimeWarning check."""
    # (pattern, whether it should match 'abc')
    simple = [
        (r'ab(?<=b)c', True),
        (r'ab(?<=c)c', False),
        (r'ab(?<!b)c', False),
        (r'ab(?<!c)c', True),
    ]
    for pattern, should_match in simple:
        if should_match:
            self.assertTrue(re.match(pattern, 'abc'))
        else:
            self.assertIsNone(re.match(pattern, 'abc'))

    referencing = [
        r'(a)a(?<=\1)c',             # Group reference.
        r'(?P<g>a)a(?<=(?P=g))c',    # Named group reference.
        r'(a)b(?<=(?(1)b|x))c',      # Conditional group reference.
        r'(a)b(?<=(?(2)b|x))(c)',    # Group used before defined.
    ]
    for pattern in referencing:
        with check_warnings(('', RuntimeWarning)):
            re.compile(pattern)
548
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000549 def test_ignore_case(self):
Georg Brandl30de77b2008-08-24 18:11:07 +0000550 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
551 self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000552 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
553 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
554 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
555 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
556 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
557 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
558 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
559 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
560
Serhiy Storchakae9277572014-11-10 12:37:02 +0200561 if have_unicode:
562 assert u(r'\u212a').lower() == u'k' # 'K'
563 self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I))
564 self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I))
565 self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
566 self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
567 assert u(r'\u017f').upper() == u'S' # 'ſ'
568 self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I))
569 self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I))
570 self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
571 self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I))
572
def test_ignore_case_set(self):
    # IGNORECASE matching of characters listed inside a set.
    def check_all_match(cases, flags):
        for pattern, subject in cases:
            self.assertTrue(re.match(pattern, subject, flags))

    check_all_match([(r'[19A]', 'A'), (r'[19a]', 'a'),
                     (r'[19a]', 'A'), (r'[19A]', 'a')], re.I)
    if have_unicode:
        both = re.U | re.I
        check_all_match([(u'[19A]', u'A'), (u'[19a]', u'a'),
                         (u'[19a]', u'A'), (u'[19A]', u'a')], both)

        kelvin = u(r'\u212a')
        assert kelvin.lower() == u'k'  # KELVIN SIGN lowercases to 'k'
        check_all_match([(u(r'[19K]'), kelvin),
                         (u(r'[19k]'), kelvin),
                         (u(r'[19\u212a]'), u'K'),
                         (u(r'[19\u212a]'), u'k')], both)

        long_s = u(r'\u017f')
        assert long_s.upper() == u'S'  # LATIN SMALL LETTER LONG S uppercases to 'S'
        check_all_match([(u'[19S]', long_s),
                         (u'[19s]', long_s),
                         (u(r'[19\u017f]'), u'S'),
                         (u(r'[19\u017f]'), u's')], both)
593
def test_ignore_case_range(self):
    # Issues #3511, #17381: IGNORECASE matching of character ranges.
    def check(cases, flags):
        # Each case is (pattern, subject, whether a match is expected).
        for pattern, subject, expect_match in cases:
            found = re.match(pattern, subject, flags)
            if expect_match:
                self.assertTrue(found)
            else:
                self.assertIsNone(found)

    check([
        (r'[9-a]', '_', True),
        (r'[9-A]', '_', False),
        (r'[\xc0-\xde]', '\xd7', True),
        (r'[\xc0-\xde]', '\xf7', False),
        (r'[\xe0-\xfe]', '\xf7', True),
        (r'[\xe0-\xfe]', '\xd7', False),
    ], re.I)

    if have_unicode:
        both = re.U | re.I
        check([
            (u(r'[9-a]'), u(r'_'), True),
            (u(r'[9-A]'), u(r'_'), False),
            (u(r'[\xc0-\xde]'), u(r'\xd7'), True),
            (u(r'[\xc0-\xde]'), u(r'\xf7'), False),
            (u(r'[\xe0-\xfe]'), u(r'\xf7'), True),
            (u(r'[\xe0-\xfe]'), u(r'\xd7'), False),
            (u(r'[\u0430-\u045f]'), u(r'\u0450'), True),
            (u(r'[\u0430-\u045f]'), u(r'\u0400'), True),
            (u(r'[\u0400-\u042f]'), u(r'\u0450'), True),
            (u(r'[\u0400-\u042f]'), u(r'\u0400'), True),
        ], both)
        if sys.maxunicode > 0xffff:
            # Code points above U+FFFF are exercised only when
            # sys.maxunicode allows them.
            check([
                (u(r'[\U00010428-\U0001044f]'), u(r'\U00010428'), True),
                (u(r'[\U00010428-\U0001044f]'), u(r'\U00010400'), True),
                (u(r'[\U00010400-\U00010427]'), u(r'\U00010428'), True),
                (u(r'[\U00010400-\U00010427]'), u(r'\U00010400'), True),
            ], both)

        kelvin = u(r'\u212a')
        assert kelvin.lower() == u'k'  # KELVIN SIGN lowercases to 'k'
        check([
            (u'[J-M]', kelvin, True),
            (u'[j-m]', kelvin, True),
            (u(r'[\u2129-\u212b]'), u'K', True),
            (u(r'[\u2129-\u212b]'), u'k', True),
        ], both)

        long_s = u(r'\u017f')
        assert long_s.upper() == u'S'  # LATIN SMALL LETTER LONG S uppercases to 'S'
        check([
            (u'[R-T]', long_s, True),
            (u'[r-t]', long_s, True),
            (u(r'[\u017e-\u0180]'), u'S', True),
            (u(r'[\u017e-\u0180]'), u's', True),
        ], both)
641
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000642 def test_category(self):
643 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
644
def test_getlower(self):
    # _sre.getlower() maps 'A' to 'a' under each flag mode checked here.
    import _sre
    upper, lower = ord('A'), ord('a')
    modes = [0, re.LOCALE]
    if have_unicode:
        modes.append(re.UNICODE)
    for mode in modes:
        self.assertEqual(_sre.getlower(upper, mode), lower)

    for subject in ("ABC", u"ABC"):
        self.assertEqual(re.match("abc", subject, re.I).group(0), "ABC")
654
655 def test_not_literal(self):
656 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
657 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
658
659 def test_search_coverage(self):
660 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
661 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
662
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    # Assert that matcher(pattern, text) succeeds with the given matched
    # text and span.  When neither `match` nor `span` is given, the
    # pattern is expected to match the whole of `text`.  Supplying only
    # one of the two raises ValueError.
    if (match is None) != (span is None):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if match is None:
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    found = matcher(pattern, text)
    self.assertTrue(found)
    self.assertEqual(found.group(), match)
    self.assertEqual(found.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000676
@requires_unicode
def test_re_escape(self):
    # re.escape() on each of the first 256 code points, as unicode.
    alnum_chars = unicode(string.ascii_letters + string.digits)
    p = u''.join(map(unichr, range(256)))
    for c in p:
        if c in alnum_chars:
            expected = c
        elif c == u'\x00':
            expected = u'\\000'
        else:
            expected = u'\\' + c
        self.assertEqual(re.escape(c), expected)
        self.assertMatch(re.escape(c), c)
    self.assertMatch(re.escape(p), p)
690
def test_re_escape_byte(self):
    # re.escape() on each of the 256 single-byte strings.
    alnum_chars = string.ascii_letters + string.digits
    p = ''.join(map(chr, range(256)))
    for b in p:
        if b in alnum_chars:
            expected = b
        elif b == b'\x00':
            expected = b'\\000'
        else:
            expected = b'\\' + b
        self.assertEqual(re.escape(b), expected)
        self.assertMatch(re.escape(b), b)
    self.assertMatch(re.escape(p), p)
703
@requires_unicode
def test_re_escape_non_ascii(self):
    # re.escape() puts a backslash before each non-ASCII character and
    # the escaped pattern still matches the original text.
    text = u(r'xxx\u2620\u2620\u2620xxx')
    escaped = re.escape(text)
    self.assertEqual(escaped, u(r'xxx\\\u2620\\\u2620\\\u2620xxx'))
    self.assertMatch(escaped, text)
    run_pattern = u'.%s+.' % re.escape(unichr(0x2620))
    self.assertMatch(run_pattern, text,
                     u(r'x\u2620\u2620\u2620x'), (2, 7), re.search)
Ezio Melotti46645632011-03-25 14:50:52 +0200712
def test_re_escape_non_ascii_bytes(self):
    # re.escape() puts a backslash before each non-ASCII byte and the
    # escaped pattern still matches the original bytes.
    data = b'y\xe2\x98\xa0y\xe2\x98\xa0y'
    escaped = re.escape(data)
    self.assertEqual(escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
    self.assertMatch(escaped, data)
    occurrences = re.findall(re.escape(b'\xe2\x98\xa0'), data)
    self.assertEqual(len(occurrences), 2)
Guido van Rossum49946571997-07-18 04:26:25 +0000720
def test_pickling(self):
    # Run the pickle round-trip with both pickle implementations, in the
    # same order as before: pickle first, then cPickle.
    for module_name in ('pickle', 'cPickle'):
        self.pickle_test(__import__(module_name))
    # old pickles expect the _compile() reconstructor in sre module
    import_module("sre", deprecated=True)
    from sre import _compile
    # current pickle expects the _compile() reconstructor in re module
    from re import _compile
Skip Montanaro1e703c62003-04-25 15:40:28 +0000731
def pickle_test(self, pickle):
    # Round-trip a compiled pattern through every protocol offered by
    # the given pickle module and check it compares equal afterwards.
    original = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
    for protocol in range(pickle.HIGHEST_PROTOCOL + 1):
        restored = pickle.loads(pickle.dumps(original, protocol))
        self.assertEqual(restored, original)
Guido van Rossum23b22571997-07-17 22:36:14 +0000738
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000739 def test_constants(self):
740 self.assertEqual(re.I, re.IGNORECASE)
741 self.assertEqual(re.L, re.LOCALE)
742 self.assertEqual(re.M, re.MULTILINE)
743 self.assertEqual(re.S, re.DOTALL)
744 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000745
def test_flags(self):
    # A simple anchored pattern compiles under each individual flag.
    flags = (re.I, re.M, re.X, re.S, re.L)
    for flag in flags:
        compiled = re.compile('^pattern$', flag)
        self.assertTrue(compiled)
Guido van Rossumf473cb01998-01-14 16:42:17 +0000749
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000750 def test_sre_character_literals(self):
751 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300752 self.assertTrue(re.match(r"\%03o" % i, chr(i)))
753 self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
754 self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
755 self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
756 self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
757 self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000758 self.assertRaises(re.error, re.match, "\911", "")
759
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000760 def test_sre_character_class_literals(self):
761 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300762 self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
763 self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
764 self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
765 self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
766 self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
767 self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000768 self.assertRaises(re.error, re.match, "[\911]", "")
769
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000770 def test_bug_113254(self):
771 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
772 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
773 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
774
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000775 def test_bug_527371(self):
776 # bug described in patches 527371/672491
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300777 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000778 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
779 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
780 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
781 self.assertEqual(re.match("((a))", "a").lastindex, 1)
782
783 def test_bug_545855(self):
784 # bug 545855 -- This pattern failed to cause a compile error as it
785 # should, instead provoking a TypeError.
786 self.assertRaises(re.error, re.compile, 'foo[a-')
787
788 def test_bug_418626(self):
789 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
790 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
791 # pattern '*?' on a long string.
792 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
793 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
794 20003)
795 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000796 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +0000797 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000798 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000799
@requires_unicode
def test_bug_612074(self):
    # An escaped non-ASCII character inside a set must compile.
    pat = u"[%s]" % re.escape(unichr(0x2039))
    self.assertEqual(re.compile(pat) and 1, 1)
804
Skip Montanaro1e703c62003-04-25 15:40:28 +0000805 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000806 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +0000807 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000808 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
809 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
810 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +0000811
Serhiy Storchaka6a8e2b42013-02-16 21:23:01 +0200812 def test_unlimited_zero_width_repeat(self):
813 # Issue #9669
814 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
815 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
816 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
817 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
818 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
819 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
820
Skip Montanaro1e703c62003-04-25 15:40:28 +0000821 def test_scanner(self):
822 def s_ident(scanner, token): return token
823 def s_operator(scanner, token): return "op%s" % token
824 def s_float(scanner, token): return float(token)
825 def s_int(scanner, token): return int(token)
826
827 scanner = Scanner([
828 (r"[a-zA-Z_]\w*", s_ident),
829 (r"\d+\.\d*", s_float),
830 (r"\d+", s_int),
831 (r"=|\+|-|\*|/", s_operator),
832 (r"\s+", None),
833 ])
834
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300835 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000836
Skip Montanaro1e703c62003-04-25 15:40:28 +0000837 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
838 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
839 'op+', 'bar'], ''))
840
Skip Montanaro5ba00542003-04-25 16:00:14 +0000841 def test_bug_448951(self):
842 # bug 448951 (similar to 429357, but with single char match)
843 # (Also test greedy matches.)
844 for op in '','?','*':
845 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
846 (None, None))
847 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
848 ('a:', 'a'))
849
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000850 def test_bug_725106(self):
851 # capturing groups in alternatives in repeats
852 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
853 ('b', 'a'))
854 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
855 ('c', 'b'))
856 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
857 ('b', None))
858 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
859 ('b', None))
860 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
861 ('b', 'a'))
862 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
863 ('c', 'b'))
864 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
865 ('b', None))
866 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
867 ('b', None))
868
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000869 def test_bug_725149(self):
870 # mark_stack_base restoring before restoring marks
871 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
872 ('a', None))
873 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
874 ('a', None, None))
875
@requires_unicode
def test_bug_764548(self):
    # bug 764548, re.compile() barfs on str/unicode subclasses
    class my_unicode(unicode):
        pass

    compiled = re.compile(my_unicode("abc"))
    self.assertIsNone(compiled.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +0000882
Skip Montanaro5ba00542003-04-25 16:00:14 +0000883 def test_finditer(self):
884 iter = re.finditer(r":+", "a:b::c:::d")
885 self.assertEqual([item.group(0) for item in iter],
886 [":", "::", ":::"])
887
@requires_unicode
def test_bug_926075(self):
    # Compiling the same text as str and as unicode must give two
    # distinct pattern objects.
    from_str = re.compile('bug_926075')
    from_unicode = re.compile(u'bug_926075')
    self.assertIsNot(from_str, from_unicode)
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +0000892
@requires_unicode
def test_bug_931848(self):
    # A unicode set with four \uXXXX members (the first is U+002E, the
    # ASCII full stop) splits a str subject on '.'.
    pattern = u(r"[\u002E\u3002\uFF0E\uFF61]")
    pieces = re.compile(pattern).split("a.b.c")
    self.assertEqual(pieces, ['a', 'b', 'c'])
898
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000899 def test_bug_581080(self):
900 iter = re.finditer(r"\s", "a b")
901 self.assertEqual(iter.next().span(), (1,2))
902 self.assertRaises(StopIteration, iter.next)
903
904 scanner = re.compile(r"\s").scanner("a b")
905 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300906 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000907
908 def test_bug_817234(self):
909 iter = re.finditer(r".*", "asdf")
910 self.assertEqual(iter.next().span(), (0, 4))
911 self.assertEqual(iter.next().span(), (4, 4))
912 self.assertRaises(StopIteration, iter.next)
913
@requires_unicode
def test_bug_6561(self):
    # '\d' should match characters in Unicode category 'Nd'
    # (Number, Decimal Digit), but not those in 'Nl' (Number,
    # Letter) or 'No' (Number, Other).
    # The pattern is a raw string so that "\d" reaches the regex engine
    # without relying on Python passing an unknown escape through.
    digit_only = r'^\d$'

    decimal_digits = [
        unichr(0x0037), # '\N{DIGIT SEVEN}', category 'Nd'
        unichr(0x0e58), # '\N{THAI DIGIT EIGHT}', category 'Nd'
        unichr(0xff10), # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
        ]
    for x in decimal_digits:
        self.assertEqual(re.match(digit_only, x, re.UNICODE).group(0), x)

    not_decimal_digits = [
        unichr(0x2165), # '\N{ROMAN NUMERAL SIX}', category 'Nl'
        unichr(0x3039), # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
        unichr(0x2082), # '\N{SUBSCRIPT TWO}', category 'No'
        unichr(0x32b4), # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
        ]
    for x in not_decimal_digits:
        self.assertIsNone(re.match(digit_only, x, re.UNICODE))
935
def test_empty_array(self):
    # SF bug 1647541
    import array
    typecodes = 'cbBhHiIlLfd'
    if have_unicode:
        typecodes += 'u'
    literal_pattern = re.compile("bla")
    empty_pattern = re.compile("")
    for typecode in typecodes:
        empty = array.array(typecode)
        self.assertIsNone(literal_pattern.match(empty))
        self.assertEqual(empty_pattern.match(empty).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000946
@requires_unicode
def test_inline_flags(self):
    # Bug #1700
    upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Below
    lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Below
    # (pattern character, subject character): each case must match the other.
    pairs = ((upper_char, lower_char), (lower_char, upper_char))

    # The same two matches are tried with the flags given as arguments,
    # partly inline, and fully inline.
    for prefix, flags in (('', re.I | re.U), ('(?i)', re.U), ('(?iu)', 0)):
        for pattern_char, subject_char in pairs:
            compiled = re.compile(prefix + pattern_char, flags)
            self.assertTrue(compiled.match(subject_char))

    for pattern_char, subject_char in pairs:
        self.assertTrue(re.match('(?ixu) ' + pattern_char, subject_char))

    # Incompatibilities
    re.purge()
    conflicting = (
        ('', re.LOCALE|re.UNICODE),
        ('(?L)', re.UNICODE),
        ('(?u)', re.LOCALE),
        ('(?Lu)',),
        ('(?uL)',),
    )
    for compile_args in conflicting:
        with check_py3k_warnings():
            re.compile(*compile_args)
992
Amaury Forgeot d'Arcd08a8eb2008-01-10 21:59:42 +0000993 def test_dollar_matches_twice(self):
994 "$ matches the end of string, and just before the terminating \n"
995 pattern = re.compile('$')
996 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
997 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
998 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
999
1000 pattern = re.compile('$', re.MULTILINE)
1001 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1002 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1003 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1004
def test_dealloc(self):
    # issue 3299: check for segfault in debug build
    import _sre
    # the overflow limit is different on wide and narrow builds and it
    # depends on the definition of SRE_CODE (see sre.h).
    # 2**128 should be big enough to overflow on both. For smaller values
    # a RuntimeError is raised instead of OverflowError.
    oversized_code = [2**128]
    self.assertRaises(TypeError, re.finditer, "a", {})
    self.assertRaises(OverflowError, _sre.compile, "abc", 0, oversized_code)
Guido van Rossumae04c332008-01-03 19:12:44 +00001015
def test_compile(self):
    # Test return value when given string and pattern as parameter
    from_string = re.compile('random pattern')
    self.assertIsInstance(from_string, re._pattern_type)
    from_pattern = re.compile(from_string)
    self.assertIsInstance(from_pattern, re._pattern_type)
    self.assertIs(from_pattern, from_string)
    # Test behaviour when not given a string or pattern as parameter
    self.assertRaises(TypeError, re.compile, 0)
1025
def test_bug_13899(self):
    # Issue #13899: re pattern r"[\A]" should work like "A" but matches
    # nothing. Ditto B and Z.
    with check_py3k_warnings():
        found = re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ')
        self.assertEqual(found, ['A', 'B', '\b', 'C', 'Z'])
Ezio Melotti5c4e32b2013-01-11 08:32:01 +02001032
@precisionbigmemtest(size=_2G, memuse=1)
def test_large_search(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    subject = 'a' * size
    end_match = re.search('$', subject)
    self.assertIsNotNone(end_match)
    self.assertEqual(end_match.start(), size)
    self.assertEqual(end_match.end(), size)
Antoine Pitrou735f36e2012-12-03 20:53:12 +01001041
# The huge memuse is because of re.sub() using a list and a join()
# to create the replacement result.
@precisionbigmemtest(size=_2G, memuse=16 + 2)
def test_large_subn(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    subject = 'a' * size
    result, count = re.subn('', '', subject)
    self.assertEqual(result, subject)
    self.assertEqual(count, size + 1)
1051
1052
Serhiy Storchakae18e05c2013-02-16 16:47:15 +02001053 def test_repeat_minmax_overflow(self):
1054 # Issue #13169
1055 string = "x" * 100000
1056 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1057 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1058 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1059 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1060 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1061 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1062 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1063 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1064 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1065 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1066 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1067
@cpython_only
def test_repeat_minmax_overflow_maxrepeat(self):
    # Repeat counts just below _sre.MAXREPEAT compile; MAXREPEAT itself
    # overflows.
    try:
        from _sre import MAXREPEAT
    except ImportError:
        self.skipTest('requires _sre.MAXREPEAT constant')
    subject = "x" * 100000
    largest_ok = MAXREPEAT - 1
    self.assertIsNone(re.match(r".{%d}" % largest_ok, subject))
    at_most = re.match(r".{,%d}" % largest_ok, subject)
    self.assertEqual(at_most.span(), (0, 100000))
    self.assertIsNone(re.match(r".{%d,}?" % largest_ok, subject))
    for template in (r".{%d}", r".{,%d}", r".{%d,}?"):
        self.assertRaises(OverflowError, re.compile, template % MAXREPEAT)
1082
def test_backref_group_name_in_exception(self):
    # Issue 17341: Poor error message when compiling invalid regex
    bad_backreference = '(?P=<foo>)'
    with self.assertRaisesRegexp(sre_constants.error, '<foo>'):
        re.compile(bad_backreference)
1087
def test_group_name_in_exception(self):
    # Issue 17341: Poor error message when compiling invalid regex
    # The expected-message regex is a raw string so that "\?" reaches
    # the regex engine as an escaped question mark.
    with self.assertRaisesRegexp(sre_constants.error, r'\?foo'):
        re.compile('(?P<?foo>)')
1092
def test_issue17998(self):
    # '.' + repeat + optional '?' followed by 'yz' must find 'xyz' in
    # 'xyz' under DOTALL, for str and (when available) unicode.
    for reps in ('*', '+', '?', '{1}'):
        for mod in ('', '?'):
            pattern = '.' + reps + mod + 'yz'
            found = re.compile(pattern, re.S).findall('xyz')
            self.assertEqual(found, ['xyz'], msg=pattern)
            if have_unicode:
                pattern = unicode(pattern)
                found = re.compile(pattern, re.S).findall(u'xyz')
                self.assertEqual(found, [u'xyz'], msg=pattern)
Serhiy Storchaka3ade66c2013-08-03 19:26:33 +03001103
Serhiy Storchakae18e05c2013-02-16 16:47:15 +02001104
Serhiy Storchaka83737c62013-08-19 23:20:07 +03001105 def test_bug_2537(self):
1106 # issue 2537: empty submatches
1107 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1108 for inner_op in ('{0,}', '*', '?'):
1109 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1110 m = r.match("xyyzy")
1111 self.assertEqual(m.group(0), "xyy")
1112 self.assertEqual(m.group(1), "")
1113 self.assertEqual(m.group(2), "y")
1114
def test_debug_flag(self):
    # Compiling with re.DEBUG prints the parse tree to stdout.
    pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
    dump = '''\
subpattern 1
  literal 46
subpattern None
  branch
    in
      literal 99
      literal 104
  or
    literal 112
    literal 121
subpattern None
  groupref_exists 1
    at at_end
  else
    literal 58
    literal 32
'''
    # Debug output is output again even a second time (bypassing
    # the cache -- issue #20426).
    for _attempt in range(2):
        with captured_stdout() as out:
            re.compile(pat, re.DEBUG)
        self.assertEqual(out.getvalue(), dump)
Antoine Pitrouf5814112014-02-03 20:59:59 +01001143
Serhiy Storchakae50fe4c2014-03-06 12:24:29 +02001144 def test_keyword_parameters(self):
1145 # Issue #20283: Accepting the string keyword parameter.
1146 pat = re.compile(r'(ab)')
1147 self.assertEqual(
1148 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1149 self.assertEqual(
1150 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1151 self.assertEqual(
1152 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1153 self.assertEqual(
1154 pat.split(string='abracadabra', maxsplit=1),
1155 ['', 'ab', 'racadabra'])
1156
def test_match_group_takes_long(self):
    # group() accepts a long index, and an index beyond sys.maxint
    # raises IndexError.
    found = re.match("(foo)", "foo")
    self.assertEqual(found.group(long(1)), "foo")
    empty_match = re.match("", "")
    self.assertRaises(IndexError, empty_match.group, sys.maxint + 1)
1160
def test_locale_caching(self):
    # Issue #22410
    saved_ctype = locale.setlocale(locale.LC_CTYPE)
    self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_ctype)
    required_locales = ('en_US.iso88591', 'en_US.utf8')
    for loc in required_locales:
        try:
            locale.setlocale(locale.LC_CTYPE, loc)
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs %s locale' % loc)

    # Run the two locale checks in both orders, purging the pattern
    # cache before each ordering.
    orderings = (
        (self.check_en_US_iso88591, self.check_en_US_utf8),
        (self.check_en_US_utf8, self.check_en_US_iso88591),
    )
    for first_check, second_check in orderings:
        re.purge()
        first_check()
        second_check()
1178
def check_en_US_iso88591(self):
    # Under en_US.iso88591, LOCALE|IGNORECASE must treat bytes 0xC5 and
    # 0xE5 as the same letter, with flags passed as arguments or inline.
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    pairs = (
        (b'\xc5\xe5', b'\xc5\xe5'),
        (b'\xc5', b'\xe5'),
        (b'\xe5', b'\xc5'),
    )
    for pattern, subject in pairs:
        self.assertTrue(re.match(pattern, subject, re.L|re.I))
    for pattern, subject in pairs:
        self.assertTrue(re.match(b'(?Li)' + pattern, subject))
1187
def check_en_US_utf8(self):
    # Under en_US.utf8, LOCALE|IGNORECASE must not treat bytes 0xC5 and
    # 0xE5 as the same letter; only the identical byte string matches.
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    identical = (b'\xc5\xe5', b'\xc5\xe5')
    differing = ((b'\xc5', b'\xe5'), (b'\xe5', b'\xc5'))

    self.assertTrue(re.match(identical[0], identical[1], re.L|re.I))
    for pattern, subject in differing:
        self.assertIsNone(re.match(pattern, subject, re.L|re.I))

    self.assertTrue(re.match(b'(?Li)' + identical[0], identical[1]))
    for pattern, subject in differing:
        self.assertIsNone(re.match(b'(?Li)' + pattern, subject))
1196
Antoine Pitrouf5814112014-02-03 20:59:59 +01001197
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001198def run_re_tests():
Georg Brandla4f46e12010-02-07 17:03:15 +00001199 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001200 if verbose:
1201 print 'Running re_tests test suite'
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001202 else:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001203 # To save time, only run the first and last 10 tests
1204 #tests = tests[:10] + tests[-10:]
1205 pass
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001206
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001207 for t in tests:
1208 sys.stdout.flush()
1209 pattern = s = outcome = repl = expected = None
1210 if len(t) == 5:
1211 pattern, s, outcome, repl, expected = t
1212 elif len(t) == 3:
1213 pattern, s, outcome = t
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001214 else:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001215 raise ValueError, ('Test tuples should have 3 or 5 fields', t)
1216
Guido van Rossum41360a41998-03-26 19:42:58 +00001217 try:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001218 obj = re.compile(pattern)
1219 except re.error:
1220 if outcome == SYNTAX_ERROR: pass # Expected a syntax error
Guido van Rossum41360a41998-03-26 19:42:58 +00001221 else:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001222 print '=== Syntax error:', t
1223 except KeyboardInterrupt: raise KeyboardInterrupt
1224 except:
1225 print '*** Unexpected error ***', t
1226 if verbose:
1227 traceback.print_exc(file=sys.stdout)
1228 else:
Fredrik Lundh17741be2001-03-22 15:51:28 +00001229 try:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001230 result = obj.search(s)
1231 except re.error, msg:
1232 print '=== Unexpected exception', t, repr(msg)
1233 if outcome == SYNTAX_ERROR:
1234 # This should have been a syntax error; forget it.
1235 pass
1236 elif outcome == FAIL:
1237 if result is None: pass # No match, as expected
1238 else: print '=== Succeeded incorrectly', t
1239 elif outcome == SUCCEED:
1240 if result is not None:
1241 # Matched, as expected, so now we compute the
1242 # result string and compare it to our expected result.
1243 start, end = result.span(0)
1244 vardict={'found': result.group(0),
1245 'groups': result.group(),
1246 'flags': result.re.flags}
1247 for i in range(1, 100):
1248 try:
1249 gi = result.group(i)
1250 # Special hack because else the string concat fails:
1251 if gi is None:
1252 gi = "None"
1253 except IndexError:
1254 gi = "Error"
1255 vardict['g%d' % i] = gi
1256 for i in result.re.groupindex.keys():
1257 try:
1258 gi = result.group(i)
1259 if gi is None:
1260 gi = "None"
1261 except IndexError:
1262 gi = "Error"
1263 vardict[i] = gi
1264 repl = eval(repl, vardict)
1265 if repl != expected:
1266 print '=== grouping error', t,
1267 print repr(repl) + ' should be ' + repr(expected)
1268 else:
1269 print '=== Failed incorrectly', t
1270
1271 # Try the match on a unicode string, and check that it
1272 # still succeeds.
1273 try:
1274 result = obj.search(unicode(s, "latin-1"))
1275 if result is None:
1276 print '=== Fails on unicode match', t
1277 except NameError:
1278 continue # 1.5.2
1279 except TypeError:
1280 continue # unicode test case
1281
1282 # Try the match on a unicode pattern, and check that it
1283 # still succeeds.
1284 obj=re.compile(unicode(pattern, "latin-1"))
1285 result = obj.search(s)
Fredrik Lundh17741be2001-03-22 15:51:28 +00001286 if result is None:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001287 print '=== Fails on unicode pattern match', t
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001288
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001289 # Try the match with the search area limited to the extent
1290 # of the match and see if it still succeeds. \B will
1291 # break (because it won't match at the end or start of a
1292 # string), so we'll ignore patterns that feature it.
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001293
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001294 if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \
1295 and result is not None:
1296 obj = re.compile(pattern)
1297 result = obj.search(s, result.start(0), result.end(0) + 1)
1298 if result is None:
1299 print '=== Failed on range-limited match', t
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001300
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001301 # Try the match with IGNORECASE enabled, and check that it
1302 # still succeeds.
1303 obj = re.compile(pattern, re.IGNORECASE)
1304 result = obj.search(s)
Fred Drake132dce22000-12-12 23:11:42 +00001305 if result is None:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001306 print '=== Fails on case-insensitive match', t
Guido van Rossumdfa67901997-12-08 17:12:06 +00001307
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001308 # Try the match with LOCALE enabled, and check that it
1309 # still succeeds.
1310 obj = re.compile(pattern, re.LOCALE)
1311 result = obj.search(s)
1312 if result is None:
1313 print '=== Fails on locale-sensitive match', t
Guido van Rossumdfa67901997-12-08 17:12:06 +00001314
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001315 # Try the match with UNICODE locale enabled, and check
1316 # that it still succeeds.
1317 obj = re.compile(pattern, re.UNICODE)
1318 result = obj.search(s)
1319 if result is None:
1320 print '=== Fails on unicode-sensitive match', t
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001321
def test_main():
    """Run the unittest-based ReTests suite, then the table-driven tests.

    run_re_tests() is executed inside check_py3k_warnings() with a single
    ('bad escape', DeprecationWarning) filter.
    NOTE(review): presumably this accounts for the deprecation warnings
    that some table patterns trigger under -3; confirm against
    test.test_support.check_py3k_warnings.
    """
    run_unittest(ReTests)
    with check_py3k_warnings(('bad escape', DeprecationWarning)):
        run_re_tests()
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001329
# Run the whole suite when this module is executed directly as a script.
if __name__ == "__main__":
    test_main()