blob: 5725a99ad6d1ef2c0481998500836e2d573ee88e [file] [log] [blame]
Serhiy Storchakae9277572014-11-10 12:37:02 +02001# -*- coding: utf-8 -*-
Serhiy Storchaka4809d1f2015-02-21 12:08:36 +02002from test.test_support import (
3 verbose, run_unittest, import_module,
4 precisionbigmemtest, _2G, cpython_only,
5 captured_stdout, have_unicode, requires_unicode, u,
6 check_warnings)
Serhiy Storchakad4c72902014-10-31 00:53:19 +02007import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00008import re
Neal Norwitz94a9c092006-03-16 06:30:02 +00009from re import Scanner
R David Murray60773392013-04-14 13:08:50 -040010import sre_constants
Ezio Melotti46645632011-03-25 14:50:52 +020011import sys
12import string
13import traceback
Raymond Hettinger027bb632004-05-31 03:09:25 +000014from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000015
Antoine Pitrou735f36e2012-12-03 20:53:12 +010016
Guido van Rossum23b22571997-07-17 22:36:14 +000017# Misc tests from Tim Peters' re.doc
18
Just van Rossum6802c6e2003-07-02 14:36:59 +000019# WARNING: Don't change details in these tests if you don't know
Ezio Melotti24b07bc2011-03-15 18:55:01 +020020# what you're doing. Some of these tests were carefully modeled to
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +000021# cover most of the code.
22
Skip Montanaro8ed06da2003-04-24 19:43:18 +000023import unittest
Guido van Rossum8430c581998-04-03 21:47:12 +000024
Skip Montanaro8ed06da2003-04-24 19:43:18 +000025class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000026
27 def test_weakref(self):
28 s = 'QabbbcR'
29 x = re.compile('ab+c')
30 y = proxy(x)
31 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
32
Skip Montanaro8ed06da2003-04-24 19:43:18 +000033 def test_search_star_plus(self):
34 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
35 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
36 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
37 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +030038 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000039 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
40 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
41 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
42 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +030043 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000044
def bump_num(self, matchobj):
    """Return the whole match of *matchobj*, read as an int, plus one.

    The result is a string; this is used as a replacement callback for
    re.sub() in test_basic_re_sub.
    """
    return str(1 + int(matchobj.group(0)))
Guido van Rossum23b22571997-07-17 22:36:14 +000048
def test_basic_re_sub(self):
    # Basic re.sub() behaviour: string and callable replacements, the
    # optional count argument, and how escapes in the replacement
    # template are interpreted.  The mix of raw and non-raw literals
    # below is deliberate -- do not normalise it.
    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    # Callable replacement: every number is incremented by bump_num().
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    # With count=3 only the first three numbers are replaced.
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                     '9.3 -3 23x99y')

    # The return value of a callable is inserted as-is (backslash + 'n'),
    # whereas a string template has its escapes processed (newline).
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    # Group references in a template are expanded unless the template
    # was passed through re.escape() or is produced by a callable.
    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    # \g<name> and \g<number> refer to the same group.
    self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')

    # Character escapes (\t \n \v \r \f \a \b) are translated in the
    # template; the remaining escapes are expected to be left in place
    # with their backslash.
    self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
                     '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
    # Control characters already present in the template pass through.
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))

    # An empty match at the start of the string is still substituted.
    self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +000076
Skip Montanaro2726fcd2003-04-25 14:31:54 +000077 def test_bug_449964(self):
78 # fails for group followed by other escape
79 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
80 'xx\bxx\b')
81
82 def test_bug_449000(self):
83 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +000084 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
85 'abc\ndef\n')
86 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
87 'abc\ndef\n')
88 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
89 'abc\ndef\n')
90 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
91 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +000092
@requires_unicode
def test_bug_1140(self):
    # re.sub(x, y, u'') should return u'', not '', and
    # re.sub(x, y, '') should return '', not u''.
    # Also:
    # re.sub(x, y, unicode(x)) should return unicode(y), and
    # re.sub(x, y, str(x)) should return
    #     str(y) if isinstance(y, str) else unicode(y).
    #
    # Every combination of str/unicode pattern (x) and replacement (y)
    # is checked against an empty and a matching subject of each type.
    for x in 'x', u'x':
        for y in 'y', u'y':
            # Empty unicode subject: result stays unicode.
            z = re.sub(x, y, u'')
            self.assertEqual(z, u'')
            self.assertEqual(type(z), unicode)
            #
            # Empty str subject: result stays str.
            z = re.sub(x, y, '')
            self.assertEqual(z, '')
            self.assertEqual(type(z), str)
            #
            # Matching unicode subject: result is unicode.
            z = re.sub(x, y, unicode(x))
            self.assertEqual(z, y)
            self.assertEqual(type(z), unicode)
            #
            # Matching str subject: result takes the replacement's type.
            z = re.sub(x, y, str(x))
            self.assertEqual(z, y)
            self.assertEqual(type(z), type(y))
118
Raymond Hettinger80016c92007-12-19 18:13:31 +0000119 def test_bug_1661(self):
120 # Verify that flags do not get silently ignored with compiled patterns
121 pattern = re.compile('.')
122 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
123 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
124 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
125 self.assertRaises(ValueError, re.compile, pattern, re.I)
126
Guido van Rossume3c4fd92008-09-10 14:27:00 +0000127 def test_bug_3629(self):
128 # A regex that triggered a bug in the sre-code validator
129 re.compile("(?P<quote>)(?(quote))")
130
def test_sub_template_numeric_escape(self):
    # bug 776311 and friends
    # Numeric escapes in a replacement template are either octal
    # character escapes or group references; the cases below pin down
    # which interpretation is chosen for each spelling.

    # A leading zero, or three octal digits, is an octal escape; a
    # non-octal digit ends the escape and is kept as a literal.
    self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
    self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
    self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
    self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
    self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
    self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
    self.assertEqual(re.sub('x', r'\117', 'x'), '\117')

    # At most three digits are consumed; a fourth digit is literal.
    self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
    self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')

    self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
    self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
    self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
    self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
    self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')

    # Octal values above \377 are expected to wrap to a single byte.
    self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
    self.assertEqual(re.sub('x', r'\777', 'x'), '\377')

    # Anything else is a group reference; the pattern 'x' has no groups,
    # so each of these must raise re.error.
    self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
    self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
    self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'

    # in python2.3 (etc), these loop endlessly in sre_parser.py
    # With eleven groups available, \11 is a valid two-digit reference
    # and a following character is taken literally.
    self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
    self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
                     'xz8')
    self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
                     'xza')
172
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000173 def test_qualified_re_sub(self):
174 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
175 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000176
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000177 def test_bug_114660(self):
178 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
179 'hello there')
180
def test_bug_462270(self):
    # Test for empty sub() behaviour, see SF bug #462270
    # 'x*' also matches the empty string, so a '-' is expected at each
    # position listed in the result below as well as in place of the 'x'.
    self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
    # 'x+' never matches empty, so only the real 'x' is replaced.
    self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
185
Ezio Melottief317382012-11-03 20:31:12 +0200186 def test_symbolic_groups(self):
187 re.compile('(?P<a>x)(?P=a)(?(a)y)')
188 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
189 self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
190 self.assertRaises(re.error, re.compile, '(?Px)')
191 self.assertRaises(re.error, re.compile, '(?P=)')
192 self.assertRaises(re.error, re.compile, '(?P=1)')
193 self.assertRaises(re.error, re.compile, '(?P=a)')
194 self.assertRaises(re.error, re.compile, '(?P=a1)')
195 self.assertRaises(re.error, re.compile, '(?P=a.)')
196 self.assertRaises(re.error, re.compile, '(?P<)')
197 self.assertRaises(re.error, re.compile, '(?P<>)')
198 self.assertRaises(re.error, re.compile, '(?P<1>)')
199 self.assertRaises(re.error, re.compile, '(?P<a.>)')
200 self.assertRaises(re.error, re.compile, '(?())')
201 self.assertRaises(re.error, re.compile, '(?(a))')
202 self.assertRaises(re.error, re.compile, '(?(1a))')
203 self.assertRaises(re.error, re.compile, '(?(a.))')
204
def test_symbolic_refs(self):
    # Invalid \g<...> references in a replacement template must raise.
    # Unterminated or malformed references:
    self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
    # A well-formed but unknown group name is reported as IndexError.
    self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
    # Referring to a group that exists but did not take part in the
    # match is expected to raise re.error, by name or by number.
    self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
    # Negative group numbers are not allowed.
    self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000216
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000217 def test_re_subn(self):
218 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
219 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
220 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
221 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
222 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000223
def test_re_split(self):
    # re.split() with plain, repeated, capturing and non-capturing
    # separators.  Capturing groups in the separator are included in the
    # result; a group that did not participate yields None.
    self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
    self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
    self.assertEqual(re.split("(:*)", ":a:b::c"),
                     ['', ':', 'a', ':', 'b', '::', 'c'])
    # A non-capturing group behaves like the bare pattern.
    self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
    # A repeated capturing group reports only its last repetition.
    self.assertEqual(re.split("(:)*", ":a:b::c"),
                     ['', ':', 'a', ':', 'b', ':', 'c'])
    self.assertEqual(re.split("([b:]+)", ":a:b::c"),
                     ['', ':', 'a', ':b::', 'c'])
    # Two alternative groups: each split contributes both group values,
    # with None for the alternative that did not match.
    self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
                     ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c'])
    self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
                     ['', 'a', '', '', 'c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000239
def test_qualified_re_split(self):
    # re.split() with a maxsplit of 2: at most two splits are made and
    # the remainder of the string is returned as the final element.
    self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
    self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
    # Captured separators are included but do not count as splits.
    self.assertEqual(re.split("(:)", ":a:b::c", 2),
                     ['', ':', 'a', ':', 'b::c'])
    self.assertEqual(re.split("(:*)", ":a:b::c", 2),
                     ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000247
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000248 def test_re_findall(self):
249 self.assertEqual(re.findall(":+", "abc"), [])
250 self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
251 self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
252 self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""),
253 (":", ":"),
254 (":", "::")])
Guido van Rossum49946571997-07-18 04:26:25 +0000255
Skip Montanaro5ba00542003-04-25 16:00:14 +0000256 def test_bug_117612(self):
257 self.assertEqual(re.findall(r"(a|(b))", "aba"),
258 [("a", ""),("b", "b"),("a", "")])
259
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000260 def test_re_match(self):
Skip Montanaro5ba00542003-04-25 16:00:14 +0000261 self.assertEqual(re.match('a', 'a').groups(), ())
262 self.assertEqual(re.match('(a)', 'a').groups(), ('a',))
263 self.assertEqual(re.match(r'(a)', 'a').group(0), 'a')
264 self.assertEqual(re.match(r'(a)', 'a').group(1), 'a')
265 self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a'))
Guido van Rossum49946571997-07-18 04:26:25 +0000266
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000267 pat = re.compile('((a)|(b))(c)?')
268 self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
269 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
270 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
271 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
272 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
Guido van Rossum8430c581998-04-03 21:47:12 +0000273
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000274 # A single group
275 m = re.match('(a)', 'a')
276 self.assertEqual(m.group(0), 'a')
277 self.assertEqual(m.group(0), 'a')
278 self.assertEqual(m.group(1), 'a')
279 self.assertEqual(m.group(1, 1), ('a', 'a'))
Guido van Rossum49946571997-07-18 04:26:25 +0000280
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000281 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
282 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
283 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
284 (None, 'b', None))
285 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000286
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000287 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000288 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
289 ('(', 'a'))
290 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
291 (None, 'a'))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300292 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
293 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000294 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
295 ('a', 'b'))
296 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
297 (None, 'd'))
298 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
299 (None, 'd'))
300 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
301 ('a', ''))
302
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000303 # Tests for bug #1177831: exercise groups other than the first group
304 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
305 self.assertEqual(p.match('abc').groups(),
306 ('a', 'b', 'c'))
307 self.assertEqual(p.match('ad').groups(),
308 ('a', None, 'd'))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300309 self.assertIsNone(p.match('abd'))
310 self.assertIsNone(p.match('ac'))
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000311
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000312
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000313 def test_re_groupref(self):
314 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
315 ('|', 'a'))
316 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
317 (None, 'a'))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300318 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
319 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000320 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
321 ('a', 'a'))
322 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
323 (None, None))
324
325 def test_groupdict(self):
326 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
327 'first second').groupdict(),
328 {'first':'first', 'second':'second'})
329
330 def test_expand(self):
331 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
332 "first second")
333 .expand(r"\2 \1 \g<second> \g<first>"),
334 "second first second first")
335
336 def test_repeat_minmax(self):
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300337 self.assertIsNone(re.match("^(\w){1}$", "abc"))
338 self.assertIsNone(re.match("^(\w){1}?$", "abc"))
339 self.assertIsNone(re.match("^(\w){1,2}$", "abc"))
340 self.assertIsNone(re.match("^(\w){1,2}?$", "abc"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000341
342 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
343 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
344 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
345 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
346 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
347 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
348 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
349 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
350
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300351 self.assertIsNone(re.match("^x{1}$", "xxx"))
352 self.assertIsNone(re.match("^x{1}?$", "xxx"))
353 self.assertIsNone(re.match("^x{1,2}$", "xxx"))
354 self.assertIsNone(re.match("^x{1,2}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000355
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300356 self.assertTrue(re.match("^x{3}$", "xxx"))
357 self.assertTrue(re.match("^x{1,3}$", "xxx"))
358 self.assertTrue(re.match("^x{1,4}$", "xxx"))
359 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
360 self.assertTrue(re.match("^x{3}?$", "xxx"))
361 self.assertTrue(re.match("^x{1,3}?$", "xxx"))
362 self.assertTrue(re.match("^x{1,4}?$", "xxx"))
363 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000364
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300365 self.assertIsNone(re.match("^x{}$", "xxx"))
366 self.assertTrue(re.match("^x{}$", "x{}"))
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000367
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000368 def test_getattr(self):
369 self.assertEqual(re.match("(a)", "a").pos, 0)
370 self.assertEqual(re.match("(a)", "a").endpos, 1)
371 self.assertEqual(re.match("(a)", "a").string, "a")
372 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300373 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000374
def test_special_escapes(self):
    # Special escapes (\b \B \A \Z \d \D \w \W \s \S) with the default,
    # LOCALE and UNICODE flags, on both str and unicode subjects.

    # \b: 'bx' is the only two-character word starting with 'b'.
    self.assertEqual(re.search(r"\b(b.)\b",
                               "abcd abc bcd bx").group(1), "bx")
    # \B: 'bx' is the only 'b.' with word characters on both sides.
    self.assertEqual(re.search(r"\B(b.)\B",
                               "abc bcd bc abxd").group(1), "bx")
    self.assertEqual(re.search(r"\b(b.)\b",
                               "abcd abc bcd bx", re.LOCALE).group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B",
                               "abc bcd bc abxd", re.LOCALE).group(1), "bx")
    if have_unicode:
        self.assertEqual(re.search(r"\b(b.)\b",
                                   "abcd abc bcd bx", re.UNICODE).group(1), "bx")
        self.assertEqual(re.search(r"\B(b.)\B",
                                   "abc bcd bc abxd", re.UNICODE).group(1), "bx")
    # With re.M, ^ and $ match at line boundaries, while \A and \Z stay
    # anchored to the start and end of the whole string.
    self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
    # Same checks with unicode subjects.
    self.assertEqual(re.search(r"\b(b.)\b",
                               u"abcd abc bcd bx").group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B",
                               u"abc bcd bc abxd").group(1), "bx")
    self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M))
    # One character of each category, in pattern order.
    self.assertEqual(re.search(r"\d\D\w\W\s\S",
                               "1aa! a").group(0), "1aa! a")
    self.assertEqual(re.search(r"\d\D\w\W\s\S",
                               "1aa! a", re.LOCALE).group(0), "1aa! a")
    if have_unicode:
        self.assertEqual(re.search(r"\d\D\w\W\s\S",
                                   "1aa! a", re.UNICODE).group(0), "1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000406
Ezio Melotti38ae5b22012-02-29 11:40:00 +0200407 def test_string_boundaries(self):
408 # See http://bugs.python.org/issue10713
409 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
410 "abc")
411 # There's a word boundary at the start of a string.
412 self.assertTrue(re.match(r"\b", "abc"))
413 # A non-empty string includes a non-boundary zero-length match.
414 self.assertTrue(re.search(r"\B", "abc"))
415 # There is no non-boundary match at the start of a string.
416 self.assertFalse(re.match(r"\B", "abc"))
417 # However, an empty string contains no word boundaries, and also no
418 # non-boundaries.
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300419 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti38ae5b22012-02-29 11:40:00 +0200420 # This one is questionable and different from the perlre behaviour,
421 # but describes current behavior.
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300422 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti38ae5b22012-02-29 11:40:00 +0200423 # A single word-character string has two boundaries, but no
424 # non-boundary gaps.
425 self.assertEqual(len(re.findall(r"\b", "a")), 2)
426 self.assertEqual(len(re.findall(r"\B", "a")), 0)
427 # If there are no words, there are no boundaries
428 self.assertEqual(len(re.findall(r"\b", " ")), 0)
429 self.assertEqual(len(re.findall(r"\b", " ")), 0)
430 # Can match around the whitespace.
431 self.assertEqual(len(re.findall(r"\B", " ")), 2)
432
@requires_unicode
def test_bigcharset(self):
    # Character sets containing code points above 255.
    # A small set of two such characters, with and without re.UNICODE.
    self.assertEqual(re.match(u(r"([\u2222\u2223])"),
                              unichr(0x2222)).group(1), unichr(0x2222))
    self.assertEqual(re.match(u(r"([\u2222\u2223])"),
                              unichr(0x2222), re.UNICODE).group(1), unichr(0x2222))
    # A set built from every 255th code point between 256 and 2**16;
    # 0xff01 (256 + 255*255) is one of its members.
    r = u'[%s]' % u''.join(map(unichr, range(256, 2**16, 255)))
    self.assertEqual(re.match(r, unichr(0xff01), re.UNICODE).group(), unichr(0xff01))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000441
Antoine Pitroub83ea142012-11-20 22:30:42 +0100442 def test_big_codesize(self):
443 # Issue #1160
444 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300445 self.assertTrue(r.match('1000'))
446 self.assertTrue(r.match('9999'))
Antoine Pitroub83ea142012-11-20 22:30:42 +0100447
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000448 def test_anyall(self):
449 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
450 "a\nb")
451 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
452 "a\n\nb")
453
def test_lookahead(self):
    # Positive (?=...) and negative (?!...) look-ahead assertions.  The
    # look-ahead never consumes text, so group 1 is always just "a".

    # Positive look-ahead with sets, back-references and alternation.
    self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
    self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
    self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
    self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
    self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
    self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
    self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")

    # Negative look-ahead: the match succeeds because what follows does
    # NOT fit the assertion.
    self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
    self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
    self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
    self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")

    # Group reference.
    self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
    self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
    # Named group reference.
    self.assertTrue(re.match(r'(?P<g>a)b(?=(?P=g))a', 'aba'))
    self.assertIsNone(re.match(r'(?P<g>a)b(?=(?P=g))c', 'abac'))
    # Conditional group reference.
    # Group 2 ('x') does not take part when the subject starts with
    # 'a', so (?(2)...) selects its no-branch and (?(1)...) its
    # yes-branch.
    self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
    self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
    self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
    self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
    self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
    # Group used before defined.
    self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
    self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
    self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
484
def test_lookbehind(self):
    # Positive (?<=...) and negative (?<!...) look-behind assertions.
    self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
    self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
    self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
    self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
    # Compiling a look-behind that contains a group reference is
    # expected to emit a RuntimeWarning; check_warnings() verifies that
    # one is raised in each case below.
    # Group reference.
    with check_warnings(('', RuntimeWarning)):
        re.compile(r'(a)a(?<=\1)c')
    # Named group reference.
    with check_warnings(('', RuntimeWarning)):
        re.compile(r'(?P<g>a)a(?<=(?P=g))c')
    # Conditional group reference.
    with check_warnings(('', RuntimeWarning)):
        re.compile(r'(a)b(?<=(?(1)b|x))c')
    # Group used before defined.
    with check_warnings(('', RuntimeWarning)):
        re.compile(r'(a)b(?<=(?(2)b|x))(c)')
502
def test_ignore_case(self):
    # re.I matching of literals, sets, back-references and alternations.
    # The matched text keeps the case of the subject.
    self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
    self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
    self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
    self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
    self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
    self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
    self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
    self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
    self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
    self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")

    # NOTE(review): nesting under `if have_unicode:` is reconstructed
    # from a source whose indentation was lost -- confirm.
    if have_unicode:
        # Non-ASCII characters whose case mapping is an ASCII letter
        # must match that letter in either case, in both directions.
        # The bare asserts check the precondition on the case mapping.
        assert u(r'\u212a').lower() == u'k' # 'K'
        self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I))
        self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I))
        self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
        self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
        assert u(r'\u017f').upper() == u'S' # 'ſ'
        self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I))
        self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I))
        self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
        self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I))
526
def test_ignore_case_set(self):
    # re.I matching against character sets: a letter in the set matches
    # the subject in either case, and vice versa.
    self.assertTrue(re.match(r'[19A]', 'A', re.I))
    self.assertTrue(re.match(r'[19a]', 'a', re.I))
    self.assertTrue(re.match(r'[19a]', 'A', re.I))
    self.assertTrue(re.match(r'[19A]', 'a', re.I))
    # NOTE(review): nesting under `if have_unicode:` is reconstructed
    # from a source whose indentation was lost -- confirm.
    if have_unicode:
        self.assertTrue(re.match(ur'[19A]', u'A', re.U | re.I))
        self.assertTrue(re.match(ur'[19a]', u'a', re.U | re.I))
        self.assertTrue(re.match(ur'[19a]', u'A', re.U | re.I))
        self.assertTrue(re.match(ur'[19A]', u'a', re.U | re.I))
        # Non-ASCII characters whose case mapping is an ASCII letter,
        # as set member and as subject.  The bare asserts check the
        # precondition on the case mapping.
        assert u(r'\u212a').lower() == u'k' # 'K'
        self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I))
        self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I))
        self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I))
        self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I))
        assert u(r'\u017f').upper() == u'S' # 'ſ'
        self.assertTrue(re.match(ur'[19S]', u(r'\u017f'), re.U | re.I))
        self.assertTrue(re.match(ur'[19s]', u(r'\u017f'), re.U | re.I))
        self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I))
        self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I))
547
def test_ignore_case_range(self):
    # Issues #3511, #17381.
    # re.I matching against character ranges.  A range only matches a
    # character ignoring case if the character or its other-case form
    # falls inside the range.
    self.assertTrue(re.match(r'[9-a]', '_', re.I))
    self.assertIsNone(re.match(r'[9-A]', '_', re.I))
    self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
    self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
    self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7',re.I))
    self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
    # NOTE(review): the nesting below is reconstructed from a source
    # whose indentation was lost; the BMP-only checks at the end are
    # placed outside the wide-build guard -- confirm.
    if have_unicode:
        # Same checks with unicode patterns and subjects.
        self.assertTrue(re.match(u(r'[9-a]'), u(r'_'), re.U | re.I))
        self.assertIsNone(re.match(u(r'[9-A]'), u(r'_'), re.U | re.I))
        self.assertTrue(re.match(u(r'[\xc0-\xde]'),
                                 u(r'\xd7'), re.U | re.I))
        self.assertIsNone(re.match(u(r'[\xc0-\xde]'),
                                   u(r'\xf7'), re.U | re.I))
        self.assertTrue(re.match(u(r'[\xe0-\xfe]'),
                                 u(r'\xf7'), re.U | re.I))
        self.assertIsNone(re.match(u(r'[\xe0-\xfe]'),
                                   u(r'\xd7'), re.U | re.I))
        # Ranges in the \u04xx block matched against subjects inside
        # and outside the range.
        self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
                                 u(r'\u0450'), re.U | re.I))
        self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
                                 u(r'\u0400'), re.U | re.I))
        self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
                                 u(r'\u0450'), re.U | re.I))
        self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
                                 u(r'\u0400'), re.U | re.I))
        # Code points above 0xffff, checked only when sys.maxunicode
        # allows them.
        if sys.maxunicode > 0xffff:
            self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
                                     u(r'\U00010428'), re.U | re.I))
            self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
                                     u(r'\U00010400'), re.U | re.I))
            self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
                                     u(r'\U00010428'), re.U | re.I))
            self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
                                     u(r'\U00010400'), re.U | re.I))

        # Non-ASCII characters whose case mapping is an ASCII letter,
        # both as subject of an ASCII range and as part of a range
        # matched against the ASCII letter.  The bare asserts check the
        # precondition on the case mapping.
        assert u(r'\u212a').lower() == u'k' # 'K'
        self.assertTrue(re.match(ur'[J-M]', u(r'\u212a'), re.U | re.I))
        self.assertTrue(re.match(ur'[j-m]', u(r'\u212a'), re.U | re.I))
        self.assertTrue(re.match(u(r'[\u2129-\u212b]'), u'K', re.U | re.I))
        self.assertTrue(re.match(u(r'[\u2129-\u212b]'), u'k', re.U | re.I))
        assert u(r'\u017f').upper() == u'S' # 'ſ'
        self.assertTrue(re.match(ur'[R-T]', u(r'\u017f'), re.U | re.I))
        self.assertTrue(re.match(ur'[r-t]', u(r'\u017f'), re.U | re.I))
        self.assertTrue(re.match(u(r'[\u017e-\u0180]'), u'S', re.U | re.I))
        self.assertTrue(re.match(u(r'[\u017e-\u0180]'), u's', re.U | re.I))
595
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000596 def test_category(self):
597 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
598
599 def test_getlower(self):
600 import _sre
601 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
602 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300603 if have_unicode:
604 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000605
606 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
607 self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
608
609 def test_not_literal(self):
610 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
611 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
612
613 def test_search_coverage(self):
614 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
615 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
616
Ezio Melotti46645632011-03-25 14:50:52 +0200617 def assertMatch(self, pattern, text, match=None, span=None,
618 matcher=re.match):
619 if match is None and span is None:
620 # the pattern matches the whole text
621 match = text
622 span = (0, len(text))
623 elif match is None or span is None:
624 raise ValueError('If match is not None, span should be specified '
625 '(and vice versa).')
626 m = matcher(pattern, text)
627 self.assertTrue(m)
628 self.assertEqual(m.group(), match)
629 self.assertEqual(m.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000630
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300631 @requires_unicode
Ezio Melotti46645632011-03-25 14:50:52 +0200632 def test_re_escape(self):
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300633 alnum_chars = unicode(string.ascii_letters + string.digits)
Ezio Melotti46645632011-03-25 14:50:52 +0200634 p = u''.join(unichr(i) for i in range(256))
635 for c in p:
636 if c in alnum_chars:
637 self.assertEqual(re.escape(c), c)
638 elif c == u'\x00':
639 self.assertEqual(re.escape(c), u'\\000')
640 else:
641 self.assertEqual(re.escape(c), u'\\' + c)
642 self.assertMatch(re.escape(c), c)
643 self.assertMatch(re.escape(p), p)
644
645 def test_re_escape_byte(self):
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300646 alnum_chars = string.ascii_letters + string.digits
Ezio Melotti46645632011-03-25 14:50:52 +0200647 p = ''.join(chr(i) for i in range(256))
648 for b in p:
649 if b in alnum_chars:
650 self.assertEqual(re.escape(b), b)
651 elif b == b'\x00':
652 self.assertEqual(re.escape(b), b'\\000')
653 else:
654 self.assertEqual(re.escape(b), b'\\' + b)
655 self.assertMatch(re.escape(b), b)
656 self.assertMatch(re.escape(p), p)
657
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300658 @requires_unicode
Ezio Melotti46645632011-03-25 14:50:52 +0200659 def test_re_escape_non_ascii(self):
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300660 s = u(r'xxx\u2620\u2620\u2620xxx')
Ezio Melotti46645632011-03-25 14:50:52 +0200661 s_escaped = re.escape(s)
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300662 self.assertEqual(s_escaped, u(r'xxx\\\u2620\\\u2620\\\u2620xxx'))
Ezio Melotti46645632011-03-25 14:50:52 +0200663 self.assertMatch(s_escaped, s)
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300664 self.assertMatch(u'.%s+.' % re.escape(unichr(0x2620)), s,
665 u(r'x\u2620\u2620\u2620x'), (2, 7), re.search)
Ezio Melotti46645632011-03-25 14:50:52 +0200666
667 def test_re_escape_non_ascii_bytes(self):
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300668 b = b'y\xe2\x98\xa0y\xe2\x98\xa0y'
Ezio Melotti46645632011-03-25 14:50:52 +0200669 b_escaped = re.escape(b)
670 self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
671 self.assertMatch(b_escaped, b)
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300672 res = re.findall(re.escape(b'\xe2\x98\xa0'), b)
Ezio Melotti46645632011-03-25 14:50:52 +0200673 self.assertEqual(len(res), 2)
Guido van Rossum49946571997-07-18 04:26:25 +0000674
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000675 def test_pickling(self):
676 import pickle
Skip Montanaro1e703c62003-04-25 15:40:28 +0000677 self.pickle_test(pickle)
678 import cPickle
679 self.pickle_test(cPickle)
Žiga Seilnacht7492e422007-03-21 20:07:56 +0000680 # old pickles expect the _compile() reconstructor in sre module
Florent Xicluna6257a7b2010-03-31 22:01:03 +0000681 import_module("sre", deprecated=True)
682 from sre import _compile
Serhiy Storchaka038fac62014-09-15 11:35:06 +0300683 # current pickle expects the _compile() reconstructor in re module
684 from re import _compile
Skip Montanaro1e703c62003-04-25 15:40:28 +0000685
686 def pickle_test(self, pickle):
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000687 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
Serhiy Storchaka038fac62014-09-15 11:35:06 +0300688 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
689 pickled = pickle.dumps(oldpat, proto)
690 newpat = pickle.loads(pickled)
691 self.assertEqual(newpat, oldpat)
Guido van Rossum23b22571997-07-17 22:36:14 +0000692
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000693 def test_constants(self):
694 self.assertEqual(re.I, re.IGNORECASE)
695 self.assertEqual(re.L, re.LOCALE)
696 self.assertEqual(re.M, re.MULTILINE)
697 self.assertEqual(re.S, re.DOTALL)
698 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000699
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000700 def test_flags(self):
Skip Montanaro1e703c62003-04-25 15:40:28 +0000701 for flag in [re.I, re.M, re.X, re.S, re.L]:
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300702 self.assertTrue(re.compile('^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +0000703
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000704 def test_sre_character_literals(self):
705 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300706 self.assertTrue(re.match(r"\%03o" % i, chr(i)))
707 self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
708 self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
709 self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
710 self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
711 self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000712 self.assertRaises(re.error, re.match, "\911", "")
713
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000714 def test_sre_character_class_literals(self):
715 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300716 self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
717 self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
718 self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
719 self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
720 self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
721 self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000722 self.assertRaises(re.error, re.match, "[\911]", "")
723
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000724 def test_bug_113254(self):
725 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
726 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
727 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
728
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000729 def test_bug_527371(self):
730 # bug described in patches 527371/672491
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300731 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000732 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
733 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
734 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
735 self.assertEqual(re.match("((a))", "a").lastindex, 1)
736
737 def test_bug_545855(self):
738 # bug 545855 -- This pattern failed to cause a compile error as it
739 # should, instead provoking a TypeError.
740 self.assertRaises(re.error, re.compile, 'foo[a-')
741
742 def test_bug_418626(self):
743 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
744 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
745 # pattern '*?' on a long string.
746 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
747 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
748 20003)
749 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000750 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +0000751 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000752 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000753
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300754 @requires_unicode
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000755 def test_bug_612074(self):
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300756 pat=u"["+re.escape(unichr(0x2039))+u"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000757 self.assertEqual(re.compile(pat) and 1, 1)
758
Skip Montanaro1e703c62003-04-25 15:40:28 +0000759 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000760 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +0000761 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000762 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
763 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
764 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +0000765
Serhiy Storchaka6a8e2b42013-02-16 21:23:01 +0200766 def test_unlimited_zero_width_repeat(self):
767 # Issue #9669
768 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
769 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
770 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
771 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
772 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
773 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
774
Skip Montanaro1e703c62003-04-25 15:40:28 +0000775 def test_scanner(self):
776 def s_ident(scanner, token): return token
777 def s_operator(scanner, token): return "op%s" % token
778 def s_float(scanner, token): return float(token)
779 def s_int(scanner, token): return int(token)
780
781 scanner = Scanner([
782 (r"[a-zA-Z_]\w*", s_ident),
783 (r"\d+\.\d*", s_float),
784 (r"\d+", s_int),
785 (r"=|\+|-|\*|/", s_operator),
786 (r"\s+", None),
787 ])
788
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300789 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000790
Skip Montanaro1e703c62003-04-25 15:40:28 +0000791 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
792 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
793 'op+', 'bar'], ''))
794
Skip Montanaro5ba00542003-04-25 16:00:14 +0000795 def test_bug_448951(self):
796 # bug 448951 (similar to 429357, but with single char match)
797 # (Also test greedy matches.)
798 for op in '','?','*':
799 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
800 (None, None))
801 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
802 ('a:', 'a'))
803
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000804 def test_bug_725106(self):
805 # capturing groups in alternatives in repeats
806 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
807 ('b', 'a'))
808 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
809 ('c', 'b'))
810 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
811 ('b', None))
812 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
813 ('b', None))
814 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
815 ('b', 'a'))
816 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
817 ('c', 'b'))
818 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
819 ('b', None))
820 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
821 ('b', None))
822
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000823 def test_bug_725149(self):
824 # mark_stack_base restoring before restoring marks
825 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
826 ('a', None))
827 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
828 ('a', None, None))
829
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300830 @requires_unicode
Just van Rossum12723ba2003-07-02 20:03:04 +0000831 def test_bug_764548(self):
832 # bug 764548, re.compile() barfs on str/unicode subclasses
Just van Rossum12723ba2003-07-02 20:03:04 +0000833 class my_unicode(unicode): pass
834 pat = re.compile(my_unicode("abc"))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300835 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +0000836
Skip Montanaro5ba00542003-04-25 16:00:14 +0000837 def test_finditer(self):
838 iter = re.finditer(r":+", "a:b::c:::d")
839 self.assertEqual([item.group(0) for item in iter],
840 [":", "::", ":::"])
841
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300842 @requires_unicode
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +0000843 def test_bug_926075(self):
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300844 self.assertIsNot(re.compile('bug_926075'),
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300845 re.compile(u'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +0000846
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300847 @requires_unicode
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000848 def test_bug_931848(self):
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300849 pattern = u(r"[\u002E\u3002\uFF0E\uFF61]")
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000850 self.assertEqual(re.compile(pattern).split("a.b.c"),
851 ['a','b','c'])
852
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000853 def test_bug_581080(self):
854 iter = re.finditer(r"\s", "a b")
855 self.assertEqual(iter.next().span(), (1,2))
856 self.assertRaises(StopIteration, iter.next)
857
858 scanner = re.compile(r"\s").scanner("a b")
859 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300860 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000861
862 def test_bug_817234(self):
863 iter = re.finditer(r".*", "asdf")
864 self.assertEqual(iter.next().span(), (0, 4))
865 self.assertEqual(iter.next().span(), (4, 4))
866 self.assertRaises(StopIteration, iter.next)
867
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300868 @requires_unicode
Mark Dickinsonfe67bd92009-07-28 20:35:03 +0000869 def test_bug_6561(self):
870 # '\d' should match characters in Unicode category 'Nd'
871 # (Number, Decimal Digit), but not those in 'Nl' (Number,
872 # Letter) or 'No' (Number, Other).
873 decimal_digits = [
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300874 unichr(0x0037), # '\N{DIGIT SEVEN}', category 'Nd'
875 unichr(0x0e58), # '\N{THAI DIGIT SIX}', category 'Nd'
876 unichr(0xff10), # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
Mark Dickinsonfe67bd92009-07-28 20:35:03 +0000877 ]
878 for x in decimal_digits:
879 self.assertEqual(re.match('^\d$', x, re.UNICODE).group(0), x)
880
881 not_decimal_digits = [
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300882 unichr(0x2165), # '\N{ROMAN NUMERAL SIX}', category 'Nl'
883 unichr(0x3039), # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
884 unichr(0x2082), # '\N{SUBSCRIPT TWO}', category 'No'
885 unichr(0x32b4), # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
Mark Dickinsonfe67bd92009-07-28 20:35:03 +0000886 ]
887 for x in not_decimal_digits:
888 self.assertIsNone(re.match('^\d$', x, re.UNICODE))
889
Raymond Hettinger01a807d2007-04-02 22:54:21 +0000890 def test_empty_array(self):
891 # SF buf 1647541
892 import array
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300893 typecodes = 'cbBhHiIlLfd'
894 if have_unicode:
895 typecodes += 'u'
896 for typecode in typecodes:
Raymond Hettinger01a807d2007-04-02 22:54:21 +0000897 a = array.array(typecode)
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300898 self.assertIsNone(re.compile("bla").match(a))
Neal Norwitz0d4c06e2007-04-25 06:30:05 +0000899 self.assertEqual(re.compile("").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000900
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300901 @requires_unicode
Guido van Rossumae04c332008-01-03 19:12:44 +0000902 def test_inline_flags(self):
903 # Bug #1700
904 upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow
905 lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow
906
907 p = re.compile(upper_char, re.I | re.U)
908 q = p.match(lower_char)
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300909 self.assertTrue(q)
Guido van Rossumae04c332008-01-03 19:12:44 +0000910
911 p = re.compile(lower_char, re.I | re.U)
912 q = p.match(upper_char)
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300913 self.assertTrue(q)
Guido van Rossumae04c332008-01-03 19:12:44 +0000914
915 p = re.compile('(?i)' + upper_char, re.U)
916 q = p.match(lower_char)
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300917 self.assertTrue(q)
Guido van Rossumae04c332008-01-03 19:12:44 +0000918
919 p = re.compile('(?i)' + lower_char, re.U)
920 q = p.match(upper_char)
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300921 self.assertTrue(q)
Guido van Rossumae04c332008-01-03 19:12:44 +0000922
923 p = re.compile('(?iu)' + upper_char)
924 q = p.match(lower_char)
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300925 self.assertTrue(q)
Guido van Rossumae04c332008-01-03 19:12:44 +0000926
927 p = re.compile('(?iu)' + lower_char)
928 q = p.match(upper_char)
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300929 self.assertTrue(q)
Guido van Rossumae04c332008-01-03 19:12:44 +0000930
Serhiy Storchaka0b5f22d2016-09-11 01:39:51 +0300931 self.assertTrue(re.match('(?ixu) ' + upper_char, lower_char))
932 self.assertTrue(re.match('(?ixu) ' + lower_char, upper_char))
933
Amaury Forgeot d'Arcd08a8eb2008-01-10 21:59:42 +0000934 def test_dollar_matches_twice(self):
935 "$ matches the end of string, and just before the terminating \n"
936 pattern = re.compile('$')
937 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
938 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
939 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
940
941 pattern = re.compile('$', re.MULTILINE)
942 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
943 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
944 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
945
Antoine Pitrouefdddd32010-01-14 17:25:24 +0000946 def test_dealloc(self):
947 # issue 3299: check for segfault in debug build
948 import _sre
Ezio Melotti0e4e7322010-01-23 10:43:05 +0000949 # the overflow limit is different on wide and narrow builds and it
950 # depends on the definition of SRE_CODE (see sre.h).
951 # 2**128 should be big enough to overflow on both. For smaller values
952 # a RuntimeError is raised instead of OverflowError.
953 long_overflow = 2**128
Antoine Pitrouefdddd32010-01-14 17:25:24 +0000954 self.assertRaises(TypeError, re.finditer, "a", {})
955 self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
Guido van Rossumae04c332008-01-03 19:12:44 +0000956
Ezio Melottib56b6ff2012-03-13 01:25:40 +0200957 def test_compile(self):
958 # Test return value when given string and pattern as parameter
959 pattern = re.compile('random pattern')
960 self.assertIsInstance(pattern, re._pattern_type)
961 same_pattern = re.compile(pattern)
962 self.assertIsInstance(same_pattern, re._pattern_type)
963 self.assertIs(same_pattern, pattern)
964 # Test behaviour when not given a string or pattern as parameter
965 self.assertRaises(TypeError, re.compile, 0)
966
Ezio Melotti5c4e32b2013-01-11 08:32:01 +0200967 def test_bug_13899(self):
968 # Issue #13899: re pattern r"[\A]" should work like "A" but matches
969 # nothing. Ditto B and Z.
970 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
971 ['A', 'B', '\b', 'C', 'Z'])
972
Antoine Pitrou735f36e2012-12-03 20:53:12 +0100973 @precisionbigmemtest(size=_2G, memuse=1)
974 def test_large_search(self, size):
975 # Issue #10182: indices were 32-bit-truncated.
976 s = 'a' * size
977 m = re.search('$', s)
978 self.assertIsNotNone(m)
Antoine Pitrou74635c92012-12-03 21:08:43 +0100979 self.assertEqual(m.start(), size)
980 self.assertEqual(m.end(), size)
Antoine Pitrou735f36e2012-12-03 20:53:12 +0100981
Antoine Pitroub83575b2012-12-02 12:52:36 +0100982 # The huge memuse is because of re.sub() using a list and a join()
983 # to create the replacement result.
Antoine Pitrou735f36e2012-12-03 20:53:12 +0100984 @precisionbigmemtest(size=_2G, memuse=16 + 2)
985 def test_large_subn(self, size):
Antoine Pitroub83575b2012-12-02 12:52:36 +0100986 # Issue #10182: indices were 32-bit-truncated.
987 s = 'a' * size
Antoine Pitroub83575b2012-12-02 12:52:36 +0100988 r, n = re.subn('', '', s)
989 self.assertEqual(r, s)
990 self.assertEqual(n, size + 1)
991
992
Serhiy Storchakae18e05c2013-02-16 16:47:15 +0200993 def test_repeat_minmax_overflow(self):
994 # Issue #13169
995 string = "x" * 100000
996 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
997 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
998 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
999 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1000 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1001 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1002 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1003 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1004 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1005 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1006 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1007
1008 @cpython_only
1009 def test_repeat_minmax_overflow_maxrepeat(self):
1010 try:
1011 from _sre import MAXREPEAT
1012 except ImportError:
1013 self.skipTest('requires _sre.MAXREPEAT constant')
1014 string = "x" * 100000
1015 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1016 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1017 (0, 100000))
1018 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1019 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1020 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1021 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1022
R David Murray60773392013-04-14 13:08:50 -04001023 def test_backref_group_name_in_exception(self):
1024 # Issue 17341: Poor error message when compiling invalid regex
1025 with self.assertRaisesRegexp(sre_constants.error, '<foo>'):
1026 re.compile('(?P=<foo>)')
1027
1028 def test_group_name_in_exception(self):
1029 # Issue 17341: Poor error message when compiling invalid regex
1030 with self.assertRaisesRegexp(sre_constants.error, '\?foo'):
1031 re.compile('(?P<?foo>)')
1032
Serhiy Storchaka3ade66c2013-08-03 19:26:33 +03001033 def test_issue17998(self):
1034 for reps in '*', '+', '?', '{1}':
1035 for mod in '', '?':
1036 pattern = '.' + reps + mod + 'yz'
1037 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1038 ['xyz'], msg=pattern)
Serhiy Storchaka7644ff12014-09-14 17:40:44 +03001039 if have_unicode:
1040 pattern = unicode(pattern)
1041 self.assertEqual(re.compile(pattern, re.S).findall(u'xyz'),
1042 [u'xyz'], msg=pattern)
Serhiy Storchaka3ade66c2013-08-03 19:26:33 +03001043
Serhiy Storchakae18e05c2013-02-16 16:47:15 +02001044
Serhiy Storchaka83737c62013-08-19 23:20:07 +03001045 def test_bug_2537(self):
1046 # issue 2537: empty submatches
1047 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1048 for inner_op in ('{0,}', '*', '?'):
1049 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1050 m = r.match("xyyzy")
1051 self.assertEqual(m.group(0), "xyy")
1052 self.assertEqual(m.group(1), "")
1053 self.assertEqual(m.group(2), "y")
1054
    def test_debug_flag(self):
        # Compiling with re.DEBUG prints a dump of the parsed pattern to
        # stdout; capture it and compare with the expected text.
        pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
        with captured_stdout() as out:
            re.compile(pat, re.DEBUG)
        # NOTE(review): the leading whitespace of the expected dump below
        # was not recoverable from the source as reviewed; it has been
        # re-indented two spaces per nesting level -- confirm against the
        # real output before relying on it.
        dump = '''\
subpattern 1
  literal 46
subpattern None
  branch
    in
      literal 99
      literal 104
  or
    literal 112
    literal 121
subpattern None
  groupref_exists 1
    at at_end
  else
    literal 58
    literal 32
'''
        self.assertEqual(out.getvalue(), dump)
        # Debug output is output again even a second time (bypassing
        # the cache -- issue #20426).
        with captured_stdout() as out:
            re.compile(pat, re.DEBUG)
        self.assertEqual(out.getvalue(), dump)
Antoine Pitrouf5814112014-02-03 20:59:59 +01001083
Serhiy Storchakae50fe4c2014-03-06 12:24:29 +02001084 def test_keyword_parameters(self):
1085 # Issue #20283: Accepting the string keyword parameter.
1086 pat = re.compile(r'(ab)')
1087 self.assertEqual(
1088 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1089 self.assertEqual(
1090 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1091 self.assertEqual(
1092 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1093 self.assertEqual(
1094 pat.split(string='abracadabra', maxsplit=1),
1095 ['', 'ab', 'racadabra'])
1096
Benjamin Petersonbc4ece52014-09-30 22:04:28 -04001097 def test_match_group_takes_long(self):
1098 self.assertEqual(re.match("(foo)", "foo").group(1L), "foo")
1099 self.assertRaises(IndexError, re.match("", "").group, sys.maxint + 1)
1100
Serhiy Storchakad4c72902014-10-31 00:53:19 +02001101 def test_locale_caching(self):
1102 # Issue #22410
1103 oldlocale = locale.setlocale(locale.LC_CTYPE)
1104 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1105 for loc in 'en_US.iso88591', 'en_US.utf8':
1106 try:
1107 locale.setlocale(locale.LC_CTYPE, loc)
1108 except locale.Error:
1109 # Unsupported locale on this system
1110 self.skipTest('test needs %s locale' % loc)
1111
1112 re.purge()
1113 self.check_en_US_iso88591()
1114 self.check_en_US_utf8()
1115 re.purge()
1116 self.check_en_US_utf8()
1117 self.check_en_US_iso88591()
1118
1119 def check_en_US_iso88591(self):
1120 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
1121 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1122 self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))
1123 self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I))
1124 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1125 self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5'))
1126 self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5'))
1127
1128 def check_en_US_utf8(self):
1129 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
1130 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1131 self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I))
1132 self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I))
1133 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1134 self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
1135 self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
1136
Antoine Pitrouf5814112014-02-03 20:59:59 +01001137
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001138def run_re_tests():
Georg Brandla4f46e12010-02-07 17:03:15 +00001139 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001140 if verbose:
1141 print 'Running re_tests test suite'
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001142 else:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001143 # To save time, only run the first and last 10 tests
1144 #tests = tests[:10] + tests[-10:]
1145 pass
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001146
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001147 for t in tests:
1148 sys.stdout.flush()
1149 pattern = s = outcome = repl = expected = None
1150 if len(t) == 5:
1151 pattern, s, outcome, repl, expected = t
1152 elif len(t) == 3:
1153 pattern, s, outcome = t
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001154 else:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001155 raise ValueError, ('Test tuples should have 3 or 5 fields', t)
1156
Guido van Rossum41360a41998-03-26 19:42:58 +00001157 try:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001158 obj = re.compile(pattern)
1159 except re.error:
1160 if outcome == SYNTAX_ERROR: pass # Expected a syntax error
Guido van Rossum41360a41998-03-26 19:42:58 +00001161 else:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001162 print '=== Syntax error:', t
1163 except KeyboardInterrupt: raise KeyboardInterrupt
1164 except:
1165 print '*** Unexpected error ***', t
1166 if verbose:
1167 traceback.print_exc(file=sys.stdout)
1168 else:
Fredrik Lundh17741be2001-03-22 15:51:28 +00001169 try:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001170 result = obj.search(s)
1171 except re.error, msg:
1172 print '=== Unexpected exception', t, repr(msg)
1173 if outcome == SYNTAX_ERROR:
1174 # This should have been a syntax error; forget it.
1175 pass
1176 elif outcome == FAIL:
1177 if result is None: pass # No match, as expected
1178 else: print '=== Succeeded incorrectly', t
1179 elif outcome == SUCCEED:
1180 if result is not None:
1181 # Matched, as expected, so now we compute the
1182 # result string and compare it to our expected result.
1183 start, end = result.span(0)
1184 vardict={'found': result.group(0),
1185 'groups': result.group(),
1186 'flags': result.re.flags}
1187 for i in range(1, 100):
1188 try:
1189 gi = result.group(i)
1190 # Special hack because else the string concat fails:
1191 if gi is None:
1192 gi = "None"
1193 except IndexError:
1194 gi = "Error"
1195 vardict['g%d' % i] = gi
1196 for i in result.re.groupindex.keys():
1197 try:
1198 gi = result.group(i)
1199 if gi is None:
1200 gi = "None"
1201 except IndexError:
1202 gi = "Error"
1203 vardict[i] = gi
1204 repl = eval(repl, vardict)
1205 if repl != expected:
1206 print '=== grouping error', t,
1207 print repr(repl) + ' should be ' + repr(expected)
1208 else:
1209 print '=== Failed incorrectly', t
1210
1211 # Try the match on a unicode string, and check that it
1212 # still succeeds.
1213 try:
1214 result = obj.search(unicode(s, "latin-1"))
1215 if result is None:
1216 print '=== Fails on unicode match', t
1217 except NameError:
1218 continue # 1.5.2
1219 except TypeError:
1220 continue # unicode test case
1221
1222 # Try the match on a unicode pattern, and check that it
1223 # still succeeds.
1224 obj=re.compile(unicode(pattern, "latin-1"))
1225 result = obj.search(s)
Fredrik Lundh17741be2001-03-22 15:51:28 +00001226 if result is None:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001227 print '=== Fails on unicode pattern match', t
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001228
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001229 # Try the match with the search area limited to the extent
1230 # of the match and see if it still succeeds. \B will
1231 # break (because it won't match at the end or start of a
1232 # string), so we'll ignore patterns that feature it.
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001233
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001234 if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \
1235 and result is not None:
1236 obj = re.compile(pattern)
1237 result = obj.search(s, result.start(0), result.end(0) + 1)
1238 if result is None:
1239 print '=== Failed on range-limited match', t
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001240
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001241 # Try the match with IGNORECASE enabled, and check that it
1242 # still succeeds.
1243 obj = re.compile(pattern, re.IGNORECASE)
1244 result = obj.search(s)
Fred Drake132dce22000-12-12 23:11:42 +00001245 if result is None:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001246 print '=== Fails on case-insensitive match', t
Guido van Rossumdfa67901997-12-08 17:12:06 +00001247
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001248 # Try the match with LOCALE enabled, and check that it
1249 # still succeeds.
1250 obj = re.compile(pattern, re.LOCALE)
1251 result = obj.search(s)
1252 if result is None:
1253 print '=== Fails on locale-sensitive match', t
Guido van Rossumdfa67901997-12-08 17:12:06 +00001254
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001255 # Try the match with UNICODE locale enabled, and check
1256 # that it still succeeds.
1257 obj = re.compile(pattern, re.UNICODE)
1258 result = obj.search(s)
1259 if result is None:
1260 print '=== Fails on unicode-sensitive match', t
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001261
def test_main():
    """Run the ReTests unittest suite, then the run_re_tests() checks."""
    run_unittest(ReTests)
    run_re_tests()
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001265
# Run the whole module when it is executed directly as a script.
if __name__ == "__main__":
    test_main()