blob: 588042a85966eae952a64b57bc1f58a4ce25a451 [file] [log] [blame]
Serhiy Storchakae9277572014-11-10 12:37:02 +02001# -*- coding: utf-8 -*-
Serhiy Storchaka4809d1f2015-02-21 12:08:36 +02002from test.test_support import (
3 verbose, run_unittest, import_module,
4 precisionbigmemtest, _2G, cpython_only,
5 captured_stdout, have_unicode, requires_unicode, u,
6 check_warnings)
Serhiy Storchakad4c72902014-10-31 00:53:19 +02007import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00008import re
Neal Norwitz94a9c092006-03-16 06:30:02 +00009from re import Scanner
R David Murray60773392013-04-14 13:08:50 -040010import sre_constants
Ezio Melotti46645632011-03-25 14:50:52 +020011import sys
12import string
13import traceback
Raymond Hettinger027bb632004-05-31 03:09:25 +000014from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000015
Antoine Pitrou735f36e2012-12-03 20:53:12 +010016
# Misc tests from Tim Peters' re.doc

# WARNING: Don't change details in these tests if you don't know
# what you're doing. Some of these tests were carefully modeled to
# cover most of the code.
22
Skip Montanaro8ed06da2003-04-24 19:43:18 +000023import unittest
Guido van Rossum8430c581998-04-03 21:47:12 +000024
Skip Montanaro8ed06da2003-04-24 19:43:18 +000025class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000026
27 def test_weakref(self):
28 s = 'QabbbcR'
29 x = re.compile('ab+c')
30 y = proxy(x)
31 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
32
Skip Montanaro8ed06da2003-04-24 19:43:18 +000033 def test_search_star_plus(self):
34 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
35 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
36 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
37 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +030038 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000039 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
40 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
41 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
42 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +030043 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000044
def bump_num(self, matchobj):
    # Replacement callback for re.sub(): interpret the whole match as a
    # decimal integer and return that value plus one, as a string.
    int_value = int(matchobj.group(0))
    return str(int_value + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000048
def test_basic_re_sub(self):
    # Inline flags, callable replacements and the optional count.
    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                     '9.3 -3 23x99y')

    # A callable's return value is used verbatim; a string template
    # has its escapes processed.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    # Named and numbered \g<...> references in the template.  Raw
    # strings are used so the backslash is never left to the mercy of
    # Python's own string-escape rules.
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'),
                     'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    # Escapes in the template: character escapes are translated, the
    # remaining ones are expected to be kept as backslash + letter.
    self.assertEqual(
        re.sub('a', r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D', 'a'),
        '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))

    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +000076
Skip Montanaro2726fcd2003-04-25 14:31:54 +000077 def test_bug_449964(self):
78 # fails for group followed by other escape
79 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
80 'xx\bxx\b')
81
82 def test_bug_449000(self):
83 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +000084 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
85 'abc\ndef\n')
86 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
87 'abc\ndef\n')
88 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
89 'abc\ndef\n')
90 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
91 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +000092
@requires_unicode
def test_bug_1140(self):
    # re.sub(x, y, u'') should return u'', not '', and
    # re.sub(x, y, '') should return '', not u''.
    # Also:
    # re.sub(x, y, unicode(x)) should return unicode(y), and
    # re.sub(x, y, str(x)) should return
    #     str(y) if isinstance(y, str) else unicode(y).
    for x in 'x', u'x':
        for y in 'y', u'y':
            z = re.sub(x, y, u'')
            self.assertEqual(z, u'')
            self.assertEqual(type(z), unicode)
            #
            z = re.sub(x, y, '')
            self.assertEqual(z, '')
            self.assertEqual(type(z), str)
            #
            z = re.sub(x, y, unicode(x))
            self.assertEqual(z, y)
            self.assertEqual(type(z), unicode)
            #
            z = re.sub(x, y, str(x))
            self.assertEqual(z, y)
            self.assertEqual(type(z), type(y))
118
Raymond Hettinger80016c92007-12-19 18:13:31 +0000119 def test_bug_1661(self):
120 # Verify that flags do not get silently ignored with compiled patterns
121 pattern = re.compile('.')
122 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
123 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
124 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
125 self.assertRaises(ValueError, re.compile, pattern, re.I)
126
Guido van Rossume3c4fd92008-09-10 14:27:00 +0000127 def test_bug_3629(self):
128 # A regex that triggered a bug in the sre-code validator
129 re.compile("(?P<quote>)(?(quote))")
130
def test_sub_template_numeric_escape(self):
    # bug 776311 and friends
    # Octal escapes in a replacement template.
    self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
    self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
    self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
    self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
    self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
    self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
    self.assertEqual(re.sub('x', r'\117', 'x'), '\117')

    self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
    self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')

    self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
    self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
    self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
    self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
    self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')

    self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
    self.assertEqual(re.sub('x', r'\777', 'x'), '\377')

    # The pattern 'x' has no groups, so anything parsed as a group
    # reference must be rejected.
    self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\118', 'x')  # r'\11' + '8'
    self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\181', 'x')  # r'\18' + '1'
    self.assertRaises(re.error, re.sub, 'x', r'\800', 'x')  # r'\80' + '0'

    # in python2.3 (etc), these loop endlessly in sre_parser.py
    self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
    self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
                     'xz8')
    self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
                     'xza')
172
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000173 def test_qualified_re_sub(self):
174 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
175 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000176
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000177 def test_bug_114660(self):
178 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
179 'hello there')
180
def test_bug_462270(self):
    # Test for empty sub() behaviour, see SF bug #462270
    self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
    self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
185
Ezio Melottief317382012-11-03 20:31:12 +0200186 def test_symbolic_groups(self):
187 re.compile('(?P<a>x)(?P=a)(?(a)y)')
188 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
189 self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
190 self.assertRaises(re.error, re.compile, '(?Px)')
191 self.assertRaises(re.error, re.compile, '(?P=)')
192 self.assertRaises(re.error, re.compile, '(?P=1)')
193 self.assertRaises(re.error, re.compile, '(?P=a)')
194 self.assertRaises(re.error, re.compile, '(?P=a1)')
195 self.assertRaises(re.error, re.compile, '(?P=a.)')
196 self.assertRaises(re.error, re.compile, '(?P<)')
197 self.assertRaises(re.error, re.compile, '(?P<>)')
198 self.assertRaises(re.error, re.compile, '(?P<1>)')
199 self.assertRaises(re.error, re.compile, '(?P<a.>)')
200 self.assertRaises(re.error, re.compile, '(?())')
201 self.assertRaises(re.error, re.compile, '(?(a))')
202 self.assertRaises(re.error, re.compile, '(?(1a))')
203 self.assertRaises(re.error, re.compile, '(?(a.))')
204
def test_symbolic_refs(self):
    # Malformed or unusable group references in a sub() template.
    # Templates are raw strings so the backslash always reaches re.
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<a', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<a a>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<1a1>', 'xx')
    # A syntactically valid but unknown name is an IndexError.
    self.assertRaises(IndexError, re.sub, '(?P<a>x)', r'\g<ab>', 'xx')
    # References to a group that did not take part in the match.
    self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', r'\g<b>',
                      'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', r'\2', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<-1>', 'xx')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000216
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000217 def test_re_subn(self):
218 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
219 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
220 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
221 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
222 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000223
def test_re_split(self):
    # split() with and without capturing groups; captured separators
    # are included in the result list.
    self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
    self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
    self.assertEqual(re.split("(:*)", ":a:b::c"),
                     ['', ':', 'a', ':', 'b', '::', 'c'])
    self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
    self.assertEqual(re.split("(:)*", ":a:b::c"),
                     ['', ':', 'a', ':', 'b', ':', 'c'])
    self.assertEqual(re.split("([b:]+)", ":a:b::c"),
                     ['', ':', 'a', ':b::', 'c'])
    self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
                     ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c'])
    self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
                     ['', 'a', '', '', 'c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000239
def test_qualified_re_split(self):
    # split() with maxsplit=2: the remainder of the string is returned
    # unsplit as the last element.
    self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
    self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
    self.assertEqual(re.split("(:)", ":a:b::c", 2),
                     ['', ':', 'a', ':', 'b::c'])
    self.assertEqual(re.split("(:*)", ":a:b::c", 2),
                     ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000247
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000248 def test_re_findall(self):
249 self.assertEqual(re.findall(":+", "abc"), [])
250 self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
251 self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
252 self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""),
253 (":", ":"),
254 (":", "::")])
Guido van Rossum49946571997-07-18 04:26:25 +0000255
Skip Montanaro5ba00542003-04-25 16:00:14 +0000256 def test_bug_117612(self):
257 self.assertEqual(re.findall(r"(a|(b))", "aba"),
258 [("a", ""),("b", "b"),("a", "")])
259
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000260 def test_re_match(self):
Skip Montanaro5ba00542003-04-25 16:00:14 +0000261 self.assertEqual(re.match('a', 'a').groups(), ())
262 self.assertEqual(re.match('(a)', 'a').groups(), ('a',))
263 self.assertEqual(re.match(r'(a)', 'a').group(0), 'a')
264 self.assertEqual(re.match(r'(a)', 'a').group(1), 'a')
265 self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a'))
Guido van Rossum49946571997-07-18 04:26:25 +0000266
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000267 pat = re.compile('((a)|(b))(c)?')
268 self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
269 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
270 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
271 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
272 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
Guido van Rossum8430c581998-04-03 21:47:12 +0000273
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000274 # A single group
275 m = re.match('(a)', 'a')
276 self.assertEqual(m.group(0), 'a')
277 self.assertEqual(m.group(0), 'a')
278 self.assertEqual(m.group(1), 'a')
279 self.assertEqual(m.group(1, 1), ('a', 'a'))
Guido van Rossum49946571997-07-18 04:26:25 +0000280
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000281 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
282 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
283 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
284 (None, 'b', None))
285 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000286
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000287 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000288 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
289 ('(', 'a'))
290 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
291 (None, 'a'))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300292 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
293 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000294 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
295 ('a', 'b'))
296 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
297 (None, 'd'))
298 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
299 (None, 'd'))
300 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
301 ('a', ''))
302
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000303 # Tests for bug #1177831: exercise groups other than the first group
304 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
305 self.assertEqual(p.match('abc').groups(),
306 ('a', 'b', 'c'))
307 self.assertEqual(p.match('ad').groups(),
308 ('a', None, 'd'))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300309 self.assertIsNone(p.match('abd'))
310 self.assertIsNone(p.match('ac'))
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000311
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000312
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000313 def test_re_groupref(self):
314 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
315 ('|', 'a'))
316 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
317 (None, 'a'))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300318 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
319 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000320 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
321 ('a', 'a'))
322 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
323 (None, None))
324
325 def test_groupdict(self):
326 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
327 'first second').groupdict(),
328 {'first':'first', 'second':'second'})
329
330 def test_expand(self):
331 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
332 "first second")
333 .expand(r"\2 \1 \g<second> \g<first>"),
334 "second first second first")
335
336 def test_repeat_minmax(self):
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300337 self.assertIsNone(re.match("^(\w){1}$", "abc"))
338 self.assertIsNone(re.match("^(\w){1}?$", "abc"))
339 self.assertIsNone(re.match("^(\w){1,2}$", "abc"))
340 self.assertIsNone(re.match("^(\w){1,2}?$", "abc"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000341
342 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
343 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
344 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
345 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
346 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
347 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
348 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
349 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
350
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300351 self.assertIsNone(re.match("^x{1}$", "xxx"))
352 self.assertIsNone(re.match("^x{1}?$", "xxx"))
353 self.assertIsNone(re.match("^x{1,2}$", "xxx"))
354 self.assertIsNone(re.match("^x{1,2}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000355
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300356 self.assertTrue(re.match("^x{3}$", "xxx"))
357 self.assertTrue(re.match("^x{1,3}$", "xxx"))
358 self.assertTrue(re.match("^x{1,4}$", "xxx"))
359 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
360 self.assertTrue(re.match("^x{3}?$", "xxx"))
361 self.assertTrue(re.match("^x{1,3}?$", "xxx"))
362 self.assertTrue(re.match("^x{1,4}?$", "xxx"))
363 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000364
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300365 self.assertIsNone(re.match("^x{}$", "xxx"))
366 self.assertTrue(re.match("^x{}$", "x{}"))
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000367
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000368 def test_getattr(self):
369 self.assertEqual(re.match("(a)", "a").pos, 0)
370 self.assertEqual(re.match("(a)", "a").endpos, 1)
371 self.assertEqual(re.match("(a)", "a").string, "a")
372 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300373 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000374
def test_special_escapes(self):
    # \b and \B with default, LOCALE and (if available) UNICODE flags.
    self.assertEqual(re.search(r"\b(b.)\b",
                               "abcd abc bcd bx").group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B",
                               "abc bcd bc abxd").group(1), "bx")
    self.assertEqual(re.search(r"\b(b.)\b",
                               "abcd abc bcd bx", re.LOCALE).group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B",
                               "abc bcd bc abxd", re.LOCALE).group(1), "bx")
    if have_unicode:
        self.assertEqual(re.search(r"\b(b.)\b",
                                   "abcd abc bcd bx", re.UNICODE).group(1), "bx")
        self.assertEqual(re.search(r"\B(b.)\B",
                                   "abc bcd bc abxd", re.UNICODE).group(1), "bx")
    # ^/$ under MULTILINE versus \A/\Z.
    self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
    # The same checks with unicode subject strings.
    self.assertEqual(re.search(r"\b(b.)\b",
                               u"abcd abc bcd bx").group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B",
                               u"abc bcd bc abxd").group(1), "bx")
    self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M))
    # Character-class escapes.
    self.assertEqual(re.search(r"\d\D\w\W\s\S",
                               "1aa! a").group(0), "1aa! a")
    self.assertEqual(re.search(r"\d\D\w\W\s\S",
                               "1aa! a", re.LOCALE).group(0), "1aa! a")
    if have_unicode:
        self.assertEqual(re.search(r"\d\D\w\W\s\S",
                                   "1aa! a", re.UNICODE).group(0), "1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000406
Ezio Melotti38ae5b22012-02-29 11:40:00 +0200407 def test_string_boundaries(self):
408 # See http://bugs.python.org/issue10713
409 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
410 "abc")
411 # There's a word boundary at the start of a string.
412 self.assertTrue(re.match(r"\b", "abc"))
413 # A non-empty string includes a non-boundary zero-length match.
414 self.assertTrue(re.search(r"\B", "abc"))
415 # There is no non-boundary match at the start of a string.
416 self.assertFalse(re.match(r"\B", "abc"))
417 # However, an empty string contains no word boundaries, and also no
418 # non-boundaries.
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300419 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti38ae5b22012-02-29 11:40:00 +0200420 # This one is questionable and different from the perlre behaviour,
421 # but describes current behavior.
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300422 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti38ae5b22012-02-29 11:40:00 +0200423 # A single word-character string has two boundaries, but no
424 # non-boundary gaps.
425 self.assertEqual(len(re.findall(r"\b", "a")), 2)
426 self.assertEqual(len(re.findall(r"\B", "a")), 0)
427 # If there are no words, there are no boundaries
428 self.assertEqual(len(re.findall(r"\b", " ")), 0)
429 self.assertEqual(len(re.findall(r"\b", " ")), 0)
430 # Can match around the whitespace.
431 self.assertEqual(len(re.findall(r"\B", " ")), 2)
432
@requires_unicode
def test_bigcharset(self):
    # Character sets containing code points above 255.
    self.assertEqual(re.match(u(r"([\u2222\u2223])"),
                              unichr(0x2222)).group(1), unichr(0x2222))
    self.assertEqual(re.match(u(r"([\u2222\u2223])"),
                              unichr(0x2222), re.UNICODE).group(1),
                     unichr(0x2222))
    # A set built from every 255th code point in [256, 2**16).
    r = u'[%s]' % u''.join(map(unichr, range(256, 2**16, 255)))
    self.assertEqual(re.match(r, unichr(0xff01), re.UNICODE).group(),
                     unichr(0xff01))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000441
Antoine Pitroub83ea142012-11-20 22:30:42 +0100442 def test_big_codesize(self):
443 # Issue #1160
444 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300445 self.assertTrue(r.match('1000'))
446 self.assertTrue(r.match('9999'))
Antoine Pitroub83ea142012-11-20 22:30:42 +0100447
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000448 def test_anyall(self):
449 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
450 "a\nb")
451 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
452 "a\n\nb")
453
Serhiy Storchaka4809d1f2015-02-21 12:08:36 +0200454 def test_lookahead(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000455 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
456 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
457 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
458 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
459 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
460 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
461 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
462
463 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
464 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
465 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
466 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
467
Serhiy Storchaka4809d1f2015-02-21 12:08:36 +0200468 # Group reference.
469 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
470 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
471 # Named group reference.
472 self.assertTrue(re.match(r'(?P<g>a)b(?=(?P=g))a', 'aba'))
473 self.assertIsNone(re.match(r'(?P<g>a)b(?=(?P=g))c', 'abac'))
474 # Conditional group reference.
475 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
476 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
477 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
478 self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
479 self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
480 # Group used before defined.
481 self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
482 self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
483 self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
484
def test_lookbehind(self):
    # Plain positive and negative lookbehind.
    self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
    self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
    self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
    self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
    # Group references inside a lookbehind are expected to emit a
    # RuntimeWarning at compile time.
    # Group reference.
    with check_warnings(('', RuntimeWarning)):
        re.compile(r'(a)a(?<=\1)c')
    # Named group reference.
    with check_warnings(('', RuntimeWarning)):
        re.compile(r'(?P<g>a)a(?<=(?P=g))c')
    # Conditional group reference.
    with check_warnings(('', RuntimeWarning)):
        re.compile(r'(a)b(?<=(?(1)b|x))c')
    # Group used before defined.
    with check_warnings(('', RuntimeWarning)):
        re.compile(r'(a)b(?<=(?(2)b|x))(c)')
502
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000503 def test_ignore_case(self):
Georg Brandl30de77b2008-08-24 18:11:07 +0000504 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
505 self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000506 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
507 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
508 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
509 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
510 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
511 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
512 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
513 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
514
Serhiy Storchakae9277572014-11-10 12:37:02 +0200515 if have_unicode:
516 assert u(r'\u212a').lower() == u'k' # 'K'
517 self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I))
518 self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I))
519 self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
520 self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
521 assert u(r'\u017f').upper() == u'S' # 'ſ'
522 self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I))
523 self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I))
524 self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
525 self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I))
526
527 def test_ignore_case_set(self):
528 self.assertTrue(re.match(r'[19A]', 'A', re.I))
529 self.assertTrue(re.match(r'[19a]', 'a', re.I))
530 self.assertTrue(re.match(r'[19a]', 'A', re.I))
531 self.assertTrue(re.match(r'[19A]', 'a', re.I))
532 if have_unicode:
533 self.assertTrue(re.match(ur'[19A]', u'A', re.U | re.I))
534 self.assertTrue(re.match(ur'[19a]', u'a', re.U | re.I))
535 self.assertTrue(re.match(ur'[19a]', u'A', re.U | re.I))
536 self.assertTrue(re.match(ur'[19A]', u'a', re.U | re.I))
537 assert u(r'\u212a').lower() == u'k' # 'K'
538 self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I))
539 self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I))
540 self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I))
541 self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I))
542 assert u(r'\u017f').upper() == u'S' # 'ſ'
543 self.assertTrue(re.match(ur'[19S]', u(r'\u017f'), re.U | re.I))
544 self.assertTrue(re.match(ur'[19s]', u(r'\u017f'), re.U | re.I))
545 self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I))
546 self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I))
547
Serhiy Storchakae9e54ae2014-10-31 13:53:21 +0200548 def test_ignore_case_range(self):
549 # Issues #3511, #17381.
550 self.assertTrue(re.match(r'[9-a]', '_', re.I))
551 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
552 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
553 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
554 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7',re.I))
555 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
556 if have_unicode:
557 self.assertTrue(re.match(u(r'[9-a]'), u(r'_'), re.U | re.I))
558 self.assertIsNone(re.match(u(r'[9-A]'), u(r'_'), re.U | re.I))
559 self.assertTrue(re.match(u(r'[\xc0-\xde]'),
560 u(r'\xd7'), re.U | re.I))
561 self.assertIsNone(re.match(u(r'[\xc0-\xde]'),
562 u(r'\xf7'), re.U | re.I))
563 self.assertTrue(re.match(u(r'[\xe0-\xfe]'),
564 u(r'\xf7'), re.U | re.I))
565 self.assertIsNone(re.match(u(r'[\xe0-\xfe]'),
566 u(r'\xd7'), re.U | re.I))
567 self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
568 u(r'\u0450'), re.U | re.I))
569 self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
570 u(r'\u0400'), re.U | re.I))
571 self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
572 u(r'\u0450'), re.U | re.I))
573 self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
574 u(r'\u0400'), re.U | re.I))
575 if sys.maxunicode > 0xffff:
576 self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
577 u(r'\U00010428'), re.U | re.I))
578 self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
579 u(r'\U00010400'), re.U | re.I))
580 self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
581 u(r'\U00010428'), re.U | re.I))
582 self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
583 u(r'\U00010400'), re.U | re.I))
584
Serhiy Storchakae9277572014-11-10 12:37:02 +0200585 assert u(r'\u212a').lower() == u'k' # 'K'
586 self.assertTrue(re.match(ur'[J-M]', u(r'\u212a'), re.U | re.I))
587 self.assertTrue(re.match(ur'[j-m]', u(r'\u212a'), re.U | re.I))
588 self.assertTrue(re.match(u(r'[\u2129-\u212b]'), u'K', re.U | re.I))
589 self.assertTrue(re.match(u(r'[\u2129-\u212b]'), u'k', re.U | re.I))
590 assert u(r'\u017f').upper() == u'S' # 'ſ'
591 self.assertTrue(re.match(ur'[R-T]', u(r'\u017f'), re.U | re.I))
592 self.assertTrue(re.match(ur'[r-t]', u(r'\u017f'), re.U | re.I))
593 self.assertTrue(re.match(u(r'[\u017e-\u0180]'), u'S', re.U | re.I))
594 self.assertTrue(re.match(u(r'[\u017e-\u0180]'), u's', re.U | re.I))
595
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000596 def test_category(self):
597 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
598
599 def test_getlower(self):
600 import _sre
601 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
602 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300603 if have_unicode:
604 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000605
606 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
607 self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
608
609 def test_not_literal(self):
610 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
611 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
612
613 def test_search_coverage(self):
614 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
615 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
616
Ezio Melotti46645632011-03-25 14:50:52 +0200617 def assertMatch(self, pattern, text, match=None, span=None,
618 matcher=re.match):
619 if match is None and span is None:
620 # the pattern matches the whole text
621 match = text
622 span = (0, len(text))
623 elif match is None or span is None:
624 raise ValueError('If match is not None, span should be specified '
625 '(and vice versa).')
626 m = matcher(pattern, text)
627 self.assertTrue(m)
628 self.assertEqual(m.group(), match)
629 self.assertEqual(m.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000630
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300631 @requires_unicode
Ezio Melotti46645632011-03-25 14:50:52 +0200632 def test_re_escape(self):
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300633 alnum_chars = unicode(string.ascii_letters + string.digits)
Ezio Melotti46645632011-03-25 14:50:52 +0200634 p = u''.join(unichr(i) for i in range(256))
635 for c in p:
636 if c in alnum_chars:
637 self.assertEqual(re.escape(c), c)
638 elif c == u'\x00':
639 self.assertEqual(re.escape(c), u'\\000')
640 else:
641 self.assertEqual(re.escape(c), u'\\' + c)
642 self.assertMatch(re.escape(c), c)
643 self.assertMatch(re.escape(p), p)
644
645 def test_re_escape_byte(self):
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300646 alnum_chars = string.ascii_letters + string.digits
Ezio Melotti46645632011-03-25 14:50:52 +0200647 p = ''.join(chr(i) for i in range(256))
648 for b in p:
649 if b in alnum_chars:
650 self.assertEqual(re.escape(b), b)
651 elif b == b'\x00':
652 self.assertEqual(re.escape(b), b'\\000')
653 else:
654 self.assertEqual(re.escape(b), b'\\' + b)
655 self.assertMatch(re.escape(b), b)
656 self.assertMatch(re.escape(p), p)
657
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300658 @requires_unicode
Ezio Melotti46645632011-03-25 14:50:52 +0200659 def test_re_escape_non_ascii(self):
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300660 s = u(r'xxx\u2620\u2620\u2620xxx')
Ezio Melotti46645632011-03-25 14:50:52 +0200661 s_escaped = re.escape(s)
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300662 self.assertEqual(s_escaped, u(r'xxx\\\u2620\\\u2620\\\u2620xxx'))
Ezio Melotti46645632011-03-25 14:50:52 +0200663 self.assertMatch(s_escaped, s)
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300664 self.assertMatch(u'.%s+.' % re.escape(unichr(0x2620)), s,
665 u(r'x\u2620\u2620\u2620x'), (2, 7), re.search)
Ezio Melotti46645632011-03-25 14:50:52 +0200666
667 def test_re_escape_non_ascii_bytes(self):
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300668 b = b'y\xe2\x98\xa0y\xe2\x98\xa0y'
Ezio Melotti46645632011-03-25 14:50:52 +0200669 b_escaped = re.escape(b)
670 self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
671 self.assertMatch(b_escaped, b)
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300672 res = re.findall(re.escape(b'\xe2\x98\xa0'), b)
Ezio Melotti46645632011-03-25 14:50:52 +0200673 self.assertEqual(len(res), 2)
Guido van Rossum49946571997-07-18 04:26:25 +0000674
    def test_pickling(self):
        # Round-trip a compiled pattern through both pickle
        # implementations, then check that the _compile() reconstructor
        # can be imported from the modules named below.
        import pickle
        self.pickle_test(pickle)
        import cPickle
        self.pickle_test(cPickle)
        # old pickles expect the _compile() reconstructor in sre module
        import_module("sre", deprecated=True)
        from sre import _compile
        # current pickle expects the _compile() reconstructor in re module
        from re import _compile
Skip Montanaro1e703c62003-04-25 15:40:28 +0000685
686 def pickle_test(self, pickle):
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000687 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
Serhiy Storchaka038fac62014-09-15 11:35:06 +0300688 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
689 pickled = pickle.dumps(oldpat, proto)
690 newpat = pickle.loads(pickled)
691 self.assertEqual(newpat, oldpat)
Guido van Rossum23b22571997-07-17 22:36:14 +0000692
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000693 def test_constants(self):
694 self.assertEqual(re.I, re.IGNORECASE)
695 self.assertEqual(re.L, re.LOCALE)
696 self.assertEqual(re.M, re.MULTILINE)
697 self.assertEqual(re.S, re.DOTALL)
698 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000699
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000700 def test_flags(self):
Skip Montanaro1e703c62003-04-25 15:40:28 +0000701 for flag in [re.I, re.M, re.X, re.S, re.L]:
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300702 self.assertTrue(re.compile('^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +0000703
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000704 def test_sre_character_literals(self):
705 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300706 self.assertTrue(re.match(r"\%03o" % i, chr(i)))
707 self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
708 self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
709 self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
710 self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
711 self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000712 self.assertRaises(re.error, re.match, "\911", "")
713
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000714 def test_sre_character_class_literals(self):
715 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300716 self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
717 self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
718 self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
719 self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
720 self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
721 self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000722 self.assertRaises(re.error, re.match, "[\911]", "")
723
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000724 def test_bug_113254(self):
725 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
726 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
727 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
728
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000729 def test_bug_527371(self):
730 # bug described in patches 527371/672491
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300731 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000732 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
733 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
734 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
735 self.assertEqual(re.match("((a))", "a").lastindex, 1)
736
737 def test_bug_545855(self):
738 # bug 545855 -- This pattern failed to cause a compile error as it
739 # should, instead provoking a TypeError.
740 self.assertRaises(re.error, re.compile, 'foo[a-')
741
742 def test_bug_418626(self):
743 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
744 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
745 # pattern '*?' on a long string.
746 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
747 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
748 20003)
749 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000750 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +0000751 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000752 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000753
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300754 @requires_unicode
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000755 def test_bug_612074(self):
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300756 pat=u"["+re.escape(unichr(0x2039))+u"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000757 self.assertEqual(re.compile(pat) and 1, 1)
758
Skip Montanaro1e703c62003-04-25 15:40:28 +0000759 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000760 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +0000761 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000762 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
763 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
764 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +0000765
Serhiy Storchaka6a8e2b42013-02-16 21:23:01 +0200766 def test_unlimited_zero_width_repeat(self):
767 # Issue #9669
768 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
769 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
770 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
771 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
772 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
773 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
774
Skip Montanaro1e703c62003-04-25 15:40:28 +0000775 def test_scanner(self):
776 def s_ident(scanner, token): return token
777 def s_operator(scanner, token): return "op%s" % token
778 def s_float(scanner, token): return float(token)
779 def s_int(scanner, token): return int(token)
780
781 scanner = Scanner([
782 (r"[a-zA-Z_]\w*", s_ident),
783 (r"\d+\.\d*", s_float),
784 (r"\d+", s_int),
785 (r"=|\+|-|\*|/", s_operator),
786 (r"\s+", None),
787 ])
788
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300789 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000790
Skip Montanaro1e703c62003-04-25 15:40:28 +0000791 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
792 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
793 'op+', 'bar'], ''))
794
Skip Montanaro5ba00542003-04-25 16:00:14 +0000795 def test_bug_448951(self):
796 # bug 448951 (similar to 429357, but with single char match)
797 # (Also test greedy matches.)
798 for op in '','?','*':
799 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
800 (None, None))
801 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
802 ('a:', 'a'))
803
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000804 def test_bug_725106(self):
805 # capturing groups in alternatives in repeats
806 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
807 ('b', 'a'))
808 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
809 ('c', 'b'))
810 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
811 ('b', None))
812 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
813 ('b', None))
814 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
815 ('b', 'a'))
816 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
817 ('c', 'b'))
818 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
819 ('b', None))
820 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
821 ('b', None))
822
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000823 def test_bug_725149(self):
824 # mark_stack_base restoring before restoring marks
825 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
826 ('a', None))
827 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
828 ('a', None, None))
829
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300830 @requires_unicode
Just van Rossum12723ba2003-07-02 20:03:04 +0000831 def test_bug_764548(self):
832 # bug 764548, re.compile() barfs on str/unicode subclasses
Just van Rossum12723ba2003-07-02 20:03:04 +0000833 class my_unicode(unicode): pass
834 pat = re.compile(my_unicode("abc"))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300835 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +0000836
Skip Montanaro5ba00542003-04-25 16:00:14 +0000837 def test_finditer(self):
838 iter = re.finditer(r":+", "a:b::c:::d")
839 self.assertEqual([item.group(0) for item in iter],
840 [":", "::", ":::"])
841
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300842 @requires_unicode
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +0000843 def test_bug_926075(self):
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300844 self.assertIsNot(re.compile('bug_926075'),
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300845 re.compile(u'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +0000846
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300847 @requires_unicode
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000848 def test_bug_931848(self):
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300849 pattern = u(r"[\u002E\u3002\uFF0E\uFF61]")
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000850 self.assertEqual(re.compile(pattern).split("a.b.c"),
851 ['a','b','c'])
852
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000853 def test_bug_581080(self):
854 iter = re.finditer(r"\s", "a b")
855 self.assertEqual(iter.next().span(), (1,2))
856 self.assertRaises(StopIteration, iter.next)
857
858 scanner = re.compile(r"\s").scanner("a b")
859 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300860 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000861
862 def test_bug_817234(self):
863 iter = re.finditer(r".*", "asdf")
864 self.assertEqual(iter.next().span(), (0, 4))
865 self.assertEqual(iter.next().span(), (4, 4))
866 self.assertRaises(StopIteration, iter.next)
867
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300868 @requires_unicode
Mark Dickinsonfe67bd92009-07-28 20:35:03 +0000869 def test_bug_6561(self):
870 # '\d' should match characters in Unicode category 'Nd'
871 # (Number, Decimal Digit), but not those in 'Nl' (Number,
872 # Letter) or 'No' (Number, Other).
873 decimal_digits = [
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300874 unichr(0x0037), # '\N{DIGIT SEVEN}', category 'Nd'
875 unichr(0x0e58), # '\N{THAI DIGIT SIX}', category 'Nd'
876 unichr(0xff10), # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
Mark Dickinsonfe67bd92009-07-28 20:35:03 +0000877 ]
878 for x in decimal_digits:
879 self.assertEqual(re.match('^\d$', x, re.UNICODE).group(0), x)
880
881 not_decimal_digits = [
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300882 unichr(0x2165), # '\N{ROMAN NUMERAL SIX}', category 'Nl'
883 unichr(0x3039), # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
884 unichr(0x2082), # '\N{SUBSCRIPT TWO}', category 'No'
885 unichr(0x32b4), # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
Mark Dickinsonfe67bd92009-07-28 20:35:03 +0000886 ]
887 for x in not_decimal_digits:
888 self.assertIsNone(re.match('^\d$', x, re.UNICODE))
889
Raymond Hettinger01a807d2007-04-02 22:54:21 +0000890 def test_empty_array(self):
891 # SF buf 1647541
892 import array
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300893 typecodes = 'cbBhHiIlLfd'
894 if have_unicode:
895 typecodes += 'u'
896 for typecode in typecodes:
Raymond Hettinger01a807d2007-04-02 22:54:21 +0000897 a = array.array(typecode)
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300898 self.assertIsNone(re.compile("bla").match(a))
Neal Norwitz0d4c06e2007-04-25 06:30:05 +0000899 self.assertEqual(re.compile("").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000900
Serhiy Storchaka7644ff12014-09-14 17:40:44 +0300901 @requires_unicode
Guido van Rossumae04c332008-01-03 19:12:44 +0000902 def test_inline_flags(self):
903 # Bug #1700
904 upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow
905 lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow
906
907 p = re.compile(upper_char, re.I | re.U)
908 q = p.match(lower_char)
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300909 self.assertTrue(q)
Guido van Rossumae04c332008-01-03 19:12:44 +0000910
911 p = re.compile(lower_char, re.I | re.U)
912 q = p.match(upper_char)
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300913 self.assertTrue(q)
Guido van Rossumae04c332008-01-03 19:12:44 +0000914
915 p = re.compile('(?i)' + upper_char, re.U)
916 q = p.match(lower_char)
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300917 self.assertTrue(q)
Guido van Rossumae04c332008-01-03 19:12:44 +0000918
919 p = re.compile('(?i)' + lower_char, re.U)
920 q = p.match(upper_char)
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300921 self.assertTrue(q)
Guido van Rossumae04c332008-01-03 19:12:44 +0000922
923 p = re.compile('(?iu)' + upper_char)
924 q = p.match(lower_char)
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300925 self.assertTrue(q)
Guido van Rossumae04c332008-01-03 19:12:44 +0000926
927 p = re.compile('(?iu)' + lower_char)
928 q = p.match(upper_char)
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300929 self.assertTrue(q)
Guido van Rossumae04c332008-01-03 19:12:44 +0000930
Amaury Forgeot d'Arcd08a8eb2008-01-10 21:59:42 +0000931 def test_dollar_matches_twice(self):
932 "$ matches the end of string, and just before the terminating \n"
933 pattern = re.compile('$')
934 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
935 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
936 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
937
938 pattern = re.compile('$', re.MULTILINE)
939 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
940 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
941 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
942
Antoine Pitrouefdddd32010-01-14 17:25:24 +0000943 def test_dealloc(self):
944 # issue 3299: check for segfault in debug build
945 import _sre
Ezio Melotti0e4e7322010-01-23 10:43:05 +0000946 # the overflow limit is different on wide and narrow builds and it
947 # depends on the definition of SRE_CODE (see sre.h).
948 # 2**128 should be big enough to overflow on both. For smaller values
949 # a RuntimeError is raised instead of OverflowError.
950 long_overflow = 2**128
Antoine Pitrouefdddd32010-01-14 17:25:24 +0000951 self.assertRaises(TypeError, re.finditer, "a", {})
952 self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
Guido van Rossumae04c332008-01-03 19:12:44 +0000953
Ezio Melottib56b6ff2012-03-13 01:25:40 +0200954 def test_compile(self):
955 # Test return value when given string and pattern as parameter
956 pattern = re.compile('random pattern')
957 self.assertIsInstance(pattern, re._pattern_type)
958 same_pattern = re.compile(pattern)
959 self.assertIsInstance(same_pattern, re._pattern_type)
960 self.assertIs(same_pattern, pattern)
961 # Test behaviour when not given a string or pattern as parameter
962 self.assertRaises(TypeError, re.compile, 0)
963
Ezio Melotti5c4e32b2013-01-11 08:32:01 +0200964 def test_bug_13899(self):
965 # Issue #13899: re pattern r"[\A]" should work like "A" but matches
966 # nothing. Ditto B and Z.
967 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
968 ['A', 'B', '\b', 'C', 'Z'])
969
Antoine Pitrou735f36e2012-12-03 20:53:12 +0100970 @precisionbigmemtest(size=_2G, memuse=1)
971 def test_large_search(self, size):
972 # Issue #10182: indices were 32-bit-truncated.
973 s = 'a' * size
974 m = re.search('$', s)
975 self.assertIsNotNone(m)
Antoine Pitrou74635c92012-12-03 21:08:43 +0100976 self.assertEqual(m.start(), size)
977 self.assertEqual(m.end(), size)
Antoine Pitrou735f36e2012-12-03 20:53:12 +0100978
Antoine Pitroub83575b2012-12-02 12:52:36 +0100979 # The huge memuse is because of re.sub() using a list and a join()
980 # to create the replacement result.
Antoine Pitrou735f36e2012-12-03 20:53:12 +0100981 @precisionbigmemtest(size=_2G, memuse=16 + 2)
982 def test_large_subn(self, size):
Antoine Pitroub83575b2012-12-02 12:52:36 +0100983 # Issue #10182: indices were 32-bit-truncated.
984 s = 'a' * size
Antoine Pitroub83575b2012-12-02 12:52:36 +0100985 r, n = re.subn('', '', s)
986 self.assertEqual(r, s)
987 self.assertEqual(n, size + 1)
988
989
Serhiy Storchakae18e05c2013-02-16 16:47:15 +0200990 def test_repeat_minmax_overflow(self):
991 # Issue #13169
992 string = "x" * 100000
993 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
994 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
995 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
996 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
997 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
998 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
999 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1000 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1001 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1002 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1003 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1004
1005 @cpython_only
1006 def test_repeat_minmax_overflow_maxrepeat(self):
1007 try:
1008 from _sre import MAXREPEAT
1009 except ImportError:
1010 self.skipTest('requires _sre.MAXREPEAT constant')
1011 string = "x" * 100000
1012 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1013 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1014 (0, 100000))
1015 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1016 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1017 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1018 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1019
R David Murray60773392013-04-14 13:08:50 -04001020 def test_backref_group_name_in_exception(self):
1021 # Issue 17341: Poor error message when compiling invalid regex
1022 with self.assertRaisesRegexp(sre_constants.error, '<foo>'):
1023 re.compile('(?P=<foo>)')
1024
1025 def test_group_name_in_exception(self):
1026 # Issue 17341: Poor error message when compiling invalid regex
1027 with self.assertRaisesRegexp(sre_constants.error, '\?foo'):
1028 re.compile('(?P<?foo>)')
1029
Serhiy Storchaka3ade66c2013-08-03 19:26:33 +03001030 def test_issue17998(self):
1031 for reps in '*', '+', '?', '{1}':
1032 for mod in '', '?':
1033 pattern = '.' + reps + mod + 'yz'
1034 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1035 ['xyz'], msg=pattern)
Serhiy Storchaka7644ff12014-09-14 17:40:44 +03001036 if have_unicode:
1037 pattern = unicode(pattern)
1038 self.assertEqual(re.compile(pattern, re.S).findall(u'xyz'),
1039 [u'xyz'], msg=pattern)
Serhiy Storchaka3ade66c2013-08-03 19:26:33 +03001040
Serhiy Storchakae18e05c2013-02-16 16:47:15 +02001041
Serhiy Storchaka83737c62013-08-19 23:20:07 +03001042 def test_bug_2537(self):
1043 # issue 2537: empty submatches
1044 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1045 for inner_op in ('{0,}', '*', '?'):
1046 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1047 m = r.match("xyyzy")
1048 self.assertEqual(m.group(0), "xyy")
1049 self.assertEqual(m.group(1), "")
1050 self.assertEqual(m.group(2), "y")
1051
Antoine Pitrouf5814112014-02-03 20:59:59 +01001052 def test_debug_flag(self):
Serhiy Storchakac0799e32014-09-21 22:47:30 +03001053 pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
Antoine Pitrouf5814112014-02-03 20:59:59 +01001054 with captured_stdout() as out:
Serhiy Storchakac0799e32014-09-21 22:47:30 +03001055 re.compile(pat, re.DEBUG)
1056 dump = '''\
1057subpattern 1
1058 literal 46
1059subpattern None
1060 branch
1061 in
1062 literal 99
1063 literal 104
1064 or
1065 literal 112
1066 literal 121
1067subpattern None
1068 groupref_exists 1
1069 at at_end
1070 else
1071 literal 58
1072 literal 32
1073'''
1074 self.assertEqual(out.getvalue(), dump)
Antoine Pitrouf5814112014-02-03 20:59:59 +01001075 # Debug output is output again even a second time (bypassing
1076 # the cache -- issue #20426).
1077 with captured_stdout() as out:
Serhiy Storchakac0799e32014-09-21 22:47:30 +03001078 re.compile(pat, re.DEBUG)
1079 self.assertEqual(out.getvalue(), dump)
Antoine Pitrouf5814112014-02-03 20:59:59 +01001080
Serhiy Storchakae50fe4c2014-03-06 12:24:29 +02001081 def test_keyword_parameters(self):
1082 # Issue #20283: Accepting the string keyword parameter.
1083 pat = re.compile(r'(ab)')
1084 self.assertEqual(
1085 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1086 self.assertEqual(
1087 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1088 self.assertEqual(
1089 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1090 self.assertEqual(
1091 pat.split(string='abracadabra', maxsplit=1),
1092 ['', 'ab', 'racadabra'])
1093
Benjamin Petersonbc4ece52014-09-30 22:04:28 -04001094 def test_match_group_takes_long(self):
1095 self.assertEqual(re.match("(foo)", "foo").group(1L), "foo")
1096 self.assertRaises(IndexError, re.match("", "").group, sys.maxint + 1)
1097
Serhiy Storchakad4c72902014-10-31 00:53:19 +02001098 def test_locale_caching(self):
1099 # Issue #22410
1100 oldlocale = locale.setlocale(locale.LC_CTYPE)
1101 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1102 for loc in 'en_US.iso88591', 'en_US.utf8':
1103 try:
1104 locale.setlocale(locale.LC_CTYPE, loc)
1105 except locale.Error:
1106 # Unsupported locale on this system
1107 self.skipTest('test needs %s locale' % loc)
1108
1109 re.purge()
1110 self.check_en_US_iso88591()
1111 self.check_en_US_utf8()
1112 re.purge()
1113 self.check_en_US_utf8()
1114 self.check_en_US_iso88591()
1115
1116 def check_en_US_iso88591(self):
1117 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
1118 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1119 self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))
1120 self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I))
1121 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1122 self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5'))
1123 self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5'))
1124
1125 def check_en_US_utf8(self):
1126 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
1127 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1128 self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I))
1129 self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I))
1130 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1131 self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
1132 self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
1133
Antoine Pitrouf5814112014-02-03 20:59:59 +01001134
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001135def run_re_tests():
Georg Brandla4f46e12010-02-07 17:03:15 +00001136 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001137 if verbose:
1138 print 'Running re_tests test suite'
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001139 else:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001140 # To save time, only run the first and last 10 tests
1141 #tests = tests[:10] + tests[-10:]
1142 pass
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001143
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001144 for t in tests:
1145 sys.stdout.flush()
1146 pattern = s = outcome = repl = expected = None
1147 if len(t) == 5:
1148 pattern, s, outcome, repl, expected = t
1149 elif len(t) == 3:
1150 pattern, s, outcome = t
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001151 else:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001152 raise ValueError, ('Test tuples should have 3 or 5 fields', t)
1153
Guido van Rossum41360a41998-03-26 19:42:58 +00001154 try:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001155 obj = re.compile(pattern)
1156 except re.error:
1157 if outcome == SYNTAX_ERROR: pass # Expected a syntax error
Guido van Rossum41360a41998-03-26 19:42:58 +00001158 else:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001159 print '=== Syntax error:', t
1160 except KeyboardInterrupt: raise KeyboardInterrupt
1161 except:
1162 print '*** Unexpected error ***', t
1163 if verbose:
1164 traceback.print_exc(file=sys.stdout)
1165 else:
Fredrik Lundh17741be2001-03-22 15:51:28 +00001166 try:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001167 result = obj.search(s)
1168 except re.error, msg:
1169 print '=== Unexpected exception', t, repr(msg)
1170 if outcome == SYNTAX_ERROR:
1171 # This should have been a syntax error; forget it.
1172 pass
1173 elif outcome == FAIL:
1174 if result is None: pass # No match, as expected
1175 else: print '=== Succeeded incorrectly', t
1176 elif outcome == SUCCEED:
1177 if result is not None:
1178 # Matched, as expected, so now we compute the
1179 # result string and compare it to our expected result.
1180 start, end = result.span(0)
1181 vardict={'found': result.group(0),
1182 'groups': result.group(),
1183 'flags': result.re.flags}
1184 for i in range(1, 100):
1185 try:
1186 gi = result.group(i)
1187 # Special hack because else the string concat fails:
1188 if gi is None:
1189 gi = "None"
1190 except IndexError:
1191 gi = "Error"
1192 vardict['g%d' % i] = gi
1193 for i in result.re.groupindex.keys():
1194 try:
1195 gi = result.group(i)
1196 if gi is None:
1197 gi = "None"
1198 except IndexError:
1199 gi = "Error"
1200 vardict[i] = gi
1201 repl = eval(repl, vardict)
1202 if repl != expected:
1203 print '=== grouping error', t,
1204 print repr(repl) + ' should be ' + repr(expected)
1205 else:
1206 print '=== Failed incorrectly', t
1207
1208 # Try the match on a unicode string, and check that it
1209 # still succeeds.
1210 try:
1211 result = obj.search(unicode(s, "latin-1"))
1212 if result is None:
1213 print '=== Fails on unicode match', t
1214 except NameError:
1215 continue # 1.5.2
1216 except TypeError:
1217 continue # unicode test case
1218
1219 # Try the match on a unicode pattern, and check that it
1220 # still succeeds.
1221 obj=re.compile(unicode(pattern, "latin-1"))
1222 result = obj.search(s)
Fredrik Lundh17741be2001-03-22 15:51:28 +00001223 if result is None:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001224 print '=== Fails on unicode pattern match', t
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001225
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001226 # Try the match with the search area limited to the extent
1227 # of the match and see if it still succeeds. \B will
1228 # break (because it won't match at the end or start of a
1229 # string), so we'll ignore patterns that feature it.
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001230
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001231 if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \
1232 and result is not None:
1233 obj = re.compile(pattern)
1234 result = obj.search(s, result.start(0), result.end(0) + 1)
1235 if result is None:
1236 print '=== Failed on range-limited match', t
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001237
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001238 # Try the match with IGNORECASE enabled, and check that it
1239 # still succeeds.
1240 obj = re.compile(pattern, re.IGNORECASE)
1241 result = obj.search(s)
Fred Drake132dce22000-12-12 23:11:42 +00001242 if result is None:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001243 print '=== Fails on case-insensitive match', t
Guido van Rossumdfa67901997-12-08 17:12:06 +00001244
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001245 # Try the match with LOCALE enabled, and check that it
1246 # still succeeds.
1247 obj = re.compile(pattern, re.LOCALE)
1248 result = obj.search(s)
1249 if result is None:
1250 print '=== Fails on locale-sensitive match', t
Guido van Rossumdfa67901997-12-08 17:12:06 +00001251
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001252 # Try the match with UNICODE locale enabled, and check
1253 # that it still succeeds.
1254 obj = re.compile(pattern, re.UNICODE)
1255 result = obj.search(s)
1256 if result is None:
1257 print '=== Fails on unicode-sensitive match', t
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001258
def test_main():
    # Entry point: run the unittest-based ReTests class first, then the
    # table-driven checks in run_re_tests().
    run_unittest(ReTests)
    run_re_tests()
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001262
# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()