blob: a7d2291ec1e8f633cc42c40a2ef20d102c65b2df [file] [log] [blame]
Serhiy Storchakae9277572014-11-10 12:37:02 +02001# -*- coding: utf-8 -*-
Florent Xicluna6257a7b2010-03-31 22:01:03 +00002from test.test_support import verbose, run_unittest, import_module
Serhiy Storchakae18e05c2013-02-16 16:47:15 +02003from test.test_support import precisionbigmemtest, _2G, cpython_only
Serhiy Storchaka7644ff12014-09-14 17:40:44 +03004from test.test_support import captured_stdout, have_unicode, requires_unicode, u
Serhiy Storchakad4c72902014-10-31 00:53:19 +02005import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00006import re
Neal Norwitz94a9c092006-03-16 06:30:02 +00007from re import Scanner
R David Murray60773392013-04-14 13:08:50 -04008import sre_constants
Ezio Melotti46645632011-03-25 14:50:52 +02009import sys
10import string
11import traceback
Raymond Hettinger027bb632004-05-31 03:09:25 +000012from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000013
Antoine Pitrou735f36e2012-12-03 20:53:12 +010014
Guido van Rossum23b22571997-07-17 22:36:14 +000015# Misc tests from Tim Peters' re.doc
16
Just van Rossum6802c6e2003-07-02 14:36:59 +000017# WARNING: Don't change details in these tests if you don't know
Ezio Melotti24b07bc2011-03-15 18:55:01 +020018# what you're doing. Some of these tests were carefully modeled to
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +000019# cover most of the code.
20
Skip Montanaro8ed06da2003-04-24 19:43:18 +000021import unittest
Guido van Rossum8430c581998-04-03 21:47:12 +000022
Skip Montanaro8ed06da2003-04-24 19:43:18 +000023class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000024
25 def test_weakref(self):
26 s = 'QabbbcR'
27 x = re.compile('ab+c')
28 y = proxy(x)
29 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
30
Skip Montanaro8ed06da2003-04-24 19:43:18 +000031 def test_search_star_plus(self):
32 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
33 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
34 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
35 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +030036 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000037 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
38 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
39 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
40 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +030041 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000042
def bump_num(self, matchobj):
    """Return the integer matched by *matchobj*, plus one, as a string."""
    return str(int(matchobj.group(0)) + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000046
def test_basic_re_sub(self):
    """Exercise re.sub() with string, callable and group-reference templates."""
    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                     '9.3 -3 23x99y')

    # What a callable returns is inserted as-is; a string template has its
    # backslash escapes processed.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    # Templates using \g are raw strings: \g is not a Python string escape,
    # so a plain literal only worked because unknown escapes pass through.
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D', 'a'),
                     '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
    # These two templates deliberately contain real control characters,
    # so they must stay non-raw.
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))

    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +000074
Skip Montanaro2726fcd2003-04-25 14:31:54 +000075 def test_bug_449964(self):
76 # fails for group followed by other escape
77 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
78 'xx\bxx\b')
79
80 def test_bug_449000(self):
81 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +000082 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
83 'abc\ndef\n')
84 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
85 'abc\ndef\n')
86 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
87 'abc\ndef\n')
88 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
89 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +000090
@requires_unicode
def test_bug_1140(self):
    """The result type of re.sub() follows the subject and replacement types."""
    # re.sub(x, y, u'') should return u'', not '', and
    # re.sub(x, y, '') should return '', not u''.
    # Also:
    # re.sub(x, y, unicode(x)) should return unicode(y), and
    # re.sub(x, y, str(x)) should return
    #     str(y) if isinstance(y, str) else unicode(y).
    for pattern in 'x', u'x':
        for repl in 'y', u'y':
            # (subject, expected value, expected result type)
            checks = (
                (u'', u'', unicode),
                ('', '', str),
                (unicode(pattern), repl, unicode),
                (str(pattern), repl, type(repl)),
            )
            for subject, expected, expected_type in checks:
                result = re.sub(pattern, repl, subject)
                self.assertEqual(result, expected)
                self.assertEqual(type(result), expected_type)
115 self.assertEqual(type(z), type(y))
116
Raymond Hettinger80016c92007-12-19 18:13:31 +0000117 def test_bug_1661(self):
118 # Verify that flags do not get silently ignored with compiled patterns
119 pattern = re.compile('.')
120 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
121 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
122 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
123 self.assertRaises(ValueError, re.compile, pattern, re.I)
124
Guido van Rossume3c4fd92008-09-10 14:27:00 +0000125 def test_bug_3629(self):
126 # A regex that triggered a bug in the sre-code validator
127 re.compile("(?P<quote>)(?(quote))")
128
def test_sub_template_numeric_escape(self):
    """Numeric escapes in sub() templates: octal values versus group references."""
    # bug 776311 and friends
    # (template, expected result) with pattern 'x' and subject 'x'.
    octal_cases = (
        (r'\0', '\0'),
        (r'\000', '\000'),
        (r'\001', '\001'),
        (r'\008', '\0' + '8'),
        (r'\009', '\0' + '9'),
        (r'\111', '\111'),
        (r'\117', '\117'),

        (r'\1111', '\1111'),
        (r'\1111', '\111' + '1'),

        (r'\00', '\x00'),
        (r'\07', '\x07'),
        (r'\08', '\0' + '8'),
        (r'\09', '\0' + '9'),
        (r'\0a', '\0' + 'a'),

        (r'\400', '\0'),
        (r'\777', '\377'),
    )
    for template, expected in octal_cases:
        self.assertEqual(re.sub('x', template, 'x'), expected)

    # Templates that must be rejected because pattern 'x' has no groups.
    rejected = (
        r'\1',
        r'\8',
        r'\9',
        r'\11',
        r'\18',
        r'\1a',
        r'\90',
        r'\99',
        r'\118',  # r'\11' + '8'
        r'\11a',
        r'\181',  # r'\18' + '1'
        r'\800',  # r'\80' + '0'
    )
    for template in rejected:
        self.assertRaises(re.error, re.sub, 'x', template, 'x')

    # in python2.3 (etc), these loop endlessly in sre_parser.py
    self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
    ten_groups_then_one = '((((((((((y))))))))))(.)'
    for template, expected in ((r'\118', 'xz8'), (r'\11a', 'xza')):
        self.assertEqual(re.sub(ten_groups_then_one, template, 'xyz'),
                         expected)
169 'xza')
170
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000171 def test_qualified_re_sub(self):
172 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
173 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000174
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000175 def test_bug_114660(self):
176 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
177 'hello there')
178
def test_bug_462270(self):
    """Check sub() results for a pattern that can match the empty string."""
    # Test for empty sub() behaviour, see SF bug #462270
    for pattern, expected in (('x*', '-a-b-d-'), ('x+', 'ab-d')):
        self.assertEqual(re.sub(pattern, '-', 'abxd'), expected)
182 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
183
Ezio Melottief317382012-11-03 20:31:12 +0200184 def test_symbolic_groups(self):
185 re.compile('(?P<a>x)(?P=a)(?(a)y)')
186 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
187 self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
188 self.assertRaises(re.error, re.compile, '(?Px)')
189 self.assertRaises(re.error, re.compile, '(?P=)')
190 self.assertRaises(re.error, re.compile, '(?P=1)')
191 self.assertRaises(re.error, re.compile, '(?P=a)')
192 self.assertRaises(re.error, re.compile, '(?P=a1)')
193 self.assertRaises(re.error, re.compile, '(?P=a.)')
194 self.assertRaises(re.error, re.compile, '(?P<)')
195 self.assertRaises(re.error, re.compile, '(?P<>)')
196 self.assertRaises(re.error, re.compile, '(?P<1>)')
197 self.assertRaises(re.error, re.compile, '(?P<a.>)')
198 self.assertRaises(re.error, re.compile, '(?())')
199 self.assertRaises(re.error, re.compile, '(?(a))')
200 self.assertRaises(re.error, re.compile, '(?(1a))')
201 self.assertRaises(re.error, re.compile, '(?(a.))')
202
def test_symbolic_refs(self):
    """Malformed or unresolvable group references in a sub() template raise."""
    # Templates are raw strings: \g is not a Python string escape, so the
    # plain literals only worked because unknown escapes pass through.
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<a', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<a a>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<1a1>', 'xx')
    # A well-formed but unknown group name is reported as IndexError.
    self.assertRaises(IndexError, re.sub, '(?P<a>x)', r'\g<ab>', 'xx')
    # Group b exists in the pattern but does not take part in the match.
    self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', r'\2', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<-1>', 'xx')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000214
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000215 def test_re_subn(self):
216 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
217 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
218 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
219 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
220 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000221
def test_re_split(self):
    """split() with plain, capturing and non-capturing separator patterns."""
    subject = ":a:b::c"
    # (separator pattern, expected pieces)
    cases = (
        (":", ['', 'a', 'b', '', 'c']),
        (":*", ['', 'a', 'b', 'c']),
        ("(:*)", ['', ':', 'a', ':', 'b', '::', 'c']),
        ("(?::*)", ['', 'a', 'b', 'c']),
        ("(:)*", ['', ':', 'a', ':', 'b', ':', 'c']),
        ("([b:]+)", ['', ':', 'a', ':b::', 'c']),
        ("(b)|(:+)", ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c']),
        ("(?:b)|(?::+)", ['', 'a', '', '', 'c']),
    )
    for pattern, expected in cases:
        self.assertEqual(re.split(pattern, subject), expected)
Guido van Rossum49946571997-07-18 04:26:25 +0000237
def test_qualified_re_split(self):
    """split() with a maxsplit of 2 leaves the remainder unsplit."""
    # (separator pattern, subject, expected pieces)
    cases = (
        (":", ":a:b::c", ['', 'a', 'b::c']),
        (':', 'a:b:c:d', ['a', 'b', 'c:d']),
        ("(:)", ":a:b::c", ['', ':', 'a', ':', 'b::c']),
        ("(:*)", ":a:b::c", ['', ':', 'a', ':', 'b::c']),
    )
    for pattern, subject, expected in cases:
        self.assertEqual(re.split(pattern, subject, 2), expected)
Guido van Rossum49946571997-07-18 04:26:25 +0000245
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000246 def test_re_findall(self):
247 self.assertEqual(re.findall(":+", "abc"), [])
248 self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
249 self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
250 self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""),
251 (":", ":"),
252 (":", "::")])
Guido van Rossum49946571997-07-18 04:26:25 +0000253
Skip Montanaro5ba00542003-04-25 16:00:14 +0000254 def test_bug_117612(self):
255 self.assertEqual(re.findall(r"(a|(b))", "aba"),
256 [("a", ""),("b", "b"),("a", "")])
257
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000258 def test_re_match(self):
Skip Montanaro5ba00542003-04-25 16:00:14 +0000259 self.assertEqual(re.match('a', 'a').groups(), ())
260 self.assertEqual(re.match('(a)', 'a').groups(), ('a',))
261 self.assertEqual(re.match(r'(a)', 'a').group(0), 'a')
262 self.assertEqual(re.match(r'(a)', 'a').group(1), 'a')
263 self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a'))
Guido van Rossum49946571997-07-18 04:26:25 +0000264
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000265 pat = re.compile('((a)|(b))(c)?')
266 self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
267 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
268 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
269 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
270 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
Guido van Rossum8430c581998-04-03 21:47:12 +0000271
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000272 # A single group
273 m = re.match('(a)', 'a')
274 self.assertEqual(m.group(0), 'a')
275 self.assertEqual(m.group(0), 'a')
276 self.assertEqual(m.group(1), 'a')
277 self.assertEqual(m.group(1, 1), ('a', 'a'))
Guido van Rossum49946571997-07-18 04:26:25 +0000278
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000279 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
280 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
281 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
282 (None, 'b', None))
283 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000284
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000285 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000286 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
287 ('(', 'a'))
288 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
289 (None, 'a'))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300290 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
291 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000292 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
293 ('a', 'b'))
294 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
295 (None, 'd'))
296 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
297 (None, 'd'))
298 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
299 ('a', ''))
300
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000301 # Tests for bug #1177831: exercise groups other than the first group
302 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
303 self.assertEqual(p.match('abc').groups(),
304 ('a', 'b', 'c'))
305 self.assertEqual(p.match('ad').groups(),
306 ('a', None, 'd'))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300307 self.assertIsNone(p.match('abd'))
308 self.assertIsNone(p.match('ac'))
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000309
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000310
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000311 def test_re_groupref(self):
312 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
313 ('|', 'a'))
314 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
315 (None, 'a'))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300316 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
317 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000318 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
319 ('a', 'a'))
320 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
321 (None, None))
322
323 def test_groupdict(self):
324 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
325 'first second').groupdict(),
326 {'first':'first', 'second':'second'})
327
328 def test_expand(self):
329 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
330 "first second")
331 .expand(r"\2 \1 \g<second> \g<first>"),
332 "second first second first")
333
334 def test_repeat_minmax(self):
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300335 self.assertIsNone(re.match("^(\w){1}$", "abc"))
336 self.assertIsNone(re.match("^(\w){1}?$", "abc"))
337 self.assertIsNone(re.match("^(\w){1,2}$", "abc"))
338 self.assertIsNone(re.match("^(\w){1,2}?$", "abc"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000339
340 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
341 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
342 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
343 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
344 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
345 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
346 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
347 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
348
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300349 self.assertIsNone(re.match("^x{1}$", "xxx"))
350 self.assertIsNone(re.match("^x{1}?$", "xxx"))
351 self.assertIsNone(re.match("^x{1,2}$", "xxx"))
352 self.assertIsNone(re.match("^x{1,2}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000353
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300354 self.assertTrue(re.match("^x{3}$", "xxx"))
355 self.assertTrue(re.match("^x{1,3}$", "xxx"))
356 self.assertTrue(re.match("^x{1,4}$", "xxx"))
357 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
358 self.assertTrue(re.match("^x{3}?$", "xxx"))
359 self.assertTrue(re.match("^x{1,3}?$", "xxx"))
360 self.assertTrue(re.match("^x{1,4}?$", "xxx"))
361 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000362
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300363 self.assertIsNone(re.match("^x{}$", "xxx"))
364 self.assertTrue(re.match("^x{}$", "x{}"))
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000365
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000366 def test_getattr(self):
367 self.assertEqual(re.match("(a)", "a").pos, 0)
368 self.assertEqual(re.match("(a)", "a").endpos, 1)
369 self.assertEqual(re.match("(a)", "a").string, "a")
370 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300371 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000372
def test_special_escapes(self):
    r"""Check \b, \B, \A, \Z and the \d\D\w\W\s\S classes under each flag."""
    boundary = r"\b(b.)\b"
    non_boundary = r"\B(b.)\B"
    boundary_text = "abcd abc bcd bx"
    non_boundary_text = "abc bcd bc abxd"

    self.assertEqual(re.search(boundary, boundary_text).group(1), "bx")
    self.assertEqual(re.search(non_boundary, non_boundary_text).group(1),
                     "bx")
    self.assertEqual(
        re.search(boundary, boundary_text, re.LOCALE).group(1), "bx")
    self.assertEqual(
        re.search(non_boundary, non_boundary_text, re.LOCALE).group(1), "bx")
    if have_unicode:
        self.assertEqual(
            re.search(boundary, boundary_text, re.UNICODE).group(1), "bx")
        self.assertEqual(
            re.search(non_boundary, non_boundary_text, re.UNICODE).group(1),
            "bx")

    # Line anchors versus whole-string anchors under MULTILINE.
    anchored = r"^\Aabc\Z$"
    self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(anchored, "abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(anchored, "\nabc\n", re.M))

    # The same checks against unicode subjects.
    self.assertEqual(re.search(boundary, u"abcd abc bcd bx").group(1), "bx")
    self.assertEqual(re.search(non_boundary, u"abc bcd bc abxd").group(1),
                     "bx")
    self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(anchored, u"abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(anchored, u"\nabc\n", re.M))

    classes = r"\d\D\w\W\s\S"
    sample = "1aa! a"
    self.assertEqual(re.search(classes, sample).group(0), sample)
    self.assertEqual(re.search(classes, sample, re.LOCALE).group(0), sample)
    if have_unicode:
        self.assertEqual(re.search(classes, sample, re.UNICODE).group(0),
                         sample)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000404
Ezio Melotti38ae5b22012-02-29 11:40:00 +0200405 def test_string_boundaries(self):
406 # See http://bugs.python.org/issue10713
407 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
408 "abc")
409 # There's a word boundary at the start of a string.
410 self.assertTrue(re.match(r"\b", "abc"))
411 # A non-empty string includes a non-boundary zero-length match.
412 self.assertTrue(re.search(r"\B", "abc"))
413 # There is no non-boundary match at the start of a string.
414 self.assertFalse(re.match(r"\B", "abc"))
415 # However, an empty string contains no word boundaries, and also no
416 # non-boundaries.
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300417 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti38ae5b22012-02-29 11:40:00 +0200418 # This one is questionable and different from the perlre behaviour,
419 # but describes current behavior.
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300420 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti38ae5b22012-02-29 11:40:00 +0200421 # A single word-character string has two boundaries, but no
422 # non-boundary gaps.
423 self.assertEqual(len(re.findall(r"\b", "a")), 2)
424 self.assertEqual(len(re.findall(r"\B", "a")), 0)
425 # If there are no words, there are no boundaries
426 self.assertEqual(len(re.findall(r"\b", " ")), 0)
427 self.assertEqual(len(re.findall(r"\b", " ")), 0)
428 # Can match around the whitespace.
429 self.assertEqual(len(re.findall(r"\B", " ")), 2)
430
@requires_unicode
def test_bigcharset(self):
    """Character sets containing code points above U+00FF match correctly."""
    target = unichr(0x2222)
    small_set = u(r"([\u2222\u2223])")
    self.assertEqual(re.match(small_set, target).group(1), target)
    self.assertEqual(re.match(small_set, target, re.UNICODE).group(1),
                     target)

    # A set built from every 255th code point from 256 up to 2**16.
    members = u''.join(map(unichr, range(256, 2**16, 255)))
    big_set = u'[%s]' % members
    probe = unichr(0xff01)
    self.assertEqual(re.match(big_set, probe, re.UNICODE).group(), probe)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000439
Antoine Pitroub83ea142012-11-20 22:30:42 +0100440 def test_big_codesize(self):
441 # Issue #1160
442 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300443 self.assertTrue(r.match('1000'))
444 self.assertTrue(r.match('9999'))
Antoine Pitroub83ea142012-11-20 22:30:42 +0100445
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000446 def test_anyall(self):
447 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
448 "a\nb")
449 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
450 "a\n\nb")
451
Serhiy Storchaka15ea8702014-11-07 21:43:45 +0200452 def test_lookahead(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000453 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
454 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
455 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
456 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
457 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
458 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
459 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
460
461 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
462 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
463 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
464 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
465
Serhiy Storchaka15ea8702014-11-07 21:43:45 +0200466 # Group reference.
467 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
468 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
469 # Conditional group reference.
470 self.assertTrue(re.match('(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
471 self.assertIsNone(re.match('(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
472 self.assertTrue(re.match('(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
473 self.assertIsNone(re.match('(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
474 self.assertTrue(re.match('(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
475 # Group used before defined.
476 self.assertTrue(re.match('(a)b(?=(?(2)x|c))(c)', 'abc'))
477 self.assertIsNone(re.match('(a)b(?=(?(2)b|x))(c)', 'abc'))
478 self.assertTrue(re.match('(a)b(?=(?(1)c|x))(c)', 'abc'))
479
def test_lookbehind(self):
    """Positive and negative lookbehind, including (conditional) group refs."""
    def check(pattern, subject, should_match):
        # A hit must be truthy; a miss must be exactly None.
        found = re.match(pattern, subject)
        if should_match:
            self.assertTrue(found)
        else:
            self.assertIsNone(found)

    cases = (
        ('ab(?<=b)c', 'abc', True),
        ('ab(?<=c)c', 'abc', False),
        ('ab(?<!b)c', 'abc', False),
        ('ab(?<!c)c', 'abc', True),
        # Group reference.
        (r'(a)a(?<=\1)c', 'aac', True),
        (r'(a)b(?<=\1)a', 'abaa', False),
        (r'(a)a(?<!\1)c', 'aac', False),
        (r'(a)b(?<!\1)a', 'abaa', True),
        # Conditional group reference.
        ('(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc', False),
        ('(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc', False),
        ('(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc', True),
        ('(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc', False),
        ('(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc', True),
        # Group used before defined.
        ('(a)b(?<=(?(2)x|c))(c)', 'abc', False),
        ('(a)b(?<=(?(2)b|x))(c)', 'abc', False),
        ('(a)b(?<=(?(1)c|x))(c)', 'abc', False),
        ('(a)b(?<=(?(1)b|x))(c)', 'abc', True),
    )
    for pattern, subject, should_match in cases:
        check(pattern, subject, should_match)
501
def test_ignore_case(self):
    """IGNORECASE matching for literals, sets, backreferences and alternation."""
    self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
    self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
    self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
    self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
    self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
    self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
    self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
    self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
    self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
    self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")

    if have_unicode:
        # Preconditions use assertEqual rather than a bare assert, which
        # would be stripped when running under -O.  The ASCII patterns are
        # plain u'' literals: they contain no backslashes, so ur'' added
        # nothing.
        # U+212A KELVIN SIGN lower-cases to ASCII 'k'.
        self.assertEqual(u(r'\u212a').lower(), u'k')
        self.assertTrue(re.match(u'K', u(r'\u212a'), re.U | re.I))
        self.assertTrue(re.match(u'k', u(r'\u212a'), re.U | re.I))
        self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
        self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
        # U+017F LATIN SMALL LETTER LONG S upper-cases to ASCII 'S'.
        self.assertEqual(u(r'\u017f').upper(), u'S')
        self.assertTrue(re.match(u'S', u(r'\u017f'), re.U | re.I))
        self.assertTrue(re.match(u's', u(r'\u017f'), re.U | re.I))
        self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
        self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I))
525
def test_ignore_case_set(self):
    """IGNORECASE matching against character sets, str and unicode."""
    self.assertTrue(re.match(r'[19A]', 'A', re.I))
    self.assertTrue(re.match(r'[19a]', 'a', re.I))
    self.assertTrue(re.match(r'[19a]', 'A', re.I))
    self.assertTrue(re.match(r'[19A]', 'a', re.I))
    if have_unicode:
        # The ASCII-only sets are plain u'' literals: they contain no
        # backslashes, so the ur'' prefix added nothing.
        self.assertTrue(re.match(u'[19A]', u'A', re.U | re.I))
        self.assertTrue(re.match(u'[19a]', u'a', re.U | re.I))
        self.assertTrue(re.match(u'[19a]', u'A', re.U | re.I))
        self.assertTrue(re.match(u'[19A]', u'a', re.U | re.I))
        # Preconditions use assertEqual rather than a bare assert, which
        # would be stripped when running under -O.
        # U+212A KELVIN SIGN lower-cases to ASCII 'k'.
        self.assertEqual(u(r'\u212a').lower(), u'k')
        self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I))
        self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I))
        self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I))
        self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I))
        # U+017F LATIN SMALL LETTER LONG S upper-cases to ASCII 'S'.
        self.assertEqual(u(r'\u017f').upper(), u'S')
        self.assertTrue(re.match(u'[19S]', u(r'\u017f'), re.U | re.I))
        self.assertTrue(re.match(u'[19s]', u(r'\u017f'), re.U | re.I))
        self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I))
        self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I))
546
def test_ignore_case_range(self):
    """IGNORECASE matching against character ranges, str and unicode."""
    # Issues #3511, #17381.
    self.assertTrue(re.match(r'[9-a]', '_', re.I))
    self.assertIsNone(re.match(r'[9-A]', '_', re.I))
    self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
    self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
    self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
    self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
    if have_unicode:
        self.assertTrue(re.match(u(r'[9-a]'), u(r'_'), re.U | re.I))
        self.assertIsNone(re.match(u(r'[9-A]'), u(r'_'), re.U | re.I))
        self.assertTrue(re.match(u(r'[\xc0-\xde]'),
                                 u(r'\xd7'), re.U | re.I))
        self.assertIsNone(re.match(u(r'[\xc0-\xde]'),
                                   u(r'\xf7'), re.U | re.I))
        self.assertTrue(re.match(u(r'[\xe0-\xfe]'),
                                 u(r'\xf7'), re.U | re.I))
        self.assertIsNone(re.match(u(r'[\xe0-\xfe]'),
                                   u(r'\xd7'), re.U | re.I))
        self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
                                 u(r'\u0450'), re.U | re.I))
        self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
                                 u(r'\u0400'), re.U | re.I))
        self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
                                 u(r'\u0450'), re.U | re.I))
        self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
                                 u(r'\u0400'), re.U | re.I))
        # Code points beyond U+FFFF are only checked when
        # sys.maxunicode allows them.
        if sys.maxunicode > 0xffff:
            self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
                                     u(r'\U00010428'), re.U | re.I))
            self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
                                     u(r'\U00010400'), re.U | re.I))
            self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
                                     u(r'\U00010428'), re.U | re.I))
            self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
                                     u(r'\U00010400'), re.U | re.I))

        # Preconditions use assertEqual rather than a bare assert, which
        # would be stripped when running under -O.  The ASCII-only ranges
        # are plain u'' literals: no backslashes, so ur'' added nothing.
        # U+212A KELVIN SIGN lower-cases to ASCII 'k'.
        self.assertEqual(u(r'\u212a').lower(), u'k')
        self.assertTrue(re.match(u'[J-M]', u(r'\u212a'), re.U | re.I))
        self.assertTrue(re.match(u'[j-m]', u(r'\u212a'), re.U | re.I))
        self.assertTrue(re.match(u(r'[\u2129-\u212b]'), u'K', re.U | re.I))
        self.assertTrue(re.match(u(r'[\u2129-\u212b]'), u'k', re.U | re.I))
        # U+017F LATIN SMALL LETTER LONG S upper-cases to ASCII 'S'.
        self.assertEqual(u(r'\u017f').upper(), u'S')
        self.assertTrue(re.match(u'[R-T]', u(r'\u017f'), re.U | re.I))
        self.assertTrue(re.match(u'[r-t]', u(r'\u017f'), re.U | re.I))
        self.assertTrue(re.match(u(r'[\u017e-\u0180]'), u'S', re.U | re.I))
        self.assertTrue(re.match(u(r'[\u017e-\u0180]'), u's', re.U | re.I))
594
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000595 def test_category(self):
596 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
597
def test_getlower(self):
    # _sre.getlower() must map 'A' to 'a' under every flag combination
    # that is available in this build.
    import _sre
    flag_sets = [0, re.LOCALE]
    if have_unicode:
        flag_sets.append(re.UNICODE)
    upper, lower = ord('A'), ord('a')
    for flags in flag_sets:
        self.assertEqual(_sre.getlower(upper, flags), lower)

    # IGNORECASE matching returns the subject's original casing, for both
    # byte and unicode subjects.
    for subject in ("ABC", u"ABC"):
        found = re.match("abc", subject, re.I)
        self.assertEqual(found.group(0), "ABC")
607
608 def test_not_literal(self):
609 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
610 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
611
612 def test_search_coverage(self):
613 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
614 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
615
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    # Assert that matcher(pattern, text) succeeds and that the matched
    # text and its span are `match` and `span`.  With neither given, the
    # pattern is expected to match the whole of `text`.  Giving only one
    # of the two raises ValueError.
    if (match is None) != (span is None):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if match is None:
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    m = matcher(pattern, text)
    self.assertTrue(m)
    self.assertEqual(m.group(), match)
    self.assertEqual(m.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000629
@requires_unicode
def test_re_escape(self):
    # re.escape() on each of the first 256 code points: alphanumerics
    # are left alone, NUL becomes '\000', everything else gets a
    # backslash prefix.  The escaped form must still match the original.
    alnum = unicode(string.ascii_letters + string.digits)
    everything = u''.join(unichr(code) for code in range(256))
    for ch in everything:
        if ch in alnum:
            expected = ch
        elif ch == u'\x00':
            expected = u'\\000'
        else:
            expected = u'\\' + ch
        self.assertEqual(re.escape(ch), expected)
        self.assertMatch(re.escape(ch), ch)
    self.assertMatch(re.escape(everything), everything)
643
def test_re_escape_byte(self):
    # Byte-string counterpart of test_re_escape: same escaping rules
    # for each of the 256 possible byte values.
    alnum = string.ascii_letters + string.digits
    everything = ''.join(chr(code) for code in range(256))
    for byte in everything:
        if byte in alnum:
            expected = byte
        elif byte == b'\x00':
            expected = b'\\000'
        else:
            expected = b'\\' + byte
        self.assertEqual(re.escape(byte), expected)
        self.assertMatch(re.escape(byte), byte)
    self.assertMatch(re.escape(everything), everything)
656
@requires_unicode
def test_re_escape_non_ascii(self):
    # Non-ASCII characters are backslash-escaped and the result still
    # matches the original text.
    text = u(r'xxx\u2620\u2620\u2620xxx')
    escaped = re.escape(text)
    self.assertEqual(escaped, u(r'xxx\\\u2620\\\u2620\\\u2620xxx'))
    self.assertMatch(escaped, text)

    # An escaped character must remain usable inside a larger pattern.
    pattern = u'.%s+.' % re.escape(unichr(0x2620))
    self.assertMatch(pattern, text,
                     match=u(r'x\u2620\u2620\u2620x'),
                     span=(2, 7),
                     matcher=re.search)
Ezio Melotti46645632011-03-25 14:50:52 +0200665
def test_re_escape_non_ascii_bytes(self):
    # Every non-ASCII byte of an encoded string gets its own backslash.
    raw = b'y\xe2\x98\xa0y\xe2\x98\xa0y'
    escaped = re.escape(raw)
    self.assertEqual(escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
    self.assertMatch(escaped, raw)

    # The escaped three-byte sequence occurs exactly twice in `raw`.
    hits = re.findall(re.escape(b'\xe2\x98\xa0'), raw)
    self.assertEqual(len(hits), 2)
Guido van Rossum49946571997-07-18 04:26:25 +0000673
def test_pickling(self):
    # Run the round-trip check with both pickle implementations, in the
    # same order as before: pure-Python first, then the C accelerator.
    for module_name in ('pickle', 'cPickle'):
        self.pickle_test(__import__(module_name))

    # old pickles expect the _compile() reconstructor in sre module
    import_module("sre", deprecated=True)
    from sre import _compile
    # current pickle expects the _compile() reconstructor in re module
    from re import _compile
Skip Montanaro1e703c62003-04-25 15:40:28 +0000684
def pickle_test(self, pickle):
    # Round-trip a compiled pattern through every protocol that the
    # given pickle-like module supports; the copy must compare equal.
    original = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
    highest = pickle.HIGHEST_PROTOCOL
    for protocol in range(highest + 1):
        restored = pickle.loads(pickle.dumps(original, protocol))
        self.assertEqual(restored, original)
Guido van Rossum23b22571997-07-17 22:36:14 +0000691
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000692 def test_constants(self):
693 self.assertEqual(re.I, re.IGNORECASE)
694 self.assertEqual(re.L, re.LOCALE)
695 self.assertEqual(re.M, re.MULTILINE)
696 self.assertEqual(re.S, re.DOTALL)
697 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000698
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000699 def test_flags(self):
Skip Montanaro1e703c62003-04-25 15:40:28 +0000700 for flag in [re.I, re.M, re.X, re.S, re.L]:
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300701 self.assertTrue(re.compile('^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +0000702
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000703 def test_sre_character_literals(self):
704 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300705 self.assertTrue(re.match(r"\%03o" % i, chr(i)))
706 self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
707 self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
708 self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
709 self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
710 self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000711 self.assertRaises(re.error, re.match, "\911", "")
712
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000713 def test_sre_character_class_literals(self):
714 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300715 self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
716 self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
717 self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
718 self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
719 self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
720 self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000721 self.assertRaises(re.error, re.match, "[\911]", "")
722
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000723 def test_bug_113254(self):
724 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
725 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
726 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
727
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000728 def test_bug_527371(self):
729 # bug described in patches 527371/672491
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300730 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000731 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
732 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
733 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
734 self.assertEqual(re.match("((a))", "a").lastindex, 1)
735
736 def test_bug_545855(self):
737 # bug 545855 -- This pattern failed to cause a compile error as it
738 # should, instead provoking a TypeError.
739 self.assertRaises(re.error, re.compile, 'foo[a-')
740
741 def test_bug_418626(self):
742 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
743 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
744 # pattern '*?' on a long string.
745 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
746 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
747 20003)
748 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000749 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +0000750 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000751 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000752
@requires_unicode
def test_bug_612074(self):
    # An escaped non-ASCII character must be usable inside a class.
    pat = u"[%s]" % re.escape(unichr(0x2039))
    compiled = re.compile(pat)
    self.assertEqual(compiled and 1, 1)
757
Skip Montanaro1e703c62003-04-25 15:40:28 +0000758 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000759 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +0000760 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000761 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
762 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
763 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +0000764
Serhiy Storchaka6a8e2b42013-02-16 21:23:01 +0200765 def test_unlimited_zero_width_repeat(self):
766 # Issue #9669
767 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
768 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
769 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
770 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
771 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
772 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
773
Skip Montanaro1e703c62003-04-25 15:40:28 +0000774 def test_scanner(self):
775 def s_ident(scanner, token): return token
776 def s_operator(scanner, token): return "op%s" % token
777 def s_float(scanner, token): return float(token)
778 def s_int(scanner, token): return int(token)
779
780 scanner = Scanner([
781 (r"[a-zA-Z_]\w*", s_ident),
782 (r"\d+\.\d*", s_float),
783 (r"\d+", s_int),
784 (r"=|\+|-|\*|/", s_operator),
785 (r"\s+", None),
786 ])
787
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300788 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000789
Skip Montanaro1e703c62003-04-25 15:40:28 +0000790 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
791 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
792 'op+', 'bar'], ''))
793
Skip Montanaro5ba00542003-04-25 16:00:14 +0000794 def test_bug_448951(self):
795 # bug 448951 (similar to 429357, but with single char match)
796 # (Also test greedy matches.)
797 for op in '','?','*':
798 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
799 (None, None))
800 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
801 ('a:', 'a'))
802
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000803 def test_bug_725106(self):
804 # capturing groups in alternatives in repeats
805 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
806 ('b', 'a'))
807 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
808 ('c', 'b'))
809 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
810 ('b', None))
811 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
812 ('b', None))
813 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
814 ('b', 'a'))
815 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
816 ('c', 'b'))
817 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
818 ('b', None))
819 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
820 ('b', None))
821
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000822 def test_bug_725149(self):
823 # mark_stack_base restoring before restoring marks
824 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
825 ('a', None))
826 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
827 ('a', None, None))
828
@requires_unicode
def test_bug_764548(self):
    # bug 764548, re.compile() barfs on str/unicode subclasses
    class my_unicode(unicode):
        pass

    compiled = re.compile(my_unicode("abc"))
    self.assertIsNone(compiled.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +0000835
Skip Montanaro5ba00542003-04-25 16:00:14 +0000836 def test_finditer(self):
837 iter = re.finditer(r":+", "a:b::c:::d")
838 self.assertEqual([item.group(0) for item in iter],
839 [":", "::", ":::"])
840
@requires_unicode
def test_bug_926075(self):
    # Equal str and unicode patterns must not share one compiled object.
    from_str = re.compile('bug_926075')
    from_unicode = re.compile(u'bug_926075')
    self.assertIsNot(from_str, from_unicode)
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +0000845
@requires_unicode
def test_bug_931848(self):
    # A unicode class of full-stop variants must split on ASCII '.'.
    dots = re.compile(u(r"[\u002E\u3002\uFF0E\uFF61]"))
    pieces = dots.split("a.b.c")
    self.assertEqual(pieces, ['a', 'b', 'c'])
851
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000852 def test_bug_581080(self):
853 iter = re.finditer(r"\s", "a b")
854 self.assertEqual(iter.next().span(), (1,2))
855 self.assertRaises(StopIteration, iter.next)
856
857 scanner = re.compile(r"\s").scanner("a b")
858 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300859 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000860
861 def test_bug_817234(self):
862 iter = re.finditer(r".*", "asdf")
863 self.assertEqual(iter.next().span(), (0, 4))
864 self.assertEqual(iter.next().span(), (4, 4))
865 self.assertRaises(StopIteration, iter.next)
866
@requires_unicode
def test_bug_6561(self):
    # '\d' should match characters in Unicode category 'Nd'
    # (Number, Decimal Digit), but not those in 'Nl' (Number,
    # Letter) or 'No' (Number, Other).
    digit_code_points = (
        0x0037,  # '\N{DIGIT SEVEN}', category 'Nd'
        0x0e58,  # '\N{THAI DIGIT SIX}', category 'Nd'
        0xff10,  # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
    )
    for code_point in digit_code_points:
        char = unichr(code_point)
        found = re.match('^\d$', char, re.UNICODE)
        self.assertEqual(found.group(0), char)

    non_digit_code_points = (
        0x2165,  # '\N{ROMAN NUMERAL SIX}', category 'Nl'
        0x3039,  # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
        0x2082,  # '\N{SUBSCRIPT TWO}', category 'No'
        0x32b4,  # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
    )
    for code_point in non_digit_code_points:
        self.assertIsNone(re.match('^\d$', unichr(code_point), re.UNICODE))
888
def test_empty_array(self):
    # SF buf 1647541: matching against an empty array of any typecode.
    import array
    codes = list('cbBhHiIlLfd')
    if have_unicode:
        codes.append('u')
    for code in codes:
        empty = array.array(code)
        # A non-empty pattern cannot match; the empty pattern matches
        # with no groups.
        self.assertIsNone(re.compile("bla").match(empty))
        self.assertEqual(re.compile("").match(empty).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000899
@requires_unicode
def test_inline_flags(self):
    # Bug #1700: IGNORECASE+UNICODE must behave the same whether given
    # as compile() flags, as inline flags, or as a mix of both.
    upper_char = unichr(0x1ea0)  # Latin Capital Letter A with Dot Bellow
    lower_char = unichr(0x1ea1)  # Latin Small Letter A with Dot Bellow

    # (re.compile() arguments, subject that must match)
    cases = (
        ((upper_char, re.I | re.U), lower_char),
        ((lower_char, re.I | re.U), upper_char),
        (('(?i)' + upper_char, re.U), lower_char),
        (('(?i)' + lower_char, re.U), upper_char),
        (('(?iu)' + upper_char,), lower_char),
        (('(?iu)' + lower_char,), upper_char),
    )
    for compile_args, subject in cases:
        compiled = re.compile(*compile_args)
        self.assertTrue(compiled.match(subject))
Guido van Rossumae04c332008-01-03 19:12:44 +0000929
Amaury Forgeot d'Arcd08a8eb2008-01-10 21:59:42 +0000930 def test_dollar_matches_twice(self):
931 "$ matches the end of string, and just before the terminating \n"
932 pattern = re.compile('$')
933 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
934 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
935 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
936
937 pattern = re.compile('$', re.MULTILINE)
938 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
939 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
940 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
941
Antoine Pitrouefdddd32010-01-14 17:25:24 +0000942 def test_dealloc(self):
943 # issue 3299: check for segfault in debug build
944 import _sre
Ezio Melotti0e4e7322010-01-23 10:43:05 +0000945 # the overflow limit is different on wide and narrow builds and it
946 # depends on the definition of SRE_CODE (see sre.h).
947 # 2**128 should be big enough to overflow on both. For smaller values
948 # a RuntimeError is raised instead of OverflowError.
949 long_overflow = 2**128
Antoine Pitrouefdddd32010-01-14 17:25:24 +0000950 self.assertRaises(TypeError, re.finditer, "a", {})
951 self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
Guido van Rossumae04c332008-01-03 19:12:44 +0000952
Ezio Melottib56b6ff2012-03-13 01:25:40 +0200953 def test_compile(self):
954 # Test return value when given string and pattern as parameter
955 pattern = re.compile('random pattern')
956 self.assertIsInstance(pattern, re._pattern_type)
957 same_pattern = re.compile(pattern)
958 self.assertIsInstance(same_pattern, re._pattern_type)
959 self.assertIs(same_pattern, pattern)
960 # Test behaviour when not given a string or pattern as parameter
961 self.assertRaises(TypeError, re.compile, 0)
962
Ezio Melotti5c4e32b2013-01-11 08:32:01 +0200963 def test_bug_13899(self):
964 # Issue #13899: re pattern r"[\A]" should work like "A" but matches
965 # nothing. Ditto B and Z.
966 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
967 ['A', 'B', '\b', 'C', 'Z'])
968
@precisionbigmemtest(size=_2G, memuse=1)
def test_large_search(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    subject = 'a' * size
    found = re.search('$', subject)
    self.assertIsNotNone(found)
    # Both ends of the empty match sit at the very end of the string.
    for position in (found.start(), found.end()):
        self.assertEqual(position, size)
Antoine Pitrou735f36e2012-12-03 20:53:12 +0100977
# The huge memuse is because of re.sub() using a list and a join()
# to create the replacement result.
@precisionbigmemtest(size=_2G, memuse=16 + 2)
def test_large_subn(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    subject = 'a' * size
    replaced, count = re.subn('', '', subject)
    self.assertEqual(replaced, subject)
    # One empty match before each character plus one at the end.
    self.assertEqual(count, size + 1)
987
988
Serhiy Storchakae18e05c2013-02-16 16:47:15 +0200989 def test_repeat_minmax_overflow(self):
990 # Issue #13169
991 string = "x" * 100000
992 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
993 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
994 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
995 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
996 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
997 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
998 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
999 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1000 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1001 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1002 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1003
@cpython_only
def test_repeat_minmax_overflow_maxrepeat(self):
    # Repeat counts just below MAXREPEAT compile; MAXREPEAT itself
    # overflows.
    try:
        from _sre import MAXREPEAT
    except ImportError:
        self.skipTest('requires _sre.MAXREPEAT constant')
    subject = "x" * 100000
    largest = MAXREPEAT - 1

    # The subject is far shorter than `largest`, so only the
    # "at most" form can match (and then consumes everything).
    self.assertIsNone(re.match(r".{%d}" % largest, subject))
    at_most = re.match(r".{,%d}" % largest, subject)
    self.assertEqual(at_most.span(), (0, 100000))
    self.assertIsNone(re.match(r".{%d,}?" % largest, subject))

    for template in (r".{%d}", r".{,%d}", r".{%d,}?"):
        self.assertRaises(OverflowError, re.compile, template % MAXREPEAT)
1018
R David Murray60773392013-04-14 13:08:50 -04001019 def test_backref_group_name_in_exception(self):
1020 # Issue 17341: Poor error message when compiling invalid regex
1021 with self.assertRaisesRegexp(sre_constants.error, '<foo>'):
1022 re.compile('(?P=<foo>)')
1023
1024 def test_group_name_in_exception(self):
1025 # Issue 17341: Poor error message when compiling invalid regex
1026 with self.assertRaisesRegexp(sre_constants.error, '\?foo'):
1027 re.compile('(?P<?foo>)')
1028
def test_issue17998(self):
    # '.' followed by each repeat (greedy and minimal) and a literal
    # tail must find the whole of 'xyz' under DOTALL.
    patterns = ['.' + reps + mod + 'yz'
                for reps in ('*', '+', '?', '{1}')
                for mod in ('', '?')]
    for pattern in patterns:
        found = re.compile(pattern, re.S).findall('xyz')
        self.assertEqual(found, ['xyz'], msg=pattern)
        if have_unicode:
            # Repeat the check with a unicode pattern and subject.
            pattern = unicode(pattern)
            found = re.compile(pattern, re.S).findall(u'xyz')
            self.assertEqual(found, [u'xyz'], msg=pattern)
Serhiy Storchaka3ade66c2013-08-03 19:26:33 +03001039
Serhiy Storchakae18e05c2013-02-16 16:47:15 +02001040
Serhiy Storchaka83737c62013-08-19 23:20:07 +03001041 def test_bug_2537(self):
1042 # issue 2537: empty submatches
1043 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1044 for inner_op in ('{0,}', '*', '?'):
1045 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1046 m = r.match("xyyzy")
1047 self.assertEqual(m.group(0), "xyy")
1048 self.assertEqual(m.group(1), "")
1049 self.assertEqual(m.group(2), "y")
1050
def test_debug_flag(self):
    # Compiling with re.DEBUG prints a dump of the parsed pattern.
    # NOTE(review): the indentation inside the expected dump below was
    # reconstructed (two spaces per nesting level) -- confirm against
    # the actual DEBUG output before relying on it.
    pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
    dump = '''\
subpattern 1
  literal 46
subpattern None
  branch
    in
      literal 99
      literal 104
  or
    literal 112
    literal 121
subpattern None
  groupref_exists 1
    at at_end
  else
    literal 58
    literal 32
'''
    # Debug output is output again even a second time (bypassing
    # the cache -- issue #20426), hence two identical rounds.
    for _ in range(2):
        with captured_stdout() as out:
            re.compile(pat, re.DEBUG)
        self.assertEqual(out.getvalue(), dump)
Antoine Pitrouf5814112014-02-03 20:59:59 +01001079
Serhiy Storchakae50fe4c2014-03-06 12:24:29 +02001080 def test_keyword_parameters(self):
1081 # Issue #20283: Accepting the string keyword parameter.
1082 pat = re.compile(r'(ab)')
1083 self.assertEqual(
1084 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1085 self.assertEqual(
1086 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1087 self.assertEqual(
1088 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1089 self.assertEqual(
1090 pat.split(string='abracadabra', maxsplit=1),
1091 ['', 'ab', 'racadabra'])
1092
Benjamin Petersonbc4ece52014-09-30 22:04:28 -04001093 def test_match_group_takes_long(self):
1094 self.assertEqual(re.match("(foo)", "foo").group(1L), "foo")
1095 self.assertRaises(IndexError, re.match("", "").group, sys.maxint + 1)
1096
def test_locale_caching(self):
    # Issue #22410
    saved = locale.setlocale(locale.LC_CTYPE)
    self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved)

    # Both locales are required; skip if either cannot be selected.
    required = ('en_US.iso88591', 'en_US.utf8')
    for name in required:
        try:
            locale.setlocale(locale.LC_CTYPE, name)
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs %s locale' % name)

    # Run the per-locale checks in both orders, each time starting
    # from an empty pattern cache.
    orders = (
        (self.check_en_US_iso88591, self.check_en_US_utf8),
        (self.check_en_US_utf8, self.check_en_US_iso88591),
    )
    for first, second in orders:
        re.purge()
        first()
        second()
1113 self.check_en_US_iso88591()
1114
1115 def check_en_US_iso88591(self):
1116 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
1117 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1118 self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))
1119 self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I))
1120 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1121 self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5'))
1122 self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5'))
1123
1124 def check_en_US_utf8(self):
1125 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
1126 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1127 self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I))
1128 self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I))
1129 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1130 self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
1131 self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
1132
Antoine Pitrouf5814112014-02-03 20:59:59 +01001133
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001134def run_re_tests():
Georg Brandla4f46e12010-02-07 17:03:15 +00001135 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001136 if verbose:
1137 print 'Running re_tests test suite'
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001138 else:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001139 # To save time, only run the first and last 10 tests
1140 #tests = tests[:10] + tests[-10:]
1141 pass
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001142
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001143 for t in tests:
1144 sys.stdout.flush()
1145 pattern = s = outcome = repl = expected = None
1146 if len(t) == 5:
1147 pattern, s, outcome, repl, expected = t
1148 elif len(t) == 3:
1149 pattern, s, outcome = t
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001150 else:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001151 raise ValueError, ('Test tuples should have 3 or 5 fields', t)
1152
Guido van Rossum41360a41998-03-26 19:42:58 +00001153 try:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001154 obj = re.compile(pattern)
1155 except re.error:
1156 if outcome == SYNTAX_ERROR: pass # Expected a syntax error
Guido van Rossum41360a41998-03-26 19:42:58 +00001157 else:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001158 print '=== Syntax error:', t
1159 except KeyboardInterrupt: raise KeyboardInterrupt
1160 except:
1161 print '*** Unexpected error ***', t
1162 if verbose:
1163 traceback.print_exc(file=sys.stdout)
1164 else:
Fredrik Lundh17741be2001-03-22 15:51:28 +00001165 try:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001166 result = obj.search(s)
1167 except re.error, msg:
1168 print '=== Unexpected exception', t, repr(msg)
1169 if outcome == SYNTAX_ERROR:
1170 # This should have been a syntax error; forget it.
1171 pass
1172 elif outcome == FAIL:
1173 if result is None: pass # No match, as expected
1174 else: print '=== Succeeded incorrectly', t
1175 elif outcome == SUCCEED:
1176 if result is not None:
1177 # Matched, as expected, so now we compute the
1178 # result string and compare it to our expected result.
1179 start, end = result.span(0)
1180 vardict={'found': result.group(0),
1181 'groups': result.group(),
1182 'flags': result.re.flags}
1183 for i in range(1, 100):
1184 try:
1185 gi = result.group(i)
1186 # Special hack because else the string concat fails:
1187 if gi is None:
1188 gi = "None"
1189 except IndexError:
1190 gi = "Error"
1191 vardict['g%d' % i] = gi
1192 for i in result.re.groupindex.keys():
1193 try:
1194 gi = result.group(i)
1195 if gi is None:
1196 gi = "None"
1197 except IndexError:
1198 gi = "Error"
1199 vardict[i] = gi
1200 repl = eval(repl, vardict)
1201 if repl != expected:
1202 print '=== grouping error', t,
1203 print repr(repl) + ' should be ' + repr(expected)
1204 else:
1205 print '=== Failed incorrectly', t
1206
1207 # Try the match on a unicode string, and check that it
1208 # still succeeds.
1209 try:
1210 result = obj.search(unicode(s, "latin-1"))
1211 if result is None:
1212 print '=== Fails on unicode match', t
1213 except NameError:
1214 continue # 1.5.2
1215 except TypeError:
1216 continue # unicode test case
1217
1218 # Try the match on a unicode pattern, and check that it
1219 # still succeeds.
1220 obj=re.compile(unicode(pattern, "latin-1"))
1221 result = obj.search(s)
Fredrik Lundh17741be2001-03-22 15:51:28 +00001222 if result is None:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001223 print '=== Fails on unicode pattern match', t
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001224
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001225 # Try the match with the search area limited to the extent
1226 # of the match and see if it still succeeds. \B will
1227 # break (because it won't match at the end or start of a
1228 # string), so we'll ignore patterns that feature it.
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001229
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001230 if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \
1231 and result is not None:
1232 obj = re.compile(pattern)
1233 result = obj.search(s, result.start(0), result.end(0) + 1)
1234 if result is None:
1235 print '=== Failed on range-limited match', t
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001236
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001237 # Try the match with IGNORECASE enabled, and check that it
1238 # still succeeds.
1239 obj = re.compile(pattern, re.IGNORECASE)
1240 result = obj.search(s)
Fred Drake132dce22000-12-12 23:11:42 +00001241 if result is None:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001242 print '=== Fails on case-insensitive match', t
Guido van Rossumdfa67901997-12-08 17:12:06 +00001243
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001244 # Try the match with LOCALE enabled, and check that it
1245 # still succeeds.
1246 obj = re.compile(pattern, re.LOCALE)
1247 result = obj.search(s)
1248 if result is None:
1249 print '=== Fails on locale-sensitive match', t
Guido van Rossumdfa67901997-12-08 17:12:06 +00001250
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001251 # Try the match with UNICODE locale enabled, and check
1252 # that it still succeeds.
1253 obj = re.compile(pattern, re.UNICODE)
1254 result = obj.search(s)
1255 if result is None:
1256 print '=== Fails on unicode-sensitive match', t
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001257
def test_main():
    """Run the unittest-based ReTests, then the table-driven checks."""
    run_unittest(ReTests)
    run_re_tests()
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001261
# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()