blob: 5e2914d4709f37d0d77bb3648686255148fa7288 [file] [log] [blame]
Serhiy Storchakae9277572014-11-10 12:37:02 +02001# -*- coding: utf-8 -*-
Florent Xicluna6257a7b2010-03-31 22:01:03 +00002from test.test_support import verbose, run_unittest, import_module
Serhiy Storchakae18e05c2013-02-16 16:47:15 +02003from test.test_support import precisionbigmemtest, _2G, cpython_only
Serhiy Storchaka7644ff12014-09-14 17:40:44 +03004from test.test_support import captured_stdout, have_unicode, requires_unicode, u
Serhiy Storchakad4c72902014-10-31 00:53:19 +02005import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00006import re
Neal Norwitz94a9c092006-03-16 06:30:02 +00007from re import Scanner
R David Murray60773392013-04-14 13:08:50 -04008import sre_constants
Ezio Melotti46645632011-03-25 14:50:52 +02009import sys
10import string
11import traceback
Raymond Hettinger027bb632004-05-31 03:09:25 +000012from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000013
Antoine Pitrou735f36e2012-12-03 20:53:12 +010014
Guido van Rossum23b22571997-07-17 22:36:14 +000015# Misc tests from Tim Peters' re.doc
16
Just van Rossum6802c6e2003-07-02 14:36:59 +000017# WARNING: Don't change details in these tests if you don't know
Ezio Melotti24b07bc2011-03-15 18:55:01 +020018# what you're doing. Some of these tests were carefully modeled to
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +000019# cover most of the code.
20
Skip Montanaro8ed06da2003-04-24 19:43:18 +000021import unittest
Guido van Rossum8430c581998-04-03 21:47:12 +000022
Skip Montanaro8ed06da2003-04-24 19:43:18 +000023class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000024
25 def test_weakref(self):
26 s = 'QabbbcR'
27 x = re.compile('ab+c')
28 y = proxy(x)
29 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
30
Skip Montanaro8ed06da2003-04-24 19:43:18 +000031 def test_search_star_plus(self):
32 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
33 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
34 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
35 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +030036 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000037 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
38 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
39 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
40 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +030041 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000042
def bump_num(self, matchobj):
    """Return the whole matched text, read as an int, plus one, as a str.

    Used as a replacement callback for re.sub(); ``matchobj`` is the
    match object whose group 0 is parsed with int().
    """
    return str(1 + int(matchobj.group(0)))
Guido van Rossum23b22571997-07-17 22:36:14 +000046
def test_basic_re_sub(self):
    # re.sub() with string and callable replacements, group references
    # in templates, and backslash escapes in templates.  Literals that
    # contain regex/template backslashes are raw strings so that none of
    # them relies on an unrecognized string-literal escape.
    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                     '9.3 -3 23x99y')

    # A callable's return value is inserted as-is, while a string
    # template has its escapes processed.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    # Named groups can be referenced by name or by number.
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
                     '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))

    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +000074
Skip Montanaro2726fcd2003-04-25 14:31:54 +000075 def test_bug_449964(self):
76 # fails for group followed by other escape
77 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
78 'xx\bxx\b')
79
80 def test_bug_449000(self):
81 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +000082 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
83 'abc\ndef\n')
84 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
85 'abc\ndef\n')
86 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
87 'abc\ndef\n')
88 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
89 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +000090
@requires_unicode
def test_bug_1140(self):
    # re.sub(x, y, u'') should return u'', not '', and
    # re.sub(x, y, '') should return '', not u''.
    # Also:
    # re.sub(x, y, unicode(x)) should return unicode(y), and
    # re.sub(x, y, str(x)) should return
    #     str(y) if isinstance(y, str) else unicode(y).
    # Both the pattern (x) and the replacement (y) are tried as a byte
    # string and as a unicode string.
    for x in 'x', u'x':
        for y in 'y', u'y':
            # Empty unicode subject: result stays unicode.
            z = re.sub(x, y, u'')
            self.assertEqual(z, u'')
            self.assertEqual(type(z), unicode)
            #
            # Empty str subject: result stays str.
            z = re.sub(x, y, '')
            self.assertEqual(z, '')
            self.assertEqual(type(z), str)
            #
            # Unicode subject that matches: result is unicode.
            z = re.sub(x, y, unicode(x))
            self.assertEqual(z, y)
            self.assertEqual(type(z), unicode)
            #
            # str subject that matches: result takes the replacement's type.
            z = re.sub(x, y, str(x))
            self.assertEqual(z, y)
            self.assertEqual(type(z), type(y))
116
Raymond Hettinger80016c92007-12-19 18:13:31 +0000117 def test_bug_1661(self):
118 # Verify that flags do not get silently ignored with compiled patterns
119 pattern = re.compile('.')
120 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
121 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
122 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
123 self.assertRaises(ValueError, re.compile, pattern, re.I)
124
Guido van Rossume3c4fd92008-09-10 14:27:00 +0000125 def test_bug_3629(self):
126 # A regex that triggered a bug in the sre-code validator
127 re.compile("(?P<quote>)(?(quote))")
128
def test_sub_template_numeric_escape(self):
    # bug 776311 and friends
    # How backslash-digit sequences in a sub() template are read: as an
    # octal character escape or as a group reference.

    # A leading zero, or three octal digits, gives a character escape;
    # a following non-octal digit is kept as literal text.
    self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
    self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
    self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
    self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
    self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
    self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
    self.assertEqual(re.sub('x', r'\117', 'x'), '\117')

    # At most three digits are consumed by the escape.
    self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
    self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')

    self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
    self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
    self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
    self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
    self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')

    # Octal values above 0377 are expected to wrap to their low 8 bits.
    self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
    self.assertEqual(re.sub('x', r'\777', 'x'), '\377')

    # The pattern 'x' has no groups, so anything read as a group
    # reference must raise re.error.
    self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
    self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
    self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
    self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'

    # in python2.3 (etc), these loop endlessly in sre_parser.py
    # With eleven groups in the pattern, \11 is a valid group reference.
    self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
    self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
                     'xz8')
    self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
                     'xza')
170
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000171 def test_qualified_re_sub(self):
172 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
173 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000174
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000175 def test_bug_114660(self):
176 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
177 'hello there')
178
def test_bug_462270(self):
    # Test for empty sub() behaviour, see SF bug #462270
    # 'x*' can match the empty string; the expected value pins where
    # those empty matches are substituted.
    self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
    # 'x+' needs at least one 'x', so only the real 'x' is replaced.
    self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
183
Ezio Melottief317382012-11-03 20:31:12 +0200184 def test_symbolic_groups(self):
185 re.compile('(?P<a>x)(?P=a)(?(a)y)')
186 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
187 self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
188 self.assertRaises(re.error, re.compile, '(?Px)')
189 self.assertRaises(re.error, re.compile, '(?P=)')
190 self.assertRaises(re.error, re.compile, '(?P=1)')
191 self.assertRaises(re.error, re.compile, '(?P=a)')
192 self.assertRaises(re.error, re.compile, '(?P=a1)')
193 self.assertRaises(re.error, re.compile, '(?P=a.)')
194 self.assertRaises(re.error, re.compile, '(?P<)')
195 self.assertRaises(re.error, re.compile, '(?P<>)')
196 self.assertRaises(re.error, re.compile, '(?P<1>)')
197 self.assertRaises(re.error, re.compile, '(?P<a.>)')
198 self.assertRaises(re.error, re.compile, '(?())')
199 self.assertRaises(re.error, re.compile, '(?(a))')
200 self.assertRaises(re.error, re.compile, '(?(1a))')
201 self.assertRaises(re.error, re.compile, '(?(a.))')
202
def test_symbolic_refs(self):
    # Malformed or unresolvable group references in a sub() template must
    # raise.  Templates are raw strings so the backslash reaches the
    # template parser without relying on an unrecognized string escape.
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<a', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<a a>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<1a1>', 'xx')
    # A syntactically valid but unknown group name is an IndexError.
    self.assertRaises(IndexError, re.sub, '(?P<a>x)', r'\g<ab>', 'xx')
    # Referencing a group that exists but did not take part in the match.
    self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', r'\2', 'xx')
    self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<-1>', 'xx')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000214
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000215 def test_re_subn(self):
216 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
217 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
218 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
219 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
220 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000221
def test_re_split(self):
    # re.split() on the same subject with different separator patterns.
    self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
    self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
    # A capturing group in the separator puts the separators in the result.
    self.assertEqual(re.split("(:*)", ":a:b::c"),
                     ['', ':', 'a', ':', 'b', '::', 'c'])
    # A non-capturing group does not.
    self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
    # A repeated group reports only its last repetition.
    self.assertEqual(re.split("(:)*", ":a:b::c"),
                     ['', ':', 'a', ':', 'b', ':', 'c'])
    self.assertEqual(re.split("([b:]+)", ":a:b::c"),
                     ['', ':', 'a', ':b::', 'c'])
    # With two alternative groups, the one that did not match is None.
    self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
                     ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c'])
    self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
                     ['', 'a', '', '', 'c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000237
def test_qualified_re_split(self):
    # re.split() with a maxsplit of 2: after two splits the rest of the
    # string is returned as the final element.
    self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
    self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
    # Captured separators appear in the result alongside the pieces.
    self.assertEqual(re.split("(:)", ":a:b::c", 2),
                     ['', ':', 'a', ':', 'b::c'])
    self.assertEqual(re.split("(:*)", ":a:b::c", 2),
                     ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000245
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000246 def test_re_findall(self):
247 self.assertEqual(re.findall(":+", "abc"), [])
248 self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
249 self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
250 self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""),
251 (":", ":"),
252 (":", "::")])
Guido van Rossum49946571997-07-18 04:26:25 +0000253
Skip Montanaro5ba00542003-04-25 16:00:14 +0000254 def test_bug_117612(self):
255 self.assertEqual(re.findall(r"(a|(b))", "aba"),
256 [("a", ""),("b", "b"),("a", "")])
257
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000258 def test_re_match(self):
Skip Montanaro5ba00542003-04-25 16:00:14 +0000259 self.assertEqual(re.match('a', 'a').groups(), ())
260 self.assertEqual(re.match('(a)', 'a').groups(), ('a',))
261 self.assertEqual(re.match(r'(a)', 'a').group(0), 'a')
262 self.assertEqual(re.match(r'(a)', 'a').group(1), 'a')
263 self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a'))
Guido van Rossum49946571997-07-18 04:26:25 +0000264
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000265 pat = re.compile('((a)|(b))(c)?')
266 self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
267 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
268 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
269 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
270 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
Guido van Rossum8430c581998-04-03 21:47:12 +0000271
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000272 # A single group
273 m = re.match('(a)', 'a')
274 self.assertEqual(m.group(0), 'a')
275 self.assertEqual(m.group(0), 'a')
276 self.assertEqual(m.group(1), 'a')
277 self.assertEqual(m.group(1, 1), ('a', 'a'))
Guido van Rossum49946571997-07-18 04:26:25 +0000278
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000279 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
280 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
281 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
282 (None, 'b', None))
283 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000284
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000285 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000286 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
287 ('(', 'a'))
288 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
289 (None, 'a'))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300290 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
291 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000292 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
293 ('a', 'b'))
294 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
295 (None, 'd'))
296 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
297 (None, 'd'))
298 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
299 ('a', ''))
300
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000301 # Tests for bug #1177831: exercise groups other than the first group
302 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
303 self.assertEqual(p.match('abc').groups(),
304 ('a', 'b', 'c'))
305 self.assertEqual(p.match('ad').groups(),
306 ('a', None, 'd'))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300307 self.assertIsNone(p.match('abd'))
308 self.assertIsNone(p.match('ac'))
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000309
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000310
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000311 def test_re_groupref(self):
312 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
313 ('|', 'a'))
314 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
315 (None, 'a'))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300316 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
317 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000318 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
319 ('a', 'a'))
320 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
321 (None, None))
322
323 def test_groupdict(self):
324 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
325 'first second').groupdict(),
326 {'first':'first', 'second':'second'})
327
328 def test_expand(self):
329 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
330 "first second")
331 .expand(r"\2 \1 \g<second> \g<first>"),
332 "second first second first")
333
334 def test_repeat_minmax(self):
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300335 self.assertIsNone(re.match("^(\w){1}$", "abc"))
336 self.assertIsNone(re.match("^(\w){1}?$", "abc"))
337 self.assertIsNone(re.match("^(\w){1,2}$", "abc"))
338 self.assertIsNone(re.match("^(\w){1,2}?$", "abc"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000339
340 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
341 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
342 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
343 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
344 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
345 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
346 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
347 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
348
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300349 self.assertIsNone(re.match("^x{1}$", "xxx"))
350 self.assertIsNone(re.match("^x{1}?$", "xxx"))
351 self.assertIsNone(re.match("^x{1,2}$", "xxx"))
352 self.assertIsNone(re.match("^x{1,2}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000353
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300354 self.assertTrue(re.match("^x{3}$", "xxx"))
355 self.assertTrue(re.match("^x{1,3}$", "xxx"))
356 self.assertTrue(re.match("^x{1,4}$", "xxx"))
357 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
358 self.assertTrue(re.match("^x{3}?$", "xxx"))
359 self.assertTrue(re.match("^x{1,3}?$", "xxx"))
360 self.assertTrue(re.match("^x{1,4}?$", "xxx"))
361 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000362
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300363 self.assertIsNone(re.match("^x{}$", "xxx"))
364 self.assertTrue(re.match("^x{}$", "x{}"))
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000365
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000366 def test_getattr(self):
367 self.assertEqual(re.match("(a)", "a").pos, 0)
368 self.assertEqual(re.match("(a)", "a").endpos, 1)
369 self.assertEqual(re.match("(a)", "a").string, "a")
370 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300371 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000372
def test_special_escapes(self):
    # \b, \B, \A, \Z and the \d \D \w \W \s \S classes, with default
    # flags, re.LOCALE and (when unicode is available) re.UNICODE.
    self.assertEqual(re.search(r"\b(b.)\b",
                               "abcd abc bcd bx").group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B",
                               "abc bcd bc abxd").group(1), "bx")
    self.assertEqual(re.search(r"\b(b.)\b",
                               "abcd abc bcd bx", re.LOCALE).group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B",
                               "abc bcd bc abxd", re.LOCALE).group(1), "bx")
    if have_unicode:
        self.assertEqual(re.search(r"\b(b.)\b",
                                   "abcd abc bcd bx", re.UNICODE).group(1), "bx")
        self.assertEqual(re.search(r"\B(b.)\B",
                                   "abc bcd bc abxd", re.UNICODE).group(1), "bx")
    # In MULTILINE mode ^ and $ match at line breaks, but \A and \Z are
    # still expected to anchor to the whole string.
    self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
    # The same checks again with unicode subject strings.
    self.assertEqual(re.search(r"\b(b.)\b",
                               u"abcd abc bcd bx").group(1), "bx")
    self.assertEqual(re.search(r"\B(b.)\B",
                               u"abc bcd bc abxd").group(1), "bx")
    self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc")
    self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc")
    self.assertIsNone(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M))
    # One character from each class and its negation, in sequence.
    self.assertEqual(re.search(r"\d\D\w\W\s\S",
                               "1aa! a").group(0), "1aa! a")
    self.assertEqual(re.search(r"\d\D\w\W\s\S",
                               "1aa! a", re.LOCALE).group(0), "1aa! a")
    if have_unicode:
        self.assertEqual(re.search(r"\d\D\w\W\s\S",
                                   "1aa! a", re.UNICODE).group(0), "1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000404
Ezio Melotti38ae5b22012-02-29 11:40:00 +0200405 def test_string_boundaries(self):
406 # See http://bugs.python.org/issue10713
407 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
408 "abc")
409 # There's a word boundary at the start of a string.
410 self.assertTrue(re.match(r"\b", "abc"))
411 # A non-empty string includes a non-boundary zero-length match.
412 self.assertTrue(re.search(r"\B", "abc"))
413 # There is no non-boundary match at the start of a string.
414 self.assertFalse(re.match(r"\B", "abc"))
415 # However, an empty string contains no word boundaries, and also no
416 # non-boundaries.
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300417 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti38ae5b22012-02-29 11:40:00 +0200418 # This one is questionable and different from the perlre behaviour,
419 # but describes current behavior.
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300420 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti38ae5b22012-02-29 11:40:00 +0200421 # A single word-character string has two boundaries, but no
422 # non-boundary gaps.
423 self.assertEqual(len(re.findall(r"\b", "a")), 2)
424 self.assertEqual(len(re.findall(r"\B", "a")), 0)
425 # If there are no words, there are no boundaries
426 self.assertEqual(len(re.findall(r"\b", " ")), 0)
427 self.assertEqual(len(re.findall(r"\b", " ")), 0)
428 # Can match around the whitespace.
429 self.assertEqual(len(re.findall(r"\B", " ")), 2)
430
@requires_unicode
def test_bigcharset(self):
    # Character classes containing code points above the 8-bit range.
    self.assertEqual(re.match(u(r"([\u2222\u2223])"),
                              unichr(0x2222)).group(1), unichr(0x2222))
    self.assertEqual(re.match(u(r"([\u2222\u2223])"),
                              unichr(0x2222), re.UNICODE).group(1), unichr(0x2222))
    # A class built from every 255th code point from 256 up to (but not
    # including) 2**16; 0xff01 == 256 + 255 * 255 is one of its members.
    r = u'[%s]' % u''.join(map(unichr, range(256, 2**16, 255)))
    self.assertEqual(re.match(r, unichr(0xff01), re.UNICODE).group(), unichr(0xff01))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000439
Antoine Pitroub83ea142012-11-20 22:30:42 +0100440 def test_big_codesize(self):
441 # Issue #1160
442 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300443 self.assertTrue(r.match('1000'))
444 self.assertTrue(r.match('9999'))
Antoine Pitroub83ea142012-11-20 22:30:42 +0100445
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000446 def test_anyall(self):
447 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
448 "a\nb")
449 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
450 "a\n\nb")
451
Benjamin Petersonf8c8d2e2014-11-30 11:47:54 -0500452 def test_non_consuming(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000453 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
454 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
455 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
456 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
457 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
458 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
459 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
460
461 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
462 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
463 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
464 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
465
466 def test_ignore_case(self):
Georg Brandl30de77b2008-08-24 18:11:07 +0000467 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
468 self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000469 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
470 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
471 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
472 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
473 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
474 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
475 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
476 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
477
Serhiy Storchakae9277572014-11-10 12:37:02 +0200478 if have_unicode:
479 assert u(r'\u212a').lower() == u'k' # 'K'
480 self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I))
481 self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I))
482 self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
483 self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
484 assert u(r'\u017f').upper() == u'S' # 'ſ'
485 self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I))
486 self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I))
487 self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
488 self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I))
489
490 def test_ignore_case_set(self):
491 self.assertTrue(re.match(r'[19A]', 'A', re.I))
492 self.assertTrue(re.match(r'[19a]', 'a', re.I))
493 self.assertTrue(re.match(r'[19a]', 'A', re.I))
494 self.assertTrue(re.match(r'[19A]', 'a', re.I))
495 if have_unicode:
496 self.assertTrue(re.match(ur'[19A]', u'A', re.U | re.I))
497 self.assertTrue(re.match(ur'[19a]', u'a', re.U | re.I))
498 self.assertTrue(re.match(ur'[19a]', u'A', re.U | re.I))
499 self.assertTrue(re.match(ur'[19A]', u'a', re.U | re.I))
500 assert u(r'\u212a').lower() == u'k' # 'K'
501 self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I))
502 self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I))
503 self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I))
504 self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I))
505 assert u(r'\u017f').upper() == u'S' # 'ſ'
506 self.assertTrue(re.match(ur'[19S]', u(r'\u017f'), re.U | re.I))
507 self.assertTrue(re.match(ur'[19s]', u(r'\u017f'), re.U | re.I))
508 self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I))
509 self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I))
510
Serhiy Storchakae9e54ae2014-10-31 13:53:21 +0200511 def test_ignore_case_range(self):
512 # Issues #3511, #17381.
513 self.assertTrue(re.match(r'[9-a]', '_', re.I))
514 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
515 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
516 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
517 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7',re.I))
518 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
519 if have_unicode:
520 self.assertTrue(re.match(u(r'[9-a]'), u(r'_'), re.U | re.I))
521 self.assertIsNone(re.match(u(r'[9-A]'), u(r'_'), re.U | re.I))
522 self.assertTrue(re.match(u(r'[\xc0-\xde]'),
523 u(r'\xd7'), re.U | re.I))
524 self.assertIsNone(re.match(u(r'[\xc0-\xde]'),
525 u(r'\xf7'), re.U | re.I))
526 self.assertTrue(re.match(u(r'[\xe0-\xfe]'),
527 u(r'\xf7'), re.U | re.I))
528 self.assertIsNone(re.match(u(r'[\xe0-\xfe]'),
529 u(r'\xd7'), re.U | re.I))
530 self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
531 u(r'\u0450'), re.U | re.I))
532 self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
533 u(r'\u0400'), re.U | re.I))
534 self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
535 u(r'\u0450'), re.U | re.I))
536 self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
537 u(r'\u0400'), re.U | re.I))
538 if sys.maxunicode > 0xffff:
539 self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
540 u(r'\U00010428'), re.U | re.I))
541 self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
542 u(r'\U00010400'), re.U | re.I))
543 self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
544 u(r'\U00010428'), re.U | re.I))
545 self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
546 u(r'\U00010400'), re.U | re.I))
547
Serhiy Storchakae9277572014-11-10 12:37:02 +0200548 assert u(r'\u212a').lower() == u'k' # 'K'
549 self.assertTrue(re.match(ur'[J-M]', u(r'\u212a'), re.U | re.I))
550 self.assertTrue(re.match(ur'[j-m]', u(r'\u212a'), re.U | re.I))
551 self.assertTrue(re.match(u(r'[\u2129-\u212b]'), u'K', re.U | re.I))
552 self.assertTrue(re.match(u(r'[\u2129-\u212b]'), u'k', re.U | re.I))
553 assert u(r'\u017f').upper() == u'S' # 'ſ'
554 self.assertTrue(re.match(ur'[R-T]', u(r'\u017f'), re.U | re.I))
555 self.assertTrue(re.match(ur'[r-t]', u(r'\u017f'), re.U | re.I))
556 self.assertTrue(re.match(u(r'[\u017e-\u0180]'), u'S', re.U | re.I))
557 self.assertTrue(re.match(u(r'[\u017e-\u0180]'), u's', re.U | re.I))
558
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000559 def test_category(self):
560 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
561
def test_getlower(self):
    # _sre.getlower() must map ord('A') to ord('a') with no flags, with
    # re.LOCALE and (when unicode is available) with re.UNICODE.
    # Imported locally: _sre is only needed by this test.
    import _sre
    self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
    self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
    if have_unicode:
        self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))

    # The same folding seen through the public API, for str and unicode
    # subjects.
    self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
    self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
571
572 def test_not_literal(self):
573 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
574 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
575
576 def test_search_coverage(self):
577 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
578 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
579
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    """Assert that ``matcher(pattern, text)`` matches as expected.

    With neither ``match`` nor ``span`` given, the pattern must match the
    whole of ``text``.  Otherwise both must be given: ``match`` is the
    expected matched text and ``span`` the expected (start, end) pair.

    Raises ValueError when exactly one of ``match`` and ``span`` is given.
    """
    has_match = match is not None
    has_span = span is not None
    if has_match != has_span:
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if not has_match:
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    m = matcher(pattern, text)
    self.assertTrue(m)
    self.assertEqual(m.group(), match)
    self.assertEqual(m.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000593
@requires_unicode
def test_re_escape(self):
    # re.escape() on unicode input, for each of the first 256 code points:
    # ASCII letters and digits are returned unchanged, NUL becomes \000,
    # and anything else is prefixed with a backslash.
    alnum_chars = unicode(string.ascii_letters + string.digits)
    all_chars = u''.join(map(unichr, range(256)))
    for char in all_chars:
        if char in alnum_chars:
            expected = char
        elif char == u'\x00':
            expected = u'\\000'
        else:
            expected = u'\\' + char
        self.assertEqual(re.escape(char), expected)
        # The escaped form must match the original character.
        self.assertMatch(re.escape(char), char)
    self.assertMatch(re.escape(all_chars), all_chars)
607
def test_re_escape_byte(self):
    # Byte-string counterpart of the unicode escape test: letters and
    # digits are returned unchanged, NUL becomes \000, and every other
    # byte value is prefixed with a backslash.
    alnum_chars = string.ascii_letters + string.digits
    all_bytes = ''.join(map(chr, range(256)))
    for byte in all_bytes:
        if byte in alnum_chars:
            expected = byte
        elif byte == b'\x00':
            expected = b'\\000'
        else:
            expected = b'\\' + byte
        self.assertEqual(re.escape(byte), expected)
        # The escaped form must match the original byte.
        self.assertMatch(re.escape(byte), byte)
    self.assertMatch(re.escape(all_bytes), all_bytes)
620
@requires_unicode
def test_re_escape_non_ascii(self):
    # Escaping a unicode string containing U+2620: each non-ASCII
    # character gets a backslash and the result still matches the input.
    text = u(r'xxx\u2620\u2620\u2620xxx')
    escaped = re.escape(text)
    self.assertEqual(escaped, u(r'xxx\\\u2620\\\u2620\\\u2620xxx'))
    self.assertMatch(escaped, text)
    # An escaped single character can be embedded in a larger pattern.
    embedded = u'.%s+.' % re.escape(unichr(0x2620))
    self.assertMatch(embedded, text,
                     u(r'x\u2620\u2620\u2620x'), (2, 7), re.search)
Ezio Melotti46645632011-03-25 14:50:52 +0200629
def test_re_escape_non_ascii_bytes(self):
    # Escaping a byte string with non-ASCII bytes: every such byte gets a
    # backslash, and the escaped pattern still matches the original data.
    data = b'y\xe2\x98\xa0y\xe2\x98\xa0y'
    escaped = re.escape(data)
    self.assertEqual(escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
    self.assertMatch(escaped, data)
    # The three-byte sequence occurs twice in the data.
    occurrences = re.findall(re.escape(b'\xe2\x98\xa0'), data)
    self.assertEqual(len(occurrences), 2)
Guido van Rossum49946571997-07-18 04:26:25 +0000637
def test_pickling(self):
    # Run the round-trip check with the pure-Python pickle module first,
    # then with cPickle; each module is imported just before it is used.
    for module_name in ('pickle', 'cPickle'):
        self.pickle_test(__import__(module_name))
    # old pickles expect the _compile() reconstructor in sre module
    import_module("sre", deprecated=True)
    from sre import _compile
    # current pickle expects the _compile() reconstructor in re module
    from re import _compile
Skip Montanaro1e703c62003-04-25 15:40:28 +0000648
def pickle_test(self, pickle):
    """Check that a compiled pattern survives a round trip through *pickle*.

    *pickle* is a module object providing dumps(), loads() and
    HIGHEST_PROTOCOL; every protocol from 0 up to the highest is tried.
    """
    original = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
    protocol = 0
    while protocol <= pickle.HIGHEST_PROTOCOL:
        restored = pickle.loads(pickle.dumps(original, protocol))
        self.assertEqual(restored, original)
        protocol += 1
Guido van Rossum23b22571997-07-17 22:36:14 +0000655
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000656 def test_constants(self):
657 self.assertEqual(re.I, re.IGNORECASE)
658 self.assertEqual(re.L, re.LOCALE)
659 self.assertEqual(re.M, re.MULTILINE)
660 self.assertEqual(re.S, re.DOTALL)
661 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000662
def test_flags(self):
    # Compiling with each individual flag must yield a truthy pattern.
    for flag in (re.I, re.M, re.X, re.S, re.L):
        compiled = re.compile('^pattern$', flag)
        self.assertTrue(compiled)
Guido van Rossumf473cb01998-01-14 16:42:17 +0000666
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000667 def test_sre_character_literals(self):
668 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300669 self.assertTrue(re.match(r"\%03o" % i, chr(i)))
670 self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
671 self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
672 self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
673 self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
674 self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000675 self.assertRaises(re.error, re.match, "\911", "")
676
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000677 def test_sre_character_class_literals(self):
678 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300679 self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
680 self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
681 self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
682 self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
683 self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
684 self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000685 self.assertRaises(re.error, re.match, "[\911]", "")
686
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000687 def test_bug_113254(self):
688 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
689 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
690 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
691
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000692 def test_bug_527371(self):
693 # bug described in patches 527371/672491
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300694 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000695 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
696 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
697 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
698 self.assertEqual(re.match("((a))", "a").lastindex, 1)
699
700 def test_bug_545855(self):
701 # bug 545855 -- This pattern failed to cause a compile error as it
702 # should, instead provoking a TypeError.
703 self.assertRaises(re.error, re.compile, 'foo[a-')
704
705 def test_bug_418626(self):
706 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
707 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
708 # pattern '*?' on a long string.
709 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
710 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
711 20003)
712 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000713 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +0000714 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000715 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000716
@requires_unicode
def test_bug_612074(self):
    # A character class holding an escaped non-ASCII character must
    # compile to a truthy pattern object.
    pat = u"[%s]" % re.escape(unichr(0x2039))
    compiled = re.compile(pat)
    self.assertEqual(compiled and 1, 1)
721
Skip Montanaro1e703c62003-04-25 15:40:28 +0000722 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000723 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +0000724 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000725 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
726 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
727 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +0000728
Serhiy Storchaka6a8e2b42013-02-16 21:23:01 +0200729 def test_unlimited_zero_width_repeat(self):
730 # Issue #9669
731 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
732 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
733 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
734 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
735 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
736 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
737
Skip Montanaro1e703c62003-04-25 15:40:28 +0000738 def test_scanner(self):
739 def s_ident(scanner, token): return token
740 def s_operator(scanner, token): return "op%s" % token
741 def s_float(scanner, token): return float(token)
742 def s_int(scanner, token): return int(token)
743
744 scanner = Scanner([
745 (r"[a-zA-Z_]\w*", s_ident),
746 (r"\d+\.\d*", s_float),
747 (r"\d+", s_int),
748 (r"=|\+|-|\*|/", s_operator),
749 (r"\s+", None),
750 ])
751
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300752 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000753
Skip Montanaro1e703c62003-04-25 15:40:28 +0000754 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
755 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
756 'op+', 'bar'], ''))
757
Skip Montanaro5ba00542003-04-25 16:00:14 +0000758 def test_bug_448951(self):
759 # bug 448951 (similar to 429357, but with single char match)
760 # (Also test greedy matches.)
761 for op in '','?','*':
762 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
763 (None, None))
764 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
765 ('a:', 'a'))
766
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000767 def test_bug_725106(self):
768 # capturing groups in alternatives in repeats
769 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
770 ('b', 'a'))
771 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
772 ('c', 'b'))
773 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
774 ('b', None))
775 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
776 ('b', None))
777 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
778 ('b', 'a'))
779 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
780 ('c', 'b'))
781 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
782 ('b', None))
783 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
784 ('b', None))
785
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000786 def test_bug_725149(self):
787 # mark_stack_base restoring before restoring marks
788 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
789 ('a', None))
790 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
791 ('a', None, None))
792
@requires_unicode
def test_bug_764548(self):
    # bug 764548, re.compile() barfs on str/unicode subclasses
    class my_unicode(unicode):
        pass

    compiled = re.compile(my_unicode("abc"))
    self.assertIsNone(compiled.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +0000799
Skip Montanaro5ba00542003-04-25 16:00:14 +0000800 def test_finditer(self):
801 iter = re.finditer(r":+", "a:b::c:::d")
802 self.assertEqual([item.group(0) for item in iter],
803 [":", "::", ":::"])
804
@requires_unicode
def test_bug_926075(self):
    # Compiling the same text as a byte string and as a unicode string
    # must produce two distinct pattern objects.
    byte_pattern = re.compile('bug_926075')
    unicode_pattern = re.compile(u'bug_926075')
    self.assertIsNot(byte_pattern, unicode_pattern)
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +0000809
@requires_unicode
def test_bug_931848(self):
    # A unicode character class that includes the ASCII full stop
    # (U+002E) must split a plain dotted string on its dots.
    pattern = u(r"[\u002E\u3002\uFF0E\uFF61]")
    pieces = re.compile(pattern).split("a.b.c")
    self.assertEqual(pieces, ['a', 'b', 'c'])
815
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000816 def test_bug_581080(self):
817 iter = re.finditer(r"\s", "a b")
818 self.assertEqual(iter.next().span(), (1,2))
819 self.assertRaises(StopIteration, iter.next)
820
821 scanner = re.compile(r"\s").scanner("a b")
822 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakaed5ea152014-09-14 16:19:37 +0300823 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000824
825 def test_bug_817234(self):
826 iter = re.finditer(r".*", "asdf")
827 self.assertEqual(iter.next().span(), (0, 4))
828 self.assertEqual(iter.next().span(), (4, 4))
829 self.assertRaises(StopIteration, iter.next)
830
@requires_unicode
def test_bug_6561(self):
    # '\d' should match characters in Unicode category 'Nd'
    # (Number, Decimal Digit), but not those in 'Nl' (Number,
    # Letter) or 'No' (Number, Other).
    # The pattern is a raw string so the backslash is explicit rather
    # than relying on "\d" not being a Python string escape.
    single_digit = r'^\d$'

    decimal_digits = [
        unichr(0x0037), # '\N{DIGIT SEVEN}', category 'Nd'
        unichr(0x0e58), # '\N{THAI DIGIT SIX}', category 'Nd'
        unichr(0xff10), # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
        ]
    for x in decimal_digits:
        self.assertEqual(re.match(single_digit, x, re.UNICODE).group(0), x)

    not_decimal_digits = [
        unichr(0x2165), # '\N{ROMAN NUMERAL SIX}', category 'Nl'
        unichr(0x3039), # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
        unichr(0x2082), # '\N{SUBSCRIPT TWO}', category 'No'
        unichr(0x32b4), # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
        ]
    for x in not_decimal_digits:
        self.assertIsNone(re.match(single_digit, x, re.UNICODE))
852
def test_empty_array(self):
    # SF bug 1647541
    # Matching against an empty array of any typecode: a non-empty
    # pattern finds nothing, the empty pattern matches with no groups.
    import array
    typecodes = 'cbBhHiIlLfd' + ('u' if have_unicode else '')
    for typecode in typecodes:
        empty = array.array(typecode)
        self.assertIsNone(re.compile("bla").match(empty))
        self.assertEqual(re.compile("").match(empty).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000863
@requires_unicode
def test_inline_flags(self):
    # Bug #1700
    upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Below
    lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Below

    # (inline prefix, compile flags): the case-insensitive and unicode
    # flags are given as arguments, partly inline, or fully inline.
    variants = (
        ('', re.I | re.U),
        ('(?i)', re.U),
        ('(?iu)', 0),
    )
    # Each variant is tried in both case directions.
    directions = (
        (upper_char, lower_char),
        (lower_char, upper_char),
    )
    for prefix, flags in variants:
        for pattern_char, subject_char in directions:
            p = re.compile(prefix + pattern_char, flags)
            q = p.match(subject_char)
            self.assertTrue(q)
Guido van Rossumae04c332008-01-03 19:12:44 +0000893
Amaury Forgeot d'Arcd08a8eb2008-01-10 21:59:42 +0000894 def test_dollar_matches_twice(self):
895 "$ matches the end of string, and just before the terminating \n"
896 pattern = re.compile('$')
897 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
898 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
899 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
900
901 pattern = re.compile('$', re.MULTILINE)
902 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
903 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
904 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
905
def test_dealloc(self):
    # issue 3299: check for segfault in debug build
    import _sre
    # the overflow limit is different on wide and narrow builds and it
    # depends on the definition of SRE_CODE (see sre.h).
    # 2**128 should be big enough to overflow on both.  For smaller values
    # a RuntimeError is raised instead of OverflowError.
    long_overflow = 2**128
    with self.assertRaises(TypeError):
        re.finditer("a", {})
    with self.assertRaises(OverflowError):
        _sre.compile("abc", 0, [long_overflow])
Guido van Rossumae04c332008-01-03 19:12:44 +0000916
def test_compile(self):
    # Test return value when given string and pattern as parameter
    from_string = re.compile('random pattern')
    self.assertIsInstance(from_string, re._pattern_type)
    from_pattern = re.compile(from_string)
    self.assertIsInstance(from_pattern, re._pattern_type)
    # Compiling an already compiled pattern returns the same object.
    self.assertIs(from_pattern, from_string)
    # Test behaviour when not given a string or pattern as parameter
    with self.assertRaises(TypeError):
        re.compile(0)
926
def test_bug_13899(self):
    # Issue #13899: re pattern r"[\A]" should work like "A" but matches
    # nothing. Ditto B and Z.
    found = re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ')
    self.assertEqual(found, ['A', 'B', '\b', 'C', 'Z'])
932
@precisionbigmemtest(size=_2G, memuse=1)
def test_large_search(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    # '$' must be found at the very end of a string of *size* characters.
    haystack = 'a' * size
    end_match = re.search('$', haystack)
    self.assertIsNotNone(end_match)
    self.assertEqual(end_match.start(), size)
    self.assertEqual(end_match.end(), size)
Antoine Pitrou735f36e2012-12-03 20:53:12 +0100941
# The huge memuse is because of re.sub() using a list and a join()
# to create the replacement result.
@precisionbigmemtest(size=_2G, memuse=16 + 2)
def test_large_subn(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    # Replacing the empty pattern with the empty string leaves the text
    # unchanged and reports size + 1 substitutions.
    text = 'a' * size
    replaced, count = re.subn('', '', text)
    self.assertEqual(replaced, text)
    self.assertEqual(count, size + 1)
951
952
Serhiy Storchakae18e05c2013-02-16 16:47:15 +0200953 def test_repeat_minmax_overflow(self):
954 # Issue #13169
955 string = "x" * 100000
956 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
957 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
958 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
959 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
960 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
961 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
962 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
963 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
964 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
965 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
966 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
967
@cpython_only
def test_repeat_minmax_overflow_maxrepeat(self):
    # Repeat counts just below _sre.MAXREPEAT compile (and behave as
    # ordinary bounds on a 100000-character subject); MAXREPEAT itself
    # is rejected with OverflowError.
    try:
        from _sre import MAXREPEAT
    except ImportError:
        self.skipTest('requires _sre.MAXREPEAT constant')
    subject = "x" * 100000
    below_limit = MAXREPEAT - 1
    self.assertIsNone(re.match(r".{%d}" % below_limit, subject))
    self.assertEqual(re.match(r".{,%d}" % below_limit, subject).span(),
                     (0, 100000))
    self.assertIsNone(re.match(r".{%d,}?" % below_limit, subject))
    for template in (r".{%d}", r".{,%d}", r".{%d,}?"):
        self.assertRaises(OverflowError, re.compile, template % MAXREPEAT)
982
def test_backref_group_name_in_exception(self):
    # Issue 17341: Poor error message when compiling invalid regex
    # The offending group name must appear in the error message.
    self.assertRaisesRegexp(sre_constants.error, '<foo>',
                            re.compile, '(?P=<foo>)')
987
def test_group_name_in_exception(self):
    # Issue 17341: Poor error message when compiling invalid regex
    # The offending group name must appear in the error message.  The
    # expected-message regex is a raw string so that "\?" is an explicit
    # escaped question mark rather than an unknown Python string escape.
    with self.assertRaisesRegexp(sre_constants.error, r'\?foo'):
        re.compile('(?P<?foo>)')
992
def test_issue17998(self):
    # For every quantifier, greedy and lazy, '.<quantifier>yz' with
    # DOTALL must find exactly 'xyz'; the same is checked with a unicode
    # pattern and subject when unicode support is available.
    for reps in ('*', '+', '?', '{1}'):
        for mod in ('', '?'):
            pattern = '.%s%syz' % (reps, mod)
            found = re.compile(pattern, re.S).findall('xyz')
            self.assertEqual(found, ['xyz'], msg=pattern)
            if have_unicode:
                pattern = unicode(pattern)
                found = re.compile(pattern, re.S).findall(u'xyz')
                self.assertEqual(found, [u'xyz'], msg=pattern)
Serhiy Storchaka3ade66c2013-08-03 19:26:33 +03001003
Serhiy Storchakae18e05c2013-02-16 16:47:15 +02001004
Serhiy Storchaka83737c62013-08-19 23:20:07 +03001005 def test_bug_2537(self):
1006 # issue 2537: empty submatches
1007 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1008 for inner_op in ('{0,}', '*', '?'):
1009 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1010 m = r.match("xyyzy")
1011 self.assertEqual(m.group(0), "xyy")
1012 self.assertEqual(m.group(1), "")
1013 self.assertEqual(m.group(2), "y")
1014
def test_debug_flag(self):
    # Compiling with re.DEBUG prints a dump of the parsed pattern to
    # stdout; the captured text is compared with the expected dump.
    # NOTE(review): the indentation inside the expected dump is
    # reconstructed as two spaces per nesting level -- confirm against
    # the actual re.DEBUG output.
    pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
    expected_dump = '''\
subpattern 1
  literal 46
subpattern None
  branch
    in
      literal 99
      literal 104
  or
    literal 112
    literal 121
subpattern None
  groupref_exists 1
    at at_end
  else
    literal 58
    literal 32
'''
    with captured_stdout() as first_run:
        re.compile(pat, re.DEBUG)
    self.assertEqual(first_run.getvalue(), expected_dump)
    # Debug output is output again even a second time (bypassing
    # the cache -- issue #20426).
    with captured_stdout() as second_run:
        re.compile(pat, re.DEBUG)
    self.assertEqual(second_run.getvalue(), expected_dump)
Antoine Pitrouf5814112014-02-03 20:59:59 +01001043
Serhiy Storchakae50fe4c2014-03-06 12:24:29 +02001044 def test_keyword_parameters(self):
1045 # Issue #20283: Accepting the string keyword parameter.
1046 pat = re.compile(r'(ab)')
1047 self.assertEqual(
1048 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1049 self.assertEqual(
1050 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1051 self.assertEqual(
1052 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1053 self.assertEqual(
1054 pat.split(string='abracadabra', maxsplit=1),
1055 ['', 'ab', 'racadabra'])
1056
Benjamin Petersonbc4ece52014-09-30 22:04:28 -04001057 def test_match_group_takes_long(self):
1058 self.assertEqual(re.match("(foo)", "foo").group(1L), "foo")
1059 self.assertRaises(IndexError, re.match("", "").group, sys.maxint + 1)
1060
def test_locale_caching(self):
    # Issue #22410
    # Remember the current LC_CTYPE setting and restore it on cleanup.
    saved_locale = locale.setlocale(locale.LC_CTYPE)
    self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_locale)
    for needed in ('en_US.iso88591', 'en_US.utf8'):
        try:
            locale.setlocale(locale.LC_CTYPE, needed)
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs %s locale' % needed)

    # Start from an empty pattern cache, then run the two locale checks
    # in one order and, after purging again, in the opposite order.
    check_orders = (
        (self.check_en_US_iso88591, self.check_en_US_utf8),
        (self.check_en_US_utf8, self.check_en_US_iso88591),
    )
    for checks in check_orders:
        re.purge()
        for check in checks:
            check()
1078
def check_en_US_iso88591(self):
    """Switch LC_CTYPE to en_US.iso88591 and check locale-aware matching.

    Under this locale the bytes 0xc5 and 0xe5 must match each other
    case-insensitively, both with flags passed as arguments and with the
    inline (?Li) form.
    """
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    pairs = (
        (b'\xc5\xe5', b'\xc5\xe5'),
        (b'\xc5', b'\xe5'),
        (b'\xe5', b'\xc5'),
    )
    for pattern, subject in pairs:
        self.assertTrue(re.match(pattern, subject, re.L|re.I))
    for pattern, subject in pairs:
        self.assertTrue(re.match(b'(?Li)' + pattern, subject))
1087
def check_en_US_utf8(self):
    """Switch LC_CTYPE to en_US.utf8 and check locale-aware matching.

    Under this locale the bytes 0xc5 and 0xe5 still match themselves but
    must not match each other case-insensitively, both with flags passed
    as arguments and with the inline (?Li) form.
    """
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    cases = (
        (b'\xc5\xe5', b'\xc5\xe5', self.assertTrue),
        (b'\xc5', b'\xe5', self.assertIsNone),
        (b'\xe5', b'\xc5', self.assertIsNone),
    )
    for pattern, subject, check in cases:
        check(re.match(pattern, subject, re.L|re.I))
    for pattern, subject, check in cases:
        check(re.match(b'(?Li)' + pattern, subject))
1096
Antoine Pitrouf5814112014-02-03 20:59:59 +01001097
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001098def run_re_tests():
Georg Brandla4f46e12010-02-07 17:03:15 +00001099 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001100 if verbose:
1101 print 'Running re_tests test suite'
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001102 else:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001103 # To save time, only run the first and last 10 tests
1104 #tests = tests[:10] + tests[-10:]
1105 pass
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001106
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001107 for t in tests:
1108 sys.stdout.flush()
1109 pattern = s = outcome = repl = expected = None
1110 if len(t) == 5:
1111 pattern, s, outcome, repl, expected = t
1112 elif len(t) == 3:
1113 pattern, s, outcome = t
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001114 else:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001115 raise ValueError, ('Test tuples should have 3 or 5 fields', t)
1116
Guido van Rossum41360a41998-03-26 19:42:58 +00001117 try:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001118 obj = re.compile(pattern)
1119 except re.error:
1120 if outcome == SYNTAX_ERROR: pass # Expected a syntax error
Guido van Rossum41360a41998-03-26 19:42:58 +00001121 else:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001122 print '=== Syntax error:', t
1123 except KeyboardInterrupt: raise KeyboardInterrupt
1124 except:
1125 print '*** Unexpected error ***', t
1126 if verbose:
1127 traceback.print_exc(file=sys.stdout)
1128 else:
Fredrik Lundh17741be2001-03-22 15:51:28 +00001129 try:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001130 result = obj.search(s)
1131 except re.error, msg:
1132 print '=== Unexpected exception', t, repr(msg)
1133 if outcome == SYNTAX_ERROR:
1134 # This should have been a syntax error; forget it.
1135 pass
1136 elif outcome == FAIL:
1137 if result is None: pass # No match, as expected
1138 else: print '=== Succeeded incorrectly', t
1139 elif outcome == SUCCEED:
1140 if result is not None:
1141 # Matched, as expected, so now we compute the
1142 # result string and compare it to our expected result.
1143 start, end = result.span(0)
1144 vardict={'found': result.group(0),
1145 'groups': result.group(),
1146 'flags': result.re.flags}
1147 for i in range(1, 100):
1148 try:
1149 gi = result.group(i)
1150 # Special hack because else the string concat fails:
1151 if gi is None:
1152 gi = "None"
1153 except IndexError:
1154 gi = "Error"
1155 vardict['g%d' % i] = gi
1156 for i in result.re.groupindex.keys():
1157 try:
1158 gi = result.group(i)
1159 if gi is None:
1160 gi = "None"
1161 except IndexError:
1162 gi = "Error"
1163 vardict[i] = gi
1164 repl = eval(repl, vardict)
1165 if repl != expected:
1166 print '=== grouping error', t,
1167 print repr(repl) + ' should be ' + repr(expected)
1168 else:
1169 print '=== Failed incorrectly', t
1170
1171 # Try the match on a unicode string, and check that it
1172 # still succeeds.
1173 try:
1174 result = obj.search(unicode(s, "latin-1"))
1175 if result is None:
1176 print '=== Fails on unicode match', t
1177 except NameError:
1178 continue # 1.5.2
1179 except TypeError:
1180 continue # unicode test case
1181
1182 # Try the match on a unicode pattern, and check that it
1183 # still succeeds.
1184 obj=re.compile(unicode(pattern, "latin-1"))
1185 result = obj.search(s)
Fredrik Lundh17741be2001-03-22 15:51:28 +00001186 if result is None:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001187 print '=== Fails on unicode pattern match', t
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001188
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001189 # Try the match with the search area limited to the extent
1190 # of the match and see if it still succeeds. \B will
1191 # break (because it won't match at the end or start of a
1192 # string), so we'll ignore patterns that feature it.
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001193
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001194 if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \
1195 and result is not None:
1196 obj = re.compile(pattern)
1197 result = obj.search(s, result.start(0), result.end(0) + 1)
1198 if result is None:
1199 print '=== Failed on range-limited match', t
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001200
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001201 # Try the match with IGNORECASE enabled, and check that it
1202 # still succeeds.
1203 obj = re.compile(pattern, re.IGNORECASE)
1204 result = obj.search(s)
Fred Drake132dce22000-12-12 23:11:42 +00001205 if result is None:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001206 print '=== Fails on case-insensitive match', t
Guido van Rossumdfa67901997-12-08 17:12:06 +00001207
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001208 # Try the match with LOCALE enabled, and check that it
1209 # still succeeds.
1210 obj = re.compile(pattern, re.LOCALE)
1211 result = obj.search(s)
1212 if result is None:
1213 print '=== Fails on locale-sensitive match', t
Guido van Rossumdfa67901997-12-08 17:12:06 +00001214
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001215 # Try the match with UNICODE locale enabled, and check
1216 # that it still succeeds.
1217 obj = re.compile(pattern, re.UNICODE)
1218 result = obj.search(s)
1219 if result is None:
1220 print '=== Fails on unicode-sensitive match', t
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001221
def test_main():
    """Run the ReTests unittest class, then the table-driven re_tests."""
    run_unittest(ReTests)
    run_re_tests()
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001225
# Allow the test module to be run directly as a script.
if __name__ == "__main__":
    test_main()