blob: 9b0aa75c130b7adb31c1a0cd623fbda6fb582206 [file] [log] [blame]
Benjamin Petersone48944b2012-03-07 14:50:25 -06001from test.support import verbose, run_unittest, gc_collect
2import io
Guido van Rossum8e0ce301997-07-11 19:34:44 +00003import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00004from re import Scanner
Ezio Melottid2114eb2011-03-25 14:08:44 +02005import sys
6import string
7import traceback
Raymond Hettinger027bb632004-05-31 03:09:25 +00008from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +00009
Guido van Rossum23b22571997-07-17 22:36:14 +000010# Misc tests from Tim Peters' re.doc
11
Just van Rossum6802c6e2003-07-02 14:36:59 +000012# WARNING: Don't change details in these tests if you don't know
Ezio Melotti42da6632011-03-15 05:18:48 +020013# what you're doing. Some of these tests were carefully modeled to
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +000014# cover most of the code.
15
Skip Montanaro8ed06da2003-04-24 19:43:18 +000016import unittest
Guido van Rossum8430c581998-04-03 21:47:12 +000017
Skip Montanaro8ed06da2003-04-24 19:43:18 +000018class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000019
Benjamin Petersone48944b2012-03-07 14:50:25 -060020 def test_keep_buffer(self):
21 # See bug 14212
22 b = bytearray(b'x')
23 it = re.finditer(b'a', b)
24 with self.assertRaises(BufferError):
25 b.extend(b'x'*400)
26 list(it)
27 del it
28 gc_collect()
29 b.extend(b'x'*400)
30
Raymond Hettinger027bb632004-05-31 03:09:25 +000031 def test_weakref(self):
32 s = 'QabbbcR'
33 x = re.compile('ab+c')
34 y = proxy(x)
35 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
36
Skip Montanaro8ed06da2003-04-24 19:43:18 +000037 def test_search_star_plus(self):
38 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
39 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
40 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
41 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Skip Montanaro5ba00542003-04-25 16:00:14 +000042 self.assertEqual(re.search('x', 'aaa'), None)
Skip Montanaro8ed06da2003-04-24 19:43:18 +000043 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
44 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
45 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
46 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Skip Montanaro5ba00542003-04-25 16:00:14 +000047 self.assertEqual(re.match('a+', 'xxx'), None)
Guido van Rossum8430c581998-04-03 21:47:12 +000048
def bump_num(self, matchobj):
    # Replacement callback: read the whole match as a decimal integer and
    # return that integer plus one, as a string.
    return str(int(matchobj.group(0)) + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000052
Skip Montanaro8ed06da2003-04-24 19:43:18 +000053 def test_basic_re_sub(self):
54 self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
55 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
56 '9.3 -3 24x100y')
57 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
58 '9.3 -3 23x99y')
Fredrik Lundh1151a8c2000-08-08 16:47:42 +000059
Skip Montanaro8ed06da2003-04-24 19:43:18 +000060 self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
61 self.assertEqual(re.sub('.', r"\n", 'x'), '\n')
Guido van Rossumdfa67901997-12-08 17:12:06 +000062
Skip Montanaro8ed06da2003-04-24 19:43:18 +000063 s = r"\1\1"
64 self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
65 self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
66 self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)
Guido van Rossum23b22571997-07-17 22:36:14 +000067
Skip Montanaro8ed06da2003-04-24 19:43:18 +000068 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
69 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
70 self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
71 self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')
Guido van Rossum49946571997-07-18 04:26:25 +000072
Skip Montanaro8ed06da2003-04-24 19:43:18 +000073 self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
74 '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
75 self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
76 self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
77 (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))
Guido van Rossum95e80531997-08-13 22:34:14 +000078
Skip Montanaro8ed06da2003-04-24 19:43:18 +000079 self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +000080
Skip Montanaro2726fcd2003-04-25 14:31:54 +000081 def test_bug_449964(self):
82 # fails for group followed by other escape
83 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
84 'xx\bxx\b')
85
86 def test_bug_449000(self):
87 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +000088 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
89 'abc\ndef\n')
90 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
91 'abc\ndef\n')
92 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
93 'abc\ndef\n')
94 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
95 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +000096
Christian Heimes5fb7c2a2007-12-24 08:52:31 +000097 def test_bug_1661(self):
98 # Verify that flags do not get silently ignored with compiled patterns
99 pattern = re.compile('.')
100 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
101 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
102 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
103 self.assertRaises(ValueError, re.compile, pattern, re.I)
104
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000105 def test_bug_3629(self):
106 # A regex that triggered a bug in the sre-code validator
107 re.compile("(?P<quote>)(?(quote))")
108
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000109 def test_sub_template_numeric_escape(self):
110 # bug 776311 and friends
111 self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
112 self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
113 self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
114 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
115 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
116 self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
117 self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
118
119 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
120 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
121
122 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
123 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
124 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
125 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
126 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
127
128 self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
129 self.assertEqual(re.sub('x', r'\777', 'x'), '\377')
Tim Peters0e9980f2004-09-12 03:49:31 +0000130
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000131 self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
132 self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
133 self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
134 self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
135 self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
136 self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
137 self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
138 self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
139 self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
140 self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
141 self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
142 self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
143
144 # in python2.3 (etc), these loop endlessly in sre_parser.py
145 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
146 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
147 'xz8')
148 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
149 'xza')
150
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000151 def test_qualified_re_sub(self):
152 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
153 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000154
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000155 def test_bug_114660(self):
156 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
157 'hello there')
158
159 def test_bug_462270(self):
160 # Test for empty sub() behaviour, see SF bug #462270
161 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
162 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
163
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000164 def test_symbolic_refs(self):
165 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
166 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
167 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
168 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
169 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
170 self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
171 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
172 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000173 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000174
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000175 def test_re_subn(self):
176 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
177 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
178 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
179 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
180 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000181
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000182 def test_re_split(self):
183 self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
184 self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
185 self.assertEqual(re.split("(:*)", ":a:b::c"),
186 ['', ':', 'a', ':', 'b', '::', 'c'])
187 self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
188 self.assertEqual(re.split("(:)*", ":a:b::c"),
189 ['', ':', 'a', ':', 'b', ':', 'c'])
190 self.assertEqual(re.split("([b:]+)", ":a:b::c"),
191 ['', ':', 'a', ':b::', 'c'])
192 self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
193 ['', None, ':', 'a', None, ':', '', 'b', None, '',
194 None, '::', 'c'])
195 self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
196 ['', 'a', '', '', 'c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000197
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000198 def test_qualified_re_split(self):
199 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
200 self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
201 self.assertEqual(re.split("(:)", ":a:b::c", 2),
202 ['', ':', 'a', ':', 'b::c'])
203 self.assertEqual(re.split("(:*)", ":a:b::c", 2),
204 ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000205
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000206 def test_re_findall(self):
207 self.assertEqual(re.findall(":+", "abc"), [])
208 self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
209 self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
210 self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""),
211 (":", ":"),
212 (":", "::")])
Guido van Rossum49946571997-07-18 04:26:25 +0000213
Skip Montanaro5ba00542003-04-25 16:00:14 +0000214 def test_bug_117612(self):
215 self.assertEqual(re.findall(r"(a|(b))", "aba"),
216 [("a", ""),("b", "b"),("a", "")])
217
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000218 def test_re_match(self):
Skip Montanaro5ba00542003-04-25 16:00:14 +0000219 self.assertEqual(re.match('a', 'a').groups(), ())
220 self.assertEqual(re.match('(a)', 'a').groups(), ('a',))
221 self.assertEqual(re.match(r'(a)', 'a').group(0), 'a')
222 self.assertEqual(re.match(r'(a)', 'a').group(1), 'a')
223 self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a'))
Guido van Rossum49946571997-07-18 04:26:25 +0000224
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000225 pat = re.compile('((a)|(b))(c)?')
226 self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
227 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
228 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
229 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
230 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
Guido van Rossum8430c581998-04-03 21:47:12 +0000231
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000232 # A single group
233 m = re.match('(a)', 'a')
234 self.assertEqual(m.group(0), 'a')
235 self.assertEqual(m.group(0), 'a')
236 self.assertEqual(m.group(1), 'a')
237 self.assertEqual(m.group(1, 1), ('a', 'a'))
Guido van Rossum49946571997-07-18 04:26:25 +0000238
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000239 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
240 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
241 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
242 (None, 'b', None))
243 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000244
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000245 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000246 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
247 ('(', 'a'))
248 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
249 (None, 'a'))
250 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'), None)
251 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a'), None)
252 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
253 ('a', 'b'))
254 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
255 (None, 'd'))
256 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
257 (None, 'd'))
258 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
259 ('a', ''))
260
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000261 # Tests for bug #1177831: exercise groups other than the first group
262 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
263 self.assertEqual(p.match('abc').groups(),
264 ('a', 'b', 'c'))
265 self.assertEqual(p.match('ad').groups(),
266 ('a', None, 'd'))
267 self.assertEqual(p.match('abd'), None)
268 self.assertEqual(p.match('ac'), None)
269
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000270
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000271 def test_re_groupref(self):
272 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
273 ('|', 'a'))
274 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
275 (None, 'a'))
276 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', 'a|'), None)
277 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a'), None)
278 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
279 ('a', 'a'))
280 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
281 (None, None))
282
283 def test_groupdict(self):
284 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
285 'first second').groupdict(),
286 {'first':'first', 'second':'second'})
287
288 def test_expand(self):
289 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
290 "first second")
291 .expand(r"\2 \1 \g<second> \g<first>"),
292 "second first second first")
293
294 def test_repeat_minmax(self):
295 self.assertEqual(re.match("^(\w){1}$", "abc"), None)
296 self.assertEqual(re.match("^(\w){1}?$", "abc"), None)
297 self.assertEqual(re.match("^(\w){1,2}$", "abc"), None)
298 self.assertEqual(re.match("^(\w){1,2}?$", "abc"), None)
299
300 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
301 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
302 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
303 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
304 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
305 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
306 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
307 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
308
309 self.assertEqual(re.match("^x{1}$", "xxx"), None)
310 self.assertEqual(re.match("^x{1}?$", "xxx"), None)
311 self.assertEqual(re.match("^x{1,2}$", "xxx"), None)
312 self.assertEqual(re.match("^x{1,2}?$", "xxx"), None)
313
314 self.assertNotEqual(re.match("^x{3}$", "xxx"), None)
315 self.assertNotEqual(re.match("^x{1,3}$", "xxx"), None)
316 self.assertNotEqual(re.match("^x{1,4}$", "xxx"), None)
317 self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None)
318 self.assertNotEqual(re.match("^x{3}?$", "xxx"), None)
319 self.assertNotEqual(re.match("^x{1,3}?$", "xxx"), None)
320 self.assertNotEqual(re.match("^x{1,4}?$", "xxx"), None)
321 self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None)
322
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000323 self.assertEqual(re.match("^x{}$", "xxx"), None)
324 self.assertNotEqual(re.match("^x{}$", "x{}"), None)
325
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000326 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000327 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000328 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000329 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
330 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
331 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
332 {'first': 1, 'other': 2})
333
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000334 self.assertEqual(re.match("(a)", "a").pos, 0)
335 self.assertEqual(re.match("(a)", "a").endpos, 1)
336 self.assertEqual(re.match("(a)", "a").string, "a")
337 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
338 self.assertNotEqual(re.match("(a)", "a").re, None)
339
340 def test_special_escapes(self):
341 self.assertEqual(re.search(r"\b(b.)\b",
342 "abcd abc bcd bx").group(1), "bx")
343 self.assertEqual(re.search(r"\B(b.)\B",
344 "abc bcd bc abxd").group(1), "bx")
345 self.assertEqual(re.search(r"\b(b.)\b",
346 "abcd abc bcd bx", re.LOCALE).group(1), "bx")
347 self.assertEqual(re.search(r"\B(b.)\B",
348 "abc bcd bc abxd", re.LOCALE).group(1), "bx")
349 self.assertEqual(re.search(r"\b(b.)\b",
350 "abcd abc bcd bx", re.UNICODE).group(1), "bx")
351 self.assertEqual(re.search(r"\B(b.)\B",
352 "abc bcd bc abxd", re.UNICODE).group(1), "bx")
353 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
354 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
355 self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None)
356 self.assertEqual(re.search(r"\b(b.)\b",
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000357 "abcd abc bcd bx").group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000358 self.assertEqual(re.search(r"\B(b.)\B",
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000359 "abc bcd bc abxd").group(1), "bx")
360 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
361 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
362 self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000363 self.assertEqual(re.search(r"\d\D\w\W\s\S",
364 "1aa! a").group(0), "1aa! a")
365 self.assertEqual(re.search(r"\d\D\w\W\s\S",
366 "1aa! a", re.LOCALE).group(0), "1aa! a")
367 self.assertEqual(re.search(r"\d\D\w\W\s\S",
368 "1aa! a", re.UNICODE).group(0), "1aa! a")
369
Ezio Melotti5a045b92012-02-29 11:48:44 +0200370 def test_string_boundaries(self):
371 # See http://bugs.python.org/issue10713
372 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
373 "abc")
374 # There's a word boundary at the start of a string.
375 self.assertTrue(re.match(r"\b", "abc"))
376 # A non-empty string includes a non-boundary zero-length match.
377 self.assertTrue(re.search(r"\B", "abc"))
378 # There is no non-boundary match at the start of a string.
379 self.assertFalse(re.match(r"\B", "abc"))
380 # However, an empty string contains no word boundaries, and also no
381 # non-boundaries.
382 self.assertEqual(re.search(r"\B", ""), None)
383 # This one is questionable and different from the perlre behaviour,
384 # but describes current behavior.
385 self.assertEqual(re.search(r"\b", ""), None)
386 # A single word-character string has two boundaries, but no
387 # non-boundary gaps.
388 self.assertEqual(len(re.findall(r"\b", "a")), 2)
389 self.assertEqual(len(re.findall(r"\B", "a")), 0)
390 # If there are no words, there are no boundaries
391 self.assertEqual(len(re.findall(r"\b", " ")), 0)
392 self.assertEqual(len(re.findall(r"\b", " ")), 0)
393 # Can match around the whitespace.
394 self.assertEqual(len(re.findall(r"\B", " ")), 2)
395
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000396 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000397 self.assertEqual(re.match("([\u2222\u2223])",
398 "\u2222").group(1), "\u2222")
399 self.assertEqual(re.match("([\u2222\u2223])",
400 "\u2222", re.UNICODE).group(1), "\u2222")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000401
402 def test_anyall(self):
403 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
404 "a\nb")
405 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
406 "a\n\nb")
407
408 def test_non_consuming(self):
409 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
410 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
411 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
412 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
413 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
414 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
415 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
416
417 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
418 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
419 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
420 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
421
422 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000423 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
424 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000425 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
426 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
427 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
428 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
429 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
430 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
431 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
432 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
433
434 def test_category(self):
435 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
436
437 def test_getlower(self):
438 import _sre
439 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
440 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
441 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
442
443 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000444 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000445
446 def test_not_literal(self):
447 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
448 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
449
450 def test_search_coverage(self):
451 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
452 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
453
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    # Assert that matcher(pattern, text) succeeds and that the match has the
    # expected text and span.  With neither `match` nor `span` given, the
    # pattern must match the whole text; giving only one of the two raises
    # ValueError.
    if (match is None) != (span is None):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if match is None:
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    found = matcher(pattern, text)
    self.assertTrue(found)
    self.assertEqual(found.group(), match)
    self.assertEqual(found.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000467
Ezio Melottid2114eb2011-03-25 14:08:44 +0200468 def test_re_escape(self):
Ezio Melotti88fdeb42011-04-10 12:59:16 +0300469 alnum_chars = string.ascii_letters + string.digits + '_'
Ezio Melottid2114eb2011-03-25 14:08:44 +0200470 p = ''.join(chr(i) for i in range(256))
471 for c in p:
472 if c in alnum_chars:
473 self.assertEqual(re.escape(c), c)
474 elif c == '\x00':
475 self.assertEqual(re.escape(c), '\\000')
476 else:
477 self.assertEqual(re.escape(c), '\\' + c)
478 self.assertMatch(re.escape(c), c)
479 self.assertMatch(re.escape(p), p)
Guido van Rossum49946571997-07-18 04:26:25 +0000480
Guido van Rossum698280d2008-09-10 17:44:35 +0000481 def test_re_escape_byte(self):
Ezio Melotti88fdeb42011-04-10 12:59:16 +0300482 alnum_chars = (string.ascii_letters + string.digits + '_').encode('ascii')
Ezio Melottid2114eb2011-03-25 14:08:44 +0200483 p = bytes(range(256))
484 for i in p:
Guido van Rossum698280d2008-09-10 17:44:35 +0000485 b = bytes([i])
Ezio Melottid2114eb2011-03-25 14:08:44 +0200486 if b in alnum_chars:
487 self.assertEqual(re.escape(b), b)
488 elif i == 0:
489 self.assertEqual(re.escape(b), b'\\000')
490 else:
491 self.assertEqual(re.escape(b), b'\\' + b)
492 self.assertMatch(re.escape(b), b)
493 self.assertMatch(re.escape(p), p)
Guido van Rossum698280d2008-09-10 17:44:35 +0000494
Ezio Melotti7b9e97b2011-03-25 14:09:33 +0200495 def test_re_escape_non_ascii(self):
496 s = 'xxx\u2620\u2620\u2620xxx'
497 s_escaped = re.escape(s)
498 self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
499 self.assertMatch(s_escaped, s)
500 self.assertMatch('.%s+.' % re.escape('\u2620'), s,
501 'x\u2620\u2620\u2620x', (2, 7), re.search)
502
503 def test_re_escape_non_ascii_bytes(self):
504 b = 'y\u2620y\u2620y'.encode('utf-8')
505 b_escaped = re.escape(b)
506 self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
507 self.assertMatch(b_escaped, b)
508 res = re.findall(re.escape('\u2620'.encode('utf-8')), b)
509 self.assertEqual(len(res), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000510
def pickle_test(self, pickle):
    # Round-trip a compiled pattern through the dumps()/loads() functions
    # of the given pickle module and require the result to compare equal.
    original = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)')
    restored = pickle.loads(pickle.dumps(original))
    self.assertEqual(original, restored)
Guido van Rossum23b22571997-07-17 22:36:14 +0000516
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000517 def test_constants(self):
518 self.assertEqual(re.I, re.IGNORECASE)
519 self.assertEqual(re.L, re.LOCALE)
520 self.assertEqual(re.M, re.MULTILINE)
521 self.assertEqual(re.S, re.DOTALL)
522 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000523
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000524 def test_flags(self):
Skip Montanaro1e703c62003-04-25 15:40:28 +0000525 for flag in [re.I, re.M, re.X, re.S, re.L]:
526 self.assertNotEqual(re.compile('^pattern$', flag), None)
Guido van Rossumf473cb01998-01-14 16:42:17 +0000527
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000528 def test_sre_character_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200529 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
530 if i < 256:
531 self.assertIsNotNone(re.match(r"\%03o" % i, chr(i)))
532 self.assertIsNotNone(re.match(r"\%03o0" % i, chr(i)+"0"))
533 self.assertIsNotNone(re.match(r"\%03o8" % i, chr(i)+"8"))
534 self.assertIsNotNone(re.match(r"\x%02x" % i, chr(i)))
535 self.assertIsNotNone(re.match(r"\x%02x0" % i, chr(i)+"0"))
536 self.assertIsNotNone(re.match(r"\x%02xz" % i, chr(i)+"z"))
537 if i < 0x10000:
538 self.assertIsNotNone(re.match(r"\u%04x" % i, chr(i)))
539 self.assertIsNotNone(re.match(r"\u%04x0" % i, chr(i)+"0"))
540 self.assertIsNotNone(re.match(r"\u%04xz" % i, chr(i)+"z"))
541 self.assertIsNotNone(re.match(r"\U%08x" % i, chr(i)))
542 self.assertIsNotNone(re.match(r"\U%08x0" % i, chr(i)+"0"))
543 self.assertIsNotNone(re.match(r"\U%08xz" % i, chr(i)+"z"))
544 self.assertIsNotNone(re.match(r"\0", "\000"))
545 self.assertIsNotNone(re.match(r"\08", "\0008"))
546 self.assertIsNotNone(re.match(r"\01", "\001"))
547 self.assertIsNotNone(re.match(r"\018", "\0018"))
548 self.assertIsNotNone(re.match(r"\567", chr(0o167)))
549 self.assertRaises(re.error, re.match, r"\911", "")
550 self.assertRaises(re.error, re.match, r"\x1", "")
551 self.assertRaises(re.error, re.match, r"\x1z", "")
552 self.assertRaises(re.error, re.match, r"\u123", "")
553 self.assertRaises(re.error, re.match, r"\u123z", "")
554 self.assertRaises(re.error, re.match, r"\U0001234", "")
555 self.assertRaises(re.error, re.match, r"\U0001234z", "")
556 self.assertRaises(re.error, re.match, r"\U00110000", "")
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000557
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000558 def test_sre_character_class_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200559 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
560 if i < 256:
561 self.assertIsNotNone(re.match(r"[\%o]" % i, chr(i)))
562 self.assertIsNotNone(re.match(r"[\%o8]" % i, chr(i)))
563 self.assertIsNotNone(re.match(r"[\%03o]" % i, chr(i)))
564 self.assertIsNotNone(re.match(r"[\%03o0]" % i, chr(i)))
565 self.assertIsNotNone(re.match(r"[\%03o8]" % i, chr(i)))
566 self.assertIsNotNone(re.match(r"[\x%02x]" % i, chr(i)))
567 self.assertIsNotNone(re.match(r"[\x%02x0]" % i, chr(i)))
568 self.assertIsNotNone(re.match(r"[\x%02xz]" % i, chr(i)))
569 if i < 0x10000:
570 self.assertIsNotNone(re.match(r"[\u%04x]" % i, chr(i)))
571 self.assertIsNotNone(re.match(r"[\u%04x0]" % i, chr(i)))
572 self.assertIsNotNone(re.match(r"[\u%04xz]" % i, chr(i)))
573 self.assertIsNotNone(re.match(r"[\U%08x]" % i, chr(i)))
574 self.assertIsNotNone(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
575 self.assertIsNotNone(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
576 self.assertRaises(re.error, re.match, r"[\911]", "")
577 self.assertRaises(re.error, re.match, r"[\x1z]", "")
578 self.assertRaises(re.error, re.match, r"[\u123z]", "")
579 self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
580 self.assertRaises(re.error, re.match, r"[\U00110000]", "")
581
582 def test_sre_byte_literals(self):
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000583 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Antoine Pitrou463badf2012-06-23 13:29:19 +0200584 self.assertIsNotNone(re.match((r"\%03o" % i).encode(), bytes([i])))
585 self.assertIsNotNone(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
586 self.assertIsNotNone(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
587 self.assertIsNotNone(re.match((r"\x%02x" % i).encode(), bytes([i])))
588 self.assertIsNotNone(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
589 self.assertIsNotNone(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
590 self.assertIsNotNone(re.match(br"\u", b'u'))
591 self.assertIsNotNone(re.match(br"\U", b'U'))
592 self.assertIsNotNone(re.match(br"\0", b"\000"))
593 self.assertIsNotNone(re.match(br"\08", b"\0008"))
594 self.assertIsNotNone(re.match(br"\01", b"\001"))
595 self.assertIsNotNone(re.match(br"\018", b"\0018"))
596 self.assertIsNotNone(re.match(br"\567", bytes([0o167])))
597 self.assertRaises(re.error, re.match, br"\911", b"")
598 self.assertRaises(re.error, re.match, br"\x1", b"")
599 self.assertRaises(re.error, re.match, br"\x1z", b"")
600
601 def test_sre_byte_class_literals(self):
602 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
603 self.assertIsNotNone(re.match((r"[\%o]" % i).encode(), bytes([i])))
604 self.assertIsNotNone(re.match((r"[\%o8]" % i).encode(), bytes([i])))
605 self.assertIsNotNone(re.match((r"[\%03o]" % i).encode(), bytes([i])))
606 self.assertIsNotNone(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
607 self.assertIsNotNone(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
608 self.assertIsNotNone(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
609 self.assertIsNotNone(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
610 self.assertIsNotNone(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
611 self.assertIsNotNone(re.match(br"[\u]", b'u'))
612 self.assertIsNotNone(re.match(br"[\U]", b'U'))
613 self.assertRaises(re.error, re.match, br"[\911]", "")
614 self.assertRaises(re.error, re.match, br"[\x1z]", "")
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000615
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000616 def test_bug_113254(self):
617 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
618 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
619 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
620
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000621 def test_bug_527371(self):
622 # bug described in patches 527371/672491
623 self.assertEqual(re.match(r'(a)?a','a').lastindex, None)
624 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
625 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
626 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
627 self.assertEqual(re.match("((a))", "a").lastindex, 1)
628
629 def test_bug_545855(self):
630 # bug 545855 -- This pattern failed to cause a compile error as it
631 # should, instead provoking a TypeError.
632 self.assertRaises(re.error, re.compile, 'foo[a-')
633
634 def test_bug_418626(self):
635 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
636 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
637 # pattern '*?' on a long string.
638 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
639 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
640 20003)
641 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000642 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +0000643 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000644 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000645
646 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000647 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000648 self.assertEqual(re.compile(pat) and 1, 1)
649
Skip Montanaro1e703c62003-04-25 15:40:28 +0000650 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000651 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +0000652 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000653 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
654 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
655 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +0000656
657 def test_scanner(self):
658 def s_ident(scanner, token): return token
659 def s_operator(scanner, token): return "op%s" % token
660 def s_float(scanner, token): return float(token)
661 def s_int(scanner, token): return int(token)
662
663 scanner = Scanner([
664 (r"[a-zA-Z_]\w*", s_ident),
665 (r"\d+\.\d*", s_float),
666 (r"\d+", s_int),
667 (r"=|\+|-|\*|/", s_operator),
668 (r"\s+", None),
669 ])
670
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000671 self.assertNotEqual(scanner.scanner.scanner("").pattern, None)
672
Skip Montanaro1e703c62003-04-25 15:40:28 +0000673 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
674 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
675 'op+', 'bar'], ''))
676
Skip Montanaro5ba00542003-04-25 16:00:14 +0000677 def test_bug_448951(self):
678 # bug 448951 (similar to 429357, but with single char match)
679 # (Also test greedy matches.)
680 for op in '','?','*':
681 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
682 (None, None))
683 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
684 ('a:', 'a'))
685
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000686 def test_bug_725106(self):
687 # capturing groups in alternatives in repeats
688 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
689 ('b', 'a'))
690 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
691 ('c', 'b'))
692 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
693 ('b', None))
694 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
695 ('b', None))
696 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
697 ('b', 'a'))
698 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
699 ('c', 'b'))
700 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
701 ('b', None))
702 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
703 ('b', None))
704
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000705 def test_bug_725149(self):
706 # mark_stack_base restoring before restoring marks
707 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
708 ('a', None))
709 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
710 ('a', None, None))
711
Just van Rossum12723ba2003-07-02 20:03:04 +0000712 def test_bug_764548(self):
713 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000714 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +0000715 pat = re.compile(my_unicode("abc"))
716 self.assertEqual(pat.match("xyz"), None)
717
Skip Montanaro5ba00542003-04-25 16:00:14 +0000718 def test_finditer(self):
719 iter = re.finditer(r":+", "a:b::c:::d")
720 self.assertEqual([item.group(0) for item in iter],
721 [":", "::", ":::"])
722
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600723 pat = re.compile(r":+")
724 iter = pat.finditer("a:b::c:::d", 1, 10)
725 self.assertEqual([item.group(0) for item in iter],
726 [":", "::", ":::"])
727
728 pat = re.compile(r":+")
729 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
730 self.assertEqual([item.group(0) for item in iter],
731 [":", "::", ":::"])
732
733 pat = re.compile(r":+")
734 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
735 self.assertEqual([item.group(0) for item in iter],
736 [":", "::", ":::"])
737
738 pat = re.compile(r":+")
739 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
740 self.assertEqual([item.group(0) for item in iter],
741 ["::", "::"])
742
Thomas Wouters40a088d2008-03-18 20:19:54 +0000743 def test_bug_926075(self):
Benjamin Petersonc9c0f202009-06-30 23:06:06 +0000744 self.assertTrue(re.compile('bug_926075') is not
Thomas Wouters40a088d2008-03-18 20:19:54 +0000745 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +0000746
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000747 def test_bug_931848(self):
Guido van Rossum7ebb9702007-05-15 21:39:58 +0000748 pattern = eval('"[\u002E\u3002\uFF0E\uFF61]"')
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000749 self.assertEqual(re.compile(pattern).split("a.b.c"),
750 ['a','b','c'])
751
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000752 def test_bug_581080(self):
753 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +0000754 self.assertEqual(next(iter).span(), (1,2))
755 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000756
757 scanner = re.compile(r"\s").scanner("a b")
758 self.assertEqual(scanner.search().span(), (1, 2))
759 self.assertEqual(scanner.search(), None)
760
761 def test_bug_817234(self):
762 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +0000763 self.assertEqual(next(iter).span(), (0, 4))
764 self.assertEqual(next(iter).span(), (4, 4))
765 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000766
Mark Dickinson1f268282009-07-28 17:22:36 +0000767 def test_bug_6561(self):
768 # '\d' should match characters in Unicode category 'Nd'
769 # (Number, Decimal Digit), but not those in 'Nl' (Number,
770 # Letter) or 'No' (Number, Other).
771 decimal_digits = [
772 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
773 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
774 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
775 ]
776 for x in decimal_digits:
777 self.assertEqual(re.match('^\d$', x).group(0), x)
778
779 not_decimal_digits = [
780 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
781 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
782 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
783 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
784 ]
785 for x in not_decimal_digits:
786 self.assertIsNone(re.match('^\d$', x))
787
Guido van Rossumd8faa362007-04-27 19:54:29 +0000788 def test_empty_array(self):
789 # SF buf 1647541
790 import array
Guido van Rossum166746c2007-07-03 15:39:16 +0000791 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +0000792 a = array.array(typecode)
Antoine Pitroufd036452008-08-19 17:56:33 +0000793 self.assertEqual(re.compile(b"bla").match(a), None)
794 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000795
Christian Heimes072c0f12008-01-03 23:01:04 +0000796 def test_inline_flags(self):
797 # Bug #1700
Christian Heimes2e1d0f02008-01-04 00:47:51 +0000798 upper_char = chr(0x1ea0) # Latin Capital Letter A with Dot Bellow
799 lower_char = chr(0x1ea1) # Latin Small Letter A with Dot Bellow
Christian Heimes072c0f12008-01-03 23:01:04 +0000800
801 p = re.compile(upper_char, re.I | re.U)
802 q = p.match(lower_char)
803 self.assertNotEqual(q, None)
804
805 p = re.compile(lower_char, re.I | re.U)
806 q = p.match(upper_char)
807 self.assertNotEqual(q, None)
808
809 p = re.compile('(?i)' + upper_char, re.U)
810 q = p.match(lower_char)
811 self.assertNotEqual(q, None)
812
813 p = re.compile('(?i)' + lower_char, re.U)
814 q = p.match(upper_char)
815 self.assertNotEqual(q, None)
816
817 p = re.compile('(?iu)' + upper_char)
818 q = p.match(lower_char)
819 self.assertNotEqual(q, None)
820
821 p = re.compile('(?iu)' + lower_char)
822 q = p.match(upper_char)
823 self.assertNotEqual(q, None)
824
Christian Heimes25bb7832008-01-11 16:17:00 +0000825 def test_dollar_matches_twice(self):
826 "$ matches the end of string, and just before the terminating \n"
827 pattern = re.compile('$')
828 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
829 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
830 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
831
832 pattern = re.compile('$', re.MULTILINE)
833 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
834 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
835 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
836
Antoine Pitroufd036452008-08-19 17:56:33 +0000837 def test_bytes_str_mixing(self):
838 # Mixing str and bytes is disallowed
839 pat = re.compile('.')
840 bpat = re.compile(b'.')
841 self.assertRaises(TypeError, pat.match, b'b')
842 self.assertRaises(TypeError, bpat.match, 'b')
843 self.assertRaises(TypeError, pat.sub, b'b', 'c')
844 self.assertRaises(TypeError, pat.sub, 'b', b'c')
845 self.assertRaises(TypeError, pat.sub, b'b', b'c')
846 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
847 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
848 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
849
850 def test_ascii_and_unicode_flag(self):
851 # String patterns
852 for flags in (0, re.UNICODE):
853 pat = re.compile('\xc0', flags | re.IGNORECASE)
854 self.assertNotEqual(pat.match('\xe0'), None)
855 pat = re.compile('\w', flags)
856 self.assertNotEqual(pat.match('\xe0'), None)
857 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
858 self.assertEqual(pat.match('\xe0'), None)
859 pat = re.compile('(?a)\xc0', re.IGNORECASE)
860 self.assertEqual(pat.match('\xe0'), None)
861 pat = re.compile('\w', re.ASCII)
862 self.assertEqual(pat.match('\xe0'), None)
863 pat = re.compile('(?a)\w')
864 self.assertEqual(pat.match('\xe0'), None)
865 # Bytes patterns
866 for flags in (0, re.ASCII):
867 pat = re.compile(b'\xc0', re.IGNORECASE)
868 self.assertEqual(pat.match(b'\xe0'), None)
869 pat = re.compile(b'\w')
870 self.assertEqual(pat.match(b'\xe0'), None)
871 # Incompatibilities
872 self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
873 self.assertRaises(ValueError, re.compile, b'(?u)\w')
874 self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
875 self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
876 self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
877 self.assertRaises(ValueError, re.compile, '(?au)\w')
878
Ezio Melottib92ed7c2010-03-06 15:24:08 +0000879 def test_bug_6509(self):
880 # Replacement strings of both types must parse properly.
881 # all strings
882 pat = re.compile('a(\w)')
883 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
884 pat = re.compile('a(.)')
885 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
886 pat = re.compile('..')
887 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
888
889 # all bytes
890 pat = re.compile(b'a(\w)')
891 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
892 pat = re.compile(b'a(.)')
893 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
894 pat = re.compile(b'..')
895 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
896
Antoine Pitrou82feb1f2010-01-14 17:34:48 +0000897 def test_dealloc(self):
898 # issue 3299: check for segfault in debug build
899 import _sre
Ezio Melotti0f77f462010-01-23 10:49:39 +0000900 # the overflow limit is different on wide and narrow builds and it
901 # depends on the definition of SRE_CODE (see sre.h).
902 # 2**128 should be big enough to overflow on both. For smaller values
903 # a RuntimeError is raised instead of OverflowError.
904 long_overflow = 2**128
Antoine Pitrou82feb1f2010-01-14 17:34:48 +0000905 self.assertRaises(TypeError, re.finditer, "a", {})
906 self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
Victor Stinner5abeafb2010-03-04 21:59:53 +0000907 self.assertRaises(TypeError, _sre.compile, {}, 0, [])
Christian Heimes072c0f12008-01-03 23:01:04 +0000908
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200909 def test_search_dot_unicode(self):
910 self.assertIsNotNone(re.search("123.*-", '123abc-'))
911 self.assertIsNotNone(re.search("123.*-", '123\xe9-'))
912 self.assertIsNotNone(re.search("123.*-", '123\u20ac-'))
913 self.assertIsNotNone(re.search("123.*-", '123\U0010ffff-'))
914 self.assertIsNotNone(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
915
Ezio Melottidf723e12012-03-13 01:29:48 +0200916 def test_compile(self):
917 # Test return value when given string and pattern as parameter
918 pattern = re.compile('random pattern')
919 self.assertIsInstance(pattern, re._pattern_type)
920 same_pattern = re.compile(pattern)
921 self.assertIsInstance(same_pattern, re._pattern_type)
922 self.assertIs(same_pattern, pattern)
923 # Test behaviour when not given a string or pattern as parameter
924 self.assertRaises(TypeError, re.compile, 0)
925
def _group_or_marker(match, key):
    # Return group `key` of `match`.  The strings "None" and "Error" stand
    # in for an unmatched and a nonexistent group, so that the
    # expected-result expressions can concatenate the value.
    try:
        value = match.group(key)
    except IndexError:
        return "Error"
    return "None" if value is None else value


def run_re_tests():
    """Run the table-driven cases from test.re_tests, printing mismatches.

    Each case is a 3-tuple (pattern, string, outcome) or a 5-tuple
    (pattern, string, outcome, repl, expected).  Problems are reported by
    printing a line that starts with '===' or '***' to stdout; nothing is
    returned.

    Raises:
        ValueError: if a case tuple has neither 3 nor 5 fields.
    """
    from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
    if verbose:
        print('Running re_tests test suite')

    for t in tests:
        sys.stdout.flush()
        pattern = s = outcome = repl = expected = None
        if len(t) == 5:
            pattern, s, outcome, repl, expected = t
        elif len(t) == 3:
            pattern, s, outcome = t
        else:
            raise ValueError('Test tuples should have 3 or 5 fields', t)

        try:
            obj = re.compile(pattern)
        except re.error:
            # A compile error is only a problem when none was expected.
            if outcome != SYNTAX_ERROR:
                print('=== Syntax error:', t)
            continue
        except Exception:
            # KeyboardInterrupt and SystemExit are not Exception subclasses
            # and therefore propagate with their traceback intact.
            print('*** Unexpected error ***', t)
            if verbose:
                traceback.print_exc(file=sys.stdout)
            continue

        try:
            result = obj.search(s)
        except re.error as msg:
            print('=== Unexpected exception', t, repr(msg))
            # There is no result for this case; skip the remaining checks
            # instead of reading an unbound or stale `result`.
            continue

        if outcome == SYNTAX_ERROR:
            # This should have been a syntax error; forget it.
            continue
        if outcome == FAIL:
            if result is not None:
                print('=== Succeeded incorrectly', t)
            continue
        if outcome != SUCCEED:
            continue

        # NOTE(review): everything below is taken to belong to the SUCCEED
        # case -- confirm the nesting against the upstream file.
        if result is not None:
            # Matched, as expected, so now we compute the
            # result string and compare it to our expected result.
            vardict = {'found': result.group(0),
                       'groups': result.group(),
                       'flags': result.re.flags}
            for i in range(1, 100):
                vardict['g%d' % i] = _group_or_marker(result, i)
            for name in result.re.groupindex:
                vardict[name] = _group_or_marker(result, name)
            # `repl` is an expression taken from the test table imported
            # above; never feed eval() with data from an untrusted source.
            repl = eval(repl, vardict)
            if repl != expected:
                print('=== grouping error', t, end=' ')
                print(repr(repl) + ' should be ' + repr(expected))
        else:
            print('=== Failed incorrectly', t)

        # Try the match with both pattern and string converted to
        # bytes, and check that it still succeeds.
        try:
            bpat = bytes(pattern, "ascii")
            bs = bytes(s, "ascii")
        except UnicodeEncodeError:
            # skip non-ascii tests
            pass
        else:
            try:
                bpat = re.compile(bpat)
            except Exception:
                print('=== Fails on bytes pattern compile', t)
                if verbose:
                    traceback.print_exc(file=sys.stdout)
            else:
                if bpat.search(bs) is None:
                    print('=== Fails on bytes pattern match', t)

        # Try the match with the search area limited to the extent
        # of the match and see if it still succeeds.  \B will
        # break (because it won't match at the end or start of a
        # string), so we'll ignore patterns that feature it.
        if (pattern[:2] != '\\B' and pattern[-2:] != '\\B'
                and result is not None):
            obj = re.compile(pattern)
            limited = obj.search(s, result.start(0), result.end(0) + 1)
            if limited is None:
                print('=== Failed on range-limited match', t)

        # Try the match with IGNORECASE enabled, and check that it
        # still succeeds.
        obj = re.compile(pattern, re.IGNORECASE)
        if obj.search(s) is None:
            print('=== Fails on case-insensitive match', t)

        # Try the match with LOCALE enabled, and check that it
        # still succeeds.
        if '(?u)' not in pattern:
            obj = re.compile(pattern, re.LOCALE)
            if obj.search(s) is None:
                print('=== Fails on locale-sensitive match', t)

        # Try the match with UNICODE locale enabled, and check
        # that it still succeeds.
        obj = re.compile(pattern, re.UNICODE)
        if obj.search(s) is None:
            print('=== Fails on unicode-sensitive match', t)
Gregory P. Smith5a631832010-07-27 05:31:29 +00001053
def test_main():
    """Run the ReTests unittest cases, then the table-driven re_tests checks."""
    run_unittest(ReTests)
    run_re_tests()
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001057
# Run the whole suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()