blob: 3fd6ab02d0232723505c45934f6e53679eb38fbf [file] [log] [blame]
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
Antoine Pitroud2cc7432014-02-03 20:59:59 +01002 cpython_only, captured_stdout
Benjamin Petersone48944b2012-03-07 14:50:25 -06003import io
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02004import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00005import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00006from re import Scanner
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02007import sre_compile
R David Murray26dfaac92013-04-14 13:00:54 -04008import sre_constants
Ezio Melottid2114eb2011-03-25 14:08:44 +02009import sys
10import string
11import traceback
Antoine Pitrou79aa68d2013-10-25 21:36:10 +020012import unittest
Raymond Hettinger027bb632004-05-31 03:09:25 +000013from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000014
Guido van Rossum23b22571997-07-17 22:36:14 +000015# Misc tests from Tim Peters' re.doc
16
Just van Rossum6802c6e2003-07-02 14:36:59 +000017# WARNING: Don't change details in these tests if you don't know
Ezio Melotti42da6632011-03-15 05:18:48 +020018# what you're doing. Some of these tests were carefully modeled to
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +000019# cover most of the code.
20
Serhiy Storchaka25324972013-10-16 12:46:28 +030021class S(str):
22 def __getitem__(self, index):
23 return S(super().__getitem__(index))
24
25class B(bytes):
26 def __getitem__(self, index):
27 return B(super().__getitem__(index))
28
Skip Montanaro8ed06da2003-04-24 19:43:18 +000029class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000030
Serhiy Storchaka25324972013-10-16 12:46:28 +030031 def assertTypedEqual(self, actual, expect, msg=None):
32 self.assertEqual(actual, expect, msg)
33 def recurse(actual, expect):
34 if isinstance(expect, (tuple, list)):
35 for x, y in zip(actual, expect):
36 recurse(x, y)
37 else:
38 self.assertIs(type(actual), type(expect), msg)
39 recurse(actual, expect)
40
Benjamin Petersone48944b2012-03-07 14:50:25 -060041 def test_keep_buffer(self):
42 # See bug 14212
43 b = bytearray(b'x')
44 it = re.finditer(b'a', b)
45 with self.assertRaises(BufferError):
46 b.extend(b'x'*400)
47 list(it)
48 del it
49 gc_collect()
50 b.extend(b'x'*400)
51
Raymond Hettinger027bb632004-05-31 03:09:25 +000052 def test_weakref(self):
53 s = 'QabbbcR'
54 x = re.compile('ab+c')
55 y = proxy(x)
56 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
57
Skip Montanaro8ed06da2003-04-24 19:43:18 +000058 def test_search_star_plus(self):
59 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
60 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
61 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
62 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030063 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000064 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
65 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
66 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
67 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030068 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000069
Skip Montanaro8ed06da2003-04-24 19:43:18 +000070 def bump_num(self, matchobj):
Guido van Rossum41360a41998-03-26 19:42:58 +000071 int_value = int(matchobj.group(0))
72 return str(int_value + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000073
Skip Montanaro8ed06da2003-04-24 19:43:18 +000074 def test_basic_re_sub(self):
Serhiy Storchaka25324972013-10-16 12:46:28 +030075 self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
76 self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
77 self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
78 self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
79 self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
80 self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
Serhiy Storchaka9eabac62013-10-26 10:45:48 +030081 for y in ("\xe0", "\u0430", "\U0001d49c"):
82 self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')
Serhiy Storchaka25324972013-10-16 12:46:28 +030083
Skip Montanaro8ed06da2003-04-24 19:43:18 +000084 self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
85 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
86 '9.3 -3 24x100y')
87 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
88 '9.3 -3 23x99y')
Fredrik Lundh1151a8c2000-08-08 16:47:42 +000089
Skip Montanaro8ed06da2003-04-24 19:43:18 +000090 self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
91 self.assertEqual(re.sub('.', r"\n", 'x'), '\n')
Guido van Rossumdfa67901997-12-08 17:12:06 +000092
Skip Montanaro8ed06da2003-04-24 19:43:18 +000093 s = r"\1\1"
94 self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
95 self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
96 self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)
Guido van Rossum23b22571997-07-17 22:36:14 +000097
Skip Montanaro8ed06da2003-04-24 19:43:18 +000098 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
99 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
100 self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
101 self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')
Guido van Rossum49946571997-07-18 04:26:25 +0000102
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000103 self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
104 '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
105 self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
106 self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
107 (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))
Guido van Rossum95e80531997-08-13 22:34:14 +0000108
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000109 self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000110
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000111 def test_bug_449964(self):
112 # fails for group followed by other escape
113 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
114 'xx\bxx\b')
115
116 def test_bug_449000(self):
117 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000118 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
119 'abc\ndef\n')
120 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
121 'abc\ndef\n')
122 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
123 'abc\ndef\n')
124 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
125 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000126
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000127 def test_bug_1661(self):
128 # Verify that flags do not get silently ignored with compiled patterns
129 pattern = re.compile('.')
130 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
131 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
132 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
133 self.assertRaises(ValueError, re.compile, pattern, re.I)
134
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000135 def test_bug_3629(self):
136 # A regex that triggered a bug in the sre-code validator
137 re.compile("(?P<quote>)(?(quote))")
138
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000139 def test_sub_template_numeric_escape(self):
140 # bug 776311 and friends
141 self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
142 self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
143 self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
144 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
145 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
146 self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
147 self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
148
149 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
150 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
151
152 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
153 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
154 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
155 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
156 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
157
158 self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
159 self.assertEqual(re.sub('x', r'\777', 'x'), '\377')
Tim Peters0e9980f2004-09-12 03:49:31 +0000160
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000161 self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
162 self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
163 self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
164 self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
165 self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
166 self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
167 self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
168 self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
169 self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
170 self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
171 self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
172 self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
173
174 # in python2.3 (etc), these loop endlessly in sre_parser.py
175 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
176 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
177 'xz8')
178 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
179 'xza')
180
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000181 def test_qualified_re_sub(self):
182 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
183 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000184
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000185 def test_bug_114660(self):
186 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
187 'hello there')
188
189 def test_bug_462270(self):
190 # Test for empty sub() behaviour, see SF bug #462270
191 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
192 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
193
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200194 def test_symbolic_groups(self):
195 re.compile('(?P<a>x)(?P=a)(?(a)y)')
196 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
197 self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
198 self.assertRaises(re.error, re.compile, '(?Px)')
199 self.assertRaises(re.error, re.compile, '(?P=)')
200 self.assertRaises(re.error, re.compile, '(?P=1)')
201 self.assertRaises(re.error, re.compile, '(?P=a)')
202 self.assertRaises(re.error, re.compile, '(?P=a1)')
203 self.assertRaises(re.error, re.compile, '(?P=a.)')
204 self.assertRaises(re.error, re.compile, '(?P<)')
205 self.assertRaises(re.error, re.compile, '(?P<>)')
206 self.assertRaises(re.error, re.compile, '(?P<1>)')
207 self.assertRaises(re.error, re.compile, '(?P<a.>)')
208 self.assertRaises(re.error, re.compile, '(?())')
209 self.assertRaises(re.error, re.compile, '(?(a))')
210 self.assertRaises(re.error, re.compile, '(?(1a))')
211 self.assertRaises(re.error, re.compile, '(?(a.))')
Georg Brandl1d472b72013-04-14 11:40:00 +0200212 # New valid/invalid identifiers in Python 3
213 re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
214 re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
215 self.assertRaises(re.error, re.compile, '(?P<©>x)')
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200216
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000217 def test_symbolic_refs(self):
218 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
219 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
220 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
221 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200222 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<>', 'xx')
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000223 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
224 self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
225 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
226 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000227 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
Georg Brandl1d472b72013-04-14 11:40:00 +0200228 # New valid/invalid identifiers in Python 3
229 self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
230 self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
231 self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000232
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000233 def test_re_subn(self):
234 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
235 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
236 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
237 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
238 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000239
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000240 def test_re_split(self):
Serhiy Storchaka25324972013-10-16 12:46:28 +0300241 for string in ":a:b::c", S(":a:b::c"):
242 self.assertTypedEqual(re.split(":", string),
243 ['', 'a', 'b', '', 'c'])
244 self.assertTypedEqual(re.split(":*", string),
245 ['', 'a', 'b', 'c'])
246 self.assertTypedEqual(re.split("(:*)", string),
247 ['', ':', 'a', ':', 'b', '::', 'c'])
248 for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
249 memoryview(b":a:b::c")):
250 self.assertTypedEqual(re.split(b":", string),
251 [b'', b'a', b'b', b'', b'c'])
252 self.assertTypedEqual(re.split(b":*", string),
253 [b'', b'a', b'b', b'c'])
254 self.assertTypedEqual(re.split(b"(:*)", string),
255 [b'', b':', b'a', b':', b'b', b'::', b'c'])
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300256 for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
257 "\U0001d49c\U0001d49e\U0001d4b5"):
258 string = ":%s:%s::%s" % (a, b, c)
259 self.assertEqual(re.split(":", string), ['', a, b, '', c])
260 self.assertEqual(re.split(":*", string), ['', a, b, c])
261 self.assertEqual(re.split("(:*)", string),
262 ['', ':', a, ':', b, '::', c])
Serhiy Storchaka25324972013-10-16 12:46:28 +0300263
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000264 self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
265 self.assertEqual(re.split("(:)*", ":a:b::c"),
266 ['', ':', 'a', ':', 'b', ':', 'c'])
267 self.assertEqual(re.split("([b:]+)", ":a:b::c"),
268 ['', ':', 'a', ':b::', 'c'])
269 self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
270 ['', None, ':', 'a', None, ':', '', 'b', None, '',
271 None, '::', 'c'])
272 self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
273 ['', 'a', '', '', 'c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000274
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000275 def test_qualified_re_split(self):
276 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
277 self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
278 self.assertEqual(re.split("(:)", ":a:b::c", 2),
279 ['', ':', 'a', ':', 'b::c'])
280 self.assertEqual(re.split("(:*)", ":a:b::c", 2),
281 ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000282
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000283 def test_re_findall(self):
284 self.assertEqual(re.findall(":+", "abc"), [])
Serhiy Storchaka25324972013-10-16 12:46:28 +0300285 for string in "a:b::c:::d", S("a:b::c:::d"):
286 self.assertTypedEqual(re.findall(":+", string),
287 [":", "::", ":::"])
288 self.assertTypedEqual(re.findall("(:+)", string),
289 [":", "::", ":::"])
290 self.assertTypedEqual(re.findall("(:)(:*)", string),
291 [(":", ""), (":", ":"), (":", "::")])
292 for string in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
293 memoryview(b"a:b::c:::d")):
294 self.assertTypedEqual(re.findall(b":+", string),
295 [b":", b"::", b":::"])
296 self.assertTypedEqual(re.findall(b"(:+)", string),
297 [b":", b"::", b":::"])
298 self.assertTypedEqual(re.findall(b"(:)(:*)", string),
299 [(b":", b""), (b":", b":"), (b":", b"::")])
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300300 for x in ("\xe0", "\u0430", "\U0001d49c"):
301 xx = x * 2
302 xxx = x * 3
303 string = "a%sb%sc%sd" % (x, xx, xxx)
304 self.assertEqual(re.findall("%s+" % x, string), [x, xx, xxx])
305 self.assertEqual(re.findall("(%s+)" % x, string), [x, xx, xxx])
306 self.assertEqual(re.findall("(%s)(%s*)" % (x, x), string),
307 [(x, ""), (x, x), (x, xx)])
Guido van Rossum49946571997-07-18 04:26:25 +0000308
Skip Montanaro5ba00542003-04-25 16:00:14 +0000309 def test_bug_117612(self):
310 self.assertEqual(re.findall(r"(a|(b))", "aba"),
311 [("a", ""),("b", "b"),("a", "")])
312
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000313 def test_re_match(self):
Serhiy Storchaka25324972013-10-16 12:46:28 +0300314 for string in 'a', S('a'):
315 self.assertEqual(re.match('a', string).groups(), ())
316 self.assertEqual(re.match('(a)', string).groups(), ('a',))
317 self.assertEqual(re.match('(a)', string).group(0), 'a')
318 self.assertEqual(re.match('(a)', string).group(1), 'a')
319 self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a'))
320 for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
321 self.assertEqual(re.match(b'a', string).groups(), ())
322 self.assertEqual(re.match(b'(a)', string).groups(), (b'a',))
323 self.assertEqual(re.match(b'(a)', string).group(0), b'a')
324 self.assertEqual(re.match(b'(a)', string).group(1), b'a')
325 self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300326 for a in ("\xe0", "\u0430", "\U0001d49c"):
327 self.assertEqual(re.match(a, a).groups(), ())
328 self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
329 self.assertEqual(re.match('(%s)' % a, a).group(0), a)
330 self.assertEqual(re.match('(%s)' % a, a).group(1), a)
331 self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))
Guido van Rossum49946571997-07-18 04:26:25 +0000332
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000333 pat = re.compile('((a)|(b))(c)?')
334 self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
335 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
336 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
337 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
338 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
Guido van Rossum8430c581998-04-03 21:47:12 +0000339
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000340 # A single group
341 m = re.match('(a)', 'a')
342 self.assertEqual(m.group(0), 'a')
343 self.assertEqual(m.group(0), 'a')
344 self.assertEqual(m.group(1), 'a')
345 self.assertEqual(m.group(1, 1), ('a', 'a'))
Guido van Rossum49946571997-07-18 04:26:25 +0000346
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000347 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
348 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
349 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
350 (None, 'b', None))
351 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000352
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200353 def test_re_fullmatch(self):
354 # Issue 16203: Proposal: add re.fullmatch() method.
355 self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
356 for string in "ab", S("ab"):
357 self.assertEqual(re.fullmatch(r"a|ab", string).span(), (0, 2))
358 for string in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
359 self.assertEqual(re.fullmatch(br"a|ab", string).span(), (0, 2))
360 for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
361 r = r"%s|%s" % (a, a + b)
362 self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2))
363 self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3))
364 self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3))
365 self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2))
366 self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3))
367 self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
368 self.assertIsNone(re.fullmatch(r"a+", "ab"))
369 self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
370 self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
371 self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
372 self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
373 self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4))
374 self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2))
375
376 self.assertEqual(
377 re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
378 self.assertEqual(
379 re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
380 self.assertEqual(
381 re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
382
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000383 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000384 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
385 ('(', 'a'))
386 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
387 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300388 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
389 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000390 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
391 ('a', 'b'))
392 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
393 (None, 'd'))
394 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
395 (None, 'd'))
396 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
397 ('a', ''))
398
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000399 # Tests for bug #1177831: exercise groups other than the first group
400 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
401 self.assertEqual(p.match('abc').groups(),
402 ('a', 'b', 'c'))
403 self.assertEqual(p.match('ad').groups(),
404 ('a', None, 'd'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300405 self.assertIsNone(p.match('abd'))
406 self.assertIsNone(p.match('ac'))
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000407
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000408
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000409 def test_re_groupref(self):
410 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
411 ('|', 'a'))
412 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
413 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300414 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
415 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000416 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
417 ('a', 'a'))
418 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
419 (None, None))
420
421 def test_groupdict(self):
422 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
423 'first second').groupdict(),
424 {'first':'first', 'second':'second'})
425
426 def test_expand(self):
427 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
428 "first second")
429 .expand(r"\2 \1 \g<second> \g<first>"),
430 "second first second first")
431
432 def test_repeat_minmax(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300433 self.assertIsNone(re.match("^(\w){1}$", "abc"))
434 self.assertIsNone(re.match("^(\w){1}?$", "abc"))
435 self.assertIsNone(re.match("^(\w){1,2}$", "abc"))
436 self.assertIsNone(re.match("^(\w){1,2}?$", "abc"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000437
438 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
439 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
440 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
441 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
442 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
443 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
444 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
445 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
446
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300447 self.assertIsNone(re.match("^x{1}$", "xxx"))
448 self.assertIsNone(re.match("^x{1}?$", "xxx"))
449 self.assertIsNone(re.match("^x{1,2}$", "xxx"))
450 self.assertIsNone(re.match("^x{1,2}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000451
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300452 self.assertTrue(re.match("^x{3}$", "xxx"))
453 self.assertTrue(re.match("^x{1,3}$", "xxx"))
454 self.assertTrue(re.match("^x{1,4}$", "xxx"))
455 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
456 self.assertTrue(re.match("^x{3}?$", "xxx"))
457 self.assertTrue(re.match("^x{1,3}?$", "xxx"))
458 self.assertTrue(re.match("^x{1,4}?$", "xxx"))
459 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000460
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300461 self.assertIsNone(re.match("^x{}$", "xxx"))
462 self.assertTrue(re.match("^x{}$", "x{}"))
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000463
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000464 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000465 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000466 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000467 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
468 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
469 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
470 {'first': 1, 'other': 2})
471
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000472 self.assertEqual(re.match("(a)", "a").pos, 0)
473 self.assertEqual(re.match("(a)", "a").endpos, 1)
474 self.assertEqual(re.match("(a)", "a").string, "a")
475 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300476 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000477
478 def test_special_escapes(self):
479 self.assertEqual(re.search(r"\b(b.)\b",
480 "abcd abc bcd bx").group(1), "bx")
481 self.assertEqual(re.search(r"\B(b.)\B",
482 "abc bcd bc abxd").group(1), "bx")
483 self.assertEqual(re.search(r"\b(b.)\b",
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300484 "abcd abc bcd bx", re.ASCII).group(1), "bx")
485 self.assertEqual(re.search(r"\B(b.)\B",
486 "abc bcd bc abxd", re.ASCII).group(1), "bx")
487 self.assertEqual(re.search(r"\b(b.)\b",
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000488 "abcd abc bcd bx", re.LOCALE).group(1), "bx")
489 self.assertEqual(re.search(r"\B(b.)\B",
490 "abc bcd bc abxd", re.LOCALE).group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000491 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
492 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300493 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300494 self.assertEqual(re.search(br"\b(b.)\b",
495 b"abcd abc bcd bx").group(1), b"bx")
496 self.assertEqual(re.search(br"\B(b.)\B",
497 b"abc bcd bc abxd").group(1), b"bx")
498 self.assertEqual(re.search(br"\b(b.)\b",
499 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
500 self.assertEqual(re.search(br"\B(b.)\B",
501 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
502 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
503 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300504 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000505 self.assertEqual(re.search(r"\d\D\w\W\s\S",
506 "1aa! a").group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300507 self.assertEqual(re.search(br"\d\D\w\W\s\S",
508 b"1aa! a").group(0), b"1aa! a")
509 self.assertEqual(re.search(r"\d\D\w\W\s\S",
510 "1aa! a", re.ASCII).group(0), "1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000511 self.assertEqual(re.search(r"\d\D\w\W\s\S",
512 "1aa! a", re.LOCALE).group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300513 self.assertEqual(re.search(br"\d\D\w\W\s\S",
514 b"1aa! a", re.LOCALE).group(0), b"1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000515
Ezio Melotti5a045b92012-02-29 11:48:44 +0200516 def test_string_boundaries(self):
517 # See http://bugs.python.org/issue10713
518 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
519 "abc")
520 # There's a word boundary at the start of a string.
521 self.assertTrue(re.match(r"\b", "abc"))
522 # A non-empty string includes a non-boundary zero-length match.
523 self.assertTrue(re.search(r"\B", "abc"))
524 # There is no non-boundary match at the start of a string.
525 self.assertFalse(re.match(r"\B", "abc"))
526 # However, an empty string contains no word boundaries, and also no
527 # non-boundaries.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300528 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200529 # This one is questionable and different from the perlre behaviour,
530 # but describes current behavior.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300531 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200532 # A single word-character string has two boundaries, but no
533 # non-boundary gaps.
534 self.assertEqual(len(re.findall(r"\b", "a")), 2)
535 self.assertEqual(len(re.findall(r"\B", "a")), 0)
536 # If there are no words, there are no boundaries
537 self.assertEqual(len(re.findall(r"\b", " ")), 0)
538 self.assertEqual(len(re.findall(r"\b", " ")), 0)
539 # Can match around the whitespace.
540 self.assertEqual(len(re.findall(r"\B", " ")), 2)
541
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000542 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000543 self.assertEqual(re.match("([\u2222\u2223])",
544 "\u2222").group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300545 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300546 self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000547
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100548 def test_big_codesize(self):
549 # Issue #1160
550 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300551 self.assertTrue(r.match('1000'))
552 self.assertTrue(r.match('9999'))
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100553
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000554 def test_anyall(self):
555 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
556 "a\nb")
557 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
558 "a\n\nb")
559
560 def test_non_consuming(self):
561 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
562 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
563 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
564 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
565 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
566 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
567 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
568
569 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
570 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
571 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
572 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
573
574 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000575 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300576 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000577 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
578 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
579 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
580 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
581 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
582 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
583 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
584 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
585
Serhiy Storchakab1847e72014-10-31 12:37:50 +0200586 def test_ignore_case_range(self):
587 # Issues #3511, #17381.
588 self.assertTrue(re.match(r'[9-a]', '_', re.I))
589 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
590 self.assertTrue(re.match(br'[9-a]', b'_', re.I))
591 self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
592 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
593 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
594 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
595 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
596 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
597 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
598 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
599 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
600 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
601 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
602 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
603 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
604
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000605 def test_category(self):
606 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
607
608 def test_getlower(self):
609 import _sre
610 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
611 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
612 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
613
614 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300615 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000616
617 def test_not_literal(self):
618 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
619 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
620
621 def test_search_coverage(self):
622 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
623 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
624
Ezio Melottid2114eb2011-03-25 14:08:44 +0200625 def assertMatch(self, pattern, text, match=None, span=None,
626 matcher=re.match):
627 if match is None and span is None:
628 # the pattern matches the whole text
629 match = text
630 span = (0, len(text))
631 elif match is None or span is None:
632 raise ValueError('If match is not None, span should be specified '
633 '(and vice versa).')
634 m = matcher(pattern, text)
635 self.assertTrue(m)
636 self.assertEqual(m.group(), match)
637 self.assertEqual(m.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000638
Ezio Melottid2114eb2011-03-25 14:08:44 +0200639 def test_re_escape(self):
Ezio Melotti88fdeb42011-04-10 12:59:16 +0300640 alnum_chars = string.ascii_letters + string.digits + '_'
Ezio Melottid2114eb2011-03-25 14:08:44 +0200641 p = ''.join(chr(i) for i in range(256))
642 for c in p:
643 if c in alnum_chars:
644 self.assertEqual(re.escape(c), c)
645 elif c == '\x00':
646 self.assertEqual(re.escape(c), '\\000')
647 else:
648 self.assertEqual(re.escape(c), '\\' + c)
649 self.assertMatch(re.escape(c), c)
650 self.assertMatch(re.escape(p), p)
Guido van Rossum49946571997-07-18 04:26:25 +0000651
Guido van Rossum698280d2008-09-10 17:44:35 +0000652 def test_re_escape_byte(self):
Ezio Melotti88fdeb42011-04-10 12:59:16 +0300653 alnum_chars = (string.ascii_letters + string.digits + '_').encode('ascii')
Ezio Melottid2114eb2011-03-25 14:08:44 +0200654 p = bytes(range(256))
655 for i in p:
Guido van Rossum698280d2008-09-10 17:44:35 +0000656 b = bytes([i])
Ezio Melottid2114eb2011-03-25 14:08:44 +0200657 if b in alnum_chars:
658 self.assertEqual(re.escape(b), b)
659 elif i == 0:
660 self.assertEqual(re.escape(b), b'\\000')
661 else:
662 self.assertEqual(re.escape(b), b'\\' + b)
663 self.assertMatch(re.escape(b), b)
664 self.assertMatch(re.escape(p), p)
Guido van Rossum698280d2008-09-10 17:44:35 +0000665
Ezio Melotti7b9e97b2011-03-25 14:09:33 +0200666 def test_re_escape_non_ascii(self):
667 s = 'xxx\u2620\u2620\u2620xxx'
668 s_escaped = re.escape(s)
669 self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
670 self.assertMatch(s_escaped, s)
671 self.assertMatch('.%s+.' % re.escape('\u2620'), s,
672 'x\u2620\u2620\u2620x', (2, 7), re.search)
673
674 def test_re_escape_non_ascii_bytes(self):
675 b = 'y\u2620y\u2620y'.encode('utf-8')
676 b_escaped = re.escape(b)
677 self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
678 self.assertMatch(b_escaped, b)
679 res = re.findall(re.escape('\u2620'.encode('utf-8')), b)
680 self.assertEqual(len(res), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000681
Serhiy Storchakab85a9762014-09-15 11:33:19 +0300682 def test_pickling(self):
683 import pickle
684 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
685 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
686 pickled = pickle.dumps(oldpat, proto)
687 newpat = pickle.loads(pickled)
688 self.assertEqual(newpat, oldpat)
689 # current pickle expects the _compile() reconstructor in re module
690 from re import _compile
Guido van Rossum23b22571997-07-17 22:36:14 +0000691
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000692 def test_constants(self):
693 self.assertEqual(re.I, re.IGNORECASE)
694 self.assertEqual(re.L, re.LOCALE)
695 self.assertEqual(re.M, re.MULTILINE)
696 self.assertEqual(re.S, re.DOTALL)
697 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000698
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000699 def test_flags(self):
Skip Montanaro1e703c62003-04-25 15:40:28 +0000700 for flag in [re.I, re.M, re.X, re.S, re.L]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300701 self.assertTrue(re.compile('^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +0000702
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000703 def test_sre_character_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200704 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
705 if i < 256:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300706 self.assertTrue(re.match(r"\%03o" % i, chr(i)))
707 self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
708 self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
709 self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
710 self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
711 self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200712 if i < 0x10000:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300713 self.assertTrue(re.match(r"\u%04x" % i, chr(i)))
714 self.assertTrue(re.match(r"\u%04x0" % i, chr(i)+"0"))
715 self.assertTrue(re.match(r"\u%04xz" % i, chr(i)+"z"))
716 self.assertTrue(re.match(r"\U%08x" % i, chr(i)))
717 self.assertTrue(re.match(r"\U%08x0" % i, chr(i)+"0"))
718 self.assertTrue(re.match(r"\U%08xz" % i, chr(i)+"z"))
719 self.assertTrue(re.match(r"\0", "\000"))
720 self.assertTrue(re.match(r"\08", "\0008"))
721 self.assertTrue(re.match(r"\01", "\001"))
722 self.assertTrue(re.match(r"\018", "\0018"))
723 self.assertTrue(re.match(r"\567", chr(0o167)))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200724 self.assertRaises(re.error, re.match, r"\911", "")
725 self.assertRaises(re.error, re.match, r"\x1", "")
726 self.assertRaises(re.error, re.match, r"\x1z", "")
727 self.assertRaises(re.error, re.match, r"\u123", "")
728 self.assertRaises(re.error, re.match, r"\u123z", "")
729 self.assertRaises(re.error, re.match, r"\U0001234", "")
730 self.assertRaises(re.error, re.match, r"\U0001234z", "")
731 self.assertRaises(re.error, re.match, r"\U00110000", "")
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000732
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000733 def test_sre_character_class_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200734 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
735 if i < 256:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300736 self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
737 self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
738 self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
739 self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
740 self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
741 self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
742 self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
743 self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200744 if i < 0x10000:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300745 self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
746 self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
747 self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
748 self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
749 self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
750 self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
751 self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200752 self.assertRaises(re.error, re.match, r"[\911]", "")
753 self.assertRaises(re.error, re.match, r"[\x1z]", "")
754 self.assertRaises(re.error, re.match, r"[\u123z]", "")
755 self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
756 self.assertRaises(re.error, re.match, r"[\U00110000]", "")
757
758 def test_sre_byte_literals(self):
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000759 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300760 self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
761 self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
762 self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
763 self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
764 self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
765 self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
766 self.assertTrue(re.match(br"\u", b'u'))
767 self.assertTrue(re.match(br"\U", b'U'))
768 self.assertTrue(re.match(br"\0", b"\000"))
769 self.assertTrue(re.match(br"\08", b"\0008"))
770 self.assertTrue(re.match(br"\01", b"\001"))
771 self.assertTrue(re.match(br"\018", b"\0018"))
772 self.assertTrue(re.match(br"\567", bytes([0o167])))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200773 self.assertRaises(re.error, re.match, br"\911", b"")
774 self.assertRaises(re.error, re.match, br"\x1", b"")
775 self.assertRaises(re.error, re.match, br"\x1z", b"")
776
777 def test_sre_byte_class_literals(self):
778 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300779 self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i])))
780 self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i])))
781 self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i])))
782 self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
783 self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
784 self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
785 self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
786 self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
787 self.assertTrue(re.match(br"[\u]", b'u'))
788 self.assertTrue(re.match(br"[\U]", b'U'))
Serhiy Storchakacd9032d2014-09-23 23:04:21 +0300789 self.assertRaises(re.error, re.match, br"[\911]", b"")
790 self.assertRaises(re.error, re.match, br"[\x1z]", b"")
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000791
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000792 def test_bug_113254(self):
793 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
794 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
795 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
796
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000797 def test_bug_527371(self):
798 # bug described in patches 527371/672491
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300799 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000800 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
801 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
802 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
803 self.assertEqual(re.match("((a))", "a").lastindex, 1)
804
805 def test_bug_545855(self):
806 # bug 545855 -- This pattern failed to cause a compile error as it
807 # should, instead provoking a TypeError.
808 self.assertRaises(re.error, re.compile, 'foo[a-')
809
810 def test_bug_418626(self):
811 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
812 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
813 # pattern '*?' on a long string.
814 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
815 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
816 20003)
817 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000818 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +0000819 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000820 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000821
822 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000823 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000824 self.assertEqual(re.compile(pat) and 1, 1)
825
Skip Montanaro1e703c62003-04-25 15:40:28 +0000826 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000827 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +0000828 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000829 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
830 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
831 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +0000832
Serhiy Storchakafa468162013-02-16 21:23:53 +0200833 def test_unlimited_zero_width_repeat(self):
834 # Issue #9669
835 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
836 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
837 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
838 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
839 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
840 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
841
Skip Montanaro1e703c62003-04-25 15:40:28 +0000842 def test_scanner(self):
843 def s_ident(scanner, token): return token
844 def s_operator(scanner, token): return "op%s" % token
845 def s_float(scanner, token): return float(token)
846 def s_int(scanner, token): return int(token)
847
848 scanner = Scanner([
849 (r"[a-zA-Z_]\w*", s_ident),
850 (r"\d+\.\d*", s_float),
851 (r"\d+", s_int),
852 (r"=|\+|-|\*|/", s_operator),
853 (r"\s+", None),
854 ])
855
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300856 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000857
Skip Montanaro1e703c62003-04-25 15:40:28 +0000858 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
859 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
860 'op+', 'bar'], ''))
861
Skip Montanaro5ba00542003-04-25 16:00:14 +0000862 def test_bug_448951(self):
863 # bug 448951 (similar to 429357, but with single char match)
864 # (Also test greedy matches.)
865 for op in '','?','*':
866 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
867 (None, None))
868 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
869 ('a:', 'a'))
870
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000871 def test_bug_725106(self):
872 # capturing groups in alternatives in repeats
873 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
874 ('b', 'a'))
875 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
876 ('c', 'b'))
877 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
878 ('b', None))
879 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
880 ('b', None))
881 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
882 ('b', 'a'))
883 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
884 ('c', 'b'))
885 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
886 ('b', None))
887 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
888 ('b', None))
889
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000890 def test_bug_725149(self):
891 # mark_stack_base restoring before restoring marks
892 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
893 ('a', None))
894 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
895 ('a', None, None))
896
Just van Rossum12723ba2003-07-02 20:03:04 +0000897 def test_bug_764548(self):
898 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000899 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +0000900 pat = re.compile(my_unicode("abc"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300901 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +0000902
Skip Montanaro5ba00542003-04-25 16:00:14 +0000903 def test_finditer(self):
904 iter = re.finditer(r":+", "a:b::c:::d")
905 self.assertEqual([item.group(0) for item in iter],
906 [":", "::", ":::"])
907
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600908 pat = re.compile(r":+")
909 iter = pat.finditer("a:b::c:::d", 1, 10)
910 self.assertEqual([item.group(0) for item in iter],
911 [":", "::", ":::"])
912
913 pat = re.compile(r":+")
914 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
915 self.assertEqual([item.group(0) for item in iter],
916 [":", "::", ":::"])
917
918 pat = re.compile(r":+")
919 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
920 self.assertEqual([item.group(0) for item in iter],
921 [":", "::", ":::"])
922
923 pat = re.compile(r":+")
924 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
925 self.assertEqual([item.group(0) for item in iter],
926 ["::", "::"])
927
Thomas Wouters40a088d2008-03-18 20:19:54 +0000928 def test_bug_926075(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300929 self.assertIsNot(re.compile('bug_926075'),
930 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +0000931
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000932 def test_bug_931848(self):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300933 pattern = "[\u002E\u3002\uFF0E\uFF61]"
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000934 self.assertEqual(re.compile(pattern).split("a.b.c"),
935 ['a','b','c'])
936
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000937 def test_bug_581080(self):
938 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +0000939 self.assertEqual(next(iter).span(), (1,2))
940 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000941
942 scanner = re.compile(r"\s").scanner("a b")
943 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300944 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000945
946 def test_bug_817234(self):
947 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +0000948 self.assertEqual(next(iter).span(), (0, 4))
949 self.assertEqual(next(iter).span(), (4, 4))
950 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000951
Mark Dickinson1f268282009-07-28 17:22:36 +0000952 def test_bug_6561(self):
953 # '\d' should match characters in Unicode category 'Nd'
954 # (Number, Decimal Digit), but not those in 'Nl' (Number,
955 # Letter) or 'No' (Number, Other).
956 decimal_digits = [
957 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
958 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
959 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
960 ]
961 for x in decimal_digits:
962 self.assertEqual(re.match('^\d$', x).group(0), x)
963
964 not_decimal_digits = [
965 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
966 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
967 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
968 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
969 ]
970 for x in not_decimal_digits:
971 self.assertIsNone(re.match('^\d$', x))
972
Guido van Rossumd8faa362007-04-27 19:54:29 +0000973 def test_empty_array(self):
974 # SF buf 1647541
975 import array
Guido van Rossum166746c2007-07-03 15:39:16 +0000976 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +0000977 a = array.array(typecode)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300978 self.assertIsNone(re.compile(b"bla").match(a))
Antoine Pitroufd036452008-08-19 17:56:33 +0000979 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000980
Christian Heimes072c0f12008-01-03 23:01:04 +0000981 def test_inline_flags(self):
982 # Bug #1700
Christian Heimes2e1d0f02008-01-04 00:47:51 +0000983 upper_char = chr(0x1ea0) # Latin Capital Letter A with Dot Bellow
984 lower_char = chr(0x1ea1) # Latin Small Letter A with Dot Bellow
Christian Heimes072c0f12008-01-03 23:01:04 +0000985
986 p = re.compile(upper_char, re.I | re.U)
987 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300988 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +0000989
990 p = re.compile(lower_char, re.I | re.U)
991 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300992 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +0000993
994 p = re.compile('(?i)' + upper_char, re.U)
995 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300996 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +0000997
998 p = re.compile('(?i)' + lower_char, re.U)
999 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001000 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001001
1002 p = re.compile('(?iu)' + upper_char)
1003 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001004 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001005
1006 p = re.compile('(?iu)' + lower_char)
1007 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001008 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001009
Christian Heimes25bb7832008-01-11 16:17:00 +00001010 def test_dollar_matches_twice(self):
1011 "$ matches the end of string, and just before the terminating \n"
1012 pattern = re.compile('$')
1013 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1014 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1015 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1016
1017 pattern = re.compile('$', re.MULTILINE)
1018 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1019 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1020 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1021
Antoine Pitroufd036452008-08-19 17:56:33 +00001022 def test_bytes_str_mixing(self):
1023 # Mixing str and bytes is disallowed
1024 pat = re.compile('.')
1025 bpat = re.compile(b'.')
1026 self.assertRaises(TypeError, pat.match, b'b')
1027 self.assertRaises(TypeError, bpat.match, 'b')
1028 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1029 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1030 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1031 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1032 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1033 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1034
1035 def test_ascii_and_unicode_flag(self):
1036 # String patterns
1037 for flags in (0, re.UNICODE):
1038 pat = re.compile('\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001039 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001040 pat = re.compile('\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001041 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001042 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001043 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001044 pat = re.compile('(?a)\xc0', re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001045 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001046 pat = re.compile('\w', re.ASCII)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001047 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001048 pat = re.compile('(?a)\w')
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001049 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001050 # Bytes patterns
1051 for flags in (0, re.ASCII):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001052 pat = re.compile(b'\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001053 self.assertIsNone(pat.match(b'\xe0'))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001054 pat = re.compile(b'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001055 self.assertIsNone(pat.match(b'\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001056 # Incompatibilities
1057 self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
1058 self.assertRaises(ValueError, re.compile, b'(?u)\w')
1059 self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
1060 self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
1061 self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
1062 self.assertRaises(ValueError, re.compile, '(?au)\w')
1063
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001064 def test_bug_6509(self):
1065 # Replacement strings of both types must parse properly.
1066 # all strings
1067 pat = re.compile('a(\w)')
1068 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1069 pat = re.compile('a(.)')
1070 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1071 pat = re.compile('..')
1072 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1073
1074 # all bytes
1075 pat = re.compile(b'a(\w)')
1076 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1077 pat = re.compile(b'a(.)')
1078 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1079 pat = re.compile(b'..')
1080 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1081
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001082 def test_dealloc(self):
1083 # issue 3299: check for segfault in debug build
1084 import _sre
Ezio Melotti0f77f462010-01-23 10:49:39 +00001085 # the overflow limit is different on wide and narrow builds and it
1086 # depends on the definition of SRE_CODE (see sre.h).
1087 # 2**128 should be big enough to overflow on both. For smaller values
1088 # a RuntimeError is raised instead of OverflowError.
1089 long_overflow = 2**128
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001090 self.assertRaises(TypeError, re.finditer, "a", {})
1091 self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
Victor Stinner5abeafb2010-03-04 21:59:53 +00001092 self.assertRaises(TypeError, _sre.compile, {}, 0, [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001094 def test_search_dot_unicode(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001095 self.assertTrue(re.search("123.*-", '123abc-'))
1096 self.assertTrue(re.search("123.*-", '123\xe9-'))
1097 self.assertTrue(re.search("123.*-", '123\u20ac-'))
1098 self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1099 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001100
Ezio Melottidf723e12012-03-13 01:29:48 +02001101 def test_compile(self):
1102 # Test return value when given string and pattern as parameter
1103 pattern = re.compile('random pattern')
1104 self.assertIsInstance(pattern, re._pattern_type)
1105 same_pattern = re.compile(pattern)
1106 self.assertIsInstance(same_pattern, re._pattern_type)
1107 self.assertIs(same_pattern, pattern)
1108 # Test behaviour when not given a string or pattern as parameter
1109 self.assertRaises(TypeError, re.compile, 0)
1110
Ezio Melottife8e6e72013-01-11 08:32:01 +02001111 def test_bug_13899(self):
1112 # Issue #13899: re pattern r"[\A]" should work like "A" but matches
1113 # nothing. Ditto B and Z.
1114 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
1115 ['A', 'B', '\b', 'C', 'Z'])
1116
Antoine Pitroub33941a2012-12-03 20:55:56 +01001117 @bigmemtest(size=_2G, memuse=1)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001118 def test_large_search(self, size):
1119 # Issue #10182: indices were 32-bit-truncated.
1120 s = 'a' * size
1121 m = re.search('$', s)
1122 self.assertIsNotNone(m)
Antoine Pitrou86067c22012-12-03 21:08:43 +01001123 self.assertEqual(m.start(), size)
1124 self.assertEqual(m.end(), size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001125
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001126 # The huge memuse is because of re.sub() using a list and a join()
1127 # to create the replacement result.
Antoine Pitroub33941a2012-12-03 20:55:56 +01001128 @bigmemtest(size=_2G, memuse=16 + 2)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001129 def test_large_subn(self, size):
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001130 # Issue #10182: indices were 32-bit-truncated.
1131 s = 'a' * size
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001132 r, n = re.subn('', '', s)
1133 self.assertEqual(r, s)
1134 self.assertEqual(n, size + 1)
1135
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001136 def test_bug_16688(self):
1137 # Issue 16688: Backreferences make case-insensitive regex fail on
1138 # non-ASCII strings.
1139 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1140 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001141
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001142 def test_repeat_minmax_overflow(self):
1143 # Issue #13169
1144 string = "x" * 100000
1145 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1146 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1147 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1148 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1149 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1150 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1151 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1152 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1153 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1154 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1155 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1156
1157 @cpython_only
1158 def test_repeat_minmax_overflow_maxrepeat(self):
1159 try:
1160 from _sre import MAXREPEAT
1161 except ImportError:
1162 self.skipTest('requires _sre.MAXREPEAT constant')
1163 string = "x" * 100000
1164 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1165 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1166 (0, 100000))
1167 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1168 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1169 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1170 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1171
R David Murray26dfaac92013-04-14 13:00:54 -04001172 def test_backref_group_name_in_exception(self):
1173 # Issue 17341: Poor error message when compiling invalid regex
1174 with self.assertRaisesRegex(sre_constants.error, '<foo>'):
1175 re.compile('(?P=<foo>)')
1176
1177 def test_group_name_in_exception(self):
1178 # Issue 17341: Poor error message when compiling invalid regex
1179 with self.assertRaisesRegex(sre_constants.error, '\?foo'):
1180 re.compile('(?P<?foo>)')
1181
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001182 def test_issue17998(self):
1183 for reps in '*', '+', '?', '{1}':
1184 for mod in '', '?':
1185 pattern = '.' + reps + mod + 'yz'
1186 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1187 ['xyz'], msg=pattern)
1188 pattern = pattern.encode()
1189 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1190 [b'xyz'], msg=pattern)
1191
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03001192 def test_match_repr(self):
1193 for string in '[abracadabra]', S('[abracadabra]'):
1194 m = re.search(r'(.+)(.*?)\1', string)
1195 self.assertEqual(repr(m), "<%s.%s object; "
1196 "span=(1, 12), match='abracadabra'>" %
1197 (type(m).__module__, type(m).__qualname__))
1198 for string in (b'[abracadabra]', B(b'[abracadabra]'),
1199 bytearray(b'[abracadabra]'),
1200 memoryview(b'[abracadabra]')):
1201 m = re.search(rb'(.+)(.*?)\1', string)
1202 self.assertEqual(repr(m), "<%s.%s object; "
1203 "span=(1, 12), match=b'abracadabra'>" %
1204 (type(m).__module__, type(m).__qualname__))
1205
1206 first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
1207 self.assertEqual(repr(first), "<%s.%s object; "
1208 "span=(0, 2), match='aa'>" %
1209 (type(second).__module__, type(first).__qualname__))
1210 self.assertEqual(repr(second), "<%s.%s object; "
1211 "span=(3, 5), match='bb'>" %
1212 (type(second).__module__, type(second).__qualname__))
1213
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001214
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001215 def test_bug_2537(self):
1216 # issue 2537: empty submatches
1217 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1218 for inner_op in ('{0,}', '*', '?'):
1219 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1220 m = r.match("xyyzy")
1221 self.assertEqual(m.group(0), "xyy")
1222 self.assertEqual(m.group(1), "")
1223 self.assertEqual(m.group(2), "y")
1224
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001225 def test_debug_flag(self):
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001226 pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001227 with captured_stdout() as out:
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001228 re.compile(pat, re.DEBUG)
1229 dump = '''\
1230subpattern 1
1231 literal 46
1232subpattern None
1233 branch
1234 in
1235 literal 99
1236 literal 104
1237 or
1238 literal 112
1239 literal 121
1240subpattern None
1241 groupref_exists 1
1242 at at_end
1243 else
1244 literal 58
1245 literal 32
1246'''
1247 self.assertEqual(out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001248 # Debug output is output again even a second time (bypassing
1249 # the cache -- issue #20426).
1250 with captured_stdout() as out:
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001251 re.compile(pat, re.DEBUG)
1252 self.assertEqual(out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001253
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001254 def test_keyword_parameters(self):
1255 # Issue #20283: Accepting the string keyword parameter.
1256 pat = re.compile(r'(ab)')
1257 self.assertEqual(
1258 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1259 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001260 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1261 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001262 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1263 self.assertEqual(
1264 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1265 self.assertEqual(
1266 pat.split(string='abracadabra', maxsplit=1),
1267 ['', 'ab', 'racadabra'])
1268 self.assertEqual(
1269 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1270 (7, 9))
1271
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001272 def test_bug_20998(self):
1273 # Issue #20998: Fullmatch of repeated single character pattern
1274 # with ignore case.
1275 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1276
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02001277 def test_locale_caching(self):
1278 # Issue #22410
1279 oldlocale = locale.setlocale(locale.LC_CTYPE)
1280 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1281 for loc in 'en_US.iso88591', 'en_US.utf8':
1282 try:
1283 locale.setlocale(locale.LC_CTYPE, loc)
1284 except locale.Error:
1285 # Unsupported locale on this system
1286 self.skipTest('test needs %s locale' % loc)
1287
1288 re.purge()
1289 self.check_en_US_iso88591()
1290 self.check_en_US_utf8()
1291 re.purge()
1292 self.check_en_US_utf8()
1293 self.check_en_US_iso88591()
1294
1295 def check_en_US_iso88591(self):
1296 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
1297 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1298 self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))
1299 self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I))
1300 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1301 self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5'))
1302 self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5'))
1303
1304 def check_en_US_utf8(self):
1305 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
1306 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1307 self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I))
1308 self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I))
1309 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1310 self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
1311 self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
1312
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001313
Serhiy Storchaka5c24d0e2013-11-23 22:42:43 +02001314class PatternReprTests(unittest.TestCase):
1315 def check(self, pattern, expected):
1316 self.assertEqual(repr(re.compile(pattern)), expected)
1317
1318 def check_flags(self, pattern, flags, expected):
1319 self.assertEqual(repr(re.compile(pattern, flags)), expected)
1320
1321 def test_without_flags(self):
1322 self.check('random pattern',
1323 "re.compile('random pattern')")
1324
1325 def test_single_flag(self):
1326 self.check_flags('random pattern', re.IGNORECASE,
1327 "re.compile('random pattern', re.IGNORECASE)")
1328
1329 def test_multiple_flags(self):
1330 self.check_flags('random pattern', re.I|re.S|re.X,
1331 "re.compile('random pattern', "
1332 "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
1333
1334 def test_unicode_flag(self):
1335 self.check_flags('random pattern', re.U,
1336 "re.compile('random pattern')")
1337 self.check_flags('random pattern', re.I|re.S|re.U,
1338 "re.compile('random pattern', "
1339 "re.IGNORECASE|re.DOTALL)")
1340
1341 def test_inline_flags(self):
1342 self.check('(?i)pattern',
1343 "re.compile('(?i)pattern', re.IGNORECASE)")
1344
1345 def test_unknown_flags(self):
1346 self.check_flags('random pattern', 0x123000,
1347 "re.compile('random pattern', 0x123000)")
1348 self.check_flags('random pattern', 0x123000|re.I,
1349 "re.compile('random pattern', re.IGNORECASE|0x123000)")
1350
1351 def test_bytes(self):
1352 self.check(b'bytes pattern',
1353 "re.compile(b'bytes pattern')")
1354 self.check_flags(b'bytes pattern', re.A,
1355 "re.compile(b'bytes pattern', re.ASCII)")
1356
1357 def test_quotes(self):
1358 self.check('random "double quoted" pattern',
1359 '''re.compile('random "double quoted" pattern')''')
1360 self.check("random 'single quoted' pattern",
1361 '''re.compile("random 'single quoted' pattern")''')
1362 self.check('''both 'single' and "double" quotes''',
1363 '''re.compile('both \\'single\\' and "double" quotes')''')
1364
1365 def test_long_pattern(self):
1366 pattern = 'Very %spattern' % ('long ' * 1000)
1367 r = repr(re.compile(pattern))
1368 self.assertLess(len(r), 300)
1369 self.assertEqual(r[:30], "re.compile('Very long long lon")
1370 r = repr(re.compile(pattern, re.I))
1371 self.assertLess(len(r), 300)
1372 self.assertEqual(r[:30], "re.compile('Very long long lon")
1373 self.assertEqual(r[-16:], ", re.IGNORECASE)")
1374
1375
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001376class ImplementationTest(unittest.TestCase):
1377 """
1378 Test implementation details of the re module.
1379 """
1380
1381 def test_overlap_table(self):
1382 f = sre_compile._generate_overlap_table
1383 self.assertEqual(f(""), [])
1384 self.assertEqual(f("a"), [0])
1385 self.assertEqual(f("abcd"), [0, 0, 0, 0])
1386 self.assertEqual(f("aaaa"), [0, 1, 2, 3])
1387 self.assertEqual(f("ababba"), [0, 0, 1, 2, 0, 1])
1388 self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0])
1389
1390
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001391def run_re_tests():
Georg Brandl1b37e872010-03-14 10:45:50 +00001392 from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001393 if verbose:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001394 print('Running re_tests test suite')
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001395 else:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001396 # To save time, only run the first and last 10 tests
1397 #tests = tests[:10] + tests[-10:]
1398 pass
Guido van Rossum8e0ce301997-07-11 19:34:44 +00001399
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001400 for t in tests:
1401 sys.stdout.flush()
1402 pattern = s = outcome = repl = expected = None
1403 if len(t) == 5:
1404 pattern, s, outcome, repl, expected = t
1405 elif len(t) == 3:
1406 pattern, s, outcome = t
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001407 else:
Collin Winter3add4d72007-08-29 23:37:32 +00001408 raise ValueError('Test tuples should have 3 or 5 fields', t)
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001409
Guido van Rossum41360a41998-03-26 19:42:58 +00001410 try:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001411 obj = re.compile(pattern)
1412 except re.error:
1413 if outcome == SYNTAX_ERROR: pass # Expected a syntax error
Guido van Rossum41360a41998-03-26 19:42:58 +00001414 else:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001415 print('=== Syntax error:', t)
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001416 except KeyboardInterrupt: raise KeyboardInterrupt
1417 except:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001418 print('*** Unexpected error ***', t)
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001419 if verbose:
1420 traceback.print_exc(file=sys.stdout)
1421 else:
Fredrik Lundh17741be2001-03-22 15:51:28 +00001422 try:
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001423 result = obj.search(s)
Guido van Rossumb940e112007-01-10 16:19:56 +00001424 except re.error as msg:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001425 print('=== Unexpected exception', t, repr(msg))
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001426 if outcome == SYNTAX_ERROR:
1427 # This should have been a syntax error; forget it.
1428 pass
1429 elif outcome == FAIL:
1430 if result is None: pass # No match, as expected
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001431 else: print('=== Succeeded incorrectly', t)
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001432 elif outcome == SUCCEED:
1433 if result is not None:
1434 # Matched, as expected, so now we compute the
1435 # result string and compare it to our expected result.
1436 start, end = result.span(0)
1437 vardict={'found': result.group(0),
1438 'groups': result.group(),
1439 'flags': result.re.flags}
1440 for i in range(1, 100):
1441 try:
1442 gi = result.group(i)
1443 # Special hack because else the string concat fails:
1444 if gi is None:
1445 gi = "None"
1446 except IndexError:
1447 gi = "Error"
1448 vardict['g%d' % i] = gi
1449 for i in result.re.groupindex.keys():
1450 try:
1451 gi = result.group(i)
1452 if gi is None:
1453 gi = "None"
1454 except IndexError:
1455 gi = "Error"
1456 vardict[i] = gi
1457 repl = eval(repl, vardict)
1458 if repl != expected:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001459 print('=== grouping error', t, end=' ')
1460 print(repr(repl) + ' should be ' + repr(expected))
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001461 else:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001462 print('=== Failed incorrectly', t)
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001463
Antoine Pitrou22628c42008-07-22 17:53:22 +00001464 # Try the match with both pattern and string converted to
1465 # bytes, and check that it still succeeds.
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001466 try:
Antoine Pitrou22628c42008-07-22 17:53:22 +00001467 bpat = bytes(pattern, "ascii")
1468 bs = bytes(s, "ascii")
1469 except UnicodeEncodeError:
1470 # skip non-ascii tests
1471 pass
1472 else:
1473 try:
1474 bpat = re.compile(bpat)
1475 except Exception:
1476 print('=== Fails on bytes pattern compile', t)
1477 if verbose:
1478 traceback.print_exc(file=sys.stdout)
1479 else:
1480 bytes_result = bpat.search(bs)
1481 if bytes_result is None:
1482 print('=== Fails on bytes pattern match', t)
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001483
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001484 # Try the match with the search area limited to the extent
1485 # of the match and see if it still succeeds. \B will
1486 # break (because it won't match at the end or start of a
1487 # string), so we'll ignore patterns that feature it.
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001488
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001489 if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \
1490 and result is not None:
1491 obj = re.compile(pattern)
1492 result = obj.search(s, result.start(0), result.end(0) + 1)
1493 if result is None:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001494 print('=== Failed on range-limited match', t)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +00001495
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001496 # Try the match with IGNORECASE enabled, and check that it
1497 # still succeeds.
1498 obj = re.compile(pattern, re.IGNORECASE)
1499 result = obj.search(s)
Fred Drake132dce22000-12-12 23:11:42 +00001500 if result is None:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001501 print('=== Fails on case-insensitive match', t)
Guido van Rossumdfa67901997-12-08 17:12:06 +00001502
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001503 # Try the match with LOCALE enabled, and check that it
1504 # still succeeds.
Antoine Pitrou22628c42008-07-22 17:53:22 +00001505 if '(?u)' not in pattern:
1506 obj = re.compile(pattern, re.LOCALE)
1507 result = obj.search(s)
1508 if result is None:
1509 print('=== Fails on locale-sensitive match', t)
Guido van Rossumdfa67901997-12-08 17:12:06 +00001510
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001511 # Try the match with UNICODE locale enabled, and check
1512 # that it still succeeds.
1513 obj = re.compile(pattern, re.UNICODE)
1514 result = obj.search(s)
1515 if result is None:
Guido van Rossumbe19ed72007-02-09 05:37:30 +00001516 print('=== Fails on unicode-sensitive match', t)
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001517
Gregory P. Smith5a631832010-07-27 05:31:29 +00001518
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001519def test_main():
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001520 run_unittest(__name__)
Skip Montanaro1e703c62003-04-25 15:40:28 +00001521 run_re_tests()
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001522
1523if __name__ == "__main__":
1524 test_main()