blob: 0fb47a895c65c60699751b1e864e3ea567e0e9ad [file] [log] [blame]
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
Antoine Pitroud2cc7432014-02-03 20:59:59 +01002 cpython_only, captured_stdout
Benjamin Petersone48944b2012-03-07 14:50:25 -06003import io
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02004import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00005import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00006from re import Scanner
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02007import sre_compile
R David Murray26dfaac92013-04-14 13:00:54 -04008import sre_constants
Ezio Melottid2114eb2011-03-25 14:08:44 +02009import sys
10import string
11import traceback
Antoine Pitrou79aa68d2013-10-25 21:36:10 +020012import unittest
Raymond Hettinger027bb632004-05-31 03:09:25 +000013from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000014
Guido van Rossum23b22571997-07-17 22:36:14 +000015# Misc tests from Tim Peters' re.doc
16
Just van Rossum6802c6e2003-07-02 14:36:59 +000017# WARNING: Don't change details in these tests if you don't know
Ezio Melotti42da6632011-03-15 05:18:48 +020018# what you're doing. Some of these tests were carefully modeled to
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +000019# cover most of the code.
20
class S(str):
    # str subclass used by the typed-equality tests: whatever
    # str.__getitem__ returns is wrapped in S again, so slices of an S
    # are S instances rather than plain str.
    def __getitem__(self, index):
        item = str.__getitem__(self, index)
        return S(item)
24
class B(bytes):
    # bytes subclass used by the typed-equality tests: the result of
    # bytes.__getitem__ is passed back through B(), so slices of a B are
    # B instances rather than plain bytes.
    def __getitem__(self, index):
        item = bytes.__getitem__(self, index)
        return B(item)
28
Skip Montanaro8ed06da2003-04-24 19:43:18 +000029class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000030
def assertTypedEqual(self, actual, expect, msg=None):
    # Like assertEqual(), but additionally requires every leaf value to
    # have exactly the same type as its expected counterpart.  Tuples and
    # lists in `expect` are walked pairwise; anything else is a leaf.
    self.assertEqual(actual, expect, msg)

    def check_types(got, wanted):
        if not isinstance(wanted, (tuple, list)):
            self.assertIs(type(got), type(wanted), msg)
            return
        for got_item, wanted_item in zip(got, wanted):
            check_types(got_item, wanted_item)

    check_types(actual, expect)
40
Benjamin Petersone48944b2012-03-07 14:50:25 -060041 def test_keep_buffer(self):
42 # See bug 14212
43 b = bytearray(b'x')
44 it = re.finditer(b'a', b)
45 with self.assertRaises(BufferError):
46 b.extend(b'x'*400)
47 list(it)
48 del it
49 gc_collect()
50 b.extend(b'x'*400)
51
Raymond Hettinger027bb632004-05-31 03:09:25 +000052 def test_weakref(self):
53 s = 'QabbbcR'
54 x = re.compile('ab+c')
55 y = proxy(x)
56 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
57
Skip Montanaro8ed06da2003-04-24 19:43:18 +000058 def test_search_star_plus(self):
59 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
60 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
61 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
62 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030063 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000064 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
65 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
66 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
67 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030068 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000069
def bump_num(self, matchobj):
    # Replacement callback: read the whole match as an int and return the
    # next integer as a string.
    return str(int(matchobj.group(0)) + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000073
def test_basic_re_sub(self):
    # Core re.sub() behaviour: result types for str/bytes-like inputs,
    # callable replacements, count, and escape handling in templates.
    # Templates containing backslash sequences that are not Python string
    # escapes (\g, \s) are written as raw strings; their values are the
    # same as before but they no longer rely on invalid-escape passthrough.
    self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
    self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
    self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
    self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
    for y in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')

    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                     '9.3 -3 23x99y')

    # A callable's return value is inserted literally; a template string
    # has its escapes processed.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
                     '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
    # These two deliberately use non-raw literals: the control characters
    # themselves are the replacement text.
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))

    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000110
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000111 def test_bug_449964(self):
112 # fails for group followed by other escape
113 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
114 'xx\bxx\b')
115
116 def test_bug_449000(self):
117 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000118 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
119 'abc\ndef\n')
120 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
121 'abc\ndef\n')
122 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
123 'abc\ndef\n')
124 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
125 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000126
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000127 def test_bug_1661(self):
128 # Verify that flags do not get silently ignored with compiled patterns
129 pattern = re.compile('.')
130 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
131 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
132 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
133 self.assertRaises(ValueError, re.compile, pattern, re.I)
134
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000135 def test_bug_3629(self):
136 # A regex that triggered a bug in the sre-code validator
137 re.compile("(?P<quote>)(?(quote))")
138
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000139 def test_sub_template_numeric_escape(self):
140 # bug 776311 and friends
141 self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
142 self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
143 self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
144 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
145 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
146 self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
147 self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
148
149 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
150 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
151
152 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
153 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
154 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
155 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
156 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
157
158 self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
159 self.assertEqual(re.sub('x', r'\777', 'x'), '\377')
Tim Peters0e9980f2004-09-12 03:49:31 +0000160
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000161 self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
162 self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
163 self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
164 self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
165 self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
166 self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
167 self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
168 self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
169 self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
170 self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
171 self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
172 self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
173
174 # in python2.3 (etc), these loop endlessly in sre_parser.py
175 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
176 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
177 'xz8')
178 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
179 'xza')
180
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000181 def test_qualified_re_sub(self):
182 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
183 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000184
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000185 def test_bug_114660(self):
186 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
187 'hello there')
188
189 def test_bug_462270(self):
190 # Test for empty sub() behaviour, see SF bug #462270
191 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
192 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
193
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200194 def test_symbolic_groups(self):
195 re.compile('(?P<a>x)(?P=a)(?(a)y)')
196 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
197 self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
198 self.assertRaises(re.error, re.compile, '(?Px)')
199 self.assertRaises(re.error, re.compile, '(?P=)')
200 self.assertRaises(re.error, re.compile, '(?P=1)')
201 self.assertRaises(re.error, re.compile, '(?P=a)')
202 self.assertRaises(re.error, re.compile, '(?P=a1)')
203 self.assertRaises(re.error, re.compile, '(?P=a.)')
204 self.assertRaises(re.error, re.compile, '(?P<)')
205 self.assertRaises(re.error, re.compile, '(?P<>)')
206 self.assertRaises(re.error, re.compile, '(?P<1>)')
207 self.assertRaises(re.error, re.compile, '(?P<a.>)')
208 self.assertRaises(re.error, re.compile, '(?())')
209 self.assertRaises(re.error, re.compile, '(?(a))')
210 self.assertRaises(re.error, re.compile, '(?(1a))')
211 self.assertRaises(re.error, re.compile, '(?(a.))')
Georg Brandl1d472b72013-04-14 11:40:00 +0200212 # New valid/invalid identifiers in Python 3
213 re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
214 re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
215 self.assertRaises(re.error, re.compile, '(?P<©>x)')
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200216
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000217 def test_symbolic_refs(self):
218 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
219 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
220 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
221 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200222 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<>', 'xx')
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000223 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
224 self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
225 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
226 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000227 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
Georg Brandl1d472b72013-04-14 11:40:00 +0200228 # New valid/invalid identifiers in Python 3
229 self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
230 self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
231 self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000232
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000233 def test_re_subn(self):
234 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
235 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
236 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
237 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
238 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000239
def test_re_split(self):
    # split() on str, bytes-like and non-ASCII subjects.  The typed
    # comparisons also check the element types of the returned lists.
    for subject in ":a:b::c", S(":a:b::c"):
        for sep, pieces in (
                (":", ['', 'a', 'b', '', 'c']),
                (":*", ['', 'a', 'b', 'c']),
                ("(:*)", ['', ':', 'a', ':', 'b', '::', 'c'])):
            self.assertTypedEqual(re.split(sep, subject), pieces)
    for subject in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                    memoryview(b":a:b::c")):
        for sep, pieces in (
                (b":", [b'', b'a', b'b', b'', b'c']),
                (b":*", [b'', b'a', b'b', b'c']),
                (b"(:*)", [b'', b':', b'a', b':', b'b', b'::', b'c'])):
            self.assertTypedEqual(re.split(sep, subject), pieces)
    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                    "\U0001d49c\U0001d49e\U0001d4b5"):
        subject = ":%s:%s::%s" % (a, b, c)
        for sep, pieces in (
                (":", ['', a, b, '', c]),
                (":*", ['', a, b, c]),
                ("(:*)", ['', ':', a, ':', b, '::', c])):
            self.assertEqual(re.split(sep, subject), pieces)

    # Non-capturing, repeated-capturing and alternated separators.
    for sep, pieces in (
            ("(?::*)", ['', 'a', 'b', 'c']),
            ("(:)*", ['', ':', 'a', ':', 'b', ':', 'c']),
            ("([b:]+)", ['', ':', 'a', ':b::', 'c']),
            ("(b)|(:+)", ['', None, ':', 'a', None, ':', '', 'b', None, '',
                          None, '::', 'c']),
            ("(?:b)|(?::+)", ['', 'a', '', '', 'c'])):
        self.assertEqual(re.split(sep, ":a:b::c"), pieces)
Guido van Rossum49946571997-07-18 04:26:25 +0000274
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000275 def test_qualified_re_split(self):
276 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
277 self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
278 self.assertEqual(re.split("(:)", ":a:b::c", 2),
279 ['', ':', 'a', ':', 'b::c'])
280 self.assertEqual(re.split("(:*)", ":a:b::c", 2),
281 ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000282
def test_re_findall(self):
    # findall() with zero, one and two groups on str, bytes-like and
    # non-ASCII subjects.  The typed comparisons also check element types.
    self.assertEqual(re.findall(":+", "abc"), [])
    for subject in "a:b::c:::d", S("a:b::c:::d"):
        for pattern, found in (
                (":+", [":", "::", ":::"]),
                ("(:+)", [":", "::", ":::"]),
                ("(:)(:*)", [(":", ""), (":", ":"), (":", "::")])):
            self.assertTypedEqual(re.findall(pattern, subject), found)
    for subject in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
                    memoryview(b"a:b::c:::d")):
        for pattern, found in (
                (b":+", [b":", b"::", b":::"]),
                (b"(:+)", [b":", b"::", b":::"]),
                (b"(:)(:*)", [(b":", b""), (b":", b":"), (b":", b"::")])):
            self.assertTypedEqual(re.findall(pattern, subject), found)
    for x in ("\xe0", "\u0430", "\U0001d49c"):
        xx = x * 2
        xxx = x * 3
        subject = "a%sb%sc%sd" % (x, xx, xxx)
        for pattern, found in (
                ("%s+" % x, [x, xx, xxx]),
                ("(%s+)" % x, [x, xx, xxx]),
                ("(%s)(%s*)" % (x, x), [(x, ""), (x, x), (x, xx)])):
            self.assertEqual(re.findall(pattern, subject), found)
Guido van Rossum49946571997-07-18 04:26:25 +0000308
Skip Montanaro5ba00542003-04-25 16:00:14 +0000309 def test_bug_117612(self):
310 self.assertEqual(re.findall(r"(a|(b))", "aba"),
311 [("a", ""),("b", "b"),("a", "")])
312
def test_re_match(self):
    # match() group access: groups(), group() with zero, one and several
    # arguments, by index and by name, on str, bytes-like and non-ASCII
    # subjects.
    # The loop variable is called `subject` so that it does not shadow the
    # `string` module imported at the top of the file.
    for subject in 'a', S('a'):
        self.assertEqual(re.match('a', subject).groups(), ())
        self.assertEqual(re.match('(a)', subject).groups(), ('a',))
        self.assertEqual(re.match('(a)', subject).group(0), 'a')
        self.assertEqual(re.match('(a)', subject).group(1), 'a')
        self.assertEqual(re.match('(a)', subject).group(1, 1), ('a', 'a'))
    for subject in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
        self.assertEqual(re.match(b'a', subject).groups(), ())
        self.assertEqual(re.match(b'(a)', subject).groups(), (b'a',))
        self.assertEqual(re.match(b'(a)', subject).group(0), b'a')
        self.assertEqual(re.match(b'(a)', subject).group(1), b'a')
        self.assertEqual(re.match(b'(a)', subject).group(1, 1), (b'a', b'a'))
    for a in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.match(a, a).groups(), ())
        self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
        self.assertEqual(re.match('(%s)' % a, a).group(0), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))

    # Groups that did not take part in the match are None, or the default
    # passed to groups().
    pat = re.compile('((a)|(b))(c)?')
    self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
    self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
    self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
    self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
    self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

    # A single group
    m = re.match('(a)', 'a')
    # group() with no argument is the whole match; this line used to be a
    # second, identical group(0) assertion.
    self.assertEqual(m.group(), 'a')
    self.assertEqual(m.group(0), 'a')
    self.assertEqual(m.group(1), 'a')
    self.assertEqual(m.group(1, 1), ('a', 'a'))

    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                     (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000352
def test_re_fullmatch(self):
    # Issue 16203: Proposal: add re.fullmatch() method.
    self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
    for subject in "ab", S("ab"):
        self.assertEqual(re.fullmatch(r"a|ab", subject).span(), (0, 2))
    for subject in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
        self.assertEqual(re.fullmatch(br"a|ab", subject).span(), (0, 2))
    for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
        r = r"%s|%s" % (a, a + b)
        self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2))
    # Lazy repeats must still be extended to cover the whole subject.
    for pattern, subject, span in (
            (r".*?$", "abc", (0, 3)),
            (r".*?", "abc", (0, 3)),
            (r"a.*?b", "ab", (0, 2)),
            (r"a.*?b", "abb", (0, 3)),
            (r"a.*?b", "axxb", (0, 4))):
        self.assertEqual(re.fullmatch(pattern, subject).span(), span)
    # Patterns that cannot cover the whole subject.
    for pattern, subject in (
            (r"a+", "ab"),
            (r"abc$", "abc\n"),
            (r"abc\Z", "abc\n"),
            (r"(?m)abc$", "abc\n")):
        self.assertIsNone(re.fullmatch(pattern, subject))
    # Lookahead / lookbehind inside a full match.
    for pattern, subject, span in (
            (r"ab(?=c)cd", "abcd", (0, 4)),
            (r"ab(?<=b)cd", "abcd", (0, 4)),
            (r"(?=a|ab)ab", "ab", (0, 2))):
        self.assertEqual(re.fullmatch(pattern, subject).span(), span)

    # With pos/endpos the match must cover exactly that window.
    for pattern in (r"bc", r".*?$", r".*?"):
        self.assertEqual(
            re.compile(pattern).fullmatch("abcd", pos=1, endpos=3).span(),
            (1, 3))
382
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000383 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000384 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
385 ('(', 'a'))
386 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
387 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300388 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
389 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000390 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
391 ('a', 'b'))
392 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
393 (None, 'd'))
394 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
395 (None, 'd'))
396 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
397 ('a', ''))
398
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000399 # Tests for bug #1177831: exercise groups other than the first group
400 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
401 self.assertEqual(p.match('abc').groups(),
402 ('a', 'b', 'c'))
403 self.assertEqual(p.match('ad').groups(),
404 ('a', None, 'd'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300405 self.assertIsNone(p.match('abd'))
406 self.assertIsNone(p.match('ac'))
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000407
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000408
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000409 def test_re_groupref(self):
410 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
411 ('|', 'a'))
412 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
413 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300414 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
415 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000416 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
417 ('a', 'a'))
418 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
419 (None, None))
420
421 def test_groupdict(self):
422 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
423 'first second').groupdict(),
424 {'first':'first', 'second':'second'})
425
426 def test_expand(self):
427 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
428 "first second")
429 .expand(r"\2 \1 \g<second> \g<first>"),
430 "second first second first")
431
432 def test_repeat_minmax(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300433 self.assertIsNone(re.match("^(\w){1}$", "abc"))
434 self.assertIsNone(re.match("^(\w){1}?$", "abc"))
435 self.assertIsNone(re.match("^(\w){1,2}$", "abc"))
436 self.assertIsNone(re.match("^(\w){1,2}?$", "abc"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000437
438 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
439 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
440 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
441 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
442 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
443 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
444 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
445 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
446
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300447 self.assertIsNone(re.match("^x{1}$", "xxx"))
448 self.assertIsNone(re.match("^x{1}?$", "xxx"))
449 self.assertIsNone(re.match("^x{1,2}$", "xxx"))
450 self.assertIsNone(re.match("^x{1,2}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000451
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300452 self.assertTrue(re.match("^x{3}$", "xxx"))
453 self.assertTrue(re.match("^x{1,3}$", "xxx"))
454 self.assertTrue(re.match("^x{1,4}$", "xxx"))
455 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
456 self.assertTrue(re.match("^x{3}?$", "xxx"))
457 self.assertTrue(re.match("^x{1,3}?$", "xxx"))
458 self.assertTrue(re.match("^x{1,4}?$", "xxx"))
459 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000460
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300461 self.assertIsNone(re.match("^x{}$", "xxx"))
462 self.assertTrue(re.match("^x{}$", "x{}"))
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000463
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000464 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000465 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000466 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000467 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
468 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
469 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
470 {'first': 1, 'other': 2})
471
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000472 self.assertEqual(re.match("(a)", "a").pos, 0)
473 self.assertEqual(re.match("(a)", "a").endpos, 1)
474 self.assertEqual(re.match("(a)", "a").string, "a")
475 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300476 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000477
478 def test_special_escapes(self):
479 self.assertEqual(re.search(r"\b(b.)\b",
480 "abcd abc bcd bx").group(1), "bx")
481 self.assertEqual(re.search(r"\B(b.)\B",
482 "abc bcd bc abxd").group(1), "bx")
483 self.assertEqual(re.search(r"\b(b.)\b",
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300484 "abcd abc bcd bx", re.ASCII).group(1), "bx")
485 self.assertEqual(re.search(r"\B(b.)\B",
486 "abc bcd bc abxd", re.ASCII).group(1), "bx")
487 self.assertEqual(re.search(r"\b(b.)\b",
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000488 "abcd abc bcd bx", re.LOCALE).group(1), "bx")
489 self.assertEqual(re.search(r"\B(b.)\B",
490 "abc bcd bc abxd", re.LOCALE).group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000491 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
492 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300493 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300494 self.assertEqual(re.search(br"\b(b.)\b",
495 b"abcd abc bcd bx").group(1), b"bx")
496 self.assertEqual(re.search(br"\B(b.)\B",
497 b"abc bcd bc abxd").group(1), b"bx")
498 self.assertEqual(re.search(br"\b(b.)\b",
499 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
500 self.assertEqual(re.search(br"\B(b.)\B",
501 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
502 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
503 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300504 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000505 self.assertEqual(re.search(r"\d\D\w\W\s\S",
506 "1aa! a").group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300507 self.assertEqual(re.search(br"\d\D\w\W\s\S",
508 b"1aa! a").group(0), b"1aa! a")
509 self.assertEqual(re.search(r"\d\D\w\W\s\S",
510 "1aa! a", re.ASCII).group(0), "1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000511 self.assertEqual(re.search(r"\d\D\w\W\s\S",
512 "1aa! a", re.LOCALE).group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300513 self.assertEqual(re.search(br"\d\D\w\W\s\S",
514 b"1aa! a", re.LOCALE).group(0), b"1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000515
Ezio Melotti5a045b92012-02-29 11:48:44 +0200516 def test_string_boundaries(self):
517 # See http://bugs.python.org/issue10713
518 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
519 "abc")
520 # There's a word boundary at the start of a string.
521 self.assertTrue(re.match(r"\b", "abc"))
522 # A non-empty string includes a non-boundary zero-length match.
523 self.assertTrue(re.search(r"\B", "abc"))
524 # There is no non-boundary match at the start of a string.
525 self.assertFalse(re.match(r"\B", "abc"))
526 # However, an empty string contains no word boundaries, and also no
527 # non-boundaries.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300528 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200529 # This one is questionable and different from the perlre behaviour,
530 # but describes current behavior.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300531 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200532 # A single word-character string has two boundaries, but no
533 # non-boundary gaps.
534 self.assertEqual(len(re.findall(r"\b", "a")), 2)
535 self.assertEqual(len(re.findall(r"\B", "a")), 0)
536 # If there are no words, there are no boundaries
537 self.assertEqual(len(re.findall(r"\b", " ")), 0)
538 self.assertEqual(len(re.findall(r"\b", " ")), 0)
539 # Can match around the whitespace.
540 self.assertEqual(len(re.findall(r"\B", " ")), 2)
541
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000542 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000543 self.assertEqual(re.match("([\u2222\u2223])",
544 "\u2222").group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300545 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300546 self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000547
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100548 def test_big_codesize(self):
549 # Issue #1160
550 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300551 self.assertTrue(r.match('1000'))
552 self.assertTrue(r.match('9999'))
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100553
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000554 def test_anyall(self):
555 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
556 "a\nb")
557 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
558 "a\n\nb")
559
Serhiy Storchaka84df7fe2014-11-07 21:43:57 +0200560 def test_lookahead(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000561 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
562 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
563 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
564 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
565 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
566 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
567 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
568
569 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
570 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
571 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
572 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
573
Serhiy Storchaka84df7fe2014-11-07 21:43:57 +0200574 # Group reference.
575 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
576 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
577 # Conditional group reference.
578 self.assertTrue(re.match('(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
579 self.assertIsNone(re.match('(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
580 self.assertTrue(re.match('(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
581 self.assertIsNone(re.match('(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
582 self.assertTrue(re.match('(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
583 # Group used before defined.
584 self.assertTrue(re.match('(a)b(?=(?(2)x|c))(c)', 'abc'))
585 self.assertIsNone(re.match('(a)b(?=(?(2)b|x))(c)', 'abc'))
586 self.assertTrue(re.match('(a)b(?=(?(1)c|x))(c)', 'abc'))
587
588 def test_lookbehind(self):
589 self.assertTrue(re.match('ab(?<=b)c', 'abc'))
590 self.assertIsNone(re.match('ab(?<=c)c', 'abc'))
591 self.assertIsNone(re.match('ab(?<!b)c', 'abc'))
592 self.assertTrue(re.match('ab(?<!c)c', 'abc'))
593 # Group reference.
594 self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
595 self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
596 self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
597 self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
598 # Conditional group reference.
599 self.assertIsNone(re.match('(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
600 self.assertIsNone(re.match('(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
601 self.assertTrue(re.match('(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
602 self.assertIsNone(re.match('(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
603 self.assertTrue(re.match('(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
604 # Group used before defined.
605 self.assertIsNone(re.match('(a)b(?<=(?(2)x|c))(c)', 'abc'))
606 self.assertIsNone(re.match('(a)b(?<=(?(2)b|x))(c)', 'abc'))
607 self.assertIsNone(re.match('(a)b(?<=(?(1)c|x))(c)', 'abc'))
608 self.assertTrue(re.match('(a)b(?<=(?(1)b|x))(c)', 'abc'))
609
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000610 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000611 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300612 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000613 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
614 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
615 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
616 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
617 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
618 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
619 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
620 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
621
Serhiy Storchakab1847e72014-10-31 12:37:50 +0200622 def test_ignore_case_range(self):
623 # Issues #3511, #17381.
624 self.assertTrue(re.match(r'[9-a]', '_', re.I))
625 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
626 self.assertTrue(re.match(br'[9-a]', b'_', re.I))
627 self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
628 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
629 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
630 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
631 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
632 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
633 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
634 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
635 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
636 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
637 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
638 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
639 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
640
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000641 def test_category(self):
642 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
643
644 def test_getlower(self):
645 import _sre
646 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
647 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
648 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
649
650 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300651 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000652
653 def test_not_literal(self):
654 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
655 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
656
657 def test_search_coverage(self):
658 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
659 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
660
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    """Assert that *pattern* matches *text* with the expected result.

    *matcher* is called as ``matcher(pattern, text)``.  When neither
    *match* nor *span* is given, the pattern is expected to match the
    whole of *text*.  Otherwise both must be given: *match* is the
    expected ``group()`` and *span* the expected ``span()``.

    Raises ValueError if exactly one of *match* and *span* is supplied.
    """
    missing = (match is None, span is None)
    if all(missing):
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    elif any(missing):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    result = matcher(pattern, text)
    self.assertTrue(result)
    self.assertEqual(result.group(), match)
    self.assertEqual(result.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000674
def test_re_escape(self):
    # For every character in range(256): ASCII letters, digits and "_"
    # are left alone, NUL becomes "\000", anything else gets a backslash.
    untouched = set(string.ascii_letters + string.digits + '_')
    everything = ''.join(map(chr, range(256)))
    for char in everything:
        if char in untouched:
            expected = char
        elif char == '\x00':
            expected = '\\000'
        else:
            expected = '\\' + char
        self.assertEqual(re.escape(char), expected)
        # The escaped form must match the character it came from.
        self.assertMatch(re.escape(char), char)
    self.assertMatch(re.escape(everything), everything)
Guido van Rossum49946571997-07-18 04:26:25 +0000687
def test_re_escape_byte(self):
    # Bytes counterpart of test_re_escape, over all 256 byte values.
    untouched = (string.ascii_letters + string.digits + '_').encode('ascii')
    everything = bytes(range(256))
    for value in everything:
        single = bytes([value])
        if single in untouched:
            expected = single
        elif value == 0:
            expected = b'\\000'
        else:
            expected = b'\\' + single
        self.assertEqual(re.escape(single), expected)
        # The escaped form must match the byte it came from.
        self.assertMatch(re.escape(single), single)
    self.assertMatch(re.escape(everything), everything)
Guido van Rossum698280d2008-09-10 17:44:35 +0000701
def test_re_escape_non_ascii(self):
    # Non-ASCII characters are backslash-escaped and still match.
    text = 'xxx\u2620\u2620\u2620xxx'
    escaped = re.escape(text)
    self.assertEqual(escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
    self.assertMatch(escaped, text)
    # An escaped character can be embedded in a larger pattern.
    embedded = '.%s+.' % re.escape('\u2620')
    self.assertMatch(embedded, text, match='x\u2620\u2620\u2620x',
                     span=(2, 7), matcher=re.search)
709
def test_re_escape_non_ascii_bytes(self):
    # Each byte of a UTF-8 encoded non-ASCII character is escaped.
    data = 'y\u2620y\u2620y'.encode('utf-8')
    escaped = re.escape(data)
    self.assertEqual(escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
    self.assertMatch(escaped, data)
    # The escaped encoded character is found twice in the data.
    needle = re.escape('\u2620'.encode('utf-8'))
    occurrences = re.findall(needle, data)
    self.assertEqual(len(occurrences), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000717
Serhiy Storchakab85a9762014-09-15 11:33:19 +0300718 def test_pickling(self):
719 import pickle
720 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
721 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
722 pickled = pickle.dumps(oldpat, proto)
723 newpat = pickle.loads(pickled)
724 self.assertEqual(newpat, oldpat)
725 # current pickle expects the _compile() reconstructor in re module
726 from re import _compile
Guido van Rossum23b22571997-07-17 22:36:14 +0000727
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000728 def test_constants(self):
729 self.assertEqual(re.I, re.IGNORECASE)
730 self.assertEqual(re.L, re.LOCALE)
731 self.assertEqual(re.M, re.MULTILINE)
732 self.assertEqual(re.S, re.DOTALL)
733 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000734
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000735 def test_flags(self):
Skip Montanaro1e703c62003-04-25 15:40:28 +0000736 for flag in [re.I, re.M, re.X, re.S, re.L]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300737 self.assertTrue(re.compile('^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +0000738
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000739 def test_sre_character_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200740 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
741 if i < 256:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300742 self.assertTrue(re.match(r"\%03o" % i, chr(i)))
743 self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
744 self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
745 self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
746 self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
747 self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200748 if i < 0x10000:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300749 self.assertTrue(re.match(r"\u%04x" % i, chr(i)))
750 self.assertTrue(re.match(r"\u%04x0" % i, chr(i)+"0"))
751 self.assertTrue(re.match(r"\u%04xz" % i, chr(i)+"z"))
752 self.assertTrue(re.match(r"\U%08x" % i, chr(i)))
753 self.assertTrue(re.match(r"\U%08x0" % i, chr(i)+"0"))
754 self.assertTrue(re.match(r"\U%08xz" % i, chr(i)+"z"))
755 self.assertTrue(re.match(r"\0", "\000"))
756 self.assertTrue(re.match(r"\08", "\0008"))
757 self.assertTrue(re.match(r"\01", "\001"))
758 self.assertTrue(re.match(r"\018", "\0018"))
759 self.assertTrue(re.match(r"\567", chr(0o167)))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200760 self.assertRaises(re.error, re.match, r"\911", "")
761 self.assertRaises(re.error, re.match, r"\x1", "")
762 self.assertRaises(re.error, re.match, r"\x1z", "")
763 self.assertRaises(re.error, re.match, r"\u123", "")
764 self.assertRaises(re.error, re.match, r"\u123z", "")
765 self.assertRaises(re.error, re.match, r"\U0001234", "")
766 self.assertRaises(re.error, re.match, r"\U0001234z", "")
767 self.assertRaises(re.error, re.match, r"\U00110000", "")
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000768
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000769 def test_sre_character_class_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200770 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
771 if i < 256:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300772 self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
773 self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
774 self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
775 self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
776 self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
777 self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
778 self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
779 self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200780 if i < 0x10000:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300781 self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
782 self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
783 self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
784 self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
785 self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
786 self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
787 self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200788 self.assertRaises(re.error, re.match, r"[\911]", "")
789 self.assertRaises(re.error, re.match, r"[\x1z]", "")
790 self.assertRaises(re.error, re.match, r"[\u123z]", "")
791 self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
792 self.assertRaises(re.error, re.match, r"[\U00110000]", "")
793
794 def test_sre_byte_literals(self):
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000795 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300796 self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
797 self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
798 self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
799 self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
800 self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
801 self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
802 self.assertTrue(re.match(br"\u", b'u'))
803 self.assertTrue(re.match(br"\U", b'U'))
804 self.assertTrue(re.match(br"\0", b"\000"))
805 self.assertTrue(re.match(br"\08", b"\0008"))
806 self.assertTrue(re.match(br"\01", b"\001"))
807 self.assertTrue(re.match(br"\018", b"\0018"))
808 self.assertTrue(re.match(br"\567", bytes([0o167])))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200809 self.assertRaises(re.error, re.match, br"\911", b"")
810 self.assertRaises(re.error, re.match, br"\x1", b"")
811 self.assertRaises(re.error, re.match, br"\x1z", b"")
812
813 def test_sre_byte_class_literals(self):
814 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300815 self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i])))
816 self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i])))
817 self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i])))
818 self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
819 self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
820 self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
821 self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
822 self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
823 self.assertTrue(re.match(br"[\u]", b'u'))
824 self.assertTrue(re.match(br"[\U]", b'U'))
Serhiy Storchakacd9032d2014-09-23 23:04:21 +0300825 self.assertRaises(re.error, re.match, br"[\911]", b"")
826 self.assertRaises(re.error, re.match, br"[\x1z]", b"")
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000827
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000828 def test_bug_113254(self):
829 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
830 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
831 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
832
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000833 def test_bug_527371(self):
834 # bug described in patches 527371/672491
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300835 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000836 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
837 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
838 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
839 self.assertEqual(re.match("((a))", "a").lastindex, 1)
840
841 def test_bug_545855(self):
842 # bug 545855 -- This pattern failed to cause a compile error as it
843 # should, instead provoking a TypeError.
844 self.assertRaises(re.error, re.compile, 'foo[a-')
845
846 def test_bug_418626(self):
847 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
848 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
849 # pattern '*?' on a long string.
850 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
851 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
852 20003)
853 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000854 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +0000855 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000856 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000857
858 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000859 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000860 self.assertEqual(re.compile(pat) and 1, 1)
861
Skip Montanaro1e703c62003-04-25 15:40:28 +0000862 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000863 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +0000864 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000865 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
866 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
867 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +0000868
Serhiy Storchakafa468162013-02-16 21:23:53 +0200869 def test_unlimited_zero_width_repeat(self):
870 # Issue #9669
871 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
872 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
873 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
874 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
875 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
876 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
877
Skip Montanaro1e703c62003-04-25 15:40:28 +0000878 def test_scanner(self):
879 def s_ident(scanner, token): return token
880 def s_operator(scanner, token): return "op%s" % token
881 def s_float(scanner, token): return float(token)
882 def s_int(scanner, token): return int(token)
883
884 scanner = Scanner([
885 (r"[a-zA-Z_]\w*", s_ident),
886 (r"\d+\.\d*", s_float),
887 (r"\d+", s_int),
888 (r"=|\+|-|\*|/", s_operator),
889 (r"\s+", None),
890 ])
891
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300892 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000893
Skip Montanaro1e703c62003-04-25 15:40:28 +0000894 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
895 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
896 'op+', 'bar'], ''))
897
Skip Montanaro5ba00542003-04-25 16:00:14 +0000898 def test_bug_448951(self):
899 # bug 448951 (similar to 429357, but with single char match)
900 # (Also test greedy matches.)
901 for op in '','?','*':
902 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
903 (None, None))
904 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
905 ('a:', 'a'))
906
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000907 def test_bug_725106(self):
908 # capturing groups in alternatives in repeats
909 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
910 ('b', 'a'))
911 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
912 ('c', 'b'))
913 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
914 ('b', None))
915 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
916 ('b', None))
917 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
918 ('b', 'a'))
919 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
920 ('c', 'b'))
921 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
922 ('b', None))
923 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
924 ('b', None))
925
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000926 def test_bug_725149(self):
927 # mark_stack_base restoring before restoring marks
928 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
929 ('a', None))
930 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
931 ('a', None, None))
932
Just van Rossum12723ba2003-07-02 20:03:04 +0000933 def test_bug_764548(self):
934 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000935 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +0000936 pat = re.compile(my_unicode("abc"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300937 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +0000938
Skip Montanaro5ba00542003-04-25 16:00:14 +0000939 def test_finditer(self):
940 iter = re.finditer(r":+", "a:b::c:::d")
941 self.assertEqual([item.group(0) for item in iter],
942 [":", "::", ":::"])
943
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600944 pat = re.compile(r":+")
945 iter = pat.finditer("a:b::c:::d", 1, 10)
946 self.assertEqual([item.group(0) for item in iter],
947 [":", "::", ":::"])
948
949 pat = re.compile(r":+")
950 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
951 self.assertEqual([item.group(0) for item in iter],
952 [":", "::", ":::"])
953
954 pat = re.compile(r":+")
955 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
956 self.assertEqual([item.group(0) for item in iter],
957 [":", "::", ":::"])
958
959 pat = re.compile(r":+")
960 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
961 self.assertEqual([item.group(0) for item in iter],
962 ["::", "::"])
963
Thomas Wouters40a088d2008-03-18 20:19:54 +0000964 def test_bug_926075(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300965 self.assertIsNot(re.compile('bug_926075'),
966 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +0000967
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000968 def test_bug_931848(self):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300969 pattern = "[\u002E\u3002\uFF0E\uFF61]"
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000970 self.assertEqual(re.compile(pattern).split("a.b.c"),
971 ['a','b','c'])
972
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000973 def test_bug_581080(self):
974 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +0000975 self.assertEqual(next(iter).span(), (1,2))
976 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000977
978 scanner = re.compile(r"\s").scanner("a b")
979 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300980 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000981
982 def test_bug_817234(self):
983 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +0000984 self.assertEqual(next(iter).span(), (0, 4))
985 self.assertEqual(next(iter).span(), (4, 4))
986 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000987
Mark Dickinson1f268282009-07-28 17:22:36 +0000988 def test_bug_6561(self):
989 # '\d' should match characters in Unicode category 'Nd'
990 # (Number, Decimal Digit), but not those in 'Nl' (Number,
991 # Letter) or 'No' (Number, Other).
992 decimal_digits = [
993 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
994 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
995 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
996 ]
997 for x in decimal_digits:
998 self.assertEqual(re.match('^\d$', x).group(0), x)
999
1000 not_decimal_digits = [
1001 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1002 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1003 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1004 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1005 ]
1006 for x in not_decimal_digits:
1007 self.assertIsNone(re.match('^\d$', x))
1008
Guido van Rossumd8faa362007-04-27 19:54:29 +00001009 def test_empty_array(self):
1010 # SF buf 1647541
1011 import array
Guido van Rossum166746c2007-07-03 15:39:16 +00001012 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +00001013 a = array.array(typecode)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001014 self.assertIsNone(re.compile(b"bla").match(a))
Antoine Pitroufd036452008-08-19 17:56:33 +00001015 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001016
Christian Heimes072c0f12008-01-03 23:01:04 +00001017 def test_inline_flags(self):
1018 # Bug #1700
Christian Heimes2e1d0f02008-01-04 00:47:51 +00001019 upper_char = chr(0x1ea0) # Latin Capital Letter A with Dot Bellow
1020 lower_char = chr(0x1ea1) # Latin Small Letter A with Dot Bellow
Christian Heimes072c0f12008-01-03 23:01:04 +00001021
1022 p = re.compile(upper_char, re.I | re.U)
1023 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001024 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001025
1026 p = re.compile(lower_char, re.I | re.U)
1027 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001028 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001029
1030 p = re.compile('(?i)' + upper_char, re.U)
1031 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001032 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001033
1034 p = re.compile('(?i)' + lower_char, re.U)
1035 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001036 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001037
1038 p = re.compile('(?iu)' + upper_char)
1039 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001040 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001041
1042 p = re.compile('(?iu)' + lower_char)
1043 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001044 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001045
Christian Heimes25bb7832008-01-11 16:17:00 +00001046 def test_dollar_matches_twice(self):
1047 "$ matches the end of string, and just before the terminating \n"
1048 pattern = re.compile('$')
1049 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1050 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1051 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1052
1053 pattern = re.compile('$', re.MULTILINE)
1054 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1055 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1056 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1057
Antoine Pitroufd036452008-08-19 17:56:33 +00001058 def test_bytes_str_mixing(self):
1059 # Mixing str and bytes is disallowed
1060 pat = re.compile('.')
1061 bpat = re.compile(b'.')
1062 self.assertRaises(TypeError, pat.match, b'b')
1063 self.assertRaises(TypeError, bpat.match, 'b')
1064 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1065 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1066 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1067 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1068 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1069 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1070
1071 def test_ascii_and_unicode_flag(self):
1072 # String patterns
1073 for flags in (0, re.UNICODE):
1074 pat = re.compile('\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001075 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001076 pat = re.compile('\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001077 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001078 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001079 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001080 pat = re.compile('(?a)\xc0', re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001081 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001082 pat = re.compile('\w', re.ASCII)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001083 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001084 pat = re.compile('(?a)\w')
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001085 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001086 # Bytes patterns
1087 for flags in (0, re.ASCII):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001088 pat = re.compile(b'\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001089 self.assertIsNone(pat.match(b'\xe0'))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001090 pat = re.compile(b'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001091 self.assertIsNone(pat.match(b'\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001092 # Incompatibilities
1093 self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
1094 self.assertRaises(ValueError, re.compile, b'(?u)\w')
1095 self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
1096 self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
1097 self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
1098 self.assertRaises(ValueError, re.compile, '(?au)\w')
1099
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001100 def test_bug_6509(self):
1101 # Replacement strings of both types must parse properly.
1102 # all strings
1103 pat = re.compile('a(\w)')
1104 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1105 pat = re.compile('a(.)')
1106 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1107 pat = re.compile('..')
1108 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1109
1110 # all bytes
1111 pat = re.compile(b'a(\w)')
1112 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1113 pat = re.compile(b'a(.)')
1114 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1115 pat = re.compile(b'..')
1116 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1117
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001118 def test_dealloc(self):
1119 # issue 3299: check for segfault in debug build
1120 import _sre
Ezio Melotti0f77f462010-01-23 10:49:39 +00001121 # the overflow limit is different on wide and narrow builds and it
1122 # depends on the definition of SRE_CODE (see sre.h).
1123 # 2**128 should be big enough to overflow on both. For smaller values
1124 # a RuntimeError is raised instead of OverflowError.
1125 long_overflow = 2**128
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001126 self.assertRaises(TypeError, re.finditer, "a", {})
1127 self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
Victor Stinner5abeafb2010-03-04 21:59:53 +00001128 self.assertRaises(TypeError, _sre.compile, {}, 0, [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001130 def test_search_dot_unicode(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001131 self.assertTrue(re.search("123.*-", '123abc-'))
1132 self.assertTrue(re.search("123.*-", '123\xe9-'))
1133 self.assertTrue(re.search("123.*-", '123\u20ac-'))
1134 self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1135 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001136
Ezio Melottidf723e12012-03-13 01:29:48 +02001137 def test_compile(self):
1138 # Test return value when given string and pattern as parameter
1139 pattern = re.compile('random pattern')
1140 self.assertIsInstance(pattern, re._pattern_type)
1141 same_pattern = re.compile(pattern)
1142 self.assertIsInstance(same_pattern, re._pattern_type)
1143 self.assertIs(same_pattern, pattern)
1144 # Test behaviour when not given a string or pattern as parameter
1145 self.assertRaises(TypeError, re.compile, 0)
1146
Ezio Melottife8e6e72013-01-11 08:32:01 +02001147 def test_bug_13899(self):
1148 # Issue #13899: re pattern r"[\A]" should work like "A" but matches
1149 # nothing. Ditto B and Z.
1150 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
1151 ['A', 'B', '\b', 'C', 'Z'])
1152
@bigmemtest(size=_2G, memuse=1)
def test_large_search(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    subject = 'a' * size
    found = re.search('$', subject)
    self.assertIsNotNone(found)
    # The empty match sits at the very end of the string.
    self.assertEqual(found.start(), size)
    self.assertEqual(found.end(), size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001161
# The substitution assembles its result from a list and a join(), which
# is why the declared memuse is so large.
@bigmemtest(size=_2G, memuse=16 + 2)
def test_large_subn(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    subject = 'a' * size
    # Replacing the empty pattern with the empty string leaves the text
    # untouched; the expected count is one more than the length.
    replaced, count = re.subn('', '', subject)
    self.assertEqual(replaced, subject)
    self.assertEqual(count, size + 1)
1171
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001172 def test_bug_16688(self):
1173 # Issue 16688: Backreferences make case-insensitive regex fail on
1174 # non-ASCII strings.
1175 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1176 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001177
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001178 def test_repeat_minmax_overflow(self):
1179 # Issue #13169
1180 string = "x" * 100000
1181 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1182 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1183 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1184 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1185 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1186 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1187 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1188 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1189 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1190 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1191 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1192
@cpython_only
def test_repeat_minmax_overflow_maxrepeat(self):
    try:
        from _sre import MAXREPEAT
    except ImportError:
        self.skipTest('requires _sre.MAXREPEAT constant')
    subject = "x" * 100000
    # One below the limit still compiles; the subject is far too short to
    # satisfy it as a minimum, and is swallowed whole when it is a maximum.
    below = MAXREPEAT - 1
    self.assertIsNone(re.match(r".{%d}" % below, subject))
    capped = re.match(r".{,%d}" % below, subject)
    self.assertEqual(capped.span(), (0, 100000))
    self.assertIsNone(re.match(r".{%d,}?" % below, subject))
    # The limit itself is rejected at compile time.
    for template in (r".{%d}", r".{,%d}", r".{%d,}?"):
        with self.assertRaises(OverflowError):
            re.compile(template % MAXREPEAT)
1207
R David Murray26dfaac92013-04-14 13:00:54 -04001208 def test_backref_group_name_in_exception(self):
1209 # Issue 17341: Poor error message when compiling invalid regex
1210 with self.assertRaisesRegex(sre_constants.error, '<foo>'):
1211 re.compile('(?P=<foo>)')
1212
1213 def test_group_name_in_exception(self):
1214 # Issue 17341: Poor error message when compiling invalid regex
1215 with self.assertRaisesRegex(sre_constants.error, '\?foo'):
1216 re.compile('(?P<?foo>)')
1217
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001218 def test_issue17998(self):
1219 for reps in '*', '+', '?', '{1}':
1220 for mod in '', '?':
1221 pattern = '.' + reps + mod + 'yz'
1222 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1223 ['xyz'], msg=pattern)
1224 pattern = pattern.encode()
1225 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1226 [b'xyz'], msg=pattern)
1227
def test_match_repr(self):
    def described(match, details):
        # Expected repr of *match*: its own module and class name,
        # followed by the span/match details.  Always derived from the
        # match being checked, never from a different one.
        kind = type(match)
        return "<%s.%s object; %s>" % (
            kind.__module__, kind.__qualname__, details)

    # str subjects, including a str subclass.
    for subject in '[abracadabra]', S('[abracadabra]'):
        m = re.search(r'(.+)(.*?)\1', subject)
        self.assertEqual(
            repr(m), described(m, "span=(1, 12), match='abracadabra'"))
    # bytes subjects, a bytes subclass and other bytes-like objects.
    for subject in (b'[abracadabra]', B(b'[abracadabra]'),
                    bytearray(b'[abracadabra]'),
                    memoryview(b'[abracadabra]')):
        m = re.search(rb'(.+)(.*?)\1', subject)
        self.assertEqual(
            repr(m), described(m, "span=(1, 12), match=b'abracadabra'"))

    # Two matches from one iterator: each repr reports its own span/text.
    first, second = re.finditer("(aa)|(bb)", "aa bb")
    self.assertEqual(
        repr(first), described(first, "span=(0, 2), match='aa'"))
    self.assertEqual(
        repr(second), described(second, "span=(3, 5), match='bb'"))
1249
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001250
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001251 def test_bug_2537(self):
1252 # issue 2537: empty submatches
1253 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1254 for inner_op in ('{0,}', '*', '?'):
1255 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1256 m = r.match("xyyzy")
1257 self.assertEqual(m.group(0), "xyy")
1258 self.assertEqual(m.group(1), "")
1259 self.assertEqual(m.group(2), "y")
1260
def test_debug_flag(self):
    pat = r'(\.)(?:[ch]|py)(?(1)$|: )'

    def debug_output():
        # Compile with re.DEBUG and return everything written to stdout.
        with captured_stdout() as out:
            re.compile(pat, re.DEBUG)
        return out.getvalue()

    expected = '''\
subpattern 1
  literal 46
subpattern None
  branch
    in
      literal 99
      literal 104
  or
    literal 112
    literal 121
subpattern None
  groupref_exists 1
    at at_end
  else
    literal 58
    literal 32
'''
    self.assertEqual(debug_output(), expected)
    # Debug output is output again even a second time (bypassing
    # the cache -- issue #20426).
    self.assertEqual(debug_output(), expected)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001289
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001290 def test_keyword_parameters(self):
1291 # Issue #20283: Accepting the string keyword parameter.
1292 pat = re.compile(r'(ab)')
1293 self.assertEqual(
1294 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1295 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001296 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1297 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001298 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1299 self.assertEqual(
1300 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1301 self.assertEqual(
1302 pat.split(string='abracadabra', maxsplit=1),
1303 ['', 'ab', 'racadabra'])
1304 self.assertEqual(
1305 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1306 (7, 9))
1307
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001308 def test_bug_20998(self):
1309 # Issue #20998: Fullmatch of repeated single character pattern
1310 # with ignore case.
1311 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1312
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02001313 def test_locale_caching(self):
1314 # Issue #22410
1315 oldlocale = locale.setlocale(locale.LC_CTYPE)
1316 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1317 for loc in 'en_US.iso88591', 'en_US.utf8':
1318 try:
1319 locale.setlocale(locale.LC_CTYPE, loc)
1320 except locale.Error:
1321 # Unsupported locale on this system
1322 self.skipTest('test needs %s locale' % loc)
1323
1324 re.purge()
1325 self.check_en_US_iso88591()
1326 self.check_en_US_utf8()
1327 re.purge()
1328 self.check_en_US_utf8()
1329 self.check_en_US_iso88591()
1330
def check_en_US_iso88591(self):
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    # (pattern, subject) pairs: bytes 0xC5 and 0xE5 must match each other
    # case-insensitively under this locale.
    pairs = (
        (b'\xc5\xe5', b'\xc5\xe5'),
        (b'\xc5', b'\xe5'),
        (b'\xe5', b'\xc5'),
    )
    # Flags given as an argument ...
    for pattern, subject in pairs:
        self.assertTrue(re.match(pattern, subject, re.L|re.I))
    # ... and the same flags given inline.
    for pattern, subject in pairs:
        self.assertTrue(re.match(b'(?Li)' + pattern, subject))
1339
def check_en_US_utf8(self):
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    identical = (b'\xc5\xe5', b'\xc5\xe5')
    # Under this locale bytes 0xC5 and 0xE5 must NOT match each other,
    # even case-insensitively.
    crossed = ((b'\xc5', b'\xe5'), (b'\xe5', b'\xc5'))
    # Flags given as an argument ...
    self.assertTrue(re.match(identical[0], identical[1], re.L|re.I))
    for pattern, subject in crossed:
        self.assertIsNone(re.match(pattern, subject, re.L|re.I))
    # ... and the same flags given inline.
    self.assertTrue(re.match(b'(?Li)' + identical[0], identical[1]))
    for pattern, subject in crossed:
        self.assertIsNone(re.match(b'(?Li)' + pattern, subject))
1348
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001349
class PatternReprTests(unittest.TestCase):
    """
    Test the repr() of compiled pattern objects.
    """

    def check(self, pattern, expected):
        # repr() of *pattern* compiled without explicit flags.
        compiled = re.compile(pattern)
        self.assertEqual(repr(compiled), expected)

    def check_flags(self, pattern, flags, expected):
        # repr() of *pattern* compiled with the given *flags*.
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        self.check('random pattern', "re.compile('random pattern')")

    def test_single_flag(self):
        expected = "re.compile('random pattern', re.IGNORECASE)"
        self.check_flags('random pattern', re.IGNORECASE, expected)

    def test_multiple_flags(self):
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.check_flags('random pattern', re.I|re.S|re.X, expected)

    def test_unicode_flag(self):
        # re.U is left out of the repr, alone or combined.
        alone = "re.compile('random pattern')"
        self.check_flags('random pattern', re.U, alone)
        combined = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL)")
        self.check_flags('random pattern', re.I|re.S|re.U, combined)

    def test_inline_flags(self):
        # A flag set inside the pattern shows up like an explicit one.
        expected = "re.compile('(?i)pattern', re.IGNORECASE)"
        self.check('(?i)pattern', expected)

    def test_unknown_flags(self):
        # Bits without a name are shown in hex, after the named flags.
        self.check_flags('random pattern', 0x123000,
                         "re.compile('random pattern', 0x123000)")
        expected = "re.compile('random pattern', re.IGNORECASE|0x123000)"
        self.check_flags('random pattern', 0x123000|re.I, expected)

    def test_bytes(self):
        self.check(b'bytes pattern', "re.compile(b'bytes pattern')")
        expected = "re.compile(b'bytes pattern', re.ASCII)"
        self.check_flags(b'bytes pattern', re.A, expected)

    def test_quotes(self):
        # (pattern, expected repr) for each mix of quote characters.
        samples = (
            ('random "double quoted" pattern',
             '''re.compile('random "double quoted" pattern')'''),
            ("random 'single quoted' pattern",
             '''re.compile("random 'single quoted' pattern")'''),
            ('''both 'single' and "double" quotes''',
             '''re.compile('both \\'single\\' and "double" quotes')'''),
        )
        for pattern, expected in samples:
            self.check(pattern, expected)

    def test_long_pattern(self):
        # The repr of a very long pattern is truncated but keeps its
        # beginning and, when flags are present, the flags at the end.
        pattern = 'Very %spattern' % ('long ' * 1000)
        head = "re.compile('Very long long lon"

        plain = repr(re.compile(pattern))
        self.assertLess(len(plain), 300)
        self.assertEqual(plain[:30], head)

        flagged = repr(re.compile(pattern, re.I))
        self.assertLess(len(flagged), 300)
        self.assertEqual(flagged[:30], head)
        self.assertEqual(flagged[-16:], ", re.IGNORECASE)")
1410
1411
class ImplementationTest(unittest.TestCase):
    """
    Test implementation details of the re module.
    """

    def test_overlap_table(self):
        overlap_table = sre_compile._generate_overlap_table
        # (prefix, expected overlap table) pairs.
        samples = (
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        )
        for prefix, expected in samples:
            self.assertEqual(overlap_table(prefix), expected)
1425
1426
def _group_or_marker(match, key):
    # Return match.group(key) in the form the re_tests expressions expect:
    # the string "None" for a group that did not take part in the match
    # and "Error" for a group that does not exist.
    try:
        value = match.group(key)
    except IndexError:
        return "Error"
    return "None" if value is None else value


def _repl_namespace(match):
    # Build the globals in which a re_tests ``repl`` expression is
    # evaluated: 'found', 'groups' and 'flags', g1..g99 for the numbered
    # groups, plus one entry per named group of the pattern.
    namespace = {'found': match.group(0),
                 'groups': match.group(),
                 'flags': match.re.flags}
    for number in range(1, 100):
        namespace['g%d' % number] = _group_or_marker(match, number)
    for name in match.re.groupindex:
        namespace[name] = _group_or_marker(match, name)
    return namespace


def run_re_tests():
    """Run the table-driven checks from test.re_tests.

    Each entry of the table is a tuple ``(pattern, s, outcome)`` or
    ``(pattern, s, outcome, repl, expected)``.  The pattern is compiled
    and searched for in ``s``; every discrepancy with the declared outcome
    is reported by printing a line starting with '===' (or '***' for an
    unexpected exception) to stdout.  For entries expected to succeed the
    search is repeated with a bytes pattern, restricted to the extent of
    the match, and with the IGNORECASE, LOCALE and UNICODE flags.

    Raises ValueError if a table entry has neither 3 nor 5 fields.
    """
    from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
    if verbose:
        print('Running re_tests test suite')

    for t in tests:
        sys.stdout.flush()
        repl = expected = None
        if len(t) == 5:
            pattern, s, outcome, repl, expected = t
        elif len(t) == 3:
            pattern, s, outcome = t
        else:
            raise ValueError('Test tuples should have 3 or 5 fields', t)

        try:
            obj = re.compile(pattern)
        except re.error:
            if outcome != SYNTAX_ERROR:
                print('=== Syntax error:', t)
            continue
        except Exception:
            # KeyboardInterrupt and SystemExit are not Exception
            # subclasses, so they propagate with their original traceback.
            print('*** Unexpected error ***', t)
            if verbose:
                traceback.print_exc(file=sys.stdout)
            continue

        try:
            result = obj.search(s)
        except re.error as msg:
            print('=== Unexpected exception', t, repr(msg))
            # There is no search result to examine for this entry.
            continue

        if outcome == SYNTAX_ERROR:
            # This should have been a syntax error; forget it.
            continue
        if outcome == FAIL:
            if result is not None:
                print('=== Succeeded incorrectly', t)
            continue
        if outcome != SUCCEED:
            continue

        if result is None:
            print('=== Failed incorrectly', t)
        else:
            # Matched, as expected, so now we compute the result string
            # and compare it to our expected result.
            # NOTE: eval() is used on expressions taken from the
            # test.re_tests table; never feed it data from elsewhere.
            repl = eval(repl, _repl_namespace(result))
            if repl != expected:
                print('=== grouping error', t, end=' ')
                print(repr(repl) + ' should be ' + repr(expected))

        # Try the match with both pattern and string converted to
        # bytes, and check that it still succeeds.
        try:
            bpat = bytes(pattern, "ascii")
            bs = bytes(s, "ascii")
        except UnicodeEncodeError:
            # skip non-ascii tests
            pass
        else:
            try:
                bpat = re.compile(bpat)
            except Exception:
                print('=== Fails on bytes pattern compile', t)
                if verbose:
                    traceback.print_exc(file=sys.stdout)
            else:
                if bpat.search(bs) is None:
                    print('=== Fails on bytes pattern match', t)

        # Try the match with the search area limited to the extent
        # of the match and see if it still succeeds.  \B will
        # break (because it won't match at the end or start of a
        # string), so we'll ignore patterns that feature it.
        if (pattern[:2] != '\\B' and pattern[-2:] != '\\B'
                and result is not None):
            result = obj.search(s, result.start(0), result.end(0) + 1)
            if result is None:
                print('=== Failed on range-limited match', t)

        # Try the match with IGNORECASE, LOCALE (unless the pattern asks
        # for unicode matching itself) and UNICODE enabled, in that order,
        # and check that it still succeeds each time.
        flag_checks = [(re.IGNORECASE, '=== Fails on case-insensitive match')]
        if '(?u)' not in pattern:
            flag_checks.append(
                (re.LOCALE, '=== Fails on locale-sensitive match'))
        flag_checks.append(
            (re.UNICODE, '=== Fails on unicode-sensitive match'))
        for flag, complaint in flag_checks:
            if re.compile(pattern, flag).search(s) is None:
                print(complaint, t)
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001553
Gregory P. Smith5a631832010-07-27 05:31:29 +00001554
def test_main():
    """Run this module's unittest cases, then the run_re_tests() checks."""
    run_unittest(__name__)
    run_re_tests()
Skip Montanaro8ed06da2003-04-24 19:43:18 +00001558
# Run the complete suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()