blob: d2547d4a96c62745c86550a34a0069aeca07f22e [file] [log] [blame]
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
Antoine Pitroud2cc7432014-02-03 20:59:59 +01002 cpython_only, captured_stdout
Benjamin Petersone48944b2012-03-07 14:50:25 -06003import io
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02004import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00005import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00006from re import Scanner
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02007import sre_compile
R David Murray26dfaac92013-04-14 13:00:54 -04008import sre_constants
Ezio Melottid2114eb2011-03-25 14:08:44 +02009import sys
10import string
11import traceback
Antoine Pitrou79aa68d2013-10-25 21:36:10 +020012import unittest
Raymond Hettinger027bb632004-05-31 03:09:25 +000013from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000014
Guido van Rossum23b22571997-07-17 22:36:14 +000015# Misc tests from Tim Peters' re.doc
16
Just van Rossum6802c6e2003-07-02 14:36:59 +000017# WARNING: Don't change details in these tests if you don't know
Ezio Melotti42da6632011-03-15 05:18:48 +020018# what you're doing. Some of these tests were carefully modeled to
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +000019# cover most of the code.
20
class S(str):
    """str subclass whose indexing and slicing results are wrapped in S again."""

    def __getitem__(self, index):
        piece = str.__getitem__(self, index)
        return S(piece)
24
class B(bytes):
    """bytes subclass whose indexing and slicing results are wrapped in B again."""

    def __getitem__(self, index):
        piece = bytes.__getitem__(self, index)
        return B(piece)
28
Skip Montanaro8ed06da2003-04-24 19:43:18 +000029class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000030
def assertTypedEqual(self, actual, expect, msg=None):
    """Assert that *actual* equals *expect* and that their types are identical.

    Equality is checked first on the whole values.  Then, wherever *expect*
    holds a tuple or a list, both values are walked in step and the exact
    type of every non-container item of *actual* must be the type of the
    matching item of *expect*.  *msg* is passed on to every assertion.
    """
    self.assertEqual(actual, expect, msg)

    def leaf_pairs(got, want):
        # Descend into tuples/lists of the expected value, yielding the
        # pairs of items that are not themselves containers.
        if isinstance(want, (tuple, list)):
            for got_item, want_item in zip(got, want):
                yield from leaf_pairs(got_item, want_item)
        else:
            yield got, want

    for got, want in leaf_pairs(actual, expect):
        self.assertIs(type(got), type(want), msg)
40
Benjamin Petersone48944b2012-03-07 14:50:25 -060041 def test_keep_buffer(self):
42 # See bug 14212
43 b = bytearray(b'x')
44 it = re.finditer(b'a', b)
45 with self.assertRaises(BufferError):
46 b.extend(b'x'*400)
47 list(it)
48 del it
49 gc_collect()
50 b.extend(b'x'*400)
51
Raymond Hettinger027bb632004-05-31 03:09:25 +000052 def test_weakref(self):
53 s = 'QabbbcR'
54 x = re.compile('ab+c')
55 y = proxy(x)
56 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
57
Skip Montanaro8ed06da2003-04-24 19:43:18 +000058 def test_search_star_plus(self):
59 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
60 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
61 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
62 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030063 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000064 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
65 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
66 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
67 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030068 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000069
def bump_num(self, matchobj):
    """Return, as a string, the integer matched by *matchobj* plus one."""
    return str(int(matchobj.group(0)) + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000073
def test_basic_re_sub(self):
    # The result type must follow the plain str/bytes type even when the
    # arguments are subclasses or other buffer objects.
    self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
    self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
    self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
    self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
    for y in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')

    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    # A callable replacement receives each match object.
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
                     '9.3 -3 23x99y')

    # A string returned by a callable is used verbatim; a template string
    # has its escapes processed.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    # Raw strings: "\g" is not a valid string-literal escape, so the
    # templates are spelled raw to keep the backslash explicit.
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D', 'a'),
                     '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
    # The same control characters, expected once as escapes and once as
    # explicit code points.
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))

    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000110
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000111 def test_bug_449964(self):
112 # fails for group followed by other escape
113 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
114 'xx\bxx\b')
115
116 def test_bug_449000(self):
117 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000118 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
119 'abc\ndef\n')
120 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
121 'abc\ndef\n')
122 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
123 'abc\ndef\n')
124 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
125 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000126
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000127 def test_bug_1661(self):
128 # Verify that flags do not get silently ignored with compiled patterns
129 pattern = re.compile('.')
130 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
131 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
132 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
133 self.assertRaises(ValueError, re.compile, pattern, re.I)
134
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000135 def test_bug_3629(self):
136 # A regex that triggered a bug in the sre-code validator
137 re.compile("(?P<quote>)(?(quote))")
138
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000139 def test_sub_template_numeric_escape(self):
140 # bug 776311 and friends
141 self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
142 self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
143 self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
144 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
145 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
146 self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
147 self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
148
149 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
150 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
151
152 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
153 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
154 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
155 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
156 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
157
158 self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
159 self.assertEqual(re.sub('x', r'\777', 'x'), '\377')
Tim Peters0e9980f2004-09-12 03:49:31 +0000160
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000161 self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
162 self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
163 self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
164 self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
165 self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
166 self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
167 self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
168 self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
169 self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
170 self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
171 self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
172 self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
173
174 # in python2.3 (etc), these loop endlessly in sre_parser.py
175 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
176 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
177 'xz8')
178 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
179 'xza')
180
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000181 def test_qualified_re_sub(self):
182 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
183 self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000184
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000185 def test_bug_114660(self):
186 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
187 'hello there')
188
189 def test_bug_462270(self):
190 # Test for empty sub() behaviour, see SF bug #462270
191 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
192 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
193
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200194 def test_symbolic_groups(self):
195 re.compile('(?P<a>x)(?P=a)(?(a)y)')
196 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
197 self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
198 self.assertRaises(re.error, re.compile, '(?Px)')
199 self.assertRaises(re.error, re.compile, '(?P=)')
200 self.assertRaises(re.error, re.compile, '(?P=1)')
201 self.assertRaises(re.error, re.compile, '(?P=a)')
202 self.assertRaises(re.error, re.compile, '(?P=a1)')
203 self.assertRaises(re.error, re.compile, '(?P=a.)')
204 self.assertRaises(re.error, re.compile, '(?P<)')
205 self.assertRaises(re.error, re.compile, '(?P<>)')
206 self.assertRaises(re.error, re.compile, '(?P<1>)')
207 self.assertRaises(re.error, re.compile, '(?P<a.>)')
208 self.assertRaises(re.error, re.compile, '(?())')
209 self.assertRaises(re.error, re.compile, '(?(a))')
210 self.assertRaises(re.error, re.compile, '(?(1a))')
211 self.assertRaises(re.error, re.compile, '(?(a.))')
Georg Brandl1d472b72013-04-14 11:40:00 +0200212 # New valid/invalid identifiers in Python 3
213 re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
214 re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
215 self.assertRaises(re.error, re.compile, '(?P<©>x)')
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200216
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000217 def test_symbolic_refs(self):
218 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
219 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
220 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
221 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200222 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<>', 'xx')
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000223 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
224 self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
225 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
226 self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000227 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
Georg Brandl1d472b72013-04-14 11:40:00 +0200228 # New valid/invalid identifiers in Python 3
229 self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
230 self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
231 self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000232
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000233 def test_re_subn(self):
234 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
235 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
236 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
237 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
238 self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000239
def test_re_split(self):
    # str-like subjects: the pieces must be plain str objects.
    str_cases = (
        (":", ['', 'a', 'b', '', 'c']),
        (":*", ['', 'a', 'b', 'c']),
        ("(:*)", ['', ':', 'a', ':', 'b', '::', 'c']),
    )
    for subject in ":a:b::c", S(":a:b::c"):
        for pattern, pieces in str_cases:
            self.assertTypedEqual(re.split(pattern, subject), pieces)

    # bytes-like subjects: the pieces must be plain bytes objects.
    bytes_cases = (
        (b":", [b'', b'a', b'b', b'', b'c']),
        (b":*", [b'', b'a', b'b', b'c']),
        (b"(:*)", [b'', b':', b'a', b':', b'b', b'::', b'c']),
    )
    for subject in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                    memoryview(b":a:b::c")):
        for pattern, pieces in bytes_cases:
            self.assertTypedEqual(re.split(pattern, subject), pieces)

    # The same splits with non-ASCII fields of three different widths.
    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                    "\U0001d49c\U0001d49e\U0001d4b5"):
        subject = ":%s:%s::%s" % (a, b, c)
        self.assertEqual(re.split(":", subject), ['', a, b, '', c])
        self.assertEqual(re.split(":*", subject), ['', a, b, c])
        self.assertEqual(re.split("(:*)", subject),
                         ['', ':', a, ':', b, '::', c])

    grouping_cases = (
        ("(?::*)", ['', 'a', 'b', 'c']),
        ("(:)*", ['', ':', 'a', ':', 'b', ':', 'c']),
        ("([b:]+)", ['', ':', 'a', ':b::', 'c']),
        ("(b)|(:+)", ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c']),
        ("(?:b)|(?::+)", ['', 'a', '', '', 'c']),
    )
    for pattern, pieces in grouping_cases:
        self.assertEqual(re.split(pattern, ":a:b::c"), pieces)
Guido van Rossum49946571997-07-18 04:26:25 +0000274
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000275 def test_qualified_re_split(self):
276 self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
277 self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
278 self.assertEqual(re.split("(:)", ":a:b::c", 2),
279 ['', ':', 'a', ':', 'b::c'])
280 self.assertEqual(re.split("(:*)", ":a:b::c", 2),
281 ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000282
def test_re_findall(self):
    self.assertEqual(re.findall(":+", "abc"), [])

    # str-like subjects: the results must be plain str objects.
    str_cases = (
        (":+", [":", "::", ":::"]),
        ("(:+)", [":", "::", ":::"]),
        ("(:)(:*)", [(":", ""), (":", ":"), (":", "::")]),
    )
    for subject in "a:b::c:::d", S("a:b::c:::d"):
        for pattern, found in str_cases:
            self.assertTypedEqual(re.findall(pattern, subject), found)

    # bytes-like subjects: the results must be plain bytes objects.
    bytes_cases = (
        (b":+", [b":", b"::", b":::"]),
        (b"(:+)", [b":", b"::", b":::"]),
        (b"(:)(:*)", [(b":", b""), (b":", b":"), (b":", b"::")]),
    )
    for subject in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
                    memoryview(b"a:b::c:::d")):
        for pattern, found in bytes_cases:
            self.assertTypedEqual(re.findall(pattern, subject), found)

    # The same searches with a non-ASCII separator of three different widths.
    for x in ("\xe0", "\u0430", "\U0001d49c"):
        xx, xxx = x * 2, x * 3
        subject = "a%sb%sc%sd" % (x, xx, xxx)
        self.assertEqual(re.findall("%s+" % x, subject), [x, xx, xxx])
        self.assertEqual(re.findall("(%s+)" % x, subject), [x, xx, xxx])
        self.assertEqual(re.findall("(%s)(%s*)" % (x, x), subject),
                         [(x, ""), (x, x), (x, xx)])
Guido van Rossum49946571997-07-18 04:26:25 +0000308
Skip Montanaro5ba00542003-04-25 16:00:14 +0000309 def test_bug_117612(self):
310 self.assertEqual(re.findall(r"(a|(b))", "aba"),
311 [("a", ""),("b", "b"),("a", "")])
312
def test_re_match(self):
    # str-like subjects.
    for string in 'a', S('a'):
        self.assertEqual(re.match('a', string).groups(), ())
        self.assertEqual(re.match('(a)', string).groups(), ('a',))
        self.assertEqual(re.match('(a)', string).group(0), 'a')
        self.assertEqual(re.match('(a)', string).group(1), 'a')
        self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a'))
    # bytes-like subjects.
    for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
        self.assertEqual(re.match(b'a', string).groups(), ())
        self.assertEqual(re.match(b'(a)', string).groups(), (b'a',))
        self.assertEqual(re.match(b'(a)', string).group(0), b'a')
        self.assertEqual(re.match(b'(a)', string).group(1), b'a')
        self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
    # Non-ASCII characters of three different widths.
    for a in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.match(a, a).groups(), ())
        self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
        self.assertEqual(re.match('(%s)' % a, a).group(0), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))

    # Groups that did not take part are None, or the default given to
    # groups().
    pat = re.compile('((a)|(b))(c)?')
    self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
    self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
    self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
    self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
    self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

    # A single group
    m = re.match('(a)', 'a')
    # group() with no argument and group(0) must both give the whole
    # match; the original asserted group(0) twice and never group().
    self.assertEqual(m.group(), 'a')
    self.assertEqual(m.group(0), 'a')
    self.assertEqual(m.group(1), 'a')
    self.assertEqual(m.group(1, 1), ('a', 'a'))

    # Groups may be addressed by number, by name, or by a mix of both.
    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                     (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000352
def test_re_fullmatch(self):
    # Issue 16203: Proposal: add re.fullmatch() method.
    self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
    for subject in "ab", S("ab"):
        self.assertEqual(re.fullmatch(r"a|ab", subject).span(), (0, 2))
    for subject in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
        self.assertEqual(re.fullmatch(br"a|ab", subject).span(), (0, 2))
    for first, second in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
        both = first + second
        alternation = r"%s|%s" % (first, both)
        self.assertEqual(re.fullmatch(alternation, both).span(), (0, 2))

    # Patterns that must stretch over the whole subject.
    for pattern, subject in ((r".*?$", "abc"),
                             (r".*?", "abc"),
                             (r"a.*?b", "ab"),
                             (r"a.*?b", "abb"),
                             (r"a.*?b", "axxb")):
        self.assertEqual(re.fullmatch(pattern, subject).span(),
                         (0, len(subject)))

    # Patterns that cannot cover the whole subject.
    for pattern, subject in ((r"a+", "ab"),
                             (r"abc$", "abc\n"),
                             (r"abc\Z", "abc\n"),
                             (r"(?m)abc$", "abc\n")):
        self.assertIsNone(re.fullmatch(pattern, subject))

    # Lookahead and lookbehind assertions.
    for pattern, subject in ((r"ab(?=c)cd", "abcd"),
                             (r"ab(?<=b)cd", "abcd"),
                             (r"(?=a|ab)ab", "ab")):
        self.assertEqual(re.fullmatch(pattern, subject).span(),
                         (0, len(subject)))

    # pos/endpos restrict what counts as the whole subject.
    for pattern in (r"bc", r".*?$", r".*?"):
        compiled = re.compile(pattern)
        self.assertEqual(
            compiled.fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
382
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000383 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000384 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
385 ('(', 'a'))
386 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
387 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300388 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
389 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000390 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
391 ('a', 'b'))
392 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
393 (None, 'd'))
394 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
395 (None, 'd'))
396 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
397 ('a', ''))
398
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000399 # Tests for bug #1177831: exercise groups other than the first group
400 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
401 self.assertEqual(p.match('abc').groups(),
402 ('a', 'b', 'c'))
403 self.assertEqual(p.match('ad').groups(),
404 ('a', None, 'd'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300405 self.assertIsNone(p.match('abd'))
406 self.assertIsNone(p.match('ac'))
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000407
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000408
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000409 def test_re_groupref(self):
410 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
411 ('|', 'a'))
412 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
413 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300414 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
415 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000416 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
417 ('a', 'a'))
418 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
419 (None, None))
420
421 def test_groupdict(self):
422 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
423 'first second').groupdict(),
424 {'first':'first', 'second':'second'})
425
426 def test_expand(self):
427 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
428 "first second")
429 .expand(r"\2 \1 \g<second> \g<first>"),
430 "second first second first")
431
432 def test_repeat_minmax(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300433 self.assertIsNone(re.match("^(\w){1}$", "abc"))
434 self.assertIsNone(re.match("^(\w){1}?$", "abc"))
435 self.assertIsNone(re.match("^(\w){1,2}$", "abc"))
436 self.assertIsNone(re.match("^(\w){1,2}?$", "abc"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000437
438 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
439 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
440 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
441 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
442 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
443 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
444 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
445 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
446
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300447 self.assertIsNone(re.match("^x{1}$", "xxx"))
448 self.assertIsNone(re.match("^x{1}?$", "xxx"))
449 self.assertIsNone(re.match("^x{1,2}$", "xxx"))
450 self.assertIsNone(re.match("^x{1,2}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000451
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300452 self.assertTrue(re.match("^x{3}$", "xxx"))
453 self.assertTrue(re.match("^x{1,3}$", "xxx"))
454 self.assertTrue(re.match("^x{1,4}$", "xxx"))
455 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
456 self.assertTrue(re.match("^x{3}?$", "xxx"))
457 self.assertTrue(re.match("^x{1,3}?$", "xxx"))
458 self.assertTrue(re.match("^x{1,4}?$", "xxx"))
459 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000460
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300461 self.assertIsNone(re.match("^x{}$", "xxx"))
462 self.assertTrue(re.match("^x{}$", "x{}"))
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000463
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000464 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000465 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000466 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000467 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
468 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
469 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
470 {'first': 1, 'other': 2})
471
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000472 self.assertEqual(re.match("(a)", "a").pos, 0)
473 self.assertEqual(re.match("(a)", "a").endpos, 1)
474 self.assertEqual(re.match("(a)", "a").string, "a")
475 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300476 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000477
478 def test_special_escapes(self):
479 self.assertEqual(re.search(r"\b(b.)\b",
480 "abcd abc bcd bx").group(1), "bx")
481 self.assertEqual(re.search(r"\B(b.)\B",
482 "abc bcd bc abxd").group(1), "bx")
483 self.assertEqual(re.search(r"\b(b.)\b",
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300484 "abcd abc bcd bx", re.ASCII).group(1), "bx")
485 self.assertEqual(re.search(r"\B(b.)\B",
486 "abc bcd bc abxd", re.ASCII).group(1), "bx")
487 self.assertEqual(re.search(r"\b(b.)\b",
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000488 "abcd abc bcd bx", re.LOCALE).group(1), "bx")
489 self.assertEqual(re.search(r"\B(b.)\B",
490 "abc bcd bc abxd", re.LOCALE).group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000491 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
492 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300493 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300494 self.assertEqual(re.search(br"\b(b.)\b",
495 b"abcd abc bcd bx").group(1), b"bx")
496 self.assertEqual(re.search(br"\B(b.)\B",
497 b"abc bcd bc abxd").group(1), b"bx")
498 self.assertEqual(re.search(br"\b(b.)\b",
499 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
500 self.assertEqual(re.search(br"\B(b.)\B",
501 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
502 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
503 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300504 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000505 self.assertEqual(re.search(r"\d\D\w\W\s\S",
506 "1aa! a").group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300507 self.assertEqual(re.search(br"\d\D\w\W\s\S",
508 b"1aa! a").group(0), b"1aa! a")
509 self.assertEqual(re.search(r"\d\D\w\W\s\S",
510 "1aa! a", re.ASCII).group(0), "1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000511 self.assertEqual(re.search(r"\d\D\w\W\s\S",
512 "1aa! a", re.LOCALE).group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300513 self.assertEqual(re.search(br"\d\D\w\W\s\S",
514 b"1aa! a", re.LOCALE).group(0), b"1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000515
Ezio Melotti5a045b92012-02-29 11:48:44 +0200516 def test_string_boundaries(self):
517 # See http://bugs.python.org/issue10713
518 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
519 "abc")
520 # There's a word boundary at the start of a string.
521 self.assertTrue(re.match(r"\b", "abc"))
522 # A non-empty string includes a non-boundary zero-length match.
523 self.assertTrue(re.search(r"\B", "abc"))
524 # There is no non-boundary match at the start of a string.
525 self.assertFalse(re.match(r"\B", "abc"))
526 # However, an empty string contains no word boundaries, and also no
527 # non-boundaries.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300528 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200529 # This one is questionable and different from the perlre behaviour,
530 # but describes current behavior.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300531 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200532 # A single word-character string has two boundaries, but no
533 # non-boundary gaps.
534 self.assertEqual(len(re.findall(r"\b", "a")), 2)
535 self.assertEqual(len(re.findall(r"\B", "a")), 0)
536 # If there are no words, there are no boundaries
537 self.assertEqual(len(re.findall(r"\b", " ")), 0)
538 self.assertEqual(len(re.findall(r"\b", " ")), 0)
539 # Can match around the whitespace.
540 self.assertEqual(len(re.findall(r"\B", " ")), 2)
541
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000542 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000543 self.assertEqual(re.match("([\u2222\u2223])",
544 "\u2222").group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300545 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300546 self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000547
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100548 def test_big_codesize(self):
549 # Issue #1160
550 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300551 self.assertTrue(r.match('1000'))
552 self.assertTrue(r.match('9999'))
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100553
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000554 def test_anyall(self):
555 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
556 "a\nb")
557 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
558 "a\n\nb")
559
Benjamin Peterson66323412014-11-30 11:49:00 -0500560 def test_non_consuming(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000561 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
562 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
563 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
564 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
565 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
566 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
567 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
568
569 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
570 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
571 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
572 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
573
574 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000575 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300576 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000577 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
578 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
579 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
580 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
581 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
582 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
583 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
584 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
585
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200586 assert '\u212a'.lower() == 'k' # 'K'
587 self.assertTrue(re.match(r'K', '\u212a', re.I))
588 self.assertTrue(re.match(r'k', '\u212a', re.I))
589 self.assertTrue(re.match(r'\u212a', 'K', re.I))
590 self.assertTrue(re.match(r'\u212a', 'k', re.I))
591 assert '\u017f'.upper() == 'S' # 'ſ'
592 self.assertTrue(re.match(r'S', '\u017f', re.I))
593 self.assertTrue(re.match(r's', '\u017f', re.I))
594 self.assertTrue(re.match(r'\u017f', 'S', re.I))
595 self.assertTrue(re.match(r'\u017f', 's', re.I))
596 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
597 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
598 self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
599
600 def test_ignore_case_set(self):
601 self.assertTrue(re.match(r'[19A]', 'A', re.I))
602 self.assertTrue(re.match(r'[19a]', 'a', re.I))
603 self.assertTrue(re.match(r'[19a]', 'A', re.I))
604 self.assertTrue(re.match(r'[19A]', 'a', re.I))
605 self.assertTrue(re.match(br'[19A]', b'A', re.I))
606 self.assertTrue(re.match(br'[19a]', b'a', re.I))
607 self.assertTrue(re.match(br'[19a]', b'A', re.I))
608 self.assertTrue(re.match(br'[19A]', b'a', re.I))
609 assert '\u212a'.lower() == 'k' # 'K'
610 self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
611 self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
612 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
613 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
614 assert '\u017f'.upper() == 'S' # 'ſ'
615 self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
616 self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
617 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
618 self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
619 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
620 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
621 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
622
Serhiy Storchakab1847e72014-10-31 12:37:50 +0200623 def test_ignore_case_range(self):
624 # Issues #3511, #17381.
625 self.assertTrue(re.match(r'[9-a]', '_', re.I))
626 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
627 self.assertTrue(re.match(br'[9-a]', b'_', re.I))
628 self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
629 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
630 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
631 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
632 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
633 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
634 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
635 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
636 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
637 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
638 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
639 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
640 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
641
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200642 assert '\u212a'.lower() == 'k' # 'K'
643 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
644 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
645 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
646 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
647 assert '\u017f'.upper() == 'S' # 'ſ'
648 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
649 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
650 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
651 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
652 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
653 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
654 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
655
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000656 def test_category(self):
657 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
658
def test_getlower(self):
    # Call the private _sre.getlower() helper directly with the code
    # point of 'A' and each of the flag values 0, re.LOCALE and
    # re.UNICODE; every call is expected to return the code point of 'a'.
    # NOTE(review): _sre is a private accelerator module; whether
    # getlower() exists on every interpreter this file targets is not
    # visible from here -- confirm.
    import _sre
    self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
    self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
    self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))

    # The same lowering seen through the public API, for str and bytes.
    self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
    self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000667
668 def test_not_literal(self):
669 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
670 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
671
672 def test_search_coverage(self):
673 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
674 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
675
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    # Assert that matcher(pattern, text) succeeds and that the matched
    # text and span equal `match` and `span`.  When both are omitted the
    # pattern is expected to match the whole of `text`; supplying only
    # one of them raises ValueError.
    match_given = match is not None
    span_given = span is not None
    if match_given != span_given:
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if not match_given:
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    result = matcher(pattern, text)
    self.assertTrue(result)
    self.assertEqual(result.group(), match)
    self.assertEqual(result.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000689
def test_re_escape(self):
    # For every character in range(256): ASCII alphanumerics and "_"
    # are expected unchanged, NUL as "\000", anything else prefixed with
    # a backslash.  Each escaped form must also match its source.
    unescaped = string.ascii_letters + string.digits + '_'
    all_chars = ''.join(map(chr, range(256)))
    for char in all_chars:
        if char in unescaped:
            expected = char
        elif char == '\x00':
            expected = '\\000'
        else:
            expected = '\\' + char
        self.assertEqual(re.escape(char), expected)
        self.assertMatch(re.escape(char), char)
    self.assertMatch(re.escape(all_chars), all_chars)
Guido van Rossum49946571997-07-18 04:26:25 +0000702
def test_re_escape_byte(self):
    # Bytes counterpart of the str escape test: every value in
    # range(256) is escaped as a one-byte bytes object.
    unescaped = (string.ascii_letters + string.digits + '_').encode('ascii')
    all_bytes = bytes(range(256))
    for value in all_bytes:
        single = bytes([value])
        if single in unescaped:
            expected = single
        elif value == 0:
            expected = b'\\000'
        else:
            expected = b'\\' + single
        self.assertEqual(re.escape(single), expected)
        self.assertMatch(re.escape(single), single)
    self.assertMatch(re.escape(all_bytes), all_bytes)
Guido van Rossum698280d2008-09-10 17:44:35 +0000716
def test_re_escape_non_ascii(self):
    # Escaping a str that contains U+2620: each such character is
    # expected to gain a backslash, and the result must match the source.
    text = 'xxx\u2620\u2620\u2620xxx'
    escaped_text = re.escape(text)
    self.assertEqual(escaped_text, 'xxx\\\u2620\\\u2620\\\u2620xxx')
    self.assertMatch(escaped_text, text)
    # The escaped character must remain one repeatable unit.
    repeated = '.%s+.' % re.escape('\u2620')
    self.assertMatch(repeated, text,
                     'x\u2620\u2620\u2620x', (2, 7), re.search)
724
def test_re_escape_non_ascii_bytes(self):
    # Escaping the UTF-8 encoding of a str that contains U+2620: every
    # byte of the encoded character is expected to gain a backslash.
    data = 'y\u2620y\u2620y'.encode('utf-8')
    escaped_data = re.escape(data)
    self.assertEqual(escaped_data,
                     b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
    self.assertMatch(escaped_data, data)
    encoded_char = '\u2620'.encode('utf-8')
    occurrences = re.findall(re.escape(encoded_char), data)
    self.assertEqual(len(occurrences), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000732
Serhiy Storchakab85a9762014-09-15 11:33:19 +0300733 def test_pickling(self):
734 import pickle
735 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
736 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
737 pickled = pickle.dumps(oldpat, proto)
738 newpat = pickle.loads(pickled)
739 self.assertEqual(newpat, oldpat)
740 # current pickle expects the _compile() reconstructor in re module
741 from re import _compile
Guido van Rossum23b22571997-07-17 22:36:14 +0000742
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000743 def test_constants(self):
744 self.assertEqual(re.I, re.IGNORECASE)
745 self.assertEqual(re.L, re.LOCALE)
746 self.assertEqual(re.M, re.MULTILINE)
747 self.assertEqual(re.S, re.DOTALL)
748 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000749
def test_flags(self):
    # Compile the same str pattern once per flag; each call is expected
    # to return a truthy pattern object.
    # NOTE(review): re.L is passed together with a str pattern here;
    # whether every interpreter this file targets accepts that
    # combination is not visible from this block -- confirm.
    for flag in [re.I, re.M, re.X, re.S, re.L]:
        self.assertTrue(re.compile('^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +0000753
def test_sre_character_literals(self):
    # Numeric character escapes in str patterns, outside a character
    # class.  For each sample code point the pattern is built by
    # %-formatting the code point into an octal, \x, \u or \U escape,
    # alone and followed by one more character, and matched against the
    # corresponding text.
    for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        if i < 256:
            # Three-digit octal and two-digit hex forms are only built
            # for values below 256.
            self.assertTrue(re.match(r"\%03o" % i, chr(i)))
            self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
            self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
            self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
            self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
            self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
        if i < 0x10000:
            # Four-digit \u form is only built for values below 0x10000.
            self.assertTrue(re.match(r"\u%04x" % i, chr(i)))
            self.assertTrue(re.match(r"\u%04x0" % i, chr(i)+"0"))
            self.assertTrue(re.match(r"\u%04xz" % i, chr(i)+"z"))
        # Eight-digit \U form is built for every sample.
        self.assertTrue(re.match(r"\U%08x" % i, chr(i)))
        self.assertTrue(re.match(r"\U%08x0" % i, chr(i)+"0"))
        self.assertTrue(re.match(r"\U%08xz" % i, chr(i)+"z"))
    # Octal escapes starting with 0 and shorter than three digits.
    self.assertTrue(re.match(r"\0", "\000"))
    self.assertTrue(re.match(r"\08", "\0008"))
    self.assertTrue(re.match(r"\01", "\001"))
    self.assertTrue(re.match(r"\018", "\0018"))
    # NOTE(review): this expects "\567" to match chr(0o167); whether
    # that holds on every targeted interpreter is not visible here --
    # confirm.
    self.assertTrue(re.match(r"\567", chr(0o167)))
    # Malformed or out-of-range escapes must raise re.error.
    self.assertRaises(re.error, re.match, r"\911", "")
    self.assertRaises(re.error, re.match, r"\x1", "")
    self.assertRaises(re.error, re.match, r"\x1z", "")
    self.assertRaises(re.error, re.match, r"\u123", "")
    self.assertRaises(re.error, re.match, r"\u123z", "")
    self.assertRaises(re.error, re.match, r"\U0001234", "")
    self.assertRaises(re.error, re.match, r"\U0001234z", "")
    self.assertRaises(re.error, re.match, r"\U00110000", "")
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000783
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000784 def test_sre_character_class_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200785 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
786 if i < 256:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300787 self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
788 self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
789 self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
790 self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
791 self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
792 self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
793 self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
794 self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200795 if i < 0x10000:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300796 self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
797 self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
798 self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
799 self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
800 self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
801 self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
802 self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200803 self.assertRaises(re.error, re.match, r"[\911]", "")
804 self.assertRaises(re.error, re.match, r"[\x1z]", "")
805 self.assertRaises(re.error, re.match, r"[\u123z]", "")
806 self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
807 self.assertRaises(re.error, re.match, r"[\U00110000]", "")
808
def test_sre_byte_literals(self):
    # Numeric character escapes in bytes patterns, outside a character
    # class.  Patterns are %-formatted as str and then encoded, and are
    # matched against the single byte (optionally followed by one more).
    for i in [0, 8, 16, 32, 64, 127, 128, 255]:
        self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
        self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
        self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
        self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
    # NOTE(review): these two lines expect "\u" and "\U" in a bytes
    # pattern to match the plain letters; whether that holds on every
    # targeted interpreter is not visible here -- confirm.
    self.assertTrue(re.match(br"\u", b'u'))
    self.assertTrue(re.match(br"\U", b'U'))
    # Octal escapes starting with 0 and shorter than three digits.
    self.assertTrue(re.match(br"\0", b"\000"))
    self.assertTrue(re.match(br"\08", b"\0008"))
    self.assertTrue(re.match(br"\01", b"\001"))
    self.assertTrue(re.match(br"\018", b"\0018"))
    # NOTE(review): expects "\567" to match the byte 0o167 -- confirm.
    self.assertTrue(re.match(br"\567", bytes([0o167])))
    # Malformed escapes must raise re.error.
    self.assertRaises(re.error, re.match, br"\911", b"")
    self.assertRaises(re.error, re.match, br"\x1", b"")
    self.assertRaises(re.error, re.match, br"\x1z", b"")
827
def test_sre_byte_class_literals(self):
    # Numeric character escapes inside a bytes character class.  Each
    # class is %-formatted as str, encoded, and must match the single
    # byte with the sample value.
    for i in [0, 8, 16, 32, 64, 127, 128, 255]:
        self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
    # NOTE(review): these two lines expect "\u" and "\U" inside a bytes
    # class to match the plain letters; whether that holds on every
    # targeted interpreter is not visible here -- confirm.
    self.assertTrue(re.match(br"[\u]", b'u'))
    self.assertTrue(re.match(br"[\U]", b'U'))
    # Malformed escapes must raise re.error.
    self.assertRaises(re.error, re.match, br"[\911]", b"")
    self.assertRaises(re.error, re.match, br"[\x1z]", b"")
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000842
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000843 def test_bug_113254(self):
844 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
845 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
846 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
847
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000848 def test_bug_527371(self):
849 # bug described in patches 527371/672491
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300850 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000851 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
852 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
853 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
854 self.assertEqual(re.match("((a))", "a").lastindex, 1)
855
856 def test_bug_545855(self):
857 # bug 545855 -- This pattern failed to cause a compile error as it
858 # should, instead provoking a TypeError.
859 self.assertRaises(re.error, re.compile, 'foo[a-')
860
861 def test_bug_418626(self):
862 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
863 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
864 # pattern '*?' on a long string.
865 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
866 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
867 20003)
868 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000869 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +0000870 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000871 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000872
873 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000874 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000875 self.assertEqual(re.compile(pat) and 1, 1)
876
Skip Montanaro1e703c62003-04-25 15:40:28 +0000877 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000878 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +0000879 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000880 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
881 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
882 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +0000883
Serhiy Storchakafa468162013-02-16 21:23:53 +0200884 def test_unlimited_zero_width_repeat(self):
885 # Issue #9669
886 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
887 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
888 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
889 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
890 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
891 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
892
Skip Montanaro1e703c62003-04-25 15:40:28 +0000893 def test_scanner(self):
894 def s_ident(scanner, token): return token
895 def s_operator(scanner, token): return "op%s" % token
896 def s_float(scanner, token): return float(token)
897 def s_int(scanner, token): return int(token)
898
899 scanner = Scanner([
900 (r"[a-zA-Z_]\w*", s_ident),
901 (r"\d+\.\d*", s_float),
902 (r"\d+", s_int),
903 (r"=|\+|-|\*|/", s_operator),
904 (r"\s+", None),
905 ])
906
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300907 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000908
Skip Montanaro1e703c62003-04-25 15:40:28 +0000909 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
910 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
911 'op+', 'bar'], ''))
912
Skip Montanaro5ba00542003-04-25 16:00:14 +0000913 def test_bug_448951(self):
914 # bug 448951 (similar to 429357, but with single char match)
915 # (Also test greedy matches.)
916 for op in '','?','*':
917 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
918 (None, None))
919 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
920 ('a:', 'a'))
921
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000922 def test_bug_725106(self):
923 # capturing groups in alternatives in repeats
924 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
925 ('b', 'a'))
926 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
927 ('c', 'b'))
928 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
929 ('b', None))
930 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
931 ('b', None))
932 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
933 ('b', 'a'))
934 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
935 ('c', 'b'))
936 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
937 ('b', None))
938 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
939 ('b', None))
940
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000941 def test_bug_725149(self):
942 # mark_stack_base restoring before restoring marks
943 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
944 ('a', None))
945 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
946 ('a', None, None))
947
Just van Rossum12723ba2003-07-02 20:03:04 +0000948 def test_bug_764548(self):
949 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000950 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +0000951 pat = re.compile(my_unicode("abc"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300952 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +0000953
Skip Montanaro5ba00542003-04-25 16:00:14 +0000954 def test_finditer(self):
955 iter = re.finditer(r":+", "a:b::c:::d")
956 self.assertEqual([item.group(0) for item in iter],
957 [":", "::", ":::"])
958
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600959 pat = re.compile(r":+")
960 iter = pat.finditer("a:b::c:::d", 1, 10)
961 self.assertEqual([item.group(0) for item in iter],
962 [":", "::", ":::"])
963
964 pat = re.compile(r":+")
965 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
966 self.assertEqual([item.group(0) for item in iter],
967 [":", "::", ":::"])
968
969 pat = re.compile(r":+")
970 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
971 self.assertEqual([item.group(0) for item in iter],
972 [":", "::", ":::"])
973
974 pat = re.compile(r":+")
975 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
976 self.assertEqual([item.group(0) for item in iter],
977 ["::", "::"])
978
Thomas Wouters40a088d2008-03-18 20:19:54 +0000979 def test_bug_926075(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300980 self.assertIsNot(re.compile('bug_926075'),
981 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +0000982
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000983 def test_bug_931848(self):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300984 pattern = "[\u002E\u3002\uFF0E\uFF61]"
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +0000985 self.assertEqual(re.compile(pattern).split("a.b.c"),
986 ['a','b','c'])
987
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000988 def test_bug_581080(self):
989 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +0000990 self.assertEqual(next(iter).span(), (1,2))
991 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000992
993 scanner = re.compile(r"\s").scanner("a b")
994 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300995 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +0000996
997 def test_bug_817234(self):
998 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +0000999 self.assertEqual(next(iter).span(), (0, 4))
1000 self.assertEqual(next(iter).span(), (4, 4))
1001 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001002
Mark Dickinson1f268282009-07-28 17:22:36 +00001003 def test_bug_6561(self):
1004 # '\d' should match characters in Unicode category 'Nd'
1005 # (Number, Decimal Digit), but not those in 'Nl' (Number,
1006 # Letter) or 'No' (Number, Other).
1007 decimal_digits = [
1008 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1009 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1010 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1011 ]
1012 for x in decimal_digits:
1013 self.assertEqual(re.match('^\d$', x).group(0), x)
1014
1015 not_decimal_digits = [
1016 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1017 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1018 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1019 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1020 ]
1021 for x in not_decimal_digits:
1022 self.assertIsNone(re.match('^\d$', x))
1023
Guido van Rossumd8faa362007-04-27 19:54:29 +00001024 def test_empty_array(self):
1025 # SF buf 1647541
1026 import array
Guido van Rossum166746c2007-07-03 15:39:16 +00001027 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +00001028 a = array.array(typecode)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001029 self.assertIsNone(re.compile(b"bla").match(a))
Antoine Pitroufd036452008-08-19 17:56:33 +00001030 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001031
Christian Heimes072c0f12008-01-03 23:01:04 +00001032 def test_inline_flags(self):
1033 # Bug #1700
Christian Heimes2e1d0f02008-01-04 00:47:51 +00001034 upper_char = chr(0x1ea0) # Latin Capital Letter A with Dot Bellow
1035 lower_char = chr(0x1ea1) # Latin Small Letter A with Dot Bellow
Christian Heimes072c0f12008-01-03 23:01:04 +00001036
1037 p = re.compile(upper_char, re.I | re.U)
1038 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001039 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001040
1041 p = re.compile(lower_char, re.I | re.U)
1042 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001043 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001044
1045 p = re.compile('(?i)' + upper_char, re.U)
1046 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001047 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001048
1049 p = re.compile('(?i)' + lower_char, re.U)
1050 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001051 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001052
1053 p = re.compile('(?iu)' + upper_char)
1054 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001055 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001056
1057 p = re.compile('(?iu)' + lower_char)
1058 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001059 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001060
Christian Heimes25bb7832008-01-11 16:17:00 +00001061 def test_dollar_matches_twice(self):
1062 "$ matches the end of string, and just before the terminating \n"
1063 pattern = re.compile('$')
1064 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1065 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1066 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1067
1068 pattern = re.compile('$', re.MULTILINE)
1069 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1070 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1071 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1072
Antoine Pitroufd036452008-08-19 17:56:33 +00001073 def test_bytes_str_mixing(self):
1074 # Mixing str and bytes is disallowed
1075 pat = re.compile('.')
1076 bpat = re.compile(b'.')
1077 self.assertRaises(TypeError, pat.match, b'b')
1078 self.assertRaises(TypeError, bpat.match, 'b')
1079 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1080 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1081 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1082 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1083 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1084 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1085
1086 def test_ascii_and_unicode_flag(self):
1087 # String patterns
1088 for flags in (0, re.UNICODE):
1089 pat = re.compile('\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001090 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001091 pat = re.compile('\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001092 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001093 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001094 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001095 pat = re.compile('(?a)\xc0', re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001096 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001097 pat = re.compile('\w', re.ASCII)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001098 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001099 pat = re.compile('(?a)\w')
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001100 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001101 # Bytes patterns
1102 for flags in (0, re.ASCII):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001103 pat = re.compile(b'\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001104 self.assertIsNone(pat.match(b'\xe0'))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001105 pat = re.compile(b'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001106 self.assertIsNone(pat.match(b'\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001107 # Incompatibilities
1108 self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
1109 self.assertRaises(ValueError, re.compile, b'(?u)\w')
1110 self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
1111 self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
1112 self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
1113 self.assertRaises(ValueError, re.compile, '(?au)\w')
1114
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001115 def test_bug_6509(self):
1116 # Replacement strings of both types must parse properly.
1117 # all strings
1118 pat = re.compile('a(\w)')
1119 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1120 pat = re.compile('a(.)')
1121 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1122 pat = re.compile('..')
1123 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1124
1125 # all bytes
1126 pat = re.compile(b'a(\w)')
1127 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1128 pat = re.compile(b'a(.)')
1129 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1130 pat = re.compile(b'..')
1131 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1132
def test_dealloc(self):
    # issue 3299: check for segfault in debug build
    # Each call below passes an invalid argument and must fail with the
    # stated exception type.
    import _sre
    # the overflow limit is different on wide and narrow builds and it
    # depends on the definition of SRE_CODE (see sre.h).
    # 2**128 should be big enough to overflow on both. For smaller values
    # a RuntimeError is raised instead of OverflowError.
    long_overflow = 2**128
    # A dict passed as the subject of finditer().
    self.assertRaises(TypeError, re.finditer, "a", {})
    # 2**128 inside the list passed as third argument.
    # NOTE(review): this relies on the private _sre.compile() accepting
    # exactly these three positional arguments -- confirm for the
    # targeted interpreter.
    self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow])
    # A dict passed as the first argument.
    self.assertRaises(TypeError, _sre.compile, {}, 0, [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001145 def test_search_dot_unicode(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001146 self.assertTrue(re.search("123.*-", '123abc-'))
1147 self.assertTrue(re.search("123.*-", '123\xe9-'))
1148 self.assertTrue(re.search("123.*-", '123\u20ac-'))
1149 self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1150 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151
def test_compile(self):
    # re.compile() must return a pattern object for a str argument and
    # hand back the very same object when given an already compiled
    # pattern.
    # NOTE(review): the type is looked up through the private name
    # re._pattern_type; whether that name exists on every targeted
    # interpreter is not visible here -- confirm.
    # Test return value when given string and pattern as parameter
    pattern = re.compile('random pattern')
    self.assertIsInstance(pattern, re._pattern_type)
    same_pattern = re.compile(pattern)
    self.assertIsInstance(same_pattern, re._pattern_type)
    self.assertIs(same_pattern, pattern)
    # Test behaviour when not given a string or pattern as parameter
    self.assertRaises(TypeError, re.compile, 0)
1161
Ezio Melottife8e6e72013-01-11 08:32:01 +02001162 def test_bug_13899(self):
1163 # Issue #13899: re pattern r"[\A]" should work like "A" but matches
1164 # nothing. Ditto B and Z.
1165 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
1166 ['A', 'B', '\b', 'C', 'Z'])
1167
@bigmemtest(size=_2G, memuse=1)
def test_large_search(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    haystack = size * 'a'
    found = re.search('$', haystack)
    self.assertIsNotNone(found)
    # '$' can only match at the very end, so both bounds equal the length.
    start, end = found.start(), found.end()
    self.assertEqual(start, size)
    self.assertEqual(end, size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001176
# The huge memuse is because re.sub() builds the replacement result
# with a list and a join().
@bigmemtest(size=_2G, memuse=16 + 2)
def test_large_subn(self, size):
    # Issue #10182: indices were 32-bit-truncated.
    text = size * 'a'
    replaced, count = re.subn('', '', text)
    # Replacing the empty match with nothing leaves the text untouched...
    self.assertEqual(replaced, text)
    # ...and the test expects one substitution more than there are
    # characters.
    self.assertEqual(count, size + 1)
1186
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001187 def test_bug_16688(self):
1188 # Issue 16688: Backreferences make case-insensitive regex fail on
1189 # non-ASCII strings.
1190 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1191 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001192
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001193 def test_repeat_minmax_overflow(self):
1194 # Issue #13169
1195 string = "x" * 100000
1196 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1197 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1198 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1199 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1200 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1201 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1202 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1203 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1204 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1205 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1206 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1207
@cpython_only
def test_repeat_minmax_overflow_maxrepeat(self):
    try:
        from _sre import MAXREPEAT
    except ImportError:
        self.skipTest('requires _sre.MAXREPEAT constant')
    subject = "x" * 100000
    # One below MAXREPEAT still compiles; the subject is simply too short
    # for the exact and lower-bound forms, while the upper-bound form
    # takes all of it.
    largest_ok = MAXREPEAT - 1
    self.assertIsNone(re.match(r".{%d}" % largest_ok, subject))
    capped = re.match(r".{,%d}" % largest_ok, subject)
    self.assertEqual(capped.span(), (0, 100000))
    self.assertIsNone(re.match(r".{%d,}?" % largest_ok, subject))
    # MAXREPEAT itself is rejected at compile time.
    for template in (r".{%d}", r".{,%d}", r".{%d,}?"):
        with self.assertRaises(OverflowError):
            re.compile(template % MAXREPEAT)
1222
R David Murray26dfaac92013-04-14 13:00:54 -04001223 def test_backref_group_name_in_exception(self):
1224 # Issue 17341: Poor error message when compiling invalid regex
1225 with self.assertRaisesRegex(sre_constants.error, '<foo>'):
1226 re.compile('(?P=<foo>)')
1227
1228 def test_group_name_in_exception(self):
1229 # Issue 17341: Poor error message when compiling invalid regex
1230 with self.assertRaisesRegex(sre_constants.error, '\?foo'):
1231 re.compile('(?P<?foo>)')
1232
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001233 def test_issue17998(self):
1234 for reps in '*', '+', '?', '{1}':
1235 for mod in '', '?':
1236 pattern = '.' + reps + mod + 'yz'
1237 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1238 ['xyz'], msg=pattern)
1239 pattern = pattern.encode()
1240 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1241 [b'xyz'], msg=pattern)
1242
def test_match_repr(self):
    # Check repr() of match objects for str subjects, bytes-like subjects
    # and matches produced by finditer().  Each expected value is built
    # from the type of the very match object being checked (the original
    # mixed up `first` and `second` when looking up the module name).
    def expected_repr(match, details):
        # "<module.qualname object; details>" for the match's own type.
        kind = type(match)
        return "<%s.%s object; %s>" % (
            kind.__module__, kind.__qualname__, details)

    for subject in '[abracadabra]', S('[abracadabra]'):
        m = re.search(r'(.+)(.*?)\1', subject)
        self.assertEqual(
            repr(m),
            expected_repr(m, "span=(1, 12), match='abracadabra'"))
    for subject in (b'[abracadabra]', B(b'[abracadabra]'),
                    bytearray(b'[abracadabra]'),
                    memoryview(b'[abracadabra]')):
        m = re.search(rb'(.+)(.*?)\1', subject)
        self.assertEqual(
            repr(m),
            expected_repr(m, "span=(1, 12), match=b'abracadabra'"))

    # Exactly two matches are expected; unpacking fails loudly otherwise.
    first, second = re.finditer("(aa)|(bb)", "aa bb")
    self.assertEqual(
        repr(first), expected_repr(first, "span=(0, 2), match='aa'"))
    self.assertEqual(
        repr(second), expected_repr(second, "span=(3, 5), match='bb'"))
1264
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001265
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001266 def test_bug_2537(self):
1267 # issue 2537: empty submatches
1268 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1269 for inner_op in ('{0,}', '*', '?'):
1270 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1271 m = r.match("xyyzy")
1272 self.assertEqual(m.group(0), "xyy")
1273 self.assertEqual(m.group(1), "")
1274 self.assertEqual(m.group(2), "y")
1275
def test_debug_flag(self):
    pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
    # NOTE(review): the nesting whitespace inside this literal was rebuilt
    # by hand (two spaces per level); verify it against real re.DEBUG
    # output before relying on it.
    dump = '''\
subpattern 1
  literal 46
subpattern None
  branch
    in
      literal 99
      literal 104
  or
    literal 112
    literal 121
subpattern None
  groupref_exists 1
    at at_end
  else
    literal 58
    literal 32
'''
    # Debug output is output again even a second time (bypassing
    # the cache -- issue #20426), hence two identical rounds.
    for _round in range(2):
        with captured_stdout() as out:
            re.compile(pat, re.DEBUG)
        self.assertEqual(out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001304
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001305 def test_keyword_parameters(self):
1306 # Issue #20283: Accepting the string keyword parameter.
1307 pat = re.compile(r'(ab)')
1308 self.assertEqual(
1309 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1310 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001311 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1312 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001313 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1314 self.assertEqual(
1315 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1316 self.assertEqual(
1317 pat.split(string='abracadabra', maxsplit=1),
1318 ['', 'ab', 'racadabra'])
1319 self.assertEqual(
1320 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1321 (7, 9))
1322
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001323 def test_bug_20998(self):
1324 # Issue #20998: Fullmatch of repeated single character pattern
1325 # with ignore case.
1326 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1327
def test_locale_caching(self):
    # Issue #22410
    saved = locale.setlocale(locale.LC_CTYPE)
    # Put the original LC_CTYPE back however the test ends.
    self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved)
    for name in ('en_US.iso88591', 'en_US.utf8'):
        try:
            locale.setlocale(locale.LC_CTYPE, name)
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs %s locale' % name)

    # Run the two locale checks in both orders, purging the re cache
    # before each pair.
    orderings = (
        (self.check_en_US_iso88591, self.check_en_US_utf8),
        (self.check_en_US_utf8, self.check_en_US_iso88591),
    )
    for checks in orderings:
        re.purge()
        for run_check in checks:
            run_check()
1345
def check_en_US_iso88591(self):
    # With LC_CTYPE set to en_US.iso88591 the bytes 0xC5 and 0xE5 are
    # expected to match each other case-insensitively, both with explicit
    # flags and with the inline (?Li) form.
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    for prefix, flags in ((b'', re.L | re.I), (b'(?Li)', 0)):
        self.assertTrue(re.match(prefix + b'\xc5\xe5', b'\xc5\xe5', flags))
        self.assertTrue(re.match(prefix + b'\xc5', b'\xe5', flags))
        self.assertTrue(re.match(prefix + b'\xe5', b'\xc5', flags))
1354
def check_en_US_utf8(self):
    # With LC_CTYPE set to en_US.utf8 the bytes 0xC5 and 0xE5 still match
    # themselves but are expected NOT to match each other, both with
    # explicit flags and with the inline (?Li) form.
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    for prefix, flags in ((b'', re.L | re.I), (b'(?Li)', 0)):
        self.assertTrue(re.match(prefix + b'\xc5\xe5', b'\xc5\xe5', flags))
        self.assertIsNone(re.match(prefix + b'\xc5', b'\xe5', flags))
        self.assertIsNone(re.match(prefix + b'\xe5', b'\xc5', flags))
1363
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001364
class PatternReprTests(unittest.TestCase):
    # Each test compiles a pattern and compares repr() of the result with
    # the exact text expected.

    def check(self, pattern, expected):
        # Compile without flags and compare the repr.
        compiled = re.compile(pattern)
        self.assertEqual(repr(compiled), expected)

    def check_flags(self, pattern, flags, expected):
        # Compile with explicit flags and compare the repr.
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        self.check('random pattern', "re.compile('random pattern')")

    def test_single_flag(self):
        wanted = "re.compile('random pattern', re.IGNORECASE)"
        self.check_flags('random pattern', re.IGNORECASE, wanted)

    def test_multiple_flags(self):
        wanted = ("re.compile('random pattern', "
                  "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.check_flags('random pattern', re.I | re.S | re.X, wanted)

    def test_unicode_flag(self):
        # re.U is expected not to appear in the repr of a str pattern.
        self.check_flags('random pattern', re.U,
                         "re.compile('random pattern')")
        wanted = ("re.compile('random pattern', "
                  "re.IGNORECASE|re.DOTALL)")
        self.check_flags('random pattern', re.I | re.S | re.U, wanted)

    def test_inline_flags(self):
        # A flag set inside the pattern is reported like an explicit one.
        self.check('(?i)pattern',
                   "re.compile('(?i)pattern', re.IGNORECASE)")

    def test_unknown_flags(self):
        # Bits without a name are shown in hex, after any named flags.
        self.check_flags('random pattern', 0x123000,
                         "re.compile('random pattern', 0x123000)")
        self.check_flags(
            'random pattern', 0x123000 | re.I,
            "re.compile('random pattern', re.IGNORECASE|0x123000)")

    def test_bytes(self):
        self.check(b'bytes pattern', "re.compile(b'bytes pattern')")
        self.check_flags(b'bytes pattern', re.A,
                         "re.compile(b'bytes pattern', re.ASCII)")

    def test_quotes(self):
        cases = (
            ('random "double quoted" pattern',
             '''re.compile('random "double quoted" pattern')'''),
            ("random 'single quoted' pattern",
             '''re.compile("random 'single quoted' pattern")'''),
            ('''both 'single' and "double" quotes''',
             '''re.compile('both \\'single\\' and "double" quotes')'''),
        )
        for pattern, wanted in cases:
            self.check(pattern, wanted)

    def test_long_pattern(self):
        # The repr of a very long pattern is cut short, keeping the start
        # of the pattern and the trailing flags.
        pattern = 'Very ' + 'long ' * 1000 + 'pattern'
        head = "re.compile('Very long long lon"

        plain = repr(re.compile(pattern))
        self.assertLess(len(plain), 300)
        self.assertEqual(plain[:30], head)

        flagged = repr(re.compile(pattern, re.I))
        self.assertLess(len(flagged), 300)
        self.assertEqual(flagged[:30], head)
        self.assertEqual(flagged[-16:], ", re.IGNORECASE)")
1425
1426
class ImplementationTest(unittest.TestCase):
    """
    Test implementation details of the re module.
    """

    def test_overlap_table(self):
        # Each case pairs an input string with the table expected from
        # sre_compile._generate_overlap_table for it.
        build_table = sre_compile._generate_overlap_table
        cases = (
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        )
        for prefix, wanted in cases:
            self.assertEqual(build_table(prefix), wanted)
1440
1441
class ExternalTests(unittest.TestCase):
    # Table-driven tests whose cases are imported from test.re_tests.

    def test_re_benchmarks(self):
        're_tests benchmarks'
        from test.re_tests import benchmarks
        margin = 10000
        padding = ' ' * margin
        for pattern, s in benchmarks:
            with self.subTest(pattern=pattern, string=s):
                compiled = re.compile(pattern)
                # The bare subject must match through every entry point.
                self.assertTrue(compiled.search(s))
                self.assertTrue(compiled.match(s))
                self.assertTrue(compiled.fullmatch(s))
                # And again when buried between two long runs of spaces,
                # using pos/endpos to address it.
                padded = padding + s + padding
                stop = margin + len(s)
                self.assertTrue(compiled.search(padded))
                self.assertTrue(compiled.match(padded, margin))
                self.assertTrue(compiled.match(padded, margin, stop))
                self.assertTrue(compiled.fullmatch(padded, margin, stop))

    def test_re_tests(self):
        're_tests test suite'
        from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
        for t in tests:
            # A case is (pattern, string, outcome) optionally followed by
            # (repl, expected).
            if len(t) == 5:
                pattern, s, outcome, repl, expected = t
            elif len(t) == 3:
                pattern, s, outcome = t
                repl = expected = None
            else:
                raise ValueError('Test tuples should have 3 or 5 fields', t)

            with self.subTest(pattern=pattern, string=s):
                if outcome == SYNTAX_ERROR:  # Expected a syntax error
                    with self.assertRaises(re.error):
                        re.compile(pattern)
                    continue

                result = re.compile(pattern).search(s)
                if outcome == FAIL:
                    self.assertIsNone(result, 'Succeeded incorrectly')
                    continue

                with self.subTest():
                    self.assertTrue(result, 'Failed incorrectly')
                    # Matched, as expected, so now we compute the
                    # result string and compare it to our expected result.
                    start, end = result.span(0)

                    def capture(group):
                        # Value of one group as exposed to the repl
                        # expression: "Error" if the group does not exist,
                        # "None" if it did not take part in the match.
                        try:
                            value = result.group(group)
                        except IndexError:
                            return "Error"
                        # Special hack because else the string concat fails:
                        return "None" if value is None else value

                    vardict = {'found': result.group(0),
                               'groups': result.group(),
                               'flags': result.re.flags}
                    for index in range(1, 100):
                        vardict['g%d' % index] = capture(index)
                    for name in result.re.groupindex:
                        vardict[name] = capture(name)
                    # repl is an expression string taken from the
                    # test.re_tests table and evaluated over the captures.
                    self.assertEqual(eval(repl, vardict), expected,
                                     'grouping error')

                # Try the match with both pattern and string converted to
                # bytes, and check that it still succeeds.
                try:
                    bpat = bytes(pattern, "ascii")
                    bs = bytes(s, "ascii")
                except UnicodeEncodeError:
                    # skip non-ascii tests
                    pass
                else:
                    with self.subTest('bytes pattern match'):
                        self.assertTrue(re.compile(bpat).search(bs))

                # Try the match with the search area limited to the extent
                # of the match and see if it still succeeds.  \B will
                # break (because it won't match at the end or start of a
                # string), so we'll ignore patterns that feature it.
                # start/end are only bound when there was a match, hence
                # the check on result.
                edge_sensitive = (pattern.startswith(r'\B')
                                  or pattern.endswith(r'\B'))
                if result is not None and not edge_sensitive:
                    with self.subTest('range-limited match'):
                        limited = re.compile(pattern)
                        self.assertTrue(limited.search(s, start, end + 1))

                # Try the match with IGNORECASE, LOCALE and UNICODE enabled
                # in turn, and check that it still succeeds.  LOCALE is
                # left out for patterns that contain '(?u)'.
                variants = [('case-insensitive match', re.IGNORECASE)]
                if '(?u)' not in pattern:
                    variants.append(('locale-sensitive match', re.LOCALE))
                variants.append(('unicode-sensitive match', re.UNICODE))
                for label, flag in variants:
                    with self.subTest(label):
                        flagged = re.compile(pattern, flag)
                        self.assertTrue(flagged.search(s))
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001552
Gregory P. Smith5a631832010-07-27 05:31:29 +00001553
# Run every test in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()