blob: 6e90b2fec9a7f30c33b622f4623c624bba4973f0 [file] [log] [blame]
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
Antoine Pitroud2cc7432014-02-03 20:59:59 +01002 cpython_only, captured_stdout
Benjamin Petersone48944b2012-03-07 14:50:25 -06003import io
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02004import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00005import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00006from re import Scanner
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02007import sre_compile
R David Murray26dfaac92013-04-14 13:00:54 -04008import sre_constants
Ezio Melottid2114eb2011-03-25 14:08:44 +02009import sys
10import string
11import traceback
Antoine Pitrou79aa68d2013-10-25 21:36:10 +020012import unittest
Raymond Hettinger027bb632004-05-31 03:09:25 +000013from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000014
Guido van Rossum23b22571997-07-17 22:36:14 +000015# Misc tests from Tim Peters' re.doc
16
Just van Rossum6802c6e2003-07-02 14:36:59 +000017# WARNING: Don't change details in these tests if you don't know
Ezio Melotti42da6632011-03-15 05:18:48 +020018# what you're doing. Some of these tests were carefully modeled to
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +000019# cover most of the code.
20
class S(str):
    # str subclass whose indexing/slicing results are re-wrapped as S, so
    # tests can tell whether re returns the subclass or a plain str.
    def __getitem__(self, index):
        piece = str.__getitem__(self, index)
        return S(piece)
24
class B(bytes):
    # bytes subclass whose __getitem__ result is always passed back through
    # the B constructor.
    def __getitem__(self, index):
        piece = bytes.__getitem__(self, index)
        return B(piece)
28
Skip Montanaro8ed06da2003-04-24 19:43:18 +000029class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000030
def assertTypedEqual(self, actual, expect, msg=None):
    # Assert that actual == expect and, additionally, that every leaf value
    # has exactly the same type as its counterpart in expect.  Tuples and
    # lists in expect are walked pairwise; anything else is a leaf.
    def check_types(got, wanted):
        if not isinstance(wanted, (tuple, list)):
            self.assertIs(type(got), type(wanted), msg)
            return
        for got_item, wanted_item in zip(got, wanted):
            check_types(got_item, wanted_item)

    self.assertEqual(actual, expect, msg)
    check_types(actual, expect)
40
Benjamin Petersone48944b2012-03-07 14:50:25 -060041 def test_keep_buffer(self):
42 # See bug 14212
43 b = bytearray(b'x')
44 it = re.finditer(b'a', b)
45 with self.assertRaises(BufferError):
46 b.extend(b'x'*400)
47 list(it)
48 del it
49 gc_collect()
50 b.extend(b'x'*400)
51
Raymond Hettinger027bb632004-05-31 03:09:25 +000052 def test_weakref(self):
53 s = 'QabbbcR'
54 x = re.compile('ab+c')
55 y = proxy(x)
56 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
57
Skip Montanaro8ed06da2003-04-24 19:43:18 +000058 def test_search_star_plus(self):
59 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
60 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
61 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
62 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030063 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000064 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
65 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
66 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
67 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030068 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000069
def bump_num(self, matchobj):
    # Replacement callback: parse the whole match as an int, add one and
    # return the result as a string.
    matched_digits = matchobj.group(0)
    return str(int(matched_digits) + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000073
def test_basic_re_sub(self):
    # re.sub() on str, bytes and bytes-like subjects: the result must be a
    # plain str/bytes even when subclasses or other buffers are passed in.
    self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
    self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
    self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
    self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
    self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
    for y in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')

    self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    # A callable replacement, with and without a count limit.
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
                     '9.3 -3 24x100y')
    self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3),
                     '9.3 -3 23x99y')

    # The return value of a callable is inserted verbatim; a template
    # string has its escapes processed.
    self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
    self.assertEqual(re.sub('.', r"\n", 'x'), '\n')

    s = r"\1\1"
    self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
    self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
    self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)

    # Raw literals: '\g' is not a valid Python string escape, so the
    # non-raw spelling relies on deprecated invalid-escape handling.
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
                     '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
    self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
                     (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))

    # Raw literal for the same reason as above ('\s').
    self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000110
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000111 def test_bug_449964(self):
112 # fails for group followed by other escape
113 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
114 'xx\bxx\b')
115
116 def test_bug_449000(self):
117 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000118 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
119 'abc\ndef\n')
120 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
121 'abc\ndef\n')
122 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
123 'abc\ndef\n')
124 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
125 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000126
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000127 def test_bug_1661(self):
128 # Verify that flags do not get silently ignored with compiled patterns
129 pattern = re.compile('.')
130 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
131 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
132 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
133 self.assertRaises(ValueError, re.compile, pattern, re.I)
134
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000135 def test_bug_3629(self):
136 # A regex that triggered a bug in the sre-code validator
137 re.compile("(?P<quote>)(?(quote))")
138
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000139 def test_sub_template_numeric_escape(self):
140 # bug 776311 and friends
141 self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
142 self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
143 self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
144 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
145 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
146 self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
147 self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
148
149 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
150 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
151
152 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
153 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
154 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
155 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
156 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
157
Serhiy Storchakac563caf2014-09-23 23:22:41 +0300158 self.assertRaises(re.error, re.sub, 'x', r'\400', 'x')
159 self.assertRaises(re.error, re.sub, 'x', r'\777', 'x')
Tim Peters0e9980f2004-09-12 03:49:31 +0000160
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000161 self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
162 self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
163 self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
164 self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
165 self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
166 self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
167 self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
168 self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
169 self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
170 self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
171 self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
172 self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
173
174 # in python2.3 (etc), these loop endlessly in sre_parser.py
175 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
176 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
177 'xz8')
178 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
179 'xza')
180
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000181 def test_qualified_re_sub(self):
182 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
Victor Stinner55e614a2014-10-29 16:58:59 +0100183 self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000184
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000185 def test_bug_114660(self):
186 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
187 'hello there')
188
189 def test_bug_462270(self):
190 # Test for empty sub() behaviour, see SF bug #462270
191 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
192 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
193
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200194 def test_symbolic_groups(self):
195 re.compile('(?P<a>x)(?P=a)(?(a)y)')
196 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300197 re.compile('(?P<a1>x)\1(?(1)y)')
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200198 self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
199 self.assertRaises(re.error, re.compile, '(?Px)')
200 self.assertRaises(re.error, re.compile, '(?P=)')
201 self.assertRaises(re.error, re.compile, '(?P=1)')
202 self.assertRaises(re.error, re.compile, '(?P=a)')
203 self.assertRaises(re.error, re.compile, '(?P=a1)')
204 self.assertRaises(re.error, re.compile, '(?P=a.)')
205 self.assertRaises(re.error, re.compile, '(?P<)')
206 self.assertRaises(re.error, re.compile, '(?P<>)')
207 self.assertRaises(re.error, re.compile, '(?P<1>)')
208 self.assertRaises(re.error, re.compile, '(?P<a.>)')
209 self.assertRaises(re.error, re.compile, '(?())')
210 self.assertRaises(re.error, re.compile, '(?(a))')
211 self.assertRaises(re.error, re.compile, '(?(1a))')
212 self.assertRaises(re.error, re.compile, '(?(a.))')
Georg Brandl1d472b72013-04-14 11:40:00 +0200213 # New valid/invalid identifiers in Python 3
214 re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
215 re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
216 self.assertRaises(re.error, re.compile, '(?P<©>x)')
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300217 # Support > 100 groups.
218 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
219 pat = '(?:%s)(?(200)z|t)' % pat
220 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200221
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000222 def test_symbolic_refs(self):
223 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
224 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
225 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
226 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200227 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<>', 'xx')
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000228 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300229 self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<2>', 'xx')
230 self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\2', 'xx')
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000231 self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300232 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
233 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000234 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
Georg Brandl1d472b72013-04-14 11:40:00 +0200235 # New valid/invalid identifiers in Python 3
236 self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
237 self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
238 self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx')
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300239 # Support > 100 groups.
240 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
241 self.assertEqual(re.sub(pat, '\g<200>', 'xc8yzxc8y'), 'c8zc8')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000242
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000243 def test_re_subn(self):
244 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
245 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
246 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
247 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
Victor Stinner55e614a2014-10-29 16:58:59 +0100248 self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000249
def test_re_split(self):
    # str subjects (plain and subclass): pieces must come back as plain str.
    for subject in (":a:b::c", S(":a:b::c")):
        self.assertTypedEqual(re.split(":", subject),
                              ['', 'a', 'b', '', 'c'])
        self.assertTypedEqual(re.split(":*", subject),
                              ['', 'a', 'b', 'c'])
        self.assertTypedEqual(re.split("(:*)", subject),
                              ['', ':', 'a', ':', 'b', '::', 'c'])
    # bytes-like subjects: pieces must come back as plain bytes.
    bytes_subjects = (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
                      memoryview(b":a:b::c"))
    for subject in bytes_subjects:
        self.assertTypedEqual(re.split(b":", subject),
                              [b'', b'a', b'b', b'', b'c'])
        self.assertTypedEqual(re.split(b":*", subject),
                              [b'', b'a', b'b', b'c'])
        self.assertTypedEqual(re.split(b"(:*)", subject),
                              [b'', b':', b'a', b':', b'b', b'::', b'c'])
    # Same splits with non-ASCII fields.
    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
                    "\U0001d49c\U0001d49e\U0001d4b5"):
        subject = ":%s:%s::%s" % (a, b, c)
        self.assertEqual(re.split(":", subject), ['', a, b, '', c])
        self.assertEqual(re.split(":*", subject), ['', a, b, c])
        self.assertEqual(re.split("(:*)", subject),
                         ['', ':', a, ':', b, '::', c])

    # Grouping variants on a fixed subject.
    fixed = ":a:b::c"
    self.assertEqual(re.split("(?::*)", fixed), ['', 'a', 'b', 'c'])
    self.assertEqual(re.split("(:)*", fixed),
                     ['', ':', 'a', ':', 'b', ':', 'c'])
    self.assertEqual(re.split("([b:]+)", fixed),
                     ['', ':', 'a', ':b::', 'c'])
    self.assertEqual(re.split("(b)|(:+)", fixed),
                     ['', None, ':', 'a', None, ':', '', 'b', None, '',
                      None, '::', 'c'])
    self.assertEqual(re.split("(?:b)|(?::+)", fixed),
                     ['', 'a', '', '', 'c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000284
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000285 def test_qualified_re_split(self):
Victor Stinner55e614a2014-10-29 16:58:59 +0100286 self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
287 self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
288 self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000289 ['', ':', 'a', ':', 'b::c'])
Victor Stinner55e614a2014-10-29 16:58:59 +0100290 self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000291 ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000292
def test_re_findall(self):
    self.assertEqual(re.findall(":+", "abc"), [])
    # str subjects (plain and subclass): results must be plain str.
    for subject in ("a:b::c:::d", S("a:b::c:::d")):
        self.assertTypedEqual(re.findall(":+", subject),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:+)", subject),
                              [":", "::", ":::"])
        self.assertTypedEqual(re.findall("(:)(:*)", subject),
                              [(":", ""), (":", ":"), (":", "::")])
    # bytes-like subjects: results must be plain bytes.
    bytes_subjects = (b"a:b::c:::d", B(b"a:b::c:::d"),
                      bytearray(b"a:b::c:::d"), memoryview(b"a:b::c:::d"))
    for subject in bytes_subjects:
        self.assertTypedEqual(re.findall(b":+", subject),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:+)", subject),
                              [b":", b"::", b":::"])
        self.assertTypedEqual(re.findall(b"(:)(:*)", subject),
                              [(b":", b""), (b":", b":"), (b":", b"::")])
    # Same searches with a non-ASCII character as the repeated item.
    for x in ("\xe0", "\u0430", "\U0001d49c"):
        xx = x * 2
        xxx = x * 3
        subject = "a%sb%sc%sd" % (x, xx, xxx)
        self.assertEqual(re.findall("%s+" % x, subject), [x, xx, xxx])
        self.assertEqual(re.findall("(%s+)" % x, subject), [x, xx, xxx])
        self.assertEqual(re.findall("(%s)(%s*)" % (x, x), subject),
                         [(x, ""), (x, x), (x, xx)])
Guido van Rossum49946571997-07-18 04:26:25 +0000318
Skip Montanaro5ba00542003-04-25 16:00:14 +0000319 def test_bug_117612(self):
320 self.assertEqual(re.findall(r"(a|(b))", "aba"),
321 [("a", ""),("b", "b"),("a", "")])
322
def test_re_match(self):
    # groups()/group() on str subjects (plain and subclass).
    for string in 'a', S('a'):
        self.assertEqual(re.match('a', string).groups(), ())
        self.assertEqual(re.match('(a)', string).groups(), ('a',))
        self.assertEqual(re.match('(a)', string).group(0), 'a')
        self.assertEqual(re.match('(a)', string).group(1), 'a')
        self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a'))
    # ... on bytes-like subjects.
    for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
        self.assertEqual(re.match(b'a', string).groups(), ())
        self.assertEqual(re.match(b'(a)', string).groups(), (b'a',))
        self.assertEqual(re.match(b'(a)', string).group(0), b'a')
        self.assertEqual(re.match(b'(a)', string).group(1), b'a')
        self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
    # ... and with non-ASCII characters.
    for a in ("\xe0", "\u0430", "\U0001d49c"):
        self.assertEqual(re.match(a, a).groups(), ())
        self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
        self.assertEqual(re.match('(%s)' % a, a).group(0), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1), a)
        self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))

    # Groups that did not participate are None unless a default is given.
    pat = re.compile('((a)|(b))(c)?')
    self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
    self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
    self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
    self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
    self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))

    # A single group
    m = re.match('(a)', 'a')
    # The argument-less form replaces what used to be a second, identical
    # group(0) assertion.
    self.assertEqual(m.group(), 'a')
    self.assertEqual(m.group(0), 'a')
    self.assertEqual(m.group(1), 'a')
    self.assertEqual(m.group(1, 1), ('a', 'a'))

    # Groups can be addressed by number, by name, or a mix of both.
    pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
                     (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000362
def test_re_fullmatch(self):
    # Issue 16203: Proposal: add re.fullmatch() method.
    self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
    for subject in ("ab", S("ab")):
        self.assertEqual(re.fullmatch(r"a|ab", subject).span(), (0, 2))
    for subject in (b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab")):
        self.assertEqual(re.fullmatch(br"a|ab", subject).span(), (0, 2))
    for first, second in ("\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e"):
        alternation = r"%s|%s" % (first, first + second)
        self.assertEqual(re.fullmatch(alternation, first + second).span(),
                         (0, 2))

    # (pattern, subject, expected span) for patterns that match fully.
    full_cases = (
        (r".*?$", "abc", (0, 3)),
        (r".*?", "abc", (0, 3)),
        (r"a.*?b", "ab", (0, 2)),
        (r"a.*?b", "abb", (0, 3)),
        (r"a.*?b", "axxb", (0, 4)),
        (r"ab(?=c)cd", "abcd", (0, 4)),
        (r"ab(?<=b)cd", "abcd", (0, 4)),
        (r"(?=a|ab)ab", "ab", (0, 2)),
    )
    for pattern, subject, expected in full_cases:
        self.assertEqual(re.fullmatch(pattern, subject).span(), expected)

    # (pattern, subject) pairs that must not match the whole subject.
    partial_cases = (
        (r"a+", "ab"),
        (r"abc$", "abc\n"),
        (r"abc\Z", "abc\n"),
        (r"(?m)abc$", "abc\n"),
    )
    for pattern, subject in partial_cases:
        self.assertIsNone(re.fullmatch(pattern, subject))

    # pos/endpos restrict the region that has to be matched in full.
    for pattern in (r"bc", r".*?$", r".*?"):
        compiled = re.compile(pattern)
        self.assertEqual(
            compiled.fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
392
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000393 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000394 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
395 ('(', 'a'))
396 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
397 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300398 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
399 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000400 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
401 ('a', 'b'))
402 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
403 (None, 'd'))
404 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
405 (None, 'd'))
406 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
407 ('a', ''))
408
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000409 # Tests for bug #1177831: exercise groups other than the first group
410 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
411 self.assertEqual(p.match('abc').groups(),
412 ('a', 'b', 'c'))
413 self.assertEqual(p.match('ad').groups(),
414 ('a', None, 'd'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300415 self.assertIsNone(p.match('abd'))
416 self.assertIsNone(p.match('ac'))
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000417
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300418 # Support > 100 groups.
419 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
420 pat = '(?:%s)(?(200)z)' % pat
421 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000422
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000423 def test_re_groupref(self):
424 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
425 ('|', 'a'))
426 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
427 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300428 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
429 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000430 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
431 ('a', 'a'))
432 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
433 (None, None))
434
435 def test_groupdict(self):
436 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
437 'first second').groupdict(),
438 {'first':'first', 'second':'second'})
439
440 def test_expand(self):
441 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
442 "first second")
443 .expand(r"\2 \1 \g<second> \g<first>"),
444 "second first second first")
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300445 self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
446 "first")
447 .expand(r"\2 \g<second>"),
448 " ")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000449
450 def test_repeat_minmax(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300451 self.assertIsNone(re.match("^(\w){1}$", "abc"))
452 self.assertIsNone(re.match("^(\w){1}?$", "abc"))
453 self.assertIsNone(re.match("^(\w){1,2}$", "abc"))
454 self.assertIsNone(re.match("^(\w){1,2}?$", "abc"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000455
456 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
457 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
458 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
459 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
460 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
461 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
462 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
463 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
464
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300465 self.assertIsNone(re.match("^x{1}$", "xxx"))
466 self.assertIsNone(re.match("^x{1}?$", "xxx"))
467 self.assertIsNone(re.match("^x{1,2}$", "xxx"))
468 self.assertIsNone(re.match("^x{1,2}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000469
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300470 self.assertTrue(re.match("^x{3}$", "xxx"))
471 self.assertTrue(re.match("^x{1,3}$", "xxx"))
472 self.assertTrue(re.match("^x{1,4}$", "xxx"))
473 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
474 self.assertTrue(re.match("^x{3}?$", "xxx"))
475 self.assertTrue(re.match("^x{1,3}?$", "xxx"))
476 self.assertTrue(re.match("^x{1,4}?$", "xxx"))
477 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000478
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300479 self.assertIsNone(re.match("^x{}$", "xxx"))
480 self.assertTrue(re.match("^x{}$", "x{}"))
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000481
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000482 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000483 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000484 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000485 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
486 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
487 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
488 {'first': 1, 'other': 2})
489
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000490 self.assertEqual(re.match("(a)", "a").pos, 0)
491 self.assertEqual(re.match("(a)", "a").endpos, 1)
492 self.assertEqual(re.match("(a)", "a").string, "a")
493 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300494 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000495
496 def test_special_escapes(self):
497 self.assertEqual(re.search(r"\b(b.)\b",
498 "abcd abc bcd bx").group(1), "bx")
499 self.assertEqual(re.search(r"\B(b.)\B",
500 "abc bcd bc abxd").group(1), "bx")
501 self.assertEqual(re.search(r"\b(b.)\b",
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300502 "abcd abc bcd bx", re.ASCII).group(1), "bx")
503 self.assertEqual(re.search(r"\B(b.)\B",
504 "abc bcd bc abxd", re.ASCII).group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000505 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
506 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300507 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300508 self.assertEqual(re.search(br"\b(b.)\b",
509 b"abcd abc bcd bx").group(1), b"bx")
510 self.assertEqual(re.search(br"\B(b.)\B",
511 b"abc bcd bc abxd").group(1), b"bx")
512 self.assertEqual(re.search(br"\b(b.)\b",
513 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
514 self.assertEqual(re.search(br"\B(b.)\B",
515 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
516 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
517 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300518 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000519 self.assertEqual(re.search(r"\d\D\w\W\s\S",
520 "1aa! a").group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300521 self.assertEqual(re.search(br"\d\D\w\W\s\S",
522 b"1aa! a").group(0), b"1aa! a")
523 self.assertEqual(re.search(r"\d\D\w\W\s\S",
524 "1aa! a", re.ASCII).group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300525 self.assertEqual(re.search(br"\d\D\w\W\s\S",
526 b"1aa! a", re.LOCALE).group(0), b"1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000527
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200528 def test_other_escapes(self):
529 self.assertRaises(re.error, re.compile, "\\")
530 self.assertEqual(re.match(r"\(", '(').group(), '(')
531 self.assertIsNone(re.match(r"\(", ')'))
532 self.assertEqual(re.match(r"\\", '\\').group(), '\\')
533 self.assertEqual(re.match(r"\y", 'y').group(), 'y')
534 self.assertIsNone(re.match(r"\y", 'z'))
535 self.assertEqual(re.match(r"[\]]", ']').group(), ']')
536 self.assertIsNone(re.match(r"[\]]", '['))
537 self.assertEqual(re.match(r"[a\-c]", '-').group(), '-')
538 self.assertIsNone(re.match(r"[a\-c]", 'b'))
539 self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
540 self.assertIsNone(re.match(r"[\^a]+", 'b'))
541
Ezio Melotti5a045b92012-02-29 11:48:44 +0200542 def test_string_boundaries(self):
543 # See http://bugs.python.org/issue10713
544 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
545 "abc")
546 # There's a word boundary at the start of a string.
547 self.assertTrue(re.match(r"\b", "abc"))
548 # A non-empty string includes a non-boundary zero-length match.
549 self.assertTrue(re.search(r"\B", "abc"))
550 # There is no non-boundary match at the start of a string.
551 self.assertFalse(re.match(r"\B", "abc"))
552 # However, an empty string contains no word boundaries, and also no
553 # non-boundaries.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300554 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200555 # This one is questionable and different from the perlre behaviour,
556 # but describes current behavior.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300557 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200558 # A single word-character string has two boundaries, but no
559 # non-boundary gaps.
560 self.assertEqual(len(re.findall(r"\b", "a")), 2)
561 self.assertEqual(len(re.findall(r"\B", "a")), 0)
562 # If there are no words, there are no boundaries
563 self.assertEqual(len(re.findall(r"\b", " ")), 0)
564 self.assertEqual(len(re.findall(r"\b", " ")), 0)
565 # Can match around the whitespace.
566 self.assertEqual(len(re.findall(r"\B", " ")), 2)
567
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000568 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000569 self.assertEqual(re.match("([\u2222\u2223])",
570 "\u2222").group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300571 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300572 self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000573
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100574 def test_big_codesize(self):
575 # Issue #1160
576 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300577 self.assertTrue(r.match('1000'))
578 self.assertTrue(r.match('9999'))
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100579
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000580 def test_anyall(self):
581 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
582 "a\nb")
583 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
584 "a\n\nb")
585
Benjamin Peterson66323412014-11-30 11:49:00 -0500586 def test_non_consuming(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000587 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
588 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
589 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
590 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
591 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
592 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
593 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
594
595 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
596 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
597 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
598 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
599
600 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000601 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300602 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000603 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
604 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
605 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
606 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
607 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
608 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
609 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
610 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
611
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200612 assert '\u212a'.lower() == 'k' # 'K'
613 self.assertTrue(re.match(r'K', '\u212a', re.I))
614 self.assertTrue(re.match(r'k', '\u212a', re.I))
615 self.assertTrue(re.match(r'\u212a', 'K', re.I))
616 self.assertTrue(re.match(r'\u212a', 'k', re.I))
617 assert '\u017f'.upper() == 'S' # 'ſ'
618 self.assertTrue(re.match(r'S', '\u017f', re.I))
619 self.assertTrue(re.match(r's', '\u017f', re.I))
620 self.assertTrue(re.match(r'\u017f', 'S', re.I))
621 self.assertTrue(re.match(r'\u017f', 's', re.I))
622 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
623 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
624 self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
625
626 def test_ignore_case_set(self):
627 self.assertTrue(re.match(r'[19A]', 'A', re.I))
628 self.assertTrue(re.match(r'[19a]', 'a', re.I))
629 self.assertTrue(re.match(r'[19a]', 'A', re.I))
630 self.assertTrue(re.match(r'[19A]', 'a', re.I))
631 self.assertTrue(re.match(br'[19A]', b'A', re.I))
632 self.assertTrue(re.match(br'[19a]', b'a', re.I))
633 self.assertTrue(re.match(br'[19a]', b'A', re.I))
634 self.assertTrue(re.match(br'[19A]', b'a', re.I))
635 assert '\u212a'.lower() == 'k' # 'K'
636 self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
637 self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
638 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
639 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
640 assert '\u017f'.upper() == 'S' # 'ſ'
641 self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
642 self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
643 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
644 self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
645 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
646 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
647 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
648
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200649 def test_ignore_case_range(self):
650 # Issues #3511, #17381.
651 self.assertTrue(re.match(r'[9-a]', '_', re.I))
652 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
653 self.assertTrue(re.match(br'[9-a]', b'_', re.I))
654 self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
655 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
656 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
657 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
658 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
659 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
660 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
661 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
662 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
663 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
664 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
665 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
666 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
667
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200668 assert '\u212a'.lower() == 'k' # 'K'
669 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
670 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
671 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
672 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
673 assert '\u017f'.upper() == 'S' # 'ſ'
674 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
675 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
676 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
677 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
678 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
679 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
680 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
681
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000682 def test_category(self):
683 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
684
685 def test_getlower(self):
686 import _sre
687 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
688 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
689 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200690 self.assertEqual(_sre.getlower(ord('A'), re.ASCII), ord('a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000691
692 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300693 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200694 self.assertEqual(re.match("abc", "ABC", re.I|re.A).group(0), "ABC")
695 self.assertEqual(re.match(b"abc", b"ABC", re.I|re.L).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000696
697 def test_not_literal(self):
698 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
699 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
700
701 def test_search_coverage(self):
702 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
703 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
704
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    """Assert that matcher(pattern, text) matches with the given result.

    When neither *match* nor *span* is given, the pattern is expected to
    match the whole of *text*.  Giving only one of the two raises
    ValueError.
    """
    if (match is None) != (span is None):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if match is None:
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    found = matcher(pattern, text)
    self.assertTrue(found)
    self.assertEqual(found.group(), match)
    self.assertEqual(found.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000718
def test_re_escape(self):
    # re.escape() on each of the first 256 code points: ASCII letters,
    # digits and "_" are left alone, NUL becomes \000 and everything
    # else gets a backslash.  The escaped form must match the original.
    unescaped = string.ascii_letters + string.digits + '_'
    all_chars = ''.join(map(chr, range(256)))
    for char in all_chars:
        if char in unescaped:
            expected = char
        elif char == '\x00':
            expected = '\\000'
        else:
            expected = '\\' + char
        self.assertEqual(re.escape(char), expected)
        self.assertMatch(re.escape(char), char)
    self.assertMatch(re.escape(all_chars), all_chars)
Guido van Rossum49946571997-07-18 04:26:25 +0000731
def test_re_escape_byte(self):
    # Bytes counterpart of the str escape test: every byte value is
    # escaped on its own and then all 256 together.
    unescaped = (string.ascii_letters + string.digits + '_').encode('ascii')
    all_bytes = bytes(range(256))
    for value in all_bytes:
        single = bytes([value])
        if single in unescaped:
            expected = single
        elif value == 0:
            expected = b'\\000'
        else:
            expected = b'\\' + single
        self.assertEqual(re.escape(single), expected)
        self.assertMatch(re.escape(single), single)
    self.assertMatch(re.escape(all_bytes), all_bytes)
Guido van Rossum698280d2008-09-10 17:44:35 +0000745
def test_re_escape_non_ascii(self):
    # Non-ASCII characters are backslash-escaped and the escaped text
    # still matches the original, alone and inside a larger pattern.
    text = 'xxx\u2620\u2620\u2620xxx'
    escaped = re.escape(text)
    self.assertEqual(escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx')
    self.assertMatch(escaped, text)
    repeated = '.%s+.' % re.escape('\u2620')
    self.assertMatch(repeated, text,
                     'x\u2620\u2620\u2620x', (2, 7), re.search)
753
def test_re_escape_non_ascii_bytes(self):
    # Every byte of a UTF-8 encoded non-ASCII character is escaped, and
    # the escaped character is found twice in the encoded text.
    encoded = 'y\u2620y\u2620y'.encode('utf-8')
    escaped = re.escape(encoded)
    self.assertEqual(escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
    self.assertMatch(escaped, encoded)
    skull = re.escape('\u2620'.encode('utf-8'))
    occurrences = re.findall(skull, encoded)
    self.assertEqual(len(occurrences), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000761
Serhiy Storchakab85a9762014-09-15 11:33:19 +0300762 def test_pickling(self):
763 import pickle
764 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
765 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
766 pickled = pickle.dumps(oldpat, proto)
767 newpat = pickle.loads(pickled)
768 self.assertEqual(newpat, oldpat)
769 # current pickle expects the _compile() reconstructor in re module
770 from re import _compile
Guido van Rossum23b22571997-07-17 22:36:14 +0000771
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000772 def test_constants(self):
773 self.assertEqual(re.I, re.IGNORECASE)
774 self.assertEqual(re.L, re.LOCALE)
775 self.assertEqual(re.M, re.MULTILINE)
776 self.assertEqual(re.S, re.DOTALL)
777 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000778
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000779 def test_flags(self):
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200780 for flag in [re.I, re.M, re.X, re.S, re.A, re.U]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300781 self.assertTrue(re.compile('^pattern$', flag))
Serhiy Storchaka22a309a2014-12-01 11:50:07 +0200782 for flag in [re.I, re.M, re.X, re.S, re.A, re.L]:
783 self.assertTrue(re.compile(b'^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +0000784
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000785 def test_sre_character_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200786 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
787 if i < 256:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300788 self.assertTrue(re.match(r"\%03o" % i, chr(i)))
789 self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
790 self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
791 self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
792 self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
793 self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200794 if i < 0x10000:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300795 self.assertTrue(re.match(r"\u%04x" % i, chr(i)))
796 self.assertTrue(re.match(r"\u%04x0" % i, chr(i)+"0"))
797 self.assertTrue(re.match(r"\u%04xz" % i, chr(i)+"z"))
798 self.assertTrue(re.match(r"\U%08x" % i, chr(i)))
799 self.assertTrue(re.match(r"\U%08x0" % i, chr(i)+"0"))
800 self.assertTrue(re.match(r"\U%08xz" % i, chr(i)+"z"))
801 self.assertTrue(re.match(r"\0", "\000"))
802 self.assertTrue(re.match(r"\08", "\0008"))
803 self.assertTrue(re.match(r"\01", "\001"))
804 self.assertTrue(re.match(r"\018", "\0018"))
Serhiy Storchakac563caf2014-09-23 23:22:41 +0300805 self.assertRaises(re.error, re.match, r"\567", "")
Antoine Pitrou463badf2012-06-23 13:29:19 +0200806 self.assertRaises(re.error, re.match, r"\911", "")
807 self.assertRaises(re.error, re.match, r"\x1", "")
808 self.assertRaises(re.error, re.match, r"\x1z", "")
809 self.assertRaises(re.error, re.match, r"\u123", "")
810 self.assertRaises(re.error, re.match, r"\u123z", "")
811 self.assertRaises(re.error, re.match, r"\U0001234", "")
812 self.assertRaises(re.error, re.match, r"\U0001234z", "")
813 self.assertRaises(re.error, re.match, r"\U00110000", "")
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000814
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000815 def test_sre_character_class_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200816 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
817 if i < 256:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300818 self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
819 self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
820 self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
821 self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
822 self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
823 self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
824 self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
825 self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200826 if i < 0x10000:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300827 self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
828 self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
829 self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
830 self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
831 self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
832 self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
Serhiy Storchakac563caf2014-09-23 23:22:41 +0300833 self.assertRaises(re.error, re.match, r"[\567]", "")
Antoine Pitrou463badf2012-06-23 13:29:19 +0200834 self.assertRaises(re.error, re.match, r"[\911]", "")
835 self.assertRaises(re.error, re.match, r"[\x1z]", "")
836 self.assertRaises(re.error, re.match, r"[\u123z]", "")
837 self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
838 self.assertRaises(re.error, re.match, r"[\U00110000]", "")
Serhiy Storchakac563caf2014-09-23 23:22:41 +0300839 self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200840
841 def test_sre_byte_literals(self):
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000842 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300843 self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
844 self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
845 self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
846 self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
847 self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
848 self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
849 self.assertTrue(re.match(br"\u", b'u'))
850 self.assertTrue(re.match(br"\U", b'U'))
851 self.assertTrue(re.match(br"\0", b"\000"))
852 self.assertTrue(re.match(br"\08", b"\0008"))
853 self.assertTrue(re.match(br"\01", b"\001"))
854 self.assertTrue(re.match(br"\018", b"\0018"))
Serhiy Storchakac563caf2014-09-23 23:22:41 +0300855 self.assertRaises(re.error, re.match, br"\567", b"")
Antoine Pitrou463badf2012-06-23 13:29:19 +0200856 self.assertRaises(re.error, re.match, br"\911", b"")
857 self.assertRaises(re.error, re.match, br"\x1", b"")
858 self.assertRaises(re.error, re.match, br"\x1z", b"")
859
860 def test_sre_byte_class_literals(self):
861 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300862 self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i])))
863 self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i])))
864 self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i])))
865 self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
866 self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
867 self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
868 self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
869 self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
870 self.assertTrue(re.match(br"[\u]", b'u'))
871 self.assertTrue(re.match(br"[\U]", b'U'))
Serhiy Storchakac563caf2014-09-23 23:22:41 +0300872 self.assertRaises(re.error, re.match, br"[\567]", b"")
Serhiy Storchakacd9032d2014-09-23 23:04:21 +0300873 self.assertRaises(re.error, re.match, br"[\911]", b"")
874 self.assertRaises(re.error, re.match, br"[\x1z]", b"")
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000875
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000876 def test_bug_113254(self):
877 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
878 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
879 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
880
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000881 def test_bug_527371(self):
882 # bug described in patches 527371/672491
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300883 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000884 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
885 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
886 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
887 self.assertEqual(re.match("((a))", "a").lastindex, 1)
888
889 def test_bug_545855(self):
890 # bug 545855 -- This pattern failed to cause a compile error as it
891 # should, instead provoking a TypeError.
892 self.assertRaises(re.error, re.compile, 'foo[a-')
893
894 def test_bug_418626(self):
895 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
896 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
897 # pattern '*?' on a long string.
898 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
899 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
900 20003)
901 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000902 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +0000903 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000904 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000905
906 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000907 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000908 self.assertEqual(re.compile(pat) and 1, 1)
909
Skip Montanaro1e703c62003-04-25 15:40:28 +0000910 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000911 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +0000912 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000913 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
914 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
915 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +0000916
Serhiy Storchakafa468162013-02-16 21:23:53 +0200917 def test_unlimited_zero_width_repeat(self):
918 # Issue #9669
919 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
920 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
921 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
922 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
923 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
924 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
925
Skip Montanaro1e703c62003-04-25 15:40:28 +0000926 def test_scanner(self):
927 def s_ident(scanner, token): return token
928 def s_operator(scanner, token): return "op%s" % token
929 def s_float(scanner, token): return float(token)
930 def s_int(scanner, token): return int(token)
931
932 scanner = Scanner([
933 (r"[a-zA-Z_]\w*", s_ident),
934 (r"\d+\.\d*", s_float),
935 (r"\d+", s_int),
936 (r"=|\+|-|\*|/", s_operator),
937 (r"\s+", None),
938 ])
939
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300940 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000941
Skip Montanaro1e703c62003-04-25 15:40:28 +0000942 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
943 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
944 'op+', 'bar'], ''))
945
Skip Montanaro5ba00542003-04-25 16:00:14 +0000946 def test_bug_448951(self):
947 # bug 448951 (similar to 429357, but with single char match)
948 # (Also test greedy matches.)
949 for op in '','?','*':
950 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
951 (None, None))
952 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
953 ('a:', 'a'))
954
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000955 def test_bug_725106(self):
956 # capturing groups in alternatives in repeats
957 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
958 ('b', 'a'))
959 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
960 ('c', 'b'))
961 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
962 ('b', None))
963 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
964 ('b', None))
965 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
966 ('b', 'a'))
967 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
968 ('c', 'b'))
969 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
970 ('b', None))
971 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
972 ('b', None))
973
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +0000974 def test_bug_725149(self):
975 # mark_stack_base restoring before restoring marks
976 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
977 ('a', None))
978 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
979 ('a', None, None))
980
Just van Rossum12723ba2003-07-02 20:03:04 +0000981 def test_bug_764548(self):
982 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000983 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +0000984 pat = re.compile(my_unicode("abc"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300985 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +0000986
Skip Montanaro5ba00542003-04-25 16:00:14 +0000987 def test_finditer(self):
988 iter = re.finditer(r":+", "a:b::c:::d")
989 self.assertEqual([item.group(0) for item in iter],
990 [":", "::", ":::"])
991
Sean Reifschneider7b3c9752012-03-12 18:22:38 -0600992 pat = re.compile(r":+")
993 iter = pat.finditer("a:b::c:::d", 1, 10)
994 self.assertEqual([item.group(0) for item in iter],
995 [":", "::", ":::"])
996
997 pat = re.compile(r":+")
998 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
999 self.assertEqual([item.group(0) for item in iter],
1000 [":", "::", ":::"])
1001
1002 pat = re.compile(r":+")
1003 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1004 self.assertEqual([item.group(0) for item in iter],
1005 [":", "::", ":::"])
1006
1007 pat = re.compile(r":+")
1008 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1009 self.assertEqual([item.group(0) for item in iter],
1010 ["::", "::"])
1011
Thomas Wouters40a088d2008-03-18 20:19:54 +00001012 def test_bug_926075(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001013 self.assertIsNot(re.compile('bug_926075'),
1014 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +00001015
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001016 def test_bug_931848(self):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001017 pattern = "[\u002E\u3002\uFF0E\uFF61]"
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001018 self.assertEqual(re.compile(pattern).split("a.b.c"),
1019 ['a','b','c'])
1020
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001021 def test_bug_581080(self):
1022 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +00001023 self.assertEqual(next(iter).span(), (1,2))
1024 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001025
1026 scanner = re.compile(r"\s").scanner("a b")
1027 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001028 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001029
1030 def test_bug_817234(self):
1031 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +00001032 self.assertEqual(next(iter).span(), (0, 4))
1033 self.assertEqual(next(iter).span(), (4, 4))
1034 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001035
Mark Dickinson1f268282009-07-28 17:22:36 +00001036 def test_bug_6561(self):
1037 # '\d' should match characters in Unicode category 'Nd'
1038 # (Number, Decimal Digit), but not those in 'Nl' (Number,
1039 # Letter) or 'No' (Number, Other).
1040 decimal_digits = [
1041 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1042 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1043 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1044 ]
1045 for x in decimal_digits:
1046 self.assertEqual(re.match('^\d$', x).group(0), x)
1047
1048 not_decimal_digits = [
1049 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1050 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1051 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1052 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1053 ]
1054 for x in not_decimal_digits:
1055 self.assertIsNone(re.match('^\d$', x))
1056
Guido van Rossumd8faa362007-04-27 19:54:29 +00001057 def test_empty_array(self):
1058 # SF buf 1647541
1059 import array
Guido van Rossum166746c2007-07-03 15:39:16 +00001060 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +00001061 a = array.array(typecode)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001062 self.assertIsNone(re.compile(b"bla").match(a))
Antoine Pitroufd036452008-08-19 17:56:33 +00001063 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001064
Christian Heimes072c0f12008-01-03 23:01:04 +00001065 def test_inline_flags(self):
1066 # Bug #1700
Serhiy Storchakaab140882014-11-11 21:13:28 +02001067 upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
1068 lower_char = '\u1ea1' # Latin Small Letter A with Dot Below
Christian Heimes072c0f12008-01-03 23:01:04 +00001069
1070 p = re.compile(upper_char, re.I | re.U)
1071 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001072 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001073
1074 p = re.compile(lower_char, re.I | re.U)
1075 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001076 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001077
1078 p = re.compile('(?i)' + upper_char, re.U)
1079 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001080 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001081
1082 p = re.compile('(?i)' + lower_char, re.U)
1083 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001084 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001085
1086 p = re.compile('(?iu)' + upper_char)
1087 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001088 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001089
1090 p = re.compile('(?iu)' + lower_char)
1091 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001092 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001093
Christian Heimes25bb7832008-01-11 16:17:00 +00001094 def test_dollar_matches_twice(self):
1095 "$ matches the end of string, and just before the terminating \n"
1096 pattern = re.compile('$')
1097 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1098 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1099 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1100
1101 pattern = re.compile('$', re.MULTILINE)
1102 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1103 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1104 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1105
Antoine Pitroufd036452008-08-19 17:56:33 +00001106 def test_bytes_str_mixing(self):
1107 # Mixing str and bytes is disallowed
1108 pat = re.compile('.')
1109 bpat = re.compile(b'.')
1110 self.assertRaises(TypeError, pat.match, b'b')
1111 self.assertRaises(TypeError, bpat.match, 'b')
1112 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1113 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1114 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1115 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1116 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1117 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1118
1119 def test_ascii_and_unicode_flag(self):
1120 # String patterns
1121 for flags in (0, re.UNICODE):
1122 pat = re.compile('\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001123 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001124 pat = re.compile('\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001125 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001126 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001127 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001128 pat = re.compile('(?a)\xc0', re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001129 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001130 pat = re.compile('\w', re.ASCII)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001131 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001132 pat = re.compile('(?a)\w')
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001133 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001134 # Bytes patterns
1135 for flags in (0, re.ASCII):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001136 pat = re.compile(b'\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001137 self.assertIsNone(pat.match(b'\xe0'))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001138 pat = re.compile(b'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001139 self.assertIsNone(pat.match(b'\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001140 # Incompatibilities
1141 self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
1142 self.assertRaises(ValueError, re.compile, b'(?u)\w')
1143 self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
1144 self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
1145 self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
1146 self.assertRaises(ValueError, re.compile, '(?au)\w')
1147
Serhiy Storchaka22a309a2014-12-01 11:50:07 +02001148 def test_locale_flag(self):
1149 import locale
1150 _, enc = locale.getlocale(locale.LC_CTYPE)
1151 # Search non-ASCII letter
1152 for i in range(128, 256):
1153 try:
1154 c = bytes([i]).decode(enc)
1155 sletter = c.lower()
1156 if sletter == c: continue
1157 bletter = sletter.encode(enc)
1158 if len(bletter) != 1: continue
1159 if bletter.decode(enc) != sletter: continue
1160 bpat = re.escape(bytes([i]))
1161 break
1162 except (UnicodeError, TypeError):
1163 pass
1164 else:
1165 bletter = None
1166 bpat = b'A'
1167 # Bytes patterns
1168 pat = re.compile(bpat, re.LOCALE | re.IGNORECASE)
1169 if bletter:
1170 self.assertTrue(pat.match(bletter))
1171 pat = re.compile(b'(?L)' + bpat, re.IGNORECASE)
1172 if bletter:
1173 self.assertTrue(pat.match(bletter))
1174 pat = re.compile(bpat, re.IGNORECASE)
1175 if bletter:
1176 self.assertIsNone(pat.match(bletter))
1177 pat = re.compile(b'\w', re.LOCALE)
1178 if bletter:
1179 self.assertTrue(pat.match(bletter))
1180 pat = re.compile(b'(?L)\w')
1181 if bletter:
1182 self.assertTrue(pat.match(bletter))
1183 pat = re.compile(b'\w')
1184 if bletter:
1185 self.assertIsNone(pat.match(bletter))
1186 # Incompatibilities
1187 self.assertWarns(DeprecationWarning, re.compile, '', re.LOCALE)
1188 self.assertWarns(DeprecationWarning, re.compile, '(?L)')
1189 self.assertWarns(DeprecationWarning, re.compile, b'', re.LOCALE | re.ASCII)
1190 self.assertWarns(DeprecationWarning, re.compile, b'(?L)', re.ASCII)
1191 self.assertWarns(DeprecationWarning, re.compile, b'(?a)', re.LOCALE)
1192 self.assertWarns(DeprecationWarning, re.compile, b'(?aL)')
1193
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001194 def test_bug_6509(self):
1195 # Replacement strings of both types must parse properly.
1196 # all strings
1197 pat = re.compile('a(\w)')
1198 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1199 pat = re.compile('a(.)')
1200 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1201 pat = re.compile('..')
1202 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1203
1204 # all bytes
1205 pat = re.compile(b'a(\w)')
1206 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1207 pat = re.compile(b'a(.)')
1208 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1209 pat = re.compile(b'..')
1210 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1211
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001212 def test_dealloc(self):
1213 # issue 3299: check for segfault in debug build
1214 import _sre
Ezio Melotti0f77f462010-01-23 10:49:39 +00001215 # the overflow limit is different on wide and narrow builds and it
1216 # depends on the definition of SRE_CODE (see sre.h).
1217 # 2**128 should be big enough to overflow on both. For smaller values
1218 # a RuntimeError is raised instead of OverflowError.
1219 long_overflow = 2**128
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001220 self.assertRaises(TypeError, re.finditer, "a", {})
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001221 with self.assertRaises(OverflowError):
1222 _sre.compile("abc", 0, [long_overflow], 0, [], [])
1223 with self.assertRaises(TypeError):
1224 _sre.compile({}, 0, [], 0, [], [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001226 def test_search_dot_unicode(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001227 self.assertTrue(re.search("123.*-", '123abc-'))
1228 self.assertTrue(re.search("123.*-", '123\xe9-'))
1229 self.assertTrue(re.search("123.*-", '123\u20ac-'))
1230 self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1231 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001232
Ezio Melottidf723e12012-03-13 01:29:48 +02001233 def test_compile(self):
1234 # Test return value when given string and pattern as parameter
1235 pattern = re.compile('random pattern')
1236 self.assertIsInstance(pattern, re._pattern_type)
1237 same_pattern = re.compile(pattern)
1238 self.assertIsInstance(same_pattern, re._pattern_type)
1239 self.assertIs(same_pattern, pattern)
1240 # Test behaviour when not given a string or pattern as parameter
1241 self.assertRaises(TypeError, re.compile, 0)
1242
Ezio Melottife8e6e72013-01-11 08:32:01 +02001243 def test_bug_13899(self):
1244 # Issue #13899: re pattern r"[\A]" should work like "A" but matches
1245 # nothing. Ditto B and Z.
1246 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
1247 ['A', 'B', '\b', 'C', 'Z'])
1248
Antoine Pitroub33941a2012-12-03 20:55:56 +01001249 @bigmemtest(size=_2G, memuse=1)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001250 def test_large_search(self, size):
1251 # Issue #10182: indices were 32-bit-truncated.
1252 s = 'a' * size
1253 m = re.search('$', s)
1254 self.assertIsNotNone(m)
Antoine Pitrou86067c22012-12-03 21:08:43 +01001255 self.assertEqual(m.start(), size)
1256 self.assertEqual(m.end(), size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001257
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001258 # The huge memuse is because of re.sub() using a list and a join()
1259 # to create the replacement result.
Antoine Pitroub33941a2012-12-03 20:55:56 +01001260 @bigmemtest(size=_2G, memuse=16 + 2)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001261 def test_large_subn(self, size):
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001262 # Issue #10182: indices were 32-bit-truncated.
1263 s = 'a' * size
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001264 r, n = re.subn('', '', s)
1265 self.assertEqual(r, s)
1266 self.assertEqual(n, size + 1)
1267
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001268 def test_bug_16688(self):
1269 # Issue 16688: Backreferences make case-insensitive regex fail on
1270 # non-ASCII strings.
1271 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1272 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001273
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001274 def test_repeat_minmax_overflow(self):
1275 # Issue #13169
1276 string = "x" * 100000
1277 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1278 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1279 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1280 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1281 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1282 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1283 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1284 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1285 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1286 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1287 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1288
1289 @cpython_only
1290 def test_repeat_minmax_overflow_maxrepeat(self):
1291 try:
1292 from _sre import MAXREPEAT
1293 except ImportError:
1294 self.skipTest('requires _sre.MAXREPEAT constant')
1295 string = "x" * 100000
1296 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1297 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1298 (0, 100000))
1299 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1300 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1301 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1302 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1303
R David Murray26dfaac92013-04-14 13:00:54 -04001304 def test_backref_group_name_in_exception(self):
1305 # Issue 17341: Poor error message when compiling invalid regex
1306 with self.assertRaisesRegex(sre_constants.error, '<foo>'):
1307 re.compile('(?P=<foo>)')
1308
1309 def test_group_name_in_exception(self):
1310 # Issue 17341: Poor error message when compiling invalid regex
1311 with self.assertRaisesRegex(sre_constants.error, '\?foo'):
1312 re.compile('(?P<?foo>)')
1313
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001314 def test_issue17998(self):
1315 for reps in '*', '+', '?', '{1}':
1316 for mod in '', '?':
1317 pattern = '.' + reps + mod + 'yz'
1318 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1319 ['xyz'], msg=pattern)
1320 pattern = pattern.encode()
1321 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1322 [b'xyz'], msg=pattern)
1323
def test_match_repr(self):
    # repr() of a match object shows its type, its span and the matched
    # text, for str-like and bytes-like subjects alike.
    for subject in '[abracadabra]', S('[abracadabra]'):
        m = re.search(r'(.+)(.*?)\1', subject)
        self.assertEqual(repr(m), "<%s.%s object; "
                                  "span=(1, 12), match='abracadabra'>" %
                                  (type(m).__module__, type(m).__qualname__))
    for subject in (b'[abracadabra]', B(b'[abracadabra]'),
                    bytearray(b'[abracadabra]'),
                    memoryview(b'[abracadabra]')):
        m = re.search(rb'(.+)(.*?)\1', subject)
        self.assertEqual(repr(m), "<%s.%s object; "
                                  "span=(1, 12), match=b'abracadabra'>" %
                                  (type(m).__module__, type(m).__qualname__))

    # Unpacking the iterator directly also checks that there are
    # exactly two matches.
    first, second = re.finditer("(aa)|(bb)", "aa bb")
    # Each expected repr is built from the type of the match it
    # describes.
    self.assertEqual(repr(first), "<%s.%s object; "
                                  "span=(0, 2), match='aa'>" %
                                  (type(first).__module__, type(first).__qualname__))
    self.assertEqual(repr(second), "<%s.%s object; "
                                   "span=(3, 5), match='bb'>" %
                                   (type(second).__module__, type(second).__qualname__))
1345
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001346
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001347 def test_bug_2537(self):
1348 # issue 2537: empty submatches
1349 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1350 for inner_op in ('{0,}', '*', '?'):
1351 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1352 m = r.match("xyyzy")
1353 self.assertEqual(m.group(0), "xyy")
1354 self.assertEqual(m.group(1), "")
1355 self.assertEqual(m.group(2), "y")
1356
def test_debug_flag(self):
    # Compiling with re.DEBUG writes a dump of the parsed pattern to
    # stdout; capture it and compare it with the expected text.
    pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
    with captured_stdout() as out:
        re.compile(pat, re.DEBUG)
    # NOTE(review): the nesting inside the expected dump is written with
    # two spaces per level -- confirm against the actual debug output.
    dump = '''\
SUBPATTERN 1
  LITERAL 46
SUBPATTERN None
  BRANCH
    IN
      LITERAL 99
      LITERAL 104
  OR
    LITERAL 112
    LITERAL 121
SUBPATTERN None
  GROUPREF_EXISTS 1
    AT AT_END
  ELSE
    LITERAL 58
    LITERAL 32
'''
    self.assertEqual(out.getvalue(), dump)
    # Debug output is output again even a second time (bypassing
    # the cache -- issue #20426).
    with captured_stdout() as out:
        re.compile(pat, re.DEBUG)
    self.assertEqual(out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001385
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001386 def test_keyword_parameters(self):
1387 # Issue #20283: Accepting the string keyword parameter.
1388 pat = re.compile(r'(ab)')
1389 self.assertEqual(
1390 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1391 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001392 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1393 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001394 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1395 self.assertEqual(
1396 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1397 self.assertEqual(
1398 pat.split(string='abracadabra', maxsplit=1),
1399 ['', 'ab', 'racadabra'])
1400 self.assertEqual(
1401 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1402 (7, 9))
1403
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001404 def test_bug_20998(self):
1405 # Issue #20998: Fullmatch of repeated single character pattern
1406 # with ignore case.
1407 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1408
def test_locale_caching(self):
    # Issue #22410
    saved = locale.setlocale(locale.LC_CTYPE)
    self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved)
    # Both locales are needed; skip the test if either is missing.
    for loc in 'en_US.iso88591', 'en_US.utf8':
        try:
            locale.setlocale(locale.LC_CTYPE, loc)
        except locale.Error:
            # Unsupported locale on this system
            self.skipTest('test needs %s locale' % loc)

    # Run the two locale checks in both orders, each time starting
    # from an empty pattern cache.
    orders = (
        (self.check_en_US_iso88591, self.check_en_US_utf8),
        (self.check_en_US_utf8, self.check_en_US_iso88591),
    )
    for checks in orders:
        re.purge()
        for check in checks:
            check()
1426
def check_en_US_iso88591(self):
    # Under en_US.iso88591, bytes 0xc5 and 0xe5 are expected to match
    # each other case-insensitively with the LOCALE flag.
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
    # LOCALE and IGNORECASE passed as flags.
    flag_cases = (
        (b'\xc5\xe5', b'\xc5\xe5'),
        (b'\xc5', b'\xe5'),
        (b'\xe5', b'\xc5'),
    )
    for pattern, subject in flag_cases:
        self.assertTrue(re.match(pattern, subject, re.L|re.I))
    # The same flags given inline.
    inline_cases = (
        (b'(?Li)\xc5\xe5', b'\xc5\xe5'),
        (b'(?Li)\xc5', b'\xe5'),
        (b'(?Li)\xe5', b'\xc5'),
    )
    for pattern, subject in inline_cases:
        self.assertTrue(re.match(pattern, subject))
1435
def check_en_US_utf8(self):
    # Under en_US.utf8, bytes 0xc5 and 0xe5 only match themselves, even
    # with LOCALE and IGNORECASE enabled.
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
    # LOCALE and IGNORECASE passed as flags: (pattern, subject, matches).
    flag_cases = (
        (b'\xc5\xe5', b'\xc5\xe5', True),
        (b'\xc5', b'\xe5', False),
        (b'\xe5', b'\xc5', False),
    )
    for pattern, subject, matches in flag_cases:
        m = re.match(pattern, subject, re.L|re.I)
        if matches:
            self.assertTrue(m)
        else:
            self.assertIsNone(m)
    # The same flags given inline.
    inline_cases = (
        (b'(?Li)\xc5\xe5', b'\xc5\xe5', True),
        (b'(?Li)\xc5', b'\xe5', False),
        (b'(?Li)\xe5', b'\xc5', False),
    )
    for pattern, subject, matches in inline_cases:
        m = re.match(pattern, subject)
        if matches:
            self.assertTrue(m)
        else:
            self.assertIsNone(m)
1444
Serhiy Storchakaad446d52014-11-10 13:49:00 +02001445 def test_error(self):
1446 with self.assertRaises(re.error) as cm:
1447 re.compile('(\u20ac))')
1448 err = cm.exception
1449 self.assertIsInstance(err.pattern, str)
1450 self.assertEqual(err.pattern, '(\u20ac))')
1451 self.assertEqual(err.pos, 3)
1452 self.assertEqual(err.lineno, 1)
1453 self.assertEqual(err.colno, 4)
1454 self.assertIn(err.msg, str(err))
1455 self.assertIn(' at position 3', str(err))
1456 self.assertNotIn(' at position 3', err.msg)
1457 # Bytes pattern
1458 with self.assertRaises(re.error) as cm:
1459 re.compile(b'(\xa4))')
1460 err = cm.exception
1461 self.assertIsInstance(err.pattern, bytes)
1462 self.assertEqual(err.pattern, b'(\xa4))')
1463 self.assertEqual(err.pos, 3)
1464 # Multiline pattern
1465 with self.assertRaises(re.error) as cm:
1466 re.compile("""
1467 (
1468 abc
1469 )
1470 )
1471 (
1472 """, re.VERBOSE)
1473 err = cm.exception
1474 self.assertEqual(err.pos, 77)
1475 self.assertEqual(err.lineno, 5)
1476 self.assertEqual(err.colno, 17)
1477 self.assertIn(err.msg, str(err))
1478 self.assertIn(' at position 77', str(err))
1479 self.assertIn('(line 5, column 17)', str(err))
1480
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001481
class PatternReprTests(unittest.TestCase):
    # Tests for repr() of compiled pattern objects.

    def check(self, pattern, expected):
        # repr() of *pattern* compiled without explicit flags.
        compiled = re.compile(pattern)
        self.assertEqual(repr(compiled), expected)

    def check_flags(self, pattern, flags, expected):
        # repr() of *pattern* compiled with *flags*.
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        expected = "re.compile('random pattern')"
        self.check('random pattern', expected)

    def test_single_flag(self):
        expected = "re.compile('random pattern', re.IGNORECASE)"
        self.check_flags('random pattern', re.IGNORECASE, expected)

    def test_multiple_flags(self):
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.check_flags('random pattern', re.I|re.S|re.X, expected)

    def test_unicode_flag(self):
        # re.UNICODE does not show up in the repr of a str pattern.
        expected = "re.compile('random pattern')"
        self.check_flags('random pattern', re.U, expected)
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL)")
        self.check_flags('random pattern', re.I|re.S|re.U, expected)

    def test_inline_flags(self):
        # Flags set inside the pattern are reported like explicit ones.
        expected = "re.compile('(?i)pattern', re.IGNORECASE)"
        self.check('(?i)pattern', expected)

    def test_unknown_flags(self):
        # Bits without a flag name are shown in hexadecimal.
        expected = "re.compile('random pattern', 0x123000)"
        self.check_flags('random pattern', 0x123000, expected)
        expected = "re.compile('random pattern', re.IGNORECASE|0x123000)"
        self.check_flags('random pattern', 0x123000|re.I, expected)

    def test_bytes(self):
        expected = "re.compile(b'bytes pattern')"
        self.check(b'bytes pattern', expected)
        expected = "re.compile(b'bytes pattern', re.ASCII)"
        self.check_flags(b'bytes pattern', re.A, expected)

    def test_locale(self):
        expected = "re.compile(b'bytes pattern', re.LOCALE)"
        self.check_flags(b'bytes pattern', re.L, expected)

    def test_quotes(self):
        # (pattern, expected repr) for the different quoting situations.
        cases = (
            ('random "double quoted" pattern',
             '''re.compile('random "double quoted" pattern')'''),
            ("random 'single quoted' pattern",
             '''re.compile("random 'single quoted' pattern")'''),
            ('''both 'single' and "double" quotes''',
             '''re.compile('both \\'single\\' and "double" quotes')'''),
        )
        for pattern, expected in cases:
            self.check(pattern, expected)

    def test_long_pattern(self):
        # The repr of a very long pattern is truncated, but still starts
        # with the pattern and still ends with the flags.
        pattern = 'Very %spattern' % ('long ' * 1000)
        head = "re.compile('Very long long lon"

        shown = repr(re.compile(pattern))
        self.assertLess(len(shown), 300)
        self.assertEqual(shown[:30], head)

        shown = repr(re.compile(pattern, re.I))
        self.assertLess(len(shown), 300)
        self.assertEqual(shown[:30], head)
        self.assertEqual(shown[-16:], ", re.IGNORECASE)")
1546
1547
class ImplementationTest(unittest.TestCase):
    """
    Tests for implementation details of the re module.
    """

    def test_overlap_table(self):
        # (input, expected result) pairs for
        # sre_compile._generate_overlap_table().
        cases = (
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        )
        make_table = sre_compile._generate_overlap_table
        for prefix, expected in cases:
            self.assertEqual(make_table(prefix), expected)
1561
1562
class ExternalTests(unittest.TestCase):
    # Data-driven tests: the patterns, subjects and expected outcomes
    # are imported from test.re_tests.

    def test_re_benchmarks(self):
        're_tests benchmarks'
        from test.re_tests import benchmarks
        for pattern, s in benchmarks:
            with self.subTest(pattern=pattern, string=s):
                p = re.compile(pattern)
                # The subject on its own must be found by search(),
                # match() and fullmatch().
                self.assertTrue(p.search(s))
                self.assertTrue(p.match(s))
                self.assertTrue(p.fullmatch(s))
                # Same subject padded with 10000 spaces on both sides:
                # it must still be found, also when pos/endpos delimit
                # exactly the original subject.
                s2 = ' '*10000 + s + ' '*10000
                self.assertTrue(p.search(s2))
                self.assertTrue(p.match(s2, 10000))
                self.assertTrue(p.match(s2, 10000, 10000 + len(s)))
                self.assertTrue(p.fullmatch(s2, 10000, 10000 + len(s)))

    def test_re_tests(self):
        're_tests test suite'
        from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
        for t in tests:
            # Each entry is (pattern, s, outcome) or
            # (pattern, s, outcome, repl, expected).
            pattern = s = outcome = repl = expected = None
            if len(t) == 5:
                pattern, s, outcome, repl, expected = t
            elif len(t) == 3:
                pattern, s, outcome = t
            else:
                raise ValueError('Test tuples should have 3 or 5 fields', t)

            with self.subTest(pattern=pattern, string=s):
                if outcome == SYNTAX_ERROR:  # Expected a syntax error
                    with self.assertRaises(re.error):
                        re.compile(pattern)
                    continue

                obj = re.compile(pattern)
                result = obj.search(s)
                if outcome == FAIL:
                    self.assertIsNone(result, 'Succeeded incorrectly')
                    continue

                with self.subTest():
                    self.assertTrue(result, 'Failed incorrectly')
                    # Matched, as expected, so now we compute the
                    # result string and compare it to our expected result.
                    start, end = result.span(0)
                    # Names made available to the repl expression
                    # evaluated below.
                    vardict = {'found': result.group(0),
                               'groups': result.group(),
                               'flags': result.re.flags}
                    # g1 .. g99: text of each numbered group, "None" for
                    # a group that did not participate, "Error" for a
                    # group number the pattern does not have.
                    for i in range(1, 100):
                        try:
                            gi = result.group(i)
                            # Special hack because else the string concat fails:
                            if gi is None:
                                gi = "None"
                        except IndexError:
                            gi = "Error"
                        vardict['g%d' % i] = gi
                    # Named groups are exposed under their own names.
                    for i in result.re.groupindex.keys():
                        try:
                            gi = result.group(i)
                            if gi is None:
                                gi = "None"
                        except IndexError:
                            gi = "Error"
                        vardict[i] = gi
                    # repl comes from the test data and is evaluated as
                    # a Python expression with vardict as its globals.
                    self.assertEqual(eval(repl, vardict), expected,
                                     'grouping error')

                # Try the match with both pattern and string converted to
                # bytes, and check that it still succeeds.
                try:
                    bpat = bytes(pattern, "ascii")
                    bs = bytes(s, "ascii")
                except UnicodeEncodeError:
                    # skip non-ascii tests
                    pass
                else:
                    with self.subTest('bytes pattern match'):
                        obj = re.compile(bpat)
                        self.assertTrue(obj.search(bs))

                    # Try the match with LOCALE enabled, and check that it
                    # still succeeds.
                    # NOTE(review): a failure here is only printed, not
                    # asserted, and `result` is rebound to this search,
                    # which the range-limited check below then tests.
                    with self.subTest('locale-sensitive match'):
                        obj = re.compile(bpat, re.LOCALE)
                        result = obj.search(bs)
                        if result is None:
                            print('=== Fails on locale-sensitive match', t)

                # Try the match with the search area limited to the extent
                # of the match and see if it still succeeds.  \B will
                # break (because it won't match at the end or start of a
                # string), so we'll ignore patterns that feature it.
                if (pattern[:2] != r'\B' and pattern[-2:] != r'\B'
                            and result is not None):
                    with self.subTest('range-limited match'):
                        obj = re.compile(pattern)
                        self.assertTrue(obj.search(s, start, end + 1))

                # Try the match with IGNORECASE enabled, and check that it
                # still succeeds.
                with self.subTest('case-insensitive match'):
                    obj = re.compile(pattern, re.IGNORECASE)
                    self.assertTrue(obj.search(s))

                # Try the match with UNICODE locale enabled, and check
                # that it still succeeds.
                with self.subTest('unicode-sensitive match'):
                    obj = re.compile(pattern, re.UNICODE)
                    self.assertTrue(obj.search(s))
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001674
Gregory P. Smith5a631832010-07-27 05:31:29 +00001675
# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()