blob: 7bc1e935d4683b76d7db77a287c88ffa71741f16 [file] [log] [blame]
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001from test.support import verbose, run_unittest, gc_collect, bigmemtest, _2G, \
Antoine Pitroud2cc7432014-02-03 20:59:59 +01002 cpython_only, captured_stdout
Benjamin Petersone48944b2012-03-07 14:50:25 -06003import io
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02004import locale
Guido van Rossum8e0ce301997-07-11 19:34:44 +00005import re
Thomas Wouters9ada3d62006-04-21 09:47:09 +00006from re import Scanner
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02007import sre_compile
R David Murray26dfaac92013-04-14 13:00:54 -04008import sre_constants
Ezio Melottid2114eb2011-03-25 14:08:44 +02009import sys
10import string
11import traceback
Antoine Pitrou79aa68d2013-10-25 21:36:10 +020012import unittest
Raymond Hettinger027bb632004-05-31 03:09:25 +000013from weakref import proxy
Guido van Rossum8e0ce301997-07-11 19:34:44 +000014
Guido van Rossum23b22571997-07-17 22:36:14 +000015# Misc tests from Tim Peters' re.doc
16
Just van Rossum6802c6e2003-07-02 14:36:59 +000017# WARNING: Don't change details in these tests if you don't know
Ezio Melotti42da6632011-03-15 05:18:48 +020018# what you're doing. Some of these tests were carefully modeled to
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +000019# cover most of the code.
20
class S(str):
    # str subclass whose item/slice access hands back S instances, so the
    # tests can check that re returns plain str for subclass input.
    def __getitem__(self, index):
        item = super().__getitem__(index)
        return S(item)
24
class B(bytes):
    # bytes subclass whose item/slice access is re-wrapped in B.
    def __getitem__(self, index):
        item = super().__getitem__(index)
        return B(item)
28
Skip Montanaro8ed06da2003-04-24 19:43:18 +000029class ReTests(unittest.TestCase):
Raymond Hettinger027bb632004-05-31 03:09:25 +000030
Serhiy Storchaka25324972013-10-16 12:46:28 +030031 def assertTypedEqual(self, actual, expect, msg=None):
32 self.assertEqual(actual, expect, msg)
33 def recurse(actual, expect):
34 if isinstance(expect, (tuple, list)):
35 for x, y in zip(actual, expect):
36 recurse(x, y)
37 else:
38 self.assertIs(type(actual), type(expect), msg)
39 recurse(actual, expect)
40
    def test_keep_buffer(self):
        # See bug 14212
        # While the finditer() iterator over the bytearray exists, resizing
        # the bytearray is expected to raise BufferError.
        b = bytearray(b'x')
        it = re.finditer(b'a', b)
        with self.assertRaises(BufferError):
            b.extend(b'x'*400)
        # Exhaust and drop the iterator; gc_collect() presumably makes sure
        # it is really gone before the second resize attempt.
        list(it)
        del it
        gc_collect()
        # The same extend() is now called outside assertRaises, i.e. it is
        # expected to succeed.
        b.extend(b'x'*400)
51
Raymond Hettinger027bb632004-05-31 03:09:25 +000052 def test_weakref(self):
53 s = 'QabbbcR'
54 x = re.compile('ab+c')
55 y = proxy(x)
56 self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR'))
57
Skip Montanaro8ed06da2003-04-24 19:43:18 +000058 def test_search_star_plus(self):
59 self.assertEqual(re.search('x*', 'axx').span(0), (0, 0))
60 self.assertEqual(re.search('x*', 'axx').span(), (0, 0))
61 self.assertEqual(re.search('x+', 'axx').span(0), (1, 3))
62 self.assertEqual(re.search('x+', 'axx').span(), (1, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030063 self.assertIsNone(re.search('x', 'aaa'))
Skip Montanaro8ed06da2003-04-24 19:43:18 +000064 self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0))
65 self.assertEqual(re.match('a*', 'xxx').span(), (0, 0))
66 self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3))
67 self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +030068 self.assertIsNone(re.match('a+', 'xxx'))
Guido van Rossum8430c581998-04-03 21:47:12 +000069
Skip Montanaro8ed06da2003-04-24 19:43:18 +000070 def bump_num(self, matchobj):
Guido van Rossum41360a41998-03-26 19:42:58 +000071 int_value = int(matchobj.group(0))
72 return str(int_value + 1)
Guido van Rossum23b22571997-07-17 22:36:14 +000073
Skip Montanaro8ed06da2003-04-24 19:43:18 +000074 def test_basic_re_sub(self):
Serhiy Storchaka25324972013-10-16 12:46:28 +030075 self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
76 self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
77 self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
78 self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
79 self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
80 self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
Serhiy Storchaka9eabac62013-10-26 10:45:48 +030081 for y in ("\xe0", "\u0430", "\U0001d49c"):
82 self.assertEqual(re.sub(y, 'a', 'x%sz' % y), 'xaz')
Serhiy Storchaka25324972013-10-16 12:46:28 +030083
Skip Montanaro8ed06da2003-04-24 19:43:18 +000084 self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
85 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
86 '9.3 -3 24x100y')
Victor Stinner55e614a2014-10-29 16:58:59 +010087 self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', count=3),
Skip Montanaro8ed06da2003-04-24 19:43:18 +000088 '9.3 -3 23x99y')
Fredrik Lundh1151a8c2000-08-08 16:47:42 +000089
Skip Montanaro8ed06da2003-04-24 19:43:18 +000090 self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n')
91 self.assertEqual(re.sub('.', r"\n", 'x'), '\n')
Guido van Rossumdfa67901997-12-08 17:12:06 +000092
Skip Montanaro8ed06da2003-04-24 19:43:18 +000093 s = r"\1\1"
94 self.assertEqual(re.sub('(.)', s, 'x'), 'xx')
95 self.assertEqual(re.sub('(.)', re.escape(s), 'x'), s)
96 self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s)
Guido van Rossum23b22571997-07-17 22:36:14 +000097
Skip Montanaro8ed06da2003-04-24 19:43:18 +000098 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx')
99 self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx')
100 self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
101 self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')
Guido van Rossum49946571997-07-18 04:26:25 +0000102
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000103 self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
104 '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
105 self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
106 self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
107 (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))
Guido van Rossum95e80531997-08-13 22:34:14 +0000108
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000109 self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
Guido van Rossume056e4d2001-08-10 14:52:48 +0000110
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000111 def test_bug_449964(self):
112 # fails for group followed by other escape
113 self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'),
114 'xx\bxx\b')
115
116 def test_bug_449000(self):
117 # Test for sub() on escaped characters
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000118 self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
119 'abc\ndef\n')
120 self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
121 'abc\ndef\n')
122 self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
123 'abc\ndef\n')
124 self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
125 'abc\ndef\n')
Guido van Rossum23b22571997-07-17 22:36:14 +0000126
Christian Heimes5fb7c2a2007-12-24 08:52:31 +0000127 def test_bug_1661(self):
128 # Verify that flags do not get silently ignored with compiled patterns
129 pattern = re.compile('.')
130 self.assertRaises(ValueError, re.match, pattern, 'A', re.I)
131 self.assertRaises(ValueError, re.search, pattern, 'A', re.I)
132 self.assertRaises(ValueError, re.findall, pattern, 'A', re.I)
133 self.assertRaises(ValueError, re.compile, pattern, re.I)
134
Guido van Rossum92f8f3e2008-09-10 14:30:50 +0000135 def test_bug_3629(self):
136 # A regex that triggered a bug in the sre-code validator
137 re.compile("(?P<quote>)(?(quote))")
138
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000139 def test_sub_template_numeric_escape(self):
140 # bug 776311 and friends
141 self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
142 self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
143 self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
144 self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
145 self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
146 self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
147 self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
148
149 self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
150 self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
151
152 self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
153 self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
154 self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
155 self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
156 self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
157
Serhiy Storchakac563caf2014-09-23 23:22:41 +0300158 self.assertRaises(re.error, re.sub, 'x', r'\400', 'x')
159 self.assertRaises(re.error, re.sub, 'x', r'\777', 'x')
Tim Peters0e9980f2004-09-12 03:49:31 +0000160
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000161 self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
162 self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
163 self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
164 self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
165 self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
166 self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
167 self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
168 self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
169 self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
170 self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
171 self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
172 self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
173
174 # in python2.3 (etc), these loop endlessly in sre_parser.py
175 self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
176 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
177 'xz8')
178 self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
179 'xza')
180
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000181 def test_qualified_re_sub(self):
182 self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
Victor Stinner55e614a2014-10-29 16:58:59 +0100183 self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa')
Guido van Rossum8430c581998-04-03 21:47:12 +0000184
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000185 def test_bug_114660(self):
186 self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
187 'hello there')
188
189 def test_bug_462270(self):
190 # Test for empty sub() behaviour, see SF bug #462270
191 self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
192 self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
193
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200194 def test_symbolic_groups(self):
195 re.compile('(?P<a>x)(?P=a)(?(a)y)')
196 re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300197 re.compile('(?P<a1>x)\1(?(1)y)')
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200198 self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
199 self.assertRaises(re.error, re.compile, '(?Px)')
200 self.assertRaises(re.error, re.compile, '(?P=)')
201 self.assertRaises(re.error, re.compile, '(?P=1)')
202 self.assertRaises(re.error, re.compile, '(?P=a)')
203 self.assertRaises(re.error, re.compile, '(?P=a1)')
204 self.assertRaises(re.error, re.compile, '(?P=a.)')
205 self.assertRaises(re.error, re.compile, '(?P<)')
206 self.assertRaises(re.error, re.compile, '(?P<>)')
207 self.assertRaises(re.error, re.compile, '(?P<1>)')
208 self.assertRaises(re.error, re.compile, '(?P<a.>)')
209 self.assertRaises(re.error, re.compile, '(?())')
210 self.assertRaises(re.error, re.compile, '(?(a))')
211 self.assertRaises(re.error, re.compile, '(?(1a))')
212 self.assertRaises(re.error, re.compile, '(?(a.))')
Georg Brandl1d472b72013-04-14 11:40:00 +0200213 # New valid/invalid identifiers in Python 3
214 re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
215 re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
216 self.assertRaises(re.error, re.compile, '(?P<©>x)')
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300217 # Support > 100 groups.
218 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
219 pat = '(?:%s)(?(200)z|t)' % pat
220 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200221
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000222 def test_symbolic_refs(self):
223 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
224 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx')
225 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx')
226 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx')
Ezio Melotti0941d9f2012-11-03 20:33:08 +0200227 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<>', 'xx')
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000228 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx')
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300229 self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<2>', 'xx')
230 self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\2', 'xx')
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000231 self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300232 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
233 self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000234 self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
Georg Brandl1d472b72013-04-14 11:40:00 +0200235 # New valid/invalid identifiers in Python 3
236 self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
237 self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
238 self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx')
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300239 # Support > 100 groups.
240 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
241 self.assertEqual(re.sub(pat, '\g<200>', 'xc8yzxc8y'), 'c8zc8')
Guido van Rossumf473cb01998-01-14 16:42:17 +0000242
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000243 def test_re_subn(self):
244 self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
245 self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
246 self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
247 self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
Victor Stinner55e614a2014-10-29 16:58:59 +0100248 self.assertEqual(re.subn("b*", "x", "xyz", count=2), ('xxxyz', 2))
Guido van Rossum49946571997-07-18 04:26:25 +0000249
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000250 def test_re_split(self):
Serhiy Storchaka25324972013-10-16 12:46:28 +0300251 for string in ":a:b::c", S(":a:b::c"):
252 self.assertTypedEqual(re.split(":", string),
253 ['', 'a', 'b', '', 'c'])
254 self.assertTypedEqual(re.split(":*", string),
255 ['', 'a', 'b', 'c'])
256 self.assertTypedEqual(re.split("(:*)", string),
257 ['', ':', 'a', ':', 'b', '::', 'c'])
258 for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
259 memoryview(b":a:b::c")):
260 self.assertTypedEqual(re.split(b":", string),
261 [b'', b'a', b'b', b'', b'c'])
262 self.assertTypedEqual(re.split(b":*", string),
263 [b'', b'a', b'b', b'c'])
264 self.assertTypedEqual(re.split(b"(:*)", string),
265 [b'', b':', b'a', b':', b'b', b'::', b'c'])
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300266 for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
267 "\U0001d49c\U0001d49e\U0001d4b5"):
268 string = ":%s:%s::%s" % (a, b, c)
269 self.assertEqual(re.split(":", string), ['', a, b, '', c])
270 self.assertEqual(re.split(":*", string), ['', a, b, c])
271 self.assertEqual(re.split("(:*)", string),
272 ['', ':', a, ':', b, '::', c])
Serhiy Storchaka25324972013-10-16 12:46:28 +0300273
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000274 self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
275 self.assertEqual(re.split("(:)*", ":a:b::c"),
276 ['', ':', 'a', ':', 'b', ':', 'c'])
277 self.assertEqual(re.split("([b:]+)", ":a:b::c"),
278 ['', ':', 'a', ':b::', 'c'])
279 self.assertEqual(re.split("(b)|(:+)", ":a:b::c"),
280 ['', None, ':', 'a', None, ':', '', 'b', None, '',
281 None, '::', 'c'])
282 self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
283 ['', 'a', '', '', 'c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000284
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000285 def test_qualified_re_split(self):
Victor Stinner55e614a2014-10-29 16:58:59 +0100286 self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
287 self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
288 self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000289 ['', ':', 'a', ':', 'b::c'])
Victor Stinner55e614a2014-10-29 16:58:59 +0100290 self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000291 ['', ':', 'a', ':', 'b::c'])
Guido van Rossum49946571997-07-18 04:26:25 +0000292
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000293 def test_re_findall(self):
294 self.assertEqual(re.findall(":+", "abc"), [])
Serhiy Storchaka25324972013-10-16 12:46:28 +0300295 for string in "a:b::c:::d", S("a:b::c:::d"):
296 self.assertTypedEqual(re.findall(":+", string),
297 [":", "::", ":::"])
298 self.assertTypedEqual(re.findall("(:+)", string),
299 [":", "::", ":::"])
300 self.assertTypedEqual(re.findall("(:)(:*)", string),
301 [(":", ""), (":", ":"), (":", "::")])
302 for string in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
303 memoryview(b"a:b::c:::d")):
304 self.assertTypedEqual(re.findall(b":+", string),
305 [b":", b"::", b":::"])
306 self.assertTypedEqual(re.findall(b"(:+)", string),
307 [b":", b"::", b":::"])
308 self.assertTypedEqual(re.findall(b"(:)(:*)", string),
309 [(b":", b""), (b":", b":"), (b":", b"::")])
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300310 for x in ("\xe0", "\u0430", "\U0001d49c"):
311 xx = x * 2
312 xxx = x * 3
313 string = "a%sb%sc%sd" % (x, xx, xxx)
314 self.assertEqual(re.findall("%s+" % x, string), [x, xx, xxx])
315 self.assertEqual(re.findall("(%s+)" % x, string), [x, xx, xxx])
316 self.assertEqual(re.findall("(%s)(%s*)" % (x, x), string),
317 [(x, ""), (x, x), (x, xx)])
Guido van Rossum49946571997-07-18 04:26:25 +0000318
Skip Montanaro5ba00542003-04-25 16:00:14 +0000319 def test_bug_117612(self):
320 self.assertEqual(re.findall(r"(a|(b))", "aba"),
321 [("a", ""),("b", "b"),("a", "")])
322
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000323 def test_re_match(self):
Serhiy Storchaka25324972013-10-16 12:46:28 +0300324 for string in 'a', S('a'):
325 self.assertEqual(re.match('a', string).groups(), ())
326 self.assertEqual(re.match('(a)', string).groups(), ('a',))
327 self.assertEqual(re.match('(a)', string).group(0), 'a')
328 self.assertEqual(re.match('(a)', string).group(1), 'a')
329 self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a'))
330 for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
331 self.assertEqual(re.match(b'a', string).groups(), ())
332 self.assertEqual(re.match(b'(a)', string).groups(), (b'a',))
333 self.assertEqual(re.match(b'(a)', string).group(0), b'a')
334 self.assertEqual(re.match(b'(a)', string).group(1), b'a')
335 self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
Serhiy Storchaka9eabac62013-10-26 10:45:48 +0300336 for a in ("\xe0", "\u0430", "\U0001d49c"):
337 self.assertEqual(re.match(a, a).groups(), ())
338 self.assertEqual(re.match('(%s)' % a, a).groups(), (a,))
339 self.assertEqual(re.match('(%s)' % a, a).group(0), a)
340 self.assertEqual(re.match('(%s)' % a, a).group(1), a)
341 self.assertEqual(re.match('(%s)' % a, a).group(1, 1), (a, a))
Guido van Rossum49946571997-07-18 04:26:25 +0000342
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000343 pat = re.compile('((a)|(b))(c)?')
344 self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
345 self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None))
346 self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c'))
347 self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c'))
348 self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c'))
Guido van Rossum8430c581998-04-03 21:47:12 +0000349
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000350 # A single group
351 m = re.match('(a)', 'a')
352 self.assertEqual(m.group(0), 'a')
353 self.assertEqual(m.group(0), 'a')
354 self.assertEqual(m.group(1), 'a')
355 self.assertEqual(m.group(1, 1), ('a', 'a'))
Guido van Rossum49946571997-07-18 04:26:25 +0000356
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000357 pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
358 self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
359 self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
360 (None, 'b', None))
361 self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
Guido van Rossum49946571997-07-18 04:26:25 +0000362
Serhiy Storchaka32eddc12013-11-23 23:20:30 +0200363 def test_re_fullmatch(self):
364 # Issue 16203: Proposal: add re.fullmatch() method.
365 self.assertEqual(re.fullmatch(r"a", "a").span(), (0, 1))
366 for string in "ab", S("ab"):
367 self.assertEqual(re.fullmatch(r"a|ab", string).span(), (0, 2))
368 for string in b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab"):
369 self.assertEqual(re.fullmatch(br"a|ab", string).span(), (0, 2))
370 for a, b in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
371 r = r"%s|%s" % (a, a + b)
372 self.assertEqual(re.fullmatch(r, a + b).span(), (0, 2))
373 self.assertEqual(re.fullmatch(r".*?$", "abc").span(), (0, 3))
374 self.assertEqual(re.fullmatch(r".*?", "abc").span(), (0, 3))
375 self.assertEqual(re.fullmatch(r"a.*?b", "ab").span(), (0, 2))
376 self.assertEqual(re.fullmatch(r"a.*?b", "abb").span(), (0, 3))
377 self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
378 self.assertIsNone(re.fullmatch(r"a+", "ab"))
379 self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
380 self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
381 self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
382 self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
383 self.assertEqual(re.fullmatch(r"ab(?<=b)cd", "abcd").span(), (0, 4))
384 self.assertEqual(re.fullmatch(r"(?=a|ab)ab", "ab").span(), (0, 2))
385
386 self.assertEqual(
387 re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
388 self.assertEqual(
389 re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
390 self.assertEqual(
391 re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span(), (1, 3))
392
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000393 def test_re_groupref_exists(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000394 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
395 ('(', 'a'))
396 self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
397 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300398 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'))
399 self.assertIsNone(re.match('^(\()?([^()]+)(?(1)\))$', '(a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000400 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(),
401 ('a', 'b'))
402 self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(),
403 (None, 'd'))
404 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(),
405 (None, 'd'))
406 self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(),
407 ('a', ''))
408
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000409 # Tests for bug #1177831: exercise groups other than the first group
410 p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
411 self.assertEqual(p.match('abc').groups(),
412 ('a', 'b', 'c'))
413 self.assertEqual(p.match('ad').groups(),
414 ('a', None, 'd'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300415 self.assertIsNone(p.match('abd'))
416 self.assertIsNone(p.match('ac'))
Michael W. Hudsone7fa1af2005-06-03 13:55:58 +0000417
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +0300418 # Support > 100 groups.
419 pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
420 pat = '(?:%s)(?(200)z)' % pat
421 self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
Andrew M. Kuchling3554cad2005-06-02 13:38:45 +0000422
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000423 def test_re_groupref(self):
424 self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
425 ('|', 'a'))
426 self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(),
427 (None, 'a'))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300428 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', 'a|'))
429 self.assertIsNone(re.match(r'^(\|)?([^()]+)\1$', '|a'))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000430 self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(),
431 ('a', 'a'))
432 self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(),
433 (None, None))
434
435 def test_groupdict(self):
436 self.assertEqual(re.match('(?P<first>first) (?P<second>second)',
437 'first second').groupdict(),
438 {'first':'first', 'second':'second'})
439
440 def test_expand(self):
441 self.assertEqual(re.match("(?P<first>first) (?P<second>second)",
442 "first second")
443 .expand(r"\2 \1 \g<second> \g<first>"),
444 "second first second first")
Serhiy Storchaka7438e4b2014-10-10 11:06:31 +0300445 self.assertEqual(re.match("(?P<first>first)|(?P<second>second)",
446 "first")
447 .expand(r"\2 \g<second>"),
448 " ")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000449
450 def test_repeat_minmax(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300451 self.assertIsNone(re.match("^(\w){1}$", "abc"))
452 self.assertIsNone(re.match("^(\w){1}?$", "abc"))
453 self.assertIsNone(re.match("^(\w){1,2}$", "abc"))
454 self.assertIsNone(re.match("^(\w){1,2}?$", "abc"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000455
456 self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c")
457 self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c")
458 self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c")
459 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
460 self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c")
461 self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c")
462 self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c")
463 self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c")
464
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300465 self.assertIsNone(re.match("^x{1}$", "xxx"))
466 self.assertIsNone(re.match("^x{1}?$", "xxx"))
467 self.assertIsNone(re.match("^x{1,2}$", "xxx"))
468 self.assertIsNone(re.match("^x{1,2}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000469
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300470 self.assertTrue(re.match("^x{3}$", "xxx"))
471 self.assertTrue(re.match("^x{1,3}$", "xxx"))
472 self.assertTrue(re.match("^x{1,4}$", "xxx"))
473 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
474 self.assertTrue(re.match("^x{3}?$", "xxx"))
475 self.assertTrue(re.match("^x{1,3}?$", "xxx"))
476 self.assertTrue(re.match("^x{1,4}?$", "xxx"))
477 self.assertTrue(re.match("^x{3,4}?$", "xxx"))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000478
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300479 self.assertIsNone(re.match("^x{}$", "xxx"))
480 self.assertTrue(re.match("^x{}$", "x{}"))
Gustavo Niemeyer6fa0c5a2005-09-14 08:54:39 +0000481
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000482 def test_getattr(self):
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000483 self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
Antoine Pitroufd036452008-08-19 17:56:33 +0000484 self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
Amaury Forgeot d'Arce43d33a2008-07-02 20:50:16 +0000485 self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
486 self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
487 self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
488 {'first': 1, 'other': 2})
489
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000490 self.assertEqual(re.match("(a)", "a").pos, 0)
491 self.assertEqual(re.match("(a)", "a").endpos, 1)
492 self.assertEqual(re.match("(a)", "a").string, "a")
493 self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1)))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300494 self.assertTrue(re.match("(a)", "a").re)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000495
496 def test_special_escapes(self):
497 self.assertEqual(re.search(r"\b(b.)\b",
498 "abcd abc bcd bx").group(1), "bx")
499 self.assertEqual(re.search(r"\B(b.)\B",
500 "abc bcd bc abxd").group(1), "bx")
501 self.assertEqual(re.search(r"\b(b.)\b",
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300502 "abcd abc bcd bx", re.ASCII).group(1), "bx")
503 self.assertEqual(re.search(r"\B(b.)\B",
504 "abc bcd bc abxd", re.ASCII).group(1), "bx")
505 self.assertEqual(re.search(r"\b(b.)\b",
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000506 "abcd abc bcd bx", re.LOCALE).group(1), "bx")
507 self.assertEqual(re.search(r"\B(b.)\B",
508 "abc bcd bc abxd", re.LOCALE).group(1), "bx")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000509 self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
510 self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300511 self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300512 self.assertEqual(re.search(br"\b(b.)\b",
513 b"abcd abc bcd bx").group(1), b"bx")
514 self.assertEqual(re.search(br"\B(b.)\B",
515 b"abc bcd bc abxd").group(1), b"bx")
516 self.assertEqual(re.search(br"\b(b.)\b",
517 b"abcd abc bcd bx", re.LOCALE).group(1), b"bx")
518 self.assertEqual(re.search(br"\B(b.)\B",
519 b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
520 self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
521 self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300522 self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000523 self.assertEqual(re.search(r"\d\D\w\W\s\S",
524 "1aa! a").group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300525 self.assertEqual(re.search(br"\d\D\w\W\s\S",
526 b"1aa! a").group(0), b"1aa! a")
527 self.assertEqual(re.search(r"\d\D\w\W\s\S",
528 "1aa! a", re.ASCII).group(0), "1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000529 self.assertEqual(re.search(r"\d\D\w\W\s\S",
530 "1aa! a", re.LOCALE).group(0), "1aa! a")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300531 self.assertEqual(re.search(br"\d\D\w\W\s\S",
532 b"1aa! a", re.LOCALE).group(0), b"1aa! a")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000533
Serhiy Storchakab99c1322014-11-10 14:38:16 +0200534 def test_other_escapes(self):
535 self.assertRaises(re.error, re.compile, "\\")
536 self.assertEqual(re.match(r"\(", '(').group(), '(')
537 self.assertIsNone(re.match(r"\(", ')'))
538 self.assertEqual(re.match(r"\\", '\\').group(), '\\')
539 self.assertEqual(re.match(r"\y", 'y').group(), 'y')
540 self.assertIsNone(re.match(r"\y", 'z'))
541 self.assertEqual(re.match(r"[\]]", ']').group(), ']')
542 self.assertIsNone(re.match(r"[\]]", '['))
543 self.assertEqual(re.match(r"[a\-c]", '-').group(), '-')
544 self.assertIsNone(re.match(r"[a\-c]", 'b'))
545 self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
546 self.assertIsNone(re.match(r"[\^a]+", 'b'))
547
Ezio Melotti5a045b92012-02-29 11:48:44 +0200548 def test_string_boundaries(self):
549 # See http://bugs.python.org/issue10713
550 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
551 "abc")
552 # There's a word boundary at the start of a string.
553 self.assertTrue(re.match(r"\b", "abc"))
554 # A non-empty string includes a non-boundary zero-length match.
555 self.assertTrue(re.search(r"\B", "abc"))
556 # There is no non-boundary match at the start of a string.
557 self.assertFalse(re.match(r"\B", "abc"))
558 # However, an empty string contains no word boundaries, and also no
559 # non-boundaries.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300560 self.assertIsNone(re.search(r"\B", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200561 # This one is questionable and different from the perlre behaviour,
562 # but describes current behavior.
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300563 self.assertIsNone(re.search(r"\b", ""))
Ezio Melotti5a045b92012-02-29 11:48:44 +0200564 # A single word-character string has two boundaries, but no
565 # non-boundary gaps.
566 self.assertEqual(len(re.findall(r"\b", "a")), 2)
567 self.assertEqual(len(re.findall(r"\B", "a")), 0)
568 # If there are no words, there are no boundaries
569 self.assertEqual(len(re.findall(r"\b", " ")), 0)
570 self.assertEqual(len(re.findall(r"\b", " ")), 0)
571 # Can match around the whitespace.
572 self.assertEqual(len(re.findall(r"\B", " ")), 2)
573
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000574 def test_bigcharset(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000575 self.assertEqual(re.match("([\u2222\u2223])",
576 "\u2222").group(1), "\u2222")
Serhiy Storchakabe80fc92013-10-24 22:02:58 +0300577 r = '[%s]' % ''.join(map(chr, range(256, 2**16, 255)))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300578 self.assertEqual(re.match(r, "\uff01").group(), "\uff01")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000579
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100580 def test_big_codesize(self):
581 # Issue #1160
582 r = re.compile('|'.join(('%d'%x for x in range(10000))))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300583 self.assertTrue(r.match('1000'))
584 self.assertTrue(r.match('9999'))
Antoine Pitrou39bdad82012-11-20 22:30:42 +0100585
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000586 def test_anyall(self):
587 self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0),
588 "a\nb")
589 self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
590 "a\n\nb")
591
Serhiy Storchaka84df7fe2014-11-07 21:43:57 +0200592 def test_lookahead(self):
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000593 self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
594 self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
595 self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
596 self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a")
597 self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a")
598 self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a")
599 self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a")
600
601 self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a")
602 self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a")
603 self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
604 self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
605
Serhiy Storchaka84df7fe2014-11-07 21:43:57 +0200606 # Group reference.
607 self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
608 self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
609 # Conditional group reference.
610 self.assertTrue(re.match('(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
611 self.assertIsNone(re.match('(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
612 self.assertTrue(re.match('(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
613 self.assertIsNone(re.match('(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
614 self.assertTrue(re.match('(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
615 # Group used before defined.
616 self.assertTrue(re.match('(a)b(?=(?(2)x|c))(c)', 'abc'))
617 self.assertIsNone(re.match('(a)b(?=(?(2)b|x))(c)', 'abc'))
618 self.assertTrue(re.match('(a)b(?=(?(1)c|x))(c)', 'abc'))
619
620 def test_lookbehind(self):
621 self.assertTrue(re.match('ab(?<=b)c', 'abc'))
622 self.assertIsNone(re.match('ab(?<=c)c', 'abc'))
623 self.assertIsNone(re.match('ab(?<!b)c', 'abc'))
624 self.assertTrue(re.match('ab(?<!c)c', 'abc'))
625 # Group reference.
626 self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
627 self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
628 self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
629 self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
630 # Conditional group reference.
631 self.assertIsNone(re.match('(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
632 self.assertIsNone(re.match('(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
633 self.assertTrue(re.match('(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
634 self.assertIsNone(re.match('(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
635 self.assertTrue(re.match('(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
636 # Group used before defined.
637 self.assertIsNone(re.match('(a)b(?<=(?(2)x|c))(c)', 'abc'))
638 self.assertIsNone(re.match('(a)b(?<=(?(2)b|x))(c)', 'abc'))
639 self.assertIsNone(re.match('(a)b(?<=(?(1)c|x))(c)', 'abc'))
640 self.assertTrue(re.match('(a)b(?<=(?(1)b|x))(c)', 'abc'))
641
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000642 def test_ignore_case(self):
Benjamin Petersona786b022008-08-25 21:05:21 +0000643 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300644 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000645 self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
646 self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
647 self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
648 self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
649 self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
650 self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
651 self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
652 self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
653
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200654 assert '\u212a'.lower() == 'k' # 'K'
655 self.assertTrue(re.match(r'K', '\u212a', re.I))
656 self.assertTrue(re.match(r'k', '\u212a', re.I))
657 self.assertTrue(re.match(r'\u212a', 'K', re.I))
658 self.assertTrue(re.match(r'\u212a', 'k', re.I))
659 assert '\u017f'.upper() == 'S' # 'ſ'
660 self.assertTrue(re.match(r'S', '\u017f', re.I))
661 self.assertTrue(re.match(r's', '\u017f', re.I))
662 self.assertTrue(re.match(r'\u017f', 'S', re.I))
663 self.assertTrue(re.match(r'\u017f', 's', re.I))
664 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
665 self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I))
666 self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I))
667
668 def test_ignore_case_set(self):
669 self.assertTrue(re.match(r'[19A]', 'A', re.I))
670 self.assertTrue(re.match(r'[19a]', 'a', re.I))
671 self.assertTrue(re.match(r'[19a]', 'A', re.I))
672 self.assertTrue(re.match(r'[19A]', 'a', re.I))
673 self.assertTrue(re.match(br'[19A]', b'A', re.I))
674 self.assertTrue(re.match(br'[19a]', b'a', re.I))
675 self.assertTrue(re.match(br'[19a]', b'A', re.I))
676 self.assertTrue(re.match(br'[19A]', b'a', re.I))
677 assert '\u212a'.lower() == 'k' # 'K'
678 self.assertTrue(re.match(r'[19K]', '\u212a', re.I))
679 self.assertTrue(re.match(r'[19k]', '\u212a', re.I))
680 self.assertTrue(re.match(r'[19\u212a]', 'K', re.I))
681 self.assertTrue(re.match(r'[19\u212a]', 'k', re.I))
682 assert '\u017f'.upper() == 'S' # 'ſ'
683 self.assertTrue(re.match(r'[19S]', '\u017f', re.I))
684 self.assertTrue(re.match(r'[19s]', '\u017f', re.I))
685 self.assertTrue(re.match(r'[19\u017f]', 'S', re.I))
686 self.assertTrue(re.match(r'[19\u017f]', 's', re.I))
687 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
688 self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I))
689 self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I))
690
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200691 def test_ignore_case_range(self):
692 # Issues #3511, #17381.
693 self.assertTrue(re.match(r'[9-a]', '_', re.I))
694 self.assertIsNone(re.match(r'[9-A]', '_', re.I))
695 self.assertTrue(re.match(br'[9-a]', b'_', re.I))
696 self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
697 self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
698 self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
699 self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
700 self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
701 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
702 self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
703 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0450', re.I))
704 self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I))
705 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I))
706 self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I))
707 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
708 self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
709
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200710 assert '\u212a'.lower() == 'k' # 'K'
711 self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
712 self.assertTrue(re.match(r'[j-m]', '\u212a', re.I))
713 self.assertTrue(re.match(r'[\u2129-\u212b]', 'K', re.I))
714 self.assertTrue(re.match(r'[\u2129-\u212b]', 'k', re.I))
715 assert '\u017f'.upper() == 'S' # 'ſ'
716 self.assertTrue(re.match(r'[R-T]', '\u017f', re.I))
717 self.assertTrue(re.match(r'[r-t]', '\u017f', re.I))
718 self.assertTrue(re.match(r'[\u017e-\u0180]', 'S', re.I))
719 self.assertTrue(re.match(r'[\u017e-\u0180]', 's', re.I))
720 assert '\ufb05'.upper() == '\ufb06'.upper() == 'ST' # 'ſt', 'st'
721 self.assertTrue(re.match(r'[\ufb04-\ufb05]', '\ufb06', re.I))
722 self.assertTrue(re.match(r'[\ufb06-\ufb07]', '\ufb05', re.I))
723
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000724 def test_category(self):
725 self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
726
727 def test_getlower(self):
728 import _sre
729 self.assertEqual(_sre.getlower(ord('A'), 0), ord('a'))
730 self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a'))
731 self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a'))
732
733 self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Serhiy Storchakaa25875c2014-09-14 15:56:27 +0300734 self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000735
736 def test_not_literal(self):
737 self.assertEqual(re.search("\s([^a])", " b").group(1), "b")
738 self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb")
739
740 def test_search_coverage(self):
741 self.assertEqual(re.search("\s(b)", " b").group(1), "b")
742 self.assertEqual(re.search("a\s", "a ").group(0), "a ")
743
def assertMatch(self, pattern, text, match=None, span=None,
                matcher=re.match):
    """Assert that matcher(pattern, text) matches `match` at `span`.

    When neither `match` nor `span` is given, the pattern is expected to
    match the whole of `text`.  Giving only one of the two raises
    ValueError.
    """
    if (match is None) != (span is None):
        raise ValueError('If match is not None, span should be specified '
                         '(and vice versa).')
    if match is None:
        # the pattern matches the whole text
        match, span = text, (0, len(text))
    found = matcher(pattern, text)
    self.assertTrue(found)
    self.assertEqual(found.group(), match)
    self.assertEqual(found.span(), span)
Guido van Rossum49946571997-07-18 04:26:25 +0000757
def test_re_escape(self):
    # Characters that re.escape() is expected to leave untouched.
    alnum_chars = string.ascii_letters + string.digits + '_'
    every_char = ''.join(map(chr, range(256)))
    for ch in every_char:
        if ch in alnum_chars:
            expected = ch
        elif ch == '\x00':
            # NUL is escaped as an octal sequence.
            expected = '\\000'
        else:
            expected = '\\' + ch
        self.assertEqual(re.escape(ch), expected)
        # Whatever the escaped form is, it must match the character.
        self.assertMatch(re.escape(ch), ch)
    self.assertMatch(re.escape(every_char), every_char)
Guido van Rossum49946571997-07-18 04:26:25 +0000770
def test_re_escape_byte(self):
    # Bytes that re.escape() is expected to leave untouched.
    alnum_chars = (string.ascii_letters + string.digits + '_').encode('ascii')
    every_byte = bytes(range(256))
    for value in every_byte:
        single = bytes([value])
        if single in alnum_chars:
            expected = single
        elif value == 0:
            # NUL is escaped as an octal sequence.
            expected = b'\\000'
        else:
            expected = b'\\' + single
        self.assertEqual(re.escape(single), expected)
        # Whatever the escaped form is, it must match the byte.
        self.assertMatch(re.escape(single), single)
    self.assertMatch(re.escape(every_byte), every_byte)
Guido van Rossum698280d2008-09-10 17:44:35 +0000784
def test_re_escape_non_ascii(self):
    # Non-ASCII characters are expected to be backslash-escaped, and the
    # escaped form must still match the original text.
    text = 'xxx\u2620\u2620\u2620xxx'
    escaped_text = re.escape(text)
    self.assertEqual(escaped_text, 'xxx\\\u2620\\\u2620\\\u2620xxx')
    self.assertMatch(escaped_text, text)
    # An escaped character can be quantified inside a larger pattern.
    repeated = '.%s+.' % re.escape('\u2620')
    self.assertMatch(repeated, text, match='x\u2620\u2620\u2620x',
                     span=(2, 7), matcher=re.search)
792
def test_re_escape_non_ascii_bytes(self):
    # Every byte of a UTF-8 encoded non-ASCII character is expected to
    # be escaped individually.
    data = 'y\u2620y\u2620y'.encode('utf-8')
    escaped_data = re.escape(data)
    self.assertEqual(escaped_data,
                     b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y')
    self.assertMatch(escaped_data, data)
    # The escaped encoded character is found at both places it occurs.
    needle = re.escape('\u2620'.encode('utf-8'))
    self.assertEqual(len(re.findall(needle, data)), 2)
Guido van Rossum698280d2008-09-10 17:44:35 +0000800
Serhiy Storchakab85a9762014-09-15 11:33:19 +0300801 def test_pickling(self):
802 import pickle
803 oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)', re.UNICODE)
804 for proto in range(pickle.HIGHEST_PROTOCOL + 1):
805 pickled = pickle.dumps(oldpat, proto)
806 newpat = pickle.loads(pickled)
807 self.assertEqual(newpat, oldpat)
808 # current pickle expects the _compile() reconstructor in re module
809 from re import _compile
Guido van Rossum23b22571997-07-17 22:36:14 +0000810
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000811 def test_constants(self):
812 self.assertEqual(re.I, re.IGNORECASE)
813 self.assertEqual(re.L, re.LOCALE)
814 self.assertEqual(re.M, re.MULTILINE)
815 self.assertEqual(re.S, re.DOTALL)
816 self.assertEqual(re.X, re.VERBOSE)
Fredrik Lundh1151a8c2000-08-08 16:47:42 +0000817
Skip Montanaro8ed06da2003-04-24 19:43:18 +0000818 def test_flags(self):
Skip Montanaro1e703c62003-04-25 15:40:28 +0000819 for flag in [re.I, re.M, re.X, re.S, re.L]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300820 self.assertTrue(re.compile('^pattern$', flag))
Guido van Rossumf473cb01998-01-14 16:42:17 +0000821
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000822 def test_sre_character_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200823 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
824 if i < 256:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300825 self.assertTrue(re.match(r"\%03o" % i, chr(i)))
826 self.assertTrue(re.match(r"\%03o0" % i, chr(i)+"0"))
827 self.assertTrue(re.match(r"\%03o8" % i, chr(i)+"8"))
828 self.assertTrue(re.match(r"\x%02x" % i, chr(i)))
829 self.assertTrue(re.match(r"\x%02x0" % i, chr(i)+"0"))
830 self.assertTrue(re.match(r"\x%02xz" % i, chr(i)+"z"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200831 if i < 0x10000:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300832 self.assertTrue(re.match(r"\u%04x" % i, chr(i)))
833 self.assertTrue(re.match(r"\u%04x0" % i, chr(i)+"0"))
834 self.assertTrue(re.match(r"\u%04xz" % i, chr(i)+"z"))
835 self.assertTrue(re.match(r"\U%08x" % i, chr(i)))
836 self.assertTrue(re.match(r"\U%08x0" % i, chr(i)+"0"))
837 self.assertTrue(re.match(r"\U%08xz" % i, chr(i)+"z"))
838 self.assertTrue(re.match(r"\0", "\000"))
839 self.assertTrue(re.match(r"\08", "\0008"))
840 self.assertTrue(re.match(r"\01", "\001"))
841 self.assertTrue(re.match(r"\018", "\0018"))
Serhiy Storchakac563caf2014-09-23 23:22:41 +0300842 self.assertRaises(re.error, re.match, r"\567", "")
Antoine Pitrou463badf2012-06-23 13:29:19 +0200843 self.assertRaises(re.error, re.match, r"\911", "")
844 self.assertRaises(re.error, re.match, r"\x1", "")
845 self.assertRaises(re.error, re.match, r"\x1z", "")
846 self.assertRaises(re.error, re.match, r"\u123", "")
847 self.assertRaises(re.error, re.match, r"\u123z", "")
848 self.assertRaises(re.error, re.match, r"\U0001234", "")
849 self.assertRaises(re.error, re.match, r"\U0001234z", "")
850 self.assertRaises(re.error, re.match, r"\U00110000", "")
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000851
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000852 def test_sre_character_class_literals(self):
Antoine Pitrou463badf2012-06-23 13:29:19 +0200853 for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
854 if i < 256:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300855 self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
856 self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
857 self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
858 self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
859 self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
860 self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
861 self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
862 self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200863 if i < 0x10000:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300864 self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
865 self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
866 self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
867 self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
868 self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
869 self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
Serhiy Storchakac563caf2014-09-23 23:22:41 +0300870 self.assertRaises(re.error, re.match, r"[\567]", "")
Antoine Pitrou463badf2012-06-23 13:29:19 +0200871 self.assertRaises(re.error, re.match, r"[\911]", "")
872 self.assertRaises(re.error, re.match, r"[\x1z]", "")
873 self.assertRaises(re.error, re.match, r"[\u123z]", "")
874 self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
875 self.assertRaises(re.error, re.match, r"[\U00110000]", "")
Serhiy Storchakac563caf2014-09-23 23:22:41 +0300876 self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
Antoine Pitrou463badf2012-06-23 13:29:19 +0200877
878 def test_sre_byte_literals(self):
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000879 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300880 self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
881 self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
882 self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
883 self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
884 self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
885 self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
886 self.assertTrue(re.match(br"\u", b'u'))
887 self.assertTrue(re.match(br"\U", b'U'))
888 self.assertTrue(re.match(br"\0", b"\000"))
889 self.assertTrue(re.match(br"\08", b"\0008"))
890 self.assertTrue(re.match(br"\01", b"\001"))
891 self.assertTrue(re.match(br"\018", b"\0018"))
Serhiy Storchakac563caf2014-09-23 23:22:41 +0300892 self.assertRaises(re.error, re.match, br"\567", b"")
Antoine Pitrou463badf2012-06-23 13:29:19 +0200893 self.assertRaises(re.error, re.match, br"\911", b"")
894 self.assertRaises(re.error, re.match, br"\x1", b"")
895 self.assertRaises(re.error, re.match, br"\x1z", b"")
896
897 def test_sre_byte_class_literals(self):
898 for i in [0, 8, 16, 32, 64, 127, 128, 255]:
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300899 self.assertTrue(re.match((r"[\%o]" % i).encode(), bytes([i])))
900 self.assertTrue(re.match((r"[\%o8]" % i).encode(), bytes([i])))
901 self.assertTrue(re.match((r"[\%03o]" % i).encode(), bytes([i])))
902 self.assertTrue(re.match((r"[\%03o0]" % i).encode(), bytes([i])))
903 self.assertTrue(re.match((r"[\%03o8]" % i).encode(), bytes([i])))
904 self.assertTrue(re.match((r"[\x%02x]" % i).encode(), bytes([i])))
905 self.assertTrue(re.match((r"[\x%02x0]" % i).encode(), bytes([i])))
906 self.assertTrue(re.match((r"[\x%02xz]" % i).encode(), bytes([i])))
907 self.assertTrue(re.match(br"[\u]", b'u'))
908 self.assertTrue(re.match(br"[\U]", b'U'))
Serhiy Storchakac563caf2014-09-23 23:22:41 +0300909 self.assertRaises(re.error, re.match, br"[\567]", b"")
Serhiy Storchakacd9032d2014-09-23 23:04:21 +0300910 self.assertRaises(re.error, re.match, br"[\911]", b"")
911 self.assertRaises(re.error, re.match, br"[\x1z]", b"")
Gustavo Niemeyera01a2ee2004-09-03 17:06:10 +0000912
Skip Montanaro7d9963f2003-04-25 14:12:40 +0000913 def test_bug_113254(self):
914 self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
915 self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
916 self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1))
917
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000918 def test_bug_527371(self):
919 # bug described in patches 527371/672491
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300920 self.assertIsNone(re.match(r'(a)?a','a').lastindex)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000921 self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1)
922 self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a')
923 self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a')
924 self.assertEqual(re.match("((a))", "a").lastindex, 1)
925
926 def test_bug_545855(self):
927 # bug 545855 -- This pattern failed to cause a compile error as it
928 # should, instead provoking a TypeError.
929 self.assertRaises(re.error, re.compile, 'foo[a-')
930
931 def test_bug_418626(self):
932 # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
933 # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
934 # pattern '*?' on a long string.
935 self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001)
936 self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
937 20003)
938 self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000939 # non-simple '*?' still used to hit the recursion limit, before the
Tim Peters58eb11c2004-01-18 20:29:55 +0000940 # non-recursive scheme was implemented.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000941 self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000942
943 def test_bug_612074(self):
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000944 pat="["+re.escape("\u2039")+"]"
Skip Montanaro2726fcd2003-04-25 14:31:54 +0000945 self.assertEqual(re.compile(pat) and 1, 1)
946
Skip Montanaro1e703c62003-04-25 15:40:28 +0000947 def test_stack_overflow(self):
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000948 # nasty cases that used to overflow the straightforward recursive
Skip Montanaro1e703c62003-04-25 15:40:28 +0000949 # implementation of repeated groups.
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000950 self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
951 self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
952 self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
Skip Montanaro1e703c62003-04-25 15:40:28 +0000953
Serhiy Storchakafa468162013-02-16 21:23:53 +0200954 def test_unlimited_zero_width_repeat(self):
955 # Issue #9669
956 self.assertIsNone(re.match(r'(?:a?)*y', 'z'))
957 self.assertIsNone(re.match(r'(?:a?)+y', 'z'))
958 self.assertIsNone(re.match(r'(?:a?){2,}y', 'z'))
959 self.assertIsNone(re.match(r'(?:a?)*?y', 'z'))
960 self.assertIsNone(re.match(r'(?:a?)+?y', 'z'))
961 self.assertIsNone(re.match(r'(?:a?){2,}?y', 'z'))
962
Skip Montanaro1e703c62003-04-25 15:40:28 +0000963 def test_scanner(self):
964 def s_ident(scanner, token): return token
965 def s_operator(scanner, token): return "op%s" % token
966 def s_float(scanner, token): return float(token)
967 def s_int(scanner, token): return int(token)
968
969 scanner = Scanner([
970 (r"[a-zA-Z_]\w*", s_ident),
971 (r"\d+\.\d*", s_float),
972 (r"\d+", s_int),
973 (r"=|\+|-|\*|/", s_operator),
974 (r"\s+", None),
975 ])
976
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +0300977 self.assertTrue(scanner.scanner.scanner("").pattern)
Gustavo Niemeyer25fe0bf2003-06-20 00:25:14 +0000978
Skip Montanaro1e703c62003-04-25 15:40:28 +0000979 self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"),
980 (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
981 'op+', 'bar'], ''))
982
Skip Montanaro5ba00542003-04-25 16:00:14 +0000983 def test_bug_448951(self):
984 # bug 448951 (similar to 429357, but with single char match)
985 # (Also test greedy matches.)
986 for op in '','?','*':
987 self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(),
988 (None, None))
989 self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(),
990 ('a:', 'a'))
991
Gustavo Niemeyerc34f2552003-04-27 12:34:14 +0000992 def test_bug_725106(self):
993 # capturing groups in alternatives in repeats
994 self.assertEqual(re.match('^((a)|b)*', 'abc').groups(),
995 ('b', 'a'))
996 self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(),
997 ('c', 'b'))
998 self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(),
999 ('b', None))
1000 self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(),
1001 ('b', None))
1002 self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(),
1003 ('b', 'a'))
1004 self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(),
1005 ('c', 'b'))
1006 self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(),
1007 ('b', None))
1008 self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(),
1009 ('b', None))
1010
Gustavo Niemeyer3646ab92003-04-27 13:25:21 +00001011 def test_bug_725149(self):
1012 # mark_stack_base restoring before restoring marks
1013 self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(),
1014 ('a', None))
1015 self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(),
1016 ('a', None, None))
1017
Just van Rossum12723ba2003-07-02 20:03:04 +00001018 def test_bug_764548(self):
1019 # bug 764548, re.compile() barfs on str/unicode subclasses
Guido van Rossumef87d6e2007-05-02 19:09:54 +00001020 class my_unicode(str): pass
Just van Rossum12723ba2003-07-02 20:03:04 +00001021 pat = re.compile(my_unicode("abc"))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001022 self.assertIsNone(pat.match("xyz"))
Just van Rossum12723ba2003-07-02 20:03:04 +00001023
Skip Montanaro5ba00542003-04-25 16:00:14 +00001024 def test_finditer(self):
1025 iter = re.finditer(r":+", "a:b::c:::d")
1026 self.assertEqual([item.group(0) for item in iter],
1027 [":", "::", ":::"])
1028
Sean Reifschneider7b3c9752012-03-12 18:22:38 -06001029 pat = re.compile(r":+")
1030 iter = pat.finditer("a:b::c:::d", 1, 10)
1031 self.assertEqual([item.group(0) for item in iter],
1032 [":", "::", ":::"])
1033
1034 pat = re.compile(r":+")
1035 iter = pat.finditer("a:b::c:::d", pos=1, endpos=10)
1036 self.assertEqual([item.group(0) for item in iter],
1037 [":", "::", ":::"])
1038
1039 pat = re.compile(r":+")
1040 iter = pat.finditer("a:b::c:::d", endpos=10, pos=1)
1041 self.assertEqual([item.group(0) for item in iter],
1042 [":", "::", ":::"])
1043
1044 pat = re.compile(r":+")
1045 iter = pat.finditer("a:b::c:::d", pos=3, endpos=8)
1046 self.assertEqual([item.group(0) for item in iter],
1047 ["::", "::"])
1048
Thomas Wouters40a088d2008-03-18 20:19:54 +00001049 def test_bug_926075(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001050 self.assertIsNot(re.compile('bug_926075'),
1051 re.compile(b'bug_926075'))
Hye-Shik Chang9f62ecc2004-04-20 21:30:07 +00001052
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001053 def test_bug_931848(self):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001054 pattern = "[\u002E\u3002\uFF0E\uFF61]"
Martin v. Löwis7d9c6c72004-05-07 07:18:13 +00001055 self.assertEqual(re.compile(pattern).split("a.b.c"),
1056 ['a','b','c'])
1057
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001058 def test_bug_581080(self):
1059 iter = re.finditer(r"\s", "a b")
Georg Brandla18af4e2007-04-21 15:47:16 +00001060 self.assertEqual(next(iter).span(), (1,2))
1061 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001062
1063 scanner = re.compile(r"\s").scanner("a b")
1064 self.assertEqual(scanner.search().span(), (1, 2))
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001065 self.assertIsNone(scanner.search())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001066
1067 def test_bug_817234(self):
1068 iter = re.finditer(r".*", "asdf")
Georg Brandla18af4e2007-04-21 15:47:16 +00001069 self.assertEqual(next(iter).span(), (0, 4))
1070 self.assertEqual(next(iter).span(), (4, 4))
1071 self.assertRaises(StopIteration, next, iter)
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001072
Mark Dickinson1f268282009-07-28 17:22:36 +00001073 def test_bug_6561(self):
1074 # '\d' should match characters in Unicode category 'Nd'
1075 # (Number, Decimal Digit), but not those in 'Nl' (Number,
1076 # Letter) or 'No' (Number, Other).
1077 decimal_digits = [
1078 '\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
1079 '\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
1080 '\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
1081 ]
1082 for x in decimal_digits:
1083 self.assertEqual(re.match('^\d$', x).group(0), x)
1084
1085 not_decimal_digits = [
1086 '\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
1087 '\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
1088 '\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
1089 '\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
1090 ]
1091 for x in not_decimal_digits:
1092 self.assertIsNone(re.match('^\d$', x))
1093
Guido van Rossumd8faa362007-04-27 19:54:29 +00001094 def test_empty_array(self):
1095 # SF buf 1647541
1096 import array
Guido van Rossum166746c2007-07-03 15:39:16 +00001097 for typecode in 'bBuhHiIlLfd':
Guido van Rossumd8faa362007-04-27 19:54:29 +00001098 a = array.array(typecode)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001099 self.assertIsNone(re.compile(b"bla").match(a))
Antoine Pitroufd036452008-08-19 17:56:33 +00001100 self.assertEqual(re.compile(b"").match(a).groups(), ())
Gustavo Niemeyer0506c642004-09-03 18:11:59 +00001101
Christian Heimes072c0f12008-01-03 23:01:04 +00001102 def test_inline_flags(self):
1103 # Bug #1700
Serhiy Storchakaab140882014-11-11 21:13:28 +02001104 upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below
1105 lower_char = '\u1ea1' # Latin Small Letter A with Dot Below
Christian Heimes072c0f12008-01-03 23:01:04 +00001106
1107 p = re.compile(upper_char, re.I | re.U)
1108 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001109 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001110
1111 p = re.compile(lower_char, re.I | re.U)
1112 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001113 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001114
1115 p = re.compile('(?i)' + upper_char, re.U)
1116 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001117 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001118
1119 p = re.compile('(?i)' + lower_char, re.U)
1120 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001121 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001122
1123 p = re.compile('(?iu)' + upper_char)
1124 q = p.match(lower_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001125 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001126
1127 p = re.compile('(?iu)' + lower_char)
1128 q = p.match(upper_char)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001129 self.assertTrue(q)
Christian Heimes072c0f12008-01-03 23:01:04 +00001130
Christian Heimes25bb7832008-01-11 16:17:00 +00001131 def test_dollar_matches_twice(self):
1132 "$ matches the end of string, and just before the terminating \n"
1133 pattern = re.compile('$')
1134 self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
1135 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
1136 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1137
1138 pattern = re.compile('$', re.MULTILINE)
1139 self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' )
1140 self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
1141 self.assertEqual(pattern.sub('#', '\n'), '#\n#')
1142
Antoine Pitroufd036452008-08-19 17:56:33 +00001143 def test_bytes_str_mixing(self):
1144 # Mixing str and bytes is disallowed
1145 pat = re.compile('.')
1146 bpat = re.compile(b'.')
1147 self.assertRaises(TypeError, pat.match, b'b')
1148 self.assertRaises(TypeError, bpat.match, 'b')
1149 self.assertRaises(TypeError, pat.sub, b'b', 'c')
1150 self.assertRaises(TypeError, pat.sub, 'b', b'c')
1151 self.assertRaises(TypeError, pat.sub, b'b', b'c')
1152 self.assertRaises(TypeError, bpat.sub, b'b', 'c')
1153 self.assertRaises(TypeError, bpat.sub, 'b', b'c')
1154 self.assertRaises(TypeError, bpat.sub, 'b', 'c')
1155
1156 def test_ascii_and_unicode_flag(self):
1157 # String patterns
1158 for flags in (0, re.UNICODE):
1159 pat = re.compile('\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001160 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001161 pat = re.compile('\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001162 self.assertTrue(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001163 pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001164 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001165 pat = re.compile('(?a)\xc0', re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001166 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001167 pat = re.compile('\w', re.ASCII)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001168 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001169 pat = re.compile('(?a)\w')
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001170 self.assertIsNone(pat.match('\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001171 # Bytes patterns
1172 for flags in (0, re.ASCII):
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001173 pat = re.compile(b'\xc0', flags | re.IGNORECASE)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001174 self.assertIsNone(pat.match(b'\xe0'))
Serhiy Storchakaa25875c2014-09-14 15:56:27 +03001175 pat = re.compile(b'\w', flags)
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001176 self.assertIsNone(pat.match(b'\xe0'))
Antoine Pitroufd036452008-08-19 17:56:33 +00001177 # Incompatibilities
1178 self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
1179 self.assertRaises(ValueError, re.compile, b'(?u)\w')
1180 self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
1181 self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
1182 self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
1183 self.assertRaises(ValueError, re.compile, '(?au)\w')
1184
Ezio Melottib92ed7c2010-03-06 15:24:08 +00001185 def test_bug_6509(self):
1186 # Replacement strings of both types must parse properly.
1187 # all strings
1188 pat = re.compile('a(\w)')
1189 self.assertEqual(pat.sub('b\\1', 'ac'), 'bc')
1190 pat = re.compile('a(.)')
1191 self.assertEqual(pat.sub('b\\1', 'a\u1234'), 'b\u1234')
1192 pat = re.compile('..')
1193 self.assertEqual(pat.sub(lambda m: 'str', 'a5'), 'str')
1194
1195 # all bytes
1196 pat = re.compile(b'a(\w)')
1197 self.assertEqual(pat.sub(b'b\\1', b'ac'), b'bc')
1198 pat = re.compile(b'a(.)')
1199 self.assertEqual(pat.sub(b'b\\1', b'a\xCD'), b'b\xCD')
1200 pat = re.compile(b'..')
1201 self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes')
1202
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001203 def test_dealloc(self):
1204 # issue 3299: check for segfault in debug build
1205 import _sre
Ezio Melotti0f77f462010-01-23 10:49:39 +00001206 # the overflow limit is different on wide and narrow builds and it
1207 # depends on the definition of SRE_CODE (see sre.h).
1208 # 2**128 should be big enough to overflow on both. For smaller values
1209 # a RuntimeError is raised instead of OverflowError.
1210 long_overflow = 2**128
Antoine Pitrou82feb1f2010-01-14 17:34:48 +00001211 self.assertRaises(TypeError, re.finditer, "a", {})
Serhiy Storchaka9baa5b22014-09-29 22:49:23 +03001212 with self.assertRaises(OverflowError):
1213 _sre.compile("abc", 0, [long_overflow], 0, [], [])
1214 with self.assertRaises(TypeError):
1215 _sre.compile({}, 0, [], 0, [], [])
Christian Heimes072c0f12008-01-03 23:01:04 +00001216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217 def test_search_dot_unicode(self):
Serhiy Storchakad9cf65f2014-09-14 16:20:20 +03001218 self.assertTrue(re.search("123.*-", '123abc-'))
1219 self.assertTrue(re.search("123.*-", '123\xe9-'))
1220 self.assertTrue(re.search("123.*-", '123\u20ac-'))
1221 self.assertTrue(re.search("123.*-", '123\U0010ffff-'))
1222 self.assertTrue(re.search("123.*-", '123\xe9\u20ac\U0010ffff-'))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001223
Ezio Melottidf723e12012-03-13 01:29:48 +02001224 def test_compile(self):
1225 # Test return value when given string and pattern as parameter
1226 pattern = re.compile('random pattern')
1227 self.assertIsInstance(pattern, re._pattern_type)
1228 same_pattern = re.compile(pattern)
1229 self.assertIsInstance(same_pattern, re._pattern_type)
1230 self.assertIs(same_pattern, pattern)
1231 # Test behaviour when not given a string or pattern as parameter
1232 self.assertRaises(TypeError, re.compile, 0)
1233
Ezio Melottife8e6e72013-01-11 08:32:01 +02001234 def test_bug_13899(self):
1235 # Issue #13899: re pattern r"[\A]" should work like "A" but matches
1236 # nothing. Ditto B and Z.
1237 self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
1238 ['A', 'B', '\b', 'C', 'Z'])
1239
Antoine Pitroub33941a2012-12-03 20:55:56 +01001240 @bigmemtest(size=_2G, memuse=1)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001241 def test_large_search(self, size):
1242 # Issue #10182: indices were 32-bit-truncated.
1243 s = 'a' * size
1244 m = re.search('$', s)
1245 self.assertIsNotNone(m)
Antoine Pitrou86067c22012-12-03 21:08:43 +01001246 self.assertEqual(m.start(), size)
1247 self.assertEqual(m.end(), size)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001248
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001249 # The huge memuse is because of re.sub() using a list and a join()
1250 # to create the replacement result.
Antoine Pitroub33941a2012-12-03 20:55:56 +01001251 @bigmemtest(size=_2G, memuse=16 + 2)
Antoine Pitrou1f1888e2012-12-03 20:53:12 +01001252 def test_large_subn(self, size):
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001253 # Issue #10182: indices were 32-bit-truncated.
1254 s = 'a' * size
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001255 r, n = re.subn('', '', s)
1256 self.assertEqual(r, s)
1257 self.assertEqual(n, size + 1)
1258
Serhiy Storchakac1b59d42012-12-29 23:38:48 +02001259 def test_bug_16688(self):
1260 # Issue 16688: Backreferences make case-insensitive regex fail on
1261 # non-ASCII strings.
1262 self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a'])
1263 self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2))
Antoine Pitrou43fb54c2012-12-02 12:52:36 +01001264
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001265 def test_repeat_minmax_overflow(self):
1266 # Issue #13169
1267 string = "x" * 100000
1268 self.assertEqual(re.match(r".{65535}", string).span(), (0, 65535))
1269 self.assertEqual(re.match(r".{,65535}", string).span(), (0, 65535))
1270 self.assertEqual(re.match(r".{65535,}?", string).span(), (0, 65535))
1271 self.assertEqual(re.match(r".{65536}", string).span(), (0, 65536))
1272 self.assertEqual(re.match(r".{,65536}", string).span(), (0, 65536))
1273 self.assertEqual(re.match(r".{65536,}?", string).span(), (0, 65536))
1274 # 2**128 should be big enough to overflow both SRE_CODE and Py_ssize_t.
1275 self.assertRaises(OverflowError, re.compile, r".{%d}" % 2**128)
1276 self.assertRaises(OverflowError, re.compile, r".{,%d}" % 2**128)
1277 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
1278 self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
1279
1280 @cpython_only
1281 def test_repeat_minmax_overflow_maxrepeat(self):
1282 try:
1283 from _sre import MAXREPEAT
1284 except ImportError:
1285 self.skipTest('requires _sre.MAXREPEAT constant')
1286 string = "x" * 100000
1287 self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string))
1288 self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(),
1289 (0, 100000))
1290 self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string))
1291 self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT)
1292 self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT)
1293 self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT)
1294
R David Murray26dfaac92013-04-14 13:00:54 -04001295 def test_backref_group_name_in_exception(self):
1296 # Issue 17341: Poor error message when compiling invalid regex
1297 with self.assertRaisesRegex(sre_constants.error, '<foo>'):
1298 re.compile('(?P=<foo>)')
1299
1300 def test_group_name_in_exception(self):
1301 # Issue 17341: Poor error message when compiling invalid regex
1302 with self.assertRaisesRegex(sre_constants.error, '\?foo'):
1303 re.compile('(?P<?foo>)')
1304
Serhiy Storchaka1f35ae02013-08-03 19:18:38 +03001305 def test_issue17998(self):
1306 for reps in '*', '+', '?', '{1}':
1307 for mod in '', '?':
1308 pattern = '.' + reps + mod + 'yz'
1309 self.assertEqual(re.compile(pattern, re.S).findall('xyz'),
1310 ['xyz'], msg=pattern)
1311 pattern = pattern.encode()
1312 self.assertEqual(re.compile(pattern, re.S).findall(b'xyz'),
1313 [b'xyz'], msg=pattern)
1314
Serhiy Storchaka36af10c2013-10-20 13:13:31 +03001315 def test_match_repr(self):
1316 for string in '[abracadabra]', S('[abracadabra]'):
1317 m = re.search(r'(.+)(.*?)\1', string)
1318 self.assertEqual(repr(m), "<%s.%s object; "
1319 "span=(1, 12), match='abracadabra'>" %
1320 (type(m).__module__, type(m).__qualname__))
1321 for string in (b'[abracadabra]', B(b'[abracadabra]'),
1322 bytearray(b'[abracadabra]'),
1323 memoryview(b'[abracadabra]')):
1324 m = re.search(rb'(.+)(.*?)\1', string)
1325 self.assertEqual(repr(m), "<%s.%s object; "
1326 "span=(1, 12), match=b'abracadabra'>" %
1327 (type(m).__module__, type(m).__qualname__))
1328
1329 first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
1330 self.assertEqual(repr(first), "<%s.%s object; "
1331 "span=(0, 2), match='aa'>" %
1332 (type(second).__module__, type(first).__qualname__))
1333 self.assertEqual(repr(second), "<%s.%s object; "
1334 "span=(3, 5), match='bb'>" %
1335 (type(second).__module__, type(second).__qualname__))
1336
Serhiy Storchaka70ca0212013-02-16 16:47:47 +02001337
Serhiy Storchaka98985a12013-08-19 23:18:23 +03001338 def test_bug_2537(self):
1339 # issue 2537: empty submatches
1340 for outer_op in ('{0,}', '*', '+', '{1,187}'):
1341 for inner_op in ('{0,}', '*', '?'):
1342 r = re.compile("^((x|y)%s)%s" % (inner_op, outer_op))
1343 m = r.match("xyyzy")
1344 self.assertEqual(m.group(0), "xyy")
1345 self.assertEqual(m.group(1), "")
1346 self.assertEqual(m.group(2), "y")
1347
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001348 def test_debug_flag(self):
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001349 pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001350 with captured_stdout() as out:
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001351 re.compile(pat, re.DEBUG)
1352 dump = '''\
Serhiy Storchakac7f7d382014-11-09 20:48:36 +02001353SUBPATTERN 1
1354 LITERAL 46
1355SUBPATTERN None
1356 BRANCH
1357 IN
1358 LITERAL 99
1359 LITERAL 104
1360 OR
1361 LITERAL 112
1362 LITERAL 121
1363SUBPATTERN None
1364 GROUPREF_EXISTS 1
1365 AT AT_END
1366 ELSE
1367 LITERAL 58
1368 LITERAL 32
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001369'''
1370 self.assertEqual(out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001371 # Debug output is output again even a second time (bypassing
1372 # the cache -- issue #20426).
1373 with captured_stdout() as out:
Serhiy Storchaka44dae8b2014-09-21 22:47:55 +03001374 re.compile(pat, re.DEBUG)
1375 self.assertEqual(out.getvalue(), dump)
Antoine Pitroud2cc7432014-02-03 20:59:59 +01001376
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001377 def test_keyword_parameters(self):
1378 # Issue #20283: Accepting the string keyword parameter.
1379 pat = re.compile(r'(ab)')
1380 self.assertEqual(
1381 pat.match(string='abracadabra', pos=7, endpos=10).span(), (7, 9))
1382 self.assertEqual(
Serhiy Storchakaa537eb42014-03-06 11:36:15 +02001383 pat.fullmatch(string='abracadabra', pos=7, endpos=9).span(), (7, 9))
1384 self.assertEqual(
Serhiy Storchakaccdf3522014-03-06 11:28:32 +02001385 pat.search(string='abracadabra', pos=3, endpos=10).span(), (7, 9))
1386 self.assertEqual(
1387 pat.findall(string='abracadabra', pos=3, endpos=10), ['ab'])
1388 self.assertEqual(
1389 pat.split(string='abracadabra', maxsplit=1),
1390 ['', 'ab', 'racadabra'])
1391 self.assertEqual(
1392 pat.scanner(string='abracadabra', pos=3, endpos=10).search().span(),
1393 (7, 9))
1394
Serhiy Storchaka429b59e2014-05-14 21:48:17 +03001395 def test_bug_20998(self):
1396 # Issue #20998: Fullmatch of repeated single character pattern
1397 # with ignore case.
1398 self.assertEqual(re.fullmatch('[a-c]+', 'ABC', re.I).span(), (0, 3))
1399
Serhiy Storchaka4659cc02014-10-31 00:53:49 +02001400 def test_locale_caching(self):
1401 # Issue #22410
1402 oldlocale = locale.setlocale(locale.LC_CTYPE)
1403 self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1404 for loc in 'en_US.iso88591', 'en_US.utf8':
1405 try:
1406 locale.setlocale(locale.LC_CTYPE, loc)
1407 except locale.Error:
1408 # Unsupported locale on this system
1409 self.skipTest('test needs %s locale' % loc)
1410
1411 re.purge()
1412 self.check_en_US_iso88591()
1413 self.check_en_US_utf8()
1414 re.purge()
1415 self.check_en_US_utf8()
1416 self.check_en_US_iso88591()
1417
1418 def check_en_US_iso88591(self):
1419 locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
1420 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1421 self.assertTrue(re.match(b'\xc5', b'\xe5', re.L|re.I))
1422 self.assertTrue(re.match(b'\xe5', b'\xc5', re.L|re.I))
1423 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1424 self.assertTrue(re.match(b'(?Li)\xc5', b'\xe5'))
1425 self.assertTrue(re.match(b'(?Li)\xe5', b'\xc5'))
1426
1427 def check_en_US_utf8(self):
1428 locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
1429 self.assertTrue(re.match(b'\xc5\xe5', b'\xc5\xe5', re.L|re.I))
1430 self.assertIsNone(re.match(b'\xc5', b'\xe5', re.L|re.I))
1431 self.assertIsNone(re.match(b'\xe5', b'\xc5', re.L|re.I))
1432 self.assertTrue(re.match(b'(?Li)\xc5\xe5', b'\xc5\xe5'))
1433 self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
1434 self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
1435
Serhiy Storchakaad446d52014-11-10 13:49:00 +02001436 def test_error(self):
1437 with self.assertRaises(re.error) as cm:
1438 re.compile('(\u20ac))')
1439 err = cm.exception
1440 self.assertIsInstance(err.pattern, str)
1441 self.assertEqual(err.pattern, '(\u20ac))')
1442 self.assertEqual(err.pos, 3)
1443 self.assertEqual(err.lineno, 1)
1444 self.assertEqual(err.colno, 4)
1445 self.assertIn(err.msg, str(err))
1446 self.assertIn(' at position 3', str(err))
1447 self.assertNotIn(' at position 3', err.msg)
1448 # Bytes pattern
1449 with self.assertRaises(re.error) as cm:
1450 re.compile(b'(\xa4))')
1451 err = cm.exception
1452 self.assertIsInstance(err.pattern, bytes)
1453 self.assertEqual(err.pattern, b'(\xa4))')
1454 self.assertEqual(err.pos, 3)
1455 # Multiline pattern
1456 with self.assertRaises(re.error) as cm:
1457 re.compile("""
1458 (
1459 abc
1460 )
1461 )
1462 (
1463 """, re.VERBOSE)
1464 err = cm.exception
1465 self.assertEqual(err.pos, 77)
1466 self.assertEqual(err.lineno, 5)
1467 self.assertEqual(err.colno, 17)
1468 self.assertIn(err.msg, str(err))
1469 self.assertIn(' at position 77', str(err))
1470 self.assertIn('(line 5, column 17)', str(err))
1471
Antoine Pitrou79aa68d2013-10-25 21:36:10 +02001472
class PatternReprTests(unittest.TestCase):
    """Checks on the text produced by repr() for compiled patterns."""

    def check(self, pattern, expected):
        """Assert that *pattern* compiled without flags reprs as *expected*."""
        compiled = re.compile(pattern)
        self.assertEqual(repr(compiled), expected)

    def check_flags(self, pattern, flags, expected):
        """Assert that *pattern* compiled with *flags* reprs as *expected*."""
        compiled = re.compile(pattern, flags)
        self.assertEqual(repr(compiled), expected)

    def test_without_flags(self):
        expected = "re.compile('random pattern')"
        self.check('random pattern', expected)

    def test_single_flag(self):
        expected = "re.compile('random pattern', re.IGNORECASE)"
        self.check_flags('random pattern', re.IGNORECASE, expected)

    def test_multiple_flags(self):
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL|re.VERBOSE)")
        self.check_flags('random pattern', re.I|re.S|re.X, expected)

    def test_unicode_flag(self):
        # re.UNICODE is left out of the repr of a str pattern.
        expected = "re.compile('random pattern')"
        self.check_flags('random pattern', re.U, expected)
        expected = ("re.compile('random pattern', "
                    "re.IGNORECASE|re.DOTALL)")
        self.check_flags('random pattern', re.I|re.S|re.U, expected)

    def test_inline_flags(self):
        # A flag set inline in the pattern is listed like a passed flag.
        expected = "re.compile('(?i)pattern', re.IGNORECASE)"
        self.check('(?i)pattern', expected)

    def test_unknown_flags(self):
        # Bits without a flag name are rendered in hexadecimal.
        expected = "re.compile('random pattern', 0x123000)"
        self.check_flags('random pattern', 0x123000, expected)
        expected = "re.compile('random pattern', re.IGNORECASE|0x123000)"
        self.check_flags('random pattern', 0x123000|re.I, expected)

    def test_bytes(self):
        expected = "re.compile(b'bytes pattern')"
        self.check(b'bytes pattern', expected)
        expected = "re.compile(b'bytes pattern', re.ASCII)"
        self.check_flags(b'bytes pattern', re.A, expected)

    def test_quotes(self):
        cases = (
            ('random "double quoted" pattern',
             '''re.compile('random "double quoted" pattern')'''),
            ("random 'single quoted' pattern",
             '''re.compile("random 'single quoted' pattern")'''),
            ('''both 'single' and "double" quotes''',
             '''re.compile('both \\'single\\' and "double" quotes')'''),
        )
        for pattern, expected in cases:
            self.check(pattern, expected)

    def test_long_pattern(self):
        # The repr of a very long pattern is truncated, but it keeps the
        # start of the pattern and any trailing flags.
        pattern = 'Very %spattern' % ('long ' * 1000)
        head = "re.compile('Very long long lon"

        text = repr(re.compile(pattern))
        self.assertLess(len(text), 300)
        self.assertEqual(text[:30], head)

        text = repr(re.compile(pattern, re.I))
        self.assertLess(len(text), 300)
        self.assertEqual(text[:30], head)
        self.assertEqual(text[-16:], ", re.IGNORECASE)")
1533
1534
class ImplementationTest(unittest.TestCase):
    """
    Checks that reach into private helpers of the re implementation.
    """

    def test_overlap_table(self):
        """sre_compile._generate_overlap_table() gives the expected table."""
        make_table = sre_compile._generate_overlap_table
        cases = (
            ("", []),
            ("a", [0]),
            ("abcd", [0, 0, 0, 0]),
            ("aaaa", [0, 1, 2, 3]),
            ("ababba", [0, 0, 1, 2, 0, 1]),
            ("abcabdac", [0, 0, 0, 1, 2, 0, 1, 0]),
        )
        for prefix, expected in cases:
            self.assertEqual(make_table(prefix), expected)
1548
1549
def _group_or_marker(match, key):
    """Return match.group(key), or a placeholder string when there is no text.

    "None" is returned when the group's value is None and "Error" when
    looking the group up raises IndexError.  Both are strings so that the
    table's result expressions can concatenate them.
    """
    try:
        value = match.group(key)
    except IndexError:
        return "Error"
    return "None" if value is None else value


def run_re_tests():
    """Run the table-driven cases from test.re_tests, printing any failure.

    Each entry is a 3-tuple (pattern, string, outcome) or a 5-tuple that
    adds (repl, expected): *repl* is an expression evaluated against the
    match's groups and compared with *expected*.  Problems are reported on
    stdout rather than raised.

    Raises:
        ValueError: if a table entry has neither 3 nor 5 fields.
    """
    from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR
    if verbose:
        print('Running re_tests test suite')

    for t in tests:
        sys.stdout.flush()
        pattern = s = outcome = repl = expected = None
        if len(t) == 5:
            pattern, s, outcome, repl, expected = t
        elif len(t) == 3:
            pattern, s, outcome = t
        else:
            raise ValueError('Test tuples should have 3 or 5 fields', t)

        try:
            obj = re.compile(pattern)
        except re.error:
            # A syntax error is only a failure if it was not expected.
            if outcome != SYNTAX_ERROR:
                print('=== Syntax error:', t)
            continue
        except Exception:
            # KeyboardInterrupt is not an Exception subclass, so it
            # propagates on its own with its original traceback.
            print('*** Unexpected error ***', t)
            if verbose:
                traceback.print_exc(file=sys.stdout)
            continue

        try:
            result = obj.search(s)
        except re.error as msg:
            print('=== Unexpected exception', t, repr(msg))
            # There is no match object to inspect; falling through would
            # use an unbound name or the previous entry's result.
            continue

        if outcome == SYNTAX_ERROR:
            # This should have been a syntax error; forget it.
            continue
        if outcome == FAIL:
            if result is not None:
                print('=== Succeeded incorrectly', t)
            continue
        if outcome != SUCCEED:
            continue

        if result is not None:
            # Matched, as expected, so now we compute the
            # result string and compare it to our expected result.
            vardict = {'found': result.group(0),
                       'groups': result.group(),
                       'flags': result.re.flags}
            for i in range(1, 100):
                vardict['g%d' % i] = _group_or_marker(result, i)
            for name in result.re.groupindex:
                vardict[name] = _group_or_marker(result, name)
            # NOTE: eval() is applied to expressions from the in-tree
            # test table only; never feed it external input.
            repl = eval(repl, vardict)
            if repl != expected:
                print('=== grouping error', t, end=' ')
                print(repr(repl) + ' should be ' + repr(expected))
        else:
            print('=== Failed incorrectly', t)

        # Try the match with both pattern and string converted to
        # bytes, and check that it still succeeds.
        try:
            bpat = bytes(pattern, "ascii")
            bs = bytes(s, "ascii")
        except UnicodeEncodeError:
            # skip non-ascii tests
            pass
        else:
            try:
                bpat = re.compile(bpat)
            except Exception:
                print('=== Fails on bytes pattern compile', t)
                if verbose:
                    traceback.print_exc(file=sys.stdout)
            else:
                bytes_result = bpat.search(bs)
                if bytes_result is None:
                    print('=== Fails on bytes pattern match', t)

        # Try the match with the search area limited to the extent
        # of the match and see if it still succeeds.  \B will
        # break (because it won't match at the end or start of a
        # string), so we'll ignore patterns that feature it.
        if (pattern[:2] != '\\B' and pattern[-2:] != '\\B'
                and result is not None):
            obj = re.compile(pattern)
            result = obj.search(s, result.start(0), result.end(0) + 1)
            if result is None:
                print('=== Failed on range-limited match', t)

        # Try the match with IGNORECASE enabled, and check that it
        # still succeeds.
        obj = re.compile(pattern, re.IGNORECASE)
        result = obj.search(s)
        if result is None:
            print('=== Fails on case-insensitive match', t)

        # Try the match with LOCALE enabled, and check that it
        # still succeeds.
        # NOTE(review): newer Python versions reject re.LOCALE on str
        # patterns -- confirm against the targeted version.
        if '(?u)' not in pattern:
            obj = re.compile(pattern, re.LOCALE)
            result = obj.search(s)
            if result is None:
                print('=== Fails on locale-sensitive match', t)

        # Try the match with UNICODE locale enabled, and check
        # that it still succeeds.
        obj = re.compile(pattern, re.UNICODE)
        result = obj.search(s)
        if result is None:
            print('=== Fails on unicode-sensitive match', t)
Fredrik Lundh8e6d5712000-08-08 17:06:53 +00001676
Gregory P. Smith5a631832010-07-27 05:31:29 +00001677
def test_main():
    """Run this module's unittest classes, then the table-driven suite."""
    run_unittest(__name__)
    run_re_tests()


if __name__ == "__main__":
    test_main()