blob: 144620c6d1bc3fe6e0ba4325da2e7d07f4c44cc9 [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossum7627c0d2000-03-31 14:58:54 +00003#
4# convert template to internal format
5#
Fredrik Lundh770617b2001-01-14 15:06:11 +00006# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00007#
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00008# See the sre.py file for information on usage and redistribution.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00009#
10
Fred Drakeb8f22742001-09-04 19:10:20 +000011"""Internal support module for sre"""
12
Victor Stinner7fa767e2014-03-20 09:16:38 +010013import _sre
Christian Heimes5e696852008-04-09 08:37:03 +000014import sre_parse
Guido van Rossum7627c0d2000-03-31 14:58:54 +000015from sre_constants import *
16
Fredrik Lundhb35ffc02001-01-15 12:46:09 +000017assert _sre.MAGIC == MAGIC, "SRE module mismatch"
18
Raymond Hettingerdf1b6992014-11-09 15:56:33 -080019_LITERAL_CODES = {LITERAL, NOT_LITERAL}
20_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
21_SUCCESS_CODES = {SUCCESS, FAILURE}
22_ASSERT_CODES = {ASSERT, ASSERT_NOT}
Serhiy Storchaka821a9d12017-05-14 08:32:33 +030023_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
Raymond Hettinger049ade22005-02-28 19:27:52 +000024
Serhiy Storchaka0c938f62014-11-10 12:37:16 +020025# Sets of lowercase characters which have the same uppercase.
26_equivalences = (
27 # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
28 (0x69, 0x131), # iı
29 # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
30 (0x73, 0x17f), # sſ
31 # MICRO SIGN, GREEK SMALL LETTER MU
32 (0xb5, 0x3bc), # µμ
33 # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
34 (0x345, 0x3b9, 0x1fbe), # \u0345ιι
35 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
36 (0x390, 0x1fd3), # ΐΐ
37 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
38 (0x3b0, 0x1fe3), # ΰΰ
39 # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
40 (0x3b2, 0x3d0), # βϐ
41 # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
42 (0x3b5, 0x3f5), # εϵ
43 # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
44 (0x3b8, 0x3d1), # θϑ
45 # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
46 (0x3ba, 0x3f0), # κϰ
47 # GREEK SMALL LETTER PI, GREEK PI SYMBOL
48 (0x3c0, 0x3d6), # πϖ
49 # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
50 (0x3c1, 0x3f1), # ρϱ
51 # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
52 (0x3c2, 0x3c3), # ςσ
53 # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
54 (0x3c6, 0x3d5), # φϕ
55 # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
56 (0x1e61, 0x1e9b), # ṡẛ
57 # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
58 (0xfb05, 0xfb06), # ſtst
59)
60
61# Maps the lowercase code to lowercase codes which have the same uppercase.
62_ignorecase_fixes = {i: tuple(j for j in t if i != j)
63 for t in _equivalences for i in t}
64
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000065def _compile(code, pattern, flags):
66 # internal: compile a (sub)pattern
67 emit = code.append
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000068 _len = len
Raymond Hettinger049ade22005-02-28 19:27:52 +000069 LITERAL_CODES = _LITERAL_CODES
70 REPEATING_CODES = _REPEATING_CODES
71 SUCCESS_CODES = _SUCCESS_CODES
72 ASSERT_CODES = _ASSERT_CODES
Serhiy Storchaka6d336a02017-05-09 23:37:14 +030073 iscased = None
Serhiy Storchaka7186cc22017-05-05 10:42:46 +030074 tolower = None
75 fixes = None
76 if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
77 if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
Serhiy Storchaka6d336a02017-05-09 23:37:14 +030078 iscased = _sre.unicode_iscased
Serhiy Storchaka7186cc22017-05-05 10:42:46 +030079 tolower = _sre.unicode_tolower
80 fixes = _ignorecase_fixes
81 else:
Serhiy Storchaka6d336a02017-05-09 23:37:14 +030082 iscased = _sre.ascii_iscased
Serhiy Storchaka7186cc22017-05-05 10:42:46 +030083 tolower = _sre.ascii_tolower
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000084 for op, av in pattern:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000085 if op in LITERAL_CODES:
Serhiy Storchaka898ff032017-05-05 08:53:40 +030086 if not flags & SRE_FLAG_IGNORECASE:
87 emit(op)
88 emit(av)
89 elif flags & SRE_FLAG_LOCALE:
90 emit(OP_LOC_IGNORE[op])
91 emit(av)
Serhiy Storchaka6d336a02017-05-09 23:37:14 +030092 elif not iscased(av):
93 emit(op)
94 emit(av)
Serhiy Storchaka898ff032017-05-05 08:53:40 +030095 else:
Serhiy Storchaka7186cc22017-05-05 10:42:46 +030096 lo = tolower(av)
Serhiy Storchaka0c938f62014-11-10 12:37:16 +020097 if fixes and lo in fixes:
Serhiy Storchaka5619ab92014-11-10 12:43:14 +020098 emit(IN_IGNORE)
Serhiy Storchaka0c938f62014-11-10 12:37:16 +020099 skip = _len(code); emit(0)
100 if op is NOT_LITERAL:
Serhiy Storchaka5619ab92014-11-10 12:43:14 +0200101 emit(NEGATE)
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200102 for k in (lo,) + fixes[lo]:
Serhiy Storchaka5619ab92014-11-10 12:43:14 +0200103 emit(LITERAL)
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200104 emit(k)
Serhiy Storchaka5619ab92014-11-10 12:43:14 +0200105 emit(FAILURE)
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200106 code[skip] = _len(code) - skip
107 else:
Serhiy Storchaka5619ab92014-11-10 12:43:14 +0200108 emit(OP_IGNORE[op])
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200109 emit(lo)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000110 elif op is IN:
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300111 charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
112 if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300113 emit(IN_LOC_IGNORE)
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300114 elif hascased:
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300115 emit(IN_IGNORE)
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300116 else:
117 emit(IN)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000118 skip = _len(code); emit(0)
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300119 _compile_charset(charset, flags, code)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000120 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000121 elif op is ANY:
122 if flags & SRE_FLAG_DOTALL:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200123 emit(ANY_ALL)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000124 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200125 emit(ANY)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000126 elif op in REPEATING_CODES:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000127 if flags & SRE_FLAG_TEMPLATE:
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200128 raise error("internal: unsupported template operator %r" % (op,))
Serhiy Storchaka821a9d12017-05-14 08:32:33 +0300129 if _simple(av[2]):
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000130 if op is MAX_REPEAT:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200131 emit(REPEAT_ONE)
Guido van Rossum41c99e72003-04-14 17:59:34 +0000132 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200133 emit(MIN_REPEAT_ONE)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000134 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000135 emit(av[0])
136 emit(av[1])
137 _compile(code, av[2], flags)
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200138 emit(SUCCESS)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000139 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000140 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200141 emit(REPEAT)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000142 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000143 emit(av[0])
144 emit(av[1])
145 _compile(code, av[2], flags)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000146 code[skip] = _len(code) - skip
147 if op is MAX_REPEAT:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200148 emit(MAX_UNTIL)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000149 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200150 emit(MIN_UNTIL)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000151 elif op is SUBPATTERN:
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300152 group, add_flags, del_flags, p = av
153 if group:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200154 emit(MARK)
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300155 emit((group-1)*2)
156 # _compile_info(code, p, (flags | add_flags) & ~del_flags)
157 _compile(code, p, (flags | add_flags) & ~del_flags)
158 if group:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200159 emit(MARK)
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300160 emit((group-1)*2+1)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000161 elif op in SUCCESS_CODES:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200162 emit(op)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000163 elif op in ASSERT_CODES:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200164 emit(op)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000165 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000166 if av[0] >= 0:
167 emit(0) # look ahead
168 else:
169 lo, hi = av[1].getwidth()
170 if lo != hi:
Collin Winterce36ad82007-08-30 01:19:48 +0000171 raise error("look-behind requires fixed-width pattern")
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000172 emit(lo) # look behind
173 _compile(code, av[1], flags)
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200174 emit(SUCCESS)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000175 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000176 elif op is CALL:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200177 emit(op)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000178 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000179 _compile(code, av, flags)
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200180 emit(SUCCESS)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000181 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000182 elif op is AT:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200183 emit(op)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000184 if flags & SRE_FLAG_MULTILINE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000185 av = AT_MULTILINE.get(av, av)
186 if flags & SRE_FLAG_LOCALE:
187 av = AT_LOCALE.get(av, av)
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300188 elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000189 av = AT_UNICODE.get(av, av)
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200190 emit(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000191 elif op is BRANCH:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200192 emit(op)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000193 tail = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000194 tailappend = tail.append
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000195 for av in av[1]:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000196 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000197 # _compile_info(code, av, flags)
198 _compile(code, av, flags)
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200199 emit(JUMP)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000200 tailappend(_len(code)); emit(0)
201 code[skip] = _len(code) - skip
Serhiy Storchakaab140882014-11-11 21:13:28 +0200202 emit(FAILURE) # end of branch
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000203 for tail in tail:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000204 code[tail] = _len(code) - tail
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000205 elif op is CATEGORY:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200206 emit(op)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000207 if flags & SRE_FLAG_LOCALE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000208 av = CH_LOCALE[av]
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300209 elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000210 av = CH_UNICODE[av]
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200211 emit(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000212 elif op is GROUPREF:
213 if flags & SRE_FLAG_IGNORECASE:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200214 emit(OP_IGNORE[op])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000215 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200216 emit(op)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000217 emit(av-1)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000218 elif op is GROUPREF_EXISTS:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200219 emit(op)
Andrew M. Kuchlingc30faa82005-06-02 13:35:52 +0000220 emit(av[0]-1)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000221 skipyes = _len(code); emit(0)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000222 _compile(code, av[1], flags)
223 if av[2]:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200224 emit(JUMP)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000225 skipno = _len(code); emit(0)
226 code[skipyes] = _len(code) - skipyes + 1
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000227 _compile(code, av[2], flags)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000228 code[skipno] = _len(code) - skipno
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000229 else:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000230 code[skipyes] = _len(code) - skipyes + 1
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000231 else:
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200232 raise error("internal: unsupported operand type %r" % (op,))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000233
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300234def _compile_charset(charset, flags, code):
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000235 # compile charset subprogram
236 emit = code.append
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300237 for op, av in charset:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200238 emit(op)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000239 if op is NEGATE:
240 pass
241 elif op is LITERAL:
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200242 emit(av)
243 elif op is RANGE or op is RANGE_IGNORE:
244 emit(av[0])
245 emit(av[1])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000246 elif op is CHARSET:
247 code.extend(av)
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000248 elif op is BIGCHARSET:
249 code.extend(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000250 elif op is CATEGORY:
251 if flags & SRE_FLAG_LOCALE:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200252 emit(CH_LOCALE[av])
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300253 elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200254 emit(CH_UNICODE[av])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000255 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200256 emit(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000257 else:
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200258 raise error("internal: unsupported set operator %r" % (op,))
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200259 emit(FAILURE)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000260
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300261def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000262 # internal: optimize character set
Fredrik Lundh3562f112000-07-02 12:00:07 +0000263 out = []
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200264 tail = []
265 charmap = bytearray(256)
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300266 hascased = False
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200267 for op, av in charset:
268 while True:
269 try:
270 if op is LITERAL:
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200271 if fixup:
Serhiy Storchaka5619ab92014-11-10 12:43:14 +0200272 lo = fixup(av)
273 charmap[lo] = 1
274 if fixes and lo in fixes:
275 for k in fixes[lo]:
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200276 charmap[k] = 1
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300277 if not hascased and iscased(av):
278 hascased = True
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200279 else:
280 charmap[av] = 1
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200281 elif op is RANGE:
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200282 r = range(av[0], av[1]+1)
283 if fixup:
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300284 if fixes:
285 for i in map(fixup, r):
286 charmap[i] = 1
287 if i in fixes:
288 for k in fixes[i]:
289 charmap[k] = 1
290 else:
291 for i in map(fixup, r):
292 charmap[i] = 1
293 if not hascased:
294 hascased = any(map(iscased, r))
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200295 else:
296 for i in r:
297 charmap[i] = 1
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200298 elif op is NEGATE:
299 out.append((op, av))
300 else:
301 tail.append((op, av))
302 except IndexError:
303 if len(charmap) == 256:
304 # character set contains non-UCS1 character codes
305 charmap += b'\0' * 0xff00
306 continue
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200307 # Character set contains non-BMP character codes.
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300308 if fixup:
309 hascased = True
310 # There are only two ranges of cased non-BMP characters:
311 # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
312 # and for both ranges RANGE_IGNORE works.
313 if op is RANGE:
314 op = RANGE_IGNORE
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200315 tail.append((op, av))
316 break
317
Fredrik Lundh3562f112000-07-02 12:00:07 +0000318 # compress character map
Fredrik Lundh3562f112000-07-02 12:00:07 +0000319 runs = []
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200320 q = 0
321 while True:
322 p = charmap.find(1, q)
323 if p < 0:
324 break
325 if len(runs) >= 2:
326 runs = None
327 break
328 q = charmap.find(0, p)
329 if q < 0:
330 runs.append((p, len(charmap)))
331 break
332 runs.append((p, q))
333 if runs is not None:
Fredrik Lundh3562f112000-07-02 12:00:07 +0000334 # use literal/range
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200335 for p, q in runs:
336 if q - p == 1:
337 out.append((LITERAL, p))
Fredrik Lundh3562f112000-07-02 12:00:07 +0000338 else:
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200339 out.append((RANGE, (p, q - 1)))
340 out += tail
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200341 # if the case was changed or new representation is more compact
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300342 if hascased or len(out) < len(charset):
343 return out, hascased
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200344 # else original character set is good enough
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300345 return charset, hascased
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200346
347 # use bitmap
348 if len(charmap) == 256:
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000349 data = _mk_bitmap(charmap)
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200350 out.append((CHARSET, data))
351 out += tail
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300352 return out, hascased
Fredrik Lundh3562f112000-07-02 12:00:07 +0000353
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200354 # To represent a big charset, first a bitmap of all characters in the
355 # set is constructed. Then, this bitmap is sliced into chunks of 256
356 # characters, duplicate chunks are eliminated, and each chunk is
357 # given a number. In the compiled expression, the charset is
358 # represented by a 32-bit word sequence, consisting of one word for
359 # the number of different chunks, a sequence of 256 bytes (64 words)
360 # of chunk numbers indexed by their original chunk position, and a
361 # sequence of 256-bit chunks (8 words each).
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000362
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200363 # Compression is normally good: in a typical charset, large ranges of
364 # Unicode will be either completely excluded (e.g. if only cyrillic
365 # letters are to be matched), or completely included (e.g. if large
366 # subranges of Kanji match). These ranges will be represented by
367 # chunks of all one-bits or all zero-bits.
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000368
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200369 # Matching can be also done efficiently: the more significant byte of
370 # the Unicode character is an index into the chunk number, and the
371 # less significant byte is a bit index in the chunk (just like the
372 # CHARSET matching).
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000373
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200374 charmap = bytes(charmap) # should be hashable
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000375 comps = {}
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200376 mapping = bytearray(256)
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000377 block = 0
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200378 data = bytearray()
379 for i in range(0, 65536, 256):
380 chunk = charmap[i: i + 256]
381 if chunk in comps:
382 mapping[i // 256] = comps[chunk]
383 else:
384 mapping[i // 256] = comps[chunk] = block
385 block += 1
386 data += chunk
387 data = _mk_bitmap(data)
388 data[0:0] = [block] + _bytes_to_codes(mapping)
389 out.append((BIGCHARSET, data))
390 out += tail
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300391 return out, hascased
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200392
393_CODEBITS = _sre.CODESIZE * 8
Serhiy Storchakaab140882014-11-11 21:13:28 +0200394MAXCODE = (1 << _CODEBITS) - 1
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200395_BITS_TRANS = b'0' + b'1' * 255
396def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
397 s = bits.translate(_BITS_TRANS)[::-1]
398 return [_int(s[i - _CODEBITS: i], 2)
399 for i in range(len(s), 0, -_CODEBITS)]
400
401def _bytes_to_codes(b):
402 # Convert block indices to word array
Serhiy Storchaka19e91582014-11-10 13:24:47 +0200403 a = memoryview(b).cast('I')
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200404 assert a.itemsize == _sre.CODESIZE
405 assert len(a) * a.itemsize == len(b)
406 return a.tolist()
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000407
Serhiy Storchaka821a9d12017-05-14 08:32:33 +0300408def _simple(p):
409 # check if this subpattern is a "simple" operator
410 if len(p) != 1:
411 return False
412 op, av = p[0]
413 if op is SUBPATTERN:
414 return av[0] is None and _simple(av[-1])
415 return op in _UNIT_CODES
Fredrik Lundhe1869832000-08-01 22:47:49 +0000416
Antoine Pitrou79aa68d2013-10-25 21:36:10 +0200417def _generate_overlap_table(prefix):
418 """
419 Generate an overlap table for the following prefix.
420 An overlap table is a table of the same size as the prefix which
421 informs about the potential self-overlap for each index in the prefix:
422 - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
423 - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
424 prefix[0:k]
425 """
426 table = [0] * len(prefix)
427 for i in range(1, len(prefix)):
428 idx = table[i - 1]
429 while prefix[i] != prefix[idx]:
430 if idx == 0:
431 table[i] = 0
432 break
433 idx = table[idx - 1]
434 else:
435 table[i] = idx + 1
436 return table
437
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300438def _get_iscased(flags):
439 if not flags & SRE_FLAG_IGNORECASE:
440 return None
441 elif flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
442 return _sre.unicode_iscased
443 else:
444 return _sre.ascii_iscased
445
446def _get_literal_prefix(pattern, flags):
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300447 # look for literal prefix
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000448 prefix = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000449 prefixappend = prefix.append
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300450 prefix_skip = None
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300451 iscased = _get_iscased(flags)
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300452 for op, av in pattern.data:
453 if op is LITERAL:
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300454 if iscased and iscased(av):
455 break
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300456 prefixappend(av)
457 elif op is SUBPATTERN:
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300458 group, add_flags, del_flags, p = av
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300459 flags1 = (flags | add_flags) & ~del_flags
460 if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300461 break
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300462 prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300463 if prefix_skip is None:
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300464 if group is not None:
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300465 prefix_skip = len(prefix)
466 elif prefix_skip1 is not None:
467 prefix_skip = len(prefix) + prefix_skip1
468 prefix.extend(prefix1)
469 if not got_all:
470 break
471 else:
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300472 break
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300473 else:
474 return prefix, prefix_skip, True
475 return prefix, prefix_skip, False
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300476
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300477def _get_charset_prefix(pattern, flags):
478 while True:
479 if not pattern.data:
480 return None
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300481 op, av = pattern.data[0]
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300482 if op is not SUBPATTERN:
483 break
484 group, add_flags, del_flags, pattern = av
485 flags = (flags | add_flags) & ~del_flags
486 if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
487 return None
488
489 iscased = _get_iscased(flags)
490 if op is LITERAL:
491 if iscased and iscased(av):
492 return None
493 return [(op, av)]
494 elif op is BRANCH:
495 charset = []
496 charsetappend = charset.append
497 for p in av[1]:
498 if not p:
499 return None
500 op, av = p[0]
501 if op is LITERAL and not (iscased and iscased(av)):
502 charsetappend((op, av))
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300503 else:
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300504 return None
505 return charset
506 elif op is IN:
507 charset = av
508 if iscased:
509 for op, av in charset:
510 if op is LITERAL:
511 if iscased(av):
512 return None
513 elif op is RANGE:
514 if av[1] > 0xffff:
515 return None
516 if any(map(iscased, range(av[0], av[1]+1))):
517 return None
518 return charset
519 return None
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300520
521def _compile_info(code, pattern, flags):
522 # internal: compile an info block. in the current version,
523 # this contains min/max pattern width, and an optional literal
524 # prefix or a character map
525 lo, hi = pattern.getwidth()
526 if hi > MAXCODE:
527 hi = MAXCODE
528 if lo == 0:
529 code.extend([INFO, 4, 0, lo, hi])
530 return
531 # look for a literal prefix
532 prefix = []
533 prefix_skip = 0
534 charset = [] # not used
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300535 if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300536 # look for literal prefix
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300537 prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300538 # if no prefix, look for charset prefix
539 if not prefix:
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300540 charset = _get_charset_prefix(pattern, flags)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000541## if prefix:
Serhiy Storchakaab140882014-11-11 21:13:28 +0200542## print("*** PREFIX", prefix, prefix_skip)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000543## if charset:
Serhiy Storchakaab140882014-11-11 21:13:28 +0200544## print("*** CHARSET", charset)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000545 # add an info block
546 emit = code.append
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200547 emit(INFO)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000548 skip = len(code); emit(0)
549 # literal flag
550 mask = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000551 if prefix:
552 mask = SRE_INFO_PREFIX
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300553 if prefix_skip is None and got_all:
Serhiy Storchakaab140882014-11-11 21:13:28 +0200554 mask = mask | SRE_INFO_LITERAL
Fredrik Lundh3562f112000-07-02 12:00:07 +0000555 elif charset:
Serhiy Storchakaab140882014-11-11 21:13:28 +0200556 mask = mask | SRE_INFO_CHARSET
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000557 emit(mask)
558 # pattern length
Fredrik Lundh3562f112000-07-02 12:00:07 +0000559 if lo < MAXCODE:
560 emit(lo)
561 else:
562 emit(MAXCODE)
563 prefix = prefix[:MAXCODE]
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200564 emit(min(hi, MAXCODE))
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000565 # add literal prefix
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000566 if prefix:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000567 emit(len(prefix)) # length
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300568 if prefix_skip is None:
569 prefix_skip = len(prefix)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000570 emit(prefix_skip) # skip
571 code.extend(prefix)
572 # generate overlap table
Antoine Pitrou79aa68d2013-10-25 21:36:10 +0200573 code.extend(_generate_overlap_table(prefix))
Fredrik Lundh3562f112000-07-02 12:00:07 +0000574 elif charset:
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300575 charset, hascased = _optimize_charset(charset)
576 assert not hascased
Guido van Rossum577fb5a2003-02-24 01:18:35 +0000577 _compile_charset(charset, flags, code)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000578 code[skip] = len(code) - skip
579
Just van Rossum74902502003-07-02 21:37:16 +0000580def isstring(obj):
Thomas Wouters40a088d2008-03-18 20:19:54 +0000581 return isinstance(obj, (str, bytes))
Just van Rossum74902502003-07-02 21:37:16 +0000582
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000583def _code(p, flags):
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000584
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000585 flags = p.pattern.flags | flags
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000586 code = []
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000587
588 # compile info block
589 _compile_info(code, p, flags)
590
591 # compile the pattern
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000592 _compile(code, p.data, flags)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000593
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200594 code.append(SUCCESS)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000595
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000596 return code
597
Serhiy Storchaka4ab6abf2017-05-14 09:05:13 +0300598def _hex_code(code):
599 return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code)
600
601def dis(code):
602 import sys
603
604 labels = set()
605 level = 0
606 offset_width = len(str(len(code) - 1))
607
608 def dis_(start, end):
609 def print_(*args, to=None):
610 if to is not None:
611 labels.add(to)
612 args += ('(to %d)' % (to,),)
613 print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
614 end=' '*(level-1))
615 print(*args)
616
617 def print_2(*args):
618 print(end=' '*(offset_width + 2*level))
619 print(*args)
620
621 nonlocal level
622 level += 1
623 i = start
624 while i < end:
625 start = i
626 op = code[i]
627 i += 1
628 op = OPCODES[op]
629 if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
630 MAX_UNTIL, MIN_UNTIL, NEGATE):
631 print_(op)
632 elif op in (LITERAL, NOT_LITERAL,
633 LITERAL_IGNORE, NOT_LITERAL_IGNORE,
634 LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
635 arg = code[i]
636 i += 1
637 print_(op, '%#02x (%r)' % (arg, chr(arg)))
638 elif op is AT:
639 arg = code[i]
640 i += 1
641 arg = str(ATCODES[arg])
642 assert arg[:3] == 'AT_'
643 print_(op, arg[3:])
644 elif op is CATEGORY:
645 arg = code[i]
646 i += 1
647 arg = str(CHCODES[arg])
648 assert arg[:9] == 'CATEGORY_'
649 print_(op, arg[9:])
650 elif op in (IN, IN_IGNORE, IN_LOC_IGNORE):
651 skip = code[i]
652 print_(op, skip, to=i+skip)
653 dis_(i+1, i+skip)
654 i += skip
655 elif op in (RANGE, RANGE_IGNORE):
656 lo, hi = code[i: i+2]
657 i += 2
658 print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
659 elif op is CHARSET:
660 print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
661 i += 256//_CODEBITS
662 elif op is BIGCHARSET:
663 arg = code[i]
664 i += 1
665 mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
666 for x in code[i: i + 256//_sre.CODESIZE]))
667 print_(op, arg, mapping)
668 i += 256//_sre.CODESIZE
669 level += 1
670 for j in range(arg):
671 print_2(_hex_code(code[i: i + 256//_CODEBITS]))
672 i += 256//_CODEBITS
673 level -= 1
674 elif op in (MARK, GROUPREF, GROUPREF_IGNORE):
675 arg = code[i]
676 i += 1
677 print_(op, arg)
678 elif op is JUMP:
679 skip = code[i]
680 print_(op, skip, to=i+skip)
681 i += 1
682 elif op is BRANCH:
683 skip = code[i]
684 print_(op, skip, to=i+skip)
685 while skip:
686 dis_(i+1, i+skip)
687 i += skip
688 start = i
689 skip = code[i]
690 if skip:
691 print_('branch', skip, to=i+skip)
692 else:
693 print_(FAILURE)
694 i += 1
695 elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE):
696 skip, min, max = code[i: i+3]
697 if max == MAXREPEAT:
698 max = 'MAXREPEAT'
699 print_(op, skip, min, max, to=i+skip)
700 dis_(i+3, i+skip)
701 i += skip
702 elif op is GROUPREF_EXISTS:
703 arg, skip = code[i: i+2]
704 print_(op, arg, skip, to=i+skip)
705 i += 2
706 elif op in (ASSERT, ASSERT_NOT):
707 skip, arg = code[i: i+2]
708 print_(op, skip, arg, to=i+skip)
709 dis_(i+2, i+skip)
710 i += skip
711 elif op is INFO:
712 skip, flags, min, max = code[i: i+4]
713 if max == MAXREPEAT:
714 max = 'MAXREPEAT'
715 print_(op, skip, bin(flags), min, max, to=i+skip)
716 start = i+4
717 if flags & SRE_INFO_PREFIX:
718 prefix_len, prefix_skip = code[i+4: i+6]
719 print_2(' prefix_skip', prefix_skip)
720 start = i + 6
721 prefix = code[start: start+prefix_len]
722 print_2(' prefix',
723 '[%s]' % ', '.join('%#02x' % x for x in prefix),
724 '(%r)' % ''.join(map(chr, prefix)))
725 start += prefix_len
726 print_2(' overlap', code[start: start+prefix_len])
727 start += prefix_len
728 if flags & SRE_INFO_CHARSET:
729 level += 1
730 print_2('in')
731 dis_(start, i+skip)
732 level -= 1
733 i += skip
734 else:
735 raise ValueError(op)
736
737 level -= 1
738
739 dis_(0, len(code))
740
741
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000742def compile(p, flags=0):
743 # internal: convert pattern list to internal format
744
Just van Rossum74902502003-07-02 21:37:16 +0000745 if isstring(p):
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000746 pattern = p
747 p = sre_parse.parse(p, flags)
748 else:
749 pattern = None
750
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000751 code = _code(p, flags)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000752
Serhiy Storchaka4ab6abf2017-05-14 09:05:13 +0300753 if flags & SRE_FLAG_DEBUG:
754 print()
755 dis(code)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000756
Fredrik Lundhc2301732000-07-02 22:25:39 +0000757 # map in either direction
758 groupindex = p.pattern.groupdict
759 indexgroup = [None] * p.pattern.groups
760 for k, i in groupindex.items():
761 indexgroup[i] = k
762
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000763 return _sre.compile(
Christian Heimes072c0f12008-01-03 23:01:04 +0000764 pattern, flags | p.pattern.flags, code,
Fredrik Lundh6f013982000-07-03 18:44:21 +0000765 p.pattern.groups-1,
Victor Stinner726a57d2016-11-22 23:04:39 +0100766 groupindex, tuple(indexgroup)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000767 )