blob: cebecb93c0ab80373853799cd12db937a1aed784 [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossum7627c0d2000-03-31 14:58:54 +00003#
4# convert template to internal format
5#
Fredrik Lundh770617b2001-01-14 15:06:11 +00006# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00007#
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00008# See the sre.py file for information on usage and redistribution.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00009#
10
Fred Drakeb8f22742001-09-04 19:10:20 +000011"""Internal support module for sre"""
12
Victor Stinner7fa767e2014-03-20 09:16:38 +010013import _sre
Christian Heimes5e696852008-04-09 08:37:03 +000014import sre_parse
Guido van Rossum7627c0d2000-03-31 14:58:54 +000015from sre_constants import *
16
Fredrik Lundhb35ffc02001-01-15 12:46:09 +000017assert _sre.MAGIC == MAGIC, "SRE module mismatch"
18
Raymond Hettingerdf1b6992014-11-09 15:56:33 -080019_LITERAL_CODES = {LITERAL, NOT_LITERAL}
20_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
21_SUCCESS_CODES = {SUCCESS, FAILURE}
22_ASSERT_CODES = {ASSERT, ASSERT_NOT}
Raymond Hettinger049ade22005-02-28 19:27:52 +000023
Serhiy Storchaka0c938f62014-11-10 12:37:16 +020024# Sets of lowercase characters which have the same uppercase.
25_equivalences = (
26 # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
27 (0x69, 0x131), # iı
28 # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
29 (0x73, 0x17f), # sſ
30 # MICRO SIGN, GREEK SMALL LETTER MU
31 (0xb5, 0x3bc), # µμ
32 # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
33 (0x345, 0x3b9, 0x1fbe), # \u0345ιι
34 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
35 (0x390, 0x1fd3), # ΐΐ
36 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
37 (0x3b0, 0x1fe3), # ΰΰ
38 # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
39 (0x3b2, 0x3d0), # βϐ
40 # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
41 (0x3b5, 0x3f5), # εϵ
42 # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
43 (0x3b8, 0x3d1), # θϑ
44 # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
45 (0x3ba, 0x3f0), # κϰ
46 # GREEK SMALL LETTER PI, GREEK PI SYMBOL
47 (0x3c0, 0x3d6), # πϖ
48 # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
49 (0x3c1, 0x3f1), # ρϱ
50 # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
51 (0x3c2, 0x3c3), # ςσ
52 # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
53 (0x3c6, 0x3d5), # φϕ
54 # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
55 (0x1e61, 0x1e9b), # ṡẛ
56 # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
57 (0xfb05, 0xfb06), # ſtst
58)
59
60# Maps the lowercase code to lowercase codes which have the same uppercase.
61_ignorecase_fixes = {i: tuple(j for j in t if i != j)
62 for t in _equivalences for i in t}
63
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000064def _compile(code, pattern, flags):
65 # internal: compile a (sub)pattern
66 emit = code.append
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000067 _len = len
Raymond Hettinger049ade22005-02-28 19:27:52 +000068 LITERAL_CODES = _LITERAL_CODES
69 REPEATING_CODES = _REPEATING_CODES
70 SUCCESS_CODES = _SUCCESS_CODES
71 ASSERT_CODES = _ASSERT_CODES
Serhiy Storchaka6d336a02017-05-09 23:37:14 +030072 iscased = None
Serhiy Storchaka7186cc22017-05-05 10:42:46 +030073 tolower = None
74 fixes = None
75 if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
76 if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
Serhiy Storchaka6d336a02017-05-09 23:37:14 +030077 iscased = _sre.unicode_iscased
Serhiy Storchaka7186cc22017-05-05 10:42:46 +030078 tolower = _sre.unicode_tolower
79 fixes = _ignorecase_fixes
80 else:
Serhiy Storchaka6d336a02017-05-09 23:37:14 +030081 iscased = _sre.ascii_iscased
Serhiy Storchaka7186cc22017-05-05 10:42:46 +030082 tolower = _sre.ascii_tolower
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000083 for op, av in pattern:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000084 if op in LITERAL_CODES:
Serhiy Storchaka898ff032017-05-05 08:53:40 +030085 if not flags & SRE_FLAG_IGNORECASE:
86 emit(op)
87 emit(av)
88 elif flags & SRE_FLAG_LOCALE:
89 emit(OP_LOC_IGNORE[op])
90 emit(av)
Serhiy Storchaka6d336a02017-05-09 23:37:14 +030091 elif not iscased(av):
92 emit(op)
93 emit(av)
Serhiy Storchaka898ff032017-05-05 08:53:40 +030094 else:
Serhiy Storchaka7186cc22017-05-05 10:42:46 +030095 lo = tolower(av)
Serhiy Storchaka0c938f62014-11-10 12:37:16 +020096 if fixes and lo in fixes:
Serhiy Storchaka5619ab92014-11-10 12:43:14 +020097 emit(IN_IGNORE)
Serhiy Storchaka0c938f62014-11-10 12:37:16 +020098 skip = _len(code); emit(0)
99 if op is NOT_LITERAL:
Serhiy Storchaka5619ab92014-11-10 12:43:14 +0200100 emit(NEGATE)
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200101 for k in (lo,) + fixes[lo]:
Serhiy Storchaka5619ab92014-11-10 12:43:14 +0200102 emit(LITERAL)
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200103 emit(k)
Serhiy Storchaka5619ab92014-11-10 12:43:14 +0200104 emit(FAILURE)
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200105 code[skip] = _len(code) - skip
106 else:
Serhiy Storchaka5619ab92014-11-10 12:43:14 +0200107 emit(OP_IGNORE[op])
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200108 emit(lo)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000109 elif op is IN:
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300110 charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
111 if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300112 emit(IN_LOC_IGNORE)
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300113 elif hascased:
Serhiy Storchaka898ff032017-05-05 08:53:40 +0300114 emit(IN_IGNORE)
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300115 else:
116 emit(IN)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000117 skip = _len(code); emit(0)
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300118 _compile_charset(charset, flags, code)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000119 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000120 elif op is ANY:
121 if flags & SRE_FLAG_DOTALL:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200122 emit(ANY_ALL)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000123 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200124 emit(ANY)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000125 elif op in REPEATING_CODES:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000126 if flags & SRE_FLAG_TEMPLATE:
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200127 raise error("internal: unsupported template operator %r" % (op,))
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000128 elif _simple(av) and op is not REPEAT:
129 if op is MAX_REPEAT:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200130 emit(REPEAT_ONE)
Guido van Rossum41c99e72003-04-14 17:59:34 +0000131 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200132 emit(MIN_REPEAT_ONE)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000133 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000134 emit(av[0])
135 emit(av[1])
136 _compile(code, av[2], flags)
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200137 emit(SUCCESS)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000138 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000139 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200140 emit(REPEAT)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000141 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000142 emit(av[0])
143 emit(av[1])
144 _compile(code, av[2], flags)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000145 code[skip] = _len(code) - skip
146 if op is MAX_REPEAT:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200147 emit(MAX_UNTIL)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000148 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200149 emit(MIN_UNTIL)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000150 elif op is SUBPATTERN:
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300151 group, add_flags, del_flags, p = av
152 if group:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200153 emit(MARK)
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300154 emit((group-1)*2)
155 # _compile_info(code, p, (flags | add_flags) & ~del_flags)
156 _compile(code, p, (flags | add_flags) & ~del_flags)
157 if group:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200158 emit(MARK)
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300159 emit((group-1)*2+1)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000160 elif op in SUCCESS_CODES:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200161 emit(op)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000162 elif op in ASSERT_CODES:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200163 emit(op)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000164 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000165 if av[0] >= 0:
166 emit(0) # look ahead
167 else:
168 lo, hi = av[1].getwidth()
169 if lo != hi:
Collin Winterce36ad82007-08-30 01:19:48 +0000170 raise error("look-behind requires fixed-width pattern")
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000171 emit(lo) # look behind
172 _compile(code, av[1], flags)
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200173 emit(SUCCESS)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000174 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000175 elif op is CALL:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200176 emit(op)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000177 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000178 _compile(code, av, flags)
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200179 emit(SUCCESS)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000180 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000181 elif op is AT:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200182 emit(op)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000183 if flags & SRE_FLAG_MULTILINE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000184 av = AT_MULTILINE.get(av, av)
185 if flags & SRE_FLAG_LOCALE:
186 av = AT_LOCALE.get(av, av)
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300187 elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000188 av = AT_UNICODE.get(av, av)
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200189 emit(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000190 elif op is BRANCH:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200191 emit(op)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000192 tail = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000193 tailappend = tail.append
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000194 for av in av[1]:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000195 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000196 # _compile_info(code, av, flags)
197 _compile(code, av, flags)
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200198 emit(JUMP)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000199 tailappend(_len(code)); emit(0)
200 code[skip] = _len(code) - skip
Serhiy Storchakaab140882014-11-11 21:13:28 +0200201 emit(FAILURE) # end of branch
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000202 for tail in tail:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000203 code[tail] = _len(code) - tail
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000204 elif op is CATEGORY:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200205 emit(op)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000206 if flags & SRE_FLAG_LOCALE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000207 av = CH_LOCALE[av]
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300208 elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000209 av = CH_UNICODE[av]
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200210 emit(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000211 elif op is GROUPREF:
212 if flags & SRE_FLAG_IGNORECASE:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200213 emit(OP_IGNORE[op])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000214 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200215 emit(op)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000216 emit(av-1)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000217 elif op is GROUPREF_EXISTS:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200218 emit(op)
Andrew M. Kuchlingc30faa82005-06-02 13:35:52 +0000219 emit(av[0]-1)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000220 skipyes = _len(code); emit(0)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000221 _compile(code, av[1], flags)
222 if av[2]:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200223 emit(JUMP)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000224 skipno = _len(code); emit(0)
225 code[skipyes] = _len(code) - skipyes + 1
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000226 _compile(code, av[2], flags)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000227 code[skipno] = _len(code) - skipno
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000228 else:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000229 code[skipyes] = _len(code) - skipyes + 1
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000230 else:
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200231 raise error("internal: unsupported operand type %r" % (op,))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000232
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300233def _compile_charset(charset, flags, code):
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000234 # compile charset subprogram
235 emit = code.append
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300236 for op, av in charset:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200237 emit(op)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000238 if op is NEGATE:
239 pass
240 elif op is LITERAL:
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200241 emit(av)
242 elif op is RANGE or op is RANGE_IGNORE:
243 emit(av[0])
244 emit(av[1])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000245 elif op is CHARSET:
246 code.extend(av)
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000247 elif op is BIGCHARSET:
248 code.extend(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000249 elif op is CATEGORY:
250 if flags & SRE_FLAG_LOCALE:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200251 emit(CH_LOCALE[av])
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300252 elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200253 emit(CH_UNICODE[av])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000254 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200255 emit(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000256 else:
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200257 raise error("internal: unsupported set operator %r" % (op,))
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200258 emit(FAILURE)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000259
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300260def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000261 # internal: optimize character set
Fredrik Lundh3562f112000-07-02 12:00:07 +0000262 out = []
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200263 tail = []
264 charmap = bytearray(256)
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300265 hascased = False
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200266 for op, av in charset:
267 while True:
268 try:
269 if op is LITERAL:
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200270 if fixup:
Serhiy Storchaka5619ab92014-11-10 12:43:14 +0200271 lo = fixup(av)
272 charmap[lo] = 1
273 if fixes and lo in fixes:
274 for k in fixes[lo]:
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200275 charmap[k] = 1
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300276 if not hascased and iscased(av):
277 hascased = True
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200278 else:
279 charmap[av] = 1
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200280 elif op is RANGE:
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200281 r = range(av[0], av[1]+1)
282 if fixup:
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300283 if fixes:
284 for i in map(fixup, r):
285 charmap[i] = 1
286 if i in fixes:
287 for k in fixes[i]:
288 charmap[k] = 1
289 else:
290 for i in map(fixup, r):
291 charmap[i] = 1
292 if not hascased:
293 hascased = any(map(iscased, r))
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200294 else:
295 for i in r:
296 charmap[i] = 1
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200297 elif op is NEGATE:
298 out.append((op, av))
299 else:
300 tail.append((op, av))
301 except IndexError:
302 if len(charmap) == 256:
303 # character set contains non-UCS1 character codes
304 charmap += b'\0' * 0xff00
305 continue
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200306 # Character set contains non-BMP character codes.
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300307 if fixup:
308 hascased = True
309 # There are only two ranges of cased non-BMP characters:
310 # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
311 # and for both ranges RANGE_IGNORE works.
312 if op is RANGE:
313 op = RANGE_IGNORE
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200314 tail.append((op, av))
315 break
316
Fredrik Lundh3562f112000-07-02 12:00:07 +0000317 # compress character map
Fredrik Lundh3562f112000-07-02 12:00:07 +0000318 runs = []
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200319 q = 0
320 while True:
321 p = charmap.find(1, q)
322 if p < 0:
323 break
324 if len(runs) >= 2:
325 runs = None
326 break
327 q = charmap.find(0, p)
328 if q < 0:
329 runs.append((p, len(charmap)))
330 break
331 runs.append((p, q))
332 if runs is not None:
Fredrik Lundh3562f112000-07-02 12:00:07 +0000333 # use literal/range
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200334 for p, q in runs:
335 if q - p == 1:
336 out.append((LITERAL, p))
Fredrik Lundh3562f112000-07-02 12:00:07 +0000337 else:
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200338 out.append((RANGE, (p, q - 1)))
339 out += tail
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200340 # if the case was changed or new representation is more compact
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300341 if hascased or len(out) < len(charset):
342 return out, hascased
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200343 # else original character set is good enough
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300344 return charset, hascased
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200345
346 # use bitmap
347 if len(charmap) == 256:
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000348 data = _mk_bitmap(charmap)
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200349 out.append((CHARSET, data))
350 out += tail
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300351 return out, hascased
Fredrik Lundh3562f112000-07-02 12:00:07 +0000352
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200353 # To represent a big charset, first a bitmap of all characters in the
354 # set is constructed. Then, this bitmap is sliced into chunks of 256
355 # characters, duplicate chunks are eliminated, and each chunk is
356 # given a number. In the compiled expression, the charset is
357 # represented by a 32-bit word sequence, consisting of one word for
358 # the number of different chunks, a sequence of 256 bytes (64 words)
359 # of chunk numbers indexed by their original chunk position, and a
360 # sequence of 256-bit chunks (8 words each).
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000361
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200362 # Compression is normally good: in a typical charset, large ranges of
363 # Unicode will be either completely excluded (e.g. if only cyrillic
364 # letters are to be matched), or completely included (e.g. if large
365 # subranges of Kanji match). These ranges will be represented by
366 # chunks of all one-bits or all zero-bits.
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000367
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200368 # Matching can be also done efficiently: the more significant byte of
369 # the Unicode character is an index into the chunk number, and the
370 # less significant byte is a bit index in the chunk (just like the
371 # CHARSET matching).
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000372
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200373 charmap = bytes(charmap) # should be hashable
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000374 comps = {}
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200375 mapping = bytearray(256)
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000376 block = 0
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200377 data = bytearray()
378 for i in range(0, 65536, 256):
379 chunk = charmap[i: i + 256]
380 if chunk in comps:
381 mapping[i // 256] = comps[chunk]
382 else:
383 mapping[i // 256] = comps[chunk] = block
384 block += 1
385 data += chunk
386 data = _mk_bitmap(data)
387 data[0:0] = [block] + _bytes_to_codes(mapping)
388 out.append((BIGCHARSET, data))
389 out += tail
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300390 return out, hascased
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200391
392_CODEBITS = _sre.CODESIZE * 8
Serhiy Storchakaab140882014-11-11 21:13:28 +0200393MAXCODE = (1 << _CODEBITS) - 1
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200394_BITS_TRANS = b'0' + b'1' * 255
395def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
396 s = bits.translate(_BITS_TRANS)[::-1]
397 return [_int(s[i - _CODEBITS: i], 2)
398 for i in range(len(s), 0, -_CODEBITS)]
399
400def _bytes_to_codes(b):
401 # Convert block indices to word array
Serhiy Storchaka19e91582014-11-10 13:24:47 +0200402 a = memoryview(b).cast('I')
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200403 assert a.itemsize == _sre.CODESIZE
404 assert len(a) * a.itemsize == len(b)
405 return a.tolist()
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000406
Fredrik Lundhe1869832000-08-01 22:47:49 +0000407def _simple(av):
408 # check if av is a "simple" operator
409 lo, hi = av[2].getwidth()
Fredrik Lundhe1869832000-08-01 22:47:49 +0000410 return lo == hi == 1 and av[2][0][0] != SUBPATTERN
411
Antoine Pitrou79aa68d2013-10-25 21:36:10 +0200412def _generate_overlap_table(prefix):
413 """
414 Generate an overlap table for the following prefix.
415 An overlap table is a table of the same size as the prefix which
416 informs about the potential self-overlap for each index in the prefix:
417 - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
418 - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
419 prefix[0:k]
420 """
421 table = [0] * len(prefix)
422 for i in range(1, len(prefix)):
423 idx = table[i - 1]
424 while prefix[i] != prefix[idx]:
425 if idx == 0:
426 table[i] = 0
427 break
428 idx = table[idx - 1]
429 else:
430 table[i] = idx + 1
431 return table
432
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300433def _get_iscased(flags):
434 if not flags & SRE_FLAG_IGNORECASE:
435 return None
436 elif flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
437 return _sre.unicode_iscased
438 else:
439 return _sre.ascii_iscased
440
441def _get_literal_prefix(pattern, flags):
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300442 # look for literal prefix
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000443 prefix = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000444 prefixappend = prefix.append
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300445 prefix_skip = None
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300446 iscased = _get_iscased(flags)
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300447 for op, av in pattern.data:
448 if op is LITERAL:
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300449 if iscased and iscased(av):
450 break
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300451 prefixappend(av)
452 elif op is SUBPATTERN:
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300453 group, add_flags, del_flags, p = av
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300454 flags1 = (flags | add_flags) & ~del_flags
455 if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300456 break
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300457 prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300458 if prefix_skip is None:
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300459 if group is not None:
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300460 prefix_skip = len(prefix)
461 elif prefix_skip1 is not None:
462 prefix_skip = len(prefix) + prefix_skip1
463 prefix.extend(prefix1)
464 if not got_all:
465 break
466 else:
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300467 break
Serhiy Storchakabe9a4e52016-09-10 00:57:55 +0300468 else:
469 return prefix, prefix_skip, True
470 return prefix, prefix_skip, False
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300471
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300472def _get_charset_prefix(pattern, flags):
473 while True:
474 if not pattern.data:
475 return None
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300476 op, av = pattern.data[0]
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300477 if op is not SUBPATTERN:
478 break
479 group, add_flags, del_flags, pattern = av
480 flags = (flags | add_flags) & ~del_flags
481 if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
482 return None
483
484 iscased = _get_iscased(flags)
485 if op is LITERAL:
486 if iscased and iscased(av):
487 return None
488 return [(op, av)]
489 elif op is BRANCH:
490 charset = []
491 charsetappend = charset.append
492 for p in av[1]:
493 if not p:
494 return None
495 op, av = p[0]
496 if op is LITERAL and not (iscased and iscased(av)):
497 charsetappend((op, av))
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300498 else:
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300499 return None
500 return charset
501 elif op is IN:
502 charset = av
503 if iscased:
504 for op, av in charset:
505 if op is LITERAL:
506 if iscased(av):
507 return None
508 elif op is RANGE:
509 if av[1] > 0xffff:
510 return None
511 if any(map(iscased, range(av[0], av[1]+1))):
512 return None
513 return charset
514 return None
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300515
516def _compile_info(code, pattern, flags):
517 # internal: compile an info block. in the current version,
518 # this contains min/max pattern width, and an optional literal
519 # prefix or a character map
520 lo, hi = pattern.getwidth()
521 if hi > MAXCODE:
522 hi = MAXCODE
523 if lo == 0:
524 code.extend([INFO, 4, 0, lo, hi])
525 return
526 # look for a literal prefix
527 prefix = []
528 prefix_skip = 0
529 charset = [] # not used
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300530 if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300531 # look for literal prefix
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300532 prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300533 # if no prefix, look for charset prefix
534 if not prefix:
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300535 charset = _get_charset_prefix(pattern, flags)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000536## if prefix:
Serhiy Storchakaab140882014-11-11 21:13:28 +0200537## print("*** PREFIX", prefix, prefix_skip)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000538## if charset:
Serhiy Storchakaab140882014-11-11 21:13:28 +0200539## print("*** CHARSET", charset)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000540 # add an info block
541 emit = code.append
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200542 emit(INFO)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000543 skip = len(code); emit(0)
544 # literal flag
545 mask = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000546 if prefix:
547 mask = SRE_INFO_PREFIX
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300548 if prefix_skip is None and got_all:
Serhiy Storchakaab140882014-11-11 21:13:28 +0200549 mask = mask | SRE_INFO_LITERAL
Fredrik Lundh3562f112000-07-02 12:00:07 +0000550 elif charset:
Serhiy Storchakaab140882014-11-11 21:13:28 +0200551 mask = mask | SRE_INFO_CHARSET
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000552 emit(mask)
553 # pattern length
Fredrik Lundh3562f112000-07-02 12:00:07 +0000554 if lo < MAXCODE:
555 emit(lo)
556 else:
557 emit(MAXCODE)
558 prefix = prefix[:MAXCODE]
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200559 emit(min(hi, MAXCODE))
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000560 # add literal prefix
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000561 if prefix:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000562 emit(len(prefix)) # length
Serhiy Storchaka66dc4642015-06-21 14:06:55 +0300563 if prefix_skip is None:
564 prefix_skip = len(prefix)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000565 emit(prefix_skip) # skip
566 code.extend(prefix)
567 # generate overlap table
Antoine Pitrou79aa68d2013-10-25 21:36:10 +0200568 code.extend(_generate_overlap_table(prefix))
Fredrik Lundh3562f112000-07-02 12:00:07 +0000569 elif charset:
Serhiy Storchaka6d336a02017-05-09 23:37:14 +0300570 charset, hascased = _optimize_charset(charset)
571 assert not hascased
Guido van Rossum577fb5a2003-02-24 01:18:35 +0000572 _compile_charset(charset, flags, code)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000573 code[skip] = len(code) - skip
574
Just van Rossum74902502003-07-02 21:37:16 +0000575def isstring(obj):
Thomas Wouters40a088d2008-03-18 20:19:54 +0000576 return isinstance(obj, (str, bytes))
Just van Rossum74902502003-07-02 21:37:16 +0000577
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000578def _code(p, flags):
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000579
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000580 flags = p.pattern.flags | flags
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000581 code = []
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000582
583 # compile info block
584 _compile_info(code, p, flags)
585
586 # compile the pattern
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000587 _compile(code, p.data, flags)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000588
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200589 code.append(SUCCESS)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000590
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000591 return code
592
593def compile(p, flags=0):
594 # internal: convert pattern list to internal format
595
Just van Rossum74902502003-07-02 21:37:16 +0000596 if isstring(p):
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000597 pattern = p
598 p = sre_parse.parse(p, flags)
599 else:
600 pattern = None
601
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000602 code = _code(p, flags)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000603
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200604 # print(code)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000605
Fredrik Lundhc2301732000-07-02 22:25:39 +0000606 # map in either direction
607 groupindex = p.pattern.groupdict
608 indexgroup = [None] * p.pattern.groups
609 for k, i in groupindex.items():
610 indexgroup[i] = k
611
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000612 return _sre.compile(
Christian Heimes072c0f12008-01-03 23:01:04 +0000613 pattern, flags | p.pattern.flags, code,
Fredrik Lundh6f013982000-07-03 18:44:21 +0000614 p.pattern.groups-1,
Victor Stinner726a57d2016-11-22 23:04:39 +0100615 groupindex, tuple(indexgroup)
Fredrik Lundh90a07912000-06-30 07:50:59 +0000616 )