blob: 502b0616c6e54ea683877ba8d37d0e6b31410ae8 [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossum7627c0d2000-03-31 14:58:54 +00003#
4# convert template to internal format
5#
Fredrik Lundh770617b2001-01-14 15:06:11 +00006# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00007#
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00008# See the sre.py file for information on usage and redistribution.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00009#
10
Fred Drakeb8f22742001-09-04 19:10:20 +000011"""Internal support module for sre"""
12
Victor Stinner7fa767e2014-03-20 09:16:38 +010013import _sre
Christian Heimes5e696852008-04-09 08:37:03 +000014import sre_parse
Guido van Rossum7627c0d2000-03-31 14:58:54 +000015from sre_constants import *
16
Fredrik Lundhb35ffc02001-01-15 12:46:09 +000017assert _sre.MAGIC == MAGIC, "SRE module mismatch"
18
Raymond Hettingerdf1b6992014-11-09 15:56:33 -080019_LITERAL_CODES = {LITERAL, NOT_LITERAL}
20_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
21_SUCCESS_CODES = {SUCCESS, FAILURE}
22_ASSERT_CODES = {ASSERT, ASSERT_NOT}
Raymond Hettinger049ade22005-02-28 19:27:52 +000023
Serhiy Storchaka0c938f62014-11-10 12:37:16 +020024# Sets of lowercase characters which have the same uppercase.
25_equivalences = (
26 # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
27 (0x69, 0x131), # iı
28 # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
29 (0x73, 0x17f), # sſ
30 # MICRO SIGN, GREEK SMALL LETTER MU
31 (0xb5, 0x3bc), # µμ
32 # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
33 (0x345, 0x3b9, 0x1fbe), # \u0345ιι
34 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
35 (0x390, 0x1fd3), # ΐΐ
36 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
37 (0x3b0, 0x1fe3), # ΰΰ
38 # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
39 (0x3b2, 0x3d0), # βϐ
40 # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
41 (0x3b5, 0x3f5), # εϵ
42 # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
43 (0x3b8, 0x3d1), # θϑ
44 # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
45 (0x3ba, 0x3f0), # κϰ
46 # GREEK SMALL LETTER PI, GREEK PI SYMBOL
47 (0x3c0, 0x3d6), # πϖ
48 # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
49 (0x3c1, 0x3f1), # ρϱ
50 # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
51 (0x3c2, 0x3c3), # ςσ
52 # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
53 (0x3c6, 0x3d5), # φϕ
54 # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
55 (0x1e61, 0x1e9b), # ṡẛ
56 # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
57 (0xfb05, 0xfb06), # ſtst
58)
59
60# Maps the lowercase code to lowercase codes which have the same uppercase.
61_ignorecase_fixes = {i: tuple(j for j in t if i != j)
62 for t in _equivalences for i in t}
63
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000064def _compile(code, pattern, flags):
65 # internal: compile a (sub)pattern
66 emit = code.append
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000067 _len = len
Raymond Hettinger049ade22005-02-28 19:27:52 +000068 LITERAL_CODES = _LITERAL_CODES
69 REPEATING_CODES = _REPEATING_CODES
70 SUCCESS_CODES = _SUCCESS_CODES
71 ASSERT_CODES = _ASSERT_CODES
Serhiy Storchaka0c938f62014-11-10 12:37:16 +020072 if (flags & SRE_FLAG_IGNORECASE and
73 not (flags & SRE_FLAG_LOCALE) and
74 flags & SRE_FLAG_UNICODE):
75 fixes = _ignorecase_fixes
76 else:
77 fixes = None
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000078 for op, av in pattern:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000079 if op in LITERAL_CODES:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000080 if flags & SRE_FLAG_IGNORECASE:
Serhiy Storchaka0c938f62014-11-10 12:37:16 +020081 lo = _sre.getlower(av, flags)
82 if fixes and lo in fixes:
Serhiy Storchaka5619ab92014-11-10 12:43:14 +020083 emit(IN_IGNORE)
Serhiy Storchaka0c938f62014-11-10 12:37:16 +020084 skip = _len(code); emit(0)
85 if op is NOT_LITERAL:
Serhiy Storchaka5619ab92014-11-10 12:43:14 +020086 emit(NEGATE)
Serhiy Storchaka0c938f62014-11-10 12:37:16 +020087 for k in (lo,) + fixes[lo]:
Serhiy Storchaka5619ab92014-11-10 12:43:14 +020088 emit(LITERAL)
Serhiy Storchaka0c938f62014-11-10 12:37:16 +020089 emit(k)
Serhiy Storchaka5619ab92014-11-10 12:43:14 +020090 emit(FAILURE)
Serhiy Storchaka0c938f62014-11-10 12:37:16 +020091 code[skip] = _len(code) - skip
92 else:
Serhiy Storchaka5619ab92014-11-10 12:43:14 +020093 emit(OP_IGNORE[op])
Serhiy Storchaka0c938f62014-11-10 12:37:16 +020094 emit(lo)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000095 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +020096 emit(op)
Fredrik Lundh2e240442001-01-15 18:28:14 +000097 emit(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000098 elif op is IN:
99 if flags & SRE_FLAG_IGNORECASE:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200100 emit(OP_IGNORE[op])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000101 def fixup(literal, flags=flags):
102 return _sre.getlower(literal, flags)
103 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200104 emit(op)
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200105 fixup = None
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000106 skip = _len(code); emit(0)
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200107 _compile_charset(av, flags, code, fixup, fixes)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000108 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000109 elif op is ANY:
110 if flags & SRE_FLAG_DOTALL:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200111 emit(ANY_ALL)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000112 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200113 emit(ANY)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000114 elif op in REPEATING_CODES:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000115 if flags & SRE_FLAG_TEMPLATE:
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200116 raise error("internal: unsupported template operator %r" % (op,))
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000117 elif _simple(av) and op is not REPEAT:
118 if op is MAX_REPEAT:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200119 emit(REPEAT_ONE)
Guido van Rossum41c99e72003-04-14 17:59:34 +0000120 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200121 emit(MIN_REPEAT_ONE)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000122 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000123 emit(av[0])
124 emit(av[1])
125 _compile(code, av[2], flags)
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200126 emit(SUCCESS)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000127 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000128 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200129 emit(REPEAT)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000130 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000131 emit(av[0])
132 emit(av[1])
133 _compile(code, av[2], flags)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000134 code[skip] = _len(code) - skip
135 if op is MAX_REPEAT:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200136 emit(MAX_UNTIL)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000137 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200138 emit(MIN_UNTIL)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000139 elif op is SUBPATTERN:
140 if av[0]:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200141 emit(MARK)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000142 emit((av[0]-1)*2)
143 # _compile_info(code, av[1], flags)
144 _compile(code, av[1], flags)
145 if av[0]:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200146 emit(MARK)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000147 emit((av[0]-1)*2+1)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000148 elif op in SUCCESS_CODES:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200149 emit(op)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000150 elif op in ASSERT_CODES:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200151 emit(op)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000152 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000153 if av[0] >= 0:
154 emit(0) # look ahead
155 else:
156 lo, hi = av[1].getwidth()
157 if lo != hi:
Collin Winterce36ad82007-08-30 01:19:48 +0000158 raise error("look-behind requires fixed-width pattern")
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000159 emit(lo) # look behind
160 _compile(code, av[1], flags)
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200161 emit(SUCCESS)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000162 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000163 elif op is CALL:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200164 emit(op)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000165 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000166 _compile(code, av, flags)
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200167 emit(SUCCESS)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000168 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000169 elif op is AT:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200170 emit(op)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000171 if flags & SRE_FLAG_MULTILINE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000172 av = AT_MULTILINE.get(av, av)
173 if flags & SRE_FLAG_LOCALE:
174 av = AT_LOCALE.get(av, av)
175 elif flags & SRE_FLAG_UNICODE:
176 av = AT_UNICODE.get(av, av)
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200177 emit(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000178 elif op is BRANCH:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200179 emit(op)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000180 tail = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000181 tailappend = tail.append
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000182 for av in av[1]:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000183 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000184 # _compile_info(code, av, flags)
185 _compile(code, av, flags)
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200186 emit(JUMP)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000187 tailappend(_len(code)); emit(0)
188 code[skip] = _len(code) - skip
Serhiy Storchakaab140882014-11-11 21:13:28 +0200189 emit(FAILURE) # end of branch
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000190 for tail in tail:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000191 code[tail] = _len(code) - tail
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000192 elif op is CATEGORY:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200193 emit(op)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000194 if flags & SRE_FLAG_LOCALE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000195 av = CH_LOCALE[av]
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000196 elif flags & SRE_FLAG_UNICODE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000197 av = CH_UNICODE[av]
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200198 emit(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000199 elif op is GROUPREF:
200 if flags & SRE_FLAG_IGNORECASE:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200201 emit(OP_IGNORE[op])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000202 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200203 emit(op)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000204 emit(av-1)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000205 elif op is GROUPREF_EXISTS:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200206 emit(op)
Andrew M. Kuchlingc30faa82005-06-02 13:35:52 +0000207 emit(av[0]-1)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000208 skipyes = _len(code); emit(0)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000209 _compile(code, av[1], flags)
210 if av[2]:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200211 emit(JUMP)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000212 skipno = _len(code); emit(0)
213 code[skipyes] = _len(code) - skipyes + 1
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000214 _compile(code, av[2], flags)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000215 code[skipno] = _len(code) - skipno
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000216 else:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000217 code[skipyes] = _len(code) - skipyes + 1
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000218 else:
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200219 raise error("internal: unsupported operand type %r" % (op,))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000220
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200221def _compile_charset(charset, flags, code, fixup=None, fixes=None):
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000222 # compile charset subprogram
223 emit = code.append
Serhiy Storchaka5619ab92014-11-10 12:43:14 +0200224 for op, av in _optimize_charset(charset, fixup, fixes):
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200225 emit(op)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000226 if op is NEGATE:
227 pass
228 elif op is LITERAL:
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200229 emit(av)
230 elif op is RANGE or op is RANGE_IGNORE:
231 emit(av[0])
232 emit(av[1])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000233 elif op is CHARSET:
234 code.extend(av)
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000235 elif op is BIGCHARSET:
236 code.extend(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000237 elif op is CATEGORY:
238 if flags & SRE_FLAG_LOCALE:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200239 emit(CH_LOCALE[av])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000240 elif flags & SRE_FLAG_UNICODE:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200241 emit(CH_UNICODE[av])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000242 else:
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200243 emit(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000244 else:
Serhiy Storchaka632a77e2015-03-25 21:03:47 +0200245 raise error("internal: unsupported set operator %r" % (op,))
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200246 emit(FAILURE)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000247
Serhiy Storchaka5619ab92014-11-10 12:43:14 +0200248def _optimize_charset(charset, fixup, fixes):
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000249 # internal: optimize character set
Fredrik Lundh3562f112000-07-02 12:00:07 +0000250 out = []
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200251 tail = []
252 charmap = bytearray(256)
253 for op, av in charset:
254 while True:
255 try:
256 if op is LITERAL:
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200257 if fixup:
Serhiy Storchaka5619ab92014-11-10 12:43:14 +0200258 lo = fixup(av)
259 charmap[lo] = 1
260 if fixes and lo in fixes:
261 for k in fixes[lo]:
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200262 charmap[k] = 1
263 else:
264 charmap[av] = 1
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200265 elif op is RANGE:
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200266 r = range(av[0], av[1]+1)
267 if fixup:
268 r = map(fixup, r)
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200269 if fixup and fixes:
270 for i in r:
271 charmap[i] = 1
272 if i in fixes:
273 for k in fixes[i]:
274 charmap[k] = 1
275 else:
276 for i in r:
277 charmap[i] = 1
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200278 elif op is NEGATE:
279 out.append((op, av))
280 else:
281 tail.append((op, av))
282 except IndexError:
283 if len(charmap) == 256:
284 # character set contains non-UCS1 character codes
285 charmap += b'\0' * 0xff00
286 continue
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200287 # Character set contains non-BMP character codes.
288 # There are only two ranges of cased non-BMP characters:
289 # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
290 # and for both ranges RANGE_IGNORE works.
291 if fixup and op is RANGE:
292 op = RANGE_IGNORE
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200293 tail.append((op, av))
294 break
295
Fredrik Lundh3562f112000-07-02 12:00:07 +0000296 # compress character map
Fredrik Lundh3562f112000-07-02 12:00:07 +0000297 runs = []
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200298 q = 0
299 while True:
300 p = charmap.find(1, q)
301 if p < 0:
302 break
303 if len(runs) >= 2:
304 runs = None
305 break
306 q = charmap.find(0, p)
307 if q < 0:
308 runs.append((p, len(charmap)))
309 break
310 runs.append((p, q))
311 if runs is not None:
Fredrik Lundh3562f112000-07-02 12:00:07 +0000312 # use literal/range
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200313 for p, q in runs:
314 if q - p == 1:
315 out.append((LITERAL, p))
Fredrik Lundh3562f112000-07-02 12:00:07 +0000316 else:
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200317 out.append((RANGE, (p, q - 1)))
318 out += tail
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200319 # if the case was changed or new representation is more compact
320 if fixup or len(out) < len(charset):
Fredrik Lundh3562f112000-07-02 12:00:07 +0000321 return out
Serhiy Storchaka4b8f8942014-10-31 12:36:56 +0200322 # else original character set is good enough
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200323 return charset
324
325 # use bitmap
326 if len(charmap) == 256:
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000327 data = _mk_bitmap(charmap)
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200328 out.append((CHARSET, data))
329 out += tail
Fredrik Lundh3562f112000-07-02 12:00:07 +0000330 return out
Fredrik Lundh3562f112000-07-02 12:00:07 +0000331
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200332 # To represent a big charset, first a bitmap of all characters in the
333 # set is constructed. Then, this bitmap is sliced into chunks of 256
334 # characters, duplicate chunks are eliminated, and each chunk is
335 # given a number. In the compiled expression, the charset is
336 # represented by a 32-bit word sequence, consisting of one word for
337 # the number of different chunks, a sequence of 256 bytes (64 words)
338 # of chunk numbers indexed by their original chunk position, and a
339 # sequence of 256-bit chunks (8 words each).
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000340
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200341 # Compression is normally good: in a typical charset, large ranges of
342 # Unicode will be either completely excluded (e.g. if only cyrillic
343 # letters are to be matched), or completely included (e.g. if large
344 # subranges of Kanji match). These ranges will be represented by
345 # chunks of all one-bits or all zero-bits.
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000346
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200347 # Matching can be also done efficiently: the more significant byte of
348 # the Unicode character is an index into the chunk number, and the
349 # less significant byte is a bit index in the chunk (just like the
350 # CHARSET matching).
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000351
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200352 charmap = bytes(charmap) # should be hashable
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000353 comps = {}
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200354 mapping = bytearray(256)
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000355 block = 0
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200356 data = bytearray()
357 for i in range(0, 65536, 256):
358 chunk = charmap[i: i + 256]
359 if chunk in comps:
360 mapping[i // 256] = comps[chunk]
361 else:
362 mapping[i // 256] = comps[chunk] = block
363 block += 1
364 data += chunk
365 data = _mk_bitmap(data)
366 data[0:0] = [block] + _bytes_to_codes(mapping)
367 out.append((BIGCHARSET, data))
368 out += tail
369 return out
370
371_CODEBITS = _sre.CODESIZE * 8
Serhiy Storchakaab140882014-11-11 21:13:28 +0200372MAXCODE = (1 << _CODEBITS) - 1
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200373_BITS_TRANS = b'0' + b'1' * 255
374def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
375 s = bits.translate(_BITS_TRANS)[::-1]
376 return [_int(s[i - _CODEBITS: i], 2)
377 for i in range(len(s), 0, -_CODEBITS)]
378
379def _bytes_to_codes(b):
380 # Convert block indices to word array
Serhiy Storchaka19e91582014-11-10 13:24:47 +0200381 a = memoryview(b).cast('I')
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200382 assert a.itemsize == _sre.CODESIZE
383 assert len(a) * a.itemsize == len(b)
384 return a.tolist()
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000385
Fredrik Lundhe1869832000-08-01 22:47:49 +0000386def _simple(av):
387 # check if av is a "simple" operator
388 lo, hi = av[2].getwidth()
Fredrik Lundhe1869832000-08-01 22:47:49 +0000389 return lo == hi == 1 and av[2][0][0] != SUBPATTERN
390
Antoine Pitrou79aa68d2013-10-25 21:36:10 +0200391def _generate_overlap_table(prefix):
392 """
393 Generate an overlap table for the following prefix.
394 An overlap table is a table of the same size as the prefix which
395 informs about the potential self-overlap for each index in the prefix:
396 - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
397 - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
398 prefix[0:k]
399 """
400 table = [0] * len(prefix)
401 for i in range(1, len(prefix)):
402 idx = table[i - 1]
403 while prefix[i] != prefix[idx]:
404 if idx == 0:
405 table[i] = 0
406 break
407 idx = table[idx - 1]
408 else:
409 table[i] = idx + 1
410 return table
411
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000412def _compile_info(code, pattern, flags):
413 # internal: compile an info block. in the current version,
Fredrik Lundh3562f112000-07-02 12:00:07 +0000414 # this contains min/max pattern width, and an optional literal
415 # prefix or a character map
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000416 lo, hi = pattern.getwidth()
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200417 if hi > MAXCODE:
418 hi = MAXCODE
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000419 if lo == 0:
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200420 code.extend([INFO, 4, 0, lo, hi])
421 return
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000422 # look for a literal prefix
423 prefix = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000424 prefixappend = prefix.append
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000425 prefix_skip = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000426 charset = [] # not used
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000427 charsetappend = charset.append
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000428 if not (flags & SRE_FLAG_IGNORECASE):
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000429 # look for literal prefix
Fredrik Lundh90a07912000-06-30 07:50:59 +0000430 for op, av in pattern.data:
431 if op is LITERAL:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000432 if len(prefix) == prefix_skip:
433 prefix_skip = prefix_skip + 1
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000434 prefixappend(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000435 elif op is SUBPATTERN and len(av[1]) == 1:
436 op, av = av[1][0]
437 if op is LITERAL:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000438 prefixappend(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000439 else:
440 break
Fredrik Lundh90a07912000-06-30 07:50:59 +0000441 else:
442 break
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000443 # if no prefix, look for charset prefix
444 if not prefix and pattern.data:
445 op, av = pattern.data[0]
446 if op is SUBPATTERN and av[1]:
447 op, av = av[1][0]
448 if op is LITERAL:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000449 charsetappend((op, av))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000450 elif op is BRANCH:
451 c = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000452 cappend = c.append
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000453 for p in av[1]:
454 if not p:
455 break
456 op, av = p[0]
457 if op is LITERAL:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000458 cappend((op, av))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000459 else:
460 break
461 else:
462 charset = c
463 elif op is BRANCH:
464 c = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000465 cappend = c.append
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000466 for p in av[1]:
467 if not p:
468 break
469 op, av = p[0]
470 if op is LITERAL:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000471 cappend((op, av))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000472 else:
473 break
474 else:
475 charset = c
476 elif op is IN:
477 charset = av
478## if prefix:
Serhiy Storchakaab140882014-11-11 21:13:28 +0200479## print("*** PREFIX", prefix, prefix_skip)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000480## if charset:
Serhiy Storchakaab140882014-11-11 21:13:28 +0200481## print("*** CHARSET", charset)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000482 # add an info block
483 emit = code.append
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200484 emit(INFO)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000485 skip = len(code); emit(0)
486 # literal flag
487 mask = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000488 if prefix:
489 mask = SRE_INFO_PREFIX
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000490 if len(prefix) == prefix_skip == len(pattern.data):
Serhiy Storchakaab140882014-11-11 21:13:28 +0200491 mask = mask | SRE_INFO_LITERAL
Fredrik Lundh3562f112000-07-02 12:00:07 +0000492 elif charset:
Serhiy Storchakaab140882014-11-11 21:13:28 +0200493 mask = mask | SRE_INFO_CHARSET
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000494 emit(mask)
495 # pattern length
Fredrik Lundh3562f112000-07-02 12:00:07 +0000496 if lo < MAXCODE:
497 emit(lo)
498 else:
499 emit(MAXCODE)
500 prefix = prefix[:MAXCODE]
Serhiy Storchaka83e80272015-02-03 11:04:19 +0200501 emit(min(hi, MAXCODE))
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000502 # add literal prefix
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000503 if prefix:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000504 emit(len(prefix)) # length
505 emit(prefix_skip) # skip
506 code.extend(prefix)
507 # generate overlap table
Antoine Pitrou79aa68d2013-10-25 21:36:10 +0200508 code.extend(_generate_overlap_table(prefix))
Fredrik Lundh3562f112000-07-02 12:00:07 +0000509 elif charset:
Guido van Rossum577fb5a2003-02-24 01:18:35 +0000510 _compile_charset(charset, flags, code)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000511 code[skip] = len(code) - skip
512
Just van Rossum74902502003-07-02 21:37:16 +0000513def isstring(obj):
Thomas Wouters40a088d2008-03-18 20:19:54 +0000514 return isinstance(obj, (str, bytes))
Just van Rossum74902502003-07-02 21:37:16 +0000515
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000516def _code(p, flags):
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000517
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000518 flags = p.pattern.flags | flags
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000519 code = []
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000520
521 # compile info block
522 _compile_info(code, p, flags)
523
524 # compile the pattern
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000525 _compile(code, p.data, flags)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000526
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200527 code.append(SUCCESS)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000528
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000529 return code
530
531def compile(p, flags=0):
532 # internal: convert pattern list to internal format
533
Just van Rossum74902502003-07-02 21:37:16 +0000534 if isstring(p):
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000535 pattern = p
536 p = sre_parse.parse(p, flags)
537 else:
538 pattern = None
539
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000540 code = _code(p, flags)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000541
Serhiy Storchakac7f7d382014-11-09 20:48:36 +0200542 # print(code)
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000543
Fredrik Lundhc2301732000-07-02 22:25:39 +0000544 # map in either direction
545 groupindex = p.pattern.groupdict
546 indexgroup = [None] * p.pattern.groups
547 for k, i in groupindex.items():
548 indexgroup[i] = k
549
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000550 return _sre.compile(
Christian Heimes072c0f12008-01-03 23:01:04 +0000551 pattern, flags | p.pattern.flags, code,
Fredrik Lundh6f013982000-07-03 18:44:21 +0000552 p.pattern.groups-1,
553 groupindex, indexgroup
Fredrik Lundh90a07912000-06-30 07:50:59 +0000554 )