blob: 550ea158afd6f4b2ef6e6dbf4a4de8754b66f709 [file] [log] [blame]
Guido van Rossum7627c0d2000-03-31 14:58:54 +00001#
2# Secret Labs' Regular Expression Engine
Guido van Rossum7627c0d2000-03-31 14:58:54 +00003#
4# convert template to internal format
5#
Fredrik Lundh770617b2001-01-14 15:06:11 +00006# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00007#
Fredrik Lundh29c4ba92000-08-01 18:20:07 +00008# See the sre.py file for information on usage and redistribution.
Guido van Rossum7627c0d2000-03-31 14:58:54 +00009#
10
Fred Drakeb8f22742001-09-04 19:10:20 +000011"""Internal support module for sre"""
12
Victor Stinner7fa767e2014-03-20 09:16:38 +010013import _sre
Christian Heimes5e696852008-04-09 08:37:03 +000014import sre_parse
Guido van Rossum7627c0d2000-03-31 14:58:54 +000015from sre_constants import *
Serhiy Storchaka70ca0212013-02-16 16:47:47 +020016from _sre import MAXREPEAT
Guido van Rossum7627c0d2000-03-31 14:58:54 +000017
Fredrik Lundhb35ffc02001-01-15 12:46:09 +000018assert _sre.MAGIC == MAGIC, "SRE module mismatch"
19
Martin v. Löwis78e2f062003-04-19 12:56:08 +000020if _sre.CODESIZE == 2:
21 MAXCODE = 65535
22else:
Guido van Rossume2a383d2007-01-15 16:59:06 +000023 MAXCODE = 0xFFFFFFFF
Fredrik Lundh3562f112000-07-02 12:00:07 +000024
Raymond Hettinger049ade22005-02-28 19:27:52 +000025_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
26_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
27_SUCCESS_CODES = set([SUCCESS, FAILURE])
28_ASSERT_CODES = set([ASSERT, ASSERT_NOT])
29
Serhiy Storchaka0c938f62014-11-10 12:37:16 +020030# Sets of lowercase characters which have the same uppercase.
31_equivalences = (
32 # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
33 (0x69, 0x131), # iı
34 # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
35 (0x73, 0x17f), # sſ
36 # MICRO SIGN, GREEK SMALL LETTER MU
37 (0xb5, 0x3bc), # µμ
38 # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
39 (0x345, 0x3b9, 0x1fbe), # \u0345ιι
40 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
41 (0x390, 0x1fd3), # ΐΐ
42 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
43 (0x3b0, 0x1fe3), # ΰΰ
44 # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
45 (0x3b2, 0x3d0), # βϐ
46 # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
47 (0x3b5, 0x3f5), # εϵ
48 # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
49 (0x3b8, 0x3d1), # θϑ
50 # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
51 (0x3ba, 0x3f0), # κϰ
52 # GREEK SMALL LETTER PI, GREEK PI SYMBOL
53 (0x3c0, 0x3d6), # πϖ
54 # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
55 (0x3c1, 0x3f1), # ρϱ
56 # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
57 (0x3c2, 0x3c3), # ςσ
58 # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
59 (0x3c6, 0x3d5), # φϕ
60 # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
61 (0x1e61, 0x1e9b), # ṡẛ
62 # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
63 (0xfb05, 0xfb06), # ſtst
64)
65
66# Maps the lowercase code to lowercase codes which have the same uppercase.
67_ignorecase_fixes = {i: tuple(j for j in t if i != j)
68 for t in _equivalences for i in t}
69
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000070def _compile(code, pattern, flags):
71 # internal: compile a (sub)pattern
72 emit = code.append
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000073 _len = len
Raymond Hettinger049ade22005-02-28 19:27:52 +000074 LITERAL_CODES = _LITERAL_CODES
75 REPEATING_CODES = _REPEATING_CODES
76 SUCCESS_CODES = _SUCCESS_CODES
77 ASSERT_CODES = _ASSERT_CODES
Serhiy Storchaka0c938f62014-11-10 12:37:16 +020078 if (flags & SRE_FLAG_IGNORECASE and
79 not (flags & SRE_FLAG_LOCALE) and
80 flags & SRE_FLAG_UNICODE):
81 fixes = _ignorecase_fixes
82 else:
83 fixes = None
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000084 for op, av in pattern:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +000085 if op in LITERAL_CODES:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +000086 if flags & SRE_FLAG_IGNORECASE:
Serhiy Storchaka0c938f62014-11-10 12:37:16 +020087 lo = _sre.getlower(av, flags)
88 if fixes and lo in fixes:
89 emit(OPCODES[IN_IGNORE])
90 skip = _len(code); emit(0)
91 if op is NOT_LITERAL:
92 emit(OPCODES[NEGATE])
93 for k in (lo,) + fixes[lo]:
94 emit(OPCODES[LITERAL])
95 emit(k)
96 emit(OPCODES[FAILURE])
97 code[skip] = _len(code) - skip
98 else:
99 emit(OPCODES[OP_IGNORE[op]])
100 emit(lo)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000101 else:
102 emit(OPCODES[op])
Fredrik Lundh2e240442001-01-15 18:28:14 +0000103 emit(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000104 elif op is IN:
105 if flags & SRE_FLAG_IGNORECASE:
106 emit(OPCODES[OP_IGNORE[op]])
107 def fixup(literal, flags=flags):
108 return _sre.getlower(literal, flags)
109 else:
110 emit(OPCODES[op])
Serhiy Storchakab1847e72014-10-31 12:37:50 +0200111 fixup = None
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000112 skip = _len(code); emit(0)
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200113 _compile_charset(av, flags, code, fixup, fixes)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000114 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000115 elif op is ANY:
116 if flags & SRE_FLAG_DOTALL:
117 emit(OPCODES[ANY_ALL])
118 else:
119 emit(OPCODES[ANY])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000120 elif op in REPEATING_CODES:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000121 if flags & SRE_FLAG_TEMPLATE:
Collin Winterce36ad82007-08-30 01:19:48 +0000122 raise error("internal: unsupported template operator")
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000123 elif _simple(av) and op is not REPEAT:
124 if op is MAX_REPEAT:
Guido van Rossum41c99e72003-04-14 17:59:34 +0000125 emit(OPCODES[REPEAT_ONE])
126 else:
127 emit(OPCODES[MIN_REPEAT_ONE])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000128 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000129 emit(av[0])
130 emit(av[1])
131 _compile(code, av[2], flags)
132 emit(OPCODES[SUCCESS])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000133 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000134 else:
135 emit(OPCODES[REPEAT])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000136 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000137 emit(av[0])
138 emit(av[1])
139 _compile(code, av[2], flags)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000140 code[skip] = _len(code) - skip
141 if op is MAX_REPEAT:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000142 emit(OPCODES[MAX_UNTIL])
143 else:
144 emit(OPCODES[MIN_UNTIL])
145 elif op is SUBPATTERN:
146 if av[0]:
147 emit(OPCODES[MARK])
148 emit((av[0]-1)*2)
149 # _compile_info(code, av[1], flags)
150 _compile(code, av[1], flags)
151 if av[0]:
152 emit(OPCODES[MARK])
153 emit((av[0]-1)*2+1)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000154 elif op in SUCCESS_CODES:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000155 emit(OPCODES[op])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000156 elif op in ASSERT_CODES:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000157 emit(OPCODES[op])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000158 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000159 if av[0] >= 0:
160 emit(0) # look ahead
161 else:
162 lo, hi = av[1].getwidth()
163 if lo != hi:
Collin Winterce36ad82007-08-30 01:19:48 +0000164 raise error("look-behind requires fixed-width pattern")
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000165 emit(lo) # look behind
166 _compile(code, av[1], flags)
167 emit(OPCODES[SUCCESS])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000168 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000169 elif op is CALL:
170 emit(OPCODES[op])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000171 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000172 _compile(code, av, flags)
173 emit(OPCODES[SUCCESS])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000174 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000175 elif op is AT:
176 emit(OPCODES[op])
177 if flags & SRE_FLAG_MULTILINE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000178 av = AT_MULTILINE.get(av, av)
179 if flags & SRE_FLAG_LOCALE:
180 av = AT_LOCALE.get(av, av)
181 elif flags & SRE_FLAG_UNICODE:
182 av = AT_UNICODE.get(av, av)
183 emit(ATCODES[av])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000184 elif op is BRANCH:
185 emit(OPCODES[op])
186 tail = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000187 tailappend = tail.append
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000188 for av in av[1]:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000189 skip = _len(code); emit(0)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000190 # _compile_info(code, av, flags)
191 _compile(code, av, flags)
192 emit(OPCODES[JUMP])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000193 tailappend(_len(code)); emit(0)
194 code[skip] = _len(code) - skip
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000195 emit(0) # end of branch
196 for tail in tail:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000197 code[tail] = _len(code) - tail
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000198 elif op is CATEGORY:
199 emit(OPCODES[op])
200 if flags & SRE_FLAG_LOCALE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000201 av = CH_LOCALE[av]
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000202 elif flags & SRE_FLAG_UNICODE:
Fredrik Lundhb25e1ad2001-03-22 15:50:10 +0000203 av = CH_UNICODE[av]
204 emit(CHCODES[av])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000205 elif op is GROUPREF:
206 if flags & SRE_FLAG_IGNORECASE:
207 emit(OPCODES[OP_IGNORE[op]])
208 else:
209 emit(OPCODES[op])
210 emit(av-1)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000211 elif op is GROUPREF_EXISTS:
212 emit(OPCODES[op])
Andrew M. Kuchlingc30faa82005-06-02 13:35:52 +0000213 emit(av[0]-1)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000214 skipyes = _len(code); emit(0)
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000215 _compile(code, av[1], flags)
216 if av[2]:
217 emit(OPCODES[JUMP])
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000218 skipno = _len(code); emit(0)
219 code[skipyes] = _len(code) - skipyes + 1
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000220 _compile(code, av[2], flags)
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000221 code[skipno] = _len(code) - skipno
Gustavo Niemeyerad3fc442003-10-17 22:13:16 +0000222 else:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000223 code[skipyes] = _len(code) - skipyes + 1
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000224 else:
Collin Winterce36ad82007-08-30 01:19:48 +0000225 raise ValueError("unsupported operand type", op)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000226
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200227def _compile_charset(charset, flags, code, fixup=None, fixes=None):
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000228 # compile charset subprogram
229 emit = code.append
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200230 for op, av in _optimize_charset(charset, fixup, fixes,
231 flags & SRE_FLAG_UNICODE):
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000232 emit(OPCODES[op])
233 if op is NEGATE:
234 pass
235 elif op is LITERAL:
Serhiy Storchakab1847e72014-10-31 12:37:50 +0200236 emit(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000237 elif op is RANGE:
Serhiy Storchakab1847e72014-10-31 12:37:50 +0200238 emit(av[0])
239 emit(av[1])
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000240 elif op is CHARSET:
241 code.extend(av)
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000242 elif op is BIGCHARSET:
243 code.extend(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000244 elif op is CATEGORY:
245 if flags & SRE_FLAG_LOCALE:
246 emit(CHCODES[CH_LOCALE[av]])
247 elif flags & SRE_FLAG_UNICODE:
248 emit(CHCODES[CH_UNICODE[av]])
249 else:
250 emit(CHCODES[av])
251 else:
Collin Winterce36ad82007-08-30 01:19:48 +0000252 raise error("internal: unsupported set operator")
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000253 emit(OPCODES[FAILURE])
254
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200255def _optimize_charset(charset, fixup, fixes, isunicode):
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000256 # internal: optimize character set
Fredrik Lundh3562f112000-07-02 12:00:07 +0000257 out = []
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200258 tail = []
259 charmap = bytearray(256)
260 for op, av in charset:
261 while True:
262 try:
263 if op is LITERAL:
Serhiy Storchakab1847e72014-10-31 12:37:50 +0200264 if fixup:
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200265 i = fixup(av)
266 charmap[i] = 1
267 if fixes and i in fixes:
268 for k in fixes[i]:
269 charmap[k] = 1
270 else:
271 charmap[av] = 1
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200272 elif op is RANGE:
Serhiy Storchakab1847e72014-10-31 12:37:50 +0200273 r = range(av[0], av[1]+1)
274 if fixup:
275 r = map(fixup, r)
Serhiy Storchaka0c938f62014-11-10 12:37:16 +0200276 if fixup and fixes:
277 for i in r:
278 charmap[i] = 1
279 if i in fixes:
280 for k in fixes[i]:
281 charmap[k] = 1
282 else:
283 for i in r:
284 charmap[i] = 1
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200285 elif op is NEGATE:
286 out.append((op, av))
287 else:
288 tail.append((op, av))
289 except IndexError:
290 if len(charmap) == 256:
291 # character set contains non-UCS1 character codes
292 charmap += b'\0' * 0xff00
293 continue
294 # character set contains non-BMP character codes
Serhiy Storchakab1847e72014-10-31 12:37:50 +0200295 if fixup and isunicode and op is RANGE:
296 lo, hi = av
297 ranges = [av]
298 # There are only two ranges of cased astral characters:
299 # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi).
300 _fixup_range(max(0x10000, lo), min(0x11fff, hi),
301 ranges, fixup)
302 for lo, hi in ranges:
303 if lo == hi:
304 tail.append((LITERAL, hi))
305 else:
306 tail.append((RANGE, (lo, hi)))
307 else:
308 tail.append((op, av))
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200309 break
310
Fredrik Lundh3562f112000-07-02 12:00:07 +0000311 # compress character map
Fredrik Lundh3562f112000-07-02 12:00:07 +0000312 runs = []
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200313 q = 0
314 while True:
315 p = charmap.find(1, q)
316 if p < 0:
317 break
318 if len(runs) >= 2:
319 runs = None
320 break
321 q = charmap.find(0, p)
322 if q < 0:
323 runs.append((p, len(charmap)))
324 break
325 runs.append((p, q))
326 if runs is not None:
Fredrik Lundh3562f112000-07-02 12:00:07 +0000327 # use literal/range
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200328 for p, q in runs:
329 if q - p == 1:
330 out.append((LITERAL, p))
Fredrik Lundh3562f112000-07-02 12:00:07 +0000331 else:
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200332 out.append((RANGE, (p, q - 1)))
333 out += tail
Serhiy Storchakab1847e72014-10-31 12:37:50 +0200334 # if the case was changed or new representation is more compact
335 if fixup or len(out) < len(charset):
Fredrik Lundh3562f112000-07-02 12:00:07 +0000336 return out
Serhiy Storchakab1847e72014-10-31 12:37:50 +0200337 # else original character set is good enough
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200338 return charset
339
340 # use bitmap
341 if len(charmap) == 256:
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000342 data = _mk_bitmap(charmap)
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200343 out.append((CHARSET, data))
344 out += tail
Fredrik Lundh3562f112000-07-02 12:00:07 +0000345 return out
Fredrik Lundh3562f112000-07-02 12:00:07 +0000346
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200347 # To represent a big charset, first a bitmap of all characters in the
348 # set is constructed. Then, this bitmap is sliced into chunks of 256
349 # characters, duplicate chunks are eliminated, and each chunk is
350 # given a number. In the compiled expression, the charset is
351 # represented by a 32-bit word sequence, consisting of one word for
352 # the number of different chunks, a sequence of 256 bytes (64 words)
353 # of chunk numbers indexed by their original chunk position, and a
354 # sequence of 256-bit chunks (8 words each).
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000355
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200356 # Compression is normally good: in a typical charset, large ranges of
357 # Unicode will be either completely excluded (e.g. if only cyrillic
358 # letters are to be matched), or completely included (e.g. if large
359 # subranges of Kanji match). These ranges will be represented by
360 # chunks of all one-bits or all zero-bits.
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000361
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200362 # Matching can be also done efficiently: the more significant byte of
363 # the Unicode character is an index into the chunk number, and the
364 # less significant byte is a bit index in the chunk (just like the
365 # CHARSET matching).
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000366
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200367 charmap = bytes(charmap) # should be hashable
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000368 comps = {}
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200369 mapping = bytearray(256)
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000370 block = 0
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200371 data = bytearray()
372 for i in range(0, 65536, 256):
373 chunk = charmap[i: i + 256]
374 if chunk in comps:
375 mapping[i // 256] = comps[chunk]
376 else:
377 mapping[i // 256] = comps[chunk] = block
378 block += 1
379 data += chunk
380 data = _mk_bitmap(data)
381 data[0:0] = [block] + _bytes_to_codes(mapping)
382 out.append((BIGCHARSET, data))
383 out += tail
384 return out
385
Serhiy Storchakab1847e72014-10-31 12:37:50 +0200386def _fixup_range(lo, hi, ranges, fixup):
387 for i in map(fixup, range(lo, hi+1)):
388 for k, (lo, hi) in enumerate(ranges):
389 if i < lo:
390 if l == lo - 1:
391 ranges[k] = (i, hi)
392 else:
393 ranges.insert(k, (i, i))
394 break
395 elif i > hi:
396 if i == hi + 1:
397 ranges[k] = (lo, i)
398 break
399 else:
400 break
401 else:
402 ranges.append((i, i))
403
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200404_CODEBITS = _sre.CODESIZE * 8
405_BITS_TRANS = b'0' + b'1' * 255
406def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
407 s = bits.translate(_BITS_TRANS)[::-1]
408 return [_int(s[i - _CODEBITS: i], 2)
409 for i in range(len(s), 0, -_CODEBITS)]
410
411def _bytes_to_codes(b):
412 # Convert block indices to word array
Serhiy Storchaka19e91582014-11-10 13:24:47 +0200413 a = memoryview(b).cast('I')
Serhiy Storchaka68457be2013-10-27 08:20:29 +0200414 assert a.itemsize == _sre.CODESIZE
415 assert len(a) * a.itemsize == len(b)
416 return a.tolist()
Fredrik Lundh19af43d2001-07-02 16:58:38 +0000417
Fredrik Lundhe1869832000-08-01 22:47:49 +0000418def _simple(av):
419 # check if av is a "simple" operator
420 lo, hi = av[2].getwidth()
Fredrik Lundhe1869832000-08-01 22:47:49 +0000421 return lo == hi == 1 and av[2][0][0] != SUBPATTERN
422
Antoine Pitrou79aa68d2013-10-25 21:36:10 +0200423def _generate_overlap_table(prefix):
424 """
425 Generate an overlap table for the following prefix.
426 An overlap table is a table of the same size as the prefix which
427 informs about the potential self-overlap for each index in the prefix:
428 - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
429 - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
430 prefix[0:k]
431 """
432 table = [0] * len(prefix)
433 for i in range(1, len(prefix)):
434 idx = table[i - 1]
435 while prefix[i] != prefix[idx]:
436 if idx == 0:
437 table[i] = 0
438 break
439 idx = table[idx - 1]
440 else:
441 table[i] = idx + 1
442 return table
443
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000444def _compile_info(code, pattern, flags):
445 # internal: compile an info block. in the current version,
Fredrik Lundh3562f112000-07-02 12:00:07 +0000446 # this contains min/max pattern width, and an optional literal
447 # prefix or a character map
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000448 lo, hi = pattern.getwidth()
449 if lo == 0:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000450 return # not worth it
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000451 # look for a literal prefix
452 prefix = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000453 prefixappend = prefix.append
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000454 prefix_skip = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000455 charset = [] # not used
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000456 charsetappend = charset.append
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000457 if not (flags & SRE_FLAG_IGNORECASE):
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000458 # look for literal prefix
Fredrik Lundh90a07912000-06-30 07:50:59 +0000459 for op, av in pattern.data:
460 if op is LITERAL:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000461 if len(prefix) == prefix_skip:
462 prefix_skip = prefix_skip + 1
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000463 prefixappend(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000464 elif op is SUBPATTERN and len(av[1]) == 1:
465 op, av = av[1][0]
466 if op is LITERAL:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000467 prefixappend(av)
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000468 else:
469 break
Fredrik Lundh90a07912000-06-30 07:50:59 +0000470 else:
471 break
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000472 # if no prefix, look for charset prefix
473 if not prefix and pattern.data:
474 op, av = pattern.data[0]
475 if op is SUBPATTERN and av[1]:
476 op, av = av[1][0]
477 if op is LITERAL:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000478 charsetappend((op, av))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000479 elif op is BRANCH:
480 c = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000481 cappend = c.append
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000482 for p in av[1]:
483 if not p:
484 break
485 op, av = p[0]
486 if op is LITERAL:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000487 cappend((op, av))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000488 else:
489 break
490 else:
491 charset = c
492 elif op is BRANCH:
493 c = []
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000494 cappend = c.append
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000495 for p in av[1]:
496 if not p:
497 break
498 op, av = p[0]
499 if op is LITERAL:
Raymond Hettinger01c9f8c2004-03-26 11:16:55 +0000500 cappend((op, av))
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000501 else:
502 break
503 else:
504 charset = c
505 elif op is IN:
506 charset = av
507## if prefix:
508## print "*** PREFIX", prefix, prefix_skip
509## if charset:
510## print "*** CHARSET", charset
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000511 # add an info block
512 emit = code.append
513 emit(OPCODES[INFO])
514 skip = len(code); emit(0)
515 # literal flag
516 mask = 0
Fredrik Lundh3562f112000-07-02 12:00:07 +0000517 if prefix:
518 mask = SRE_INFO_PREFIX
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000519 if len(prefix) == prefix_skip == len(pattern.data):
Fredrik Lundh3562f112000-07-02 12:00:07 +0000520 mask = mask + SRE_INFO_LITERAL
521 elif charset:
522 mask = mask + SRE_INFO_CHARSET
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000523 emit(mask)
524 # pattern length
Fredrik Lundh3562f112000-07-02 12:00:07 +0000525 if lo < MAXCODE:
526 emit(lo)
527 else:
528 emit(MAXCODE)
529 prefix = prefix[:MAXCODE]
530 if hi < MAXCODE:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000531 emit(hi)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000532 else:
Fredrik Lundh90a07912000-06-30 07:50:59 +0000533 emit(0)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000534 # add literal prefix
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000535 if prefix:
Fredrik Lundh7898c3e2000-08-07 20:59:04 +0000536 emit(len(prefix)) # length
537 emit(prefix_skip) # skip
538 code.extend(prefix)
539 # generate overlap table
Antoine Pitrou79aa68d2013-10-25 21:36:10 +0200540 code.extend(_generate_overlap_table(prefix))
Fredrik Lundh3562f112000-07-02 12:00:07 +0000541 elif charset:
Guido van Rossum577fb5a2003-02-24 01:18:35 +0000542 _compile_charset(charset, flags, code)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000543 code[skip] = len(code) - skip
544
Just van Rossum74902502003-07-02 21:37:16 +0000545def isstring(obj):
Thomas Wouters40a088d2008-03-18 20:19:54 +0000546 return isinstance(obj, (str, bytes))
Just van Rossum74902502003-07-02 21:37:16 +0000547
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000548def _code(p, flags):
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000549
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000550 flags = p.pattern.flags | flags
Fredrik Lundhbe2211e2000-06-29 16:57:40 +0000551 code = []
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000552
553 # compile info block
554 _compile_info(code, p, flags)
555
556 # compile the pattern
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000557 _compile(code, p.data, flags)
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000558
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000559 code.append(OPCODES[SUCCESS])
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000560
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000561 return code
562
563def compile(p, flags=0):
564 # internal: convert pattern list to internal format
565
Just van Rossum74902502003-07-02 21:37:16 +0000566 if isstring(p):
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000567 pattern = p
568 p = sre_parse.parse(p, flags)
569 else:
570 pattern = None
571
Fredrik Lundh2f2c67d2000-08-01 21:05:41 +0000572 code = _code(p, flags)
Fredrik Lundh29c4ba92000-08-01 18:20:07 +0000573
Fredrik Lundh8a3ebf82000-07-23 21:46:17 +0000574 # print code
575
Fredrik Lundh770617b2001-01-14 15:06:11 +0000576 # XXX: <fl> get rid of this limitation!
Fredrik Lundh5e7d51b2004-10-15 06:15:08 +0000577 if p.pattern.groups > 100:
578 raise AssertionError(
579 "sorry, but this version only supports 100 named groups"
580 )
Fredrik Lundh29c08be2000-06-29 23:33:12 +0000581
Fredrik Lundhc2301732000-07-02 22:25:39 +0000582 # map in either direction
583 groupindex = p.pattern.groupdict
584 indexgroup = [None] * p.pattern.groups
585 for k, i in groupindex.items():
586 indexgroup[i] = k
587
Jeremy Hyltonb1aa1952000-06-01 17:39:12 +0000588 return _sre.compile(
Christian Heimes072c0f12008-01-03 23:01:04 +0000589 pattern, flags | p.pattern.flags, code,
Fredrik Lundh6f013982000-07-03 18:44:21 +0000590 p.pattern.groups-1,
591 groupindex, indexgroup
Fredrik Lundh90a07912000-06-30 07:50:59 +0000592 )