Blame - python/lib/Lib/sre_compile.py - platform/tools/idea

blob: a2279de8190c79fd9c74c38b58f0f37033ff3ed4 [file] [log] [blame]

Tor Norbye	3a2425a	2013-11-04 10:16:08 -0800	[diff] [blame^]	1	#
				2	# Secret Labs' Regular Expression Engine
				3	#
				4	# convert template to internal format
				5	#
				6	# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
				7	#
				8	# See the sre.py file for information on usage and redistribution.
				9	#
				10
				11	"""Internal support module for sre"""
				12
				13	import _sre, sys
				14
				15	from sre_constants import *
				16
				17	assert _sre.MAGIC == MAGIC, "SRE module mismatch"
				18
				19	if _sre.CODESIZE == 2:
				20	MAXCODE = 65535
				21	else:
				22	MAXCODE = 0xFFFFFFFFL
				23
				24	def _identityfunction(x):
				25	return x
				26
				27	def set(seq):
				28	s = {}
				29	for elem in seq:
				30	s[elem] = 1
				31	return s
				32
				33	_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
				34	_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
				35	_SUCCESS_CODES = set([SUCCESS, FAILURE])
				36	_ASSERT_CODES = set([ASSERT, ASSERT_NOT])
				37
				38	def _compile(code, pattern, flags):
				39	# internal: compile a (sub)pattern
				40	emit = code.append
				41	_len = len
				42	LITERAL_CODES = _LITERAL_CODES
				43	REPEATING_CODES = _REPEATING_CODES
				44	SUCCESS_CODES = _SUCCESS_CODES
				45	ASSERT_CODES = _ASSERT_CODES
				46	for op, av in pattern:
				47	if op in LITERAL_CODES:
				48	if flags & SRE_FLAG_IGNORECASE:
				49	emit(OPCODES[OP_IGNORE[op]])
				50	emit(_sre.getlower(av, flags))
				51	else:
				52	emit(OPCODES[op])
				53	emit(av)
				54	elif op is IN:
				55	if flags & SRE_FLAG_IGNORECASE:
				56	emit(OPCODES[OP_IGNORE[op]])
				57	def fixup(literal, flags=flags):
				58	return _sre.getlower(literal, flags)
				59	else:
				60	emit(OPCODES[op])
				61	fixup = _identityfunction
				62	skip = _len(code); emit(0)
				63	_compile_charset(av, flags, code, fixup)
				64	code[skip] = _len(code) - skip
				65	elif op is ANY:
				66	if flags & SRE_FLAG_DOTALL:
				67	emit(OPCODES[ANY_ALL])
				68	else:
				69	emit(OPCODES[ANY])
				70	elif op in REPEATING_CODES:
				71	if flags & SRE_FLAG_TEMPLATE:
				72	raise error, "internal: unsupported template operator"
				73	emit(OPCODES[REPEAT])
				74	skip = _len(code); emit(0)
				75	emit(av[0])
				76	emit(av[1])
				77	_compile(code, av[2], flags)
				78	emit(OPCODES[SUCCESS])
				79	code[skip] = _len(code) - skip
				80	elif _simple(av) and op is not REPEAT:
				81	if op is MAX_REPEAT:
				82	emit(OPCODES[REPEAT_ONE])
				83	else:
				84	emit(OPCODES[MIN_REPEAT_ONE])
				85	skip = _len(code); emit(0)
				86	emit(av[0])
				87	emit(av[1])
				88	_compile(code, av[2], flags)
				89	emit(OPCODES[SUCCESS])
				90	code[skip] = _len(code) - skip
				91	else:
				92	emit(OPCODES[REPEAT])
				93	skip = _len(code); emit(0)
				94	emit(av[0])
				95	emit(av[1])
				96	_compile(code, av[2], flags)
				97	code[skip] = _len(code) - skip
				98	if op is MAX_REPEAT:
				99	emit(OPCODES[MAX_UNTIL])
				100	else:
				101	emit(OPCODES[MIN_UNTIL])
				102	elif op is SUBPATTERN:
				103	if av[0]:
				104	emit(OPCODES[MARK])
				105	emit((av[0]-1)*2)
				106	# _compile_info(code, av[1], flags)
				107	_compile(code, av[1], flags)
				108	if av[0]:
				109	emit(OPCODES[MARK])
				110	emit((av[0]-1)*2+1)
				111	elif op in SUCCESS_CODES:
				112	emit(OPCODES[op])
				113	elif op in ASSERT_CODES:
				114	emit(OPCODES[op])
				115	skip = _len(code); emit(0)
				116	if av[0] >= 0:
				117	emit(0) # look ahead
				118	else:
				119	lo, hi = av[1].getwidth()
				120	if lo != hi:
				121	raise error, "look-behind requires fixed-width pattern"
				122	emit(lo) # look behind
				123	_compile(code, av[1], flags)
				124	emit(OPCODES[SUCCESS])
				125	code[skip] = _len(code) - skip
				126	elif op is CALL:
				127	emit(OPCODES[op])
				128	skip = _len(code); emit(0)
				129	_compile(code, av, flags)
				130	emit(OPCODES[SUCCESS])
				131	code[skip] = _len(code) - skip
				132	elif op is AT:
				133	emit(OPCODES[op])
				134	if flags & SRE_FLAG_MULTILINE:
				135	av = AT_MULTILINE.get(av, av)
				136	if flags & SRE_FLAG_LOCALE:
				137	av = AT_LOCALE.get(av, av)
				138	elif flags & SRE_FLAG_UNICODE:
				139	av = AT_UNICODE.get(av, av)
				140	emit(ATCODES[av])
				141	elif op is BRANCH:
				142	emit(OPCODES[op])
				143	tail = []
				144	tailappend = tail.append
				145	for av in av[1]:
				146	skip = _len(code); emit(0)
				147	# _compile_info(code, av, flags)
				148	_compile(code, av, flags)
				149	emit(OPCODES[JUMP])
				150	tailappend(_len(code)); emit(0)
				151	code[skip] = _len(code) - skip
				152	emit(0) # end of branch
				153	for tail in tail:
				154	code[tail] = _len(code) - tail
				155	elif op is CATEGORY:
				156	emit(OPCODES[op])
				157	if flags & SRE_FLAG_LOCALE:
				158	av = CH_LOCALE[av]
				159	elif flags & SRE_FLAG_UNICODE:
				160	av = CH_UNICODE[av]
				161	emit(CHCODES[av])
				162	elif op is GROUPREF:
				163	if flags & SRE_FLAG_IGNORECASE:
				164	emit(OPCODES[OP_IGNORE[op]])
				165	else:
				166	emit(OPCODES[op])
				167	emit(av-1)
				168	elif op is GROUPREF_EXISTS:
				169	emit(OPCODES[op])
				170	emit(av[0]-1)
				171	skipyes = _len(code); emit(0)
				172	_compile(code, av[1], flags)
				173	if av[2]:
				174	emit(OPCODES[JUMP])
				175	skipno = _len(code); emit(0)
				176	code[skipyes] = _len(code) - skipyes + 1
				177	_compile(code, av[2], flags)
				178	code[skipno] = _len(code) - skipno
				179	else:
				180	code[skipyes] = _len(code) - skipyes + 1
				181	else:
				182	raise ValueError, ("unsupported operand type", op)
				183
				184	def _compile_charset(charset, flags, code, fixup=None):
				185	# compile charset subprogram
				186	emit = code.append
				187	if fixup is None:
				188	fixup = _identityfunction
				189	for op, av in _optimize_charset(charset, fixup):
				190	emit(OPCODES[op])
				191	if op is NEGATE:
				192	pass
				193	elif op is LITERAL:
				194	emit(fixup(av))
				195	elif op is RANGE:
				196	emit(fixup(av[0]))
				197	emit(fixup(av[1]))
				198	elif op is CHARSET:
				199	code.extend(av)
				200	elif op is BIGCHARSET:
				201	code.extend(av)
				202	elif op is CATEGORY:
				203	if flags & SRE_FLAG_LOCALE:
				204	emit(CHCODES[CH_LOCALE[av]])
				205	elif flags & SRE_FLAG_UNICODE:
				206	emit(CHCODES[CH_UNICODE[av]])
				207	else:
				208	emit(CHCODES[av])
				209	else:
				210	raise error, "internal: unsupported set operator"
				211	emit(OPCODES[FAILURE])
				212
				213	def _optimize_charset(charset, fixup):
				214	# internal: optimize character set
				215	out = []
				216	outappend = out.append
				217	charmap = [0]*256
				218	try:
				219	for op, av in charset:
				220	if op is NEGATE:
				221	outappend((op, av))
				222	elif op is LITERAL:
				223	charmap[fixup(av)] = 1
				224	elif op is RANGE:
				225	for i in range(fixup(av[0]), fixup(av[1])+1):
				226	charmap[i] = 1
				227	elif op is CATEGORY:
				228	# XXX: could append to charmap tail
				229	return charset # cannot compress
				230	except IndexError:
				231	# character set contains unicode characters
				232	return _optimize_unicode(charset, fixup)
				233	# compress character map
				234	i = p = n = 0
				235	runs = []
				236	runsappend = runs.append
				237	for c in charmap:
				238	if c:
				239	if n == 0:
				240	p = i
				241	n = n + 1
				242	elif n:
				243	runsappend((p, n))
				244	n = 0
				245	i = i + 1
				246	if n:
				247	runsappend((p, n))
				248	if len(runs) <= 2:
				249	# use literal/range
				250	for p, n in runs:
				251	if n == 1:
				252	outappend((LITERAL, p))
				253	else:
				254	outappend((RANGE, (p, p+n-1)))
				255	if len(out) < len(charset):
				256	return out
				257	else:
				258	# use bitmap
				259	data = _mk_bitmap(charmap)
				260	outappend((CHARSET, data))
				261	return out
				262	return charset
				263
				264	def _mk_bitmap(bits):
				265	data = []
				266	dataappend = data.append
				267	if _sre.CODESIZE == 2:
				268	start = (1, 0)
				269	else:
				270	start = (1L, 0L)
				271	m, v = start
				272	for c in bits:
				273	if c:
				274	v = v + m
				275	m = m + m
				276	if m > MAXCODE:
				277	dataappend(v)
				278	m, v = start
				279	return data
				280
				281	# To represent a big charset, first a bitmap of all characters in the
				282	# set is constructed. Then, this bitmap is sliced into chunks of 256
				283	# characters, duplicate chunks are eliminated, and each chunk is
				284	# given a number. In the compiled expression, the charset is
				285	# represented by a 16-bit word sequence, consisting of one word for
				286	# the number of different chunks, a sequence of 256 bytes (128 words)
				287	# of chunk numbers indexed by their original chunk position, and a
				288	# sequence of chunks (16 words each).
				289
				290	# Compression is normally good: in a typical charset, large ranges of
				291	# Unicode will be either completely excluded (e.g. if only cyrillic
				292	# letters are to be matched), or completely included (e.g. if large
				293	# subranges of Kanji match). These ranges will be represented by
				294	# chunks of all one-bits or all zero-bits.
				295
				296	# Matching can be also done efficiently: the more significant byte of
				297	# the Unicode character is an index into the chunk number, and the
				298	# less significant byte is a bit index in the chunk (just like the
				299	# CHARSET matching).
				300
				301	# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
				302	# of the basic multilingual plane; an efficient representation
				303	# for all of UTF-16 has not yet been developed. This means,
				304	# in particular, that negated charsets cannot be represented as
				305	# bigcharsets.
				306
				307	def _optimize_unicode(charset, fixup):
				308	# problems with optimization in Jython, forget about it for now
				309	return charset
				310
				311	try:
				312	import array
				313	except ImportError:
				314	return charset
				315	charmap = [0]*65536
				316	negate = 0
				317	try:
				318	for op, av in charset:
				319	if op is NEGATE:
				320	negate = 1
				321	elif op is LITERAL:
				322	charmap[fixup(av)] = 1
				323	elif op is RANGE:
				324	for i in xrange(fixup(av[0]), fixup(av[1])+1):
				325	charmap[i] = 1
				326	elif op is CATEGORY:
				327	# XXX: could expand category
				328	return charset # cannot compress
				329	except IndexError:
				330	# non-BMP characters
				331	return charset
				332	if negate:
				333	if sys.maxunicode != 65535:
				334	# XXX: negation does not work with big charsets
				335	return charset
				336	for i in xrange(65536):
				337	charmap[i] = not charmap[i]
				338	comps = {}
				339	mapping = [0]*256
				340	block = 0
				341	data = []
				342	for i in xrange(256):
				343	chunk = tuple(charmap[i256:(i+1)256])
				344	new = comps.setdefault(chunk, block)
				345	mapping[i] = new
				346	if new == block:
				347	block = block + 1
				348	data = data + _mk_bitmap(chunk)
				349	header = [block]
				350	if _sre.CODESIZE == 2:
				351	code = 'H'
				352	else:
				353	# change this for Jython from 'I', since that will expand to
				354	# long, and cause needless complexity (or so it seems)
				355	code = 'i'
				356	# Convert block indices to byte array of 256 bytes
				357	mapping = array.array('b', mapping).tostring()
				358	# Convert byte array to word array
				359	mapping = array.array(code, mapping)
				360	assert mapping.itemsize == _sre.CODESIZE
				361	header = header + mapping.tolist()
				362	data[0:0] = header
				363	return [(BIGCHARSET, data)]
				364
				365	def _simple(av):
				366	# check if av is a "simple" operator
				367	lo, hi = av[2].getwidth()
				368	if lo == 0 and hi == MAXREPEAT:
				369	raise error, "nothing to repeat"
				370	return lo == hi == 1 and av[2][0][0] != SUBPATTERN
				371
				372	def _compile_info(code, pattern, flags):
				373	# internal: compile an info block. in the current version,
				374	# this contains min/max pattern width, and an optional literal
				375	# prefix or a character map
				376	lo, hi = pattern.getwidth()
				377	if lo == 0:
				378	return # not worth it
				379	# look for a literal prefix
				380	prefix = []
				381	prefixappend = prefix.append
				382	prefix_skip = 0
				383	charset = [] # not used
				384	charsetappend = charset.append
				385	if not (flags & SRE_FLAG_IGNORECASE):
				386	# look for literal prefix
				387	for op, av in pattern.data:
				388	if op is LITERAL:
				389	if len(prefix) == prefix_skip:
				390	prefix_skip = prefix_skip + 1
				391	prefixappend(av)
				392	elif op is SUBPATTERN and len(av[1]) == 1:
				393	op, av = av[1][0]
				394	if op is LITERAL:
				395	prefixappend(av)
				396	else:
				397	break
				398	else:
				399	break
				400	# if no prefix, look for charset prefix
				401	if not prefix and pattern.data:
				402	op, av = pattern.data[0]
				403	if op is SUBPATTERN and av[1]:
				404	op, av = av[1][0]
				405	if op is LITERAL:
				406	charsetappend((op, av))
				407	elif op is BRANCH:
				408	c = []
				409	cappend = c.append
				410	for p in av[1]:
				411	if not p:
				412	break
				413	op, av = p[0]
				414	if op is LITERAL:
				415	cappend((op, av))
				416	else:
				417	break
				418	else:
				419	charset = c
				420	elif op is BRANCH:
				421	c = []
				422	cappend = c.append
				423	for p in av[1]:
				424	if not p:
				425	break
				426	op, av = p[0]
				427	if op is LITERAL:
				428	cappend((op, av))
				429	else:
				430	break
				431	else:
				432	charset = c
				433	elif op is IN:
				434	charset = av
				435	## if prefix:
				436	## print "*** PREFIX", prefix, prefix_skip
				437	## if charset:
				438	## print "*** CHARSET", charset
				439	# add an info block
				440	emit = code.append
				441	emit(OPCODES[INFO])
				442	skip = len(code); emit(0)
				443	# literal flag
				444	mask = 0
				445	if prefix:
				446	mask = SRE_INFO_PREFIX
				447	if len(prefix) == prefix_skip == len(pattern.data):
				448	mask = mask + SRE_INFO_LITERAL
				449	elif charset:
				450	mask = mask + SRE_INFO_CHARSET
				451	emit(mask)
				452	# pattern length
				453	if lo < MAXCODE:
				454	emit(lo)
				455	else:
				456	emit(MAXCODE)
				457	prefix = prefix[:MAXCODE]
				458	if hi < MAXCODE:
				459	emit(hi)
				460	else:
				461	emit(0)
				462	# add literal prefix
				463	if prefix:
				464	emit(len(prefix)) # length
				465	emit(prefix_skip) # skip
				466	code.extend(prefix)
				467	# generate overlap table
				468	table = [-1] + ([0]*len(prefix))
				469	for i in xrange(len(prefix)):
				470	table[i+1] = table[i]+1
				471	while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
				472	table[i+1] = table[table[i+1]-1]+1
				473	code.extend(table[1:]) # don't store first entry
				474	elif charset:
				475	_compile_charset(charset, flags, code)
				476	code[skip] = len(code) - skip
				477
				478	try:
				479	unicode
				480	except NameError:
				481	STRING_TYPES = (type(""),)
				482	else:
				483	STRING_TYPES = (type(""), type(unicode("")))
				484
				485	def isstring(obj):
				486	for tp in STRING_TYPES:
				487	if isinstance(obj, tp):
				488	return 1
				489	return 0
				490
				491	def _code(p, flags):
				492
				493	flags = p.pattern.flags \| flags
				494	code = []
				495
				496	# compile info block
				497	_compile_info(code, p, flags)
				498
				499	# compile the pattern
				500	_compile(code, p.data, flags)
				501
				502	code.append(OPCODES[SUCCESS])
				503
				504	return code
				505
				506	def compile(p, flags=0):
				507	# internal: convert pattern list to internal format
				508
				509	if isstring(p):
				510	import sre_parse
				511	pattern = p
				512	p = sre_parse.parse(p, flags)
				513	else:
				514	pattern = None
				515
				516	code = _code(p, flags)
				517
				518	# print code
				519
				520	# XXX: <fl> get rid of this limitation!
				521	if p.pattern.groups > 100:
				522	raise AssertionError(
				523	"sorry, but this version only supports 100 named groups"
				524	)
				525
				526	# map in either direction
				527	groupindex = p.pattern.groupdict
				528	indexgroup = [None] * p.pattern.groups
				529	for k, i in groupindex.items():
				530	indexgroup[i] = k
				531
				532	return _sre.compile(
				533	pattern, flags \| p.pattern.flags, code,
				534	p.pattern.groups-1,
				535	groupindex, indexgroup
				536	)