blob: fd7a02cd420392bfdfef337d5c5a71bcfc90d262 [file] [log] [blame]
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001#!/usr/bin/env python
2# -*- mode: python -*-
3# $Id$
4
5import string
6import reop
7
Guido van Rossum6af4abd1997-08-13 03:25:34 +00008# reop.error and re.error should be the same, since exceptions can be
9# raised from either module.
10error = reop.error # 're error'
11
12from reop import NORMAL, CHARCLASS, REPLACEMENT
13from reop import CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET
14from reop import WORD_BOUNDARY, NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER
Guido van Rossum5ca1b711997-07-10 21:00:31 +000015
16# compilation flags
17
18IGNORECASE = I = 0x01
19
20MULTILINE = M = 0x02
21DOTALL = S = 0x04
22VERBOSE = X = 0x08
23
Guido van Rossuma4f1a781997-07-17 22:38:10 +000024repetition_operators = ['*', '*?', '+', '+?', '?', '??', '{n}', '{n}?',
25 '{n,}', '{n,}?', '{n,m}', '{n,m}?']
Guido van Rossum5ca1b711997-07-10 21:00:31 +000026
27#
28#
29#
30
Guido van Rossum09bcfd61997-07-15 15:38:20 +000031def valid_identifier(id):
32 if len(id) == 0:
33 return 0
Guido van Rossuma4f1a781997-07-17 22:38:10 +000034 if (not reop.syntax_table[id[0]] & reop.word) or \
35 (reop.syntax_table[id[0]] & reop.digit):
Guido van Rossum09bcfd61997-07-15 15:38:20 +000036 return 0
37 for char in id[1:]:
Guido van Rossuma4f1a781997-07-17 22:38:10 +000038 if not reop.syntax_table[char] & reop.word:
Guido van Rossum09bcfd61997-07-15 15:38:20 +000039 return 0
40 return 1
41
42#
43#
44#
45
Guido van Rossum26d80e61997-07-15 18:59:04 +000046_cache = {}
47_MAXCACHE = 20
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000048
Guido van Rossum9e18ec71997-07-17 22:39:13 +000049def _cachecompile(pattern, flags=0):
Guido van Rossum26d80e61997-07-15 18:59:04 +000050 key = (pattern, flags)
51 try:
52 return _cache[key]
53 except KeyError:
54 pass
55 value = compile(pattern, flags)
56 if len(_cache) >= _MAXCACHE:
57 _cache.clear()
58 _cache[key] = value
59 return value
60
Guido van Rossum5ca1b711997-07-10 21:00:31 +000061def match(pattern, string, flags=0):
Guido van Rossum26d80e61997-07-15 18:59:04 +000062 return _cachecompile(pattern, flags).match(string)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000063
Guido van Rossum5ca1b711997-07-10 21:00:31 +000064def search(pattern, string, flags=0):
Guido van Rossum26d80e61997-07-15 18:59:04 +000065 return _cachecompile(pattern, flags).search(string)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000066
Guido van Rossum5ca1b711997-07-10 21:00:31 +000067def sub(pattern, repl, string, count=0):
Guido van Rossum9e18ec71997-07-17 22:39:13 +000068 if type(pattern) == type(''):
69 pattern = _cachecompile(pattern)
70 return pattern.sub(repl, string, count)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000071
72def subn(pattern, repl, string, count=0):
Guido van Rossum9e18ec71997-07-17 22:39:13 +000073 if type(pattern) == type(''):
74 pattern = _cachecompile(pattern)
75 return pattern.subn(repl, string, count)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +000076
Guido van Rossum8a9a4a21997-07-11 20:48:25 +000077def split(pattern, string, maxsplit=0):
Guido van Rossum9e18ec71997-07-17 22:39:13 +000078 if type(pattern) == type(''):
79 pattern = _cachecompile(pattern)
80 return pattern.split(string, maxsplit)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000081
82#
83#
84#
85
Guido van Rossum71fa97c1997-07-18 04:26:03 +000086def _expand(m, repl):
87 results = []
88 index = 0
89 size = len(repl)
90 while index < size:
91 found = string.find(repl, '\\', index)
92 if found < 0:
93 results.append(repl[index:])
94 break
95 if found > index:
96 results.append(repl[index:found])
97 escape_type, value, index = expand_escape(repl, found+1, REPLACEMENT)
98 if escape_type == CHAR:
99 results.append(value)
100 elif escape_type == MEMORY_REFERENCE:
101 r = m.group(value)
102 if r is None:
103 raise error, ('group "' + str(value) + '" did not contribute '
104 'to the match')
105 results.append(m.group(value))
106 else:
107 raise error, "bad escape in replacement"
108 return string.join(results, '')
109
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000110class RegexObject:
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000111 def __init__(self, pattern, flags, code, num_regs, groupindex):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000112 self.code = code
113 self.num_regs = num_regs
114 self.flags = flags
115 self.pattern = pattern
116 self.groupindex = groupindex
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000117 self.fastmap = build_fastmap(code)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000118
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000119 if code[0].name == 'bol':
120 self.anchor = 1
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000121
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000122 elif code[0].name == 'begbuf':
123 self.anchor = 2
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000124
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000125 else:
126 self.anchor = 0
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000127
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000128 self.buffer = assemble(code)
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000129 def search(self, string, pos=0):
130 regs = reop.search(self.buffer,
131 self.num_regs,
132 self.flags,
133 self.fastmap.can_be_null,
134 self.fastmap.fastmap(),
135 self.anchor,
136 string,
137 pos)
138 if regs is None:
139 return None
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000140
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000141 return MatchObject(self,
142 string,
143 pos,
144 regs)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000145
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000146 def match(self, string, pos=0):
147 regs = reop.match(self.buffer,
148 self.num_regs,
149 self.flags,
150 self.fastmap.can_be_null,
151 self.fastmap.fastmap(),
152 self.anchor,
153 string,
154 pos)
155 if regs is None:
156 return None
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000157
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000158 return MatchObject(self,
159 string,
160 pos,
161 regs)
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000162
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000163 def sub(self, repl, string, count=0):
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000164 return self.subn(repl, string, count)[0]
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000165
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000166 def subn(self, repl, source, count=0):
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000167 if count < 0:
168 raise ValueError, "negative substibution count"
169 if count == 0:
170 import sys
171 count = sys.maxint
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000172 if type(repl) == type(''):
173 if '\\' in repl:
174 repl = lambda m, r=repl: _expand(m, r)
175 else:
176 repl = lambda m, r=repl: r
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000177 n = 0 # Number of matches
178 pos = 0 # Where to start searching
179 lastmatch = -1 # End of last match
180 results = [] # Substrings making up the result
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000181 end = len(source)
182 while n < count and pos <= end:
183 m = self.search(source, pos)
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000184 if not m:
185 break
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000186 i, j = m.span(0)
187 if i == j == lastmatch:
188 # Empty match adjacent to previous match
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000189 pos = pos + 1
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000190 results.append(source[lastmatch:pos])
191 continue
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000192 if pos < i:
193 results.append(source[pos:i])
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000194 results.append(repl(m))
195 pos = lastmatch = j
196 if i == j:
197 # Last match was empty; don't try here again
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000198 pos = pos + 1
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000199 results.append(source[lastmatch:pos])
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000200 n = n + 1
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000201 results.append(source[pos:])
202 return (string.join(results, ''), n)
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000203
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000204 def split(self, source, maxsplit=0):
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000205 if maxsplit < 0:
206 raise error, "negative split count"
207 if maxsplit == 0:
208 import sys
209 maxsplit = sys.maxint
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000210 n = 0
211 pos = 0
212 lastmatch = 0
213 results = []
214 end = len(source)
215 while n < maxsplit:
216 m = self.search(source, pos)
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000217 if not m:
218 break
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000219 i, j = m.span(0)
220 if i == j:
221 # Empty match
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000222 if pos >= end:
223 break
Guido van Rossum9e18ec71997-07-17 22:39:13 +0000224 pos = pos+1
225 continue
226 results.append(source[lastmatch:i])
227 g = m.group()
228 if g:
229 results[len(results):] = list(g)
230 pos = lastmatch = j
231 results.append(source[lastmatch:])
232 return results
233
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000234class MatchObject:
235 def __init__(self, re, string, pos, regs):
236 self.re = re
237 self.string = string
238 self.pos = pos
239 self.regs = regs
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000240
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000241 def start(self, g):
242 if type(g) == type(''):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000243 try:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000244 g = self.re.groupindex[g]
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000245 except (KeyError, TypeError):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000246 raise IndexError, ('group "' + g + '" is undefined')
247 return self.regs[g][0]
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000248
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000249 def end(self, g):
250 if type(g) == type(''):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000251 try:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000252 g = self.re.groupindex[g]
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000253 except (KeyError, TypeError):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000254 raise IndexError, ('group "' + g + '" is undefined')
255 return self.regs[g][1]
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000256
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000257 def span(self, g):
258 if type(g) == type(''):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000259 try:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000260 g = self.re.groupindex[g]
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000261 except (KeyError, TypeError):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000262 raise IndexError, ('group "' + g + '" is undefined')
263 return self.regs[g]
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000264
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000265 def group(self, *groups):
266 if len(groups) == 0:
267 groups = range(1, self.re.num_regs)
Guido van Rossum53109751997-07-15 15:40:29 +0000268 use_all = 1
269 else:
270 use_all = 0
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000271 result = []
272 for g in groups:
273 if type(g) == type(''):
274 try:
275 g = self.re.groupindex[g]
276 except (KeyError, TypeError):
277 raise IndexError, ('group "' + g + '" is undefined')
Guido van Rossum04a1d741997-07-15 14:38:13 +0000278 if (self.regs[g][0] == -1) or (self.regs[g][1] == -1):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000279 result.append(None)
280 else:
281 result.append(self.string[self.regs[g][0]:self.regs[g][1]])
Guido van Rossum53109751997-07-15 15:40:29 +0000282 if use_all or len(result) > 1:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000283 return tuple(result)
284 elif len(result) == 1:
285 return result[0]
286 else:
287 return ()
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000288
289#
290# A set of classes to make assembly a bit easier, if a bit verbose.
291#
292
293class Instruction:
294 def __init__(self, opcode, size=1):
295 self.opcode = opcode
296 self.size = size
297 def assemble(self, position, labels):
298 return self.opcode
299 def __repr__(self):
300 return '%-15s' % (self.name)
301
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000302class End(Instruction):
303 name = 'end'
304 def __init__(self):
305 Instruction.__init__(self, chr(0))
306
307class Bol(Instruction):
308 name = 'bol'
309 def __init__(self):
310 self.name = 'bol'
311 Instruction.__init__(self, chr(1))
312
313class Eol(Instruction):
314 name = 'eol'
315 def __init__(self):
316 Instruction.__init__(self, chr(2))
317
318class Set(Instruction):
319 name = 'set'
320 def __init__(self, set):
321 self.set = set
322 Instruction.__init__(self, chr(3), 33)
Guido van Rossum63e18191997-07-11 11:08:38 +0000323 def assemble(self, position, labels):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000324 result = self.opcode
325 temp = 0
326 for i, c in map(lambda x: (x, chr(x)), range(256)):
Guido van Rossum63e18191997-07-11 11:08:38 +0000327 if c in self.set:
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000328 temp = temp | (1 << (i & 7))
329 if (i % 8) == 7:
330 result = result + chr(temp)
331 temp = 0
332 return result
333 def __repr__(self):
334 result = '%-15s' % (self.name)
335 self.set.sort()
336 for char in self.set:
Guido van Rossum63e18191997-07-11 11:08:38 +0000337 result = result + char
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000338 return result
339
340class Exact(Instruction):
341 name = 'exact'
342 def __init__(self, char):
343 self.char = char
344 Instruction.__init__(self, chr(4), 2)
345 def assemble(self, position, labels):
346 return self.opcode + self.char
347 def __repr__(self):
348 return '%-15s %s' % (self.name, `self.char`)
349
350class AnyChar(Instruction):
351 name = 'anychar'
352 def __init__(self):
353 Instruction.__init__(self, chr(5))
354 def assemble(self, position, labels):
355 return self.opcode
356
357class MemoryInstruction(Instruction):
358 def __init__(self, opcode, register):
359 self.register = register
360 Instruction.__init__(self, opcode, 2)
361 def assemble(self, position, labels):
362 return self.opcode + chr(self.register)
363 def __repr__(self):
364 return '%-15s %i' % (self.name, self.register)
365
366class StartMemory(MemoryInstruction):
367 name = 'start_memory'
368 def __init__(self, register):
369 MemoryInstruction.__init__(self, chr(6), register)
370
371class EndMemory(MemoryInstruction):
372 name = 'end_memory'
373 def __init__(self, register):
374 MemoryInstruction.__init__(self, chr(7), register)
375
376class MatchMemory(MemoryInstruction):
377 name = 'match_memory'
378 def __init__(self, register):
379 MemoryInstruction.__init__(self, chr(8), register)
380
381class JumpInstruction(Instruction):
382 def __init__(self, opcode, label):
383 self.label = label
384 Instruction.__init__(self, opcode, 3)
385 def compute_offset(self, start, dest):
386 return dest - (start + 3)
387 def pack_offset(self, offset):
388 if offset > 32767:
389 raise error, 'offset out of range (pos)'
390 elif offset < -32768:
391 raise error, 'offset out of range (neg)'
392 elif offset < 0:
393 offset = offset + 65536
394 return chr(offset & 0xff) + chr((offset >> 8) & 0xff)
395 def assemble(self, position, labels):
396 return self.opcode + \
397 self.pack_offset(self.compute_offset(position,
398 labels[self.label]))
399 def __repr__(self):
400 return '%-15s %i' % (self.name, self.label)
401
402class Jump(JumpInstruction):
403 name = 'jump'
404 def __init__(self, label):
405 JumpInstruction.__init__(self, chr(9), label)
406
407class StarJump(JumpInstruction):
408 name = 'star_jump'
409 def __init__(self, label):
410 JumpInstruction.__init__(self, chr(10), label)
411
412class FailureJump(JumpInstruction):
413 name = 'failure_jump'
414 def __init__(self, label):
415 JumpInstruction.__init__(self, chr(11), label)
416
417class UpdateFailureJump(JumpInstruction):
418 name = 'update_failure_jump'
419 def __init__(self, label):
420 JumpInstruction.__init__(self, chr(12), label)
421
422class DummyFailureJump(JumpInstruction):
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000423 name = 'dummy_failure_jump'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000424 def __init__(self, label):
425 JumpInstruction.__init__(self, chr(13), label)
426
427class BegBuf(Instruction):
428 name = 'begbuf'
429 def __init__(self):
430 Instruction.__init__(self, chr(14))
431
432class EndBuf(Instruction):
433 name = 'endbuf'
434 def __init__(self):
435 Instruction.__init__(self, chr(15))
436
437class WordBeg(Instruction):
438 name = 'wordbeg'
439 def __init__(self):
440 Instruction.__init__(self, chr(16))
441
442class WordEnd(Instruction):
443 name = 'wordend'
444 def __init__(self):
445 Instruction.__init__(self, chr(17))
446
447class WordBound(Instruction):
448 name = 'wordbound'
449 def __init__(self):
450 Instruction.__init__(self, chr(18))
451
452class NotWordBound(Instruction):
453 name = 'notwordbound'
454 def __init__(self):
455 Instruction.__init__(self, chr(18))
456
457class SyntaxSpec(Instruction):
458 name = 'syntaxspec'
459 def __init__(self, syntax):
460 self.syntax = syntax
461 Instruction.__init__(self, chr(20), 2)
462 def assemble(self, postition, labels):
463 return self.opcode + chr(self.syntax)
464
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000465class NotSyntaxSpec(Instruction):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000466 name = 'notsyntaxspec'
467 def __init__(self, syntax):
468 self.syntax = syntax
469 Instruction.__init__(self, chr(21), 2)
470 def assemble(self, postition, labels):
471 return self.opcode + chr(self.syntax)
472
473class Label(Instruction):
474 name = 'label'
475 def __init__(self, label):
476 self.label = label
477 Instruction.__init__(self, '', 0)
478 def __repr__(self):
479 return '%-15s %i' % (self.name, self.label)
480
481class OpenParen(Instruction):
482 name = '('
483 def __init__(self, register):
484 self.register = register
485 Instruction.__init__(self, '', 0)
Guido van Rossum5d6de251997-07-11 21:10:17 +0000486 def assemble(self, position, labels):
487 raise error, 'unmatched open parenthesis'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000488
489class Alternation(Instruction):
490 name = '|'
491 def __init__(self):
492 Instruction.__init__(self, '', 0)
Guido van Rossum5d6de251997-07-11 21:10:17 +0000493 def assemble(self, position, labels):
494 raise error, 'an alternation was not taken care of'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000495
496#
497#
498#
499
500def assemble(instructions):
501 labels = {}
502 position = 0
503 pass1 = []
504 for instruction in instructions:
505 if instruction.name == 'label':
506 labels[instruction.label] = position
507 else:
508 pass1.append((position, instruction))
509 position = position + instruction.size
510 pass2 = ''
511 for position, instruction in pass1:
512 pass2 = pass2 + instruction.assemble(position, labels)
513 return pass2
514
515#
516#
517#
518
519def escape(pattern):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000520 result = []
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000521 for char in pattern:
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000522 if not reop.syntax_table[char] & reop.word:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000523 result.append('\\')
524 result.append(char)
525 return string.join(result, '')
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000526
527#
528#
529#
530
531def registers_used(instructions):
532 result = []
533 for instruction in instructions:
534 if (instruction.name in ['set_memory', 'end_memory']) and \
535 (instruction.register not in result):
536 result.append(instruction.register)
537 return result
538
539#
540#
541#
542
543class Fastmap:
544 def __init__(self):
545 self.map = ['\000']*256
546 self.can_be_null = 0
547 def add(self, char):
548 self.map[ord(char)] = '\001'
549 def fastmap(self):
550 return string.join(self.map, '')
551 def __getitem__(self, char):
552 return ord(self.map[ord(char)])
553 def __repr__(self):
554 self.map.sort()
555 return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')'
556
557#
558#
559#
560
561def find_label(code, label):
562 line = 0
563 for instruction in code:
564 if (instruction.name == 'label') and (instruction.label == label):
565 return line + 1
566 line = line + 1
567
568def build_fastmap_aux(code, pos, visited, fastmap):
569 if visited[pos]:
570 return
571 while 1:
572 instruction = code[pos]
573 visited[pos] = 1
574 pos = pos + 1
575 if instruction.name == 'end':
576 fastmap.can_be_null = 1
577 return
578 elif instruction.name == 'syntaxspec':
579 for char in map(chr, range(256)):
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000580 if reop.syntax_table[char] & instruction.syntax:
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000581 fastmap.add(char)
582 return
583 elif instruction.name == 'notsyntaxspec':
584 for char in map(chr, range(256)):
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000585 if not reop.syntax_table[char] & instruction.syntax:
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000586 fastmap.add(char)
587 return
588 elif instruction.name == 'eol':
589 fastmap.add('\n')
590 if fastmap.can_be_null == 0:
591 fastmap.can_be_null = 2
592 return
593 elif instruction.name == 'set':
594 for char in instruction.set:
595 fastmap.add(char)
596 return
597 elif instruction.name == 'exact':
598 fastmap.add(instruction.char)
599 elif instruction.name == 'anychar':
600 for char in map(chr, range(256)):
601 if char != '\n':
602 fastmap.add(char)
603 return
604 elif instruction.name == 'match_memory':
605 for char in map(chr, range(256)):
606 fastmap.add(char)
607 fastmap.can_be_null = 1
608 return
609 elif instruction.name in ['jump', 'dummy_failure_jump', \
610 'update_failure_jump', 'star_jump']:
611 pos = find_label(code, instruction.label)
612 if visited[pos]:
613 return
614 visited[pos] = 1
615 elif instruction.name == 'failure_jump':
616 build_fastmap_aux(code,
617 find_label(code, instruction.label),
618 visited,
619 fastmap)
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000620
621def build_fastmap(code, pos=0):
622 visited = [0] * len(code)
623 fastmap = Fastmap()
624 build_fastmap_aux(code, pos, visited, fastmap)
625 return fastmap
626
627#
628#
629#
630
Guido van Rossum6af4abd1997-08-13 03:25:34 +0000631#[NORMAL, CHARCLASS, REPLACEMENT] = range(3)
632#[CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY,
633# NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(9)
Guido van Rossum04a1d741997-07-15 14:38:13 +0000634
635def expand_escape(pattern, index, context=NORMAL):
636 if index >= len(pattern):
637 raise error, 'escape ends too soon'
638
639 elif pattern[index] == 't':
640 return CHAR, chr(9), index + 1
641
642 elif pattern[index] == 'n':
643 return CHAR, chr(10), index + 1
644
Guido van Rossum6af4abd1997-08-13 03:25:34 +0000645 elif pattern[index] == 'v':
646 return CHAR, chr(11), index + 1
647
Guido van Rossum04a1d741997-07-15 14:38:13 +0000648 elif pattern[index] == 'r':
649 return CHAR, chr(13), index + 1
650
651 elif pattern[index] == 'f':
652 return CHAR, chr(12), index + 1
653
654 elif pattern[index] == 'a':
655 return CHAR, chr(7), index + 1
656
Guido van Rossum04a1d741997-07-15 14:38:13 +0000657 elif pattern[index] == 'x':
658 # CAUTION: this is the Python rule, not the Perl rule!
Guido van Rossum6af4abd1997-08-13 03:25:34 +0000659 end = index + 1 # Skip over the 'x' character
Guido van Rossum04a1d741997-07-15 14:38:13 +0000660 while (end < len(pattern)) and (pattern[end] in string.hexdigits):
661 end = end + 1
662 if end == index:
663 raise error, "\\x must be followed by hex digit(s)"
664 # let Python evaluate it, so we don't incorrectly 2nd-guess
665 # what it's doing (and Python in turn passes it on to sscanf,
666 # so that *it* doesn't incorrectly 2nd-guess what C does!)
Guido van Rossum6af4abd1997-08-13 03:25:34 +0000667 char = eval ('"' + pattern[index-1:end] + '"')
Guido van Rossum04a1d741997-07-15 14:38:13 +0000668 assert len(char) == 1
669 return CHAR, char, end
670
671 elif pattern[index] == 'b':
672 if context != NORMAL:
673 return CHAR, chr(8), index + 1
674 else:
675 return WORD_BOUNDARY, '', index + 1
676
677 elif pattern[index] == 'B':
678 if context != NORMAL:
679 return CHAR, 'B', index + 1
680 else:
681 return NOT_WORD_BOUNDARY, '', index + 1
682
683 elif pattern[index] == 'A':
684 if context != NORMAL:
685 return CHAR, 'A', index + 1
686 else:
687 return BEGINNING_OF_BUFFER, '', index + 1
688
689 elif pattern[index] == 'Z':
690 if context != NORMAL:
Guido van Rossum6af4abd1997-08-13 03:25:34 +0000691 return CHAR, 'Z', index + 1
Guido van Rossum04a1d741997-07-15 14:38:13 +0000692 else:
693 return END_OF_BUFFER, '', index + 1
694
695 elif pattern[index] in 'GluLUQE':
Guido van Rossum6af4abd1997-08-13 03:25:34 +0000696 raise error, ('\\' + pattern[index] + ' is not allowed')
Guido van Rossum04a1d741997-07-15 14:38:13 +0000697
698 elif pattern[index] == 'w':
699 if context == NORMAL:
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000700 return SYNTAX, reop.word, index + 1
Guido van Rossum04a1d741997-07-15 14:38:13 +0000701 elif context == CHARCLASS:
702 set = []
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000703 for char in reop.syntax_table.keys():
704 if reop.syntax_table[char] & reop.word:
Guido van Rossum04a1d741997-07-15 14:38:13 +0000705 set.append(char)
706 return SET, set, index + 1
707 else:
708 return CHAR, 'w', index + 1
709
710 elif pattern[index] == 'W':
711 if context == NORMAL:
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000712 return NOT_SYNTAX, reop.word, index + 1
Guido van Rossum04a1d741997-07-15 14:38:13 +0000713 elif context == CHARCLASS:
714 set = []
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000715 for char in reop.syntax_table.keys():
716 if not reop.syntax_table[char] & reop.word:
Guido van Rossum04a1d741997-07-15 14:38:13 +0000717 set.append(char)
718 return SET, set, index + 1
719 else:
720 return CHAR, 'W', index + 1
721
722 elif pattern[index] == 's':
723 if context == NORMAL:
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000724 return SYNTAX, reop.whitespace, index + 1
Guido van Rossum04a1d741997-07-15 14:38:13 +0000725 elif context == CHARCLASS:
726 set = []
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000727 for char in reop.syntax_table.keys():
728 if reop.syntax_table[char] & reop.whitespace:
Guido van Rossum04a1d741997-07-15 14:38:13 +0000729 set.append(char)
730 return SET, set, index + 1
731 else:
732 return CHAR, 's', index + 1
733
734 elif pattern[index] == 'S':
735 if context == NORMAL:
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000736 return NOT_SYNTAX, reop.whitespace, index + 1
Guido van Rossum04a1d741997-07-15 14:38:13 +0000737 elif context == CHARCLASS:
738 set = []
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000739 for char in reop.syntax_table.keys():
740 if not reop.syntax_table[char] & reop.word:
Guido van Rossum04a1d741997-07-15 14:38:13 +0000741 set.append(char)
742 return SET, set, index + 1
743 else:
744 return CHAR, 'S', index + 1
745
746 elif pattern[index] == 'd':
747 if context == NORMAL:
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000748 return SYNTAX, reop.digit, index + 1
Guido van Rossum04a1d741997-07-15 14:38:13 +0000749 elif context == CHARCLASS:
750 set = []
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000751 for char in reop.syntax_table.keys():
752 if reop.syntax_table[char] & reop.digit:
Guido van Rossum04a1d741997-07-15 14:38:13 +0000753 set.append(char)
754 return SET, set, index + 1
755 else:
756 return CHAR, 'd', index + 1
757
758 elif pattern[index] == 'D':
759 if context == NORMAL:
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000760 return NOT_SYNTAX, reop.digit, index + 1
Guido van Rossum04a1d741997-07-15 14:38:13 +0000761 elif context == CHARCLASS:
762 set = []
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000763 for char in reop.syntax_table.keys():
764 if not reop.syntax_table[char] & reop.digit:
Guido van Rossum04a1d741997-07-15 14:38:13 +0000765 set.append(char)
766 return SET, set, index + 1
767 else:
768 return CHAR, 'D', index + 1
769
770 elif pattern[index] in '0123456789':
Guido van Rossum04a1d741997-07-15 14:38:13 +0000771
Guido van Rossum9f845ec1997-07-15 18:11:42 +0000772 if pattern[index] == '0':
773 if (index + 1 < len(pattern)) and \
774 (pattern[index + 1] in string.octdigits):
775 if (index + 2 < len(pattern)) and \
776 (pattern[index + 2] in string.octdigits):
777 value = string.atoi(pattern[index:index + 3], 8)
778 index = index + 3
779
780 else:
781 value = string.atoi(pattern[index:index + 2], 8)
782 index = index + 2
783
784 else:
785 value = 0
786 index = index + 1
787
Guido van Rossum04a1d741997-07-15 14:38:13 +0000788 if value > 255:
Guido van Rossum9f845ec1997-07-15 18:11:42 +0000789 raise error, 'octal value out of range'
Guido van Rossum04a1d741997-07-15 14:38:13 +0000790
Guido van Rossum9f845ec1997-07-15 18:11:42 +0000791 return CHAR, chr(value), index
792
Guido van Rossum04a1d741997-07-15 14:38:13 +0000793 else:
Guido van Rossum9f845ec1997-07-15 18:11:42 +0000794 if (index + 1 < len(pattern)) and \
795 (pattern[index + 1] in string.digits):
796 if (index + 2 < len(pattern)) and \
797 (pattern[index + 2] in string.octdigits) and \
798 (pattern[index + 1] in string.octdigits) and \
799 (pattern[index] in string.octdigits):
800 value = string.atoi(pattern[index:index + 3], 8)
801 if value > 255:
802 raise error, 'octal value out of range'
803
804 return CHAR, chr(value), index + 3
805
806 else:
807 value = string.atoi(pattern[index:index + 2])
808 if (value < 1) or (value > 99):
809 raise error, 'memory reference out of range'
810
811 if context == CHARCLASS:
812 raise error, ('cannot reference a register from '
813 'inside a character class')
814 return MEMORY_REFERENCE, value, index + 2
815
816 else:
817 if context == CHARCLASS:
818 raise error, ('cannot reference a register from '
819 'inside a character class')
820
821 value = string.atoi(pattern[index])
822 return MEMORY_REFERENCE, value, index + 1
823
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000824 elif pattern[index] == 'g':
825 if context != REPLACEMENT:
826 return CHAR, 'g', index + 1
Guido van Rossum04a1d741997-07-15 14:38:13 +0000827
Guido van Rossum71fa97c1997-07-18 04:26:03 +0000828 index = index + 1
829 if index >= len(pattern):
830 raise error, 'unfinished symbolic reference'
831 if pattern[index] != '<':
832 raise error, 'missing < in symbolic reference'
833
834 index = index + 1
835 end = string.find(pattern, '>', index)
836 if end == -1:
837 raise error, 'unfinished symbolic reference'
838 value = pattern[index:end]
839 if not valid_identifier(value):
840 raise error, 'illegal symbolic reference'
841 return MEMORY_REFERENCE, value, end + 1
842
Guido van Rossum04a1d741997-07-15 14:38:13 +0000843 else:
844 return CHAR, pattern[index], index + 1
845
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000846def compile(pattern, flags=0):
847 stack = []
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000848 label = 0
849 register = 1
850 groupindex = {}
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000851 lastop = ''
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000852
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000853 # look for embedded pattern modifiers at the beginning of the pattern
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000854
855 index = 0
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000856
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000857 if len(pattern) >= 3 and \
858 (pattern[:2] == '(?') and \
859 (pattern[2] in 'iImMsSxX'):
860 index = 2
861 while (index < len(pattern)) and (pattern[index] != ')'):
862 if pattern[index] in 'iI':
863 flags = flags | IGNORECASE
864 elif pattern[index] in 'mM':
865 flags = flags | MULTILINE
866 elif pattern[index] in 'sS':
867 flags = flags | DOTALL
868 elif pattern[index] in 'xX':
869 flags = flags | VERBOSE
870 else:
871 raise error, 'unknown modifier'
872 index = index + 1
873 index = index + 1
874
875 # compile the rest of the pattern
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +0000876
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000877 while (index < len(pattern)):
878 char = pattern[index]
879 index = index + 1
880 if char == '\\':
Guido van Rossum04a1d741997-07-15 14:38:13 +0000881 escape_type, value, index = expand_escape(pattern, index)
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000882
Guido van Rossum04a1d741997-07-15 14:38:13 +0000883 if escape_type == CHAR:
884 stack.append([Exact(value)])
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000885 lastop = '\\' + value
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000886
Guido van Rossum04a1d741997-07-15 14:38:13 +0000887 elif escape_type == MEMORY_REFERENCE:
888 if value >= register:
889 raise error, ('cannot reference a register '
890 'not yet used')
891 stack.append([MatchMemory(value)])
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000892 lastop = '\\1'
Guido van Rossum04a1d741997-07-15 14:38:13 +0000893
894 elif escape_type == BEGINNING_OF_BUFFER:
895 stack.append([BegBuf()])
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000896 lastop = '\\A'
Guido van Rossum04a1d741997-07-15 14:38:13 +0000897
898 elif escape_type == END_OF_BUFFER:
899 stack.append([EndBuf()])
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000900 lastop = '\\Z'
Guido van Rossum04a1d741997-07-15 14:38:13 +0000901
902 elif escape_type == WORD_BOUNDARY:
903 stack.append([WordBound()])
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000904 lastop = '\\b'
Guido van Rossum04a1d741997-07-15 14:38:13 +0000905
906 elif escape_type == NOT_WORD_BOUNDARY:
907 stack.append([NotWordBound()])
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000908 lastop = '\\B'
Guido van Rossum04a1d741997-07-15 14:38:13 +0000909
910 elif escape_type == SYNTAX:
911 stack.append([SyntaxSpec(value)])
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000912 if value == reop.word:
913 lastop = '\\w'
914 elif value == reop.whitespace:
915 lastop = '\\s'
916 elif value == reop.digit:
917 lastop = '\\d'
918 else:
919 lastop = '\\?'
920
Guido van Rossum04a1d741997-07-15 14:38:13 +0000921 elif escape_type == NOT_SYNTAX:
922 stack.append([NotSyntaxSpec(value)])
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000923 if value == reop.word:
924 lastop = '\\W'
925 elif value == reop.whitespace:
926 lastop = '\\S'
927 elif value == reop.digit:
928 lastop = '\\D'
929 else:
930 lastop = '\\?'
Guido van Rossum04a1d741997-07-15 14:38:13 +0000931
932 elif escape_type == SET:
933 raise error, 'cannot use set escape type here'
934
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000935 else:
Guido van Rossum04a1d741997-07-15 14:38:13 +0000936 raise error, 'unknown escape type'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000937
938 elif char == '|':
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000939 expr = []
Guido van Rossum04a1d741997-07-15 14:38:13 +0000940
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000941 while (len(stack) != 0) and \
942 (stack[-1][0].name != '(') and \
943 (stack[-1][0].name != '|'):
944 expr = stack[-1] + expr
945 del stack[-1]
946 stack.append([FailureJump(label)] + \
947 expr + \
948 [Jump(-1),
949 Label(label)])
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000950 stack.append([Alternation()])
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000951 label = label + 1
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000952 lastop = '|'
953
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000954 elif char == '(':
955 if index >= len(pattern):
956 raise error, 'no matching close paren'
957
958 elif pattern[index] == '?':
959 # Perl style (?...) extensions
960 index = index + 1
961 if index >= len(pattern):
962 raise error, 'extension ends prematurely'
963
964 elif pattern[index] == 'P':
965 # Python extensions
966 index = index + 1
967 if index >= len(pattern):
968 raise error, 'extension ends prematurely'
969
970 elif pattern[index] == '<':
Guido van Rossum09bcfd61997-07-15 15:38:20 +0000971 # Handle Python symbolic group names (?P<...>...)
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000972 index = index + 1
973 end = string.find(pattern, '>', index)
974 if end == -1:
975 raise error, 'no end to symbolic group name'
976 name = pattern[index:end]
Guido van Rossum09bcfd61997-07-15 15:38:20 +0000977 if not valid_identifier(name):
978 raise error, ('symbolic group name must be a '
979 'valid identifier')
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000980 index = end + 1
981 groupindex[name] = register
982 stack.append([OpenParen(register)])
983 register = register + 1
Guido van Rossuma4f1a781997-07-17 22:38:10 +0000984 lastop = '('
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000985
986 elif pattern[index] == '=':
987 # backreference to symbolic group name
988 if index >= len(pattern):
989 raise error, '(?P= at the end of the pattern'
990 start = index + 1
991 end = string.find(pattern, ')', start)
992 if end == -1:
993 raise error, 'no ) to end symbolic group name'
994 name = pattern[start:end]
Guido van Rossum09bcfd61997-07-15 15:38:20 +0000995 if name not in groupindex.keys():
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000996 raise error, ('symbolic group name ' + name + \
997 ' has not been used yet')
998 stack.append([MatchMemory(groupindex[name])])
999 index = end + 1
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001000 lastop = '(?P=)'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001001
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001002 else:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +00001003 raise error, ('unknown Python extension: ' + \
1004 pattern[index])
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001005
1006 elif pattern[index] == ':':
1007 # grouping, but no registers
1008 index = index + 1
Guido van Rossum8a9a4a21997-07-11 20:48:25 +00001009 stack.append([OpenParen(-1)])
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001010 lastop = '('
1011
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001012 elif pattern[index] == '#':
1013 # comment
1014 index = index + 1
1015 end = string.find(pattern, ')', index)
1016 if end == -1:
1017 raise error, 'no end to comment'
1018 index = end + 1
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001019 # do not change lastop
1020
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001021 elif pattern[index] == '=':
1022 raise error, ('zero-width positive lookahead '
1023 'assertion is unsupported')
1024
1025 elif pattern[index] == '!':
1026 raise error, ('zero-width negative lookahead '
1027 'assertion is unsupported')
1028
1029 elif pattern[index] in 'iImMsSxX':
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001030 raise error, ('embedded pattern modifiers are only '
1031 'allowed at the beginning of the pattern')
1032
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001033 else:
1034 raise error, 'unknown extension'
1035
1036 else:
1037 stack.append([OpenParen(register)])
1038 register = register + 1
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001039 lastop = '('
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001040
1041 elif char == ')':
1042 # make one expression out of everything on the stack up to
1043 # the marker left by the last parenthesis
1044 expr = []
1045 while (len(stack) > 0) and (stack[-1][0].name != '('):
1046 expr = stack[-1] + expr
1047 del stack[-1]
1048
1049 if len(stack) == 0:
1050 raise error, 'too many close parens'
Guido van Rossum04a1d741997-07-15 14:38:13 +00001051
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001052 # remove markers left by alternation
1053 expr = filter(lambda x: x.name != '|', expr)
1054
1055 # clean up jumps inserted by alternation
1056 need_label = 0
1057 for i in range(len(expr)):
1058 if (expr[i].name == 'jump') and (expr[i].label == -1):
Guido van Rossum04a1d741997-07-15 14:38:13 +00001059 expr[i] = Jump(label)
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001060 need_label = 1
1061 if need_label:
1062 expr.append(Label(label))
1063 label = label + 1
1064
Guido van Rossum63e18191997-07-11 11:08:38 +00001065 if stack[-1][0].register > 0:
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001066 expr = [StartMemory(stack[-1][0].register)] + \
1067 expr + \
1068 [EndMemory(stack[-1][0].register)]
1069 del stack[-1]
1070 stack.append(expr)
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001071 lastop = ')'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001072
1073 elif char == '{':
1074 if len(stack) == 0:
1075 raise error, 'no expression to repeat'
1076 end = string.find(pattern, '}', index)
1077 if end == -1:
1078 raise error, ('no close curly bracket to match'
1079 ' open curly bracket')
1080
1081 fields = map(string.strip,
1082 string.split(pattern[index:end], ','))
1083 index = end + 1
1084
1085 minimal = 0
1086 if (index < len(pattern)) and (pattern[index] == '?'):
1087 minimal = 1
1088 index = index + 1
1089
1090 if len(fields) == 1:
1091 # {n} or {n}? (there's really no difference)
1092 try:
1093 count = string.atoi(fields[0])
1094 except ValueError:
1095 raise error, ('count must be an integer '
1096 'inside curly braces')
1097 if count > 65535:
1098 raise error, 'repeat count out of range'
1099 expr = []
1100 while count > 0:
1101 expr = expr + stack[-1]
1102 count = count - 1
1103 del stack[-1]
1104 stack.append(expr)
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001105 if minimal:
1106 lastop = '{n}?'
1107 else:
1108 lastop = '{n}'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001109
1110 elif len(fields) == 2:
1111 # {n,} or {n,m}
1112 if fields[1] == '':
1113 # {n,}
1114 try:
1115 min = string.atoi(fields[0])
1116 except ValueError:
1117 raise error, ('minimum must be an integer '
1118 'inside curly braces')
1119 if min > 65535:
1120 raise error, 'minimum repeat count out of range'
1121
1122 expr = []
1123 while min > 0:
1124 expr = expr + stack[-1]
1125 min = min - 1
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001126 if minimal:
1127 expr = expr + \
1128 ([Jump(label + 1),
1129 Label(label)] + \
1130 stack[-1] + \
1131 [Label(label + 1),
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +00001132 FailureJump(label)])
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001133 lastop = '{n,}?'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001134 else:
1135 expr = expr + \
1136 ([Label(label),
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +00001137 FailureJump(label + 1)] +
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001138 stack[-1] +
1139 [StarJump(label),
1140 Label(label + 1)])
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001141 lastop = '{n,}'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001142
1143 del stack[-1]
1144 stack.append(expr)
1145 label = label + 2
1146
1147 else:
1148 # {n,m}
1149 try:
1150 min = string.atoi(fields[0])
1151 except ValueError:
1152 raise error, ('minimum must be an integer '
1153 'inside curly braces')
1154 try:
1155 max = string.atoi(fields[1])
1156 except ValueError:
1157 raise error, ('maximum must be an integer '
1158 'inside curly braces')
1159 if min > 65535:
1160 raise error, ('minumim repeat count out '
1161 'of range')
1162 if max > 65535:
1163 raise error, ('maximum repeat count out '
1164 'of range')
1165 if min > max:
1166 raise error, ('minimum repeat count must be '
1167 'less than the maximum '
1168 'repeat count')
1169 expr = []
1170 while min > 0:
1171 expr = expr + stack[-1]
1172 min = min - 1
1173 max = max - 1
1174 if minimal:
1175 while max > 0:
1176 expr = expr + \
1177 [FailureJump(label),
1178 Jump(label + 1),
1179 Label(label)] + \
1180 stack[-1] + \
1181 [Label(label + 1)]
Guido van Rossum26d80e61997-07-15 18:59:04 +00001182 max = max - 1
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001183 label = label + 2
1184 del stack[-1]
1185 stack.append(expr)
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001186 lastop = '{n,m}?'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001187 else:
1188 while max > 0:
1189 expr = expr + \
1190 [FailureJump(label)] + \
1191 stack[-1]
1192 max = max - 1
1193 del stack[-1]
1194 stack.append(expr + [Label(label)])
1195 label = label + 1
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001196 lastop = '{n,m}'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001197
1198 else:
1199 raise error, ('there need to be one or two fields '
1200 'in a {} expression')
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001201
1202 elif char == '}':
1203 raise error, 'unbalanced close curly brace'
1204
1205 elif char == '*':
1206 # Kleene closure
1207 if len(stack) == 0:
Guido van Rossum5d6de251997-07-11 21:10:17 +00001208 raise error, '* needs something to repeat'
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001209
1210 if lastop in ['(', '|']:
Guido van Rossum5d6de251997-07-11 21:10:17 +00001211 raise error, '* needs something to repeat'
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001212
1213 if lastop in repetition_operators:
1214 raise error, 'nested repetition operators'
1215
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001216 if (index < len(pattern)) and (pattern[index] == '?'):
1217 # non-greedy matching
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +00001218 expr = [Jump(label + 1),
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001219 Label(label)] + \
1220 stack[-1] + \
1221 [Label(label + 1),
1222 FailureJump(label)]
1223 index = index + 1
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001224 lastop = '*?'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001225 else:
1226 # greedy matching
1227 expr = [Label(label),
1228 FailureJump(label + 1)] + \
1229 stack[-1] + \
1230 [StarJump(label),
1231 Label(label + 1)]
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001232 lastop = '*'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001233 del stack[-1]
1234 stack.append(expr)
1235 label = label + 2
1236
1237 elif char == '+':
1238 # positive closure
1239 if len(stack) == 0:
Guido van Rossum5d6de251997-07-11 21:10:17 +00001240 raise error, '+ needs something to repeat'
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +00001241
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001242 if lastop in ['(', '|']:
Guido van Rossum5d6de251997-07-11 21:10:17 +00001243 raise error, '+ needs something to repeat'
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001244
1245 if lastop in repetition_operators:
1246 raise error, 'nested repetition operators'
1247
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001248 if (index < len(pattern)) and (pattern[index] == '?'):
1249 # non-greedy
1250 expr = [Label(label)] + \
1251 stack[-1] + \
1252 [FailureJump(label)]
1253 label = label + 1
1254 index = index + 1
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001255 lastop = '+?'
1256
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001257 else:
1258 # greedy
1259 expr = [DummyFailureJump(label + 1),
1260 Label(label),
1261 FailureJump(label + 2),
1262 Label(label + 1)] + \
1263 stack[-1] + \
1264 [StarJump(label),
1265 Label(label + 2)]
1266 label = label + 3
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001267 lastop = '+'
1268
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001269 del stack[-1]
1270 stack.append(expr)
1271
1272 elif char == '?':
1273 if len(stack) == 0:
1274 raise error, 'need something to be optional'
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001275
1276 if len(stack) == 0:
1277 raise error, '? needs something to repeat'
1278
1279 if lastop in ['(', '|']:
1280 raise error, '? needs something to repeat'
1281
1282 if lastop in repetition_operators:
1283 raise error, 'nested repetition operators'
1284
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001285 if (index < len(pattern)) and (pattern[index] == '?'):
1286 # non-greedy matching
1287 expr = [FailureJump(label),
1288 Jump(label + 1),
1289 Label(label)] + \
1290 stack[-1] + \
1291 [Label(label + 1)]
1292 label = label + 2
1293 index = index + 1
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001294 lastop = '??'
1295
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001296 else:
1297 # greedy matching
1298 expr = [FailureJump(label)] + \
1299 stack[-1] + \
1300 [Label(label)]
1301 label = label + 1
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001302 lastop = '?'
1303
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001304 del stack[-1]
1305 stack.append(expr)
1306
1307 elif char == '.':
1308 if flags & DOTALL:
Guido van Rossuma0e4c1b1997-07-17 14:52:48 +00001309 stack.append([Set(map(chr, range(256)))])
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001310 else:
1311 stack.append([AnyChar()])
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001312 lastop = '.'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001313
1314 elif char == '^':
1315 if flags & MULTILINE:
1316 stack.append([Bol()])
1317 else:
1318 stack.append([BegBuf()])
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001319 lastop = '^'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001320
1321 elif char == '$':
1322 if flags & MULTILINE:
1323 stack.append([Eol()])
1324 else:
1325 stack.append([EndBuf()])
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001326 lastop = '$'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001327
1328 elif char == '#':
1329 if flags & VERBOSE:
1330 # comment
1331 index = index + 1
1332 end = string.find(pattern, '\n', index)
1333 if end == -1:
1334 index = len(pattern)
1335 else:
1336 index = end + 1
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001337 # do not change lastop
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001338 else:
1339 stack.append([Exact(char)])
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001340 lastop = '#'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001341
1342 elif char in string.whitespace:
Guido van Rossum04a1d741997-07-15 14:38:13 +00001343 if not (flags & VERBOSE):
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001344 stack.append([Exact(char)])
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001345 lastop = char
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001346
1347 elif char == '[':
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001348 # compile character class
1349
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001350 if index >= len(pattern):
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001351 raise error, 'unclosed character class'
1352
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001353 negate = 0
1354 last = ''
1355 set = []
Guido van Rossum04a1d741997-07-15 14:38:13 +00001356
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001357 if pattern[index] == '^':
1358 negate = 1
1359 index = index + 1
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001360 if index >= len(pattern):
1361 raise error, 'unclosed character class'
Guido van Rossum04a1d741997-07-15 14:38:13 +00001362
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001363 if pattern[index] == ']':
1364 set.append(']')
1365 index = index + 1
1366 if index >= len(pattern):
1367 raise error, 'unclosed character class'
1368
1369 elif pattern[index] == '-':
1370 set.append('-')
1371 index = index + 1
1372 if index >= len(pattern):
1373 raise error, 'unclosed character class'
1374
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001375 while (index < len(pattern)) and (pattern[index] != ']'):
1376 next = pattern[index]
1377 index = index + 1
1378 if next == '-':
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001379 if index >= len(pattern):
1380 raise error, 'incomplete range in character class'
Guido van Rossum04a1d741997-07-15 14:38:13 +00001381
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001382 elif pattern[index] == ']':
1383 set.append('-')
Guido van Rossum04a1d741997-07-15 14:38:13 +00001384
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001385 else:
1386 if last == '':
1387 raise error, ('improper use of range in '
1388 'character class')
1389
1390 start = last
1391
1392 if pattern[index] == '\\':
1393 escape_type,
1394 value,
1395 index = expand_escape(pattern,
1396 index + 1,
1397 CHARCLASS)
1398
1399 if escape_type == CHAR:
1400 end = value
1401
1402 else:
1403 raise error, ('illegal escape in character '
1404 'class range')
1405 else:
1406 end = pattern[index]
1407 index = index + 1
1408
1409 if start > end:
1410 raise error, ('range arguments out of order '
1411 'in character class')
1412
1413 for char in map(chr, range(ord(start), ord(end) + 1)):
1414 if char not in set:
1415 set.append(char)
Guido van Rossum04a1d741997-07-15 14:38:13 +00001416
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001417 last = ''
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001418
1419 elif next == '\\':
1420 # expand syntax meta-characters and add to set
1421 if index >= len(pattern):
1422 raise error, 'incomplete set'
Guido van Rossum04a1d741997-07-15 14:38:13 +00001423
1424 escape_type, value, index = expand_escape(pattern,
1425 index,
1426 CHARCLASS)
1427
1428 if escape_type == CHAR:
1429 set.append(value)
1430 last = value
1431
1432 elif escape_type == SET:
1433 for char in value:
1434 if char not in set:
1435 set.append(char)
1436 last = ''
1437
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001438 else:
Guido van Rossum04a1d741997-07-15 14:38:13 +00001439 raise error, 'illegal escape type in character class'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001440
1441 else:
1442 if next not in set:
1443 set.append(next)
1444 last = next
Guido van Rossum04a1d741997-07-15 14:38:13 +00001445
1446 if (index >= len(pattern)) or ( pattern[index] != ']'):
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001447 raise error, 'incomplete set'
1448
1449 index = index + 1
1450
1451 if negate:
1452 notset = []
1453 for char in map(chr, range(256)):
1454 if char not in set:
1455 notset.append(char)
Guido van Rossum04a1d741997-07-15 14:38:13 +00001456 if len(notset) == 0:
1457 raise error, 'empty negated set'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001458 stack.append([Set(notset)])
1459 else:
Guido van Rossum04a1d741997-07-15 14:38:13 +00001460 if len(set) == 0:
1461 raise error, 'empty set'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001462 stack.append([Set(set)])
1463
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001464 lastop = '[]'
1465
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001466 else:
1467 stack.append([Exact(char)])
Guido van Rossuma4f1a781997-07-17 22:38:10 +00001468 lastop = char
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001469
1470 code = []
1471 while len(stack) > 0:
1472 if stack[-1][0].name == '(':
1473 raise error, 'too many open parens'
1474 code = stack[-1] + code
1475 del stack[-1]
1476 if len(code) == 0:
1477 raise error, 'no code generated'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001478 code = filter(lambda x: x.name != '|', code)
1479 need_label = 0
1480 for i in range(len(code)):
1481 if (code[i].name == 'jump') and (code[i].label == -1):
1482 code[i] = Jump(label)
1483 need_label = 1
1484 if need_label:
1485 code.append(Label(label))
1486 label = label + 1
1487 code.append(End())
Guido van Rossum71fa97c1997-07-18 04:26:03 +00001488 return RegexObject(pattern, flags, code, register, groupindex)
Guido van Rossum6af4abd1997-08-13 03:25:34 +00001489
1490# Replace expand_escape and _expand functions with their C equivalents.
1491# If you suspect bugs in the C versions, comment out the next two lines
1492expand_escape = reop.expand_escape
1493_expand = reop._expand