blob: 2e1b9205f07dd27515e17c1cec456a7d4fc84962 [file] [log] [blame]
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001#!/usr/bin/env python
2# -*- mode: python -*-
3# $Id$
4
5import string
6import reop
7
8error = 're error'
9
10# compilation flags
11
12IGNORECASE = I = 0x01
13
14MULTILINE = M = 0x02
15DOTALL = S = 0x04
16VERBOSE = X = 0x08
17
18#
19# Initialize syntax table. This information should really come from the
20# syntax table in regexpr.c rather than being duplicated here.
21#
22
23syntax_table = {}
24
25for char in map(chr, range(0, 256)):
26 syntax_table[char] = []
27
28for char in string.lowercase:
29 syntax_table[char].append('word')
30
31for char in string.uppercase:
32 syntax_table[char].append('word')
33
34for char in string.digits:
35 syntax_table[char].append('word')
36 syntax_table[char].append('digit')
37
38for char in string.whitespace:
39 syntax_table[char].append('whitespace')
40
41syntax_table['_'].append('word')
42
43#
44#
45#
46
Guido van Rossum09bcfd61997-07-15 15:38:20 +000047def valid_identifier(id):
48 if len(id) == 0:
49 return 0
50 if ('word' not in syntax_table[id[0]]) or ('digit' in syntax_table[id[0]]):
51 return 0
52 for char in id[1:]:
53 if 'word' not in syntax_table[char]:
54 return 0
55 return 1
56
57#
58#
59#
60
Guido van Rossum5ca1b711997-07-10 21:00:31 +000061def match(pattern, string, flags=0):
62 return compile(pattern, flags).match(string)
63
64def search(pattern, string, flags=0):
65 return compile(pattern, flags).search(string)
66
67def sub(pattern, repl, string, count=0):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +000068 return compile(pattern).sub(repl, string, count)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000069
70def subn(pattern, repl, string, count=0):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +000071 return compile(pattern).subn(repl, string, count)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000072
Guido van Rossum8a9a4a21997-07-11 20:48:25 +000073def split(pattern, string, maxsplit=0):
74 return compile(pattern).subn(string, maxsplit)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000075
76#
77#
78#
79
80class RegexObject:
81 def __init__(self, pattern, flags, code, num_regs, groupindex, callouts):
82 self.code = code
83 self.num_regs = num_regs
84 self.flags = flags
85 self.pattern = pattern
86 self.groupindex = groupindex
87 self.callouts = callouts
88 self.fastmap = build_fastmap(code)
89 if code[0].name == 'bol':
90 self.anchor = 1
91 elif code[0].name == 'begbuf':
92 self.anchor = 2
93 else:
94 self.anchor = 0
95 self.buffer = assemble(code)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000096 def search(self, string, pos=0):
97 regs = reop.search(self.buffer,
98 self.num_regs,
99 self.flags,
100 self.fastmap.can_be_null,
101 self.fastmap.fastmap(),
102 self.anchor,
103 string,
104 pos)
105 if regs is None:
106 return None
107 return MatchObject(self,
108 string,
109 pos,
110 regs)
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000111 def match(self, string, pos=0):
112 regs = reop.match(self.buffer,
113 self.num_regs,
114 self.flags,
115 self.fastmap.can_be_null,
116 self.fastmap.fastmap(),
117 self.anchor,
118 string,
119 pos)
120 if regs is None:
121 return None
122 return MatchObject(self,
123 string,
124 pos,
125 regs)
126 def sub(self, repl, string, count=0):
127 pass
128 def subn(self, repl, string, count=0):
129 pass
130 def split(self, string, maxsplit=0):
131 pass
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000132
133class MatchObject:
134 def __init__(self, re, string, pos, regs):
135 self.re = re
136 self.string = string
137 self.pos = pos
138 self.regs = regs
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000139 def start(self, g):
140 if type(g) == type(''):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000141 try:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000142 g = self.re.groupindex[g]
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000143 except (KeyError, TypeError):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000144 raise IndexError, ('group "' + g + '" is undefined')
145 return self.regs[g][0]
146 def end(self, g):
147 if type(g) == type(''):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000148 try:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000149 g = self.re.groupindex[g]
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000150 except (KeyError, TypeError):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000151 raise IndexError, ('group "' + g + '" is undefined')
152 return self.regs[g][1]
153 def span(self, g):
154 if type(g) == type(''):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000155 try:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000156 g = self.re.groupindex[g]
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000157 except (KeyError, TypeError):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000158 raise IndexError, ('group "' + g + '" is undefined')
159 return self.regs[g]
160 def group(self, *groups):
161 if len(groups) == 0:
162 groups = range(1, self.re.num_regs)
Guido van Rossum53109751997-07-15 15:40:29 +0000163 use_all = 1
164 else:
165 use_all = 0
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000166 result = []
167 for g in groups:
168 if type(g) == type(''):
169 try:
170 g = self.re.groupindex[g]
171 except (KeyError, TypeError):
172 raise IndexError, ('group "' + g + '" is undefined')
Guido van Rossum04a1d741997-07-15 14:38:13 +0000173 if (self.regs[g][0] == -1) or (self.regs[g][1] == -1):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000174 result.append(None)
175 else:
176 result.append(self.string[self.regs[g][0]:self.regs[g][1]])
Guido van Rossum53109751997-07-15 15:40:29 +0000177 if use_all or len(result) > 1:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000178 return tuple(result)
179 elif len(result) == 1:
180 return result[0]
181 else:
182 return ()
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000183
184#
185# A set of classes to make assembly a bit easier, if a bit verbose.
186#
187
188class Instruction:
189 def __init__(self, opcode, size=1):
190 self.opcode = opcode
191 self.size = size
192 def assemble(self, position, labels):
193 return self.opcode
194 def __repr__(self):
195 return '%-15s' % (self.name)
196
197class FunctionCallout(Instruction):
198 name = 'function'
199 def __init__(self, function):
200 self.function = function
201 Instruction.__init__(self, chr(22), 2 + len(self.function))
202 def assemble(self, position, labels):
203 return self.opcode + chr(len(self.function)) + self.function
204 def __repr__(self):
205 return '%-15s %-10s' % (self.name, self.function)
206
207class End(Instruction):
208 name = 'end'
209 def __init__(self):
210 Instruction.__init__(self, chr(0))
211
212class Bol(Instruction):
213 name = 'bol'
214 def __init__(self):
215 self.name = 'bol'
216 Instruction.__init__(self, chr(1))
217
218class Eol(Instruction):
219 name = 'eol'
220 def __init__(self):
221 Instruction.__init__(self, chr(2))
222
223class Set(Instruction):
224 name = 'set'
225 def __init__(self, set):
226 self.set = set
227 Instruction.__init__(self, chr(3), 33)
Guido van Rossum63e18191997-07-11 11:08:38 +0000228 def assemble(self, position, labels):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000229 result = self.opcode
230 temp = 0
231 for i, c in map(lambda x: (x, chr(x)), range(256)):
Guido van Rossum63e18191997-07-11 11:08:38 +0000232 if c in self.set:
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000233 temp = temp | (1 << (i & 7))
234 if (i % 8) == 7:
235 result = result + chr(temp)
236 temp = 0
237 return result
238 def __repr__(self):
239 result = '%-15s' % (self.name)
240 self.set.sort()
241 for char in self.set:
Guido van Rossum63e18191997-07-11 11:08:38 +0000242 result = result + char
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000243 return result
244
245class Exact(Instruction):
246 name = 'exact'
247 def __init__(self, char):
248 self.char = char
249 Instruction.__init__(self, chr(4), 2)
250 def assemble(self, position, labels):
251 return self.opcode + self.char
252 def __repr__(self):
253 return '%-15s %s' % (self.name, `self.char`)
254
255class AnyChar(Instruction):
256 name = 'anychar'
257 def __init__(self):
258 Instruction.__init__(self, chr(5))
259 def assemble(self, position, labels):
260 return self.opcode
261
262class MemoryInstruction(Instruction):
263 def __init__(self, opcode, register):
264 self.register = register
265 Instruction.__init__(self, opcode, 2)
266 def assemble(self, position, labels):
267 return self.opcode + chr(self.register)
268 def __repr__(self):
269 return '%-15s %i' % (self.name, self.register)
270
271class StartMemory(MemoryInstruction):
272 name = 'start_memory'
273 def __init__(self, register):
274 MemoryInstruction.__init__(self, chr(6), register)
275
276class EndMemory(MemoryInstruction):
277 name = 'end_memory'
278 def __init__(self, register):
279 MemoryInstruction.__init__(self, chr(7), register)
280
281class MatchMemory(MemoryInstruction):
282 name = 'match_memory'
283 def __init__(self, register):
284 MemoryInstruction.__init__(self, chr(8), register)
285
286class JumpInstruction(Instruction):
287 def __init__(self, opcode, label):
288 self.label = label
289 Instruction.__init__(self, opcode, 3)
290 def compute_offset(self, start, dest):
291 return dest - (start + 3)
292 def pack_offset(self, offset):
293 if offset > 32767:
294 raise error, 'offset out of range (pos)'
295 elif offset < -32768:
296 raise error, 'offset out of range (neg)'
297 elif offset < 0:
298 offset = offset + 65536
299 return chr(offset & 0xff) + chr((offset >> 8) & 0xff)
300 def assemble(self, position, labels):
301 return self.opcode + \
302 self.pack_offset(self.compute_offset(position,
303 labels[self.label]))
304 def __repr__(self):
305 return '%-15s %i' % (self.name, self.label)
306
307class Jump(JumpInstruction):
308 name = 'jump'
309 def __init__(self, label):
310 JumpInstruction.__init__(self, chr(9), label)
311
312class StarJump(JumpInstruction):
313 name = 'star_jump'
314 def __init__(self, label):
315 JumpInstruction.__init__(self, chr(10), label)
316
317class FailureJump(JumpInstruction):
318 name = 'failure_jump'
319 def __init__(self, label):
320 JumpInstruction.__init__(self, chr(11), label)
321
322class UpdateFailureJump(JumpInstruction):
323 name = 'update_failure_jump'
324 def __init__(self, label):
325 JumpInstruction.__init__(self, chr(12), label)
326
327class DummyFailureJump(JumpInstruction):
328 name = 'update_failure_jump'
329 def __init__(self, label):
330 JumpInstruction.__init__(self, chr(13), label)
331
332class BegBuf(Instruction):
333 name = 'begbuf'
334 def __init__(self):
335 Instruction.__init__(self, chr(14))
336
337class EndBuf(Instruction):
338 name = 'endbuf'
339 def __init__(self):
340 Instruction.__init__(self, chr(15))
341
342class WordBeg(Instruction):
343 name = 'wordbeg'
344 def __init__(self):
345 Instruction.__init__(self, chr(16))
346
347class WordEnd(Instruction):
348 name = 'wordend'
349 def __init__(self):
350 Instruction.__init__(self, chr(17))
351
352class WordBound(Instruction):
353 name = 'wordbound'
354 def __init__(self):
355 Instruction.__init__(self, chr(18))
356
357class NotWordBound(Instruction):
358 name = 'notwordbound'
359 def __init__(self):
360 Instruction.__init__(self, chr(18))
361
362class SyntaxSpec(Instruction):
363 name = 'syntaxspec'
364 def __init__(self, syntax):
365 self.syntax = syntax
366 Instruction.__init__(self, chr(20), 2)
367 def assemble(self, postition, labels):
Guido van Rossum5d6de251997-07-11 21:10:17 +0000368 # XXX
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000369 return self.opcode + chr(self.syntax)
370
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000371class NotSyntaxSpec(Instruction):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000372 name = 'notsyntaxspec'
373 def __init__(self, syntax):
374 self.syntax = syntax
375 Instruction.__init__(self, chr(21), 2)
376 def assemble(self, postition, labels):
Guido van Rossum5d6de251997-07-11 21:10:17 +0000377 # XXX
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000378 return self.opcode + chr(self.syntax)
379
380class Label(Instruction):
381 name = 'label'
382 def __init__(self, label):
383 self.label = label
384 Instruction.__init__(self, '', 0)
385 def __repr__(self):
386 return '%-15s %i' % (self.name, self.label)
387
388class OpenParen(Instruction):
389 name = '('
390 def __init__(self, register):
391 self.register = register
392 Instruction.__init__(self, '', 0)
Guido van Rossum5d6de251997-07-11 21:10:17 +0000393 def assemble(self, position, labels):
394 raise error, 'unmatched open parenthesis'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000395
396class Alternation(Instruction):
397 name = '|'
398 def __init__(self):
399 Instruction.__init__(self, '', 0)
Guido van Rossum5d6de251997-07-11 21:10:17 +0000400 def assemble(self, position, labels):
401 raise error, 'an alternation was not taken care of'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000402
403#
404#
405#
406
407def assemble(instructions):
408 labels = {}
409 position = 0
410 pass1 = []
411 for instruction in instructions:
412 if instruction.name == 'label':
413 labels[instruction.label] = position
414 else:
415 pass1.append((position, instruction))
416 position = position + instruction.size
417 pass2 = ''
418 for position, instruction in pass1:
419 pass2 = pass2 + instruction.assemble(position, labels)
420 return pass2
421
422#
423#
424#
425
426def escape(pattern):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000427 result = []
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000428 for char in pattern:
429 if 'word' not in syntax_table[char]:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000430 result.append('\\')
431 result.append(char)
432 return string.join(result, '')
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000433
434#
435#
436#
437
438def registers_used(instructions):
439 result = []
440 for instruction in instructions:
441 if (instruction.name in ['set_memory', 'end_memory']) and \
442 (instruction.register not in result):
443 result.append(instruction.register)
444 return result
445
446#
447#
448#
449
450class Fastmap:
451 def __init__(self):
452 self.map = ['\000']*256
453 self.can_be_null = 0
454 def add(self, char):
455 self.map[ord(char)] = '\001'
456 def fastmap(self):
457 return string.join(self.map, '')
458 def __getitem__(self, char):
459 return ord(self.map[ord(char)])
460 def __repr__(self):
461 self.map.sort()
462 return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')'
463
464#
465#
466#
467
468def find_label(code, label):
469 line = 0
470 for instruction in code:
471 if (instruction.name == 'label') and (instruction.label == label):
472 return line + 1
473 line = line + 1
474
475def build_fastmap_aux(code, pos, visited, fastmap):
476 if visited[pos]:
477 return
478 while 1:
479 instruction = code[pos]
480 visited[pos] = 1
481 pos = pos + 1
482 if instruction.name == 'end':
483 fastmap.can_be_null = 1
484 return
485 elif instruction.name == 'syntaxspec':
486 for char in map(chr, range(256)):
487 if instruction.syntax in syntax_table[char]:
488 fastmap.add(char)
489 return
490 elif instruction.name == 'notsyntaxspec':
491 for char in map(chr, range(256)):
492 if instruction.syntax not in syntax_table[char]:
493 fastmap.add(char)
494 return
495 elif instruction.name == 'eol':
496 fastmap.add('\n')
497 if fastmap.can_be_null == 0:
498 fastmap.can_be_null = 2
499 return
500 elif instruction.name == 'set':
501 for char in instruction.set:
502 fastmap.add(char)
503 return
504 elif instruction.name == 'exact':
505 fastmap.add(instruction.char)
506 elif instruction.name == 'anychar':
507 for char in map(chr, range(256)):
508 if char != '\n':
509 fastmap.add(char)
510 return
511 elif instruction.name == 'match_memory':
512 for char in map(chr, range(256)):
513 fastmap.add(char)
514 fastmap.can_be_null = 1
515 return
516 elif instruction.name in ['jump', 'dummy_failure_jump', \
517 'update_failure_jump', 'star_jump']:
518 pos = find_label(code, instruction.label)
519 if visited[pos]:
520 return
521 visited[pos] = 1
522 elif instruction.name == 'failure_jump':
523 build_fastmap_aux(code,
524 find_label(code, instruction.label),
525 visited,
526 fastmap)
527 elif instruction.name == 'function':
528 for char in map(chr, range(256)):
529 fastmap.add(char)
530 fastmap.can_be_null = 1
531 return
532
533def build_fastmap(code, pos=0):
534 visited = [0] * len(code)
535 fastmap = Fastmap()
536 build_fastmap_aux(code, pos, visited, fastmap)
537 return fastmap
538
539#
540#
541#
542
Guido van Rossum04a1d741997-07-15 14:38:13 +0000543[NORMAL, CHARCLASS, REPLACEMENT] = range(3)
544[CHAR, MEMORY_REFERENCE, SYNTAX, SET, WORD_BOUNDARY, NOT_WORD_BOUNDARY,
545 BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(8)
546
547def expand_escape(pattern, index, context=NORMAL):
548 if index >= len(pattern):
549 raise error, 'escape ends too soon'
550
551 elif pattern[index] == 't':
552 return CHAR, chr(9), index + 1
553
554 elif pattern[index] == 'n':
555 return CHAR, chr(10), index + 1
556
557 elif pattern[index] == 'r':
558 return CHAR, chr(13), index + 1
559
560 elif pattern[index] == 'f':
561 return CHAR, chr(12), index + 1
562
563 elif pattern[index] == 'a':
564 return CHAR, chr(7), index + 1
565
566 elif pattern[index] == 'e':
567 return CHAR, chr(27), index + 1
568
569 elif pattern[index] == 'c':
570 if index + 1 >= len(pattern):
571 raise error, '\\c must be followed by another character'
572 elif pattern[index + 1] in 'abcdefghijklmnopqrstuvwxyz':
573 return CHAR, chr(ord(pattern[index + 1]) - ord('a') + 1), index + 2
574 else:
575 return CHAR, chr(ord(pattern[index + 1]) ^ 64), index + 2
576
577 elif pattern[index] == 'x':
578 # CAUTION: this is the Python rule, not the Perl rule!
579 end = index
580 while (end < len(pattern)) and (pattern[end] in string.hexdigits):
581 end = end + 1
582 if end == index:
583 raise error, "\\x must be followed by hex digit(s)"
584 # let Python evaluate it, so we don't incorrectly 2nd-guess
585 # what it's doing (and Python in turn passes it on to sscanf,
586 # so that *it* doesn't incorrectly 2nd-guess what C does!)
587 char = eval ('"' + pattern[index-2:end] + '"')
588 assert len(char) == 1
589 return CHAR, char, end
590
591 elif pattern[index] == 'b':
592 if context != NORMAL:
593 return CHAR, chr(8), index + 1
594 else:
595 return WORD_BOUNDARY, '', index + 1
596
597 elif pattern[index] == 'B':
598 if context != NORMAL:
599 return CHAR, 'B', index + 1
600 else:
601 return NOT_WORD_BOUNDARY, '', index + 1
602
603 elif pattern[index] == 'A':
604 if context != NORMAL:
605 return CHAR, 'A', index + 1
606 else:
607 return BEGINNING_OF_BUFFER, '', index + 1
608
609 elif pattern[index] == 'Z':
610 if context != NORMAL:
611 return 'Z', index + 1
612 else:
613 return END_OF_BUFFER, '', index + 1
614
615 elif pattern[index] in 'GluLUQE':
616 raise error, ('\\' + ch + ' is not allowed')
617
618 elif pattern[index] == 'w':
619 if context == NORMAL:
620 return SYNTAX, 'word', index + 1
621 elif context == CHARCLASS:
622 set = []
623 for char in syntax_table.keys():
624 if 'word' in syntax_table[char]:
625 set.append(char)
626 return SET, set, index + 1
627 else:
628 return CHAR, 'w', index + 1
629
630 elif pattern[index] == 'W':
631 if context == NORMAL:
632 return NOT_SYNTAX, 'word', index + 1
633 elif context == CHARCLASS:
634 set = []
635 for char in syntax_table.keys():
636 if 'word' not in syntax_table[char]:
637 set.append(char)
638 return SET, set, index + 1
639 else:
640 return CHAR, 'W', index + 1
641
642 elif pattern[index] == 's':
643 if context == NORMAL:
644 return SYNTAX, 'whitespace', index + 1
645 elif context == CHARCLASS:
646 set = []
647 for char in syntax_table.keys():
648 if 'whitespace' in syntax_table[char]:
649 set.append(char)
650 return SET, set, index + 1
651 else:
652 return CHAR, 's', index + 1
653
654 elif pattern[index] == 'S':
655 if context == NORMAL:
656 return NOT_SYNTAX, 'whitespace', index + 1
657 elif context == CHARCLASS:
658 set = []
659 for char in syntax_table.keys():
660 if 'whitespace' not in syntax_table[char]:
661 set.append(char)
662 return SET, set, index + 1
663 else:
664 return CHAR, 'S', index + 1
665
666 elif pattern[index] == 'd':
667 if context == NORMAL:
668 return SYNTAX, 'digit', index + 1
669 elif context == CHARCLASS:
670 set = []
671 for char in syntax_table.keys():
672 if 'digit' in syntax_table[char]:
673 set.append(char)
674 return SET, set, index + 1
675 else:
676 return CHAR, 'd', index + 1
677
678 elif pattern[index] == 'D':
679 if context == NORMAL:
680 return NOT_SYNTAX, 'digit', index + 1
681 elif context == CHARCLASS:
682 set = []
683 for char in syntax_table.keys():
684 if 'digit' not in syntax_table[char]:
685 set.append(char)
686 return SET, set, index + 1
687 else:
688 return CHAR, 'D', index + 1
689
690 elif pattern[index] in '0123456789':
691 end = index
692 while (end < len(pattern)) and (pattern[end] in string.digits):
693 end = end + 1
694 value = pattern[index:end]
695
696 if (len(value) == 3) or ((len(value) == 2) and (value[0] == '0')):
697 # octal character value
698 value = string.atoi(value, 8)
699 if value > 255:
700 raise error, 'octal char out of range'
701 return CHAR, chr(value), end
702
703 elif value == '0':
704 return CHAR, chr(0), end
705
706 elif len(value) > 3:
707 raise error, ('\\' + value + ' has too many digits')
708
709 else:
710 # \1-\99 - reference a register
711 if context == CHARCLASS:
712 raise error, ('cannot reference a register from '
713 'inside a character class')
714 value = string.atoi(value)
715 if value == 0:
716 raise error, ('register 0 cannot be used '
717 'during match')
718 return MEMORY_REFERENCE, value, end
719
720 else:
721 return CHAR, pattern[index], index + 1
722
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000723def compile(pattern, flags=0):
724 stack = []
725 index = 0
726 label = 0
727 register = 1
728 groupindex = {}
729 callouts = []
730 while (index < len(pattern)):
731 char = pattern[index]
732 index = index + 1
733 if char == '\\':
Guido van Rossum04a1d741997-07-15 14:38:13 +0000734 escape_type, value, index = expand_escape(pattern, index)
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000735
Guido van Rossum04a1d741997-07-15 14:38:13 +0000736 if escape_type == CHAR:
737 stack.append([Exact(value)])
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000738
Guido van Rossum04a1d741997-07-15 14:38:13 +0000739 elif escape_type == MEMORY_REFERENCE:
740 if value >= register:
741 raise error, ('cannot reference a register '
742 'not yet used')
743 stack.append([MatchMemory(value)])
744
745 elif escape_type == BEGINNING_OF_BUFFER:
746 stack.append([BegBuf()])
747
748 elif escape_type == END_OF_BUFFER:
749 stack.append([EndBuf()])
750
751 elif escape_type == WORD_BOUNDARY:
752 stack.append([WordBound()])
753
754 elif escape_type == NOT_WORD_BOUNDARY:
755 stack.append([NotWordBound()])
756
757 elif escape_type == SYNTAX:
758 stack.append([SyntaxSpec(value)])
759
760 elif escape_type == NOT_SYNTAX:
761 stack.append([NotSyntaxSpec(value)])
762
763 elif escape_type == SET:
764 raise error, 'cannot use set escape type here'
765
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000766 else:
Guido van Rossum04a1d741997-07-15 14:38:13 +0000767 raise error, 'unknown escape type'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000768
769 elif char == '|':
770 if len(stack) == 0:
Guido van Rossum04a1d741997-07-15 14:38:13 +0000771 raise error, 'alternate with nothing on the left'
772 if stack[-1][0].name == '(':
773 raise error, 'alternate with nothing on the left in the group'
774 if stack[-1][0].name == '|':
775 raise error, 'alternates with nothing inbetween them'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000776 expr = []
Guido van Rossum04a1d741997-07-15 14:38:13 +0000777
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000778 while (len(stack) != 0) and \
779 (stack[-1][0].name != '(') and \
780 (stack[-1][0].name != '|'):
781 expr = stack[-1] + expr
782 del stack[-1]
783 stack.append([FailureJump(label)] + \
784 expr + \
785 [Jump(-1),
786 Label(label)])
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000787 stack.append([Alternation()])
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000788 label = label + 1
789
790 elif char == '(':
791 if index >= len(pattern):
792 raise error, 'no matching close paren'
793
794 elif pattern[index] == '?':
795 # Perl style (?...) extensions
796 index = index + 1
797 if index >= len(pattern):
798 raise error, 'extension ends prematurely'
799
800 elif pattern[index] == 'P':
801 # Python extensions
802 index = index + 1
803 if index >= len(pattern):
804 raise error, 'extension ends prematurely'
805
806 elif pattern[index] == '<':
Guido van Rossum09bcfd61997-07-15 15:38:20 +0000807 # Handle Python symbolic group names (?P<...>...)
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000808 index = index + 1
809 end = string.find(pattern, '>', index)
810 if end == -1:
811 raise error, 'no end to symbolic group name'
812 name = pattern[index:end]
Guido van Rossum09bcfd61997-07-15 15:38:20 +0000813 if not valid_identifier(name):
814 raise error, ('symbolic group name must be a '
815 'valid identifier')
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000816 index = end + 1
817 groupindex[name] = register
818 stack.append([OpenParen(register)])
819 register = register + 1
820
821 elif pattern[index] == '=':
822 # backreference to symbolic group name
823 if index >= len(pattern):
824 raise error, '(?P= at the end of the pattern'
825 start = index + 1
826 end = string.find(pattern, ')', start)
827 if end == -1:
828 raise error, 'no ) to end symbolic group name'
829 name = pattern[start:end]
Guido van Rossum09bcfd61997-07-15 15:38:20 +0000830 if name not in groupindex.keys():
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000831 raise error, ('symbolic group name ' + name + \
832 ' has not been used yet')
833 stack.append([MatchMemory(groupindex[name])])
834 index = end + 1
835
836 elif pattern[index] == '!':
837 # function callout
838 if index >= len(pattern):
839 raise error, 'no function callout name'
840 start = index + 1
841 end = string.find(pattern, ')', start)
842 if end == -1:
843 raise error, 'no ) to end function callout name'
844 name = pattern[start:end]
845 if name not in callouts:
846 raise error, ('function callout name not listed '
847 'in callouts dict')
848 stack.append([FunctionCallout(name)])
849
850 else:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000851 raise error, ('unknown Python extension: ' + \
852 pattern[index])
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000853
854 elif pattern[index] == ':':
855 # grouping, but no registers
856 index = index + 1
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000857 stack.append([OpenParen(-1)])
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000858
859 elif pattern[index] == '#':
860 # comment
861 index = index + 1
862 end = string.find(pattern, ')', index)
863 if end == -1:
864 raise error, 'no end to comment'
865 index = end + 1
866
867 elif pattern[index] == '=':
868 raise error, ('zero-width positive lookahead '
869 'assertion is unsupported')
870
871 elif pattern[index] == '!':
872 raise error, ('zero-width negative lookahead '
873 'assertion is unsupported')
874
875 elif pattern[index] in 'iImMsSxX':
876 while (index < len(pattern)) and (pattern[index] != ')'):
Guido van Rossum65c28b71997-07-11 11:10:44 +0000877 if pattern[index] in 'iI':
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000878 flags = flags | IGNORECASE
Guido van Rossum65c28b71997-07-11 11:10:44 +0000879 elif pattern[index] in 'mM':
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000880 flags = flags | MULTILINE
Guido van Rossum65c28b71997-07-11 11:10:44 +0000881 elif pattern[index] in 'sS':
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000882 flags = flags | DOTALL
883 elif pattern[index] in 'xX':
884 flags = flags | VERBOSE
885 else:
886 raise error, 'unknown flag'
887 index = index + 1
888 index = index + 1
889
890 else:
891 raise error, 'unknown extension'
892
893 else:
894 stack.append([OpenParen(register)])
895 register = register + 1
896
897 elif char == ')':
898 # make one expression out of everything on the stack up to
899 # the marker left by the last parenthesis
900 expr = []
901 while (len(stack) > 0) and (stack[-1][0].name != '('):
902 expr = stack[-1] + expr
903 del stack[-1]
904
905 if len(stack) == 0:
906 raise error, 'too many close parens'
Guido van Rossum04a1d741997-07-15 14:38:13 +0000907
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000908 if len(expr) == 0:
909 raise error, 'nothing inside parens'
910
911 # check to see if alternation used correctly
912 if (expr[-1].name == '|'):
Guido van Rossum04a1d741997-07-15 14:38:13 +0000913 raise error, 'alternate with nothing on the right'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000914
915 # remove markers left by alternation
916 expr = filter(lambda x: x.name != '|', expr)
917
918 # clean up jumps inserted by alternation
919 need_label = 0
920 for i in range(len(expr)):
921 if (expr[i].name == 'jump') and (expr[i].label == -1):
Guido van Rossum04a1d741997-07-15 14:38:13 +0000922 expr[i] = Jump(label)
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000923 need_label = 1
924 if need_label:
925 expr.append(Label(label))
926 label = label + 1
927
Guido van Rossum63e18191997-07-11 11:08:38 +0000928 if stack[-1][0].register > 0:
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000929 expr = [StartMemory(stack[-1][0].register)] + \
930 expr + \
931 [EndMemory(stack[-1][0].register)]
932 del stack[-1]
933 stack.append(expr)
934
935 elif char == '{':
936 if len(stack) == 0:
937 raise error, 'no expression to repeat'
938 end = string.find(pattern, '}', index)
939 if end == -1:
940 raise error, ('no close curly bracket to match'
941 ' open curly bracket')
942
943 fields = map(string.strip,
944 string.split(pattern[index:end], ','))
945 index = end + 1
946
947 minimal = 0
948 if (index < len(pattern)) and (pattern[index] == '?'):
949 minimal = 1
950 index = index + 1
951
952 if len(fields) == 1:
953 # {n} or {n}? (there's really no difference)
954 try:
955 count = string.atoi(fields[0])
956 except ValueError:
957 raise error, ('count must be an integer '
958 'inside curly braces')
959 if count > 65535:
960 raise error, 'repeat count out of range'
961 expr = []
962 while count > 0:
963 expr = expr + stack[-1]
964 count = count - 1
965 del stack[-1]
966 stack.append(expr)
967
968 elif len(fields) == 2:
969 # {n,} or {n,m}
970 if fields[1] == '':
971 # {n,}
972 try:
973 min = string.atoi(fields[0])
974 except ValueError:
975 raise error, ('minimum must be an integer '
976 'inside curly braces')
977 if min > 65535:
978 raise error, 'minimum repeat count out of range'
979
980 expr = []
981 while min > 0:
982 expr = expr + stack[-1]
983 min = min - 1
984 registers = registers_used(stack[-1])
985 if minimal:
986 expr = expr + \
987 ([Jump(label + 1),
988 Label(label)] + \
989 stack[-1] + \
990 [Label(label + 1),
991 FailureJump(label, registers)])
992 else:
993 expr = expr + \
994 ([Label(label),
995 FailureJump(label + 1, registers)] +
996 stack[-1] +
997 [StarJump(label),
998 Label(label + 1)])
999
1000 del stack[-1]
1001 stack.append(expr)
1002 label = label + 2
1003
1004 else:
1005 # {n,m}
1006 try:
1007 min = string.atoi(fields[0])
1008 except ValueError:
1009 raise error, ('minimum must be an integer '
1010 'inside curly braces')
1011 try:
1012 max = string.atoi(fields[1])
1013 except ValueError:
1014 raise error, ('maximum must be an integer '
1015 'inside curly braces')
1016 if min > 65535:
1017 raise error, ('minumim repeat count out '
1018 'of range')
1019 if max > 65535:
1020 raise error, ('maximum repeat count out '
1021 'of range')
1022 if min > max:
1023 raise error, ('minimum repeat count must be '
1024 'less than the maximum '
1025 'repeat count')
1026 expr = []
1027 while min > 0:
1028 expr = expr + stack[-1]
1029 min = min - 1
1030 max = max - 1
1031 if minimal:
1032 while max > 0:
1033 expr = expr + \
1034 [FailureJump(label),
1035 Jump(label + 1),
1036 Label(label)] + \
1037 stack[-1] + \
1038 [Label(label + 1)]
1039 label = label + 2
1040 del stack[-1]
1041 stack.append(expr)
1042 else:
1043 while max > 0:
1044 expr = expr + \
1045 [FailureJump(label)] + \
1046 stack[-1]
1047 max = max - 1
1048 del stack[-1]
1049 stack.append(expr + [Label(label)])
1050 label = label + 1
1051
1052 else:
1053 raise error, ('there need to be one or two fields '
1054 'in a {} expression')
1055 index = end + 1
1056
1057 elif char == '}':
1058 raise error, 'unbalanced close curly brace'
1059
1060 elif char == '*':
1061 # Kleene closure
1062 if len(stack) == 0:
Guido van Rossum5d6de251997-07-11 21:10:17 +00001063 raise error, '* needs something to repeat'
1064 if (stack[-1][0].name == '(') or (stack[-1][0].name == '|'):
1065 raise error, '* needs something to repeat'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001066 registers = registers_used(stack[-1])
1067 if (index < len(pattern)) and (pattern[index] == '?'):
1068 # non-greedy matching
1069 expr = [JumpInstructions(label + 1),
1070 Label(label)] + \
1071 stack[-1] + \
1072 [Label(label + 1),
1073 FailureJump(label)]
1074 index = index + 1
1075 else:
1076 # greedy matching
1077 expr = [Label(label),
1078 FailureJump(label + 1)] + \
1079 stack[-1] + \
1080 [StarJump(label),
1081 Label(label + 1)]
1082 del stack[-1]
1083 stack.append(expr)
1084 label = label + 2
1085
1086 elif char == '+':
1087 # positive closure
1088 if len(stack) == 0:
Guido van Rossum5d6de251997-07-11 21:10:17 +00001089 raise error, '+ needs something to repeat'
1090 if (stack[-1][0].name == '(') or (stack[-1][0].name == '|'):
1091 raise error, '+ needs something to repeat'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001092 registers = registers_used(stack[-1])
1093 if (index < len(pattern)) and (pattern[index] == '?'):
1094 # non-greedy
1095 expr = [Label(label)] + \
1096 stack[-1] + \
1097 [FailureJump(label)]
1098 label = label + 1
1099 index = index + 1
1100 else:
1101 # greedy
1102 expr = [DummyFailureJump(label + 1),
1103 Label(label),
1104 FailureJump(label + 2),
1105 Label(label + 1)] + \
1106 stack[-1] + \
1107 [StarJump(label),
1108 Label(label + 2)]
1109 label = label + 3
1110 del stack[-1]
1111 stack.append(expr)
1112
1113 elif char == '?':
1114 if len(stack) == 0:
1115 raise error, 'need something to be optional'
1116 registers = registers_used(stack[-1])
1117 if (index < len(pattern)) and (pattern[index] == '?'):
1118 # non-greedy matching
1119 expr = [FailureJump(label),
1120 Jump(label + 1),
1121 Label(label)] + \
1122 stack[-1] + \
1123 [Label(label + 1)]
1124 label = label + 2
1125 index = index + 1
1126 else:
1127 # greedy matching
1128 expr = [FailureJump(label)] + \
1129 stack[-1] + \
1130 [Label(label)]
1131 label = label + 1
1132 del stack[-1]
1133 stack.append(expr)
1134
1135 elif char == '.':
1136 if flags & DOTALL:
1137 stack.append(Set(map(chr, range(256))))
1138 else:
1139 stack.append([AnyChar()])
1140
1141 elif char == '^':
1142 if flags & MULTILINE:
1143 stack.append([Bol()])
1144 else:
1145 stack.append([BegBuf()])
1146
1147 elif char == '$':
1148 if flags & MULTILINE:
1149 stack.append([Eol()])
1150 else:
1151 stack.append([EndBuf()])
1152
1153 elif char == '#':
1154 if flags & VERBOSE:
1155 # comment
1156 index = index + 1
1157 end = string.find(pattern, '\n', index)
1158 if end == -1:
1159 index = len(pattern)
1160 else:
1161 index = end + 1
1162 else:
1163 stack.append([Exact(char)])
1164
1165 elif char in string.whitespace:
Guido van Rossum04a1d741997-07-15 14:38:13 +00001166 if not (flags & VERBOSE):
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001167 stack.append([Exact(char)])
1168
1169 elif char == '[':
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001170 # compile character class
1171
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001172 if index >= len(pattern):
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001173 raise error, 'unclosed character class'
1174
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001175 negate = 0
1176 last = ''
1177 set = []
Guido van Rossum04a1d741997-07-15 14:38:13 +00001178
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001179 if pattern[index] == '^':
1180 negate = 1
1181 index = index + 1
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001182 if index >= len(pattern):
1183 raise error, 'unclosed character class'
Guido van Rossum04a1d741997-07-15 14:38:13 +00001184
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001185 if pattern[index] == ']':
1186 set.append(']')
1187 index = index + 1
1188 if index >= len(pattern):
1189 raise error, 'unclosed character class'
1190
1191 elif pattern[index] == '-':
1192 set.append('-')
1193 index = index + 1
1194 if index >= len(pattern):
1195 raise error, 'unclosed character class'
1196
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001197 while (index < len(pattern)) and (pattern[index] != ']'):
1198 next = pattern[index]
1199 index = index + 1
1200 if next == '-':
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001201 if index >= len(pattern):
1202 raise error, 'incomplete range in character class'
Guido van Rossum04a1d741997-07-15 14:38:13 +00001203
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001204 elif pattern[index] == ']':
1205 set.append('-')
Guido van Rossum04a1d741997-07-15 14:38:13 +00001206
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001207 else:
1208 if last == '':
1209 raise error, ('improper use of range in '
1210 'character class')
1211
1212 start = last
1213
1214 if pattern[index] == '\\':
1215 escape_type,
1216 value,
1217 index = expand_escape(pattern,
1218 index + 1,
1219 CHARCLASS)
1220
1221 if escape_type == CHAR:
1222 end = value
1223
1224 else:
1225 raise error, ('illegal escape in character '
1226 'class range')
1227 else:
1228 end = pattern[index]
1229 index = index + 1
1230
1231 if start > end:
1232 raise error, ('range arguments out of order '
1233 'in character class')
1234
1235 for char in map(chr, range(ord(start), ord(end) + 1)):
1236 if char not in set:
1237 set.append(char)
Guido van Rossum04a1d741997-07-15 14:38:13 +00001238
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001239 last = ''
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001240
1241 elif next == '\\':
1242 # expand syntax meta-characters and add to set
1243 if index >= len(pattern):
1244 raise error, 'incomplete set'
Guido van Rossum04a1d741997-07-15 14:38:13 +00001245
1246 escape_type, value, index = expand_escape(pattern,
1247 index,
1248 CHARCLASS)
1249
1250 if escape_type == CHAR:
1251 set.append(value)
1252 last = value
1253
1254 elif escape_type == SET:
1255 for char in value:
1256 if char not in set:
1257 set.append(char)
1258 last = ''
1259
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001260 else:
Guido van Rossum04a1d741997-07-15 14:38:13 +00001261 raise error, 'illegal escape type in character class'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001262
1263 else:
1264 if next not in set:
1265 set.append(next)
1266 last = next
Guido van Rossum04a1d741997-07-15 14:38:13 +00001267
1268 if (index >= len(pattern)) or ( pattern[index] != ']'):
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001269 raise error, 'incomplete set'
1270
1271 index = index + 1
1272
1273 if negate:
1274 notset = []
1275 for char in map(chr, range(256)):
1276 if char not in set:
1277 notset.append(char)
Guido van Rossum04a1d741997-07-15 14:38:13 +00001278 if len(notset) == 0:
1279 raise error, 'empty negated set'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001280 stack.append([Set(notset)])
1281 else:
Guido van Rossum04a1d741997-07-15 14:38:13 +00001282 if len(set) == 0:
1283 raise error, 'empty set'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001284 stack.append([Set(set)])
1285
1286 else:
1287 stack.append([Exact(char)])
1288
1289 code = []
1290 while len(stack) > 0:
1291 if stack[-1][0].name == '(':
1292 raise error, 'too many open parens'
1293 code = stack[-1] + code
1294 del stack[-1]
1295 if len(code) == 0:
1296 raise error, 'no code generated'
1297 if (code[-1].name == '|'):
Guido van Rossum04a1d741997-07-15 14:38:13 +00001298 raise error, 'alternate with nothing on the right'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001299 code = filter(lambda x: x.name != '|', code)
1300 need_label = 0
1301 for i in range(len(code)):
1302 if (code[i].name == 'jump') and (code[i].label == -1):
1303 code[i] = Jump(label)
1304 need_label = 1
1305 if need_label:
1306 code.append(Label(label))
1307 label = label + 1
1308 code.append(End())
1309 return RegexObject(pattern, flags, code, register, groupindex, callouts)
1310
1311if __name__ == '__main__':
1312 print compile('a(b)*')
1313 print compile('a{3}')
1314 print compile('(a){2}')
1315 print compile('a{2,4}')
1316 print compile('a|b')
1317 print compile('a(b|c)')
1318 print compile('a*')
1319 print compile('a+')
1320 print compile('a|b|c')
1321 print compile('a(b|c)*')
1322 print compile('\\n')
1323 print compile('a(?# huh huh)b')
1324 print compile('[a-c\\w]')
1325 print compile('[[]')
1326 print compile('[]]')
1327 print compile('(<hello>a)')
1328 print compile('\Q*\e')
1329 print compile('a{0,}')
1330