blob: 016cd68a5ecda7af8ceef00f6c978fb908c428e9 [file] [log] [blame]
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001#!/usr/bin/env python
2# -*- mode: python -*-
3# $Id$
4
5import string
6import reop
7
8error = 're error'
9
10# compilation flags
11
12IGNORECASE = I = 0x01
13
14MULTILINE = M = 0x02
15DOTALL = S = 0x04
16VERBOSE = X = 0x08
17
18#
19# Initialize syntax table. This information should really come from the
20# syntax table in regexpr.c rather than being duplicated here.
21#
22
23syntax_table = {}
24
25for char in map(chr, range(0, 256)):
26 syntax_table[char] = []
27
28for char in string.lowercase:
29 syntax_table[char].append('word')
30
31for char in string.uppercase:
32 syntax_table[char].append('word')
33
34for char in string.digits:
35 syntax_table[char].append('word')
36 syntax_table[char].append('digit')
37
38for char in string.whitespace:
39 syntax_table[char].append('whitespace')
40
41syntax_table['_'].append('word')
42
43#
44#
45#
46
47def match(pattern, string, flags=0):
48 return compile(pattern, flags).match(string)
49
50def search(pattern, string, flags=0):
51 return compile(pattern, flags).search(string)
52
53def sub(pattern, repl, string, count=0):
54 pass
55
56def subn(pattern, repl, string, count=0):
57 pass
58
59def split(string, pattern, maxsplit=0):
60 pass
61
62#
63#
64#
65
66class RegexObject:
67 def __init__(self, pattern, flags, code, num_regs, groupindex, callouts):
68 self.code = code
69 self.num_regs = num_regs
70 self.flags = flags
71 self.pattern = pattern
72 self.groupindex = groupindex
73 self.callouts = callouts
74 self.fastmap = build_fastmap(code)
75 if code[0].name == 'bol':
76 self.anchor = 1
77 elif code[0].name == 'begbuf':
78 self.anchor = 2
79 else:
80 self.anchor = 0
81 self.buffer = assemble(code)
82 def match(self, string, pos=0):
83 regs = reop.match(self.buffer,
84 self.num_regs,
85 self.flags,
86 self.fastmap.can_be_null,
87 self.fastmap.fastmap(),
88 self.anchor,
89 string,
90 pos)
91 if regs is None:
92 return None
93 return MatchObject(self,
94 string,
95 pos,
96 regs)
97 def search(self, string, pos=0):
98 regs = reop.search(self.buffer,
99 self.num_regs,
100 self.flags,
101 self.fastmap.can_be_null,
102 self.fastmap.fastmap(),
103 self.anchor,
104 string,
105 pos)
106 if regs is None:
107 return None
108 return MatchObject(self,
109 string,
110 pos,
111 regs)
112
113class MatchObject:
114 def __init__(self, re, string, pos, regs):
115 self.re = re
116 self.string = string
117 self.pos = pos
118 self.regs = regs
119 def start(self, i):
120 if type(i) == type(''):
121 try:
122 i = self.re.groupindex[i]
123 except (KeyError, TypeError):
124 raise IndexError
125 return self.regs[i][0]
126 def end(self, i):
127 if type(i) == type(''):
128 try:
129 i = self.re.groupindex[i]
130 except (KeyError, TypeError):
131 raise IndexError
132 return self.regs[i][1]
133 def span(self, i):
134 if type(i) == type(''):
135 try:
136 i = self.re.groupindex[i]
137 except (KeyError, TypeError):
138 raise IndexError
139 return self.regs[i]
140 def group(i):
141 if type(i) == type(''):
142 try:
143 i = self.re.groupindex[i]
144 except (KeyError, TypeError):
145 raise IndexError
146 return self.string[self.regs[i][0]:self.regs[i][1]]
147
148#
149# A set of classes to make assembly a bit easier, if a bit verbose.
150#
151
152class Instruction:
153 def __init__(self, opcode, size=1):
154 self.opcode = opcode
155 self.size = size
156 def assemble(self, position, labels):
157 return self.opcode
158 def __repr__(self):
159 return '%-15s' % (self.name)
160
161class FunctionCallout(Instruction):
162 name = 'function'
163 def __init__(self, function):
164 self.function = function
165 Instruction.__init__(self, chr(22), 2 + len(self.function))
166 def assemble(self, position, labels):
167 return self.opcode + chr(len(self.function)) + self.function
168 def __repr__(self):
169 return '%-15s %-10s' % (self.name, self.function)
170
171class End(Instruction):
172 name = 'end'
173 def __init__(self):
174 Instruction.__init__(self, chr(0))
175
176class Bol(Instruction):
177 name = 'bol'
178 def __init__(self):
179 self.name = 'bol'
180 Instruction.__init__(self, chr(1))
181
182class Eol(Instruction):
183 name = 'eol'
184 def __init__(self):
185 Instruction.__init__(self, chr(2))
186
187class Set(Instruction):
188 name = 'set'
189 def __init__(self, set):
190 self.set = set
191 Instruction.__init__(self, chr(3), 33)
192 def assemble_set(self, position, labels):
193 result = self.opcode
194 temp = 0
195 for i, c in map(lambda x: (x, chr(x)), range(256)):
196 if c in self.set[2]:
197 temp = temp | (1 << (i & 7))
198 if (i % 8) == 7:
199 result = result + chr(temp)
200 temp = 0
201 return result
202 def __repr__(self):
203 result = '%-15s' % (self.name)
204 self.set.sort()
205 for char in self.set:
206 result = result + `char`
207 return result
208
209class Exact(Instruction):
210 name = 'exact'
211 def __init__(self, char):
212 self.char = char
213 Instruction.__init__(self, chr(4), 2)
214 def assemble(self, position, labels):
215 return self.opcode + self.char
216 def __repr__(self):
217 return '%-15s %s' % (self.name, `self.char`)
218
219class AnyChar(Instruction):
220 name = 'anychar'
221 def __init__(self):
222 Instruction.__init__(self, chr(5))
223 def assemble(self, position, labels):
224 return self.opcode
225
226class MemoryInstruction(Instruction):
227 def __init__(self, opcode, register):
228 self.register = register
229 Instruction.__init__(self, opcode, 2)
230 def assemble(self, position, labels):
231 return self.opcode + chr(self.register)
232 def __repr__(self):
233 return '%-15s %i' % (self.name, self.register)
234
235class StartMemory(MemoryInstruction):
236 name = 'start_memory'
237 def __init__(self, register):
238 MemoryInstruction.__init__(self, chr(6), register)
239
240class EndMemory(MemoryInstruction):
241 name = 'end_memory'
242 def __init__(self, register):
243 MemoryInstruction.__init__(self, chr(7), register)
244
245class MatchMemory(MemoryInstruction):
246 name = 'match_memory'
247 def __init__(self, register):
248 MemoryInstruction.__init__(self, chr(8), register)
249
250class JumpInstruction(Instruction):
251 def __init__(self, opcode, label):
252 self.label = label
253 Instruction.__init__(self, opcode, 3)
254 def compute_offset(self, start, dest):
255 return dest - (start + 3)
256 def pack_offset(self, offset):
257 if offset > 32767:
258 raise error, 'offset out of range (pos)'
259 elif offset < -32768:
260 raise error, 'offset out of range (neg)'
261 elif offset < 0:
262 offset = offset + 65536
263 return chr(offset & 0xff) + chr((offset >> 8) & 0xff)
264 def assemble(self, position, labels):
265 return self.opcode + \
266 self.pack_offset(self.compute_offset(position,
267 labels[self.label]))
268 def __repr__(self):
269 return '%-15s %i' % (self.name, self.label)
270
271class Jump(JumpInstruction):
272 name = 'jump'
273 def __init__(self, label):
274 JumpInstruction.__init__(self, chr(9), label)
275
276class StarJump(JumpInstruction):
277 name = 'star_jump'
278 def __init__(self, label):
279 JumpInstruction.__init__(self, chr(10), label)
280
281class FailureJump(JumpInstruction):
282 name = 'failure_jump'
283 def __init__(self, label):
284 JumpInstruction.__init__(self, chr(11), label)
285
286class UpdateFailureJump(JumpInstruction):
287 name = 'update_failure_jump'
288 def __init__(self, label):
289 JumpInstruction.__init__(self, chr(12), label)
290
291class DummyFailureJump(JumpInstruction):
292 name = 'update_failure_jump'
293 def __init__(self, label):
294 JumpInstruction.__init__(self, chr(13), label)
295
296class BegBuf(Instruction):
297 name = 'begbuf'
298 def __init__(self):
299 Instruction.__init__(self, chr(14))
300
301class EndBuf(Instruction):
302 name = 'endbuf'
303 def __init__(self):
304 Instruction.__init__(self, chr(15))
305
306class WordBeg(Instruction):
307 name = 'wordbeg'
308 def __init__(self):
309 Instruction.__init__(self, chr(16))
310
311class WordEnd(Instruction):
312 name = 'wordend'
313 def __init__(self):
314 Instruction.__init__(self, chr(17))
315
316class WordBound(Instruction):
317 name = 'wordbound'
318 def __init__(self):
319 Instruction.__init__(self, chr(18))
320
321class NotWordBound(Instruction):
322 name = 'notwordbound'
323 def __init__(self):
324 Instruction.__init__(self, chr(18))
325
326class SyntaxSpec(Instruction):
327 name = 'syntaxspec'
328 def __init__(self, syntax):
329 self.syntax = syntax
330 Instruction.__init__(self, chr(20), 2)
331 def assemble(self, postition, labels):
332 return self.opcode + chr(self.syntax)
333
334class SyntaxSpec(Instruction):
335 name = 'notsyntaxspec'
336 def __init__(self, syntax):
337 self.syntax = syntax
338 Instruction.__init__(self, chr(21), 2)
339 def assemble(self, postition, labels):
340 return self.opcode + chr(self.syntax)
341
342class Label(Instruction):
343 name = 'label'
344 def __init__(self, label):
345 self.label = label
346 Instruction.__init__(self, '', 0)
347 def __repr__(self):
348 return '%-15s %i' % (self.name, self.label)
349
350class OpenParen(Instruction):
351 name = '('
352 def __init__(self, register):
353 self.register = register
354 Instruction.__init__(self, '', 0)
355
356class Alternation(Instruction):
357 name = '|'
358 def __init__(self):
359 Instruction.__init__(self, '', 0)
360
361#
362#
363#
364
365def assemble(instructions):
366 labels = {}
367 position = 0
368 pass1 = []
369 for instruction in instructions:
370 if instruction.name == 'label':
371 labels[instruction.label] = position
372 else:
373 pass1.append((position, instruction))
374 position = position + instruction.size
375 pass2 = ''
376 for position, instruction in pass1:
377 pass2 = pass2 + instruction.assemble(position, labels)
378 return pass2
379
380#
381#
382#
383
384def escape(pattern):
385 result = ''
386 for char in pattern:
387 if 'word' not in syntax_table[char]:
388 result = result + '\\' + char
389 else:
390 result = result + char
391 return result
392
393#
394#
395#
396
397def registers_used(instructions):
398 result = []
399 for instruction in instructions:
400 if (instruction.name in ['set_memory', 'end_memory']) and \
401 (instruction.register not in result):
402 result.append(instruction.register)
403 return result
404
405#
406#
407#
408
409class Fastmap:
410 def __init__(self):
411 self.map = ['\000']*256
412 self.can_be_null = 0
413 def add(self, char):
414 self.map[ord(char)] = '\001'
415 def fastmap(self):
416 return string.join(self.map, '')
417 def __getitem__(self, char):
418 return ord(self.map[ord(char)])
419 def __repr__(self):
420 self.map.sort()
421 return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')'
422
423#
424#
425#
426
427def find_label(code, label):
428 line = 0
429 for instruction in code:
430 if (instruction.name == 'label') and (instruction.label == label):
431 return line + 1
432 line = line + 1
433
434def build_fastmap_aux(code, pos, visited, fastmap):
435 if visited[pos]:
436 return
437 while 1:
438 instruction = code[pos]
439 visited[pos] = 1
440 pos = pos + 1
441 if instruction.name == 'end':
442 fastmap.can_be_null = 1
443 return
444 elif instruction.name == 'syntaxspec':
445 for char in map(chr, range(256)):
446 if instruction.syntax in syntax_table[char]:
447 fastmap.add(char)
448 return
449 elif instruction.name == 'notsyntaxspec':
450 for char in map(chr, range(256)):
451 if instruction.syntax not in syntax_table[char]:
452 fastmap.add(char)
453 return
454 elif instruction.name == 'eol':
455 fastmap.add('\n')
456 if fastmap.can_be_null == 0:
457 fastmap.can_be_null = 2
458 return
459 elif instruction.name == 'set':
460 for char in instruction.set:
461 fastmap.add(char)
462 return
463 elif instruction.name == 'exact':
464 fastmap.add(instruction.char)
465 elif instruction.name == 'anychar':
466 for char in map(chr, range(256)):
467 if char != '\n':
468 fastmap.add(char)
469 return
470 elif instruction.name == 'match_memory':
471 for char in map(chr, range(256)):
472 fastmap.add(char)
473 fastmap.can_be_null = 1
474 return
475 elif instruction.name in ['jump', 'dummy_failure_jump', \
476 'update_failure_jump', 'star_jump']:
477 pos = find_label(code, instruction.label)
478 if visited[pos]:
479 return
480 visited[pos] = 1
481 elif instruction.name == 'failure_jump':
482 build_fastmap_aux(code,
483 find_label(code, instruction.label),
484 visited,
485 fastmap)
486 elif instruction.name == 'function':
487 for char in map(chr, range(256)):
488 fastmap.add(char)
489 fastmap.can_be_null = 1
490 return
491
492def build_fastmap(code, pos=0):
493 visited = [0] * len(code)
494 fastmap = Fastmap()
495 build_fastmap_aux(code, pos, visited, fastmap)
496 return fastmap
497
498#
499#
500#
501
502def compile(pattern, flags=0):
503 stack = []
504 index = 0
505 label = 0
506 register = 1
507 groupindex = {}
508 callouts = []
509 while (index < len(pattern)):
510 char = pattern[index]
511 index = index + 1
512 if char == '\\':
513 if index < len(pattern):
514 next = pattern[index]
515 index = index + 1
516 if next == 't':
517 stack.append([Exact(chr(9))])
518
519 elif next == 'n':
520 stack.append([Exact(chr(10))])
521
522 elif next == 'r':
523 stack.append([Exact(chr(13))])
524
525 elif next == 'f':
526 stack.append([Exact(chr(12))])
527
528 elif next == 'a':
529 stack.append([Exact(chr(7))])
530
531 elif next == 'e':
532 stack.append([Exact(chr(27))])
533
534 elif next in '0123456789':
535 value = next
536 while (index < len(pattern)) and \
537 (pattern[index] in string.digits):
538 value = value + pattern[index]
539 index = index + 1
540 if (len(value) == 3) or \
541 ((len(value) == 2) and (value[0] == '0')):
542 value = string.atoi(value, 8)
543 if value > 255:
544 raise error, 'octal char out of range'
545 stack.append([Exact(chr(value))])
546 elif value == '0':
547 stack.append([Exact(chr(0))])
548 elif len(value) > 3:
549 raise error, 'too many digits'
550 else:
551 value = string.atoi(value)
552 if value >= register:
553 raise error, ('cannot reference a register '
554 'not yet used')
555 elif value == 0:
556 raise error, ('register 0 cannot be used '
557 'during match')
558 stack.append([MatchMemory(value)])
559
560 elif next == 'x':
561 value = ''
562 while (index < len(pattern)) and \
563 (pattern[index] in string.hexdigits):
564 value = value + pattern[index]
565 index = index + 1
566 value = string.atoi(value, 16)
567 if value > 255:
568 raise error, 'hex char out of range'
569 stack.append([Exact(chr(value))])
570
571 elif next == 'c':
572 if index >= len(pattern):
573 raise error, '\\c at end of re'
574 elif pattern[index] in 'abcdefghijklmnopqrstuvwxyz':
575 stack.append(Exact(chr(ord(pattern[index]) -
576 ord('a') + 1)))
577 else:
578 stack.append(Exact(chr(ord(pattern[index]) ^ 64)))
579 index = index + 1
580
581 elif next == 'A':
582 stack.append([BegBuf()])
583
584 elif next == 'Z':
585 stack.append([EndBuf()])
586
587 elif next == 'b':
588 stack.append([WordBound()])
589
590 elif next == 'B':
591 stack.append([NotWordBound()])
592
593 elif next == 'w':
594 stack.append([SyntaxSpec('word')])
595
596 elif next == 'W':
597 stack.append([NotSyntaxSpec('word')])
598
599 elif next == 's':
600 stack.append([SyntaxSpec('whitespace')])
601
602 elif next == 'S':
603 stack.append([NotSyntaxSpec('whitespace')])
604
605 elif next == 'd':
606 stack.append([SyntaxSpec('digit')])
607
608 elif next == 'D':
609 stack.append([NotSyntaxSpec('digit')])
610
611 elif next in 'GluLUQE':
612 # some perl-isms that we don't support
613 raise error, '\\' + next + ' not supported'
614
615 else:
616 stack.append([Exact(pattern[index])])
617
618 else:
619 raise error, 'backslash at the end of a string'
620
621 elif char == '|':
622 if len(stack) == 0:
623 raise error, 'nothing to alternate'
624 expr = []
625 while (len(stack) != 0) and \
626 (stack[-1][0].name != '(') and \
627 (stack[-1][0].name != '|'):
628 expr = stack[-1] + expr
629 del stack[-1]
630 stack.append([FailureJump(label)] + \
631 expr + \
632 [Jump(-1),
633 Label(label)])
634 stack.append([('|',)])
635 label = label + 1
636
637 elif char == '(':
638 if index >= len(pattern):
639 raise error, 'no matching close paren'
640
641 elif pattern[index] == '?':
642 # Perl style (?...) extensions
643 index = index + 1
644 if index >= len(pattern):
645 raise error, 'extension ends prematurely'
646
647 elif pattern[index] == 'P':
648 # Python extensions
649 index = index + 1
650 if index >= len(pattern):
651 raise error, 'extension ends prematurely'
652
653 elif pattern[index] == '<':
654 # Handle Python symbolic group names (?<...>...)
655 index = index + 1
656 end = string.find(pattern, '>', index)
657 if end == -1:
658 raise error, 'no end to symbolic group name'
659 name = pattern[index:end]
660 # XXX check syntax of name
661 index = end + 1
662 groupindex[name] = register
663 stack.append([OpenParen(register)])
664 register = register + 1
665
666 elif pattern[index] == '=':
667 # backreference to symbolic group name
668 if index >= len(pattern):
669 raise error, '(?P= at the end of the pattern'
670 start = index + 1
671 end = string.find(pattern, ')', start)
672 if end == -1:
673 raise error, 'no ) to end symbolic group name'
674 name = pattern[start:end]
675 if name not in groupindex:
676 raise error, ('symbolic group name ' + name + \
677 ' has not been used yet')
678 stack.append([MatchMemory(groupindex[name])])
679 index = end + 1
680
681 elif pattern[index] == '!':
682 # function callout
683 if index >= len(pattern):
684 raise error, 'no function callout name'
685 start = index + 1
686 end = string.find(pattern, ')', start)
687 if end == -1:
688 raise error, 'no ) to end function callout name'
689 name = pattern[start:end]
690 if name not in callouts:
691 raise error, ('function callout name not listed '
692 'in callouts dict')
693 stack.append([FunctionCallout(name)])
694
695 else:
696 raise error, 'unknown Python extension'
697
698 elif pattern[index] == ':':
699 # grouping, but no registers
700 index = index + 1
701 stack.append([('(', -1)])
702
703 elif pattern[index] == '#':
704 # comment
705 index = index + 1
706 end = string.find(pattern, ')', index)
707 if end == -1:
708 raise error, 'no end to comment'
709 index = end + 1
710
711 elif pattern[index] == '=':
712 raise error, ('zero-width positive lookahead '
713 'assertion is unsupported')
714
715 elif pattern[index] == '!':
716 raise error, ('zero-width negative lookahead '
717 'assertion is unsupported')
718
719 elif pattern[index] in 'iImMsSxX':
720 while (index < len(pattern)) and (pattern[index] != ')'):
721 if pattern[index] == 'iI':
722 flags = flags | IGNORECASE
723 elif pattern[index] == 'mM':
724 flags = flags | MULTILINE
725 elif pattern[index] == 'sS':
726 flags = flags | DOTALL
727 elif pattern[index] in 'xX':
728 flags = flags | VERBOSE
729 else:
730 raise error, 'unknown flag'
731 index = index + 1
732 index = index + 1
733
734 else:
735 raise error, 'unknown extension'
736
737 else:
738 stack.append([OpenParen(register)])
739 register = register + 1
740
741 elif char == ')':
742 # make one expression out of everything on the stack up to
743 # the marker left by the last parenthesis
744 expr = []
745 while (len(stack) > 0) and (stack[-1][0].name != '('):
746 expr = stack[-1] + expr
747 del stack[-1]
748
749 if len(stack) == 0:
750 raise error, 'too many close parens'
751 if len(expr) == 0:
752 raise error, 'nothing inside parens'
753
754 # check to see if alternation used correctly
755 if (expr[-1].name == '|'):
756 raise error, 'alternation with nothing on the right'
757
758 # remove markers left by alternation
759 expr = filter(lambda x: x.name != '|', expr)
760
761 # clean up jumps inserted by alternation
762 need_label = 0
763 for i in range(len(expr)):
764 if (expr[i].name == 'jump') and (expr[i].label == -1):
765 expr[i] = JumpOpcode(label)
766 need_label = 1
767 if need_label:
768 expr.append(Label(label))
769 label = label + 1
770
771 if stack[-1][0][1] > 0:
772 expr = [StartMemory(stack[-1][0].register)] + \
773 expr + \
774 [EndMemory(stack[-1][0].register)]
775 del stack[-1]
776 stack.append(expr)
777
778 elif char == '{':
779 if len(stack) == 0:
780 raise error, 'no expression to repeat'
781 end = string.find(pattern, '}', index)
782 if end == -1:
783 raise error, ('no close curly bracket to match'
784 ' open curly bracket')
785
786 fields = map(string.strip,
787 string.split(pattern[index:end], ','))
788 index = end + 1
789
790 minimal = 0
791 if (index < len(pattern)) and (pattern[index] == '?'):
792 minimal = 1
793 index = index + 1
794
795 if len(fields) == 1:
796 # {n} or {n}? (there's really no difference)
797 try:
798 count = string.atoi(fields[0])
799 except ValueError:
800 raise error, ('count must be an integer '
801 'inside curly braces')
802 if count > 65535:
803 raise error, 'repeat count out of range'
804 expr = []
805 while count > 0:
806 expr = expr + stack[-1]
807 count = count - 1
808 del stack[-1]
809 stack.append(expr)
810
811 elif len(fields) == 2:
812 # {n,} or {n,m}
813 if fields[1] == '':
814 # {n,}
815 try:
816 min = string.atoi(fields[0])
817 except ValueError:
818 raise error, ('minimum must be an integer '
819 'inside curly braces')
820 if min > 65535:
821 raise error, 'minimum repeat count out of range'
822
823 expr = []
824 while min > 0:
825 expr = expr + stack[-1]
826 min = min - 1
827 registers = registers_used(stack[-1])
828 if minimal:
829 expr = expr + \
830 ([Jump(label + 1),
831 Label(label)] + \
832 stack[-1] + \
833 [Label(label + 1),
834 FailureJump(label, registers)])
835 else:
836 expr = expr + \
837 ([Label(label),
838 FailureJump(label + 1, registers)] +
839 stack[-1] +
840 [StarJump(label),
841 Label(label + 1)])
842
843 del stack[-1]
844 stack.append(expr)
845 label = label + 2
846
847 else:
848 # {n,m}
849 try:
850 min = string.atoi(fields[0])
851 except ValueError:
852 raise error, ('minimum must be an integer '
853 'inside curly braces')
854 try:
855 max = string.atoi(fields[1])
856 except ValueError:
857 raise error, ('maximum must be an integer '
858 'inside curly braces')
859 if min > 65535:
860 raise error, ('minumim repeat count out '
861 'of range')
862 if max > 65535:
863 raise error, ('maximum repeat count out '
864 'of range')
865 if min > max:
866 raise error, ('minimum repeat count must be '
867 'less than the maximum '
868 'repeat count')
869 expr = []
870 while min > 0:
871 expr = expr + stack[-1]
872 min = min - 1
873 max = max - 1
874 if minimal:
875 while max > 0:
876 expr = expr + \
877 [FailureJump(label),
878 Jump(label + 1),
879 Label(label)] + \
880 stack[-1] + \
881 [Label(label + 1)]
882 label = label + 2
883 del stack[-1]
884 stack.append(expr)
885 else:
886 while max > 0:
887 expr = expr + \
888 [FailureJump(label)] + \
889 stack[-1]
890 max = max - 1
891 del stack[-1]
892 stack.append(expr + [Label(label)])
893 label = label + 1
894
895 else:
896 raise error, ('there need to be one or two fields '
897 'in a {} expression')
898 index = end + 1
899
900 elif char == '}':
901 raise error, 'unbalanced close curly brace'
902
903 elif char == '*':
904 # Kleene closure
905 if len(stack) == 0:
906 raise error, 'the Kleene closure needs something to repeat'
907 registers = registers_used(stack[-1])
908 if (index < len(pattern)) and (pattern[index] == '?'):
909 # non-greedy matching
910 expr = [JumpInstructions(label + 1),
911 Label(label)] + \
912 stack[-1] + \
913 [Label(label + 1),
914 FailureJump(label)]
915 index = index + 1
916 else:
917 # greedy matching
918 expr = [Label(label),
919 FailureJump(label + 1)] + \
920 stack[-1] + \
921 [StarJump(label),
922 Label(label + 1)]
923 del stack[-1]
924 stack.append(expr)
925 label = label + 2
926
927 elif char == '+':
928 # positive closure
929 if len(stack) == 0:
930 raise error, 'the positive closure needs something to repeat'
931 registers = registers_used(stack[-1])
932 if (index < len(pattern)) and (pattern[index] == '?'):
933 # non-greedy
934 expr = [Label(label)] + \
935 stack[-1] + \
936 [FailureJump(label)]
937 label = label + 1
938 index = index + 1
939 else:
940 # greedy
941 expr = [DummyFailureJump(label + 1),
942 Label(label),
943 FailureJump(label + 2),
944 Label(label + 1)] + \
945 stack[-1] + \
946 [StarJump(label),
947 Label(label + 2)]
948 label = label + 3
949 del stack[-1]
950 stack.append(expr)
951
952 elif char == '?':
953 if len(stack) == 0:
954 raise error, 'need something to be optional'
955 registers = registers_used(stack[-1])
956 if (index < len(pattern)) and (pattern[index] == '?'):
957 # non-greedy matching
958 expr = [FailureJump(label),
959 Jump(label + 1),
960 Label(label)] + \
961 stack[-1] + \
962 [Label(label + 1)]
963 label = label + 2
964 index = index + 1
965 else:
966 # greedy matching
967 expr = [FailureJump(label)] + \
968 stack[-1] + \
969 [Label(label)]
970 label = label + 1
971 del stack[-1]
972 stack.append(expr)
973
974 elif char == '.':
975 if flags & DOTALL:
976 stack.append(Set(map(chr, range(256))))
977 else:
978 stack.append([AnyChar()])
979
980 elif char == '^':
981 if flags & MULTILINE:
982 stack.append([Bol()])
983 else:
984 stack.append([BegBuf()])
985
986 elif char == '$':
987 if flags & MULTILINE:
988 stack.append([Eol()])
989 else:
990 stack.append([EndBuf()])
991
992 elif char == '#':
993 if flags & VERBOSE:
994 # comment
995 index = index + 1
996 end = string.find(pattern, '\n', index)
997 if end == -1:
998 index = len(pattern)
999 else:
1000 index = end + 1
1001 else:
1002 stack.append([Exact(char)])
1003
1004 elif char in string.whitespace:
1005 if flags & VERBOSE:
1006 stack.append([Exact(char)])
1007
1008 elif char == '[':
1009 if index >= len(pattern):
1010 raise error, 'incomplete set'
1011 negate = 0
1012 last = ''
1013 set = []
1014 if pattern[index] == '^':
1015 negate = 1
1016 index = index + 1
1017 if index >= len(pattern):
1018 raise error, 'incomplete set'
1019 if pattern[index] in ']-':
1020 set.append(pattern[index])
1021 last = pattern[index]
1022 index = index + 1
1023 while (index < len(pattern)) and (pattern[index] != ']'):
1024 next = pattern[index]
1025 index = index + 1
1026 if next == '-':
1027 if (index >= len(pattern)) or (pattern[index] == ']'):
1028 raise error, 'incomplete range in set'
1029 if last > pattern[index]:
1030 raise error, 'range arguments out of order in set'
1031 for next in map(chr, \
1032 range(ord(last), \
1033 ord(pattern[index]) + 1)):
1034 if next not in set:
1035 set.append(next)
1036 last = ''
1037 index = index + 1
1038
1039 elif next == '\\':
1040 # expand syntax meta-characters and add to set
1041 if index >= len(pattern):
1042 raise error, 'incomplete set'
1043 elif (pattern[index] == ']'):
1044 raise error, 'backslash at the end of a set'
1045 elif pattern[index] == 'w':
1046 for next in syntax_table.keys():
1047 if 'word' in syntax_table[next]:
1048 set.append(next)
1049 elif pattern[index] == 'W':
1050 for next in syntax_table.keys():
1051 if 'word' not in syntax_table[next]:
1052 set.append(next)
1053 elif pattern[index] == 'd':
1054 for next in syntax_table.keys():
1055 if 'digit' in syntax_table[next]:
1056 set.append(next)
1057 elif pattern[index] == 'D':
1058 for next in syntax_table.keys():
1059 if 'digit' not in syntax_table[next]:
1060 set.append(next)
1061 elif pattern[index] == 's':
1062 for next in syntax_table.keys():
1063 if 'whitespace' in syntax_table[next]:
1064 set.append(next)
1065 elif pattern[index] == 'S':
1066 for next in syntax_table.keys():
1067 if 'whitespace' not in syntax_table[next]:
1068 set.append(next)
1069 else:
1070 raise error, 'unknown meta in set'
1071 last = ''
1072 index = index + 1
1073
1074 else:
1075 if next not in set:
1076 set.append(next)
1077 last = next
1078
1079 if pattern[index] != ']':
1080 raise error, 'incomplete set'
1081
1082 index = index + 1
1083
1084 if negate:
1085 notset = []
1086 for char in map(chr, range(256)):
1087 if char not in set:
1088 notset.append(char)
1089 stack.append([Set(notset)])
1090 else:
1091 stack.append([Set(set)])
1092
1093 else:
1094 stack.append([Exact(char)])
1095
1096 code = []
1097 while len(stack) > 0:
1098 if stack[-1][0].name == '(':
1099 raise error, 'too many open parens'
1100 code = stack[-1] + code
1101 del stack[-1]
1102 if len(code) == 0:
1103 raise error, 'no code generated'
1104 if (code[-1].name == '|'):
1105 raise error, 'alternation with nothing on the right'
1106 code = filter(lambda x: x.name != '|', code)
1107 need_label = 0
1108 for i in range(len(code)):
1109 if (code[i].name == 'jump') and (code[i].label == -1):
1110 code[i] = Jump(label)
1111 need_label = 1
1112 if need_label:
1113 code.append(Label(label))
1114 label = label + 1
1115 code.append(End())
1116 return RegexObject(pattern, flags, code, register, groupindex, callouts)
1117
1118if __name__ == '__main__':
1119 print compile('a(b)*')
1120 print compile('a{3}')
1121 print compile('(a){2}')
1122 print compile('a{2,4}')
1123 print compile('a|b')
1124 print compile('a(b|c)')
1125 print compile('a*')
1126 print compile('a+')
1127 print compile('a|b|c')
1128 print compile('a(b|c)*')
1129 print compile('\\n')
1130 print compile('a(?# huh huh)b')
1131 print compile('[a-c\\w]')
1132 print compile('[[]')
1133 print compile('[]]')
1134 print compile('(<hello>a)')
1135 print compile('\Q*\e')
1136 print compile('a{0,}')
1137