blob: 60dcb8b3b05ec6161320d968911d1f9f753f4140 [file] [log] [blame]
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001#!/usr/bin/env python
2# -*- mode: python -*-
3# $Id$
4
5import string
6import reop
7
8error = 're error'
9
10# compilation flags
11
12IGNORECASE = I = 0x01
13
14MULTILINE = M = 0x02
15DOTALL = S = 0x04
16VERBOSE = X = 0x08
17
18#
19# Initialize syntax table. This information should really come from the
20# syntax table in regexpr.c rather than being duplicated here.
21#
22
23syntax_table = {}
24
25for char in map(chr, range(0, 256)):
26 syntax_table[char] = []
27
28for char in string.lowercase:
29 syntax_table[char].append('word')
30
31for char in string.uppercase:
32 syntax_table[char].append('word')
33
34for char in string.digits:
35 syntax_table[char].append('word')
36 syntax_table[char].append('digit')
37
38for char in string.whitespace:
39 syntax_table[char].append('whitespace')
40
41syntax_table['_'].append('word')
42
43#
44#
45#
46
47def match(pattern, string, flags=0):
48 return compile(pattern, flags).match(string)
49
50def search(pattern, string, flags=0):
51 return compile(pattern, flags).search(string)
52
53def sub(pattern, repl, string, count=0):
54 pass
55
56def subn(pattern, repl, string, count=0):
57 pass
58
59def split(string, pattern, maxsplit=0):
60 pass
61
62#
63#
64#
65
66class RegexObject:
67 def __init__(self, pattern, flags, code, num_regs, groupindex, callouts):
68 self.code = code
69 self.num_regs = num_regs
70 self.flags = flags
71 self.pattern = pattern
72 self.groupindex = groupindex
73 self.callouts = callouts
74 self.fastmap = build_fastmap(code)
75 if code[0].name == 'bol':
76 self.anchor = 1
77 elif code[0].name == 'begbuf':
78 self.anchor = 2
79 else:
80 self.anchor = 0
81 self.buffer = assemble(code)
82 def match(self, string, pos=0):
83 regs = reop.match(self.buffer,
84 self.num_regs,
85 self.flags,
86 self.fastmap.can_be_null,
87 self.fastmap.fastmap(),
88 self.anchor,
89 string,
90 pos)
91 if regs is None:
92 return None
93 return MatchObject(self,
94 string,
95 pos,
96 regs)
97 def search(self, string, pos=0):
98 regs = reop.search(self.buffer,
99 self.num_regs,
100 self.flags,
101 self.fastmap.can_be_null,
102 self.fastmap.fastmap(),
103 self.anchor,
104 string,
105 pos)
106 if regs is None:
107 return None
108 return MatchObject(self,
109 string,
110 pos,
111 regs)
112
113class MatchObject:
114 def __init__(self, re, string, pos, regs):
115 self.re = re
116 self.string = string
117 self.pos = pos
118 self.regs = regs
119 def start(self, i):
120 if type(i) == type(''):
121 try:
122 i = self.re.groupindex[i]
123 except (KeyError, TypeError):
124 raise IndexError
125 return self.regs[i][0]
126 def end(self, i):
127 if type(i) == type(''):
128 try:
129 i = self.re.groupindex[i]
130 except (KeyError, TypeError):
131 raise IndexError
132 return self.regs[i][1]
133 def span(self, i):
134 if type(i) == type(''):
135 try:
136 i = self.re.groupindex[i]
137 except (KeyError, TypeError):
138 raise IndexError
139 return self.regs[i]
Guido van Rossum63e18191997-07-11 11:08:38 +0000140 def group(self, i):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000141 if type(i) == type(''):
142 try:
143 i = self.re.groupindex[i]
144 except (KeyError, TypeError):
145 raise IndexError
146 return self.string[self.regs[i][0]:self.regs[i][1]]
147
148#
149# A set of classes to make assembly a bit easier, if a bit verbose.
150#
151
152class Instruction:
153 def __init__(self, opcode, size=1):
154 self.opcode = opcode
155 self.size = size
156 def assemble(self, position, labels):
157 return self.opcode
158 def __repr__(self):
159 return '%-15s' % (self.name)
160
161class FunctionCallout(Instruction):
162 name = 'function'
163 def __init__(self, function):
164 self.function = function
165 Instruction.__init__(self, chr(22), 2 + len(self.function))
166 def assemble(self, position, labels):
167 return self.opcode + chr(len(self.function)) + self.function
168 def __repr__(self):
169 return '%-15s %-10s' % (self.name, self.function)
170
171class End(Instruction):
172 name = 'end'
173 def __init__(self):
174 Instruction.__init__(self, chr(0))
175
176class Bol(Instruction):
177 name = 'bol'
178 def __init__(self):
179 self.name = 'bol'
180 Instruction.__init__(self, chr(1))
181
182class Eol(Instruction):
183 name = 'eol'
184 def __init__(self):
185 Instruction.__init__(self, chr(2))
186
187class Set(Instruction):
188 name = 'set'
189 def __init__(self, set):
190 self.set = set
Guido van Rossum63e18191997-07-11 11:08:38 +0000191 print set
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000192 Instruction.__init__(self, chr(3), 33)
Guido van Rossum63e18191997-07-11 11:08:38 +0000193 def assemble(self, position, labels):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000194 result = self.opcode
195 temp = 0
196 for i, c in map(lambda x: (x, chr(x)), range(256)):
Guido van Rossum63e18191997-07-11 11:08:38 +0000197 if c in self.set:
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000198 temp = temp | (1 << (i & 7))
199 if (i % 8) == 7:
200 result = result + chr(temp)
201 temp = 0
202 return result
203 def __repr__(self):
204 result = '%-15s' % (self.name)
205 self.set.sort()
206 for char in self.set:
Guido van Rossum63e18191997-07-11 11:08:38 +0000207 result = result + char
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000208 return result
209
210class Exact(Instruction):
211 name = 'exact'
212 def __init__(self, char):
213 self.char = char
214 Instruction.__init__(self, chr(4), 2)
215 def assemble(self, position, labels):
216 return self.opcode + self.char
217 def __repr__(self):
218 return '%-15s %s' % (self.name, `self.char`)
219
220class AnyChar(Instruction):
221 name = 'anychar'
222 def __init__(self):
223 Instruction.__init__(self, chr(5))
224 def assemble(self, position, labels):
225 return self.opcode
226
227class MemoryInstruction(Instruction):
228 def __init__(self, opcode, register):
229 self.register = register
230 Instruction.__init__(self, opcode, 2)
231 def assemble(self, position, labels):
232 return self.opcode + chr(self.register)
233 def __repr__(self):
234 return '%-15s %i' % (self.name, self.register)
235
236class StartMemory(MemoryInstruction):
237 name = 'start_memory'
238 def __init__(self, register):
239 MemoryInstruction.__init__(self, chr(6), register)
240
241class EndMemory(MemoryInstruction):
242 name = 'end_memory'
243 def __init__(self, register):
244 MemoryInstruction.__init__(self, chr(7), register)
245
246class MatchMemory(MemoryInstruction):
247 name = 'match_memory'
248 def __init__(self, register):
249 MemoryInstruction.__init__(self, chr(8), register)
250
251class JumpInstruction(Instruction):
252 def __init__(self, opcode, label):
253 self.label = label
254 Instruction.__init__(self, opcode, 3)
255 def compute_offset(self, start, dest):
256 return dest - (start + 3)
257 def pack_offset(self, offset):
258 if offset > 32767:
259 raise error, 'offset out of range (pos)'
260 elif offset < -32768:
261 raise error, 'offset out of range (neg)'
262 elif offset < 0:
263 offset = offset + 65536
264 return chr(offset & 0xff) + chr((offset >> 8) & 0xff)
265 def assemble(self, position, labels):
266 return self.opcode + \
267 self.pack_offset(self.compute_offset(position,
268 labels[self.label]))
269 def __repr__(self):
270 return '%-15s %i' % (self.name, self.label)
271
272class Jump(JumpInstruction):
273 name = 'jump'
274 def __init__(self, label):
275 JumpInstruction.__init__(self, chr(9), label)
276
277class StarJump(JumpInstruction):
278 name = 'star_jump'
279 def __init__(self, label):
280 JumpInstruction.__init__(self, chr(10), label)
281
282class FailureJump(JumpInstruction):
283 name = 'failure_jump'
284 def __init__(self, label):
285 JumpInstruction.__init__(self, chr(11), label)
286
287class UpdateFailureJump(JumpInstruction):
288 name = 'update_failure_jump'
289 def __init__(self, label):
290 JumpInstruction.__init__(self, chr(12), label)
291
292class DummyFailureJump(JumpInstruction):
293 name = 'update_failure_jump'
294 def __init__(self, label):
295 JumpInstruction.__init__(self, chr(13), label)
296
297class BegBuf(Instruction):
298 name = 'begbuf'
299 def __init__(self):
300 Instruction.__init__(self, chr(14))
301
302class EndBuf(Instruction):
303 name = 'endbuf'
304 def __init__(self):
305 Instruction.__init__(self, chr(15))
306
307class WordBeg(Instruction):
308 name = 'wordbeg'
309 def __init__(self):
310 Instruction.__init__(self, chr(16))
311
312class WordEnd(Instruction):
313 name = 'wordend'
314 def __init__(self):
315 Instruction.__init__(self, chr(17))
316
317class WordBound(Instruction):
318 name = 'wordbound'
319 def __init__(self):
320 Instruction.__init__(self, chr(18))
321
322class NotWordBound(Instruction):
323 name = 'notwordbound'
324 def __init__(self):
325 Instruction.__init__(self, chr(18))
326
327class SyntaxSpec(Instruction):
328 name = 'syntaxspec'
329 def __init__(self, syntax):
330 self.syntax = syntax
331 Instruction.__init__(self, chr(20), 2)
332 def assemble(self, postition, labels):
333 return self.opcode + chr(self.syntax)
334
335class SyntaxSpec(Instruction):
336 name = 'notsyntaxspec'
337 def __init__(self, syntax):
338 self.syntax = syntax
339 Instruction.__init__(self, chr(21), 2)
340 def assemble(self, postition, labels):
341 return self.opcode + chr(self.syntax)
342
343class Label(Instruction):
344 name = 'label'
345 def __init__(self, label):
346 self.label = label
347 Instruction.__init__(self, '', 0)
348 def __repr__(self):
349 return '%-15s %i' % (self.name, self.label)
350
351class OpenParen(Instruction):
352 name = '('
353 def __init__(self, register):
354 self.register = register
355 Instruction.__init__(self, '', 0)
356
357class Alternation(Instruction):
358 name = '|'
359 def __init__(self):
360 Instruction.__init__(self, '', 0)
361
362#
363#
364#
365
366def assemble(instructions):
367 labels = {}
368 position = 0
369 pass1 = []
370 for instruction in instructions:
371 if instruction.name == 'label':
372 labels[instruction.label] = position
373 else:
374 pass1.append((position, instruction))
375 position = position + instruction.size
376 pass2 = ''
377 for position, instruction in pass1:
378 pass2 = pass2 + instruction.assemble(position, labels)
379 return pass2
380
381#
382#
383#
384
385def escape(pattern):
386 result = ''
387 for char in pattern:
388 if 'word' not in syntax_table[char]:
389 result = result + '\\' + char
390 else:
391 result = result + char
392 return result
393
394#
395#
396#
397
398def registers_used(instructions):
399 result = []
400 for instruction in instructions:
401 if (instruction.name in ['set_memory', 'end_memory']) and \
402 (instruction.register not in result):
403 result.append(instruction.register)
404 return result
405
406#
407#
408#
409
410class Fastmap:
411 def __init__(self):
412 self.map = ['\000']*256
413 self.can_be_null = 0
414 def add(self, char):
415 self.map[ord(char)] = '\001'
416 def fastmap(self):
417 return string.join(self.map, '')
418 def __getitem__(self, char):
419 return ord(self.map[ord(char)])
420 def __repr__(self):
421 self.map.sort()
422 return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')'
423
424#
425#
426#
427
428def find_label(code, label):
429 line = 0
430 for instruction in code:
431 if (instruction.name == 'label') and (instruction.label == label):
432 return line + 1
433 line = line + 1
434
435def build_fastmap_aux(code, pos, visited, fastmap):
436 if visited[pos]:
437 return
438 while 1:
439 instruction = code[pos]
440 visited[pos] = 1
441 pos = pos + 1
442 if instruction.name == 'end':
443 fastmap.can_be_null = 1
444 return
445 elif instruction.name == 'syntaxspec':
446 for char in map(chr, range(256)):
447 if instruction.syntax in syntax_table[char]:
448 fastmap.add(char)
449 return
450 elif instruction.name == 'notsyntaxspec':
451 for char in map(chr, range(256)):
452 if instruction.syntax not in syntax_table[char]:
453 fastmap.add(char)
454 return
455 elif instruction.name == 'eol':
456 fastmap.add('\n')
457 if fastmap.can_be_null == 0:
458 fastmap.can_be_null = 2
459 return
460 elif instruction.name == 'set':
461 for char in instruction.set:
462 fastmap.add(char)
463 return
464 elif instruction.name == 'exact':
465 fastmap.add(instruction.char)
466 elif instruction.name == 'anychar':
467 for char in map(chr, range(256)):
468 if char != '\n':
469 fastmap.add(char)
470 return
471 elif instruction.name == 'match_memory':
472 for char in map(chr, range(256)):
473 fastmap.add(char)
474 fastmap.can_be_null = 1
475 return
476 elif instruction.name in ['jump', 'dummy_failure_jump', \
477 'update_failure_jump', 'star_jump']:
478 pos = find_label(code, instruction.label)
479 if visited[pos]:
480 return
481 visited[pos] = 1
482 elif instruction.name == 'failure_jump':
483 build_fastmap_aux(code,
484 find_label(code, instruction.label),
485 visited,
486 fastmap)
487 elif instruction.name == 'function':
488 for char in map(chr, range(256)):
489 fastmap.add(char)
490 fastmap.can_be_null = 1
491 return
492
493def build_fastmap(code, pos=0):
494 visited = [0] * len(code)
495 fastmap = Fastmap()
496 build_fastmap_aux(code, pos, visited, fastmap)
497 return fastmap
498
499#
500#
501#
502
503def compile(pattern, flags=0):
504 stack = []
505 index = 0
506 label = 0
507 register = 1
508 groupindex = {}
509 callouts = []
510 while (index < len(pattern)):
511 char = pattern[index]
512 index = index + 1
513 if char == '\\':
514 if index < len(pattern):
515 next = pattern[index]
516 index = index + 1
517 if next == 't':
518 stack.append([Exact(chr(9))])
519
520 elif next == 'n':
521 stack.append([Exact(chr(10))])
522
523 elif next == 'r':
524 stack.append([Exact(chr(13))])
525
526 elif next == 'f':
527 stack.append([Exact(chr(12))])
528
529 elif next == 'a':
530 stack.append([Exact(chr(7))])
531
532 elif next == 'e':
533 stack.append([Exact(chr(27))])
534
535 elif next in '0123456789':
536 value = next
537 while (index < len(pattern)) and \
538 (pattern[index] in string.digits):
539 value = value + pattern[index]
540 index = index + 1
541 if (len(value) == 3) or \
542 ((len(value) == 2) and (value[0] == '0')):
543 value = string.atoi(value, 8)
544 if value > 255:
545 raise error, 'octal char out of range'
546 stack.append([Exact(chr(value))])
547 elif value == '0':
548 stack.append([Exact(chr(0))])
549 elif len(value) > 3:
550 raise error, 'too many digits'
551 else:
552 value = string.atoi(value)
553 if value >= register:
554 raise error, ('cannot reference a register '
555 'not yet used')
556 elif value == 0:
557 raise error, ('register 0 cannot be used '
558 'during match')
559 stack.append([MatchMemory(value)])
560
561 elif next == 'x':
562 value = ''
563 while (index < len(pattern)) and \
564 (pattern[index] in string.hexdigits):
565 value = value + pattern[index]
566 index = index + 1
567 value = string.atoi(value, 16)
568 if value > 255:
569 raise error, 'hex char out of range'
570 stack.append([Exact(chr(value))])
571
572 elif next == 'c':
573 if index >= len(pattern):
574 raise error, '\\c at end of re'
575 elif pattern[index] in 'abcdefghijklmnopqrstuvwxyz':
576 stack.append(Exact(chr(ord(pattern[index]) -
577 ord('a') + 1)))
578 else:
579 stack.append(Exact(chr(ord(pattern[index]) ^ 64)))
580 index = index + 1
581
582 elif next == 'A':
583 stack.append([BegBuf()])
584
585 elif next == 'Z':
586 stack.append([EndBuf()])
587
588 elif next == 'b':
589 stack.append([WordBound()])
590
591 elif next == 'B':
592 stack.append([NotWordBound()])
593
594 elif next == 'w':
595 stack.append([SyntaxSpec('word')])
596
597 elif next == 'W':
598 stack.append([NotSyntaxSpec('word')])
599
600 elif next == 's':
601 stack.append([SyntaxSpec('whitespace')])
602
603 elif next == 'S':
604 stack.append([NotSyntaxSpec('whitespace')])
605
606 elif next == 'd':
607 stack.append([SyntaxSpec('digit')])
608
609 elif next == 'D':
610 stack.append([NotSyntaxSpec('digit')])
611
612 elif next in 'GluLUQE':
613 # some perl-isms that we don't support
614 raise error, '\\' + next + ' not supported'
615
616 else:
617 stack.append([Exact(pattern[index])])
618
619 else:
620 raise error, 'backslash at the end of a string'
621
622 elif char == '|':
623 if len(stack) == 0:
624 raise error, 'nothing to alternate'
625 expr = []
626 while (len(stack) != 0) and \
627 (stack[-1][0].name != '(') and \
628 (stack[-1][0].name != '|'):
629 expr = stack[-1] + expr
630 del stack[-1]
631 stack.append([FailureJump(label)] + \
632 expr + \
633 [Jump(-1),
634 Label(label)])
635 stack.append([('|',)])
636 label = label + 1
637
638 elif char == '(':
639 if index >= len(pattern):
640 raise error, 'no matching close paren'
641
642 elif pattern[index] == '?':
643 # Perl style (?...) extensions
644 index = index + 1
645 if index >= len(pattern):
646 raise error, 'extension ends prematurely'
647
648 elif pattern[index] == 'P':
649 # Python extensions
650 index = index + 1
651 if index >= len(pattern):
652 raise error, 'extension ends prematurely'
653
654 elif pattern[index] == '<':
655 # Handle Python symbolic group names (?<...>...)
656 index = index + 1
657 end = string.find(pattern, '>', index)
658 if end == -1:
659 raise error, 'no end to symbolic group name'
660 name = pattern[index:end]
661 # XXX check syntax of name
662 index = end + 1
663 groupindex[name] = register
664 stack.append([OpenParen(register)])
665 register = register + 1
666
667 elif pattern[index] == '=':
668 # backreference to symbolic group name
669 if index >= len(pattern):
670 raise error, '(?P= at the end of the pattern'
671 start = index + 1
672 end = string.find(pattern, ')', start)
673 if end == -1:
674 raise error, 'no ) to end symbolic group name'
675 name = pattern[start:end]
676 if name not in groupindex:
677 raise error, ('symbolic group name ' + name + \
678 ' has not been used yet')
679 stack.append([MatchMemory(groupindex[name])])
680 index = end + 1
681
682 elif pattern[index] == '!':
683 # function callout
684 if index >= len(pattern):
685 raise error, 'no function callout name'
686 start = index + 1
687 end = string.find(pattern, ')', start)
688 if end == -1:
689 raise error, 'no ) to end function callout name'
690 name = pattern[start:end]
691 if name not in callouts:
692 raise error, ('function callout name not listed '
693 'in callouts dict')
694 stack.append([FunctionCallout(name)])
695
696 else:
697 raise error, 'unknown Python extension'
698
699 elif pattern[index] == ':':
700 # grouping, but no registers
701 index = index + 1
702 stack.append([('(', -1)])
703
704 elif pattern[index] == '#':
705 # comment
706 index = index + 1
707 end = string.find(pattern, ')', index)
708 if end == -1:
709 raise error, 'no end to comment'
710 index = end + 1
711
712 elif pattern[index] == '=':
713 raise error, ('zero-width positive lookahead '
714 'assertion is unsupported')
715
716 elif pattern[index] == '!':
717 raise error, ('zero-width negative lookahead '
718 'assertion is unsupported')
719
720 elif pattern[index] in 'iImMsSxX':
721 while (index < len(pattern)) and (pattern[index] != ')'):
722 if pattern[index] == 'iI':
723 flags = flags | IGNORECASE
724 elif pattern[index] == 'mM':
725 flags = flags | MULTILINE
726 elif pattern[index] == 'sS':
727 flags = flags | DOTALL
728 elif pattern[index] in 'xX':
729 flags = flags | VERBOSE
730 else:
731 raise error, 'unknown flag'
732 index = index + 1
733 index = index + 1
734
735 else:
736 raise error, 'unknown extension'
737
738 else:
739 stack.append([OpenParen(register)])
740 register = register + 1
741
742 elif char == ')':
743 # make one expression out of everything on the stack up to
744 # the marker left by the last parenthesis
745 expr = []
746 while (len(stack) > 0) and (stack[-1][0].name != '('):
747 expr = stack[-1] + expr
748 del stack[-1]
749
750 if len(stack) == 0:
751 raise error, 'too many close parens'
752 if len(expr) == 0:
753 raise error, 'nothing inside parens'
754
755 # check to see if alternation used correctly
756 if (expr[-1].name == '|'):
757 raise error, 'alternation with nothing on the right'
758
759 # remove markers left by alternation
760 expr = filter(lambda x: x.name != '|', expr)
761
762 # clean up jumps inserted by alternation
763 need_label = 0
764 for i in range(len(expr)):
765 if (expr[i].name == 'jump') and (expr[i].label == -1):
766 expr[i] = JumpOpcode(label)
767 need_label = 1
768 if need_label:
769 expr.append(Label(label))
770 label = label + 1
771
Guido van Rossum63e18191997-07-11 11:08:38 +0000772 if stack[-1][0].register > 0:
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000773 expr = [StartMemory(stack[-1][0].register)] + \
774 expr + \
775 [EndMemory(stack[-1][0].register)]
776 del stack[-1]
777 stack.append(expr)
778
779 elif char == '{':
780 if len(stack) == 0:
781 raise error, 'no expression to repeat'
782 end = string.find(pattern, '}', index)
783 if end == -1:
784 raise error, ('no close curly bracket to match'
785 ' open curly bracket')
786
787 fields = map(string.strip,
788 string.split(pattern[index:end], ','))
789 index = end + 1
790
791 minimal = 0
792 if (index < len(pattern)) and (pattern[index] == '?'):
793 minimal = 1
794 index = index + 1
795
796 if len(fields) == 1:
797 # {n} or {n}? (there's really no difference)
798 try:
799 count = string.atoi(fields[0])
800 except ValueError:
801 raise error, ('count must be an integer '
802 'inside curly braces')
803 if count > 65535:
804 raise error, 'repeat count out of range'
805 expr = []
806 while count > 0:
807 expr = expr + stack[-1]
808 count = count - 1
809 del stack[-1]
810 stack.append(expr)
811
812 elif len(fields) == 2:
813 # {n,} or {n,m}
814 if fields[1] == '':
815 # {n,}
816 try:
817 min = string.atoi(fields[0])
818 except ValueError:
819 raise error, ('minimum must be an integer '
820 'inside curly braces')
821 if min > 65535:
822 raise error, 'minimum repeat count out of range'
823
824 expr = []
825 while min > 0:
826 expr = expr + stack[-1]
827 min = min - 1
828 registers = registers_used(stack[-1])
829 if minimal:
830 expr = expr + \
831 ([Jump(label + 1),
832 Label(label)] + \
833 stack[-1] + \
834 [Label(label + 1),
835 FailureJump(label, registers)])
836 else:
837 expr = expr + \
838 ([Label(label),
839 FailureJump(label + 1, registers)] +
840 stack[-1] +
841 [StarJump(label),
842 Label(label + 1)])
843
844 del stack[-1]
845 stack.append(expr)
846 label = label + 2
847
848 else:
849 # {n,m}
850 try:
851 min = string.atoi(fields[0])
852 except ValueError:
853 raise error, ('minimum must be an integer '
854 'inside curly braces')
855 try:
856 max = string.atoi(fields[1])
857 except ValueError:
858 raise error, ('maximum must be an integer '
859 'inside curly braces')
860 if min > 65535:
861 raise error, ('minumim repeat count out '
862 'of range')
863 if max > 65535:
864 raise error, ('maximum repeat count out '
865 'of range')
866 if min > max:
867 raise error, ('minimum repeat count must be '
868 'less than the maximum '
869 'repeat count')
870 expr = []
871 while min > 0:
872 expr = expr + stack[-1]
873 min = min - 1
874 max = max - 1
875 if minimal:
876 while max > 0:
877 expr = expr + \
878 [FailureJump(label),
879 Jump(label + 1),
880 Label(label)] + \
881 stack[-1] + \
882 [Label(label + 1)]
883 label = label + 2
884 del stack[-1]
885 stack.append(expr)
886 else:
887 while max > 0:
888 expr = expr + \
889 [FailureJump(label)] + \
890 stack[-1]
891 max = max - 1
892 del stack[-1]
893 stack.append(expr + [Label(label)])
894 label = label + 1
895
896 else:
897 raise error, ('there need to be one or two fields '
898 'in a {} expression')
899 index = end + 1
900
901 elif char == '}':
902 raise error, 'unbalanced close curly brace'
903
904 elif char == '*':
905 # Kleene closure
906 if len(stack) == 0:
907 raise error, 'the Kleene closure needs something to repeat'
908 registers = registers_used(stack[-1])
909 if (index < len(pattern)) and (pattern[index] == '?'):
910 # non-greedy matching
911 expr = [JumpInstructions(label + 1),
912 Label(label)] + \
913 stack[-1] + \
914 [Label(label + 1),
915 FailureJump(label)]
916 index = index + 1
917 else:
918 # greedy matching
919 expr = [Label(label),
920 FailureJump(label + 1)] + \
921 stack[-1] + \
922 [StarJump(label),
923 Label(label + 1)]
924 del stack[-1]
925 stack.append(expr)
926 label = label + 2
927
928 elif char == '+':
929 # positive closure
930 if len(stack) == 0:
931 raise error, 'the positive closure needs something to repeat'
932 registers = registers_used(stack[-1])
933 if (index < len(pattern)) and (pattern[index] == '?'):
934 # non-greedy
935 expr = [Label(label)] + \
936 stack[-1] + \
937 [FailureJump(label)]
938 label = label + 1
939 index = index + 1
940 else:
941 # greedy
942 expr = [DummyFailureJump(label + 1),
943 Label(label),
944 FailureJump(label + 2),
945 Label(label + 1)] + \
946 stack[-1] + \
947 [StarJump(label),
948 Label(label + 2)]
949 label = label + 3
950 del stack[-1]
951 stack.append(expr)
952
953 elif char == '?':
954 if len(stack) == 0:
955 raise error, 'need something to be optional'
956 registers = registers_used(stack[-1])
957 if (index < len(pattern)) and (pattern[index] == '?'):
958 # non-greedy matching
959 expr = [FailureJump(label),
960 Jump(label + 1),
961 Label(label)] + \
962 stack[-1] + \
963 [Label(label + 1)]
964 label = label + 2
965 index = index + 1
966 else:
967 # greedy matching
968 expr = [FailureJump(label)] + \
969 stack[-1] + \
970 [Label(label)]
971 label = label + 1
972 del stack[-1]
973 stack.append(expr)
974
975 elif char == '.':
976 if flags & DOTALL:
977 stack.append(Set(map(chr, range(256))))
978 else:
979 stack.append([AnyChar()])
980
981 elif char == '^':
982 if flags & MULTILINE:
983 stack.append([Bol()])
984 else:
985 stack.append([BegBuf()])
986
987 elif char == '$':
988 if flags & MULTILINE:
989 stack.append([Eol()])
990 else:
991 stack.append([EndBuf()])
992
993 elif char == '#':
994 if flags & VERBOSE:
995 # comment
996 index = index + 1
997 end = string.find(pattern, '\n', index)
998 if end == -1:
999 index = len(pattern)
1000 else:
1001 index = end + 1
1002 else:
1003 stack.append([Exact(char)])
1004
1005 elif char in string.whitespace:
1006 if flags & VERBOSE:
1007 stack.append([Exact(char)])
1008
1009 elif char == '[':
1010 if index >= len(pattern):
1011 raise error, 'incomplete set'
1012 negate = 0
1013 last = ''
1014 set = []
1015 if pattern[index] == '^':
1016 negate = 1
1017 index = index + 1
1018 if index >= len(pattern):
1019 raise error, 'incomplete set'
1020 if pattern[index] in ']-':
1021 set.append(pattern[index])
1022 last = pattern[index]
1023 index = index + 1
1024 while (index < len(pattern)) and (pattern[index] != ']'):
1025 next = pattern[index]
1026 index = index + 1
1027 if next == '-':
1028 if (index >= len(pattern)) or (pattern[index] == ']'):
1029 raise error, 'incomplete range in set'
1030 if last > pattern[index]:
1031 raise error, 'range arguments out of order in set'
1032 for next in map(chr, \
1033 range(ord(last), \
1034 ord(pattern[index]) + 1)):
1035 if next not in set:
1036 set.append(next)
1037 last = ''
1038 index = index + 1
1039
1040 elif next == '\\':
1041 # expand syntax meta-characters and add to set
1042 if index >= len(pattern):
1043 raise error, 'incomplete set'
1044 elif (pattern[index] == ']'):
1045 raise error, 'backslash at the end of a set'
1046 elif pattern[index] == 'w':
1047 for next in syntax_table.keys():
1048 if 'word' in syntax_table[next]:
1049 set.append(next)
1050 elif pattern[index] == 'W':
1051 for next in syntax_table.keys():
1052 if 'word' not in syntax_table[next]:
1053 set.append(next)
1054 elif pattern[index] == 'd':
1055 for next in syntax_table.keys():
1056 if 'digit' in syntax_table[next]:
1057 set.append(next)
1058 elif pattern[index] == 'D':
1059 for next in syntax_table.keys():
1060 if 'digit' not in syntax_table[next]:
1061 set.append(next)
1062 elif pattern[index] == 's':
1063 for next in syntax_table.keys():
1064 if 'whitespace' in syntax_table[next]:
1065 set.append(next)
1066 elif pattern[index] == 'S':
1067 for next in syntax_table.keys():
1068 if 'whitespace' not in syntax_table[next]:
1069 set.append(next)
1070 else:
1071 raise error, 'unknown meta in set'
1072 last = ''
1073 index = index + 1
1074
1075 else:
1076 if next not in set:
1077 set.append(next)
1078 last = next
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001079 if pattern[index] != ']':
1080 raise error, 'incomplete set'
1081
1082 index = index + 1
1083
1084 if negate:
1085 notset = []
1086 for char in map(chr, range(256)):
1087 if char not in set:
1088 notset.append(char)
1089 stack.append([Set(notset)])
1090 else:
1091 stack.append([Set(set)])
1092
1093 else:
1094 stack.append([Exact(char)])
1095
1096 code = []
1097 while len(stack) > 0:
1098 if stack[-1][0].name == '(':
1099 raise error, 'too many open parens'
1100 code = stack[-1] + code
1101 del stack[-1]
1102 if len(code) == 0:
1103 raise error, 'no code generated'
1104 if (code[-1].name == '|'):
1105 raise error, 'alternation with nothing on the right'
1106 code = filter(lambda x: x.name != '|', code)
1107 need_label = 0
1108 for i in range(len(code)):
1109 if (code[i].name == 'jump') and (code[i].label == -1):
1110 code[i] = Jump(label)
1111 need_label = 1
1112 if need_label:
1113 code.append(Label(label))
1114 label = label + 1
1115 code.append(End())
1116 return RegexObject(pattern, flags, code, register, groupindex, callouts)
1117
1118if __name__ == '__main__':
1119 print compile('a(b)*')
1120 print compile('a{3}')
1121 print compile('(a){2}')
1122 print compile('a{2,4}')
1123 print compile('a|b')
1124 print compile('a(b|c)')
1125 print compile('a*')
1126 print compile('a+')
1127 print compile('a|b|c')
1128 print compile('a(b|c)*')
1129 print compile('\\n')
1130 print compile('a(?# huh huh)b')
1131 print compile('[a-c\\w]')
1132 print compile('[[]')
1133 print compile('[]]')
1134 print compile('(<hello>a)')
1135 print compile('\Q*\e')
1136 print compile('a{0,}')
1137