blob: b701bb67fbff3423fc8fa7358180ce45f8e0e46a [file] [log] [blame]
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001#!/usr/bin/env python
2# -*- mode: python -*-
3# $Id$
4
5import string
6import reop
7
8error = 're error'
9
10# compilation flags
11
12IGNORECASE = I = 0x01
13
14MULTILINE = M = 0x02
15DOTALL = S = 0x04
16VERBOSE = X = 0x08
17
18#
19# Initialize syntax table. This information should really come from the
20# syntax table in regexpr.c rather than being duplicated here.
21#
22
23syntax_table = {}
24
25for char in map(chr, range(0, 256)):
26 syntax_table[char] = []
27
28for char in string.lowercase:
29 syntax_table[char].append('word')
30
31for char in string.uppercase:
32 syntax_table[char].append('word')
33
34for char in string.digits:
35 syntax_table[char].append('word')
36 syntax_table[char].append('digit')
37
38for char in string.whitespace:
39 syntax_table[char].append('whitespace')
40
41syntax_table['_'].append('word')
42
43#
44#
45#
46
47def match(pattern, string, flags=0):
48 return compile(pattern, flags).match(string)
49
50def search(pattern, string, flags=0):
51 return compile(pattern, flags).search(string)
52
53def sub(pattern, repl, string, count=0):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +000054 return compile(pattern).sub(repl, string, count)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000055
56def subn(pattern, repl, string, count=0):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +000057 return compile(pattern).subn(repl, string, count)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000058
Guido van Rossum8a9a4a21997-07-11 20:48:25 +000059def split(pattern, string, maxsplit=0):
60 return compile(pattern).subn(string, maxsplit)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000061
62#
63#
64#
65
66class RegexObject:
67 def __init__(self, pattern, flags, code, num_regs, groupindex, callouts):
68 self.code = code
69 self.num_regs = num_regs
70 self.flags = flags
71 self.pattern = pattern
72 self.groupindex = groupindex
73 self.callouts = callouts
74 self.fastmap = build_fastmap(code)
75 if code[0].name == 'bol':
76 self.anchor = 1
77 elif code[0].name == 'begbuf':
78 self.anchor = 2
79 else:
80 self.anchor = 0
81 self.buffer = assemble(code)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000082 def search(self, string, pos=0):
83 regs = reop.search(self.buffer,
84 self.num_regs,
85 self.flags,
86 self.fastmap.can_be_null,
87 self.fastmap.fastmap(),
88 self.anchor,
89 string,
90 pos)
91 if regs is None:
92 return None
93 return MatchObject(self,
94 string,
95 pos,
96 regs)
Guido van Rossum8a9a4a21997-07-11 20:48:25 +000097 def match(self, string, pos=0):
98 regs = reop.match(self.buffer,
99 self.num_regs,
100 self.flags,
101 self.fastmap.can_be_null,
102 self.fastmap.fastmap(),
103 self.anchor,
104 string,
105 pos)
106 if regs is None:
107 return None
108 return MatchObject(self,
109 string,
110 pos,
111 regs)
112 def sub(self, repl, string, count=0):
113 pass
114 def subn(self, repl, string, count=0):
115 pass
116 def split(self, string, maxsplit=0):
117 pass
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000118
119class MatchObject:
120 def __init__(self, re, string, pos, regs):
121 self.re = re
122 self.string = string
123 self.pos = pos
124 self.regs = regs
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000125 def start(self, g):
126 if type(g) == type(''):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000127 try:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000128 g = self.re.groupindex[g]
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000129 except (KeyError, TypeError):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000130 raise IndexError, ('group "' + g + '" is undefined')
131 return self.regs[g][0]
132 def end(self, g):
133 if type(g) == type(''):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000134 try:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000135 g = self.re.groupindex[g]
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000136 except (KeyError, TypeError):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000137 raise IndexError, ('group "' + g + '" is undefined')
138 return self.regs[g][1]
139 def span(self, g):
140 if type(g) == type(''):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000141 try:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000142 g = self.re.groupindex[g]
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000143 except (KeyError, TypeError):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000144 raise IndexError, ('group "' + g + '" is undefined')
145 return self.regs[g]
146 def group(self, *groups):
147 if len(groups) == 0:
148 groups = range(1, self.re.num_regs)
149 result = []
150 for g in groups:
151 if type(g) == type(''):
152 try:
153 g = self.re.groupindex[g]
154 except (KeyError, TypeError):
155 raise IndexError, ('group "' + g + '" is undefined')
Guido van Rossum04a1d741997-07-15 14:38:13 +0000156 if (self.regs[g][0] == -1) or (self.regs[g][1] == -1):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000157 result.append(None)
158 else:
159 result.append(self.string[self.regs[g][0]:self.regs[g][1]])
160 if len(result) > 1:
161 return tuple(result)
162 elif len(result) == 1:
163 return result[0]
164 else:
165 return ()
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000166
167#
168# A set of classes to make assembly a bit easier, if a bit verbose.
169#
170
171class Instruction:
172 def __init__(self, opcode, size=1):
173 self.opcode = opcode
174 self.size = size
175 def assemble(self, position, labels):
176 return self.opcode
177 def __repr__(self):
178 return '%-15s' % (self.name)
179
180class FunctionCallout(Instruction):
181 name = 'function'
182 def __init__(self, function):
183 self.function = function
184 Instruction.__init__(self, chr(22), 2 + len(self.function))
185 def assemble(self, position, labels):
186 return self.opcode + chr(len(self.function)) + self.function
187 def __repr__(self):
188 return '%-15s %-10s' % (self.name, self.function)
189
190class End(Instruction):
191 name = 'end'
192 def __init__(self):
193 Instruction.__init__(self, chr(0))
194
195class Bol(Instruction):
196 name = 'bol'
197 def __init__(self):
198 self.name = 'bol'
199 Instruction.__init__(self, chr(1))
200
201class Eol(Instruction):
202 name = 'eol'
203 def __init__(self):
204 Instruction.__init__(self, chr(2))
205
206class Set(Instruction):
207 name = 'set'
208 def __init__(self, set):
209 self.set = set
210 Instruction.__init__(self, chr(3), 33)
Guido van Rossum63e18191997-07-11 11:08:38 +0000211 def assemble(self, position, labels):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000212 result = self.opcode
213 temp = 0
214 for i, c in map(lambda x: (x, chr(x)), range(256)):
Guido van Rossum63e18191997-07-11 11:08:38 +0000215 if c in self.set:
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000216 temp = temp | (1 << (i & 7))
217 if (i % 8) == 7:
218 result = result + chr(temp)
219 temp = 0
220 return result
221 def __repr__(self):
222 result = '%-15s' % (self.name)
223 self.set.sort()
224 for char in self.set:
Guido van Rossum63e18191997-07-11 11:08:38 +0000225 result = result + char
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000226 return result
227
228class Exact(Instruction):
229 name = 'exact'
230 def __init__(self, char):
231 self.char = char
232 Instruction.__init__(self, chr(4), 2)
233 def assemble(self, position, labels):
234 return self.opcode + self.char
235 def __repr__(self):
236 return '%-15s %s' % (self.name, `self.char`)
237
238class AnyChar(Instruction):
239 name = 'anychar'
240 def __init__(self):
241 Instruction.__init__(self, chr(5))
242 def assemble(self, position, labels):
243 return self.opcode
244
245class MemoryInstruction(Instruction):
246 def __init__(self, opcode, register):
247 self.register = register
248 Instruction.__init__(self, opcode, 2)
249 def assemble(self, position, labels):
250 return self.opcode + chr(self.register)
251 def __repr__(self):
252 return '%-15s %i' % (self.name, self.register)
253
254class StartMemory(MemoryInstruction):
255 name = 'start_memory'
256 def __init__(self, register):
257 MemoryInstruction.__init__(self, chr(6), register)
258
259class EndMemory(MemoryInstruction):
260 name = 'end_memory'
261 def __init__(self, register):
262 MemoryInstruction.__init__(self, chr(7), register)
263
264class MatchMemory(MemoryInstruction):
265 name = 'match_memory'
266 def __init__(self, register):
267 MemoryInstruction.__init__(self, chr(8), register)
268
269class JumpInstruction(Instruction):
270 def __init__(self, opcode, label):
271 self.label = label
272 Instruction.__init__(self, opcode, 3)
273 def compute_offset(self, start, dest):
274 return dest - (start + 3)
275 def pack_offset(self, offset):
276 if offset > 32767:
277 raise error, 'offset out of range (pos)'
278 elif offset < -32768:
279 raise error, 'offset out of range (neg)'
280 elif offset < 0:
281 offset = offset + 65536
282 return chr(offset & 0xff) + chr((offset >> 8) & 0xff)
283 def assemble(self, position, labels):
284 return self.opcode + \
285 self.pack_offset(self.compute_offset(position,
286 labels[self.label]))
287 def __repr__(self):
288 return '%-15s %i' % (self.name, self.label)
289
290class Jump(JumpInstruction):
291 name = 'jump'
292 def __init__(self, label):
293 JumpInstruction.__init__(self, chr(9), label)
294
295class StarJump(JumpInstruction):
296 name = 'star_jump'
297 def __init__(self, label):
298 JumpInstruction.__init__(self, chr(10), label)
299
300class FailureJump(JumpInstruction):
301 name = 'failure_jump'
302 def __init__(self, label):
303 JumpInstruction.__init__(self, chr(11), label)
304
305class UpdateFailureJump(JumpInstruction):
306 name = 'update_failure_jump'
307 def __init__(self, label):
308 JumpInstruction.__init__(self, chr(12), label)
309
310class DummyFailureJump(JumpInstruction):
311 name = 'update_failure_jump'
312 def __init__(self, label):
313 JumpInstruction.__init__(self, chr(13), label)
314
315class BegBuf(Instruction):
316 name = 'begbuf'
317 def __init__(self):
318 Instruction.__init__(self, chr(14))
319
320class EndBuf(Instruction):
321 name = 'endbuf'
322 def __init__(self):
323 Instruction.__init__(self, chr(15))
324
325class WordBeg(Instruction):
326 name = 'wordbeg'
327 def __init__(self):
328 Instruction.__init__(self, chr(16))
329
330class WordEnd(Instruction):
331 name = 'wordend'
332 def __init__(self):
333 Instruction.__init__(self, chr(17))
334
335class WordBound(Instruction):
336 name = 'wordbound'
337 def __init__(self):
338 Instruction.__init__(self, chr(18))
339
340class NotWordBound(Instruction):
341 name = 'notwordbound'
342 def __init__(self):
343 Instruction.__init__(self, chr(18))
344
345class SyntaxSpec(Instruction):
346 name = 'syntaxspec'
347 def __init__(self, syntax):
348 self.syntax = syntax
349 Instruction.__init__(self, chr(20), 2)
350 def assemble(self, postition, labels):
Guido van Rossum5d6de251997-07-11 21:10:17 +0000351 # XXX
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000352 return self.opcode + chr(self.syntax)
353
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000354class NotSyntaxSpec(Instruction):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000355 name = 'notsyntaxspec'
356 def __init__(self, syntax):
357 self.syntax = syntax
358 Instruction.__init__(self, chr(21), 2)
359 def assemble(self, postition, labels):
Guido van Rossum5d6de251997-07-11 21:10:17 +0000360 # XXX
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000361 return self.opcode + chr(self.syntax)
362
363class Label(Instruction):
364 name = 'label'
365 def __init__(self, label):
366 self.label = label
367 Instruction.__init__(self, '', 0)
368 def __repr__(self):
369 return '%-15s %i' % (self.name, self.label)
370
371class OpenParen(Instruction):
372 name = '('
373 def __init__(self, register):
374 self.register = register
375 Instruction.__init__(self, '', 0)
Guido van Rossum5d6de251997-07-11 21:10:17 +0000376 def assemble(self, position, labels):
377 raise error, 'unmatched open parenthesis'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000378
379class Alternation(Instruction):
380 name = '|'
381 def __init__(self):
382 Instruction.__init__(self, '', 0)
Guido van Rossum5d6de251997-07-11 21:10:17 +0000383 def assemble(self, position, labels):
384 raise error, 'an alternation was not taken care of'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000385
386#
387#
388#
389
390def assemble(instructions):
391 labels = {}
392 position = 0
393 pass1 = []
394 for instruction in instructions:
395 if instruction.name == 'label':
396 labels[instruction.label] = position
397 else:
398 pass1.append((position, instruction))
399 position = position + instruction.size
400 pass2 = ''
401 for position, instruction in pass1:
402 pass2 = pass2 + instruction.assemble(position, labels)
403 return pass2
404
405#
406#
407#
408
409def escape(pattern):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000410 result = []
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000411 for char in pattern:
412 if 'word' not in syntax_table[char]:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000413 result.append('\\')
414 result.append(char)
415 return string.join(result, '')
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000416
417#
418#
419#
420
421def registers_used(instructions):
422 result = []
423 for instruction in instructions:
424 if (instruction.name in ['set_memory', 'end_memory']) and \
425 (instruction.register not in result):
426 result.append(instruction.register)
427 return result
428
429#
430#
431#
432
433class Fastmap:
434 def __init__(self):
435 self.map = ['\000']*256
436 self.can_be_null = 0
437 def add(self, char):
438 self.map[ord(char)] = '\001'
439 def fastmap(self):
440 return string.join(self.map, '')
441 def __getitem__(self, char):
442 return ord(self.map[ord(char)])
443 def __repr__(self):
444 self.map.sort()
445 return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')'
446
447#
448#
449#
450
451def find_label(code, label):
452 line = 0
453 for instruction in code:
454 if (instruction.name == 'label') and (instruction.label == label):
455 return line + 1
456 line = line + 1
457
458def build_fastmap_aux(code, pos, visited, fastmap):
459 if visited[pos]:
460 return
461 while 1:
462 instruction = code[pos]
463 visited[pos] = 1
464 pos = pos + 1
465 if instruction.name == 'end':
466 fastmap.can_be_null = 1
467 return
468 elif instruction.name == 'syntaxspec':
469 for char in map(chr, range(256)):
470 if instruction.syntax in syntax_table[char]:
471 fastmap.add(char)
472 return
473 elif instruction.name == 'notsyntaxspec':
474 for char in map(chr, range(256)):
475 if instruction.syntax not in syntax_table[char]:
476 fastmap.add(char)
477 return
478 elif instruction.name == 'eol':
479 fastmap.add('\n')
480 if fastmap.can_be_null == 0:
481 fastmap.can_be_null = 2
482 return
483 elif instruction.name == 'set':
484 for char in instruction.set:
485 fastmap.add(char)
486 return
487 elif instruction.name == 'exact':
488 fastmap.add(instruction.char)
489 elif instruction.name == 'anychar':
490 for char in map(chr, range(256)):
491 if char != '\n':
492 fastmap.add(char)
493 return
494 elif instruction.name == 'match_memory':
495 for char in map(chr, range(256)):
496 fastmap.add(char)
497 fastmap.can_be_null = 1
498 return
499 elif instruction.name in ['jump', 'dummy_failure_jump', \
500 'update_failure_jump', 'star_jump']:
501 pos = find_label(code, instruction.label)
502 if visited[pos]:
503 return
504 visited[pos] = 1
505 elif instruction.name == 'failure_jump':
506 build_fastmap_aux(code,
507 find_label(code, instruction.label),
508 visited,
509 fastmap)
510 elif instruction.name == 'function':
511 for char in map(chr, range(256)):
512 fastmap.add(char)
513 fastmap.can_be_null = 1
514 return
515
516def build_fastmap(code, pos=0):
517 visited = [0] * len(code)
518 fastmap = Fastmap()
519 build_fastmap_aux(code, pos, visited, fastmap)
520 return fastmap
521
522#
523#
524#
525
Guido van Rossum04a1d741997-07-15 14:38:13 +0000526[NORMAL, CHARCLASS, REPLACEMENT] = range(3)
527[CHAR, MEMORY_REFERENCE, SYNTAX, SET, WORD_BOUNDARY, NOT_WORD_BOUNDARY,
528 BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(8)
529
530def expand_escape(pattern, index, context=NORMAL):
531 if index >= len(pattern):
532 raise error, 'escape ends too soon'
533
534 elif pattern[index] == 't':
535 return CHAR, chr(9), index + 1
536
537 elif pattern[index] == 'n':
538 return CHAR, chr(10), index + 1
539
540 elif pattern[index] == 'r':
541 return CHAR, chr(13), index + 1
542
543 elif pattern[index] == 'f':
544 return CHAR, chr(12), index + 1
545
546 elif pattern[index] == 'a':
547 return CHAR, chr(7), index + 1
548
549 elif pattern[index] == 'e':
550 return CHAR, chr(27), index + 1
551
552 elif pattern[index] == 'c':
553 if index + 1 >= len(pattern):
554 raise error, '\\c must be followed by another character'
555 elif pattern[index + 1] in 'abcdefghijklmnopqrstuvwxyz':
556 return CHAR, chr(ord(pattern[index + 1]) - ord('a') + 1), index + 2
557 else:
558 return CHAR, chr(ord(pattern[index + 1]) ^ 64), index + 2
559
560 elif pattern[index] == 'x':
561 # CAUTION: this is the Python rule, not the Perl rule!
562 end = index
563 while (end < len(pattern)) and (pattern[end] in string.hexdigits):
564 end = end + 1
565 if end == index:
566 raise error, "\\x must be followed by hex digit(s)"
567 # let Python evaluate it, so we don't incorrectly 2nd-guess
568 # what it's doing (and Python in turn passes it on to sscanf,
569 # so that *it* doesn't incorrectly 2nd-guess what C does!)
570 char = eval ('"' + pattern[index-2:end] + '"')
571 assert len(char) == 1
572 return CHAR, char, end
573
574 elif pattern[index] == 'b':
575 if context != NORMAL:
576 return CHAR, chr(8), index + 1
577 else:
578 return WORD_BOUNDARY, '', index + 1
579
580 elif pattern[index] == 'B':
581 if context != NORMAL:
582 return CHAR, 'B', index + 1
583 else:
584 return NOT_WORD_BOUNDARY, '', index + 1
585
586 elif pattern[index] == 'A':
587 if context != NORMAL:
588 return CHAR, 'A', index + 1
589 else:
590 return BEGINNING_OF_BUFFER, '', index + 1
591
592 elif pattern[index] == 'Z':
593 if context != NORMAL:
594 return 'Z', index + 1
595 else:
596 return END_OF_BUFFER, '', index + 1
597
598 elif pattern[index] in 'GluLUQE':
599 raise error, ('\\' + ch + ' is not allowed')
600
601 elif pattern[index] == 'w':
602 if context == NORMAL:
603 return SYNTAX, 'word', index + 1
604 elif context == CHARCLASS:
605 set = []
606 for char in syntax_table.keys():
607 if 'word' in syntax_table[char]:
608 set.append(char)
609 return SET, set, index + 1
610 else:
611 return CHAR, 'w', index + 1
612
613 elif pattern[index] == 'W':
614 if context == NORMAL:
615 return NOT_SYNTAX, 'word', index + 1
616 elif context == CHARCLASS:
617 set = []
618 for char in syntax_table.keys():
619 if 'word' not in syntax_table[char]:
620 set.append(char)
621 return SET, set, index + 1
622 else:
623 return CHAR, 'W', index + 1
624
625 elif pattern[index] == 's':
626 if context == NORMAL:
627 return SYNTAX, 'whitespace', index + 1
628 elif context == CHARCLASS:
629 set = []
630 for char in syntax_table.keys():
631 if 'whitespace' in syntax_table[char]:
632 set.append(char)
633 return SET, set, index + 1
634 else:
635 return CHAR, 's', index + 1
636
637 elif pattern[index] == 'S':
638 if context == NORMAL:
639 return NOT_SYNTAX, 'whitespace', index + 1
640 elif context == CHARCLASS:
641 set = []
642 for char in syntax_table.keys():
643 if 'whitespace' not in syntax_table[char]:
644 set.append(char)
645 return SET, set, index + 1
646 else:
647 return CHAR, 'S', index + 1
648
649 elif pattern[index] == 'd':
650 if context == NORMAL:
651 return SYNTAX, 'digit', index + 1
652 elif context == CHARCLASS:
653 set = []
654 for char in syntax_table.keys():
655 if 'digit' in syntax_table[char]:
656 set.append(char)
657 return SET, set, index + 1
658 else:
659 return CHAR, 'd', index + 1
660
661 elif pattern[index] == 'D':
662 if context == NORMAL:
663 return NOT_SYNTAX, 'digit', index + 1
664 elif context == CHARCLASS:
665 set = []
666 for char in syntax_table.keys():
667 if 'digit' not in syntax_table[char]:
668 set.append(char)
669 return SET, set, index + 1
670 else:
671 return CHAR, 'D', index + 1
672
673 elif pattern[index] in '0123456789':
674 end = index
675 while (end < len(pattern)) and (pattern[end] in string.digits):
676 end = end + 1
677 value = pattern[index:end]
678
679 if (len(value) == 3) or ((len(value) == 2) and (value[0] == '0')):
680 # octal character value
681 value = string.atoi(value, 8)
682 if value > 255:
683 raise error, 'octal char out of range'
684 return CHAR, chr(value), end
685
686 elif value == '0':
687 return CHAR, chr(0), end
688
689 elif len(value) > 3:
690 raise error, ('\\' + value + ' has too many digits')
691
692 else:
693 # \1-\99 - reference a register
694 if context == CHARCLASS:
695 raise error, ('cannot reference a register from '
696 'inside a character class')
697 value = string.atoi(value)
698 if value == 0:
699 raise error, ('register 0 cannot be used '
700 'during match')
701 return MEMORY_REFERENCE, value, end
702
703 else:
704 return CHAR, pattern[index], index + 1
705
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000706def compile(pattern, flags=0):
707 stack = []
708 index = 0
709 label = 0
710 register = 1
711 groupindex = {}
712 callouts = []
713 while (index < len(pattern)):
714 char = pattern[index]
715 index = index + 1
716 if char == '\\':
Guido van Rossum04a1d741997-07-15 14:38:13 +0000717 escape_type, value, index = expand_escape(pattern, index)
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000718
Guido van Rossum04a1d741997-07-15 14:38:13 +0000719 if escape_type == CHAR:
720 stack.append([Exact(value)])
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000721
Guido van Rossum04a1d741997-07-15 14:38:13 +0000722 elif escape_type == MEMORY_REFERENCE:
723 if value >= register:
724 raise error, ('cannot reference a register '
725 'not yet used')
726 stack.append([MatchMemory(value)])
727
728 elif escape_type == BEGINNING_OF_BUFFER:
729 stack.append([BegBuf()])
730
731 elif escape_type == END_OF_BUFFER:
732 stack.append([EndBuf()])
733
734 elif escape_type == WORD_BOUNDARY:
735 stack.append([WordBound()])
736
737 elif escape_type == NOT_WORD_BOUNDARY:
738 stack.append([NotWordBound()])
739
740 elif escape_type == SYNTAX:
741 stack.append([SyntaxSpec(value)])
742
743 elif escape_type == NOT_SYNTAX:
744 stack.append([NotSyntaxSpec(value)])
745
746 elif escape_type == SET:
747 raise error, 'cannot use set escape type here'
748
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000749 else:
Guido van Rossum04a1d741997-07-15 14:38:13 +0000750 raise error, 'unknown escape type'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000751
752 elif char == '|':
753 if len(stack) == 0:
Guido van Rossum04a1d741997-07-15 14:38:13 +0000754 raise error, 'alternate with nothing on the left'
755 if stack[-1][0].name == '(':
756 raise error, 'alternate with nothing on the left in the group'
757 if stack[-1][0].name == '|':
758 raise error, 'alternates with nothing inbetween them'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000759 expr = []
Guido van Rossum04a1d741997-07-15 14:38:13 +0000760
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000761 while (len(stack) != 0) and \
762 (stack[-1][0].name != '(') and \
763 (stack[-1][0].name != '|'):
764 expr = stack[-1] + expr
765 del stack[-1]
766 stack.append([FailureJump(label)] + \
767 expr + \
768 [Jump(-1),
769 Label(label)])
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000770 stack.append([Alternation()])
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000771 label = label + 1
772
773 elif char == '(':
774 if index >= len(pattern):
775 raise error, 'no matching close paren'
776
777 elif pattern[index] == '?':
778 # Perl style (?...) extensions
779 index = index + 1
780 if index >= len(pattern):
781 raise error, 'extension ends prematurely'
782
783 elif pattern[index] == 'P':
784 # Python extensions
785 index = index + 1
786 if index >= len(pattern):
787 raise error, 'extension ends prematurely'
788
789 elif pattern[index] == '<':
790 # Handle Python symbolic group names (?<...>...)
791 index = index + 1
792 end = string.find(pattern, '>', index)
793 if end == -1:
794 raise error, 'no end to symbolic group name'
795 name = pattern[index:end]
796 # XXX check syntax of name
797 index = end + 1
798 groupindex[name] = register
799 stack.append([OpenParen(register)])
800 register = register + 1
801
802 elif pattern[index] == '=':
803 # backreference to symbolic group name
804 if index >= len(pattern):
805 raise error, '(?P= at the end of the pattern'
806 start = index + 1
807 end = string.find(pattern, ')', start)
808 if end == -1:
809 raise error, 'no ) to end symbolic group name'
810 name = pattern[start:end]
811 if name not in groupindex:
812 raise error, ('symbolic group name ' + name + \
813 ' has not been used yet')
814 stack.append([MatchMemory(groupindex[name])])
815 index = end + 1
816
817 elif pattern[index] == '!':
818 # function callout
819 if index >= len(pattern):
820 raise error, 'no function callout name'
821 start = index + 1
822 end = string.find(pattern, ')', start)
823 if end == -1:
824 raise error, 'no ) to end function callout name'
825 name = pattern[start:end]
826 if name not in callouts:
827 raise error, ('function callout name not listed '
828 'in callouts dict')
829 stack.append([FunctionCallout(name)])
830
831 else:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000832 raise error, ('unknown Python extension: ' + \
833 pattern[index])
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000834
835 elif pattern[index] == ':':
836 # grouping, but no registers
837 index = index + 1
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000838 stack.append([OpenParen(-1)])
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000839
840 elif pattern[index] == '#':
841 # comment
842 index = index + 1
843 end = string.find(pattern, ')', index)
844 if end == -1:
845 raise error, 'no end to comment'
846 index = end + 1
847
848 elif pattern[index] == '=':
849 raise error, ('zero-width positive lookahead '
850 'assertion is unsupported')
851
852 elif pattern[index] == '!':
853 raise error, ('zero-width negative lookahead '
854 'assertion is unsupported')
855
856 elif pattern[index] in 'iImMsSxX':
857 while (index < len(pattern)) and (pattern[index] != ')'):
Guido van Rossum65c28b71997-07-11 11:10:44 +0000858 if pattern[index] in 'iI':
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000859 flags = flags | IGNORECASE
Guido van Rossum65c28b71997-07-11 11:10:44 +0000860 elif pattern[index] in 'mM':
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000861 flags = flags | MULTILINE
Guido van Rossum65c28b71997-07-11 11:10:44 +0000862 elif pattern[index] in 'sS':
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000863 flags = flags | DOTALL
864 elif pattern[index] in 'xX':
865 flags = flags | VERBOSE
866 else:
867 raise error, 'unknown flag'
868 index = index + 1
869 index = index + 1
870
871 else:
872 raise error, 'unknown extension'
873
874 else:
875 stack.append([OpenParen(register)])
876 register = register + 1
877
878 elif char == ')':
879 # make one expression out of everything on the stack up to
880 # the marker left by the last parenthesis
881 expr = []
882 while (len(stack) > 0) and (stack[-1][0].name != '('):
883 expr = stack[-1] + expr
884 del stack[-1]
885
886 if len(stack) == 0:
887 raise error, 'too many close parens'
Guido van Rossum04a1d741997-07-15 14:38:13 +0000888
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000889 if len(expr) == 0:
890 raise error, 'nothing inside parens'
891
892 # check to see if alternation used correctly
893 if (expr[-1].name == '|'):
Guido van Rossum04a1d741997-07-15 14:38:13 +0000894 raise error, 'alternate with nothing on the right'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000895
896 # remove markers left by alternation
897 expr = filter(lambda x: x.name != '|', expr)
898
899 # clean up jumps inserted by alternation
900 need_label = 0
901 for i in range(len(expr)):
902 if (expr[i].name == 'jump') and (expr[i].label == -1):
Guido van Rossum04a1d741997-07-15 14:38:13 +0000903 expr[i] = Jump(label)
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000904 need_label = 1
905 if need_label:
906 expr.append(Label(label))
907 label = label + 1
908
Guido van Rossum63e18191997-07-11 11:08:38 +0000909 if stack[-1][0].register > 0:
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000910 expr = [StartMemory(stack[-1][0].register)] + \
911 expr + \
912 [EndMemory(stack[-1][0].register)]
913 del stack[-1]
914 stack.append(expr)
915
916 elif char == '{':
917 if len(stack) == 0:
918 raise error, 'no expression to repeat'
919 end = string.find(pattern, '}', index)
920 if end == -1:
921 raise error, ('no close curly bracket to match'
922 ' open curly bracket')
923
924 fields = map(string.strip,
925 string.split(pattern[index:end], ','))
926 index = end + 1
927
928 minimal = 0
929 if (index < len(pattern)) and (pattern[index] == '?'):
930 minimal = 1
931 index = index + 1
932
933 if len(fields) == 1:
934 # {n} or {n}? (there's really no difference)
935 try:
936 count = string.atoi(fields[0])
937 except ValueError:
938 raise error, ('count must be an integer '
939 'inside curly braces')
940 if count > 65535:
941 raise error, 'repeat count out of range'
942 expr = []
943 while count > 0:
944 expr = expr + stack[-1]
945 count = count - 1
946 del stack[-1]
947 stack.append(expr)
948
949 elif len(fields) == 2:
950 # {n,} or {n,m}
951 if fields[1] == '':
952 # {n,}
953 try:
954 min = string.atoi(fields[0])
955 except ValueError:
956 raise error, ('minimum must be an integer '
957 'inside curly braces')
958 if min > 65535:
959 raise error, 'minimum repeat count out of range'
960
961 expr = []
962 while min > 0:
963 expr = expr + stack[-1]
964 min = min - 1
965 registers = registers_used(stack[-1])
966 if minimal:
967 expr = expr + \
968 ([Jump(label + 1),
969 Label(label)] + \
970 stack[-1] + \
971 [Label(label + 1),
972 FailureJump(label, registers)])
973 else:
974 expr = expr + \
975 ([Label(label),
976 FailureJump(label + 1, registers)] +
977 stack[-1] +
978 [StarJump(label),
979 Label(label + 1)])
980
981 del stack[-1]
982 stack.append(expr)
983 label = label + 2
984
985 else:
986 # {n,m}
987 try:
988 min = string.atoi(fields[0])
989 except ValueError:
990 raise error, ('minimum must be an integer '
991 'inside curly braces')
992 try:
993 max = string.atoi(fields[1])
994 except ValueError:
995 raise error, ('maximum must be an integer '
996 'inside curly braces')
997 if min > 65535:
998 raise error, ('minumim repeat count out '
999 'of range')
1000 if max > 65535:
1001 raise error, ('maximum repeat count out '
1002 'of range')
1003 if min > max:
1004 raise error, ('minimum repeat count must be '
1005 'less than the maximum '
1006 'repeat count')
1007 expr = []
1008 while min > 0:
1009 expr = expr + stack[-1]
1010 min = min - 1
1011 max = max - 1
1012 if minimal:
1013 while max > 0:
1014 expr = expr + \
1015 [FailureJump(label),
1016 Jump(label + 1),
1017 Label(label)] + \
1018 stack[-1] + \
1019 [Label(label + 1)]
1020 label = label + 2
1021 del stack[-1]
1022 stack.append(expr)
1023 else:
1024 while max > 0:
1025 expr = expr + \
1026 [FailureJump(label)] + \
1027 stack[-1]
1028 max = max - 1
1029 del stack[-1]
1030 stack.append(expr + [Label(label)])
1031 label = label + 1
1032
1033 else:
1034 raise error, ('there need to be one or two fields '
1035 'in a {} expression')
1036 index = end + 1
1037
1038 elif char == '}':
1039 raise error, 'unbalanced close curly brace'
1040
1041 elif char == '*':
1042 # Kleene closure
1043 if len(stack) == 0:
Guido van Rossum5d6de251997-07-11 21:10:17 +00001044 raise error, '* needs something to repeat'
1045 if (stack[-1][0].name == '(') or (stack[-1][0].name == '|'):
1046 raise error, '* needs something to repeat'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001047 registers = registers_used(stack[-1])
1048 if (index < len(pattern)) and (pattern[index] == '?'):
1049 # non-greedy matching
1050 expr = [JumpInstructions(label + 1),
1051 Label(label)] + \
1052 stack[-1] + \
1053 [Label(label + 1),
1054 FailureJump(label)]
1055 index = index + 1
1056 else:
1057 # greedy matching
1058 expr = [Label(label),
1059 FailureJump(label + 1)] + \
1060 stack[-1] + \
1061 [StarJump(label),
1062 Label(label + 1)]
1063 del stack[-1]
1064 stack.append(expr)
1065 label = label + 2
1066
1067 elif char == '+':
1068 # positive closure
1069 if len(stack) == 0:
Guido van Rossum5d6de251997-07-11 21:10:17 +00001070 raise error, '+ needs something to repeat'
1071 if (stack[-1][0].name == '(') or (stack[-1][0].name == '|'):
1072 raise error, '+ needs something to repeat'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001073 registers = registers_used(stack[-1])
1074 if (index < len(pattern)) and (pattern[index] == '?'):
1075 # non-greedy
1076 expr = [Label(label)] + \
1077 stack[-1] + \
1078 [FailureJump(label)]
1079 label = label + 1
1080 index = index + 1
1081 else:
1082 # greedy
1083 expr = [DummyFailureJump(label + 1),
1084 Label(label),
1085 FailureJump(label + 2),
1086 Label(label + 1)] + \
1087 stack[-1] + \
1088 [StarJump(label),
1089 Label(label + 2)]
1090 label = label + 3
1091 del stack[-1]
1092 stack.append(expr)
1093
1094 elif char == '?':
1095 if len(stack) == 0:
1096 raise error, 'need something to be optional'
1097 registers = registers_used(stack[-1])
1098 if (index < len(pattern)) and (pattern[index] == '?'):
1099 # non-greedy matching
1100 expr = [FailureJump(label),
1101 Jump(label + 1),
1102 Label(label)] + \
1103 stack[-1] + \
1104 [Label(label + 1)]
1105 label = label + 2
1106 index = index + 1
1107 else:
1108 # greedy matching
1109 expr = [FailureJump(label)] + \
1110 stack[-1] + \
1111 [Label(label)]
1112 label = label + 1
1113 del stack[-1]
1114 stack.append(expr)
1115
1116 elif char == '.':
1117 if flags & DOTALL:
1118 stack.append(Set(map(chr, range(256))))
1119 else:
1120 stack.append([AnyChar()])
1121
1122 elif char == '^':
1123 if flags & MULTILINE:
1124 stack.append([Bol()])
1125 else:
1126 stack.append([BegBuf()])
1127
1128 elif char == '$':
1129 if flags & MULTILINE:
1130 stack.append([Eol()])
1131 else:
1132 stack.append([EndBuf()])
1133
1134 elif char == '#':
1135 if flags & VERBOSE:
1136 # comment
1137 index = index + 1
1138 end = string.find(pattern, '\n', index)
1139 if end == -1:
1140 index = len(pattern)
1141 else:
1142 index = end + 1
1143 else:
1144 stack.append([Exact(char)])
1145
1146 elif char in string.whitespace:
Guido van Rossum04a1d741997-07-15 14:38:13 +00001147 if not (flags & VERBOSE):
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001148 stack.append([Exact(char)])
1149
1150 elif char == '[':
1151 if index >= len(pattern):
1152 raise error, 'incomplete set'
1153 negate = 0
1154 last = ''
1155 set = []
Guido van Rossum04a1d741997-07-15 14:38:13 +00001156
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001157 if pattern[index] == '^':
1158 negate = 1
1159 index = index + 1
1160 if index >= len(pattern):
1161 raise error, 'incomplete set'
Guido van Rossum04a1d741997-07-15 14:38:13 +00001162
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001163 while (index < len(pattern)) and (pattern[index] != ']'):
1164 next = pattern[index]
1165 index = index + 1
1166 if next == '-':
Guido van Rossum04a1d741997-07-15 14:38:13 +00001167 if last == '':
1168 raise error, 'improper use of range in character set'
1169
1170 start = last
1171
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001172 if (index >= len(pattern)) or (pattern[index] == ']'):
1173 raise error, 'incomplete range in set'
Guido van Rossum04a1d741997-07-15 14:38:13 +00001174
1175 if pattern[index] == '\\':
1176 escape_type, value, index = expand_escape(pattern,
1177 index + 1,
1178 CHARCLASS)
1179
1180 if escape_type == CHAR:
1181 end = value
1182 else:
1183 raise error, ('illegal escape in character '
1184 'class range')
1185 else:
1186 end = pattern[index]
1187
1188 if start > end:
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001189 raise error, 'range arguments out of order in set'
Guido van Rossum04a1d741997-07-15 14:38:13 +00001190 for char in map(chr, range(ord(start), ord(end) + 1)):
1191 if char not in set:
1192 set.append(char)
1193
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001194 last = ''
1195 index = index + 1
1196
1197 elif next == '\\':
1198 # expand syntax meta-characters and add to set
1199 if index >= len(pattern):
1200 raise error, 'incomplete set'
Guido van Rossum04a1d741997-07-15 14:38:13 +00001201
1202 escape_type, value, index = expand_escape(pattern,
1203 index,
1204 CHARCLASS)
1205
1206 if escape_type == CHAR:
1207 set.append(value)
1208 last = value
1209
1210 elif escape_type == SET:
1211 for char in value:
1212 if char not in set:
1213 set.append(char)
1214 last = ''
1215
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001216 else:
Guido van Rossum04a1d741997-07-15 14:38:13 +00001217 raise error, 'illegal escape type in character class'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001218
1219 else:
1220 if next not in set:
1221 set.append(next)
1222 last = next
Guido van Rossum04a1d741997-07-15 14:38:13 +00001223
1224 if (index >= len(pattern)) or ( pattern[index] != ']'):
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001225 raise error, 'incomplete set'
1226
1227 index = index + 1
1228
1229 if negate:
1230 notset = []
1231 for char in map(chr, range(256)):
1232 if char not in set:
1233 notset.append(char)
Guido van Rossum04a1d741997-07-15 14:38:13 +00001234 if len(notset) == 0:
1235 raise error, 'empty negated set'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001236 stack.append([Set(notset)])
1237 else:
Guido van Rossum04a1d741997-07-15 14:38:13 +00001238 if len(set) == 0:
1239 raise error, 'empty set'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001240 stack.append([Set(set)])
1241
1242 else:
1243 stack.append([Exact(char)])
1244
1245 code = []
1246 while len(stack) > 0:
1247 if stack[-1][0].name == '(':
1248 raise error, 'too many open parens'
1249 code = stack[-1] + code
1250 del stack[-1]
1251 if len(code) == 0:
1252 raise error, 'no code generated'
1253 if (code[-1].name == '|'):
Guido van Rossum04a1d741997-07-15 14:38:13 +00001254 raise error, 'alternate with nothing on the right'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001255 code = filter(lambda x: x.name != '|', code)
1256 need_label = 0
1257 for i in range(len(code)):
1258 if (code[i].name == 'jump') and (code[i].label == -1):
1259 code[i] = Jump(label)
1260 need_label = 1
1261 if need_label:
1262 code.append(Label(label))
1263 label = label + 1
1264 code.append(End())
1265 return RegexObject(pattern, flags, code, register, groupindex, callouts)
1266
1267if __name__ == '__main__':
1268 print compile('a(b)*')
1269 print compile('a{3}')
1270 print compile('(a){2}')
1271 print compile('a{2,4}')
1272 print compile('a|b')
1273 print compile('a(b|c)')
1274 print compile('a*')
1275 print compile('a+')
1276 print compile('a|b|c')
1277 print compile('a(b|c)*')
1278 print compile('\\n')
1279 print compile('a(?# huh huh)b')
1280 print compile('[a-c\\w]')
1281 print compile('[[]')
1282 print compile('[]]')
1283 print compile('(<hello>a)')
1284 print compile('\Q*\e')
1285 print compile('a{0,}')
1286