blob: e9b20c5c86b301e790c0a2b56945c4f74a64a0cf [file] [log] [blame]
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001#!/usr/bin/env python
2# -*- mode: python -*-
3# $Id$
4
5import string
6import reop
7
8error = 're error'
9
10# compilation flags
11
12IGNORECASE = I = 0x01
13
14MULTILINE = M = 0x02
15DOTALL = S = 0x04
16VERBOSE = X = 0x08
17
18#
19# Initialize syntax table. This information should really come from the
20# syntax table in regexpr.c rather than being duplicated here.
21#
22
23syntax_table = {}
24
25for char in map(chr, range(0, 256)):
26 syntax_table[char] = []
27
28for char in string.lowercase:
29 syntax_table[char].append('word')
30
31for char in string.uppercase:
32 syntax_table[char].append('word')
33
34for char in string.digits:
35 syntax_table[char].append('word')
36 syntax_table[char].append('digit')
37
38for char in string.whitespace:
39 syntax_table[char].append('whitespace')
40
41syntax_table['_'].append('word')
42
43#
44#
45#
46
Guido van Rossum09bcfd61997-07-15 15:38:20 +000047def valid_identifier(id):
48 if len(id) == 0:
49 return 0
50 if ('word' not in syntax_table[id[0]]) or ('digit' in syntax_table[id[0]]):
51 return 0
52 for char in id[1:]:
53 if 'word' not in syntax_table[char]:
54 return 0
55 return 1
56
57#
58#
59#
60
Guido van Rossum5ca1b711997-07-10 21:00:31 +000061def match(pattern, string, flags=0):
62 return compile(pattern, flags).match(string)
63
64def search(pattern, string, flags=0):
65 return compile(pattern, flags).search(string)
66
67def sub(pattern, repl, string, count=0):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +000068 return compile(pattern).sub(repl, string, count)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000069
70def subn(pattern, repl, string, count=0):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +000071 return compile(pattern).subn(repl, string, count)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000072
Guido van Rossum8a9a4a21997-07-11 20:48:25 +000073def split(pattern, string, maxsplit=0):
74 return compile(pattern).subn(string, maxsplit)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000075
76#
77#
78#
79
80class RegexObject:
81 def __init__(self, pattern, flags, code, num_regs, groupindex, callouts):
82 self.code = code
83 self.num_regs = num_regs
84 self.flags = flags
85 self.pattern = pattern
86 self.groupindex = groupindex
87 self.callouts = callouts
88 self.fastmap = build_fastmap(code)
89 if code[0].name == 'bol':
90 self.anchor = 1
91 elif code[0].name == 'begbuf':
92 self.anchor = 2
93 else:
94 self.anchor = 0
95 self.buffer = assemble(code)
Guido van Rossum5ca1b711997-07-10 21:00:31 +000096 def search(self, string, pos=0):
97 regs = reop.search(self.buffer,
98 self.num_regs,
99 self.flags,
100 self.fastmap.can_be_null,
101 self.fastmap.fastmap(),
102 self.anchor,
103 string,
104 pos)
105 if regs is None:
106 return None
107 return MatchObject(self,
108 string,
109 pos,
110 regs)
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000111 def match(self, string, pos=0):
112 regs = reop.match(self.buffer,
113 self.num_regs,
114 self.flags,
115 self.fastmap.can_be_null,
116 self.fastmap.fastmap(),
117 self.anchor,
118 string,
119 pos)
120 if regs is None:
121 return None
122 return MatchObject(self,
123 string,
124 pos,
125 regs)
126 def sub(self, repl, string, count=0):
127 pass
128 def subn(self, repl, string, count=0):
129 pass
130 def split(self, string, maxsplit=0):
131 pass
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000132
133class MatchObject:
134 def __init__(self, re, string, pos, regs):
135 self.re = re
136 self.string = string
137 self.pos = pos
138 self.regs = regs
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000139 def start(self, g):
140 if type(g) == type(''):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000141 try:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000142 g = self.re.groupindex[g]
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000143 except (KeyError, TypeError):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000144 raise IndexError, ('group "' + g + '" is undefined')
145 return self.regs[g][0]
146 def end(self, g):
147 if type(g) == type(''):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000148 try:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000149 g = self.re.groupindex[g]
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000150 except (KeyError, TypeError):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000151 raise IndexError, ('group "' + g + '" is undefined')
152 return self.regs[g][1]
153 def span(self, g):
154 if type(g) == type(''):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000155 try:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000156 g = self.re.groupindex[g]
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000157 except (KeyError, TypeError):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000158 raise IndexError, ('group "' + g + '" is undefined')
159 return self.regs[g]
160 def group(self, *groups):
161 if len(groups) == 0:
162 groups = range(1, self.re.num_regs)
Guido van Rossum53109751997-07-15 15:40:29 +0000163 use_all = 1
164 else:
165 use_all = 0
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000166 result = []
167 for g in groups:
168 if type(g) == type(''):
169 try:
170 g = self.re.groupindex[g]
171 except (KeyError, TypeError):
172 raise IndexError, ('group "' + g + '" is undefined')
Guido van Rossum04a1d741997-07-15 14:38:13 +0000173 if (self.regs[g][0] == -1) or (self.regs[g][1] == -1):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000174 result.append(None)
175 else:
176 result.append(self.string[self.regs[g][0]:self.regs[g][1]])
Guido van Rossum53109751997-07-15 15:40:29 +0000177 if use_all or len(result) > 1:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000178 return tuple(result)
179 elif len(result) == 1:
180 return result[0]
181 else:
182 return ()
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000183
184#
185# A set of classes to make assembly a bit easier, if a bit verbose.
186#
187
188class Instruction:
189 def __init__(self, opcode, size=1):
190 self.opcode = opcode
191 self.size = size
192 def assemble(self, position, labels):
193 return self.opcode
194 def __repr__(self):
195 return '%-15s' % (self.name)
196
197class FunctionCallout(Instruction):
198 name = 'function'
199 def __init__(self, function):
200 self.function = function
201 Instruction.__init__(self, chr(22), 2 + len(self.function))
202 def assemble(self, position, labels):
203 return self.opcode + chr(len(self.function)) + self.function
204 def __repr__(self):
205 return '%-15s %-10s' % (self.name, self.function)
206
207class End(Instruction):
208 name = 'end'
209 def __init__(self):
210 Instruction.__init__(self, chr(0))
211
212class Bol(Instruction):
213 name = 'bol'
214 def __init__(self):
215 self.name = 'bol'
216 Instruction.__init__(self, chr(1))
217
218class Eol(Instruction):
219 name = 'eol'
220 def __init__(self):
221 Instruction.__init__(self, chr(2))
222
223class Set(Instruction):
224 name = 'set'
225 def __init__(self, set):
226 self.set = set
227 Instruction.__init__(self, chr(3), 33)
Guido van Rossum63e18191997-07-11 11:08:38 +0000228 def assemble(self, position, labels):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000229 result = self.opcode
230 temp = 0
231 for i, c in map(lambda x: (x, chr(x)), range(256)):
Guido van Rossum63e18191997-07-11 11:08:38 +0000232 if c in self.set:
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000233 temp = temp | (1 << (i & 7))
234 if (i % 8) == 7:
235 result = result + chr(temp)
236 temp = 0
237 return result
238 def __repr__(self):
239 result = '%-15s' % (self.name)
240 self.set.sort()
241 for char in self.set:
Guido van Rossum63e18191997-07-11 11:08:38 +0000242 result = result + char
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000243 return result
244
245class Exact(Instruction):
246 name = 'exact'
247 def __init__(self, char):
248 self.char = char
249 Instruction.__init__(self, chr(4), 2)
250 def assemble(self, position, labels):
251 return self.opcode + self.char
252 def __repr__(self):
253 return '%-15s %s' % (self.name, `self.char`)
254
255class AnyChar(Instruction):
256 name = 'anychar'
257 def __init__(self):
258 Instruction.__init__(self, chr(5))
259 def assemble(self, position, labels):
260 return self.opcode
261
262class MemoryInstruction(Instruction):
263 def __init__(self, opcode, register):
264 self.register = register
265 Instruction.__init__(self, opcode, 2)
266 def assemble(self, position, labels):
267 return self.opcode + chr(self.register)
268 def __repr__(self):
269 return '%-15s %i' % (self.name, self.register)
270
271class StartMemory(MemoryInstruction):
272 name = 'start_memory'
273 def __init__(self, register):
274 MemoryInstruction.__init__(self, chr(6), register)
275
276class EndMemory(MemoryInstruction):
277 name = 'end_memory'
278 def __init__(self, register):
279 MemoryInstruction.__init__(self, chr(7), register)
280
281class MatchMemory(MemoryInstruction):
282 name = 'match_memory'
283 def __init__(self, register):
284 MemoryInstruction.__init__(self, chr(8), register)
285
286class JumpInstruction(Instruction):
287 def __init__(self, opcode, label):
288 self.label = label
289 Instruction.__init__(self, opcode, 3)
290 def compute_offset(self, start, dest):
291 return dest - (start + 3)
292 def pack_offset(self, offset):
293 if offset > 32767:
294 raise error, 'offset out of range (pos)'
295 elif offset < -32768:
296 raise error, 'offset out of range (neg)'
297 elif offset < 0:
298 offset = offset + 65536
299 return chr(offset & 0xff) + chr((offset >> 8) & 0xff)
300 def assemble(self, position, labels):
301 return self.opcode + \
302 self.pack_offset(self.compute_offset(position,
303 labels[self.label]))
304 def __repr__(self):
305 return '%-15s %i' % (self.name, self.label)
306
307class Jump(JumpInstruction):
308 name = 'jump'
309 def __init__(self, label):
310 JumpInstruction.__init__(self, chr(9), label)
311
312class StarJump(JumpInstruction):
313 name = 'star_jump'
314 def __init__(self, label):
315 JumpInstruction.__init__(self, chr(10), label)
316
317class FailureJump(JumpInstruction):
318 name = 'failure_jump'
319 def __init__(self, label):
320 JumpInstruction.__init__(self, chr(11), label)
321
322class UpdateFailureJump(JumpInstruction):
323 name = 'update_failure_jump'
324 def __init__(self, label):
325 JumpInstruction.__init__(self, chr(12), label)
326
327class DummyFailureJump(JumpInstruction):
328 name = 'update_failure_jump'
329 def __init__(self, label):
330 JumpInstruction.__init__(self, chr(13), label)
331
332class BegBuf(Instruction):
333 name = 'begbuf'
334 def __init__(self):
335 Instruction.__init__(self, chr(14))
336
337class EndBuf(Instruction):
338 name = 'endbuf'
339 def __init__(self):
340 Instruction.__init__(self, chr(15))
341
342class WordBeg(Instruction):
343 name = 'wordbeg'
344 def __init__(self):
345 Instruction.__init__(self, chr(16))
346
347class WordEnd(Instruction):
348 name = 'wordend'
349 def __init__(self):
350 Instruction.__init__(self, chr(17))
351
352class WordBound(Instruction):
353 name = 'wordbound'
354 def __init__(self):
355 Instruction.__init__(self, chr(18))
356
357class NotWordBound(Instruction):
358 name = 'notwordbound'
359 def __init__(self):
360 Instruction.__init__(self, chr(18))
361
362class SyntaxSpec(Instruction):
363 name = 'syntaxspec'
364 def __init__(self, syntax):
365 self.syntax = syntax
366 Instruction.__init__(self, chr(20), 2)
367 def assemble(self, postition, labels):
Guido van Rossum5d6de251997-07-11 21:10:17 +0000368 # XXX
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000369 return self.opcode + chr(self.syntax)
370
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000371class NotSyntaxSpec(Instruction):
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000372 name = 'notsyntaxspec'
373 def __init__(self, syntax):
374 self.syntax = syntax
375 Instruction.__init__(self, chr(21), 2)
376 def assemble(self, postition, labels):
Guido van Rossum5d6de251997-07-11 21:10:17 +0000377 # XXX
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000378 return self.opcode + chr(self.syntax)
379
380class Label(Instruction):
381 name = 'label'
382 def __init__(self, label):
383 self.label = label
384 Instruction.__init__(self, '', 0)
385 def __repr__(self):
386 return '%-15s %i' % (self.name, self.label)
387
388class OpenParen(Instruction):
389 name = '('
390 def __init__(self, register):
391 self.register = register
392 Instruction.__init__(self, '', 0)
Guido van Rossum5d6de251997-07-11 21:10:17 +0000393 def assemble(self, position, labels):
394 raise error, 'unmatched open parenthesis'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000395
396class Alternation(Instruction):
397 name = '|'
398 def __init__(self):
399 Instruction.__init__(self, '', 0)
Guido van Rossum5d6de251997-07-11 21:10:17 +0000400 def assemble(self, position, labels):
401 raise error, 'an alternation was not taken care of'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000402
403#
404#
405#
406
407def assemble(instructions):
408 labels = {}
409 position = 0
410 pass1 = []
411 for instruction in instructions:
412 if instruction.name == 'label':
413 labels[instruction.label] = position
414 else:
415 pass1.append((position, instruction))
416 position = position + instruction.size
417 pass2 = ''
418 for position, instruction in pass1:
419 pass2 = pass2 + instruction.assemble(position, labels)
420 return pass2
421
422#
423#
424#
425
426def escape(pattern):
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000427 result = []
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000428 for char in pattern:
429 if 'word' not in syntax_table[char]:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000430 result.append('\\')
431 result.append(char)
432 return string.join(result, '')
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000433
434#
435#
436#
437
438def registers_used(instructions):
439 result = []
440 for instruction in instructions:
441 if (instruction.name in ['set_memory', 'end_memory']) and \
442 (instruction.register not in result):
443 result.append(instruction.register)
444 return result
445
446#
447#
448#
449
450class Fastmap:
451 def __init__(self):
452 self.map = ['\000']*256
453 self.can_be_null = 0
454 def add(self, char):
455 self.map[ord(char)] = '\001'
456 def fastmap(self):
457 return string.join(self.map, '')
458 def __getitem__(self, char):
459 return ord(self.map[ord(char)])
460 def __repr__(self):
461 self.map.sort()
462 return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')'
463
464#
465#
466#
467
468def find_label(code, label):
469 line = 0
470 for instruction in code:
471 if (instruction.name == 'label') and (instruction.label == label):
472 return line + 1
473 line = line + 1
474
475def build_fastmap_aux(code, pos, visited, fastmap):
476 if visited[pos]:
477 return
478 while 1:
479 instruction = code[pos]
480 visited[pos] = 1
481 pos = pos + 1
482 if instruction.name == 'end':
483 fastmap.can_be_null = 1
484 return
485 elif instruction.name == 'syntaxspec':
486 for char in map(chr, range(256)):
487 if instruction.syntax in syntax_table[char]:
488 fastmap.add(char)
489 return
490 elif instruction.name == 'notsyntaxspec':
491 for char in map(chr, range(256)):
492 if instruction.syntax not in syntax_table[char]:
493 fastmap.add(char)
494 return
495 elif instruction.name == 'eol':
496 fastmap.add('\n')
497 if fastmap.can_be_null == 0:
498 fastmap.can_be_null = 2
499 return
500 elif instruction.name == 'set':
501 for char in instruction.set:
502 fastmap.add(char)
503 return
504 elif instruction.name == 'exact':
505 fastmap.add(instruction.char)
506 elif instruction.name == 'anychar':
507 for char in map(chr, range(256)):
508 if char != '\n':
509 fastmap.add(char)
510 return
511 elif instruction.name == 'match_memory':
512 for char in map(chr, range(256)):
513 fastmap.add(char)
514 fastmap.can_be_null = 1
515 return
516 elif instruction.name in ['jump', 'dummy_failure_jump', \
517 'update_failure_jump', 'star_jump']:
518 pos = find_label(code, instruction.label)
519 if visited[pos]:
520 return
521 visited[pos] = 1
522 elif instruction.name == 'failure_jump':
523 build_fastmap_aux(code,
524 find_label(code, instruction.label),
525 visited,
526 fastmap)
527 elif instruction.name == 'function':
528 for char in map(chr, range(256)):
529 fastmap.add(char)
530 fastmap.can_be_null = 1
531 return
532
533def build_fastmap(code, pos=0):
534 visited = [0] * len(code)
535 fastmap = Fastmap()
536 build_fastmap_aux(code, pos, visited, fastmap)
537 return fastmap
538
539#
540#
541#
542
Guido van Rossum04a1d741997-07-15 14:38:13 +0000543[NORMAL, CHARCLASS, REPLACEMENT] = range(3)
544[CHAR, MEMORY_REFERENCE, SYNTAX, SET, WORD_BOUNDARY, NOT_WORD_BOUNDARY,
545 BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(8)
546
547def expand_escape(pattern, index, context=NORMAL):
548 if index >= len(pattern):
549 raise error, 'escape ends too soon'
550
551 elif pattern[index] == 't':
552 return CHAR, chr(9), index + 1
553
554 elif pattern[index] == 'n':
555 return CHAR, chr(10), index + 1
556
557 elif pattern[index] == 'r':
558 return CHAR, chr(13), index + 1
559
560 elif pattern[index] == 'f':
561 return CHAR, chr(12), index + 1
562
563 elif pattern[index] == 'a':
564 return CHAR, chr(7), index + 1
565
566 elif pattern[index] == 'e':
567 return CHAR, chr(27), index + 1
568
569 elif pattern[index] == 'c':
570 if index + 1 >= len(pattern):
571 raise error, '\\c must be followed by another character'
572 elif pattern[index + 1] in 'abcdefghijklmnopqrstuvwxyz':
573 return CHAR, chr(ord(pattern[index + 1]) - ord('a') + 1), index + 2
574 else:
575 return CHAR, chr(ord(pattern[index + 1]) ^ 64), index + 2
576
577 elif pattern[index] == 'x':
578 # CAUTION: this is the Python rule, not the Perl rule!
579 end = index
580 while (end < len(pattern)) and (pattern[end] in string.hexdigits):
581 end = end + 1
582 if end == index:
583 raise error, "\\x must be followed by hex digit(s)"
584 # let Python evaluate it, so we don't incorrectly 2nd-guess
585 # what it's doing (and Python in turn passes it on to sscanf,
586 # so that *it* doesn't incorrectly 2nd-guess what C does!)
587 char = eval ('"' + pattern[index-2:end] + '"')
588 assert len(char) == 1
589 return CHAR, char, end
590
591 elif pattern[index] == 'b':
592 if context != NORMAL:
593 return CHAR, chr(8), index + 1
594 else:
595 return WORD_BOUNDARY, '', index + 1
596
597 elif pattern[index] == 'B':
598 if context != NORMAL:
599 return CHAR, 'B', index + 1
600 else:
601 return NOT_WORD_BOUNDARY, '', index + 1
602
603 elif pattern[index] == 'A':
604 if context != NORMAL:
605 return CHAR, 'A', index + 1
606 else:
607 return BEGINNING_OF_BUFFER, '', index + 1
608
609 elif pattern[index] == 'Z':
610 if context != NORMAL:
611 return 'Z', index + 1
612 else:
613 return END_OF_BUFFER, '', index + 1
614
615 elif pattern[index] in 'GluLUQE':
616 raise error, ('\\' + ch + ' is not allowed')
617
618 elif pattern[index] == 'w':
619 if context == NORMAL:
620 return SYNTAX, 'word', index + 1
621 elif context == CHARCLASS:
622 set = []
623 for char in syntax_table.keys():
624 if 'word' in syntax_table[char]:
625 set.append(char)
626 return SET, set, index + 1
627 else:
628 return CHAR, 'w', index + 1
629
630 elif pattern[index] == 'W':
631 if context == NORMAL:
632 return NOT_SYNTAX, 'word', index + 1
633 elif context == CHARCLASS:
634 set = []
635 for char in syntax_table.keys():
636 if 'word' not in syntax_table[char]:
637 set.append(char)
638 return SET, set, index + 1
639 else:
640 return CHAR, 'W', index + 1
641
642 elif pattern[index] == 's':
643 if context == NORMAL:
644 return SYNTAX, 'whitespace', index + 1
645 elif context == CHARCLASS:
646 set = []
647 for char in syntax_table.keys():
648 if 'whitespace' in syntax_table[char]:
649 set.append(char)
650 return SET, set, index + 1
651 else:
652 return CHAR, 's', index + 1
653
654 elif pattern[index] == 'S':
655 if context == NORMAL:
656 return NOT_SYNTAX, 'whitespace', index + 1
657 elif context == CHARCLASS:
658 set = []
659 for char in syntax_table.keys():
660 if 'whitespace' not in syntax_table[char]:
661 set.append(char)
662 return SET, set, index + 1
663 else:
664 return CHAR, 'S', index + 1
665
666 elif pattern[index] == 'd':
667 if context == NORMAL:
668 return SYNTAX, 'digit', index + 1
669 elif context == CHARCLASS:
670 set = []
671 for char in syntax_table.keys():
672 if 'digit' in syntax_table[char]:
673 set.append(char)
674 return SET, set, index + 1
675 else:
676 return CHAR, 'd', index + 1
677
678 elif pattern[index] == 'D':
679 if context == NORMAL:
680 return NOT_SYNTAX, 'digit', index + 1
681 elif context == CHARCLASS:
682 set = []
683 for char in syntax_table.keys():
684 if 'digit' not in syntax_table[char]:
685 set.append(char)
686 return SET, set, index + 1
687 else:
688 return CHAR, 'D', index + 1
689
690 elif pattern[index] in '0123456789':
Guido van Rossum04a1d741997-07-15 14:38:13 +0000691
Guido van Rossum9f845ec1997-07-15 18:11:42 +0000692 if pattern[index] == '0':
693 if (index + 1 < len(pattern)) and \
694 (pattern[index + 1] in string.octdigits):
695 if (index + 2 < len(pattern)) and \
696 (pattern[index + 2] in string.octdigits):
697 value = string.atoi(pattern[index:index + 3], 8)
698 index = index + 3
699
700 else:
701 value = string.atoi(pattern[index:index + 2], 8)
702 index = index + 2
703
704 else:
705 value = 0
706 index = index + 1
707
Guido van Rossum04a1d741997-07-15 14:38:13 +0000708 if value > 255:
Guido van Rossum9f845ec1997-07-15 18:11:42 +0000709 raise error, 'octal value out of range'
Guido van Rossum04a1d741997-07-15 14:38:13 +0000710
Guido van Rossum9f845ec1997-07-15 18:11:42 +0000711 return CHAR, chr(value), index
712
Guido van Rossum04a1d741997-07-15 14:38:13 +0000713 else:
Guido van Rossum9f845ec1997-07-15 18:11:42 +0000714 if (index + 1 < len(pattern)) and \
715 (pattern[index + 1] in string.digits):
716 if (index + 2 < len(pattern)) and \
717 (pattern[index + 2] in string.octdigits) and \
718 (pattern[index + 1] in string.octdigits) and \
719 (pattern[index] in string.octdigits):
720 value = string.atoi(pattern[index:index + 3], 8)
721 if value > 255:
722 raise error, 'octal value out of range'
723
724 return CHAR, chr(value), index + 3
725
726 else:
727 value = string.atoi(pattern[index:index + 2])
728 if (value < 1) or (value > 99):
729 raise error, 'memory reference out of range'
730
731 if context == CHARCLASS:
732 raise error, ('cannot reference a register from '
733 'inside a character class')
734 return MEMORY_REFERENCE, value, index + 2
735
736 else:
737 if context == CHARCLASS:
738 raise error, ('cannot reference a register from '
739 'inside a character class')
740
741 value = string.atoi(pattern[index])
742 return MEMORY_REFERENCE, value, index + 1
743
744 while (end < len(pattern)) and (pattern[end] in string.digits):
745 end = end + 1
746 value = pattern[index:end]
Guido van Rossum04a1d741997-07-15 14:38:13 +0000747
748 else:
749 return CHAR, pattern[index], index + 1
750
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000751def compile(pattern, flags=0):
752 stack = []
753 index = 0
754 label = 0
755 register = 1
756 groupindex = {}
757 callouts = []
758 while (index < len(pattern)):
759 char = pattern[index]
760 index = index + 1
761 if char == '\\':
Guido van Rossum04a1d741997-07-15 14:38:13 +0000762 escape_type, value, index = expand_escape(pattern, index)
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000763
Guido van Rossum04a1d741997-07-15 14:38:13 +0000764 if escape_type == CHAR:
765 stack.append([Exact(value)])
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000766
Guido van Rossum04a1d741997-07-15 14:38:13 +0000767 elif escape_type == MEMORY_REFERENCE:
768 if value >= register:
769 raise error, ('cannot reference a register '
770 'not yet used')
771 stack.append([MatchMemory(value)])
772
773 elif escape_type == BEGINNING_OF_BUFFER:
774 stack.append([BegBuf()])
775
776 elif escape_type == END_OF_BUFFER:
777 stack.append([EndBuf()])
778
779 elif escape_type == WORD_BOUNDARY:
780 stack.append([WordBound()])
781
782 elif escape_type == NOT_WORD_BOUNDARY:
783 stack.append([NotWordBound()])
784
785 elif escape_type == SYNTAX:
786 stack.append([SyntaxSpec(value)])
787
788 elif escape_type == NOT_SYNTAX:
789 stack.append([NotSyntaxSpec(value)])
790
791 elif escape_type == SET:
792 raise error, 'cannot use set escape type here'
793
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000794 else:
Guido van Rossum04a1d741997-07-15 14:38:13 +0000795 raise error, 'unknown escape type'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000796
797 elif char == '|':
798 if len(stack) == 0:
Guido van Rossum04a1d741997-07-15 14:38:13 +0000799 raise error, 'alternate with nothing on the left'
800 if stack[-1][0].name == '(':
801 raise error, 'alternate with nothing on the left in the group'
802 if stack[-1][0].name == '|':
803 raise error, 'alternates with nothing inbetween them'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000804 expr = []
Guido van Rossum04a1d741997-07-15 14:38:13 +0000805
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000806 while (len(stack) != 0) and \
807 (stack[-1][0].name != '(') and \
808 (stack[-1][0].name != '|'):
809 expr = stack[-1] + expr
810 del stack[-1]
811 stack.append([FailureJump(label)] + \
812 expr + \
813 [Jump(-1),
814 Label(label)])
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000815 stack.append([Alternation()])
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000816 label = label + 1
817
818 elif char == '(':
819 if index >= len(pattern):
820 raise error, 'no matching close paren'
821
822 elif pattern[index] == '?':
823 # Perl style (?...) extensions
824 index = index + 1
825 if index >= len(pattern):
826 raise error, 'extension ends prematurely'
827
828 elif pattern[index] == 'P':
829 # Python extensions
830 index = index + 1
831 if index >= len(pattern):
832 raise error, 'extension ends prematurely'
833
834 elif pattern[index] == '<':
Guido van Rossum09bcfd61997-07-15 15:38:20 +0000835 # Handle Python symbolic group names (?P<...>...)
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000836 index = index + 1
837 end = string.find(pattern, '>', index)
838 if end == -1:
839 raise error, 'no end to symbolic group name'
840 name = pattern[index:end]
Guido van Rossum09bcfd61997-07-15 15:38:20 +0000841 if not valid_identifier(name):
842 raise error, ('symbolic group name must be a '
843 'valid identifier')
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000844 index = end + 1
845 groupindex[name] = register
846 stack.append([OpenParen(register)])
847 register = register + 1
848
849 elif pattern[index] == '=':
850 # backreference to symbolic group name
851 if index >= len(pattern):
852 raise error, '(?P= at the end of the pattern'
853 start = index + 1
854 end = string.find(pattern, ')', start)
855 if end == -1:
856 raise error, 'no ) to end symbolic group name'
857 name = pattern[start:end]
Guido van Rossum09bcfd61997-07-15 15:38:20 +0000858 if name not in groupindex.keys():
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000859 raise error, ('symbolic group name ' + name + \
860 ' has not been used yet')
861 stack.append([MatchMemory(groupindex[name])])
862 index = end + 1
863
864 elif pattern[index] == '!':
865 # function callout
866 if index >= len(pattern):
867 raise error, 'no function callout name'
868 start = index + 1
869 end = string.find(pattern, ')', start)
870 if end == -1:
871 raise error, 'no ) to end function callout name'
872 name = pattern[start:end]
873 if name not in callouts:
874 raise error, ('function callout name not listed '
875 'in callouts dict')
876 stack.append([FunctionCallout(name)])
877
878 else:
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000879 raise error, ('unknown Python extension: ' + \
880 pattern[index])
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000881
882 elif pattern[index] == ':':
883 # grouping, but no registers
884 index = index + 1
Guido van Rossum8a9a4a21997-07-11 20:48:25 +0000885 stack.append([OpenParen(-1)])
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000886
887 elif pattern[index] == '#':
888 # comment
889 index = index + 1
890 end = string.find(pattern, ')', index)
891 if end == -1:
892 raise error, 'no end to comment'
893 index = end + 1
894
895 elif pattern[index] == '=':
896 raise error, ('zero-width positive lookahead '
897 'assertion is unsupported')
898
899 elif pattern[index] == '!':
900 raise error, ('zero-width negative lookahead '
901 'assertion is unsupported')
902
903 elif pattern[index] in 'iImMsSxX':
904 while (index < len(pattern)) and (pattern[index] != ')'):
Guido van Rossum65c28b71997-07-11 11:10:44 +0000905 if pattern[index] in 'iI':
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000906 flags = flags | IGNORECASE
Guido van Rossum65c28b71997-07-11 11:10:44 +0000907 elif pattern[index] in 'mM':
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000908 flags = flags | MULTILINE
Guido van Rossum65c28b71997-07-11 11:10:44 +0000909 elif pattern[index] in 'sS':
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000910 flags = flags | DOTALL
911 elif pattern[index] in 'xX':
912 flags = flags | VERBOSE
913 else:
914 raise error, 'unknown flag'
915 index = index + 1
916 index = index + 1
917
918 else:
919 raise error, 'unknown extension'
920
921 else:
922 stack.append([OpenParen(register)])
923 register = register + 1
924
925 elif char == ')':
926 # make one expression out of everything on the stack up to
927 # the marker left by the last parenthesis
928 expr = []
929 while (len(stack) > 0) and (stack[-1][0].name != '('):
930 expr = stack[-1] + expr
931 del stack[-1]
932
933 if len(stack) == 0:
934 raise error, 'too many close parens'
Guido van Rossum04a1d741997-07-15 14:38:13 +0000935
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000936 if len(expr) == 0:
937 raise error, 'nothing inside parens'
938
939 # check to see if alternation used correctly
940 if (expr[-1].name == '|'):
Guido van Rossum04a1d741997-07-15 14:38:13 +0000941 raise error, 'alternate with nothing on the right'
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000942
943 # remove markers left by alternation
944 expr = filter(lambda x: x.name != '|', expr)
945
946 # clean up jumps inserted by alternation
947 need_label = 0
948 for i in range(len(expr)):
949 if (expr[i].name == 'jump') and (expr[i].label == -1):
Guido van Rossum04a1d741997-07-15 14:38:13 +0000950 expr[i] = Jump(label)
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000951 need_label = 1
952 if need_label:
953 expr.append(Label(label))
954 label = label + 1
955
Guido van Rossum63e18191997-07-11 11:08:38 +0000956 if stack[-1][0].register > 0:
Guido van Rossum5ca1b711997-07-10 21:00:31 +0000957 expr = [StartMemory(stack[-1][0].register)] + \
958 expr + \
959 [EndMemory(stack[-1][0].register)]
960 del stack[-1]
961 stack.append(expr)
962
963 elif char == '{':
964 if len(stack) == 0:
965 raise error, 'no expression to repeat'
966 end = string.find(pattern, '}', index)
967 if end == -1:
968 raise error, ('no close curly bracket to match'
969 ' open curly bracket')
970
971 fields = map(string.strip,
972 string.split(pattern[index:end], ','))
973 index = end + 1
974
975 minimal = 0
976 if (index < len(pattern)) and (pattern[index] == '?'):
977 minimal = 1
978 index = index + 1
979
980 if len(fields) == 1:
981 # {n} or {n}? (there's really no difference)
982 try:
983 count = string.atoi(fields[0])
984 except ValueError:
985 raise error, ('count must be an integer '
986 'inside curly braces')
987 if count > 65535:
988 raise error, 'repeat count out of range'
989 expr = []
990 while count > 0:
991 expr = expr + stack[-1]
992 count = count - 1
993 del stack[-1]
994 stack.append(expr)
995
996 elif len(fields) == 2:
997 # {n,} or {n,m}
998 if fields[1] == '':
999 # {n,}
1000 try:
1001 min = string.atoi(fields[0])
1002 except ValueError:
1003 raise error, ('minimum must be an integer '
1004 'inside curly braces')
1005 if min > 65535:
1006 raise error, 'minimum repeat count out of range'
1007
1008 expr = []
1009 while min > 0:
1010 expr = expr + stack[-1]
1011 min = min - 1
1012 registers = registers_used(stack[-1])
1013 if minimal:
1014 expr = expr + \
1015 ([Jump(label + 1),
1016 Label(label)] + \
1017 stack[-1] + \
1018 [Label(label + 1),
1019 FailureJump(label, registers)])
1020 else:
1021 expr = expr + \
1022 ([Label(label),
1023 FailureJump(label + 1, registers)] +
1024 stack[-1] +
1025 [StarJump(label),
1026 Label(label + 1)])
1027
1028 del stack[-1]
1029 stack.append(expr)
1030 label = label + 2
1031
1032 else:
1033 # {n,m}
1034 try:
1035 min = string.atoi(fields[0])
1036 except ValueError:
1037 raise error, ('minimum must be an integer '
1038 'inside curly braces')
1039 try:
1040 max = string.atoi(fields[1])
1041 except ValueError:
1042 raise error, ('maximum must be an integer '
1043 'inside curly braces')
1044 if min > 65535:
1045 raise error, ('minumim repeat count out '
1046 'of range')
1047 if max > 65535:
1048 raise error, ('maximum repeat count out '
1049 'of range')
1050 if min > max:
1051 raise error, ('minimum repeat count must be '
1052 'less than the maximum '
1053 'repeat count')
1054 expr = []
1055 while min > 0:
1056 expr = expr + stack[-1]
1057 min = min - 1
1058 max = max - 1
1059 if minimal:
1060 while max > 0:
1061 expr = expr + \
1062 [FailureJump(label),
1063 Jump(label + 1),
1064 Label(label)] + \
1065 stack[-1] + \
1066 [Label(label + 1)]
1067 label = label + 2
1068 del stack[-1]
1069 stack.append(expr)
1070 else:
1071 while max > 0:
1072 expr = expr + \
1073 [FailureJump(label)] + \
1074 stack[-1]
1075 max = max - 1
1076 del stack[-1]
1077 stack.append(expr + [Label(label)])
1078 label = label + 1
1079
1080 else:
1081 raise error, ('there need to be one or two fields '
1082 'in a {} expression')
1083 index = end + 1
1084
1085 elif char == '}':
1086 raise error, 'unbalanced close curly brace'
1087
1088 elif char == '*':
1089 # Kleene closure
1090 if len(stack) == 0:
Guido van Rossum5d6de251997-07-11 21:10:17 +00001091 raise error, '* needs something to repeat'
1092 if (stack[-1][0].name == '(') or (stack[-1][0].name == '|'):
1093 raise error, '* needs something to repeat'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001094 registers = registers_used(stack[-1])
1095 if (index < len(pattern)) and (pattern[index] == '?'):
1096 # non-greedy matching
1097 expr = [JumpInstructions(label + 1),
1098 Label(label)] + \
1099 stack[-1] + \
1100 [Label(label + 1),
1101 FailureJump(label)]
1102 index = index + 1
1103 else:
1104 # greedy matching
1105 expr = [Label(label),
1106 FailureJump(label + 1)] + \
1107 stack[-1] + \
1108 [StarJump(label),
1109 Label(label + 1)]
1110 del stack[-1]
1111 stack.append(expr)
1112 label = label + 2
1113
1114 elif char == '+':
1115 # positive closure
1116 if len(stack) == 0:
Guido van Rossum5d6de251997-07-11 21:10:17 +00001117 raise error, '+ needs something to repeat'
1118 if (stack[-1][0].name == '(') or (stack[-1][0].name == '|'):
1119 raise error, '+ needs something to repeat'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001120 registers = registers_used(stack[-1])
1121 if (index < len(pattern)) and (pattern[index] == '?'):
1122 # non-greedy
1123 expr = [Label(label)] + \
1124 stack[-1] + \
1125 [FailureJump(label)]
1126 label = label + 1
1127 index = index + 1
1128 else:
1129 # greedy
1130 expr = [DummyFailureJump(label + 1),
1131 Label(label),
1132 FailureJump(label + 2),
1133 Label(label + 1)] + \
1134 stack[-1] + \
1135 [StarJump(label),
1136 Label(label + 2)]
1137 label = label + 3
1138 del stack[-1]
1139 stack.append(expr)
1140
1141 elif char == '?':
1142 if len(stack) == 0:
1143 raise error, 'need something to be optional'
1144 registers = registers_used(stack[-1])
1145 if (index < len(pattern)) and (pattern[index] == '?'):
1146 # non-greedy matching
1147 expr = [FailureJump(label),
1148 Jump(label + 1),
1149 Label(label)] + \
1150 stack[-1] + \
1151 [Label(label + 1)]
1152 label = label + 2
1153 index = index + 1
1154 else:
1155 # greedy matching
1156 expr = [FailureJump(label)] + \
1157 stack[-1] + \
1158 [Label(label)]
1159 label = label + 1
1160 del stack[-1]
1161 stack.append(expr)
1162
1163 elif char == '.':
1164 if flags & DOTALL:
1165 stack.append(Set(map(chr, range(256))))
1166 else:
1167 stack.append([AnyChar()])
1168
1169 elif char == '^':
1170 if flags & MULTILINE:
1171 stack.append([Bol()])
1172 else:
1173 stack.append([BegBuf()])
1174
1175 elif char == '$':
1176 if flags & MULTILINE:
1177 stack.append([Eol()])
1178 else:
1179 stack.append([EndBuf()])
1180
1181 elif char == '#':
1182 if flags & VERBOSE:
1183 # comment
1184 index = index + 1
1185 end = string.find(pattern, '\n', index)
1186 if end == -1:
1187 index = len(pattern)
1188 else:
1189 index = end + 1
1190 else:
1191 stack.append([Exact(char)])
1192
1193 elif char in string.whitespace:
Guido van Rossum04a1d741997-07-15 14:38:13 +00001194 if not (flags & VERBOSE):
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001195 stack.append([Exact(char)])
1196
1197 elif char == '[':
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001198 # compile character class
1199
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001200 if index >= len(pattern):
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001201 raise error, 'unclosed character class'
1202
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001203 negate = 0
1204 last = ''
1205 set = []
Guido van Rossum04a1d741997-07-15 14:38:13 +00001206
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001207 if pattern[index] == '^':
1208 negate = 1
1209 index = index + 1
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001210 if index >= len(pattern):
1211 raise error, 'unclosed character class'
Guido van Rossum04a1d741997-07-15 14:38:13 +00001212
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001213 if pattern[index] == ']':
1214 set.append(']')
1215 index = index + 1
1216 if index >= len(pattern):
1217 raise error, 'unclosed character class'
1218
1219 elif pattern[index] == '-':
1220 set.append('-')
1221 index = index + 1
1222 if index >= len(pattern):
1223 raise error, 'unclosed character class'
1224
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001225 while (index < len(pattern)) and (pattern[index] != ']'):
1226 next = pattern[index]
1227 index = index + 1
1228 if next == '-':
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001229 if index >= len(pattern):
1230 raise error, 'incomplete range in character class'
Guido van Rossum04a1d741997-07-15 14:38:13 +00001231
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001232 elif pattern[index] == ']':
1233 set.append('-')
Guido van Rossum04a1d741997-07-15 14:38:13 +00001234
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001235 else:
1236 if last == '':
1237 raise error, ('improper use of range in '
1238 'character class')
1239
1240 start = last
1241
1242 if pattern[index] == '\\':
1243 escape_type,
1244 value,
1245 index = expand_escape(pattern,
1246 index + 1,
1247 CHARCLASS)
1248
1249 if escape_type == CHAR:
1250 end = value
1251
1252 else:
1253 raise error, ('illegal escape in character '
1254 'class range')
1255 else:
1256 end = pattern[index]
1257 index = index + 1
1258
1259 if start > end:
1260 raise error, ('range arguments out of order '
1261 'in character class')
1262
1263 for char in map(chr, range(ord(start), ord(end) + 1)):
1264 if char not in set:
1265 set.append(char)
Guido van Rossum04a1d741997-07-15 14:38:13 +00001266
Guido van Rossum09bcfd61997-07-15 15:38:20 +00001267 last = ''
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001268
1269 elif next == '\\':
1270 # expand syntax meta-characters and add to set
1271 if index >= len(pattern):
1272 raise error, 'incomplete set'
Guido van Rossum04a1d741997-07-15 14:38:13 +00001273
1274 escape_type, value, index = expand_escape(pattern,
1275 index,
1276 CHARCLASS)
1277
1278 if escape_type == CHAR:
1279 set.append(value)
1280 last = value
1281
1282 elif escape_type == SET:
1283 for char in value:
1284 if char not in set:
1285 set.append(char)
1286 last = ''
1287
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001288 else:
Guido van Rossum04a1d741997-07-15 14:38:13 +00001289 raise error, 'illegal escape type in character class'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001290
1291 else:
1292 if next not in set:
1293 set.append(next)
1294 last = next
Guido van Rossum04a1d741997-07-15 14:38:13 +00001295
1296 if (index >= len(pattern)) or ( pattern[index] != ']'):
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001297 raise error, 'incomplete set'
1298
1299 index = index + 1
1300
1301 if negate:
1302 notset = []
1303 for char in map(chr, range(256)):
1304 if char not in set:
1305 notset.append(char)
Guido van Rossum04a1d741997-07-15 14:38:13 +00001306 if len(notset) == 0:
1307 raise error, 'empty negated set'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001308 stack.append([Set(notset)])
1309 else:
Guido van Rossum04a1d741997-07-15 14:38:13 +00001310 if len(set) == 0:
1311 raise error, 'empty set'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001312 stack.append([Set(set)])
1313
1314 else:
1315 stack.append([Exact(char)])
1316
1317 code = []
1318 while len(stack) > 0:
1319 if stack[-1][0].name == '(':
1320 raise error, 'too many open parens'
1321 code = stack[-1] + code
1322 del stack[-1]
1323 if len(code) == 0:
1324 raise error, 'no code generated'
1325 if (code[-1].name == '|'):
Guido van Rossum04a1d741997-07-15 14:38:13 +00001326 raise error, 'alternate with nothing on the right'
Guido van Rossum5ca1b711997-07-10 21:00:31 +00001327 code = filter(lambda x: x.name != '|', code)
1328 need_label = 0
1329 for i in range(len(code)):
1330 if (code[i].name == 'jump') and (code[i].label == -1):
1331 code[i] = Jump(label)
1332 need_label = 1
1333 if need_label:
1334 code.append(Label(label))
1335 label = label + 1
1336 code.append(End())
1337 return RegexObject(pattern, flags, code, register, groupindex, callouts)
1338
1339if __name__ == '__main__':
1340 print compile('a(b)*')
1341 print compile('a{3}')
1342 print compile('(a){2}')
1343 print compile('a{2,4}')
1344 print compile('a|b')
1345 print compile('a(b|c)')
1346 print compile('a*')
1347 print compile('a+')
1348 print compile('a|b|c')
1349 print compile('a(b|c)*')
1350 print compile('\\n')
1351 print compile('a(?# huh huh)b')
1352 print compile('[a-c\\w]')
1353 print compile('[[]')
1354 print compile('[]]')
1355 print compile('(<hello>a)')
1356 print compile('\Q*\e')
1357 print compile('a{0,}')
1358