blob: f36647e177d8f87a4ceb5c0309c2a6db15efe8ef [file] [log] [blame]
Guido van Rossumdb25f321997-07-10 14:31:32 +00001/***********************************************************
2Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
3The Netherlands.
4
5 All Rights Reserved
6
7Permission to use, copy, modify, and distribute this software and its
8documentation for any purpose and without fee is hereby granted,
9provided that the above copyright notice appear in all copies and that
10both that copyright notice and this permission notice appear in
11supporting documentation, and that the names of Stichting Mathematisch
12Centrum or CWI or Corporation for National Research Initiatives or
13CNRI not be used in advertising or publicity pertaining to
14distribution of the software without specific, written prior
15permission.
16
17While CWI is the initial source for this software, a modified version
18is made available by the Corporation for National Research Initiatives
19(CNRI) at the Internet address ftp://ftp.python.org.
20
21STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
22REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
23MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
24CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
25DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
26PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
27TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
28PERFORMANCE OF THIS SOFTWARE.
29
30******************************************************************/
31
32/* $Id$ */
33
34/* Regular expression objects */
35/* This uses Tatu Ylonen's copyleft-free reimplementation of
36 GNU regular expressions */
37
38#include "Python.h"
39
40#include <ctype.h>
41
42#include "regexpr.h"
43
44static PyObject *ReopError; /* Exception */
45
/* Pattern flags, passed as a bit mask in the `flags' argument of
   match() and search().  Only IGNORECASE is tested by the C code in
   this file. */
#define IGNORECASE (1 << 0)
#define MULTILINE  (1 << 1)
#define DOTALL     (1 << 2)
#define VERBOSE    (1 << 3)

/* Values of the optional `context' argument of expand_escape(). */
#define NORMAL      0   /* default context */
#define CHARCLASS   1   /* escape appears inside a character class */
#define REPLACEMENT 2   /* escape appears in a replacement string */

/* Escape types: first item of the tuple built by expand_escape(). */
#define CHAR                0
#define MEMORY_REFERENCE    1
#define SYNTAX              2
#define NOT_SYNTAX          3
#define SET                 4
#define WORD_BOUNDARY       5
#define NOT_WORD_BOUNDARY   6
#define BEGINNING_OF_BUFFER 7
#define END_OF_BUFFER       8

64
Guido van Rossum95e80531997-08-13 22:34:14 +000065static unsigned char *reop_casefold;
Guido van Rossum74fb3031997-07-17 22:41:38 +000066
Guido van Rossumdb25f321997-07-10 14:31:32 +000067static PyObject *
68makeresult(regs, num_regs)
69 struct re_registers *regs;
70 int num_regs;
71{
72 PyObject *v;
73 int i;
74 static PyObject *filler = NULL;
75
76 if (filler == NULL) {
77 filler = Py_BuildValue("(ii)", -1, -1);
78 if (filler == NULL)
79 return NULL;
80 }
81 v = PyTuple_New(num_regs);
82 if (v == NULL)
83 return NULL;
84
85 for (i = 0; i < num_regs; i++) {
86 int lo = regs->start[i];
87 int hi = regs->end[i];
88 PyObject *w;
89 if (lo == -1 && hi == -1) {
90 w = filler;
91 Py_INCREF(w);
92 }
93 else
94 w = Py_BuildValue("(ii)", lo, hi);
95 if (w == NULL || PyTuple_SetItem(v, i, w) < 0) {
96 Py_DECREF(v);
97 return NULL;
98 }
99 }
100 return v;
101}
102
103static PyObject *
104reop_match(self, args)
105 PyObject *self;
106 PyObject *args;
107{
Guido van Rossum95e80531997-08-13 22:34:14 +0000108 unsigned char *string;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000109 int fastmaplen, stringlen;
110 int can_be_null, anchor, i;
Guido van Rossum04a1d741997-07-15 14:38:13 +0000111 int flags, pos, result;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000112 struct re_pattern_buffer bufp;
113 struct re_registers re_regs;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000114 PyObject *modules = NULL;
115 PyObject *reopmodule = NULL;
116 PyObject *reopdict = NULL;
117 PyObject *casefold = NULL;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000118
119 if (!PyArg_Parse(args, "(s#iiis#is#i)",
120 &(bufp.buffer), &(bufp.allocated),
Guido van Rossum04a1d741997-07-15 14:38:13 +0000121 &(bufp.num_registers), &flags, &can_be_null,
Guido van Rossumdb25f321997-07-10 14:31:32 +0000122 &(bufp.fastmap), &fastmaplen,
123 &anchor,
124 &string, &stringlen,
125 &pos))
126 return NULL;
127
128 /* XXX sanity-check the input data */
129 bufp.used=bufp.allocated;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000130 if (flags & IGNORECASE)
131 {
132 if ((modules = PyImport_GetModuleDict()) == NULL)
133 return NULL;
134
135 if ((reopmodule = PyDict_GetItemString(modules,
136 "reop")) == NULL)
137 return NULL;
138
139 if ((reopdict = PyModule_GetDict(reopmodule)) == NULL)
140 return NULL;
141
142 if ((casefold = PyDict_GetItemString(reopdict,
143 "casefold")) == NULL)
144 return NULL;
145
146 bufp.translate = PyString_AsString(casefold);
147 }
148 else
149 bufp.translate=NULL;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000150 bufp.fastmap_accurate=1;
151 bufp.can_be_null=can_be_null;
152 bufp.uses_registers=1;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000153 bufp.anchor=anchor;
154
Guido van Rossum74fb3031997-07-17 22:41:38 +0000155 for(i=0; i<bufp.num_registers; i++) {
156 re_regs.start[i]=-1;
157 re_regs.end[i]=-1;
158 }
Guido van Rossumdb25f321997-07-10 14:31:32 +0000159
160 result = re_match(&bufp,
161 string, stringlen, pos,
162 &re_regs);
Guido van Rossum74fb3031997-07-17 22:41:38 +0000163
Guido van Rossumdb25f321997-07-10 14:31:32 +0000164 if (result < -1) {
165 /* Failure like stack overflow */
Guido van Rossum95e80531997-08-13 22:34:14 +0000166 if (!PyErr_Occurred())
167 PyErr_SetString(ReopError, "match failure");
Guido van Rossumdb25f321997-07-10 14:31:32 +0000168 return NULL;
169 }
Guido van Rossum63e18191997-07-11 11:08:38 +0000170 if (result == -1) {
171 Py_INCREF(Py_None);
172 return Py_None;
173 }
Guido van Rossum04a1d741997-07-15 14:38:13 +0000174 return makeresult(&re_regs, bufp.num_registers);
Guido van Rossumdb25f321997-07-10 14:31:32 +0000175}
176
#if 0
/* Disabled and unfinished: copies the compiled pattern passed as
   (code,) into a new string object and points a local pattern buffer
   at the copy.  No optimisation is performed yet; the copy is
   returned unchanged.  Returns NULL if parsing or allocation fails. */
static PyObject *
reop_optimize(self, args)
    PyObject *self;
    PyObject *args;
{
    struct re_pattern_buffer bufp;
    unsigned char *buffer;
    int buflen;
    PyObject *opt_code;

    if (!PyArg_Parse(args, "(s#)", &buffer, &buflen))
        return NULL;

    /* Create a new string for the optimized code */
    opt_code = PyString_FromStringAndSize(buffer, buflen);
    if (opt_code == NULL)
        return opt_code;

    bufp.buffer = PyString_AsString(opt_code);
    bufp.allocated = buflen;
    bufp.used = bufp.allocated;

    return opt_code;
}
#endif
202
Guido van Rossumdb25f321997-07-10 14:31:32 +0000203static PyObject *
204reop_search(self, args)
205 PyObject *self;
206 PyObject *args;
207{
Guido van Rossum95e80531997-08-13 22:34:14 +0000208 unsigned char *string;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000209 int fastmaplen, stringlen;
210 int can_be_null, anchor, i;
Guido van Rossum04a1d741997-07-15 14:38:13 +0000211 int flags, pos, result;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000212 struct re_pattern_buffer bufp;
213 struct re_registers re_regs;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000214 PyObject *modules = NULL;
215 PyObject *reopmodule = NULL;
216 PyObject *reopdict = NULL;
217 PyObject *casefold = NULL;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000218
219 if (!PyArg_Parse(args, "(s#iiis#is#i)",
220 &(bufp.buffer), &(bufp.allocated),
Guido van Rossum04a1d741997-07-15 14:38:13 +0000221 &(bufp.num_registers), &flags, &can_be_null,
Guido van Rossumdb25f321997-07-10 14:31:32 +0000222 &(bufp.fastmap), &fastmaplen,
223 &anchor,
224 &string, &stringlen,
225 &pos))
226 return NULL;
227
228 /* XXX sanity-check the input data */
229 bufp.used=bufp.allocated;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000230 if (flags & IGNORECASE)
231 {
232 if ((modules = PyImport_GetModuleDict()) == NULL)
233 return NULL;
234
235 if ((reopmodule = PyDict_GetItemString(modules,
236 "reop")) == NULL)
237 return NULL;
238
239 if ((reopdict = PyModule_GetDict(reopmodule)) == NULL)
240 return NULL;
241
242 if ((casefold = PyDict_GetItemString(reopdict,
243 "casefold")) == NULL)
244 return NULL;
245
246 bufp.translate = PyString_AsString(casefold);
247 }
248 else
249 bufp.translate=NULL;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000250 bufp.fastmap_accurate=1;
251 bufp.can_be_null=can_be_null;
252 bufp.uses_registers=1;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000253 bufp.anchor=anchor;
254
Guido van Rossum74fb3031997-07-17 22:41:38 +0000255 for(i = 0; i < bufp.num_registers; i++) {
256 re_regs.start[i] = -1;
257 re_regs.end[i] = -1;
258 }
Guido van Rossumdb25f321997-07-10 14:31:32 +0000259
260 result = re_search(&bufp,
261 string, stringlen, pos, stringlen-pos,
262 &re_regs);
Guido van Rossum74fb3031997-07-17 22:41:38 +0000263
Guido van Rossumdb25f321997-07-10 14:31:32 +0000264 if (result < -1) {
265 /* Failure like stack overflow */
Guido van Rossum95e80531997-08-13 22:34:14 +0000266 if (!PyErr_Occurred())
267 PyErr_SetString(ReopError, "match failure");
Guido van Rossumdb25f321997-07-10 14:31:32 +0000268 return NULL;
269 }
Guido van Rossum74fb3031997-07-17 22:41:38 +0000270
Guido van Rossum63e18191997-07-11 11:08:38 +0000271 if (result == -1) {
272 Py_INCREF(Py_None);
273 return Py_None;
274 }
Guido van Rossum74fb3031997-07-17 22:41:38 +0000275
Guido van Rossum04a1d741997-07-15 14:38:13 +0000276 return makeresult(&re_regs, bufp.num_registers);
Guido van Rossumdb25f321997-07-10 14:31:32 +0000277}
278
Guido van Rossumc24f0381997-08-13 03:24:53 +0000279static PyObject *
280reop_expand_escape(self, args)
281 PyObject *self;
282 PyObject *args;
283{
284 unsigned char c, *pattern;
285 int index, context=NORMAL, pattern_len;
286
287 if (!PyArg_ParseTuple(args, "s#i|i", &pattern, &pattern_len, &index,
288 &context))
289 return NULL;
290 if (pattern_len<=index)
291 {
292 PyErr_SetString(ReopError, "escape ends too soon");
293 return NULL;
294 }
295 c=pattern[index]; index++;
296 switch (c)
297 {
298 case('t'):
299 return Py_BuildValue("ici", CHAR, (char)9, index);
300 break;
301 case('n'):
302 return Py_BuildValue("ici", CHAR, (char)10, index);
303 break;
304 case('v'):
305 return Py_BuildValue("ici", CHAR, (char)11, index);
306 break;
307 case('r'):
308 return Py_BuildValue("ici", CHAR, (char)13, index);
309 break;
310 case('f'):
311 return Py_BuildValue("ici", CHAR, (char)12, index);
312 break;
313 case('a'):
314 return Py_BuildValue("ici", CHAR, (char)7, index);
315 break;
316 case('x'):
317 {
318 int end, length;
319 unsigned char *string;
320 PyObject *v, *result;
321
322 end=index;
323 while (end<pattern_len &&
324 ( re_syntax_table[ pattern[end] ] & Shexdigit ) )
325 end++;
326 if (end==index)
327 {
328 PyErr_SetString(ReopError, "\\x must be followed by hex digits");
329 return NULL;
330 }
331 length=end-index;
332 string=malloc(length+4+1);
333 if (string==NULL)
334 {
335 PyErr_SetString(PyExc_MemoryError, "can't allocate memory for \\x string");
336 return NULL;
337 }
338 /* Create a string containing "\x<hexdigits>", which will be
339 passed to eval() */
340 string[0]=string[length+3]='"';
341 string[1]='\\';
342 string[length+4]='\0';
343 memcpy(string+2, pattern+index-1, length+1);
344 v=PyRun_String(string, Py_eval_input,
345 PyEval_GetGlobals(), PyEval_GetLocals());
346 free(string);
347 /* The evaluation raised an exception */
348 if (v==NULL) return NULL;
349 result=Py_BuildValue("iOi", CHAR, v, end);
350 Py_DECREF(v);
351 return result;
352 }
353 break;
354
355 case('b'):
356 if (context!=NORMAL)
357 return Py_BuildValue("ici", CHAR, (char)8, index);
358 else
359 {
360 unsigned char empty_string[1];
361 empty_string[0]='\0';
362 return Py_BuildValue("isi", WORD_BOUNDARY, empty_string, index);
363 }
364 break;
365 case('B'):
366 if (context!=NORMAL)
367 return Py_BuildValue("ici", CHAR, 'B', index);
368 else
369 {
370 unsigned char empty_string[1];
371 empty_string[0]='\0';
372 return Py_BuildValue("isi", NOT_WORD_BOUNDARY, empty_string, index);
373 }
374 break;
375 case('A'):
376 if (context!=NORMAL)
377 return Py_BuildValue("ici", CHAR, 'A', index);
378 else
379 {
380 unsigned char empty_string[1];
381 empty_string[0]='\0';
382 return Py_BuildValue("isi", BEGINNING_OF_BUFFER, empty_string, index);
383 }
384 break;
385 case('Z'):
386 if (context!=NORMAL)
387 return Py_BuildValue("ici", CHAR, 'Z', index);
388 else
389 {
390 unsigned char empty_string[1];
391 empty_string[0]='\0';
392 return Py_BuildValue("isi", END_OF_BUFFER, empty_string, index);
393 }
394 break;
395 case('E'): case('G'): case('L'): case('Q'):
396 case('U'): case('l'): case('u'):
397 {
398 char message[50];
399 sprintf(message, "\\%c is not allowed", c);
400 PyErr_SetString(ReopError, message);
401 return NULL;
402 }
403
404 case ('w'):
405 if (context==NORMAL)
406 return Py_BuildValue("iii", SYNTAX, Sword, index);
407 if (context!=CHARCLASS)
408 return Py_BuildValue("ici", CHAR, 'w', index);
409 {
410 /* context==CHARCLASS */
411 unsigned char set[256];
412 int i, j;
413 for(i=j=0; i<256; i++)
414 if (re_syntax_table[i] & Sword)
415 {
416 set[j++] = i;
417 }
418 return Py_BuildValue("is#i", SET, set, j, index);
419 }
420 break;
421 case ('W'):
422 if (context==NORMAL)
423 return Py_BuildValue("iii", NOT_SYNTAX, Sword, index);
424 if (context!=CHARCLASS)
425 return Py_BuildValue("ici", CHAR, 'W', index);
426 {
427 /* context==CHARCLASS */
428 unsigned char set[256];
429 int i, j;
430 for(i=j=0; i<256; i++)
431 if (! (re_syntax_table[i] & Sword))
432 {
433 set[j++] = i;
434 }
435 return Py_BuildValue("is#i", SET, set, j, index);
436 }
437 break;
438 case ('s'):
439 if (context==NORMAL)
440 return Py_BuildValue("iii", SYNTAX, Swhitespace, index);
441 if (context!=CHARCLASS)
442 return Py_BuildValue("ici", CHAR, 's', index);
443 {
444 /* context==CHARCLASS */
445 unsigned char set[256];
446 int i, j;
447 for(i=j=0; i<256; i++)
448 if (re_syntax_table[i] & Swhitespace)
449 {
450 set[j++] = i;
451 }
452 return Py_BuildValue("is#i", SET, set, j, index);
453 }
454 break;
455 case ('S'):
456 if (context==NORMAL)
457 return Py_BuildValue("iii", NOT_SYNTAX, Swhitespace, index);
458 if (context!=CHARCLASS)
459 return Py_BuildValue("ici", CHAR, 'S', index);
460 {
461 /* context==CHARCLASS */
462 unsigned char set[256];
463 int i, j;
464 for(i=j=0; i<256; i++)
465 if (! (re_syntax_table[i] & Swhitespace) )
466 {
467 set[j++] = i;
468 }
469 return Py_BuildValue("is#i", SET, set, j, index);
470 }
471 break;
472
473 case ('d'):
474 if (context==NORMAL)
475 return Py_BuildValue("iii", SYNTAX, Sdigit, index);
476 if (context!=CHARCLASS)
477 return Py_BuildValue("ici", CHAR, 'd', index);
478 {
479 /* context==CHARCLASS */
480 unsigned char set[256];
481 int i, j;
482 for(i=j=0; i<256; i++)
483 if (re_syntax_table[i] & Sdigit)
484 {
485 set[j++] = i;
486 }
487 return Py_BuildValue("is#i", SET, set, j, index);
488 }
489 break;
490 case ('D'):
491 if (context==NORMAL)
492 return Py_BuildValue("iii", NOT_SYNTAX, Sdigit, index);
493 if (context!=CHARCLASS)
494 return Py_BuildValue("ici", CHAR, 'D', index);
495 {
496 /* context==CHARCLASS */
497 unsigned char set[256];
498 int i, j;
499 for(i=j=0; i<256; i++)
500 if ( !(re_syntax_table[i] & Sdigit) )
501 {
502 set[j++] = i;
503 }
504 return Py_BuildValue("is#i", SET, set, j, index);
505 }
506 break;
507
508 case('g'):
509 {
510 int end, valid, i;
511 if (context!=REPLACEMENT)
512 return Py_BuildValue("ici", CHAR, 'g', index);
513 if (pattern_len<=index)
514 {
515 PyErr_SetString(ReopError, "unfinished symbolic reference");
516 return NULL;
517 }
518 if (pattern[index]!='<')
519 {
520 PyErr_SetString(ReopError, "missing < in symbolic reference");
521 return NULL;
522 }
523 index++;
524 end=index;
525 while (end<pattern_len && pattern[end]!='>')
526 end++;
527 if (end==pattern_len)
528 {
529 PyErr_SetString(ReopError, "unfinished symbolic reference");
530 return NULL;
531 }
532 valid=1;
533 if (index==end /* Zero-length name */
534 || !(re_syntax_table[pattern[index]] & Sword) /* First char. not alphanumeric */
535 || (re_syntax_table[pattern[index]] & Sdigit) ) /* First char. a digit */
536 valid=0;
537
538 for(i=index+1; i<end; i++)
539 {
540 if (!(re_syntax_table[pattern[i]] & Sword) )
541 valid=0;
542 }
543 if (!valid)
544 {
545 /* XXX should include the text of the reference */
546 PyErr_SetString(ReopError, "illegal symbolic reference");
547 return NULL;
548 }
549
550 return Py_BuildValue("is#i", MEMORY_REFERENCE,
551 pattern+index, end-index,
552 end+1);
553 }
554 break;
555
556 case('0'):
557 {
558 /* \0 always indicates an octal escape, so we consume up to 3
559 characters, as long as they're all octal digits */
560 int octval=0, i;
561 index--;
562 for(i=index;
563 i<=index+2 && i<pattern_len
564 && (re_syntax_table[ pattern[i] ] & Soctaldigit );
565 i++)
566 {
567 octval = octval * 8 + pattern[i] - '0';
568 }
569 if (octval>255)
570 {
571 PyErr_SetString(ReopError, "octal value out of range");
572 return NULL;
573 }
574 return Py_BuildValue("ici", CHAR, (unsigned char)octval, i);
575 }
576 break;
577 case('1'): case('2'): case('3'): case('4'):
578 case('5'): case('6'): case('7'): case('8'):
579 case('9'):
580 {
581 /* Handle \?, where ? is from 1 through 9 */
582 int value=0;
583 index--;
584 /* If it's at least a two-digit reference, like \34, it might
585 either be a 3-digit octal escape (\123) or a 2-digit
586 decimal memory reference (\34) */
587
588 if ( (index+1) <pattern_len &&
589 (re_syntax_table[ pattern[index+1] ] & Sdigit) )
590 {
591 if ( (index+2) <pattern_len &&
592 (re_syntax_table[ pattern[index+2] ] & Soctaldigit) &&
593 (re_syntax_table[ pattern[index+1] ] & Soctaldigit) &&
594 (re_syntax_table[ pattern[index ] ] & Soctaldigit)
595 )
596 {
597 /* 3 octal digits */
598 value= 8*8*(pattern[index ]-'0') +
599 8*(pattern[index+1]-'0') +
600 (pattern[index+2]-'0');
601 if (value>255)
602 {
603 PyErr_SetString(ReopError, "octal value out of range");
604 return NULL;
605 }
606 return Py_BuildValue("ici", CHAR, (unsigned char)value, index+3);
607 }
608 else
609 {
610 /* 2-digit form, so it's a memory reference */
611 if (context==CHARCLASS)
612 {
613 PyErr_SetString(ReopError, "cannot reference a register "
614 "from inside a character class");
615 return NULL;
616 }
617 value= 10*(pattern[index ]-'0') +
618 (pattern[index+1]-'0');
619 if (value<1 || RE_NREGS<=value)
620 {
621 PyErr_SetString(ReopError, "memory reference out of range");
622 return NULL;
623 }
624 return Py_BuildValue("iii", MEMORY_REFERENCE,
625 value, index+2);
626 }
627 }
628 else
629 {
630 /* Single-digit form, like \2, so it's a memory reference */
631 if (context==CHARCLASS)
632 {
633 PyErr_SetString(ReopError, "cannot reference a register "
634 "from inside a character class");
635 return NULL;
636 }
637 return Py_BuildValue("iii", MEMORY_REFERENCE,
638 pattern[index]-'0', index+1);
639 }
640 }
641 break;
642
643 default:
644 return Py_BuildValue("ici", CHAR, c, index);
645 break;
646 }
647}
648
649static PyObject *
650reop__expand(self, args)
651 PyObject *self;
652 PyObject *args;
653{
654 PyObject *results, *match_obj;
655 PyObject *repl_obj, *newstring;
Guido van Rossum95e80531997-08-13 22:34:14 +0000656 unsigned char *repl;
Guido van Rossumc24f0381997-08-13 03:24:53 +0000657 int size, total_len, i, start, pos;
658
659 if (!PyArg_ParseTuple(args, "OS", &match_obj, &repl_obj))
660 return NULL;
661
662 repl=PyString_AsString(repl_obj);
663 size=PyString_Size(repl_obj);
664 results=PyList_New(0);
665 if (results==NULL) return NULL;
666 for(start=total_len=i=0; i<size; i++)
667 {
668 if (repl[i]=='\\')
669 {
670 PyObject *args, *t, *value;
671 int escape_type;
672
673 if (start!=i)
674 {
675 PyList_Append(results,
676 PyString_FromStringAndSize(repl+start, i-start));
677 total_len += i-start;
678 }
679 i++;
680 args=Py_BuildValue("Oii", repl_obj, i, REPLACEMENT);
681 t=reop_expand_escape(NULL, args);
682 Py_DECREF(args);
683 if (t==NULL)
684 {
685 /* reop_expand_escape triggered an exception of some sort,
686 so just return */
687 Py_DECREF(results);
688 return NULL;
689 }
690 value=PyTuple_GetItem(t, 1);
691 escape_type=PyInt_AsLong(PyTuple_GetItem(t, 0));
692 switch (escape_type)
693 {
694 case (CHAR):
695 PyList_Append(results, value);
696 total_len += PyString_Size(value);
697 break;
698 case(MEMORY_REFERENCE):
699 {
700 PyObject *r, *tuple, *result;
701 r=PyObject_GetAttrString(match_obj, "group");
702 tuple=PyTuple_New(1);
Guido van Rossumf1c018d1997-08-14 21:19:13 +0000703 Py_INCREF(value);
Guido van Rossumc24f0381997-08-13 03:24:53 +0000704 PyTuple_SetItem(tuple, 0, value);
705 result=PyEval_CallObject(r, tuple);
706 Py_DECREF(r); Py_DECREF(tuple);
707 if (result==NULL)
708 {
709 /* The group() method trigged an exception of some sort */
710 Py_DECREF(results);
711 return NULL;
712 }
713 if (result==Py_None)
714 {
715 char message[50];
716 sprintf(message,
717 "group %li did not contribute to the match",
718 PyInt_AsLong(value));
719 PyErr_SetString(ReopError,
720 message);
721 Py_DECREF(result);
722 Py_DECREF(t);
723 Py_DECREF(results);
724 return NULL;
725 }
726 /* xxx typecheck that it's a string! */
727 PyList_Append(results, result);
728 total_len += PyString_Size(result);
729 Py_DECREF(result);
730 }
731 break;
732 default:
733 Py_DECREF(t);
734 Py_DECREF(results);
735 PyErr_SetString(ReopError,
736 "bad escape in replacement");
737 return NULL;
738 }
739 i=start=PyInt_AsLong(PyTuple_GetItem(t, 2));
740 i--; /* Decrement now, because the 'for' loop will increment it */
741 Py_DECREF(t);
742 }
743 } /* endif repl[i]!='\\' */
744
745 if (start!=i)
746 {
747 PyList_Append(results, PyString_FromStringAndSize(repl+start, i-start));
748 total_len += i-start;
749 }
750
751 /* Whew! Now we've constructed a list containing various pieces of
752 strings that will make up our final result. So, iterate over
753 the list concatenating them. A new string measuring total_len
754 bytes is allocated and filled in. */
755
756 newstring=PyString_FromStringAndSize(NULL, total_len);
757 if (newstring==NULL)
758 {
759 Py_DECREF(results);
760 return NULL;
761 }
762
763 repl=PyString_AsString(newstring);
764 for (pos=i=0; i<PyList_Size(results); i++)
765 {
766 PyObject *item=PyList_GetItem(results, i);
767 memcpy(repl+pos, PyString_AsString(item), PyString_Size(item) );
768 pos += PyString_Size(item);
769 }
770 Py_DECREF(results);
771 return newstring;
772}
773
774
Guido van Rossumdb25f321997-07-10 14:31:32 +0000775#if 0
776/* Functions originally in the regsub module.
777 Added June 1, 1997.
778 */
779
780/* A cache of previously used patterns is maintained. Notice that if
781 you change the reop syntax flag, entries in the cache are
782 invalidated.
783 XXX Solution: use (syntax flag, pattern) as keys? Clear the cache
784 every so often, or once it gets past a certain size?
785*/
786
787static PyObject *cache_dict=NULL;
788
789/* Accept an object; if it's a reop pattern, Py_INCREF it and return
790 it. If it's a string, a reop object is compiled and cached.
791*/
792
793static reopobject *
794cached_compile(pattern)
795 PyObject *pattern;
796{
797 reopobject *p2;
798
799 if (!PyString_Check(pattern))
800 {
801 /* It's not a string, so assume it's a compiled reop object */
802 /* XXX check that! */
803 Py_INCREF(pattern);
804 return (reopobject*)pattern;
805 }
806 if (cache_dict==NULL)
807 {
808 cache_dict=PyDict_New();
809 if (cache_dict==NULL)
810 {
811 return (reopobject*)NULL;
812 }
813 }
814
815 /* See if the pattern has already been cached; if so, return that
816 reop object */
817 p2=(reopobject*)PyDict_GetItem(cache_dict, pattern);
818 if (p2)
819 {
820 Py_INCREF(p2);
821 return (reopobject*)p2;
822 }
823
824 /* Compile the pattern and cache it */
825 p2=(reopobject*)newreopobject(pattern, NULL, pattern, NULL);
826 if (!p2) return p2;
827 PyDict_SetItem(cache_dict, pattern, (PyObject*)p2);
828 return p2;
829}
830
831
832static PyObject *
833internal_split(args, retain)
834 PyObject *args;
835 int retain;
836{
837 PyObject *newlist, *s;
838 reopobject *pattern;
839 int maxsplit=0, count=0, length, next=0, result;
840 int match_end=0; /* match_start is defined below */
Guido van Rossum95e80531997-08-13 22:34:14 +0000841 unsigned char *start;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000842
843 if (!PyArg_ParseTuple(args, "s#Oi", &start, &length, &pattern,
844 &maxsplit))
845 {
846 PyErr_Clear();
847 if (!PyArg_ParseTuple(args, "s#O", &start, &length, &pattern))
848 return NULL;
849 }
850 pattern=cached_compile((PyObject *)pattern);
851 if (!pattern) return NULL;
852
853 newlist=PyList_New(0);
854 if (!newlist) return NULL;
855
856 do
857 {
858 result = re_search(&pattern->re_patbuf,
859 start, length, next, length-next,
860 &pattern->re_regs);
861 if (result < -1)
862 { /* Erk... an error happened during the reop search */
863 Py_DECREF(newlist);
864 PyErr_SetString(ReopError, "match failure");
865 return NULL;
866 }
867 if (next<=result)
868 {
869 int match_start=pattern->re_regs.start[0];
870 int oldmatch_end=match_end;
871 match_end=pattern->re_regs.end[0];
872
873 if (match_start==match_end)
874 { /* A zero-length match; increment to the next position */
875 next=result+1;
876 match_end=oldmatch_end;
877 continue;
878 }
879
880 /* Append the string up to the start of the match */
881 s=PyString_FromStringAndSize(start+oldmatch_end, match_start-oldmatch_end);
882 if (!s)
883 {
884 Py_DECREF(newlist);
885 return NULL;
886 }
887 PyList_Append(newlist, s);
888 Py_DECREF(s);
889
890 if (retain)
891 {
892 /* Append a string containing whatever matched */
893 s=PyString_FromStringAndSize(start+match_start, match_end-match_start);
894 if (!s)
895 {
896 Py_DECREF(newlist);
897 return NULL;
898 }
899 PyList_Append(newlist, s);
900 Py_DECREF(s);
901 }
902 /* Update the pointer, and increment the count of splits */
903 next=match_end; count++;
904 }
905 } while (result!=-1 && !(maxsplit && maxsplit==count) &&
906 next<length);
907 s=PyString_FromStringAndSize(start+match_end, length-match_end);
908 if (!s)
909 {
910 Py_DECREF(newlist);
911 return NULL;
912 }
913 PyList_Append(newlist, s);
914 Py_DECREF(s);
915 Py_DECREF(pattern);
916 return newlist;
917}
918
919static PyObject *
920reop_split(self, args)
921 PyObject *self;
922 PyObject *args;
923{
924 return internal_split(args, 0);
925}
926
927static PyObject *
928reop_splitx(self, args)
929 PyObject *self;
930 PyObject *args;
931{
932 return internal_split(args, 1);
933}
934#endif
935
936static struct PyMethodDef reop_global_methods[] = {
937 {"match", reop_match, 0},
938 {"search", reop_search, 0},
Guido van Rossumc24f0381997-08-13 03:24:53 +0000939 {"expand_escape", reop_expand_escape, 1},
940 {"_expand", reop__expand, 1},
Guido van Rossumdb25f321997-07-10 14:31:32 +0000941#if 0
Guido van Rossum95e80531997-08-13 22:34:14 +0000942 {"_optimize", reop_optimize, 0},
Guido van Rossumdb25f321997-07-10 14:31:32 +0000943 {"split", reop_split, 0},
944 {"splitx", reop_splitx, 0},
945#endif
946 {NULL, NULL} /* sentinel */
947};
948
949void
950initreop()
951{
Guido van Rossum74fb3031997-07-17 22:41:38 +0000952 PyObject *m, *d, *k, *v, *o;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000953 int i;
Guido van Rossum95e80531997-08-13 22:34:14 +0000954 unsigned char *s;
955 unsigned char j[2];
Guido van Rossum74fb3031997-07-17 22:41:38 +0000956
957 re_compile_initialize();
958
Guido van Rossumdb25f321997-07-10 14:31:32 +0000959 m = Py_InitModule("reop", reop_global_methods);
960 d = PyModule_GetDict(m);
961
962 /* Initialize reop.error exception */
963 v = ReopError = PyString_FromString("reop.error");
964 if (v == NULL || PyDict_SetItemString(d, "error", v) != 0)
965 goto finally;
966
967 /* Initialize reop.casefold constant */
Guido van Rossum95e80531997-08-13 22:34:14 +0000968 if (!(v = PyString_FromStringAndSize((unsigned char *)NULL, 256)))
Guido van Rossumdb25f321997-07-10 14:31:32 +0000969 goto finally;
970
971 if (!(s = PyString_AsString(v)))
972 goto finally;
973
974 for (i = 0; i < 256; i++) {
975 if (isupper(i))
976 s[i] = tolower(i);
977 else
978 s[i] = i;
979 }
Guido van Rossum74fb3031997-07-17 22:41:38 +0000980
Guido van Rossumdb25f321997-07-10 14:31:32 +0000981 if (PyDict_SetItemString(d, "casefold", v) < 0)
982 goto finally;
983 Py_DECREF(v);
984
Guido van Rossum74fb3031997-07-17 22:41:38 +0000985 /* Initialize the syntax table */
986
987 o = PyDict_New();
988 if (o == NULL)
989 goto finally;
990
991 j[1] = '\0';
992 for (i = 0; i < 256; i++)
993 {
994 j[0] = i;
995 k = PyString_FromStringAndSize(j, 1);
996 if (k == NULL)
997 goto finally;
998 v = PyInt_FromLong(re_syntax_table[i]);
999 if (v == NULL)
1000 goto finally;
1001 if (PyDict_SetItem(o, k, v) < 0)
1002 goto finally;
1003 Py_DECREF(k);
1004 Py_DECREF(v);
1005 }
1006
1007 if (PyDict_SetItemString(d, "syntax_table", o) < 0)
1008 goto finally;
1009 Py_DECREF(o);
1010
1011 v = PyInt_FromLong(Sword);
1012 if (v == NULL)
1013 goto finally;
1014
1015 if (PyDict_SetItemString(d, "word", v) < 0)
1016 goto finally;
1017 Py_DECREF(v);
1018
1019 v = PyInt_FromLong(Swhitespace);
1020 if (v == NULL)
1021 goto finally;
1022
1023 if (PyDict_SetItemString(d, "whitespace", v) < 0)
1024 goto finally;
1025 Py_DECREF(v);
1026
1027 v = PyInt_FromLong(Sdigit);
1028 if (v == NULL)
1029 goto finally;
1030
1031 if (PyDict_SetItemString(d, "digit", v) < 0)
1032 goto finally;
1033 Py_DECREF(v);
Guido van Rossumc24f0381997-08-13 03:24:53 +00001034
1035 PyDict_SetItemString(d, "NORMAL", PyInt_FromLong(NORMAL));
1036 PyDict_SetItemString(d, "CHARCLASS", PyInt_FromLong(CHARCLASS));
1037 PyDict_SetItemString(d, "REPLACEMENT", PyInt_FromLong(REPLACEMENT));
1038
1039 PyDict_SetItemString(d, "CHAR", PyInt_FromLong(CHAR));
1040 PyDict_SetItemString(d, "MEMORY_REFERENCE", PyInt_FromLong(MEMORY_REFERENCE));
1041 PyDict_SetItemString(d, "SYNTAX", PyInt_FromLong(SYNTAX));
1042 PyDict_SetItemString(d, "NOT_SYNTAX", PyInt_FromLong(NOT_SYNTAX));
1043 PyDict_SetItemString(d, "SET", PyInt_FromLong(SET));
1044 PyDict_SetItemString(d, "WORD_BOUNDARY", PyInt_FromLong(WORD_BOUNDARY));
1045 PyDict_SetItemString(d, "NOT_WORD_BOUNDARY", PyInt_FromLong(NOT_WORD_BOUNDARY));
1046 PyDict_SetItemString(d, "BEGINNING_OF_BUFFER", PyInt_FromLong(BEGINNING_OF_BUFFER));
1047 PyDict_SetItemString(d, "END_OF_BUFFER", PyInt_FromLong(END_OF_BUFFER));
1048
Guido van Rossumdb25f321997-07-10 14:31:32 +00001049 if (!PyErr_Occurred())
1050 return;
Guido van Rossum74fb3031997-07-17 22:41:38 +00001051
Guido van Rossumdb25f321997-07-10 14:31:32 +00001052 finally:
1053 Py_FatalError("can't initialize reop module");
1054}
Guido van Rossumc24f0381997-08-13 03:24:53 +00001055