blob: 39e6ecea453387046d32b3720695b2acde9bb67a [file] [log] [blame]
Guido van Rossumdb25f321997-07-10 14:31:32 +00001/***********************************************************
2Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
3The Netherlands.
4
5 All Rights Reserved
6
7Permission to use, copy, modify, and distribute this software and its
8documentation for any purpose and without fee is hereby granted,
9provided that the above copyright notice appear in all copies and that
10both that copyright notice and this permission notice appear in
11supporting documentation, and that the names of Stichting Mathematisch
12Centrum or CWI or Corporation for National Research Initiatives or
13CNRI not be used in advertising or publicity pertaining to
14distribution of the software without specific, written prior
15permission.
16
17While CWI is the initial source for this software, a modified version
18is made available by the Corporation for National Research Initiatives
19(CNRI) at the Internet address ftp://ftp.python.org.
20
21STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
22REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
23MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
24CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
25DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
26PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
27TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
28PERFORMANCE OF THIS SOFTWARE.
29
30******************************************************************/
31
32/* $Id$ */
33
34/* Regular expression objects */
35/* This uses Tatu Ylonen's copyleft-free reimplementation of
36 GNU regular expressions */
37
38#include "Python.h"
39
40#include <ctype.h>
41
42#include "regexpr.h"
43
/* Exception object raised by this module; created in initreop() and
   published in the module dictionary under the name "error". */
static PyObject *ReopError; /* Exception */

/* Bits of the `flags' argument accepted by reop.match()/reop.search().
   Only IGNORECASE is tested in this file; the other bits are defined
   here but not consulted by the code below. */
#define IGNORECASE 0x01
#define MULTILINE 0x02
#define DOTALL 0x04
#define VERBOSE 0x08

/* Values of the optional `context' argument of reop.expand_escape():
   where in a pattern (or replacement string) the escape was found. */
#define NORMAL 0
#define CHARCLASS 1
#define REPLACEMENT 2

/* Type codes returned as the first element of the tuple built by
   reop.expand_escape(). */
#define CHAR 0
#define MEMORY_REFERENCE 1
#define SYNTAX 2
#define NOT_SYNTAX 3
#define SET 4
#define WORD_BOUNDARY 5
#define NOT_WORD_BOUNDARY 6
#define BEGINNING_OF_BUFFER 7
#define END_OF_BUFFER 8

/* NOTE(review): declared but never read or assigned in the code that
   follows; the case-folding table is fetched from the module dictionary
   instead.  Confirm before removing. */
static unsigned char *reop_casefold;
Guido van Rossum74fb3031997-07-17 22:41:38 +000066
Guido van Rossumdb25f321997-07-10 14:31:32 +000067static PyObject *
68makeresult(regs, num_regs)
69 struct re_registers *regs;
70 int num_regs;
71{
72 PyObject *v;
73 int i;
74 static PyObject *filler = NULL;
75
76 if (filler == NULL) {
77 filler = Py_BuildValue("(ii)", -1, -1);
78 if (filler == NULL)
79 return NULL;
80 }
81 v = PyTuple_New(num_regs);
82 if (v == NULL)
83 return NULL;
84
85 for (i = 0; i < num_regs; i++) {
86 int lo = regs->start[i];
87 int hi = regs->end[i];
88 PyObject *w;
89 if (lo == -1 && hi == -1) {
90 w = filler;
91 Py_INCREF(w);
92 }
93 else
94 w = Py_BuildValue("(ii)", lo, hi);
95 if (w == NULL || PyTuple_SetItem(v, i, w) < 0) {
96 Py_DECREF(v);
97 return NULL;
98 }
99 }
100 return v;
101}
102
103static PyObject *
104reop_match(self, args)
105 PyObject *self;
106 PyObject *args;
107{
Guido van Rossum95e80531997-08-13 22:34:14 +0000108 unsigned char *string;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000109 int fastmaplen, stringlen;
110 int can_be_null, anchor, i;
Guido van Rossum04a1d741997-07-15 14:38:13 +0000111 int flags, pos, result;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000112 struct re_pattern_buffer bufp;
113 struct re_registers re_regs;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000114 PyObject *modules = NULL;
115 PyObject *reopmodule = NULL;
116 PyObject *reopdict = NULL;
117 PyObject *casefold = NULL;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000118
119 if (!PyArg_Parse(args, "(s#iiis#is#i)",
120 &(bufp.buffer), &(bufp.allocated),
Guido van Rossum04a1d741997-07-15 14:38:13 +0000121 &(bufp.num_registers), &flags, &can_be_null,
Guido van Rossumdb25f321997-07-10 14:31:32 +0000122 &(bufp.fastmap), &fastmaplen,
123 &anchor,
124 &string, &stringlen,
125 &pos))
126 return NULL;
127
128 /* XXX sanity-check the input data */
129 bufp.used=bufp.allocated;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000130 if (flags & IGNORECASE)
131 {
132 if ((modules = PyImport_GetModuleDict()) == NULL)
133 return NULL;
134
135 if ((reopmodule = PyDict_GetItemString(modules,
136 "reop")) == NULL)
137 return NULL;
138
139 if ((reopdict = PyModule_GetDict(reopmodule)) == NULL)
140 return NULL;
141
142 if ((casefold = PyDict_GetItemString(reopdict,
143 "casefold")) == NULL)
144 return NULL;
145
146 bufp.translate = PyString_AsString(casefold);
147 }
148 else
149 bufp.translate=NULL;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000150 bufp.fastmap_accurate=1;
151 bufp.can_be_null=can_be_null;
152 bufp.uses_registers=1;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000153 bufp.anchor=anchor;
154
Guido van Rossum74fb3031997-07-17 22:41:38 +0000155 for(i=0; i<bufp.num_registers; i++) {
156 re_regs.start[i]=-1;
157 re_regs.end[i]=-1;
158 }
Guido van Rossumdb25f321997-07-10 14:31:32 +0000159
160 result = re_match(&bufp,
161 string, stringlen, pos,
162 &re_regs);
Guido van Rossum74fb3031997-07-17 22:41:38 +0000163
Guido van Rossumdb25f321997-07-10 14:31:32 +0000164 if (result < -1) {
165 /* Failure like stack overflow */
Guido van Rossum95e80531997-08-13 22:34:14 +0000166 if (!PyErr_Occurred())
167 PyErr_SetString(ReopError, "match failure");
Guido van Rossumdb25f321997-07-10 14:31:32 +0000168 return NULL;
169 }
Guido van Rossum63e18191997-07-11 11:08:38 +0000170 if (result == -1) {
171 Py_INCREF(Py_None);
172 return Py_None;
173 }
Guido van Rossum04a1d741997-07-15 14:38:13 +0000174 return makeresult(&re_regs, bufp.num_registers);
Guido van Rossumdb25f321997-07-10 14:31:32 +0000175}
176
#if 0
/* Disabled stub: copies the compiled pattern passed as a one-element
   tuple into a new string object and returns that copy.  A pattern
   buffer is pointed at the copy, but nothing is done with it yet. */
static PyObject *
reop_optimize(self, args)
    PyObject *self;
    PyObject *args;
{
    struct re_pattern_buffer bufp;
    unsigned char *buffer;
    PyObject *opt_code;
    int buflen;

    if (!PyArg_Parse(args, "(s#)", &buffer, &buflen))
        return NULL;

    /* Create a new string for the optimized code */
    opt_code = PyString_FromStringAndSize(buffer, buflen);
    if (opt_code == NULL)
        return opt_code;

    bufp.buffer = PyString_AsString(opt_code);
    bufp.allocated = buflen;
    bufp.used = bufp.allocated;

    return opt_code;
}
#endif
202
Guido van Rossumdb25f321997-07-10 14:31:32 +0000203static PyObject *
204reop_search(self, args)
205 PyObject *self;
206 PyObject *args;
207{
Guido van Rossum95e80531997-08-13 22:34:14 +0000208 unsigned char *string;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000209 int fastmaplen, stringlen;
210 int can_be_null, anchor, i;
Guido van Rossum04a1d741997-07-15 14:38:13 +0000211 int flags, pos, result;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000212 struct re_pattern_buffer bufp;
213 struct re_registers re_regs;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000214 PyObject *modules = NULL;
215 PyObject *reopmodule = NULL;
216 PyObject *reopdict = NULL;
217 PyObject *casefold = NULL;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000218
219 if (!PyArg_Parse(args, "(s#iiis#is#i)",
220 &(bufp.buffer), &(bufp.allocated),
Guido van Rossum04a1d741997-07-15 14:38:13 +0000221 &(bufp.num_registers), &flags, &can_be_null,
Guido van Rossumdb25f321997-07-10 14:31:32 +0000222 &(bufp.fastmap), &fastmaplen,
223 &anchor,
224 &string, &stringlen,
225 &pos))
226 return NULL;
227
228 /* XXX sanity-check the input data */
229 bufp.used=bufp.allocated;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000230 if (flags & IGNORECASE)
231 {
232 if ((modules = PyImport_GetModuleDict()) == NULL)
233 return NULL;
234
235 if ((reopmodule = PyDict_GetItemString(modules,
236 "reop")) == NULL)
237 return NULL;
238
239 if ((reopdict = PyModule_GetDict(reopmodule)) == NULL)
240 return NULL;
241
242 if ((casefold = PyDict_GetItemString(reopdict,
243 "casefold")) == NULL)
244 return NULL;
245
246 bufp.translate = PyString_AsString(casefold);
247 }
248 else
249 bufp.translate=NULL;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000250 bufp.fastmap_accurate=1;
251 bufp.can_be_null=can_be_null;
252 bufp.uses_registers=1;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000253 bufp.anchor=anchor;
254
Guido van Rossum74fb3031997-07-17 22:41:38 +0000255 for(i = 0; i < bufp.num_registers; i++) {
256 re_regs.start[i] = -1;
257 re_regs.end[i] = -1;
258 }
Guido van Rossumdb25f321997-07-10 14:31:32 +0000259
260 result = re_search(&bufp,
261 string, stringlen, pos, stringlen-pos,
262 &re_regs);
Guido van Rossum74fb3031997-07-17 22:41:38 +0000263
Guido van Rossumdb25f321997-07-10 14:31:32 +0000264 if (result < -1) {
265 /* Failure like stack overflow */
Guido van Rossum95e80531997-08-13 22:34:14 +0000266 if (!PyErr_Occurred())
267 PyErr_SetString(ReopError, "match failure");
Guido van Rossumdb25f321997-07-10 14:31:32 +0000268 return NULL;
269 }
Guido van Rossum74fb3031997-07-17 22:41:38 +0000270
Guido van Rossum63e18191997-07-11 11:08:38 +0000271 if (result == -1) {
272 Py_INCREF(Py_None);
273 return Py_None;
274 }
Guido van Rossum74fb3031997-07-17 22:41:38 +0000275
Guido van Rossum04a1d741997-07-15 14:38:13 +0000276 return makeresult(&re_regs, bufp.num_registers);
Guido van Rossumdb25f321997-07-10 14:31:32 +0000277}
278
Guido van Rossumc24f0381997-08-13 03:24:53 +0000279static PyObject *
280reop_expand_escape(self, args)
281 PyObject *self;
282 PyObject *args;
283{
284 unsigned char c, *pattern;
285 int index, context=NORMAL, pattern_len;
286
287 if (!PyArg_ParseTuple(args, "s#i|i", &pattern, &pattern_len, &index,
288 &context))
289 return NULL;
290 if (pattern_len<=index)
291 {
292 PyErr_SetString(ReopError, "escape ends too soon");
293 return NULL;
294 }
295 c=pattern[index]; index++;
296 switch (c)
297 {
298 case('t'):
299 return Py_BuildValue("ici", CHAR, (char)9, index);
300 break;
301 case('n'):
302 return Py_BuildValue("ici", CHAR, (char)10, index);
303 break;
304 case('v'):
305 return Py_BuildValue("ici", CHAR, (char)11, index);
306 break;
307 case('r'):
308 return Py_BuildValue("ici", CHAR, (char)13, index);
309 break;
310 case('f'):
311 return Py_BuildValue("ici", CHAR, (char)12, index);
312 break;
313 case('a'):
314 return Py_BuildValue("ici", CHAR, (char)7, index);
315 break;
316 case('x'):
317 {
318 int end, length;
319 unsigned char *string;
320 PyObject *v, *result;
321
322 end=index;
323 while (end<pattern_len &&
324 ( re_syntax_table[ pattern[end] ] & Shexdigit ) )
325 end++;
326 if (end==index)
327 {
328 PyErr_SetString(ReopError, "\\x must be followed by hex digits");
329 return NULL;
330 }
331 length=end-index;
332 string=malloc(length+4+1);
333 if (string==NULL)
334 {
335 PyErr_SetString(PyExc_MemoryError, "can't allocate memory for \\x string");
336 return NULL;
337 }
338 /* Create a string containing "\x<hexdigits>", which will be
339 passed to eval() */
340 string[0]=string[length+3]='"';
341 string[1]='\\';
342 string[length+4]='\0';
343 memcpy(string+2, pattern+index-1, length+1);
344 v=PyRun_String(string, Py_eval_input,
345 PyEval_GetGlobals(), PyEval_GetLocals());
346 free(string);
347 /* The evaluation raised an exception */
348 if (v==NULL) return NULL;
349 result=Py_BuildValue("iOi", CHAR, v, end);
350 Py_DECREF(v);
351 return result;
352 }
353 break;
354
355 case('b'):
356 if (context!=NORMAL)
357 return Py_BuildValue("ici", CHAR, (char)8, index);
358 else
359 {
360 unsigned char empty_string[1];
361 empty_string[0]='\0';
362 return Py_BuildValue("isi", WORD_BOUNDARY, empty_string, index);
363 }
364 break;
365 case('B'):
366 if (context!=NORMAL)
367 return Py_BuildValue("ici", CHAR, 'B', index);
368 else
369 {
370 unsigned char empty_string[1];
371 empty_string[0]='\0';
372 return Py_BuildValue("isi", NOT_WORD_BOUNDARY, empty_string, index);
373 }
374 break;
375 case('A'):
376 if (context!=NORMAL)
377 return Py_BuildValue("ici", CHAR, 'A', index);
378 else
379 {
380 unsigned char empty_string[1];
381 empty_string[0]='\0';
382 return Py_BuildValue("isi", BEGINNING_OF_BUFFER, empty_string, index);
383 }
384 break;
385 case('Z'):
386 if (context!=NORMAL)
387 return Py_BuildValue("ici", CHAR, 'Z', index);
388 else
389 {
390 unsigned char empty_string[1];
391 empty_string[0]='\0';
392 return Py_BuildValue("isi", END_OF_BUFFER, empty_string, index);
393 }
394 break;
395 case('E'): case('G'): case('L'): case('Q'):
396 case('U'): case('l'): case('u'):
397 {
398 char message[50];
399 sprintf(message, "\\%c is not allowed", c);
400 PyErr_SetString(ReopError, message);
401 return NULL;
402 }
403
404 case ('w'):
405 if (context==NORMAL)
406 return Py_BuildValue("iii", SYNTAX, Sword, index);
407 if (context!=CHARCLASS)
408 return Py_BuildValue("ici", CHAR, 'w', index);
409 {
410 /* context==CHARCLASS */
411 unsigned char set[256];
412 int i, j;
413 for(i=j=0; i<256; i++)
414 if (re_syntax_table[i] & Sword)
415 {
416 set[j++] = i;
417 }
418 return Py_BuildValue("is#i", SET, set, j, index);
419 }
420 break;
421 case ('W'):
422 if (context==NORMAL)
423 return Py_BuildValue("iii", NOT_SYNTAX, Sword, index);
424 if (context!=CHARCLASS)
425 return Py_BuildValue("ici", CHAR, 'W', index);
426 {
427 /* context==CHARCLASS */
428 unsigned char set[256];
429 int i, j;
430 for(i=j=0; i<256; i++)
431 if (! (re_syntax_table[i] & Sword))
432 {
433 set[j++] = i;
434 }
435 return Py_BuildValue("is#i", SET, set, j, index);
436 }
437 break;
438 case ('s'):
439 if (context==NORMAL)
440 return Py_BuildValue("iii", SYNTAX, Swhitespace, index);
441 if (context!=CHARCLASS)
442 return Py_BuildValue("ici", CHAR, 's', index);
443 {
444 /* context==CHARCLASS */
445 unsigned char set[256];
446 int i, j;
447 for(i=j=0; i<256; i++)
448 if (re_syntax_table[i] & Swhitespace)
449 {
450 set[j++] = i;
451 }
452 return Py_BuildValue("is#i", SET, set, j, index);
453 }
454 break;
455 case ('S'):
456 if (context==NORMAL)
457 return Py_BuildValue("iii", NOT_SYNTAX, Swhitespace, index);
458 if (context!=CHARCLASS)
459 return Py_BuildValue("ici", CHAR, 'S', index);
460 {
461 /* context==CHARCLASS */
462 unsigned char set[256];
463 int i, j;
464 for(i=j=0; i<256; i++)
465 if (! (re_syntax_table[i] & Swhitespace) )
466 {
467 set[j++] = i;
468 }
469 return Py_BuildValue("is#i", SET, set, j, index);
470 }
471 break;
472
473 case ('d'):
474 if (context==NORMAL)
475 return Py_BuildValue("iii", SYNTAX, Sdigit, index);
476 if (context!=CHARCLASS)
477 return Py_BuildValue("ici", CHAR, 'd', index);
478 {
479 /* context==CHARCLASS */
480 unsigned char set[256];
481 int i, j;
482 for(i=j=0; i<256; i++)
483 if (re_syntax_table[i] & Sdigit)
484 {
485 set[j++] = i;
486 }
487 return Py_BuildValue("is#i", SET, set, j, index);
488 }
489 break;
490 case ('D'):
491 if (context==NORMAL)
492 return Py_BuildValue("iii", NOT_SYNTAX, Sdigit, index);
493 if (context!=CHARCLASS)
494 return Py_BuildValue("ici", CHAR, 'D', index);
495 {
496 /* context==CHARCLASS */
497 unsigned char set[256];
498 int i, j;
499 for(i=j=0; i<256; i++)
500 if ( !(re_syntax_table[i] & Sdigit) )
501 {
502 set[j++] = i;
503 }
504 return Py_BuildValue("is#i", SET, set, j, index);
505 }
506 break;
507
508 case('g'):
509 {
510 int end, valid, i;
511 if (context!=REPLACEMENT)
512 return Py_BuildValue("ici", CHAR, 'g', index);
513 if (pattern_len<=index)
514 {
515 PyErr_SetString(ReopError, "unfinished symbolic reference");
516 return NULL;
517 }
518 if (pattern[index]!='<')
519 {
520 PyErr_SetString(ReopError, "missing < in symbolic reference");
521 return NULL;
522 }
523 index++;
524 end=index;
525 while (end<pattern_len && pattern[end]!='>')
526 end++;
527 if (end==pattern_len)
528 {
529 PyErr_SetString(ReopError, "unfinished symbolic reference");
530 return NULL;
531 }
532 valid=1;
533 if (index==end /* Zero-length name */
534 || !(re_syntax_table[pattern[index]] & Sword) /* First char. not alphanumeric */
535 || (re_syntax_table[pattern[index]] & Sdigit) ) /* First char. a digit */
536 valid=0;
537
538 for(i=index+1; i<end; i++)
539 {
540 if (!(re_syntax_table[pattern[i]] & Sword) )
541 valid=0;
542 }
543 if (!valid)
544 {
545 /* XXX should include the text of the reference */
546 PyErr_SetString(ReopError, "illegal symbolic reference");
547 return NULL;
548 }
549
550 return Py_BuildValue("is#i", MEMORY_REFERENCE,
551 pattern+index, end-index,
552 end+1);
553 }
554 break;
555
556 case('0'):
557 {
558 /* \0 always indicates an octal escape, so we consume up to 3
559 characters, as long as they're all octal digits */
560 int octval=0, i;
561 index--;
562 for(i=index;
563 i<=index+2 && i<pattern_len
564 && (re_syntax_table[ pattern[i] ] & Soctaldigit );
565 i++)
566 {
567 octval = octval * 8 + pattern[i] - '0';
568 }
569 if (octval>255)
570 {
571 PyErr_SetString(ReopError, "octal value out of range");
572 return NULL;
573 }
574 return Py_BuildValue("ici", CHAR, (unsigned char)octval, i);
575 }
576 break;
577 case('1'): case('2'): case('3'): case('4'):
578 case('5'): case('6'): case('7'): case('8'):
579 case('9'):
580 {
581 /* Handle \?, where ? is from 1 through 9 */
582 int value=0;
583 index--;
584 /* If it's at least a two-digit reference, like \34, it might
585 either be a 3-digit octal escape (\123) or a 2-digit
586 decimal memory reference (\34) */
587
588 if ( (index+1) <pattern_len &&
589 (re_syntax_table[ pattern[index+1] ] & Sdigit) )
590 {
591 if ( (index+2) <pattern_len &&
592 (re_syntax_table[ pattern[index+2] ] & Soctaldigit) &&
593 (re_syntax_table[ pattern[index+1] ] & Soctaldigit) &&
594 (re_syntax_table[ pattern[index ] ] & Soctaldigit)
595 )
596 {
597 /* 3 octal digits */
598 value= 8*8*(pattern[index ]-'0') +
599 8*(pattern[index+1]-'0') +
600 (pattern[index+2]-'0');
601 if (value>255)
602 {
603 PyErr_SetString(ReopError, "octal value out of range");
604 return NULL;
605 }
606 return Py_BuildValue("ici", CHAR, (unsigned char)value, index+3);
607 }
608 else
609 {
610 /* 2-digit form, so it's a memory reference */
611 if (context==CHARCLASS)
612 {
613 PyErr_SetString(ReopError, "cannot reference a register "
614 "from inside a character class");
615 return NULL;
616 }
617 value= 10*(pattern[index ]-'0') +
618 (pattern[index+1]-'0');
619 if (value<1 || RE_NREGS<=value)
620 {
621 PyErr_SetString(ReopError, "memory reference out of range");
622 return NULL;
623 }
624 return Py_BuildValue("iii", MEMORY_REFERENCE,
625 value, index+2);
626 }
627 }
628 else
629 {
630 /* Single-digit form, like \2, so it's a memory reference */
631 if (context==CHARCLASS)
632 {
633 PyErr_SetString(ReopError, "cannot reference a register "
634 "from inside a character class");
635 return NULL;
636 }
637 return Py_BuildValue("iii", MEMORY_REFERENCE,
638 pattern[index]-'0', index+1);
639 }
640 }
641 break;
642
643 default:
644 return Py_BuildValue("ici", CHAR, c, index);
645 break;
646 }
647}
648
649static PyObject *
650reop__expand(self, args)
651 PyObject *self;
652 PyObject *args;
653{
654 PyObject *results, *match_obj;
655 PyObject *repl_obj, *newstring;
Guido van Rossum95e80531997-08-13 22:34:14 +0000656 unsigned char *repl;
Guido van Rossumc24f0381997-08-13 03:24:53 +0000657 int size, total_len, i, start, pos;
658
659 if (!PyArg_ParseTuple(args, "OS", &match_obj, &repl_obj))
660 return NULL;
661
662 repl=PyString_AsString(repl_obj);
663 size=PyString_Size(repl_obj);
664 results=PyList_New(0);
665 if (results==NULL) return NULL;
666 for(start=total_len=i=0; i<size; i++)
667 {
668 if (repl[i]=='\\')
669 {
670 PyObject *args, *t, *value;
671 int escape_type;
672
673 if (start!=i)
674 {
675 PyList_Append(results,
676 PyString_FromStringAndSize(repl+start, i-start));
677 total_len += i-start;
678 }
679 i++;
680 args=Py_BuildValue("Oii", repl_obj, i, REPLACEMENT);
681 t=reop_expand_escape(NULL, args);
682 Py_DECREF(args);
683 if (t==NULL)
684 {
685 /* reop_expand_escape triggered an exception of some sort,
686 so just return */
687 Py_DECREF(results);
688 return NULL;
689 }
690 value=PyTuple_GetItem(t, 1);
691 escape_type=PyInt_AsLong(PyTuple_GetItem(t, 0));
692 switch (escape_type)
693 {
694 case (CHAR):
695 PyList_Append(results, value);
696 total_len += PyString_Size(value);
697 break;
698 case(MEMORY_REFERENCE):
699 {
700 PyObject *r, *tuple, *result;
701 r=PyObject_GetAttrString(match_obj, "group");
702 tuple=PyTuple_New(1);
703 PyTuple_SetItem(tuple, 0, value);
704 result=PyEval_CallObject(r, tuple);
705 Py_DECREF(r); Py_DECREF(tuple);
706 if (result==NULL)
707 {
708 /* The group() method trigged an exception of some sort */
709 Py_DECREF(results);
710 return NULL;
711 }
712 if (result==Py_None)
713 {
714 char message[50];
715 sprintf(message,
716 "group %li did not contribute to the match",
717 PyInt_AsLong(value));
718 PyErr_SetString(ReopError,
719 message);
720 Py_DECREF(result);
721 Py_DECREF(t);
722 Py_DECREF(results);
723 return NULL;
724 }
725 /* xxx typecheck that it's a string! */
726 PyList_Append(results, result);
727 total_len += PyString_Size(result);
728 Py_DECREF(result);
729 }
730 break;
731 default:
732 Py_DECREF(t);
733 Py_DECREF(results);
734 PyErr_SetString(ReopError,
735 "bad escape in replacement");
736 return NULL;
737 }
738 i=start=PyInt_AsLong(PyTuple_GetItem(t, 2));
739 i--; /* Decrement now, because the 'for' loop will increment it */
740 Py_DECREF(t);
741 }
742 } /* endif repl[i]!='\\' */
743
744 if (start!=i)
745 {
746 PyList_Append(results, PyString_FromStringAndSize(repl+start, i-start));
747 total_len += i-start;
748 }
749
750 /* Whew! Now we've constructed a list containing various pieces of
751 strings that will make up our final result. So, iterate over
752 the list concatenating them. A new string measuring total_len
753 bytes is allocated and filled in. */
754
755 newstring=PyString_FromStringAndSize(NULL, total_len);
756 if (newstring==NULL)
757 {
758 Py_DECREF(results);
759 return NULL;
760 }
761
762 repl=PyString_AsString(newstring);
763 for (pos=i=0; i<PyList_Size(results); i++)
764 {
765 PyObject *item=PyList_GetItem(results, i);
766 memcpy(repl+pos, PyString_AsString(item), PyString_Size(item) );
767 pos += PyString_Size(item);
768 }
769 Py_DECREF(results);
770 return newstring;
771}
772
773
Guido van Rossumdb25f321997-07-10 14:31:32 +0000774#if 0
775/* Functions originally in the regsub module.
776 Added June 1, 1997.
777 */
778
779/* A cache of previously used patterns is maintained. Notice that if
780 you change the reop syntax flag, entries in the cache are
781 invalidated.
782 XXX Solution: use (syntax flag, pattern) as keys? Clear the cache
783 every so often, or once it gets past a certain size?
784*/
785
786static PyObject *cache_dict=NULL;
787
788/* Accept an object; if it's a reop pattern, Py_INCREF it and return
789 it. If it's a string, a reop object is compiled and cached.
790*/
791
792static reopobject *
793cached_compile(pattern)
794 PyObject *pattern;
795{
796 reopobject *p2;
797
798 if (!PyString_Check(pattern))
799 {
800 /* It's not a string, so assume it's a compiled reop object */
801 /* XXX check that! */
802 Py_INCREF(pattern);
803 return (reopobject*)pattern;
804 }
805 if (cache_dict==NULL)
806 {
807 cache_dict=PyDict_New();
808 if (cache_dict==NULL)
809 {
810 return (reopobject*)NULL;
811 }
812 }
813
814 /* See if the pattern has already been cached; if so, return that
815 reop object */
816 p2=(reopobject*)PyDict_GetItem(cache_dict, pattern);
817 if (p2)
818 {
819 Py_INCREF(p2);
820 return (reopobject*)p2;
821 }
822
823 /* Compile the pattern and cache it */
824 p2=(reopobject*)newreopobject(pattern, NULL, pattern, NULL);
825 if (!p2) return p2;
826 PyDict_SetItem(cache_dict, pattern, (PyObject*)p2);
827 return p2;
828}
829
830
831static PyObject *
832internal_split(args, retain)
833 PyObject *args;
834 int retain;
835{
836 PyObject *newlist, *s;
837 reopobject *pattern;
838 int maxsplit=0, count=0, length, next=0, result;
839 int match_end=0; /* match_start is defined below */
Guido van Rossum95e80531997-08-13 22:34:14 +0000840 unsigned char *start;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000841
842 if (!PyArg_ParseTuple(args, "s#Oi", &start, &length, &pattern,
843 &maxsplit))
844 {
845 PyErr_Clear();
846 if (!PyArg_ParseTuple(args, "s#O", &start, &length, &pattern))
847 return NULL;
848 }
849 pattern=cached_compile((PyObject *)pattern);
850 if (!pattern) return NULL;
851
852 newlist=PyList_New(0);
853 if (!newlist) return NULL;
854
855 do
856 {
857 result = re_search(&pattern->re_patbuf,
858 start, length, next, length-next,
859 &pattern->re_regs);
860 if (result < -1)
861 { /* Erk... an error happened during the reop search */
862 Py_DECREF(newlist);
863 PyErr_SetString(ReopError, "match failure");
864 return NULL;
865 }
866 if (next<=result)
867 {
868 int match_start=pattern->re_regs.start[0];
869 int oldmatch_end=match_end;
870 match_end=pattern->re_regs.end[0];
871
872 if (match_start==match_end)
873 { /* A zero-length match; increment to the next position */
874 next=result+1;
875 match_end=oldmatch_end;
876 continue;
877 }
878
879 /* Append the string up to the start of the match */
880 s=PyString_FromStringAndSize(start+oldmatch_end, match_start-oldmatch_end);
881 if (!s)
882 {
883 Py_DECREF(newlist);
884 return NULL;
885 }
886 PyList_Append(newlist, s);
887 Py_DECREF(s);
888
889 if (retain)
890 {
891 /* Append a string containing whatever matched */
892 s=PyString_FromStringAndSize(start+match_start, match_end-match_start);
893 if (!s)
894 {
895 Py_DECREF(newlist);
896 return NULL;
897 }
898 PyList_Append(newlist, s);
899 Py_DECREF(s);
900 }
901 /* Update the pointer, and increment the count of splits */
902 next=match_end; count++;
903 }
904 } while (result!=-1 && !(maxsplit && maxsplit==count) &&
905 next<length);
906 s=PyString_FromStringAndSize(start+match_end, length-match_end);
907 if (!s)
908 {
909 Py_DECREF(newlist);
910 return NULL;
911 }
912 PyList_Append(newlist, s);
913 Py_DECREF(s);
914 Py_DECREF(pattern);
915 return newlist;
916}
917
918static PyObject *
919reop_split(self, args)
920 PyObject *self;
921 PyObject *args;
922{
923 return internal_split(args, 0);
924}
925
926static PyObject *
927reop_splitx(self, args)
928 PyObject *self;
929 PyObject *args;
930{
931 return internal_split(args, 1);
932}
933#endif
934
/* Functions exported by the reop module; passed to Py_InitModule() in
   initreop().  Each entry is {Python-level name, C function, flag}.
   "match" and "search" are registered with flag 0 and unpack their
   argument with PyArg_Parse and a parenthesized (tuple) format;
   "expand_escape" and "_expand" are registered with flag 1 and use
   PyArg_ParseTuple.
   NOTE(review): flag 1 is presumably METH_VARARGS -- confirm against
   methodobject.h. */
static struct PyMethodDef reop_global_methods[] = {
    {"match", reop_match, 0},
    {"search", reop_search, 0},
    {"expand_escape", reop_expand_escape, 1},
    {"_expand", reop__expand, 1},
#if 0
    /* Disabled together with the corresponding function definitions */
    {"_optimize", reop_optimize, 0},
    {"split", reop_split, 0},
    {"splitx", reop_splitx, 0},
#endif
    {NULL, NULL} /* sentinel */
};
947
948void
949initreop()
950{
Guido van Rossum74fb3031997-07-17 22:41:38 +0000951 PyObject *m, *d, *k, *v, *o;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000952 int i;
Guido van Rossum95e80531997-08-13 22:34:14 +0000953 unsigned char *s;
954 unsigned char j[2];
Guido van Rossum74fb3031997-07-17 22:41:38 +0000955
956 re_compile_initialize();
957
Guido van Rossumdb25f321997-07-10 14:31:32 +0000958 m = Py_InitModule("reop", reop_global_methods);
959 d = PyModule_GetDict(m);
960
961 /* Initialize reop.error exception */
962 v = ReopError = PyString_FromString("reop.error");
963 if (v == NULL || PyDict_SetItemString(d, "error", v) != 0)
964 goto finally;
965
966 /* Initialize reop.casefold constant */
Guido van Rossum95e80531997-08-13 22:34:14 +0000967 if (!(v = PyString_FromStringAndSize((unsigned char *)NULL, 256)))
Guido van Rossumdb25f321997-07-10 14:31:32 +0000968 goto finally;
969
970 if (!(s = PyString_AsString(v)))
971 goto finally;
972
973 for (i = 0; i < 256; i++) {
974 if (isupper(i))
975 s[i] = tolower(i);
976 else
977 s[i] = i;
978 }
Guido van Rossum74fb3031997-07-17 22:41:38 +0000979
Guido van Rossumdb25f321997-07-10 14:31:32 +0000980 if (PyDict_SetItemString(d, "casefold", v) < 0)
981 goto finally;
982 Py_DECREF(v);
983
Guido van Rossum74fb3031997-07-17 22:41:38 +0000984 /* Initialize the syntax table */
985
986 o = PyDict_New();
987 if (o == NULL)
988 goto finally;
989
990 j[1] = '\0';
991 for (i = 0; i < 256; i++)
992 {
993 j[0] = i;
994 k = PyString_FromStringAndSize(j, 1);
995 if (k == NULL)
996 goto finally;
997 v = PyInt_FromLong(re_syntax_table[i]);
998 if (v == NULL)
999 goto finally;
1000 if (PyDict_SetItem(o, k, v) < 0)
1001 goto finally;
1002 Py_DECREF(k);
1003 Py_DECREF(v);
1004 }
1005
1006 if (PyDict_SetItemString(d, "syntax_table", o) < 0)
1007 goto finally;
1008 Py_DECREF(o);
1009
1010 v = PyInt_FromLong(Sword);
1011 if (v == NULL)
1012 goto finally;
1013
1014 if (PyDict_SetItemString(d, "word", v) < 0)
1015 goto finally;
1016 Py_DECREF(v);
1017
1018 v = PyInt_FromLong(Swhitespace);
1019 if (v == NULL)
1020 goto finally;
1021
1022 if (PyDict_SetItemString(d, "whitespace", v) < 0)
1023 goto finally;
1024 Py_DECREF(v);
1025
1026 v = PyInt_FromLong(Sdigit);
1027 if (v == NULL)
1028 goto finally;
1029
1030 if (PyDict_SetItemString(d, "digit", v) < 0)
1031 goto finally;
1032 Py_DECREF(v);
Guido van Rossumc24f0381997-08-13 03:24:53 +00001033
1034 PyDict_SetItemString(d, "NORMAL", PyInt_FromLong(NORMAL));
1035 PyDict_SetItemString(d, "CHARCLASS", PyInt_FromLong(CHARCLASS));
1036 PyDict_SetItemString(d, "REPLACEMENT", PyInt_FromLong(REPLACEMENT));
1037
1038 PyDict_SetItemString(d, "CHAR", PyInt_FromLong(CHAR));
1039 PyDict_SetItemString(d, "MEMORY_REFERENCE", PyInt_FromLong(MEMORY_REFERENCE));
1040 PyDict_SetItemString(d, "SYNTAX", PyInt_FromLong(SYNTAX));
1041 PyDict_SetItemString(d, "NOT_SYNTAX", PyInt_FromLong(NOT_SYNTAX));
1042 PyDict_SetItemString(d, "SET", PyInt_FromLong(SET));
1043 PyDict_SetItemString(d, "WORD_BOUNDARY", PyInt_FromLong(WORD_BOUNDARY));
1044 PyDict_SetItemString(d, "NOT_WORD_BOUNDARY", PyInt_FromLong(NOT_WORD_BOUNDARY));
1045 PyDict_SetItemString(d, "BEGINNING_OF_BUFFER", PyInt_FromLong(BEGINNING_OF_BUFFER));
1046 PyDict_SetItemString(d, "END_OF_BUFFER", PyInt_FromLong(END_OF_BUFFER));
1047
Guido van Rossumdb25f321997-07-10 14:31:32 +00001048 if (!PyErr_Occurred())
1049 return;
Guido van Rossum74fb3031997-07-17 22:41:38 +00001050
Guido van Rossumdb25f321997-07-10 14:31:32 +00001051 finally:
1052 Py_FatalError("can't initialize reop module");
1053}
Guido van Rossumc24f0381997-08-13 03:24:53 +00001054