blob: 6c671c898ac3bf347d34ca94399f4fb2e80fcfb1 [file] [log] [blame]
/***********************************************************
Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
The Netherlands.

                        All Rights Reserved

Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee is hereby granted,
provided that the above copyright notice appear in all copies and that
both that copyright notice and this permission notice appear in
supporting documentation, and that the names of Stichting Mathematisch
Centrum or CWI or Corporation for National Research Initiatives or
CNRI not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

While CWI is the initial source for this software, a modified version
is made available by the Corporation for National Research Initiatives
(CNRI) at the Internet address ftp://ftp.python.org.

STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.

******************************************************************/
31
/* $Id$ */

/* Regular expression objects */
/* This uses Tatu Ylonen's copyleft-free reimplementation of
   GNU regular expressions */

38#include "Python.h"
39
40#include <ctype.h>
41
42#include "regexpr.h"
43
44static PyObject *ReopError; /* Exception */
45
/* Bits of the `flags' argument of reop.match() and reop.search().
   Only IGNORECASE is consulted by the code visible in this file. */
#define IGNORECASE		0x01
#define MULTILINE		0x02
#define DOTALL			0x04
#define VERBOSE			0x08

/* Values of the optional `context' argument of reop.expand_escape():
   where in the pattern the escape sequence was found. */
#define NORMAL			0
#define CHARCLASS		1
#define REPLACEMENT		2

/* Type codes: first item of the tuple returned by reop.expand_escape(). */
#define CHAR			0
#define MEMORY_REFERENCE	1
#define SYNTAX			2
#define NOT_SYNTAX		3
#define SET			4
#define WORD_BOUNDARY		5
#define NOT_WORD_BOUNDARY	6
#define BEGINNING_OF_BUFFER	7
#define END_OF_BUFFER		8

64
Guido van Rossumdb25f321997-07-10 14:31:32 +000065static PyObject *
66makeresult(regs, num_regs)
67 struct re_registers *regs;
68 int num_regs;
69{
70 PyObject *v;
71 int i;
72 static PyObject *filler = NULL;
73
74 if (filler == NULL) {
75 filler = Py_BuildValue("(ii)", -1, -1);
76 if (filler == NULL)
77 return NULL;
78 }
79 v = PyTuple_New(num_regs);
80 if (v == NULL)
81 return NULL;
82
83 for (i = 0; i < num_regs; i++) {
84 int lo = regs->start[i];
85 int hi = regs->end[i];
86 PyObject *w;
87 if (lo == -1 && hi == -1) {
88 w = filler;
89 Py_INCREF(w);
90 }
91 else
92 w = Py_BuildValue("(ii)", lo, hi);
93 if (w == NULL || PyTuple_SetItem(v, i, w) < 0) {
94 Py_DECREF(v);
95 return NULL;
96 }
97 }
98 return v;
99}
100
101static PyObject *
102reop_match(self, args)
103 PyObject *self;
104 PyObject *args;
105{
Guido van Rossum95e80531997-08-13 22:34:14 +0000106 unsigned char *string;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000107 int fastmaplen, stringlen;
108 int can_be_null, anchor, i;
Guido van Rossum04a1d741997-07-15 14:38:13 +0000109 int flags, pos, result;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000110 struct re_pattern_buffer bufp;
111 struct re_registers re_regs;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000112 PyObject *modules = NULL;
113 PyObject *reopmodule = NULL;
114 PyObject *reopdict = NULL;
115 PyObject *casefold = NULL;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000116
117 if (!PyArg_Parse(args, "(s#iiis#is#i)",
118 &(bufp.buffer), &(bufp.allocated),
Guido van Rossum04a1d741997-07-15 14:38:13 +0000119 &(bufp.num_registers), &flags, &can_be_null,
Guido van Rossumdb25f321997-07-10 14:31:32 +0000120 &(bufp.fastmap), &fastmaplen,
121 &anchor,
122 &string, &stringlen,
123 &pos))
124 return NULL;
125
126 /* XXX sanity-check the input data */
127 bufp.used=bufp.allocated;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000128 if (flags & IGNORECASE)
129 {
130 if ((modules = PyImport_GetModuleDict()) == NULL)
131 return NULL;
132
133 if ((reopmodule = PyDict_GetItemString(modules,
134 "reop")) == NULL)
135 return NULL;
136
137 if ((reopdict = PyModule_GetDict(reopmodule)) == NULL)
138 return NULL;
139
140 if ((casefold = PyDict_GetItemString(reopdict,
141 "casefold")) == NULL)
142 return NULL;
143
Guido van Rossumed2554a1997-08-18 15:31:24 +0000144 bufp.translate = (unsigned char*)PyString_AsString(casefold);
Guido van Rossum74fb3031997-07-17 22:41:38 +0000145 }
146 else
147 bufp.translate=NULL;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000148 bufp.fastmap_accurate=1;
149 bufp.can_be_null=can_be_null;
150 bufp.uses_registers=1;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000151 bufp.anchor=anchor;
152
Guido van Rossum74fb3031997-07-17 22:41:38 +0000153 for(i=0; i<bufp.num_registers; i++) {
154 re_regs.start[i]=-1;
155 re_regs.end[i]=-1;
156 }
Guido van Rossumdb25f321997-07-10 14:31:32 +0000157
158 result = re_match(&bufp,
159 string, stringlen, pos,
160 &re_regs);
Guido van Rossum74fb3031997-07-17 22:41:38 +0000161
Guido van Rossumdb25f321997-07-10 14:31:32 +0000162 if (result < -1) {
163 /* Failure like stack overflow */
Guido van Rossum95e80531997-08-13 22:34:14 +0000164 if (!PyErr_Occurred())
165 PyErr_SetString(ReopError, "match failure");
Guido van Rossumdb25f321997-07-10 14:31:32 +0000166 return NULL;
167 }
Guido van Rossum63e18191997-07-11 11:08:38 +0000168 if (result == -1) {
169 Py_INCREF(Py_None);
170 return Py_None;
171 }
Guido van Rossum04a1d741997-07-15 14:38:13 +0000172 return makeresult(&re_regs, bufp.num_registers);
Guido van Rossumdb25f321997-07-10 14:31:32 +0000173}
174
#if 0
/* Disabled stub.  Copies the compiled pattern passed as the single
   string argument into a new string object, points a local pattern
   buffer at that copy, and returns the copy unchanged; nothing is
   optimized here. */
static PyObject *
reop_optimize(self, args)
	PyObject *self;
	PyObject *args;
{
	struct re_pattern_buffer bufp;
	unsigned char *buffer;
	int buflen;
	PyObject *opt_code;

	if (!PyArg_Parse(args, "(s#)", &buffer, &buflen))
		return NULL;

	/* Create a new string for the optimized code */
	opt_code = PyString_FromStringAndSize(buffer, buflen);
	if (opt_code == NULL)
		return opt_code;

	bufp.buffer = PyString_AsString(opt_code);
	bufp.allocated = buflen;
	bufp.used = bufp.allocated;

	return opt_code;
}
#endif
200
Guido van Rossumdb25f321997-07-10 14:31:32 +0000201static PyObject *
202reop_search(self, args)
203 PyObject *self;
204 PyObject *args;
205{
Guido van Rossum95e80531997-08-13 22:34:14 +0000206 unsigned char *string;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000207 int fastmaplen, stringlen;
208 int can_be_null, anchor, i;
Guido van Rossum04a1d741997-07-15 14:38:13 +0000209 int flags, pos, result;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000210 struct re_pattern_buffer bufp;
211 struct re_registers re_regs;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000212 PyObject *modules = NULL;
213 PyObject *reopmodule = NULL;
214 PyObject *reopdict = NULL;
215 PyObject *casefold = NULL;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000216
217 if (!PyArg_Parse(args, "(s#iiis#is#i)",
218 &(bufp.buffer), &(bufp.allocated),
Guido van Rossum04a1d741997-07-15 14:38:13 +0000219 &(bufp.num_registers), &flags, &can_be_null,
Guido van Rossumdb25f321997-07-10 14:31:32 +0000220 &(bufp.fastmap), &fastmaplen,
221 &anchor,
222 &string, &stringlen,
223 &pos))
224 return NULL;
225
226 /* XXX sanity-check the input data */
227 bufp.used=bufp.allocated;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000228 if (flags & IGNORECASE)
229 {
230 if ((modules = PyImport_GetModuleDict()) == NULL)
231 return NULL;
232
233 if ((reopmodule = PyDict_GetItemString(modules,
234 "reop")) == NULL)
235 return NULL;
236
237 if ((reopdict = PyModule_GetDict(reopmodule)) == NULL)
238 return NULL;
239
240 if ((casefold = PyDict_GetItemString(reopdict,
241 "casefold")) == NULL)
242 return NULL;
243
Guido van Rossumed2554a1997-08-18 15:31:24 +0000244 bufp.translate = (unsigned char *)PyString_AsString(casefold);
Guido van Rossum74fb3031997-07-17 22:41:38 +0000245 }
246 else
247 bufp.translate=NULL;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000248 bufp.fastmap_accurate=1;
249 bufp.can_be_null=can_be_null;
250 bufp.uses_registers=1;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000251 bufp.anchor=anchor;
252
Guido van Rossum74fb3031997-07-17 22:41:38 +0000253 for(i = 0; i < bufp.num_registers; i++) {
254 re_regs.start[i] = -1;
255 re_regs.end[i] = -1;
256 }
Guido van Rossumdb25f321997-07-10 14:31:32 +0000257
258 result = re_search(&bufp,
259 string, stringlen, pos, stringlen-pos,
260 &re_regs);
Guido van Rossum74fb3031997-07-17 22:41:38 +0000261
Guido van Rossumdb25f321997-07-10 14:31:32 +0000262 if (result < -1) {
263 /* Failure like stack overflow */
Guido van Rossum95e80531997-08-13 22:34:14 +0000264 if (!PyErr_Occurred())
265 PyErr_SetString(ReopError, "match failure");
Guido van Rossumdb25f321997-07-10 14:31:32 +0000266 return NULL;
267 }
Guido van Rossum74fb3031997-07-17 22:41:38 +0000268
Guido van Rossum63e18191997-07-11 11:08:38 +0000269 if (result == -1) {
270 Py_INCREF(Py_None);
271 return Py_None;
272 }
Guido van Rossum74fb3031997-07-17 22:41:38 +0000273
Guido van Rossum04a1d741997-07-15 14:38:13 +0000274 return makeresult(&re_regs, bufp.num_registers);
Guido van Rossumdb25f321997-07-10 14:31:32 +0000275}
276
Guido van Rossumc24f0381997-08-13 03:24:53 +0000277static PyObject *
278reop_expand_escape(self, args)
279 PyObject *self;
280 PyObject *args;
281{
282 unsigned char c, *pattern;
283 int index, context=NORMAL, pattern_len;
284
285 if (!PyArg_ParseTuple(args, "s#i|i", &pattern, &pattern_len, &index,
286 &context))
287 return NULL;
288 if (pattern_len<=index)
289 {
290 PyErr_SetString(ReopError, "escape ends too soon");
291 return NULL;
292 }
293 c=pattern[index]; index++;
294 switch (c)
295 {
296 case('t'):
297 return Py_BuildValue("ici", CHAR, (char)9, index);
298 break;
299 case('n'):
300 return Py_BuildValue("ici", CHAR, (char)10, index);
301 break;
302 case('v'):
303 return Py_BuildValue("ici", CHAR, (char)11, index);
304 break;
305 case('r'):
306 return Py_BuildValue("ici", CHAR, (char)13, index);
307 break;
308 case('f'):
309 return Py_BuildValue("ici", CHAR, (char)12, index);
310 break;
311 case('a'):
312 return Py_BuildValue("ici", CHAR, (char)7, index);
313 break;
314 case('x'):
315 {
316 int end, length;
317 unsigned char *string;
318 PyObject *v, *result;
319
320 end=index;
321 while (end<pattern_len &&
322 ( re_syntax_table[ pattern[end] ] & Shexdigit ) )
323 end++;
324 if (end==index)
325 {
326 PyErr_SetString(ReopError, "\\x must be followed by hex digits");
327 return NULL;
328 }
329 length=end-index;
330 string=malloc(length+4+1);
331 if (string==NULL)
332 {
333 PyErr_SetString(PyExc_MemoryError, "can't allocate memory for \\x string");
334 return NULL;
335 }
336 /* Create a string containing "\x<hexdigits>", which will be
337 passed to eval() */
338 string[0]=string[length+3]='"';
339 string[1]='\\';
340 string[length+4]='\0';
341 memcpy(string+2, pattern+index-1, length+1);
Guido van Rossumed2554a1997-08-18 15:31:24 +0000342 v=PyRun_String((char *)string, Py_eval_input,
Guido van Rossumc24f0381997-08-13 03:24:53 +0000343 PyEval_GetGlobals(), PyEval_GetLocals());
344 free(string);
345 /* The evaluation raised an exception */
346 if (v==NULL) return NULL;
347 result=Py_BuildValue("iOi", CHAR, v, end);
348 Py_DECREF(v);
349 return result;
350 }
351 break;
352
353 case('b'):
354 if (context!=NORMAL)
355 return Py_BuildValue("ici", CHAR, (char)8, index);
356 else
357 {
358 unsigned char empty_string[1];
359 empty_string[0]='\0';
360 return Py_BuildValue("isi", WORD_BOUNDARY, empty_string, index);
361 }
362 break;
363 case('B'):
364 if (context!=NORMAL)
365 return Py_BuildValue("ici", CHAR, 'B', index);
366 else
367 {
368 unsigned char empty_string[1];
369 empty_string[0]='\0';
370 return Py_BuildValue("isi", NOT_WORD_BOUNDARY, empty_string, index);
371 }
372 break;
373 case('A'):
374 if (context!=NORMAL)
375 return Py_BuildValue("ici", CHAR, 'A', index);
376 else
377 {
378 unsigned char empty_string[1];
379 empty_string[0]='\0';
380 return Py_BuildValue("isi", BEGINNING_OF_BUFFER, empty_string, index);
381 }
382 break;
383 case('Z'):
384 if (context!=NORMAL)
385 return Py_BuildValue("ici", CHAR, 'Z', index);
386 else
387 {
388 unsigned char empty_string[1];
389 empty_string[0]='\0';
390 return Py_BuildValue("isi", END_OF_BUFFER, empty_string, index);
391 }
392 break;
393 case('E'): case('G'): case('L'): case('Q'):
394 case('U'): case('l'): case('u'):
395 {
396 char message[50];
397 sprintf(message, "\\%c is not allowed", c);
398 PyErr_SetString(ReopError, message);
399 return NULL;
400 }
401
402 case ('w'):
403 if (context==NORMAL)
404 return Py_BuildValue("iii", SYNTAX, Sword, index);
405 if (context!=CHARCLASS)
406 return Py_BuildValue("ici", CHAR, 'w', index);
407 {
408 /* context==CHARCLASS */
409 unsigned char set[256];
410 int i, j;
411 for(i=j=0; i<256; i++)
412 if (re_syntax_table[i] & Sword)
413 {
414 set[j++] = i;
415 }
416 return Py_BuildValue("is#i", SET, set, j, index);
417 }
418 break;
419 case ('W'):
420 if (context==NORMAL)
421 return Py_BuildValue("iii", NOT_SYNTAX, Sword, index);
422 if (context!=CHARCLASS)
423 return Py_BuildValue("ici", CHAR, 'W', index);
424 {
425 /* context==CHARCLASS */
426 unsigned char set[256];
427 int i, j;
428 for(i=j=0; i<256; i++)
429 if (! (re_syntax_table[i] & Sword))
430 {
431 set[j++] = i;
432 }
433 return Py_BuildValue("is#i", SET, set, j, index);
434 }
435 break;
436 case ('s'):
437 if (context==NORMAL)
438 return Py_BuildValue("iii", SYNTAX, Swhitespace, index);
439 if (context!=CHARCLASS)
440 return Py_BuildValue("ici", CHAR, 's', index);
441 {
442 /* context==CHARCLASS */
443 unsigned char set[256];
444 int i, j;
445 for(i=j=0; i<256; i++)
446 if (re_syntax_table[i] & Swhitespace)
447 {
448 set[j++] = i;
449 }
450 return Py_BuildValue("is#i", SET, set, j, index);
451 }
452 break;
453 case ('S'):
454 if (context==NORMAL)
455 return Py_BuildValue("iii", NOT_SYNTAX, Swhitespace, index);
456 if (context!=CHARCLASS)
457 return Py_BuildValue("ici", CHAR, 'S', index);
458 {
459 /* context==CHARCLASS */
460 unsigned char set[256];
461 int i, j;
462 for(i=j=0; i<256; i++)
463 if (! (re_syntax_table[i] & Swhitespace) )
464 {
465 set[j++] = i;
466 }
467 return Py_BuildValue("is#i", SET, set, j, index);
468 }
469 break;
470
471 case ('d'):
472 if (context==NORMAL)
473 return Py_BuildValue("iii", SYNTAX, Sdigit, index);
474 if (context!=CHARCLASS)
475 return Py_BuildValue("ici", CHAR, 'd', index);
476 {
477 /* context==CHARCLASS */
478 unsigned char set[256];
479 int i, j;
480 for(i=j=0; i<256; i++)
481 if (re_syntax_table[i] & Sdigit)
482 {
483 set[j++] = i;
484 }
485 return Py_BuildValue("is#i", SET, set, j, index);
486 }
487 break;
488 case ('D'):
489 if (context==NORMAL)
490 return Py_BuildValue("iii", NOT_SYNTAX, Sdigit, index);
491 if (context!=CHARCLASS)
492 return Py_BuildValue("ici", CHAR, 'D', index);
493 {
494 /* context==CHARCLASS */
495 unsigned char set[256];
496 int i, j;
497 for(i=j=0; i<256; i++)
498 if ( !(re_syntax_table[i] & Sdigit) )
499 {
500 set[j++] = i;
501 }
502 return Py_BuildValue("is#i", SET, set, j, index);
503 }
504 break;
505
506 case('g'):
507 {
508 int end, valid, i;
509 if (context!=REPLACEMENT)
510 return Py_BuildValue("ici", CHAR, 'g', index);
511 if (pattern_len<=index)
512 {
513 PyErr_SetString(ReopError, "unfinished symbolic reference");
514 return NULL;
515 }
516 if (pattern[index]!='<')
517 {
518 PyErr_SetString(ReopError, "missing < in symbolic reference");
519 return NULL;
520 }
521 index++;
522 end=index;
523 while (end<pattern_len && pattern[end]!='>')
524 end++;
525 if (end==pattern_len)
526 {
527 PyErr_SetString(ReopError, "unfinished symbolic reference");
528 return NULL;
529 }
530 valid=1;
531 if (index==end /* Zero-length name */
532 || !(re_syntax_table[pattern[index]] & Sword) /* First char. not alphanumeric */
533 || (re_syntax_table[pattern[index]] & Sdigit) ) /* First char. a digit */
534 valid=0;
535
536 for(i=index+1; i<end; i++)
537 {
538 if (!(re_syntax_table[pattern[i]] & Sword) )
539 valid=0;
540 }
541 if (!valid)
542 {
543 /* XXX should include the text of the reference */
544 PyErr_SetString(ReopError, "illegal symbolic reference");
545 return NULL;
546 }
547
548 return Py_BuildValue("is#i", MEMORY_REFERENCE,
549 pattern+index, end-index,
550 end+1);
551 }
552 break;
553
554 case('0'):
555 {
556 /* \0 always indicates an octal escape, so we consume up to 3
557 characters, as long as they're all octal digits */
558 int octval=0, i;
559 index--;
560 for(i=index;
561 i<=index+2 && i<pattern_len
562 && (re_syntax_table[ pattern[i] ] & Soctaldigit );
563 i++)
564 {
565 octval = octval * 8 + pattern[i] - '0';
566 }
567 if (octval>255)
568 {
569 PyErr_SetString(ReopError, "octal value out of range");
570 return NULL;
571 }
572 return Py_BuildValue("ici", CHAR, (unsigned char)octval, i);
573 }
574 break;
575 case('1'): case('2'): case('3'): case('4'):
576 case('5'): case('6'): case('7'): case('8'):
577 case('9'):
578 {
579 /* Handle \?, where ? is from 1 through 9 */
580 int value=0;
581 index--;
582 /* If it's at least a two-digit reference, like \34, it might
583 either be a 3-digit octal escape (\123) or a 2-digit
584 decimal memory reference (\34) */
585
586 if ( (index+1) <pattern_len &&
587 (re_syntax_table[ pattern[index+1] ] & Sdigit) )
588 {
589 if ( (index+2) <pattern_len &&
590 (re_syntax_table[ pattern[index+2] ] & Soctaldigit) &&
591 (re_syntax_table[ pattern[index+1] ] & Soctaldigit) &&
592 (re_syntax_table[ pattern[index ] ] & Soctaldigit)
593 )
594 {
595 /* 3 octal digits */
596 value= 8*8*(pattern[index ]-'0') +
597 8*(pattern[index+1]-'0') +
598 (pattern[index+2]-'0');
599 if (value>255)
600 {
601 PyErr_SetString(ReopError, "octal value out of range");
602 return NULL;
603 }
604 return Py_BuildValue("ici", CHAR, (unsigned char)value, index+3);
605 }
606 else
607 {
608 /* 2-digit form, so it's a memory reference */
609 if (context==CHARCLASS)
610 {
Guido van Rossumbd4435a1997-09-05 07:01:19 +0000611 PyErr_SetString(ReopError,
612 "cannot reference a register from inside a character class");
Guido van Rossumc24f0381997-08-13 03:24:53 +0000613 return NULL;
614 }
615 value= 10*(pattern[index ]-'0') +
616 (pattern[index+1]-'0');
617 if (value<1 || RE_NREGS<=value)
618 {
619 PyErr_SetString(ReopError, "memory reference out of range");
620 return NULL;
621 }
622 return Py_BuildValue("iii", MEMORY_REFERENCE,
623 value, index+2);
624 }
625 }
626 else
627 {
628 /* Single-digit form, like \2, so it's a memory reference */
629 if (context==CHARCLASS)
630 {
Guido van Rossumbd4435a1997-09-05 07:01:19 +0000631 PyErr_SetString(ReopError,
632 "cannot reference a register from inside a character class");
Guido van Rossumc24f0381997-08-13 03:24:53 +0000633 return NULL;
634 }
635 return Py_BuildValue("iii", MEMORY_REFERENCE,
636 pattern[index]-'0', index+1);
637 }
638 }
639 break;
640
641 default:
642 return Py_BuildValue("ici", CHAR, c, index);
643 break;
644 }
645}
646
647static PyObject *
648reop__expand(self, args)
649 PyObject *self;
650 PyObject *args;
651{
652 PyObject *results, *match_obj;
653 PyObject *repl_obj, *newstring;
Guido van Rossum95e80531997-08-13 22:34:14 +0000654 unsigned char *repl;
Guido van Rossumc24f0381997-08-13 03:24:53 +0000655 int size, total_len, i, start, pos;
656
657 if (!PyArg_ParseTuple(args, "OS", &match_obj, &repl_obj))
658 return NULL;
659
Guido van Rossumed2554a1997-08-18 15:31:24 +0000660 repl=(unsigned char *)PyString_AsString(repl_obj);
Guido van Rossumc24f0381997-08-13 03:24:53 +0000661 size=PyString_Size(repl_obj);
662 results=PyList_New(0);
663 if (results==NULL) return NULL;
664 for(start=total_len=i=0; i<size; i++)
665 {
666 if (repl[i]=='\\')
667 {
668 PyObject *args, *t, *value;
669 int escape_type;
670
671 if (start!=i)
672 {
673 PyList_Append(results,
Guido van Rossumed2554a1997-08-18 15:31:24 +0000674 PyString_FromStringAndSize((char *)repl+start, i-start));
Guido van Rossumc24f0381997-08-13 03:24:53 +0000675 total_len += i-start;
676 }
677 i++;
678 args=Py_BuildValue("Oii", repl_obj, i, REPLACEMENT);
679 t=reop_expand_escape(NULL, args);
680 Py_DECREF(args);
681 if (t==NULL)
682 {
683 /* reop_expand_escape triggered an exception of some sort,
684 so just return */
685 Py_DECREF(results);
686 return NULL;
687 }
688 value=PyTuple_GetItem(t, 1);
689 escape_type=PyInt_AsLong(PyTuple_GetItem(t, 0));
690 switch (escape_type)
691 {
692 case (CHAR):
693 PyList_Append(results, value);
694 total_len += PyString_Size(value);
695 break;
696 case(MEMORY_REFERENCE):
697 {
698 PyObject *r, *tuple, *result;
699 r=PyObject_GetAttrString(match_obj, "group");
700 tuple=PyTuple_New(1);
Guido van Rossumf1c018d1997-08-14 21:19:13 +0000701 Py_INCREF(value);
Guido van Rossumc24f0381997-08-13 03:24:53 +0000702 PyTuple_SetItem(tuple, 0, value);
703 result=PyEval_CallObject(r, tuple);
704 Py_DECREF(r); Py_DECREF(tuple);
705 if (result==NULL)
706 {
707 /* The group() method trigged an exception of some sort */
708 Py_DECREF(results);
709 return NULL;
710 }
711 if (result==Py_None)
712 {
713 char message[50];
714 sprintf(message,
715 "group %li did not contribute to the match",
716 PyInt_AsLong(value));
717 PyErr_SetString(ReopError,
718 message);
719 Py_DECREF(result);
720 Py_DECREF(t);
721 Py_DECREF(results);
722 return NULL;
723 }
724 /* xxx typecheck that it's a string! */
725 PyList_Append(results, result);
726 total_len += PyString_Size(result);
727 Py_DECREF(result);
728 }
729 break;
730 default:
731 Py_DECREF(t);
732 Py_DECREF(results);
733 PyErr_SetString(ReopError,
734 "bad escape in replacement");
735 return NULL;
736 }
737 i=start=PyInt_AsLong(PyTuple_GetItem(t, 2));
738 i--; /* Decrement now, because the 'for' loop will increment it */
739 Py_DECREF(t);
740 }
741 } /* endif repl[i]!='\\' */
742
743 if (start!=i)
744 {
Guido van Rossumed2554a1997-08-18 15:31:24 +0000745 PyList_Append(results, PyString_FromStringAndSize((char *)repl+start, i-start));
Guido van Rossumc24f0381997-08-13 03:24:53 +0000746 total_len += i-start;
747 }
748
749 /* Whew! Now we've constructed a list containing various pieces of
750 strings that will make up our final result. So, iterate over
751 the list concatenating them. A new string measuring total_len
752 bytes is allocated and filled in. */
753
754 newstring=PyString_FromStringAndSize(NULL, total_len);
755 if (newstring==NULL)
756 {
757 Py_DECREF(results);
758 return NULL;
759 }
760
Guido van Rossumed2554a1997-08-18 15:31:24 +0000761 repl=(unsigned char *)PyString_AsString(newstring);
Guido van Rossumc24f0381997-08-13 03:24:53 +0000762 for (pos=i=0; i<PyList_Size(results); i++)
763 {
764 PyObject *item=PyList_GetItem(results, i);
765 memcpy(repl+pos, PyString_AsString(item), PyString_Size(item) );
766 pos += PyString_Size(item);
767 }
768 Py_DECREF(results);
769 return newstring;
770}
771
772
#if 0
/* Functions originally in the regsub module.
   Added June 1, 1997.
 */

/* A cache of previously used patterns is maintained.  Notice that if
   you change the reop syntax flag, entries in the cache are
   invalidated.
   XXX Solution: use (syntax flag, pattern) as keys?  Clear the cache
   every so often, or once it gets past a certain size?
*/

static PyObject *cache_dict = NULL;

/* Accept an object; if it's a reop pattern, Py_INCREF it and return
   it.  If it's a string, a reop object is compiled and cached.
   The caller owns the returned reference; NULL is returned when the
   cache or the compiled object cannot be created or stored.
*/

static reopobject *
cached_compile(pattern)
	PyObject *pattern;
{
	reopobject *compiled;

	if (!PyString_Check(pattern)) {
		/* It's not a string, so assume it's a compiled reop object */
		/* XXX check that! */
		Py_INCREF(pattern);
		return (reopobject *)pattern;
	}
	if (cache_dict == NULL) {
		cache_dict = PyDict_New();
		if (cache_dict == NULL)
			return (reopobject *)NULL;
	}

	/* See if the pattern has already been cached; if so, return that
	   reop object */
	compiled = (reopobject *)PyDict_GetItem(cache_dict, pattern);
	if (compiled) {
		Py_INCREF(compiled);
		return compiled;
	}

	/* Compile the pattern and cache it */
	compiled = (reopobject *)newreopobject(pattern, NULL, pattern, NULL);
	if (!compiled)
		return compiled;
	if (PyDict_SetItem(cache_dict, pattern, (PyObject *)compiled) < 0) {
		/* Don't hand back an object while an exception is pending */
		Py_DECREF(compiled);
		return (reopobject *)NULL;
	}
	return compiled;
}


/* Common implementation of split() and splitx().  Arguments are
   (string, pattern[, maxsplit]); pattern may be a string or a compiled
   object (see cached_compile).  Returns a list of the pieces of
   `string' between non-empty matches; when `retain' is true the
   matched text is inserted between the pieces as well.  A non-zero
   maxsplit limits the number of splits. */
static PyObject *
internal_split(args, retain)
	PyObject *args;
	int retain;
{
	PyObject *newlist, *s, *pattern_arg;
	reopobject *pattern;
	int maxsplit=0, count=0, length, next=0, result;
	int match_end=0; /* match_start is defined below */
	int status;
	unsigned char *start;

	if (!PyArg_ParseTuple(args, "s#O|i", &start, &length, &pattern_arg,
			      &maxsplit))
		return NULL;
	pattern = cached_compile(pattern_arg);
	if (!pattern)
		return NULL;

	newlist = PyList_New(0);
	if (!newlist) {
		Py_DECREF(pattern);
		return NULL;
	}

	do {
		result = re_search(&pattern->re_patbuf,
				   start, length, next, length-next,
				   &pattern->re_regs);
		if (result < -1) {
			/* Erk... an error happened during the reop search */
			if (!PyErr_Occurred())
				PyErr_SetString(ReopError, "match failure");
			goto error;
		}
		if (next <= result) {
			int match_start = pattern->re_regs.start[0];
			int oldmatch_end = match_end;
			match_end = pattern->re_regs.end[0];

			if (match_start == match_end) {
				/* A zero-length match; increment to the
				   next position */
				next = result+1;
				match_end = oldmatch_end;
				continue;
			}

			/* Append the string up to the start of the match */
			s = PyString_FromStringAndSize(
				(char *)start+oldmatch_end,
				match_start-oldmatch_end);
			if (!s)
				goto error;
			status = PyList_Append(newlist, s);
			Py_DECREF(s);
			if (status < 0)
				goto error;

			if (retain) {
				/* Append a string containing whatever
				   matched */
				s = PyString_FromStringAndSize(
					(char *)start+match_start,
					match_end-match_start);
				if (!s)
					goto error;
				status = PyList_Append(newlist, s);
				Py_DECREF(s);
				if (status < 0)
					goto error;
			}
			/* Update the pointer, and increment the count of
			   splits */
			next = match_end;
			count++;
		}
	} while (result != -1 && !(maxsplit && maxsplit == count) &&
		 next < length);

	/* Whatever follows the last match is the final piece */
	s = PyString_FromStringAndSize((char *)start+match_end,
				       length-match_end);
	if (!s)
		goto error;
	status = PyList_Append(newlist, s);
	Py_DECREF(s);
	if (status < 0)
		goto error;
	Py_DECREF(pattern);
	return newlist;

  error:
	/* Drop the partial result and the pattern reference obtained
	   from cached_compile() */
	Py_DECREF(newlist);
	Py_DECREF(pattern);
	return NULL;
}

/* split(string, pattern[, maxsplit]): matched text is discarded */
static PyObject *
reop_split(self, args)
	PyObject *self;
	PyObject *args;
{
	return internal_split(args, 0);
}

/* splitx(string, pattern[, maxsplit]): matched text is kept in the list */
static PyObject *
reop_splitx(self, args)
	PyObject *self;
	PyObject *args;
{
	return internal_split(args, 1);
}
#endif
933
934static struct PyMethodDef reop_global_methods[] = {
935 {"match", reop_match, 0},
936 {"search", reop_search, 0},
Guido van Rossumc24f0381997-08-13 03:24:53 +0000937 {"expand_escape", reop_expand_escape, 1},
938 {"_expand", reop__expand, 1},
Guido van Rossumdb25f321997-07-10 14:31:32 +0000939#if 0
Guido van Rossum95e80531997-08-13 22:34:14 +0000940 {"_optimize", reop_optimize, 0},
Guido van Rossumdb25f321997-07-10 14:31:32 +0000941 {"split", reop_split, 0},
942 {"splitx", reop_splitx, 0},
943#endif
944 {NULL, NULL} /* sentinel */
945};
946
947void
948initreop()
949{
Guido van Rossum74fb3031997-07-17 22:41:38 +0000950 PyObject *m, *d, *k, *v, *o;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000951 int i;
Guido van Rossum95e80531997-08-13 22:34:14 +0000952 unsigned char *s;
953 unsigned char j[2];
Guido van Rossum74fb3031997-07-17 22:41:38 +0000954
955 re_compile_initialize();
956
Guido van Rossumdb25f321997-07-10 14:31:32 +0000957 m = Py_InitModule("reop", reop_global_methods);
958 d = PyModule_GetDict(m);
959
960 /* Initialize reop.error exception */
Guido van Rossum0cb96de1997-10-01 04:29:29 +0000961 v = ReopError = PyErr_NewException("reop.error", NULL, NULL);
Guido van Rossumdb25f321997-07-10 14:31:32 +0000962 if (v == NULL || PyDict_SetItemString(d, "error", v) != 0)
963 goto finally;
964
965 /* Initialize reop.casefold constant */
Guido van Rossumed2554a1997-08-18 15:31:24 +0000966 if (!(v = PyString_FromStringAndSize((char *)NULL, 256)))
Guido van Rossumdb25f321997-07-10 14:31:32 +0000967 goto finally;
968
Guido van Rossumed2554a1997-08-18 15:31:24 +0000969 if (!(s = (unsigned char *)PyString_AsString(v)))
Guido van Rossumdb25f321997-07-10 14:31:32 +0000970 goto finally;
971
972 for (i = 0; i < 256; i++) {
973 if (isupper(i))
974 s[i] = tolower(i);
975 else
976 s[i] = i;
977 }
Guido van Rossum74fb3031997-07-17 22:41:38 +0000978
Guido van Rossumdb25f321997-07-10 14:31:32 +0000979 if (PyDict_SetItemString(d, "casefold", v) < 0)
980 goto finally;
981 Py_DECREF(v);
982
Guido van Rossum74fb3031997-07-17 22:41:38 +0000983 /* Initialize the syntax table */
984
985 o = PyDict_New();
986 if (o == NULL)
987 goto finally;
988
989 j[1] = '\0';
990 for (i = 0; i < 256; i++)
991 {
992 j[0] = i;
Guido van Rossumed2554a1997-08-18 15:31:24 +0000993 k = PyString_FromStringAndSize((char *)j, 1);
Guido van Rossum74fb3031997-07-17 22:41:38 +0000994 if (k == NULL)
995 goto finally;
996 v = PyInt_FromLong(re_syntax_table[i]);
997 if (v == NULL)
998 goto finally;
999 if (PyDict_SetItem(o, k, v) < 0)
1000 goto finally;
1001 Py_DECREF(k);
1002 Py_DECREF(v);
1003 }
1004
1005 if (PyDict_SetItemString(d, "syntax_table", o) < 0)
1006 goto finally;
1007 Py_DECREF(o);
1008
1009 v = PyInt_FromLong(Sword);
1010 if (v == NULL)
1011 goto finally;
1012
1013 if (PyDict_SetItemString(d, "word", v) < 0)
1014 goto finally;
1015 Py_DECREF(v);
1016
1017 v = PyInt_FromLong(Swhitespace);
1018 if (v == NULL)
1019 goto finally;
1020
1021 if (PyDict_SetItemString(d, "whitespace", v) < 0)
1022 goto finally;
1023 Py_DECREF(v);
1024
1025 v = PyInt_FromLong(Sdigit);
1026 if (v == NULL)
1027 goto finally;
1028
1029 if (PyDict_SetItemString(d, "digit", v) < 0)
1030 goto finally;
1031 Py_DECREF(v);
Guido van Rossumc24f0381997-08-13 03:24:53 +00001032
1033 PyDict_SetItemString(d, "NORMAL", PyInt_FromLong(NORMAL));
1034 PyDict_SetItemString(d, "CHARCLASS", PyInt_FromLong(CHARCLASS));
1035 PyDict_SetItemString(d, "REPLACEMENT", PyInt_FromLong(REPLACEMENT));
1036
1037 PyDict_SetItemString(d, "CHAR", PyInt_FromLong(CHAR));
1038 PyDict_SetItemString(d, "MEMORY_REFERENCE", PyInt_FromLong(MEMORY_REFERENCE));
1039 PyDict_SetItemString(d, "SYNTAX", PyInt_FromLong(SYNTAX));
1040 PyDict_SetItemString(d, "NOT_SYNTAX", PyInt_FromLong(NOT_SYNTAX));
1041 PyDict_SetItemString(d, "SET", PyInt_FromLong(SET));
1042 PyDict_SetItemString(d, "WORD_BOUNDARY", PyInt_FromLong(WORD_BOUNDARY));
1043 PyDict_SetItemString(d, "NOT_WORD_BOUNDARY", PyInt_FromLong(NOT_WORD_BOUNDARY));
1044 PyDict_SetItemString(d, "BEGINNING_OF_BUFFER", PyInt_FromLong(BEGINNING_OF_BUFFER));
1045 PyDict_SetItemString(d, "END_OF_BUFFER", PyInt_FromLong(END_OF_BUFFER));
1046
Guido van Rossumdb25f321997-07-10 14:31:32 +00001047 if (!PyErr_Occurred())
1048 return;
Guido van Rossum74fb3031997-07-17 22:41:38 +00001049
Guido van Rossumdb25f321997-07-10 14:31:32 +00001050 finally:
Guido van Rossum0cb96de1997-10-01 04:29:29 +00001051 /* Nothing */;
Guido van Rossumdb25f321997-07-10 14:31:32 +00001052}
Guido van Rossumc24f0381997-08-13 03:24:53 +00001053