blob: 3578ac72d92697ec78b36097a7dc272747c061c7 [file] [log] [blame]
Guido van Rossumdb25f321997-07-10 14:31:32 +00001/***********************************************************
2Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
3The Netherlands.
4
5 All Rights Reserved
6
7Permission to use, copy, modify, and distribute this software and its
8documentation for any purpose and without fee is hereby granted,
9provided that the above copyright notice appear in all copies and that
10both that copyright notice and this permission notice appear in
11supporting documentation, and that the names of Stichting Mathematisch
12Centrum or CWI or Corporation for National Research Initiatives or
13CNRI not be used in advertising or publicity pertaining to
14distribution of the software without specific, written prior
15permission.
16
17While CWI is the initial source for this software, a modified version
18is made available by the Corporation for National Research Initiatives
19(CNRI) at the Internet address ftp://ftp.python.org.
20
21STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
22REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
23MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
24CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
25DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
26PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
27TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
28PERFORMANCE OF THIS SOFTWARE.
29
30******************************************************************/
31
32/* $Id$ */
33
34/* Regular expression objects */
35/* This uses Tatu Ylonen's copyleft-free reimplementation of
36 GNU regular expressions */
37
38#include "Python.h"
39
40#include <ctype.h>
41
42#include "regexpr.h"
43
44static PyObject *ReopError; /* Exception */
45
Guido van Rossum74fb3031997-07-17 22:41:38 +000046#define IGNORECASE 0x01
47#define MULTILINE 0x02
48#define DOTALL 0x04
49#define VERBOSE 0x08
50
Guido van Rossumc24f0381997-08-13 03:24:53 +000051#define NORMAL 0
52#define CHARCLASS 1
53#define REPLACEMENT 2
54
55#define CHAR 0
56#define MEMORY_REFERENCE 1
57#define SYNTAX 2
58#define NOT_SYNTAX 3
59#define SET 4
60#define WORD_BOUNDARY 5
61#define NOT_WORD_BOUNDARY 6
62#define BEGINNING_OF_BUFFER 7
63#define END_OF_BUFFER 8
64
Guido van Rossum74fb3031997-07-17 22:41:38 +000065static char *reop_casefold;
66
Guido van Rossumdb25f321997-07-10 14:31:32 +000067static PyObject *
68makeresult(regs, num_regs)
69 struct re_registers *regs;
70 int num_regs;
71{
72 PyObject *v;
73 int i;
74 static PyObject *filler = NULL;
75
76 if (filler == NULL) {
77 filler = Py_BuildValue("(ii)", -1, -1);
78 if (filler == NULL)
79 return NULL;
80 }
81 v = PyTuple_New(num_regs);
82 if (v == NULL)
83 return NULL;
84
85 for (i = 0; i < num_regs; i++) {
86 int lo = regs->start[i];
87 int hi = regs->end[i];
88 PyObject *w;
89 if (lo == -1 && hi == -1) {
90 w = filler;
91 Py_INCREF(w);
92 }
93 else
94 w = Py_BuildValue("(ii)", lo, hi);
95 if (w == NULL || PyTuple_SetItem(v, i, w) < 0) {
96 Py_DECREF(v);
97 return NULL;
98 }
99 }
100 return v;
101}
102
103static PyObject *
104reop_match(self, args)
105 PyObject *self;
106 PyObject *args;
107{
108 char *string;
109 int fastmaplen, stringlen;
110 int can_be_null, anchor, i;
Guido van Rossum04a1d741997-07-15 14:38:13 +0000111 int flags, pos, result;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000112 struct re_pattern_buffer bufp;
113 struct re_registers re_regs;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000114 PyObject *modules = NULL;
115 PyObject *reopmodule = NULL;
116 PyObject *reopdict = NULL;
117 PyObject *casefold = NULL;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000118
119 if (!PyArg_Parse(args, "(s#iiis#is#i)",
120 &(bufp.buffer), &(bufp.allocated),
Guido van Rossum04a1d741997-07-15 14:38:13 +0000121 &(bufp.num_registers), &flags, &can_be_null,
Guido van Rossumdb25f321997-07-10 14:31:32 +0000122 &(bufp.fastmap), &fastmaplen,
123 &anchor,
124 &string, &stringlen,
125 &pos))
126 return NULL;
127
128 /* XXX sanity-check the input data */
129 bufp.used=bufp.allocated;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000130 if (flags & IGNORECASE)
131 {
132 if ((modules = PyImport_GetModuleDict()) == NULL)
133 return NULL;
134
135 if ((reopmodule = PyDict_GetItemString(modules,
136 "reop")) == NULL)
137 return NULL;
138
139 if ((reopdict = PyModule_GetDict(reopmodule)) == NULL)
140 return NULL;
141
142 if ((casefold = PyDict_GetItemString(reopdict,
143 "casefold")) == NULL)
144 return NULL;
145
146 bufp.translate = PyString_AsString(casefold);
147 }
148 else
149 bufp.translate=NULL;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000150 bufp.fastmap_accurate=1;
151 bufp.can_be_null=can_be_null;
152 bufp.uses_registers=1;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000153 bufp.anchor=anchor;
154
Guido van Rossum74fb3031997-07-17 22:41:38 +0000155 for(i=0; i<bufp.num_registers; i++) {
156 re_regs.start[i]=-1;
157 re_regs.end[i]=-1;
158 }
Guido van Rossumdb25f321997-07-10 14:31:32 +0000159
160 result = re_match(&bufp,
161 string, stringlen, pos,
162 &re_regs);
Guido van Rossum74fb3031997-07-17 22:41:38 +0000163
Guido van Rossumdb25f321997-07-10 14:31:32 +0000164 if (result < -1) {
165 /* Failure like stack overflow */
166 PyErr_SetString(ReopError, "match failure");
Guido van Rossum74fb3031997-07-17 22:41:38 +0000167
Guido van Rossumdb25f321997-07-10 14:31:32 +0000168 return NULL;
169 }
Guido van Rossum63e18191997-07-11 11:08:38 +0000170 if (result == -1) {
171 Py_INCREF(Py_None);
172 return Py_None;
173 }
Guido van Rossum04a1d741997-07-15 14:38:13 +0000174 return makeresult(&re_regs, bufp.num_registers);
Guido van Rossumdb25f321997-07-10 14:31:32 +0000175}
176
177static PyObject *
178reop_search(self, args)
179 PyObject *self;
180 PyObject *args;
181{
182 char *string;
183 int fastmaplen, stringlen;
184 int can_be_null, anchor, i;
Guido van Rossum04a1d741997-07-15 14:38:13 +0000185 int flags, pos, result;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000186 struct re_pattern_buffer bufp;
187 struct re_registers re_regs;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000188 PyObject *modules = NULL;
189 PyObject *reopmodule = NULL;
190 PyObject *reopdict = NULL;
191 PyObject *casefold = NULL;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000192
193 if (!PyArg_Parse(args, "(s#iiis#is#i)",
194 &(bufp.buffer), &(bufp.allocated),
Guido van Rossum04a1d741997-07-15 14:38:13 +0000195 &(bufp.num_registers), &flags, &can_be_null,
Guido van Rossumdb25f321997-07-10 14:31:32 +0000196 &(bufp.fastmap), &fastmaplen,
197 &anchor,
198 &string, &stringlen,
199 &pos))
200 return NULL;
201
202 /* XXX sanity-check the input data */
203 bufp.used=bufp.allocated;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000204 if (flags & IGNORECASE)
205 {
206 if ((modules = PyImport_GetModuleDict()) == NULL)
207 return NULL;
208
209 if ((reopmodule = PyDict_GetItemString(modules,
210 "reop")) == NULL)
211 return NULL;
212
213 if ((reopdict = PyModule_GetDict(reopmodule)) == NULL)
214 return NULL;
215
216 if ((casefold = PyDict_GetItemString(reopdict,
217 "casefold")) == NULL)
218 return NULL;
219
220 bufp.translate = PyString_AsString(casefold);
221 }
222 else
223 bufp.translate=NULL;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000224 bufp.fastmap_accurate=1;
225 bufp.can_be_null=can_be_null;
226 bufp.uses_registers=1;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000227 bufp.anchor=anchor;
228
Guido van Rossum74fb3031997-07-17 22:41:38 +0000229 for(i = 0; i < bufp.num_registers; i++) {
230 re_regs.start[i] = -1;
231 re_regs.end[i] = -1;
232 }
Guido van Rossumdb25f321997-07-10 14:31:32 +0000233
234 result = re_search(&bufp,
235 string, stringlen, pos, stringlen-pos,
236 &re_regs);
Guido van Rossum74fb3031997-07-17 22:41:38 +0000237
Guido van Rossumdb25f321997-07-10 14:31:32 +0000238 if (result < -1) {
239 /* Failure like stack overflow */
240 PyErr_SetString(ReopError, "match failure");
241 return NULL;
242 }
Guido van Rossum74fb3031997-07-17 22:41:38 +0000243
Guido van Rossum63e18191997-07-11 11:08:38 +0000244 if (result == -1) {
245 Py_INCREF(Py_None);
246 return Py_None;
247 }
Guido van Rossum74fb3031997-07-17 22:41:38 +0000248
Guido van Rossum04a1d741997-07-15 14:38:13 +0000249 return makeresult(&re_regs, bufp.num_registers);
Guido van Rossumdb25f321997-07-10 14:31:32 +0000250}
251
Guido van Rossumc24f0381997-08-13 03:24:53 +0000252static PyObject *
253reop_expand_escape(self, args)
254 PyObject *self;
255 PyObject *args;
256{
257 unsigned char c, *pattern;
258 int index, context=NORMAL, pattern_len;
259
260 if (!PyArg_ParseTuple(args, "s#i|i", &pattern, &pattern_len, &index,
261 &context))
262 return NULL;
263 if (pattern_len<=index)
264 {
265 PyErr_SetString(ReopError, "escape ends too soon");
266 return NULL;
267 }
268 c=pattern[index]; index++;
269 switch (c)
270 {
271 case('t'):
272 return Py_BuildValue("ici", CHAR, (char)9, index);
273 break;
274 case('n'):
275 return Py_BuildValue("ici", CHAR, (char)10, index);
276 break;
277 case('v'):
278 return Py_BuildValue("ici", CHAR, (char)11, index);
279 break;
280 case('r'):
281 return Py_BuildValue("ici", CHAR, (char)13, index);
282 break;
283 case('f'):
284 return Py_BuildValue("ici", CHAR, (char)12, index);
285 break;
286 case('a'):
287 return Py_BuildValue("ici", CHAR, (char)7, index);
288 break;
289 case('x'):
290 {
291 int end, length;
292 unsigned char *string;
293 PyObject *v, *result;
294
295 end=index;
296 while (end<pattern_len &&
297 ( re_syntax_table[ pattern[end] ] & Shexdigit ) )
298 end++;
299 if (end==index)
300 {
301 PyErr_SetString(ReopError, "\\x must be followed by hex digits");
302 return NULL;
303 }
304 length=end-index;
305 string=malloc(length+4+1);
306 if (string==NULL)
307 {
308 PyErr_SetString(PyExc_MemoryError, "can't allocate memory for \\x string");
309 return NULL;
310 }
311 /* Create a string containing "\x<hexdigits>", which will be
312 passed to eval() */
313 string[0]=string[length+3]='"';
314 string[1]='\\';
315 string[length+4]='\0';
316 memcpy(string+2, pattern+index-1, length+1);
317 v=PyRun_String(string, Py_eval_input,
318 PyEval_GetGlobals(), PyEval_GetLocals());
319 free(string);
320 /* The evaluation raised an exception */
321 if (v==NULL) return NULL;
322 result=Py_BuildValue("iOi", CHAR, v, end);
323 Py_DECREF(v);
324 return result;
325 }
326 break;
327
328 case('b'):
329 if (context!=NORMAL)
330 return Py_BuildValue("ici", CHAR, (char)8, index);
331 else
332 {
333 unsigned char empty_string[1];
334 empty_string[0]='\0';
335 return Py_BuildValue("isi", WORD_BOUNDARY, empty_string, index);
336 }
337 break;
338 case('B'):
339 if (context!=NORMAL)
340 return Py_BuildValue("ici", CHAR, 'B', index);
341 else
342 {
343 unsigned char empty_string[1];
344 empty_string[0]='\0';
345 return Py_BuildValue("isi", NOT_WORD_BOUNDARY, empty_string, index);
346 }
347 break;
348 case('A'):
349 if (context!=NORMAL)
350 return Py_BuildValue("ici", CHAR, 'A', index);
351 else
352 {
353 unsigned char empty_string[1];
354 empty_string[0]='\0';
355 return Py_BuildValue("isi", BEGINNING_OF_BUFFER, empty_string, index);
356 }
357 break;
358 case('Z'):
359 if (context!=NORMAL)
360 return Py_BuildValue("ici", CHAR, 'Z', index);
361 else
362 {
363 unsigned char empty_string[1];
364 empty_string[0]='\0';
365 return Py_BuildValue("isi", END_OF_BUFFER, empty_string, index);
366 }
367 break;
368 case('E'): case('G'): case('L'): case('Q'):
369 case('U'): case('l'): case('u'):
370 {
371 char message[50];
372 sprintf(message, "\\%c is not allowed", c);
373 PyErr_SetString(ReopError, message);
374 return NULL;
375 }
376
377 case ('w'):
378 if (context==NORMAL)
379 return Py_BuildValue("iii", SYNTAX, Sword, index);
380 if (context!=CHARCLASS)
381 return Py_BuildValue("ici", CHAR, 'w', index);
382 {
383 /* context==CHARCLASS */
384 unsigned char set[256];
385 int i, j;
386 for(i=j=0; i<256; i++)
387 if (re_syntax_table[i] & Sword)
388 {
389 set[j++] = i;
390 }
391 return Py_BuildValue("is#i", SET, set, j, index);
392 }
393 break;
394 case ('W'):
395 if (context==NORMAL)
396 return Py_BuildValue("iii", NOT_SYNTAX, Sword, index);
397 if (context!=CHARCLASS)
398 return Py_BuildValue("ici", CHAR, 'W', index);
399 {
400 /* context==CHARCLASS */
401 unsigned char set[256];
402 int i, j;
403 for(i=j=0; i<256; i++)
404 if (! (re_syntax_table[i] & Sword))
405 {
406 set[j++] = i;
407 }
408 return Py_BuildValue("is#i", SET, set, j, index);
409 }
410 break;
411 case ('s'):
412 if (context==NORMAL)
413 return Py_BuildValue("iii", SYNTAX, Swhitespace, index);
414 if (context!=CHARCLASS)
415 return Py_BuildValue("ici", CHAR, 's', index);
416 {
417 /* context==CHARCLASS */
418 unsigned char set[256];
419 int i, j;
420 for(i=j=0; i<256; i++)
421 if (re_syntax_table[i] & Swhitespace)
422 {
423 set[j++] = i;
424 }
425 return Py_BuildValue("is#i", SET, set, j, index);
426 }
427 break;
428 case ('S'):
429 if (context==NORMAL)
430 return Py_BuildValue("iii", NOT_SYNTAX, Swhitespace, index);
431 if (context!=CHARCLASS)
432 return Py_BuildValue("ici", CHAR, 'S', index);
433 {
434 /* context==CHARCLASS */
435 unsigned char set[256];
436 int i, j;
437 for(i=j=0; i<256; i++)
438 if (! (re_syntax_table[i] & Swhitespace) )
439 {
440 set[j++] = i;
441 }
442 return Py_BuildValue("is#i", SET, set, j, index);
443 }
444 break;
445
446 case ('d'):
447 if (context==NORMAL)
448 return Py_BuildValue("iii", SYNTAX, Sdigit, index);
449 if (context!=CHARCLASS)
450 return Py_BuildValue("ici", CHAR, 'd', index);
451 {
452 /* context==CHARCLASS */
453 unsigned char set[256];
454 int i, j;
455 for(i=j=0; i<256; i++)
456 if (re_syntax_table[i] & Sdigit)
457 {
458 set[j++] = i;
459 }
460 return Py_BuildValue("is#i", SET, set, j, index);
461 }
462 break;
463 case ('D'):
464 if (context==NORMAL)
465 return Py_BuildValue("iii", NOT_SYNTAX, Sdigit, index);
466 if (context!=CHARCLASS)
467 return Py_BuildValue("ici", CHAR, 'D', index);
468 {
469 /* context==CHARCLASS */
470 unsigned char set[256];
471 int i, j;
472 for(i=j=0; i<256; i++)
473 if ( !(re_syntax_table[i] & Sdigit) )
474 {
475 set[j++] = i;
476 }
477 return Py_BuildValue("is#i", SET, set, j, index);
478 }
479 break;
480
481 case('g'):
482 {
483 int end, valid, i;
484 if (context!=REPLACEMENT)
485 return Py_BuildValue("ici", CHAR, 'g', index);
486 if (pattern_len<=index)
487 {
488 PyErr_SetString(ReopError, "unfinished symbolic reference");
489 return NULL;
490 }
491 if (pattern[index]!='<')
492 {
493 PyErr_SetString(ReopError, "missing < in symbolic reference");
494 return NULL;
495 }
496 index++;
497 end=index;
498 while (end<pattern_len && pattern[end]!='>')
499 end++;
500 if (end==pattern_len)
501 {
502 PyErr_SetString(ReopError, "unfinished symbolic reference");
503 return NULL;
504 }
505 valid=1;
506 if (index==end /* Zero-length name */
507 || !(re_syntax_table[pattern[index]] & Sword) /* First char. not alphanumeric */
508 || (re_syntax_table[pattern[index]] & Sdigit) ) /* First char. a digit */
509 valid=0;
510
511 for(i=index+1; i<end; i++)
512 {
513 if (!(re_syntax_table[pattern[i]] & Sword) )
514 valid=0;
515 }
516 if (!valid)
517 {
518 /* XXX should include the text of the reference */
519 PyErr_SetString(ReopError, "illegal symbolic reference");
520 return NULL;
521 }
522
523 return Py_BuildValue("is#i", MEMORY_REFERENCE,
524 pattern+index, end-index,
525 end+1);
526 }
527 break;
528
529 case('0'):
530 {
531 /* \0 always indicates an octal escape, so we consume up to 3
532 characters, as long as they're all octal digits */
533 int octval=0, i;
534 index--;
535 for(i=index;
536 i<=index+2 && i<pattern_len
537 && (re_syntax_table[ pattern[i] ] & Soctaldigit );
538 i++)
539 {
540 octval = octval * 8 + pattern[i] - '0';
541 }
542 if (octval>255)
543 {
544 PyErr_SetString(ReopError, "octal value out of range");
545 return NULL;
546 }
547 return Py_BuildValue("ici", CHAR, (unsigned char)octval, i);
548 }
549 break;
550 case('1'): case('2'): case('3'): case('4'):
551 case('5'): case('6'): case('7'): case('8'):
552 case('9'):
553 {
554 /* Handle \?, where ? is from 1 through 9 */
555 int value=0;
556 index--;
557 /* If it's at least a two-digit reference, like \34, it might
558 either be a 3-digit octal escape (\123) or a 2-digit
559 decimal memory reference (\34) */
560
561 if ( (index+1) <pattern_len &&
562 (re_syntax_table[ pattern[index+1] ] & Sdigit) )
563 {
564 if ( (index+2) <pattern_len &&
565 (re_syntax_table[ pattern[index+2] ] & Soctaldigit) &&
566 (re_syntax_table[ pattern[index+1] ] & Soctaldigit) &&
567 (re_syntax_table[ pattern[index ] ] & Soctaldigit)
568 )
569 {
570 /* 3 octal digits */
571 value= 8*8*(pattern[index ]-'0') +
572 8*(pattern[index+1]-'0') +
573 (pattern[index+2]-'0');
574 if (value>255)
575 {
576 PyErr_SetString(ReopError, "octal value out of range");
577 return NULL;
578 }
579 return Py_BuildValue("ici", CHAR, (unsigned char)value, index+3);
580 }
581 else
582 {
583 /* 2-digit form, so it's a memory reference */
584 if (context==CHARCLASS)
585 {
586 PyErr_SetString(ReopError, "cannot reference a register "
587 "from inside a character class");
588 return NULL;
589 }
590 value= 10*(pattern[index ]-'0') +
591 (pattern[index+1]-'0');
592 if (value<1 || RE_NREGS<=value)
593 {
594 PyErr_SetString(ReopError, "memory reference out of range");
595 return NULL;
596 }
597 return Py_BuildValue("iii", MEMORY_REFERENCE,
598 value, index+2);
599 }
600 }
601 else
602 {
603 /* Single-digit form, like \2, so it's a memory reference */
604 if (context==CHARCLASS)
605 {
606 PyErr_SetString(ReopError, "cannot reference a register "
607 "from inside a character class");
608 return NULL;
609 }
610 return Py_BuildValue("iii", MEMORY_REFERENCE,
611 pattern[index]-'0', index+1);
612 }
613 }
614 break;
615
616 default:
617 return Py_BuildValue("ici", CHAR, c, index);
618 break;
619 }
620}
621
622static PyObject *
623reop__expand(self, args)
624 PyObject *self;
625 PyObject *args;
626{
627 PyObject *results, *match_obj;
628 PyObject *repl_obj, *newstring;
629 char *repl;
630 int size, total_len, i, start, pos;
631
632 if (!PyArg_ParseTuple(args, "OS", &match_obj, &repl_obj))
633 return NULL;
634
635 repl=PyString_AsString(repl_obj);
636 size=PyString_Size(repl_obj);
637 results=PyList_New(0);
638 if (results==NULL) return NULL;
639 for(start=total_len=i=0; i<size; i++)
640 {
641 if (repl[i]=='\\')
642 {
643 PyObject *args, *t, *value;
644 int escape_type;
645
646 if (start!=i)
647 {
648 PyList_Append(results,
649 PyString_FromStringAndSize(repl+start, i-start));
650 total_len += i-start;
651 }
652 i++;
653 args=Py_BuildValue("Oii", repl_obj, i, REPLACEMENT);
654 t=reop_expand_escape(NULL, args);
655 Py_DECREF(args);
656 if (t==NULL)
657 {
658 /* reop_expand_escape triggered an exception of some sort,
659 so just return */
660 Py_DECREF(results);
661 return NULL;
662 }
663 value=PyTuple_GetItem(t, 1);
664 escape_type=PyInt_AsLong(PyTuple_GetItem(t, 0));
665 switch (escape_type)
666 {
667 case (CHAR):
668 PyList_Append(results, value);
669 total_len += PyString_Size(value);
670 break;
671 case(MEMORY_REFERENCE):
672 {
673 PyObject *r, *tuple, *result;
674 r=PyObject_GetAttrString(match_obj, "group");
675 tuple=PyTuple_New(1);
676 PyTuple_SetItem(tuple, 0, value);
677 result=PyEval_CallObject(r, tuple);
678 Py_DECREF(r); Py_DECREF(tuple);
679 if (result==NULL)
680 {
681 /* The group() method trigged an exception of some sort */
682 Py_DECREF(results);
683 return NULL;
684 }
685 if (result==Py_None)
686 {
687 char message[50];
688 sprintf(message,
689 "group %li did not contribute to the match",
690 PyInt_AsLong(value));
691 PyErr_SetString(ReopError,
692 message);
693 Py_DECREF(result);
694 Py_DECREF(t);
695 Py_DECREF(results);
696 return NULL;
697 }
698 /* xxx typecheck that it's a string! */
699 PyList_Append(results, result);
700 total_len += PyString_Size(result);
701 Py_DECREF(result);
702 }
703 break;
704 default:
705 Py_DECREF(t);
706 Py_DECREF(results);
707 PyErr_SetString(ReopError,
708 "bad escape in replacement");
709 return NULL;
710 }
711 i=start=PyInt_AsLong(PyTuple_GetItem(t, 2));
712 i--; /* Decrement now, because the 'for' loop will increment it */
713 Py_DECREF(t);
714 }
715 } /* endif repl[i]!='\\' */
716
717 if (start!=i)
718 {
719 PyList_Append(results, PyString_FromStringAndSize(repl+start, i-start));
720 total_len += i-start;
721 }
722
723 /* Whew! Now we've constructed a list containing various pieces of
724 strings that will make up our final result. So, iterate over
725 the list concatenating them. A new string measuring total_len
726 bytes is allocated and filled in. */
727
728 newstring=PyString_FromStringAndSize(NULL, total_len);
729 if (newstring==NULL)
730 {
731 Py_DECREF(results);
732 return NULL;
733 }
734
735 repl=PyString_AsString(newstring);
736 for (pos=i=0; i<PyList_Size(results); i++)
737 {
738 PyObject *item=PyList_GetItem(results, i);
739 memcpy(repl+pos, PyString_AsString(item), PyString_Size(item) );
740 pos += PyString_Size(item);
741 }
742 Py_DECREF(results);
743 return newstring;
744}
745
746
Guido van Rossumdb25f321997-07-10 14:31:32 +0000747#if 0
748/* Functions originally in the regsub module.
749 Added June 1, 1997.
750 */
751
752/* A cache of previously used patterns is maintained. Notice that if
753 you change the reop syntax flag, entries in the cache are
754 invalidated.
755 XXX Solution: use (syntax flag, pattern) as keys? Clear the cache
756 every so often, or once it gets past a certain size?
757*/
758
759static PyObject *cache_dict=NULL;
760
761/* Accept an object; if it's a reop pattern, Py_INCREF it and return
762 it. If it's a string, a reop object is compiled and cached.
763*/
764
765static reopobject *
766cached_compile(pattern)
767 PyObject *pattern;
768{
769 reopobject *p2;
770
771 if (!PyString_Check(pattern))
772 {
773 /* It's not a string, so assume it's a compiled reop object */
774 /* XXX check that! */
775 Py_INCREF(pattern);
776 return (reopobject*)pattern;
777 }
778 if (cache_dict==NULL)
779 {
780 cache_dict=PyDict_New();
781 if (cache_dict==NULL)
782 {
783 return (reopobject*)NULL;
784 }
785 }
786
787 /* See if the pattern has already been cached; if so, return that
788 reop object */
789 p2=(reopobject*)PyDict_GetItem(cache_dict, pattern);
790 if (p2)
791 {
792 Py_INCREF(p2);
793 return (reopobject*)p2;
794 }
795
796 /* Compile the pattern and cache it */
797 p2=(reopobject*)newreopobject(pattern, NULL, pattern, NULL);
798 if (!p2) return p2;
799 PyDict_SetItem(cache_dict, pattern, (PyObject*)p2);
800 return p2;
801}
802
803
804static PyObject *
805internal_split(args, retain)
806 PyObject *args;
807 int retain;
808{
809 PyObject *newlist, *s;
810 reopobject *pattern;
811 int maxsplit=0, count=0, length, next=0, result;
812 int match_end=0; /* match_start is defined below */
813 char *start;
814
815 if (!PyArg_ParseTuple(args, "s#Oi", &start, &length, &pattern,
816 &maxsplit))
817 {
818 PyErr_Clear();
819 if (!PyArg_ParseTuple(args, "s#O", &start, &length, &pattern))
820 return NULL;
821 }
822 pattern=cached_compile((PyObject *)pattern);
823 if (!pattern) return NULL;
824
825 newlist=PyList_New(0);
826 if (!newlist) return NULL;
827
828 do
829 {
830 result = re_search(&pattern->re_patbuf,
831 start, length, next, length-next,
832 &pattern->re_regs);
833 if (result < -1)
834 { /* Erk... an error happened during the reop search */
835 Py_DECREF(newlist);
836 PyErr_SetString(ReopError, "match failure");
837 return NULL;
838 }
839 if (next<=result)
840 {
841 int match_start=pattern->re_regs.start[0];
842 int oldmatch_end=match_end;
843 match_end=pattern->re_regs.end[0];
844
845 if (match_start==match_end)
846 { /* A zero-length match; increment to the next position */
847 next=result+1;
848 match_end=oldmatch_end;
849 continue;
850 }
851
852 /* Append the string up to the start of the match */
853 s=PyString_FromStringAndSize(start+oldmatch_end, match_start-oldmatch_end);
854 if (!s)
855 {
856 Py_DECREF(newlist);
857 return NULL;
858 }
859 PyList_Append(newlist, s);
860 Py_DECREF(s);
861
862 if (retain)
863 {
864 /* Append a string containing whatever matched */
865 s=PyString_FromStringAndSize(start+match_start, match_end-match_start);
866 if (!s)
867 {
868 Py_DECREF(newlist);
869 return NULL;
870 }
871 PyList_Append(newlist, s);
872 Py_DECREF(s);
873 }
874 /* Update the pointer, and increment the count of splits */
875 next=match_end; count++;
876 }
877 } while (result!=-1 && !(maxsplit && maxsplit==count) &&
878 next<length);
879 s=PyString_FromStringAndSize(start+match_end, length-match_end);
880 if (!s)
881 {
882 Py_DECREF(newlist);
883 return NULL;
884 }
885 PyList_Append(newlist, s);
886 Py_DECREF(s);
887 Py_DECREF(pattern);
888 return newlist;
889}
890
891static PyObject *
892reop_split(self, args)
893 PyObject *self;
894 PyObject *args;
895{
896 return internal_split(args, 0);
897}
898
899static PyObject *
900reop_splitx(self, args)
901 PyObject *self;
902 PyObject *args;
903{
904 return internal_split(args, 1);
905}
906#endif
907
908static struct PyMethodDef reop_global_methods[] = {
909 {"match", reop_match, 0},
910 {"search", reop_search, 0},
Guido van Rossumc24f0381997-08-13 03:24:53 +0000911 {"expand_escape", reop_expand_escape, 1},
912 {"_expand", reop__expand, 1},
Guido van Rossumdb25f321997-07-10 14:31:32 +0000913#if 0
914 {"split", reop_split, 0},
915 {"splitx", reop_splitx, 0},
916#endif
917 {NULL, NULL} /* sentinel */
918};
919
920void
921initreop()
922{
Guido van Rossum74fb3031997-07-17 22:41:38 +0000923 PyObject *m, *d, *k, *v, *o;
Guido van Rossumdb25f321997-07-10 14:31:32 +0000924 int i;
925 char *s;
Guido van Rossum74fb3031997-07-17 22:41:38 +0000926 char j[2];
927
928 re_compile_initialize();
929
Guido van Rossumdb25f321997-07-10 14:31:32 +0000930 m = Py_InitModule("reop", reop_global_methods);
931 d = PyModule_GetDict(m);
932
933 /* Initialize reop.error exception */
934 v = ReopError = PyString_FromString("reop.error");
935 if (v == NULL || PyDict_SetItemString(d, "error", v) != 0)
936 goto finally;
937
938 /* Initialize reop.casefold constant */
939 if (!(v = PyString_FromStringAndSize((char *)NULL, 256)))
940 goto finally;
941
942 if (!(s = PyString_AsString(v)))
943 goto finally;
944
945 for (i = 0; i < 256; i++) {
946 if (isupper(i))
947 s[i] = tolower(i);
948 else
949 s[i] = i;
950 }
Guido van Rossum74fb3031997-07-17 22:41:38 +0000951
Guido van Rossumdb25f321997-07-10 14:31:32 +0000952 if (PyDict_SetItemString(d, "casefold", v) < 0)
953 goto finally;
954 Py_DECREF(v);
955
Guido van Rossum74fb3031997-07-17 22:41:38 +0000956 /* Initialize the syntax table */
957
958 o = PyDict_New();
959 if (o == NULL)
960 goto finally;
961
962 j[1] = '\0';
963 for (i = 0; i < 256; i++)
964 {
965 j[0] = i;
966 k = PyString_FromStringAndSize(j, 1);
967 if (k == NULL)
968 goto finally;
969 v = PyInt_FromLong(re_syntax_table[i]);
970 if (v == NULL)
971 goto finally;
972 if (PyDict_SetItem(o, k, v) < 0)
973 goto finally;
974 Py_DECREF(k);
975 Py_DECREF(v);
976 }
977
978 if (PyDict_SetItemString(d, "syntax_table", o) < 0)
979 goto finally;
980 Py_DECREF(o);
981
982 v = PyInt_FromLong(Sword);
983 if (v == NULL)
984 goto finally;
985
986 if (PyDict_SetItemString(d, "word", v) < 0)
987 goto finally;
988 Py_DECREF(v);
989
990 v = PyInt_FromLong(Swhitespace);
991 if (v == NULL)
992 goto finally;
993
994 if (PyDict_SetItemString(d, "whitespace", v) < 0)
995 goto finally;
996 Py_DECREF(v);
997
998 v = PyInt_FromLong(Sdigit);
999 if (v == NULL)
1000 goto finally;
1001
1002 if (PyDict_SetItemString(d, "digit", v) < 0)
1003 goto finally;
1004 Py_DECREF(v);
Guido van Rossumc24f0381997-08-13 03:24:53 +00001005
1006 PyDict_SetItemString(d, "NORMAL", PyInt_FromLong(NORMAL));
1007 PyDict_SetItemString(d, "CHARCLASS", PyInt_FromLong(CHARCLASS));
1008 PyDict_SetItemString(d, "REPLACEMENT", PyInt_FromLong(REPLACEMENT));
1009
1010 PyDict_SetItemString(d, "CHAR", PyInt_FromLong(CHAR));
1011 PyDict_SetItemString(d, "MEMORY_REFERENCE", PyInt_FromLong(MEMORY_REFERENCE));
1012 PyDict_SetItemString(d, "SYNTAX", PyInt_FromLong(SYNTAX));
1013 PyDict_SetItemString(d, "NOT_SYNTAX", PyInt_FromLong(NOT_SYNTAX));
1014 PyDict_SetItemString(d, "SET", PyInt_FromLong(SET));
1015 PyDict_SetItemString(d, "WORD_BOUNDARY", PyInt_FromLong(WORD_BOUNDARY));
1016 PyDict_SetItemString(d, "NOT_WORD_BOUNDARY", PyInt_FromLong(NOT_WORD_BOUNDARY));
1017 PyDict_SetItemString(d, "BEGINNING_OF_BUFFER", PyInt_FromLong(BEGINNING_OF_BUFFER));
1018 PyDict_SetItemString(d, "END_OF_BUFFER", PyInt_FromLong(END_OF_BUFFER));
1019
Guido van Rossumdb25f321997-07-10 14:31:32 +00001020 if (!PyErr_Occurred())
1021 return;
Guido van Rossum74fb3031997-07-17 22:41:38 +00001022
Guido van Rossumdb25f321997-07-10 14:31:32 +00001023 finally:
1024 Py_FatalError("can't initialize reop module");
1025}
Guido van Rossumc24f0381997-08-13 03:24:53 +00001026