Guido van Rossum | db25f32 | 1997-07-10 14:31:32 +0000 | [diff] [blame] | 1 | /*********************************************************** |
| 2 | Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam, |
| 3 | The Netherlands. |
| 4 | |
| 5 | All Rights Reserved |
| 6 | |
| 7 | Permission to use, copy, modify, and distribute this software and its |
| 8 | documentation for any purpose and without fee is hereby granted, |
| 9 | provided that the above copyright notice appear in all copies and that |
| 10 | both that copyright notice and this permission notice appear in |
| 11 | supporting documentation, and that the names of Stichting Mathematisch |
| 12 | Centrum or CWI or Corporation for National Research Initiatives or |
| 13 | CNRI not be used in advertising or publicity pertaining to |
| 14 | distribution of the software without specific, written prior |
| 15 | permission. |
| 16 | |
| 17 | While CWI is the initial source for this software, a modified version |
| 18 | is made available by the Corporation for National Research Initiatives |
| 19 | (CNRI) at the Internet address ftp://ftp.python.org. |
| 20 | |
| 21 | STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH |
| 22 | REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF |
| 23 | MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH |
| 24 | CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL |
| 25 | DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR |
| 26 | PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER |
| 27 | TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR |
| 28 | PERFORMANCE OF THIS SOFTWARE. |
| 29 | |
| 30 | ******************************************************************/ |
| 31 | |
| 32 | /* $Id$ */ |
| 33 | |
| 34 | /* Regular expression objects */ |
| 35 | /* This uses Tatu Ylonen's copyleft-free reimplementation of |
| 36 | GNU regular expressions */ |
| 37 | |
| 38 | #include "Python.h" |
| 39 | |
| 40 | #include <ctype.h> |
| 41 | |
| 42 | #include "regexpr.h" |
| 43 | |
| 44 | static PyObject *ReopError; /* Exception */ |
| 45 | |
| 46 | static PyObject * |
| 47 | makeresult(regs, num_regs) |
| 48 | struct re_registers *regs; |
| 49 | int num_regs; |
| 50 | { |
| 51 | PyObject *v; |
| 52 | int i; |
| 53 | static PyObject *filler = NULL; |
| 54 | |
| 55 | if (filler == NULL) { |
| 56 | filler = Py_BuildValue("(ii)", -1, -1); |
| 57 | if (filler == NULL) |
| 58 | return NULL; |
| 59 | } |
| 60 | v = PyTuple_New(num_regs); |
| 61 | if (v == NULL) |
| 62 | return NULL; |
| 63 | |
| 64 | for (i = 0; i < num_regs; i++) { |
| 65 | int lo = regs->start[i]; |
| 66 | int hi = regs->end[i]; |
| 67 | PyObject *w; |
| 68 | if (lo == -1 && hi == -1) { |
| 69 | w = filler; |
| 70 | Py_INCREF(w); |
| 71 | } |
| 72 | else |
| 73 | w = Py_BuildValue("(ii)", lo, hi); |
| 74 | if (w == NULL || PyTuple_SetItem(v, i, w) < 0) { |
| 75 | Py_DECREF(v); |
| 76 | return NULL; |
| 77 | } |
| 78 | } |
| 79 | return v; |
| 80 | } |
| 81 | |
| 82 | static PyObject * |
| 83 | reop_match(self, args) |
| 84 | PyObject *self; |
| 85 | PyObject *args; |
| 86 | { |
| 87 | char *string; |
| 88 | int fastmaplen, stringlen; |
| 89 | int can_be_null, anchor, i; |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 90 | int flags, pos, result; |
Guido van Rossum | db25f32 | 1997-07-10 14:31:32 +0000 | [diff] [blame] | 91 | struct re_pattern_buffer bufp; |
| 92 | struct re_registers re_regs; |
| 93 | |
| 94 | if (!PyArg_Parse(args, "(s#iiis#is#i)", |
| 95 | &(bufp.buffer), &(bufp.allocated), |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 96 | &(bufp.num_registers), &flags, &can_be_null, |
Guido van Rossum | db25f32 | 1997-07-10 14:31:32 +0000 | [diff] [blame] | 97 | &(bufp.fastmap), &fastmaplen, |
| 98 | &anchor, |
| 99 | &string, &stringlen, |
| 100 | &pos)) |
| 101 | return NULL; |
| 102 | |
| 103 | /* XXX sanity-check the input data */ |
| 104 | bufp.used=bufp.allocated; |
| 105 | bufp.translate=NULL; |
| 106 | bufp.fastmap_accurate=1; |
| 107 | bufp.can_be_null=can_be_null; |
| 108 | bufp.uses_registers=1; |
Guido van Rossum | db25f32 | 1997-07-10 14:31:32 +0000 | [diff] [blame] | 109 | bufp.anchor=anchor; |
| 110 | |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 111 | for(i=0; i<bufp.num_registers; i++) {re_regs.start[i]=-1; re_regs.end[i]=-1;} |
Guido van Rossum | db25f32 | 1997-07-10 14:31:32 +0000 | [diff] [blame] | 112 | |
| 113 | result = re_match(&bufp, |
| 114 | string, stringlen, pos, |
| 115 | &re_regs); |
| 116 | if (result < -1) { |
| 117 | /* Failure like stack overflow */ |
| 118 | PyErr_SetString(ReopError, "match failure"); |
| 119 | return NULL; |
| 120 | } |
Guido van Rossum | 63e1819 | 1997-07-11 11:08:38 +0000 | [diff] [blame] | 121 | if (result == -1) { |
| 122 | Py_INCREF(Py_None); |
| 123 | return Py_None; |
| 124 | } |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 125 | return makeresult(&re_regs, bufp.num_registers); |
Guido van Rossum | db25f32 | 1997-07-10 14:31:32 +0000 | [diff] [blame] | 126 | } |
| 127 | |
| 128 | static PyObject * |
| 129 | reop_search(self, args) |
| 130 | PyObject *self; |
| 131 | PyObject *args; |
| 132 | { |
| 133 | char *string; |
| 134 | int fastmaplen, stringlen; |
| 135 | int can_be_null, anchor, i; |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 136 | int flags, pos, result; |
Guido van Rossum | db25f32 | 1997-07-10 14:31:32 +0000 | [diff] [blame] | 137 | struct re_pattern_buffer bufp; |
| 138 | struct re_registers re_regs; |
| 139 | |
| 140 | if (!PyArg_Parse(args, "(s#iiis#is#i)", |
| 141 | &(bufp.buffer), &(bufp.allocated), |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 142 | &(bufp.num_registers), &flags, &can_be_null, |
Guido van Rossum | db25f32 | 1997-07-10 14:31:32 +0000 | [diff] [blame] | 143 | &(bufp.fastmap), &fastmaplen, |
| 144 | &anchor, |
| 145 | &string, &stringlen, |
| 146 | &pos)) |
| 147 | return NULL; |
| 148 | |
| 149 | /* XXX sanity-check the input data */ |
| 150 | bufp.used=bufp.allocated; |
| 151 | bufp.translate=NULL; |
| 152 | bufp.fastmap_accurate=1; |
| 153 | bufp.can_be_null=can_be_null; |
| 154 | bufp.uses_registers=1; |
Guido van Rossum | db25f32 | 1997-07-10 14:31:32 +0000 | [diff] [blame] | 155 | bufp.anchor=anchor; |
| 156 | |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 157 | for(i=0; i<bufp.num_registers; i++) {re_regs.start[i]=-1; re_regs.end[i]=-1;} |
Guido van Rossum | db25f32 | 1997-07-10 14:31:32 +0000 | [diff] [blame] | 158 | |
| 159 | result = re_search(&bufp, |
| 160 | string, stringlen, pos, stringlen-pos, |
| 161 | &re_regs); |
| 162 | if (result < -1) { |
| 163 | /* Failure like stack overflow */ |
| 164 | PyErr_SetString(ReopError, "match failure"); |
| 165 | return NULL; |
| 166 | } |
Guido van Rossum | 63e1819 | 1997-07-11 11:08:38 +0000 | [diff] [blame] | 167 | if (result == -1) { |
| 168 | Py_INCREF(Py_None); |
| 169 | return Py_None; |
| 170 | } |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 171 | return makeresult(&re_regs, bufp.num_registers); |
Guido van Rossum | db25f32 | 1997-07-10 14:31:32 +0000 | [diff] [blame] | 172 | } |
| 173 | |
| 174 | #if 0 |
| 175 | /* Functions originally in the regsub module. |
| 176 | Added June 1, 1997. |
| 177 | */ |
| 178 | |
| 179 | /* A cache of previously used patterns is maintained. Notice that if |
| 180 | you change the reop syntax flag, entries in the cache are |
| 181 | invalidated. |
| 182 | XXX Solution: use (syntax flag, pattern) as keys? Clear the cache |
| 183 | every so often, or once it gets past a certain size? |
| 184 | */ |
| 185 | |
| 186 | static PyObject *cache_dict=NULL; |
| 187 | |
| 188 | /* Accept an object; if it's a reop pattern, Py_INCREF it and return |
| 189 | it. If it's a string, a reop object is compiled and cached. |
| 190 | */ |
| 191 | |
| 192 | static reopobject * |
| 193 | cached_compile(pattern) |
| 194 | PyObject *pattern; |
| 195 | { |
| 196 | reopobject *p2; |
| 197 | |
| 198 | if (!PyString_Check(pattern)) |
| 199 | { |
| 200 | /* It's not a string, so assume it's a compiled reop object */ |
| 201 | /* XXX check that! */ |
| 202 | Py_INCREF(pattern); |
| 203 | return (reopobject*)pattern; |
| 204 | } |
| 205 | if (cache_dict==NULL) |
| 206 | { |
| 207 | cache_dict=PyDict_New(); |
| 208 | if (cache_dict==NULL) |
| 209 | { |
| 210 | return (reopobject*)NULL; |
| 211 | } |
| 212 | } |
| 213 | |
| 214 | /* See if the pattern has already been cached; if so, return that |
| 215 | reop object */ |
| 216 | p2=(reopobject*)PyDict_GetItem(cache_dict, pattern); |
| 217 | if (p2) |
| 218 | { |
| 219 | Py_INCREF(p2); |
| 220 | return (reopobject*)p2; |
| 221 | } |
| 222 | |
| 223 | /* Compile the pattern and cache it */ |
| 224 | p2=(reopobject*)newreopobject(pattern, NULL, pattern, NULL); |
| 225 | if (!p2) return p2; |
| 226 | PyDict_SetItem(cache_dict, pattern, (PyObject*)p2); |
| 227 | return p2; |
| 228 | } |
| 229 | |
| 230 | |
| 231 | static PyObject * |
| 232 | internal_split(args, retain) |
| 233 | PyObject *args; |
| 234 | int retain; |
| 235 | { |
| 236 | PyObject *newlist, *s; |
| 237 | reopobject *pattern; |
| 238 | int maxsplit=0, count=0, length, next=0, result; |
| 239 | int match_end=0; /* match_start is defined below */ |
| 240 | char *start; |
| 241 | |
| 242 | if (!PyArg_ParseTuple(args, "s#Oi", &start, &length, &pattern, |
| 243 | &maxsplit)) |
| 244 | { |
| 245 | PyErr_Clear(); |
| 246 | if (!PyArg_ParseTuple(args, "s#O", &start, &length, &pattern)) |
| 247 | return NULL; |
| 248 | } |
| 249 | pattern=cached_compile((PyObject *)pattern); |
| 250 | if (!pattern) return NULL; |
| 251 | |
| 252 | newlist=PyList_New(0); |
| 253 | if (!newlist) return NULL; |
| 254 | |
| 255 | do |
| 256 | { |
| 257 | result = re_search(&pattern->re_patbuf, |
| 258 | start, length, next, length-next, |
| 259 | &pattern->re_regs); |
| 260 | if (result < -1) |
| 261 | { /* Erk... an error happened during the reop search */ |
| 262 | Py_DECREF(newlist); |
| 263 | PyErr_SetString(ReopError, "match failure"); |
| 264 | return NULL; |
| 265 | } |
| 266 | if (next<=result) |
| 267 | { |
| 268 | int match_start=pattern->re_regs.start[0]; |
| 269 | int oldmatch_end=match_end; |
| 270 | match_end=pattern->re_regs.end[0]; |
| 271 | |
| 272 | if (match_start==match_end) |
| 273 | { /* A zero-length match; increment to the next position */ |
| 274 | next=result+1; |
| 275 | match_end=oldmatch_end; |
| 276 | continue; |
| 277 | } |
| 278 | |
| 279 | /* Append the string up to the start of the match */ |
| 280 | s=PyString_FromStringAndSize(start+oldmatch_end, match_start-oldmatch_end); |
| 281 | if (!s) |
| 282 | { |
| 283 | Py_DECREF(newlist); |
| 284 | return NULL; |
| 285 | } |
| 286 | PyList_Append(newlist, s); |
| 287 | Py_DECREF(s); |
| 288 | |
| 289 | if (retain) |
| 290 | { |
| 291 | /* Append a string containing whatever matched */ |
| 292 | s=PyString_FromStringAndSize(start+match_start, match_end-match_start); |
| 293 | if (!s) |
| 294 | { |
| 295 | Py_DECREF(newlist); |
| 296 | return NULL; |
| 297 | } |
| 298 | PyList_Append(newlist, s); |
| 299 | Py_DECREF(s); |
| 300 | } |
| 301 | /* Update the pointer, and increment the count of splits */ |
| 302 | next=match_end; count++; |
| 303 | } |
| 304 | } while (result!=-1 && !(maxsplit && maxsplit==count) && |
| 305 | next<length); |
| 306 | s=PyString_FromStringAndSize(start+match_end, length-match_end); |
| 307 | if (!s) |
| 308 | { |
| 309 | Py_DECREF(newlist); |
| 310 | return NULL; |
| 311 | } |
| 312 | PyList_Append(newlist, s); |
| 313 | Py_DECREF(s); |
| 314 | Py_DECREF(pattern); |
| 315 | return newlist; |
| 316 | } |
| 317 | |
| 318 | static PyObject * |
| 319 | reop_split(self, args) |
| 320 | PyObject *self; |
| 321 | PyObject *args; |
| 322 | { |
| 323 | return internal_split(args, 0); |
| 324 | } |
| 325 | |
| 326 | static PyObject * |
| 327 | reop_splitx(self, args) |
| 328 | PyObject *self; |
| 329 | PyObject *args; |
| 330 | { |
| 331 | return internal_split(args, 1); |
| 332 | } |
| 333 | #endif |
| 334 | |
| 335 | static struct PyMethodDef reop_global_methods[] = { |
| 336 | {"match", reop_match, 0}, |
| 337 | {"search", reop_search, 0}, |
| 338 | #if 0 |
| 339 | {"split", reop_split, 0}, |
| 340 | {"splitx", reop_splitx, 0}, |
| 341 | #endif |
| 342 | {NULL, NULL} /* sentinel */ |
| 343 | }; |
| 344 | |
| 345 | void |
| 346 | initreop() |
| 347 | { |
| 348 | PyObject *m, *d, *v; |
| 349 | int i; |
| 350 | char *s; |
| 351 | |
| 352 | m = Py_InitModule("reop", reop_global_methods); |
| 353 | d = PyModule_GetDict(m); |
| 354 | |
| 355 | /* Initialize reop.error exception */ |
| 356 | v = ReopError = PyString_FromString("reop.error"); |
| 357 | if (v == NULL || PyDict_SetItemString(d, "error", v) != 0) |
| 358 | goto finally; |
| 359 | |
| 360 | /* Initialize reop.casefold constant */ |
| 361 | if (!(v = PyString_FromStringAndSize((char *)NULL, 256))) |
| 362 | goto finally; |
| 363 | |
| 364 | if (!(s = PyString_AsString(v))) |
| 365 | goto finally; |
| 366 | |
| 367 | for (i = 0; i < 256; i++) { |
| 368 | if (isupper(i)) |
| 369 | s[i] = tolower(i); |
| 370 | else |
| 371 | s[i] = i; |
| 372 | } |
| 373 | if (PyDict_SetItemString(d, "casefold", v) < 0) |
| 374 | goto finally; |
| 375 | Py_DECREF(v); |
| 376 | |
| 377 | if (!PyErr_Occurred()) |
| 378 | return; |
| 379 | finally: |
| 380 | Py_FatalError("can't initialize reop module"); |
| 381 | } |