Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 1 | /* |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 2 | XXX support range parameter on search |
| 3 | XXX support mstop parameter on search |
| 4 | */ |
| 5 | |
| 6 | /*********************************************************** |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 7 | Copyright 1991, 1992, 1993 by Stichting Mathematisch Centrum, |
| 8 | Amsterdam, The Netherlands. |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 9 | |
| 10 | All Rights Reserved |
| 11 | |
| 12 | Permission to use, copy, modify, and distribute this software and its |
| 13 | documentation for any purpose and without fee is hereby granted, |
| 14 | provided that the above copyright notice appear in all copies and that |
| 15 | both that copyright notice and this permission notice appear in |
| 16 | supporting documentation, and that the names of Stichting Mathematisch |
| 17 | Centrum or CWI not be used in advertising or publicity pertaining to |
| 18 | distribution of the software without specific, written prior permission. |
| 19 | |
| 20 | STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO |
| 21 | THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
| 22 | FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE |
| 23 | FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| 24 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
| 25 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT |
| 26 | OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| 27 | |
| 28 | ******************************************************************/ |
| 29 | |
| 30 | /* Regular expression objects */ |
Guido van Rossum | 1cab95c | 1992-01-19 16:31:57 +0000 | [diff] [blame] | 31 | /* This uses Tatu Ylonen's copyleft-free reimplementation of |
| 32 | GNU regular expressions */ |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 33 | |
| 34 | #include "allobjects.h" |
| 35 | #include "modsupport.h" |
| 36 | |
Guido van Rossum | 1cab95c | 1992-01-19 16:31:57 +0000 | [diff] [blame] | 37 | #include "regexpr.h" |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 38 | |
| 39 | static object *RegexError; /* Exception */ |
| 40 | |
| 41 | typedef struct { |
| 42 | OB_HEAD |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 43 | struct re_pattern_buffer re_patbuf; /* The compiled expression */ |
| 44 | struct re_registers re_regs; /* The registers from the last match */ |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 45 | char re_fastmap[256]; /* Storage for fastmap */ |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 46 | object *re_translate; /* String object for translate table */ |
| 47 | object *re_lastok; /* String object last matched/searched */ |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 48 | } regexobject; |
| 49 | |
| 50 | /* Regex object methods */ |
| 51 | |
| 52 | static void |
| 53 | reg_dealloc(re) |
| 54 | regexobject *re; |
| 55 | { |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 56 | XDECREF(re->re_translate); |
| 57 | XDECREF(re->re_lastok); |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 58 | XDEL(re->re_patbuf.buffer); |
| 59 | XDEL(re->re_patbuf.translate); |
| 60 | DEL(re); |
| 61 | } |
| 62 | |
| 63 | static object * |
| 64 | makeresult(regs) |
| 65 | struct re_registers *regs; |
| 66 | { |
| 67 | object *v = newtupleobject(RE_NREGS); |
| 68 | if (v != NULL) { |
| 69 | int i; |
| 70 | for (i = 0; i < RE_NREGS; i++) { |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 71 | object *w; |
| 72 | w = mkvalue("(ii)", regs->start[i], regs->end[i]); |
| 73 | if (w == NULL) { |
| 74 | XDECREF(v); |
| 75 | v = NULL; |
| 76 | break; |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 77 | } |
| 78 | settupleitem(v, i, w); |
| 79 | } |
| 80 | } |
| 81 | return v; |
| 82 | } |
| 83 | |
| 84 | static object * |
| 85 | reg_match(re, args) |
| 86 | regexobject *re; |
| 87 | object *args; |
| 88 | { |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 89 | object *argstring; |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 90 | char *buffer; |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 91 | int size; |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 92 | int offset; |
| 93 | int result; |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 94 | if (getargs(args, "S", &argstring)) { |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 95 | offset = 0; |
| 96 | } |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 97 | else { |
| 98 | err_clear(); |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 99 | if (!getargs(args, "(Si)", &argstring, &offset)) |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 100 | return NULL; |
| 101 | } |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 102 | buffer = getstringvalue(argstring); |
| 103 | size = getstringsize(argstring); |
| 104 | if (offset < 0 || offset > size) { |
| 105 | err_setstr(RegexError, "match offset out of range"); |
| 106 | return NULL; |
| 107 | } |
| 108 | XDECREF(re->re_lastok); |
| 109 | re->re_lastok = NULL; |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 110 | result = re_match(&re->re_patbuf, buffer, size, offset, &re->re_regs); |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 111 | if (result < -1) { |
| 112 | /* Failure like stack overflow */ |
| 113 | err_setstr(RegexError, "match failure"); |
| 114 | return NULL; |
| 115 | } |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 116 | if (result >= 0) { |
| 117 | INCREF(argstring); |
| 118 | re->re_lastok = argstring; |
| 119 | } |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 120 | return newintobject((long)result); /* Length of the match or -1 */ |
| 121 | } |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 122 | |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 123 | static object * |
| 124 | reg_search(re, args) |
| 125 | regexobject *re; |
| 126 | object *args; |
| 127 | { |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 128 | object *argstring; |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 129 | char *buffer; |
| 130 | int size; |
| 131 | int offset; |
| 132 | int range; |
| 133 | int result; |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 134 | |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 135 | if (getargs(args, "S", &argstring)) { |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 136 | offset = 0; |
| 137 | } |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 138 | else { |
| 139 | err_clear(); |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 140 | if (!getargs(args, "(Si)", &argstring, &offset)) |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 141 | return NULL; |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 142 | } |
| 143 | buffer = getstringvalue(argstring); |
| 144 | size = getstringsize(argstring); |
| 145 | if (offset < 0 || offset > size) { |
| 146 | err_setstr(RegexError, "search offset out of range"); |
| 147 | return NULL; |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 148 | } |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 149 | /* NB: In Emacs 18.57, the documentation for re_search[_2] and |
| 150 | the implementation don't match: the documentation states that |
| 151 | |range| positions are tried, while the code tries |range|+1 |
| 152 | positions. It seems more productive to believe the code! */ |
Guido van Rossum | 2d78590 | 1992-01-26 18:12:41 +0000 | [diff] [blame] | 153 | range = size - offset; |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 154 | XDECREF(re->re_lastok); |
| 155 | re->re_lastok = NULL; |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 156 | result = re_search(&re->re_patbuf, buffer, size, offset, range, |
| 157 | &re->re_regs); |
| 158 | if (result < -1) { |
| 159 | /* Failure like stack overflow */ |
| 160 | err_setstr(RegexError, "match failure"); |
| 161 | return NULL; |
| 162 | } |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 163 | if (result >= 0) { |
| 164 | INCREF(argstring); |
| 165 | re->re_lastok = argstring; |
| 166 | } |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 167 | return newintobject((long)result); /* Position of the match or -1 */ |
| 168 | } |
| 169 | |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 170 | static object * |
| 171 | reg_substring(re, args) |
| 172 | regexobject *re; |
| 173 | object *args; |
| 174 | { |
| 175 | int i, a, b; |
| 176 | if (args != NULL && is_tupleobject(args)) { |
| 177 | int n = gettuplesize(args); |
| 178 | object *res = newtupleobject(n); |
| 179 | if (res == NULL) |
| 180 | return NULL; |
| 181 | for (i = 0; i < n; i++) { |
| 182 | object *v = reg_substring(re, gettupleitem(args, i)); |
| 183 | if (v == NULL) { |
| 184 | DECREF(res); |
| 185 | return NULL; |
| 186 | } |
| 187 | settupleitem(res, i, v); |
| 188 | } |
| 189 | return res; |
| 190 | } |
| 191 | if (!getargs(args, "i", &i)) |
| 192 | return NULL; |
| 193 | if (i < 0 || i >= RE_NREGS) { |
| 194 | err_setstr(RegexError, "substring() index out of range"); |
| 195 | return NULL; |
| 196 | } |
| 197 | if (re->re_lastok == NULL) { |
| 198 | err_setstr(RegexError, |
| 199 | "substring() only valid after successful match/search"); |
| 200 | return NULL; |
| 201 | } |
| 202 | a = re->re_regs.start[i]; |
| 203 | b = re->re_regs.end[i]; |
| 204 | if (a < 0 || b < 0) { |
| 205 | INCREF(None); |
| 206 | return None; |
| 207 | } |
| 208 | return newsizedstringobject(getstringvalue(re->re_lastok)+a, b-a); |
| 209 | } |
| 210 | |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 211 | static struct methodlist reg_methods[] = { |
| 212 | {"match", reg_match}, |
| 213 | {"search", reg_search}, |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 214 | {"substring", reg_substring}, |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 215 | {NULL, NULL} /* sentinel */ |
| 216 | }; |
| 217 | |
| 218 | static object * |
| 219 | reg_getattr(re, name) |
| 220 | regexobject *re; |
| 221 | char *name; |
| 222 | { |
Guido van Rossum | b824fc6 | 1992-01-01 14:52:16 +0000 | [diff] [blame] | 223 | if (strcmp(name, "regs") == 0) { |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 224 | if (re->re_lastok == NULL) { |
Guido van Rossum | b824fc6 | 1992-01-01 14:52:16 +0000 | [diff] [blame] | 225 | err_setstr(RegexError, |
| 226 | "regs only valid after successful match/search"); |
| 227 | return NULL; |
| 228 | } |
| 229 | return makeresult(&re->re_regs); |
| 230 | } |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 231 | if (strcmp(name, "last") == 0) { |
| 232 | if (re->re_lastok == NULL) { |
| 233 | err_setstr(RegexError, |
| 234 | "last only valid after successful match/search"); |
| 235 | return NULL; |
| 236 | } |
| 237 | INCREF(re->re_lastok); |
| 238 | return re->re_lastok; |
| 239 | } |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 240 | return findmethod(reg_methods, (object *)re, name); |
| 241 | } |
| 242 | |
| 243 | static typeobject Regextype = { |
| 244 | OB_HEAD_INIT(&Typetype) |
| 245 | 0, /*ob_size*/ |
| 246 | "regex", /*tp_name*/ |
| 247 | sizeof(regexobject), /*tp_size*/ |
| 248 | 0, /*tp_itemsize*/ |
| 249 | /* methods */ |
| 250 | reg_dealloc, /*tp_dealloc*/ |
| 251 | 0, /*tp_print*/ |
| 252 | reg_getattr, /*tp_getattr*/ |
| 253 | 0, /*tp_setattr*/ |
| 254 | 0, /*tp_compare*/ |
| 255 | 0, /*tp_repr*/ |
| 256 | }; |
| 257 | |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 258 | static object * |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 259 | newregexobject(pat, size, translate) |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 260 | char *pat; |
| 261 | int size; |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 262 | object *translate; |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 263 | { |
| 264 | regexobject *re; |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 265 | if (translate != NULL && getstringsize(translate) != 256) { |
| 266 | err_setstr(RegexError, |
| 267 | "translation table must be 256 bytes"); |
| 268 | return NULL; |
| 269 | } |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 270 | re = NEWOBJ(regexobject, &Regextype); |
| 271 | if (re != NULL) { |
| 272 | char *error; |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 273 | re->re_patbuf.buffer = NULL; |
| 274 | re->re_patbuf.allocated = 0; |
| 275 | re->re_patbuf.fastmap = re->re_fastmap; |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 276 | if (translate) |
| 277 | re->re_patbuf.translate = getstringvalue(translate); |
| 278 | else |
| 279 | re->re_patbuf.translate = NULL; |
| 280 | XINCREF(translate); |
| 281 | re->re_translate = translate; |
| 282 | re->re_lastok = NULL; |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 283 | error = re_compile_pattern(pat, size, &re->re_patbuf); |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 284 | if (error != NULL) { |
| 285 | err_setstr(RegexError, error); |
| 286 | DECREF(re); |
| 287 | re = NULL; |
| 288 | } |
| 289 | } |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 290 | return (object *)re; |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 291 | } |
| 292 | |
| 293 | static object * |
| 294 | regex_compile(self, args) |
| 295 | object *self; |
| 296 | object *args; |
| 297 | { |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 298 | char *pat; |
| 299 | int size; |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 300 | object *tran = NULL; |
| 301 | if (!getargs(args, "s#", &pat, &size)) { |
| 302 | err_clear(); |
| 303 | if (!getargs(args, "(s#S)", &pat, &size, &tran)) |
| 304 | return NULL; |
| 305 | } |
| 306 | return newregexobject(pat, size, tran); |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 307 | } |
| 308 | |
| 309 | static object *cache_pat; |
| 310 | static object *cache_prog; |
| 311 | |
| 312 | static int |
| 313 | update_cache(pat) |
| 314 | object *pat; |
| 315 | { |
| 316 | if (pat != cache_pat) { |
| 317 | XDECREF(cache_pat); |
| 318 | cache_pat = NULL; |
| 319 | XDECREF(cache_prog); |
| 320 | cache_prog = regex_compile((object *)NULL, pat); |
| 321 | if (cache_prog == NULL) |
| 322 | return -1; |
| 323 | cache_pat = pat; |
| 324 | INCREF(cache_pat); |
| 325 | } |
| 326 | return 0; |
| 327 | } |
| 328 | |
| 329 | static object * |
| 330 | regex_match(self, args) |
| 331 | object *self; |
| 332 | object *args; |
| 333 | { |
| 334 | object *pat, *string; |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 335 | if (!getargs(args, "(SS)", &pat, &string)) |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 336 | return NULL; |
| 337 | if (update_cache(pat) < 0) |
| 338 | return NULL; |
| 339 | return reg_match((regexobject *)cache_prog, string); |
| 340 | } |
| 341 | |
| 342 | static object * |
| 343 | regex_search(self, args) |
| 344 | object *self; |
| 345 | object *args; |
| 346 | { |
| 347 | object *pat, *string; |
Guido van Rossum | 36d330b | 1993-02-21 20:12:16 +0000 | [diff] [blame^] | 348 | if (!getargs(args, "(SS)", &pat, &string)) |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 349 | return NULL; |
| 350 | if (update_cache(pat) < 0) |
| 351 | return NULL; |
| 352 | return reg_search((regexobject *)cache_prog, string); |
| 353 | } |
| 354 | |
| 355 | static object * |
| 356 | regex_set_syntax(self, args) |
| 357 | object *self, *args; |
| 358 | { |
| 359 | int syntax; |
| 360 | if (!getintarg(args, &syntax)) |
| 361 | return NULL; |
| 362 | syntax = re_set_syntax(syntax); |
| 363 | return newintobject((long)syntax); |
| 364 | } |
| 365 | |
| 366 | static struct methodlist regex_global_methods[] = { |
| 367 | {"compile", regex_compile}, |
| 368 | {"match", regex_match}, |
| 369 | {"search", regex_search}, |
| 370 | {"set_syntax", regex_set_syntax}, |
| 371 | {NULL, NULL} /* sentinel */ |
| 372 | }; |
| 373 | |
| 374 | initregex() |
| 375 | { |
| 376 | object *m, *d; |
| 377 | |
| 378 | m = initmodule("regex", regex_global_methods); |
| 379 | d = getmoduledict(m); |
| 380 | |
| 381 | /* Initialize regex.error exception */ |
| 382 | RegexError = newstringobject("regex.error"); |
| 383 | if (RegexError == NULL || dictinsert(d, "error", RegexError) != 0) |
| 384 | fatal("can't define regex.error"); |
| 385 | } |