Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 1 | /* |
| 2 | XXX support translate table |
| 3 | XXX support range parameter on search |
| 4 | XXX support mstop parameter on search |
| 5 | */ |
| 6 | |
| 7 | /*********************************************************** |
| 8 | Copyright 1991 by Stichting Mathematisch Centrum, Amsterdam, The |
| 9 | Netherlands. |
| 10 | |
| 11 | All Rights Reserved |
| 12 | |
| 13 | Permission to use, copy, modify, and distribute this software and its |
| 14 | documentation for any purpose and without fee is hereby granted, |
| 15 | provided that the above copyright notice appear in all copies and that |
| 16 | both that copyright notice and this permission notice appear in |
| 17 | supporting documentation, and that the names of Stichting Mathematisch |
| 18 | Centrum or CWI not be used in advertising or publicity pertaining to |
| 19 | distribution of the software without specific, written prior permission. |
| 20 | |
| 21 | STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO |
| 22 | THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
| 23 | FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE |
| 24 | FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| 25 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
| 26 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT |
| 27 | OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| 28 | |
| 29 | ******************************************************************/ |
| 30 | |
| 31 | /* Regular expression objects */ |
Guido van Rossum | 1cab95c | 1992-01-19 16:31:57 +0000 | [diff] [blame] | 32 | /* This uses Tatu Ylonen's copyleft-free reimplementation of |
| 33 | GNU regular expressions */ |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 34 | |
| 35 | #include "allobjects.h" |
| 36 | #include "modsupport.h" |
| 37 | |
Guido van Rossum | 1cab95c | 1992-01-19 16:31:57 +0000 | [diff] [blame] | 38 | #include "regexpr.h" |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 39 | |
| 40 | static object *RegexError; /* Exception */ |
| 41 | |
| 42 | typedef struct { |
| 43 | OB_HEAD |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 44 | struct re_pattern_buffer re_patbuf; /* The compiled expression */ |
| 45 | struct re_registers re_regs; /* The registers from the last match */ |
| 46 | int re_regs_valid; /* Nonzero if the registers are valid */ |
| 47 | char re_fastmap[256]; /* Storage for fastmap */ |
| 48 | } regexobject; |
| 49 | |
| 50 | /* Regex object methods */ |
| 51 | |
| 52 | static void |
| 53 | reg_dealloc(re) |
| 54 | regexobject *re; |
| 55 | { |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 56 | XDEL(re->re_patbuf.buffer); |
| 57 | XDEL(re->re_patbuf.translate); |
| 58 | DEL(re); |
| 59 | } |
| 60 | |
| 61 | static object * |
| 62 | makeresult(regs) |
| 63 | struct re_registers *regs; |
| 64 | { |
| 65 | object *v = newtupleobject(RE_NREGS); |
| 66 | if (v != NULL) { |
| 67 | int i; |
| 68 | for (i = 0; i < RE_NREGS; i++) { |
| 69 | object *w, *u; |
| 70 | if ( (w = newtupleobject(2)) == NULL || |
| 71 | (u = newintobject(regs->start[i])) == NULL || |
| 72 | settupleitem(w, 0, u) != 0 || |
| 73 | (u = newintobject(regs->end[i])) == NULL || |
| 74 | settupleitem(w, 1, u) != 0) { |
| 75 | XDECREF(w); |
| 76 | DECREF(v); |
| 77 | return NULL; |
| 78 | } |
| 79 | settupleitem(v, i, w); |
| 80 | } |
| 81 | } |
| 82 | return v; |
| 83 | } |
| 84 | |
| 85 | static object * |
| 86 | reg_match(re, args) |
| 87 | regexobject *re; |
| 88 | object *args; |
| 89 | { |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 90 | char *buffer; |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 91 | int size; |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 92 | int offset; |
| 93 | int result; |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 94 | if (getargs(args, "s#", &buffer, &size)) { |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 95 | offset = 0; |
| 96 | } |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 97 | else { |
| 98 | err_clear(); |
| 99 | if (!getargs(args, "(s#i)", &buffer, &size, &offset)) |
| 100 | return NULL; |
Guido van Rossum | 3d1e146 | 1992-09-03 20:35:01 +0000 | [diff] [blame] | 101 | if (offset < 0 || offset > size) { |
| 102 | err_setstr(RegexError, "match offset out of range"); |
| 103 | return NULL; |
| 104 | } |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 105 | } |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 106 | re->re_regs_valid = 0; |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 107 | result = re_match(&re->re_patbuf, buffer, size, offset, &re->re_regs); |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 108 | if (result < -1) { |
| 109 | /* Failure like stack overflow */ |
| 110 | err_setstr(RegexError, "match failure"); |
| 111 | return NULL; |
| 112 | } |
| 113 | re->re_regs_valid = result >= 0; |
| 114 | return newintobject((long)result); /* Length of the match or -1 */ |
| 115 | } |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 116 | |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 117 | static object * |
| 118 | reg_search(re, args) |
| 119 | regexobject *re; |
| 120 | object *args; |
| 121 | { |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 122 | char *buffer; |
| 123 | int size; |
| 124 | int offset; |
| 125 | int range; |
| 126 | int result; |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 127 | |
| 128 | if (getargs(args, "s#", &buffer, &size)) { |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 129 | offset = 0; |
| 130 | } |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 131 | else { |
| 132 | err_clear(); |
| 133 | if (!getargs(args, "(s#i)", &buffer, &size, &offset)) |
| 134 | return NULL; |
Guido van Rossum | 3d1e146 | 1992-09-03 20:35:01 +0000 | [diff] [blame] | 135 | if (offset < 0 || offset > size) { |
| 136 | err_setstr(RegexError, "search offset out of range"); |
| 137 | return NULL; |
| 138 | } |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 139 | } |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 140 | /* NB: In Emacs 18.57, the documentation for re_search[_2] and |
| 141 | the implementation don't match: the documentation states that |
| 142 | |range| positions are tried, while the code tries |range|+1 |
| 143 | positions. It seems more productive to believe the code! */ |
Guido van Rossum | 2d78590 | 1992-01-26 18:12:41 +0000 | [diff] [blame] | 144 | range = size - offset; |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 145 | re->re_regs_valid = 0; |
| 146 | result = re_search(&re->re_patbuf, buffer, size, offset, range, |
| 147 | &re->re_regs); |
| 148 | if (result < -1) { |
| 149 | /* Failure like stack overflow */ |
| 150 | err_setstr(RegexError, "match failure"); |
| 151 | return NULL; |
| 152 | } |
| 153 | re->re_regs_valid = result >= 0; |
| 154 | return newintobject((long)result); /* Position of the match or -1 */ |
| 155 | } |
| 156 | |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 157 | static struct methodlist reg_methods[] = { |
| 158 | {"match", reg_match}, |
| 159 | {"search", reg_search}, |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 160 | {NULL, NULL} /* sentinel */ |
| 161 | }; |
| 162 | |
| 163 | static object * |
| 164 | reg_getattr(re, name) |
| 165 | regexobject *re; |
| 166 | char *name; |
| 167 | { |
Guido van Rossum | b824fc6 | 1992-01-01 14:52:16 +0000 | [diff] [blame] | 168 | if (strcmp(name, "regs") == 0) { |
| 169 | if (!re->re_regs_valid) { |
| 170 | err_setstr(RegexError, |
| 171 | "regs only valid after successful match/search"); |
| 172 | return NULL; |
| 173 | } |
| 174 | return makeresult(&re->re_regs); |
| 175 | } |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 176 | return findmethod(reg_methods, (object *)re, name); |
| 177 | } |
| 178 | |
| 179 | static typeobject Regextype = { |
| 180 | OB_HEAD_INIT(&Typetype) |
| 181 | 0, /*ob_size*/ |
| 182 | "regex", /*tp_name*/ |
| 183 | sizeof(regexobject), /*tp_size*/ |
| 184 | 0, /*tp_itemsize*/ |
| 185 | /* methods */ |
| 186 | reg_dealloc, /*tp_dealloc*/ |
| 187 | 0, /*tp_print*/ |
| 188 | reg_getattr, /*tp_getattr*/ |
| 189 | 0, /*tp_setattr*/ |
| 190 | 0, /*tp_compare*/ |
| 191 | 0, /*tp_repr*/ |
| 192 | }; |
| 193 | |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 194 | static object * |
| 195 | newregexobject(pat, size) |
| 196 | char *pat; |
| 197 | int size; |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 198 | { |
| 199 | regexobject *re; |
| 200 | re = NEWOBJ(regexobject, &Regextype); |
| 201 | if (re != NULL) { |
| 202 | char *error; |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 203 | re->re_patbuf.buffer = NULL; |
| 204 | re->re_patbuf.allocated = 0; |
| 205 | re->re_patbuf.fastmap = re->re_fastmap; |
| 206 | re->re_patbuf.translate = NULL; |
| 207 | re->re_regs_valid = 0; |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 208 | error = re_compile_pattern(pat, size, &re->re_patbuf); |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 209 | if (error != NULL) { |
| 210 | err_setstr(RegexError, error); |
| 211 | DECREF(re); |
| 212 | re = NULL; |
| 213 | } |
| 214 | } |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 215 | return (object *)re; |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 216 | } |
| 217 | |
| 218 | static object * |
| 219 | regex_compile(self, args) |
| 220 | object *self; |
| 221 | object *args; |
| 222 | { |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 223 | char *pat; |
| 224 | int size; |
| 225 | if (!getargs(args, "s#", &pat, &size)) |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 226 | return NULL; |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 227 | return newregexobject(pat, size); |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 228 | } |
| 229 | |
| 230 | static object *cache_pat; |
| 231 | static object *cache_prog; |
| 232 | |
| 233 | static int |
| 234 | update_cache(pat) |
| 235 | object *pat; |
| 236 | { |
| 237 | if (pat != cache_pat) { |
| 238 | XDECREF(cache_pat); |
| 239 | cache_pat = NULL; |
| 240 | XDECREF(cache_prog); |
| 241 | cache_prog = regex_compile((object *)NULL, pat); |
| 242 | if (cache_prog == NULL) |
| 243 | return -1; |
| 244 | cache_pat = pat; |
| 245 | INCREF(cache_pat); |
| 246 | } |
| 247 | return 0; |
| 248 | } |
| 249 | |
| 250 | static object * |
| 251 | regex_match(self, args) |
| 252 | object *self; |
| 253 | object *args; |
| 254 | { |
| 255 | object *pat, *string; |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 256 | if (!getStrStrarg(args, &pat, &string)) |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 257 | return NULL; |
| 258 | if (update_cache(pat) < 0) |
| 259 | return NULL; |
| 260 | return reg_match((regexobject *)cache_prog, string); |
| 261 | } |
| 262 | |
| 263 | static object * |
| 264 | regex_search(self, args) |
| 265 | object *self; |
| 266 | object *args; |
| 267 | { |
| 268 | object *pat, *string; |
Guido van Rossum | d577c0c | 1992-01-27 16:46:19 +0000 | [diff] [blame] | 269 | if (!getStrStrarg(args, &pat, &string)) |
Guido van Rossum | 6f4c43d | 1991-12-30 01:42:57 +0000 | [diff] [blame] | 270 | return NULL; |
| 271 | if (update_cache(pat) < 0) |
| 272 | return NULL; |
| 273 | return reg_search((regexobject *)cache_prog, string); |
| 274 | } |
| 275 | |
| 276 | static object * |
| 277 | regex_set_syntax(self, args) |
| 278 | object *self, *args; |
| 279 | { |
| 280 | int syntax; |
| 281 | if (!getintarg(args, &syntax)) |
| 282 | return NULL; |
| 283 | syntax = re_set_syntax(syntax); |
| 284 | return newintobject((long)syntax); |
| 285 | } |
| 286 | |
| 287 | static struct methodlist regex_global_methods[] = { |
| 288 | {"compile", regex_compile}, |
| 289 | {"match", regex_match}, |
| 290 | {"search", regex_search}, |
| 291 | {"set_syntax", regex_set_syntax}, |
| 292 | {NULL, NULL} /* sentinel */ |
| 293 | }; |
| 294 | |
| 295 | initregex() |
| 296 | { |
| 297 | object *m, *d; |
| 298 | |
| 299 | m = initmodule("regex", regex_global_methods); |
| 300 | d = getmoduledict(m); |
| 301 | |
| 302 | /* Initialize regex.error exception */ |
| 303 | RegexError = newstringobject("regex.error"); |
| 304 | if (RegexError == NULL || dictinsert(d, "error", RegexError) != 0) |
| 305 | fatal("can't define regex.error"); |
| 306 | } |