Ian Hodson | 2ee91b4 | 2012-05-14 12:29:36 +0100 | [diff] [blame] | 1 | // Copyright 2006 The RE2 Authors. All Rights Reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // Rewrite POSIX and other features in re |
| 6 | // to use simple extended regular expression features. |
| 7 | // Also sort and simplify character classes. |
| 8 | |
| 9 | #include "util/util.h" |
| 10 | #include "re2/regexp.h" |
| 11 | #include "re2/walker-inl.h" |
| 12 | |
| 13 | namespace re2 { |
| 14 | |
| 15 | // Parses the regexp src and then simplifies it and sets *dst to the |
| 16 | // string representation of the simplified form. Returns true on success. |
| 17 | // Returns false and sets *error (if error != NULL) on error. |
| 18 | bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags, |
| 19 | string* dst, |
| 20 | RegexpStatus* status) { |
| 21 | Regexp* re = Parse(src, flags, status); |
| 22 | if (re == NULL) |
| 23 | return false; |
| 24 | Regexp* sre = re->Simplify(); |
| 25 | re->Decref(); |
| 26 | if (sre == NULL) { |
| 27 | // Should not happen, since Simplify never fails. |
| 28 | LOG(ERROR) << "Simplify failed on " << src; |
| 29 | if (status) { |
| 30 | status->set_code(kRegexpInternalError); |
| 31 | status->set_error_arg(src); |
| 32 | } |
| 33 | return false; |
| 34 | } |
| 35 | *dst = sre->ToString(); |
| 36 | sre->Decref(); |
| 37 | return true; |
| 38 | } |
| 39 | |
| 40 | // Assuming the simple_ flags on the children are accurate, |
| 41 | // is this Regexp* simple? |
| 42 | bool Regexp::ComputeSimple() { |
| 43 | Regexp** subs; |
| 44 | switch (op_) { |
| 45 | case kRegexpNoMatch: |
| 46 | case kRegexpEmptyMatch: |
| 47 | case kRegexpLiteral: |
| 48 | case kRegexpLiteralString: |
| 49 | case kRegexpBeginLine: |
| 50 | case kRegexpEndLine: |
| 51 | case kRegexpBeginText: |
| 52 | case kRegexpWordBoundary: |
| 53 | case kRegexpNoWordBoundary: |
| 54 | case kRegexpEndText: |
| 55 | case kRegexpAnyChar: |
| 56 | case kRegexpAnyByte: |
| 57 | case kRegexpHaveMatch: |
| 58 | return true; |
| 59 | case kRegexpConcat: |
| 60 | case kRegexpAlternate: |
| 61 | // These are simple as long as the subpieces are simple. |
| 62 | subs = sub(); |
| 63 | for (int i = 0; i < nsub_; i++) |
| 64 | if (!subs[i]->simple_) |
| 65 | return false; |
| 66 | return true; |
| 67 | case kRegexpCharClass: |
| 68 | // Simple as long as the char class is not empty, not full. |
| 69 | if (ccb_ != NULL) |
| 70 | return !ccb_->empty() && !ccb_->full(); |
| 71 | return !cc_->empty() && !cc_->full(); |
| 72 | case kRegexpCapture: |
| 73 | subs = sub(); |
| 74 | return subs[0]->simple_; |
| 75 | case kRegexpStar: |
| 76 | case kRegexpPlus: |
| 77 | case kRegexpQuest: |
| 78 | subs = sub(); |
| 79 | if (!subs[0]->simple_) |
| 80 | return false; |
| 81 | switch (subs[0]->op_) { |
| 82 | case kRegexpStar: |
| 83 | case kRegexpPlus: |
| 84 | case kRegexpQuest: |
| 85 | case kRegexpEmptyMatch: |
| 86 | case kRegexpNoMatch: |
| 87 | return false; |
| 88 | default: |
| 89 | break; |
| 90 | } |
| 91 | return true; |
| 92 | case kRegexpRepeat: |
| 93 | return false; |
| 94 | } |
| 95 | LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_; |
| 96 | return false; |
| 97 | } |
| 98 | |
| 99 | // Walker subclass used by Simplify. |
| 100 | // The simplify walk is purely post-recursive: given the simplified children, |
| 101 | // PostVisit creates the simplified result. |
| 102 | // The child_args are simplified Regexp*s. |
| 103 | class SimplifyWalker : public Regexp::Walker<Regexp*> { |
| 104 | public: |
| 105 | SimplifyWalker() {} |
| 106 | virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop); |
| 107 | virtual Regexp* PostVisit(Regexp* re, |
| 108 | Regexp* parent_arg, |
| 109 | Regexp* pre_arg, |
| 110 | Regexp** child_args, int nchild_args); |
| 111 | virtual Regexp* Copy(Regexp* re); |
| 112 | virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); |
| 113 | |
| 114 | private: |
| 115 | // These functions are declared inside SimplifyWalker so that |
| 116 | // they can edit the private fields of the Regexps they construct. |
| 117 | |
| 118 | // Creates a concatenation of two Regexp, consuming refs to re1 and re2. |
| 119 | // Caller must Decref return value when done with it. |
| 120 | static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags); |
| 121 | |
| 122 | // Simplifies the expression re{min,max} in terms of *, +, and ?. |
| 123 | // Returns a new regexp. Does not edit re. Does not consume reference to re. |
| 124 | // Caller must Decref return value when done with it. |
| 125 | static Regexp* SimplifyRepeat(Regexp* re, int min, int max, |
| 126 | Regexp::ParseFlags parse_flags); |
| 127 | |
| 128 | // Simplifies a character class by expanding any named classes |
| 129 | // into rune ranges. Does not edit re. Does not consume ref to re. |
| 130 | // Caller must Decref return value when done with it. |
| 131 | static Regexp* SimplifyCharClass(Regexp* re); |
| 132 | |
| 133 | DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker); |
| 134 | }; |
| 135 | |
| 136 | // Simplifies a regular expression, returning a new regexp. |
| 137 | // The new regexp uses traditional Unix egrep features only, |
| 138 | // plus the Perl (?:) non-capturing parentheses. |
| 139 | // Otherwise, no POSIX or Perl additions. The new regexp |
| 140 | // captures exactly the same subexpressions (with the same indices) |
| 141 | // as the original. |
| 142 | // Does not edit current object. |
| 143 | // Caller must Decref() return value when done with it. |
| 144 | |
| 145 | Regexp* Regexp::Simplify() { |
| 146 | if (simple_) |
| 147 | return Incref(); |
| 148 | SimplifyWalker w; |
| 149 | return w.Walk(this, NULL); |
| 150 | } |
| 151 | |
| 152 | #define Simplify DontCallSimplify // Avoid accidental recursion |
| 153 | |
| 154 | Regexp* SimplifyWalker::Copy(Regexp* re) { |
| 155 | return re->Incref(); |
| 156 | } |
| 157 | |
| 158 | Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { |
| 159 | // This should never be called, since we use Walk and not |
| 160 | // WalkExponential. |
| 161 | LOG(DFATAL) << "SimplifyWalker::ShortVisit called"; |
| 162 | return re->Incref(); |
| 163 | } |
| 164 | |
| 165 | Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) { |
| 166 | if (re->simple_) { |
| 167 | *stop = true; |
| 168 | return re->Incref(); |
| 169 | } |
| 170 | return NULL; |
| 171 | } |
| 172 | |
| 173 | Regexp* SimplifyWalker::PostVisit(Regexp* re, |
| 174 | Regexp* parent_arg, |
| 175 | Regexp* pre_arg, |
| 176 | Regexp** child_args, |
| 177 | int nchild_args) { |
| 178 | switch (re->op()) { |
| 179 | case kRegexpNoMatch: |
| 180 | case kRegexpEmptyMatch: |
| 181 | case kRegexpLiteral: |
| 182 | case kRegexpLiteralString: |
| 183 | case kRegexpBeginLine: |
| 184 | case kRegexpEndLine: |
| 185 | case kRegexpBeginText: |
| 186 | case kRegexpWordBoundary: |
| 187 | case kRegexpNoWordBoundary: |
| 188 | case kRegexpEndText: |
| 189 | case kRegexpAnyChar: |
| 190 | case kRegexpAnyByte: |
| 191 | case kRegexpHaveMatch: |
| 192 | // All these are always simple. |
| 193 | re->simple_ = true; |
| 194 | return re->Incref(); |
| 195 | |
| 196 | case kRegexpConcat: |
| 197 | case kRegexpAlternate: { |
| 198 | // These are simple as long as the subpieces are simple. |
| 199 | // Two passes to avoid allocation in the common case. |
| 200 | bool changed = false; |
| 201 | Regexp** subs = re->sub(); |
| 202 | for (int i = 0; i < re->nsub_; i++) { |
| 203 | Regexp* sub = subs[i]; |
| 204 | Regexp* newsub = child_args[i]; |
| 205 | if (newsub != sub) { |
| 206 | changed = true; |
| 207 | break; |
| 208 | } |
| 209 | } |
| 210 | if (!changed) { |
| 211 | for (int i = 0; i < re->nsub_; i++) { |
| 212 | Regexp* newsub = child_args[i]; |
| 213 | newsub->Decref(); |
| 214 | } |
| 215 | re->simple_ = true; |
| 216 | return re->Incref(); |
| 217 | } |
| 218 | Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
| 219 | nre->AllocSub(re->nsub_); |
| 220 | Regexp** nre_subs = nre->sub(); |
| 221 | for (int i = 0; i <re->nsub_; i++) |
| 222 | nre_subs[i] = child_args[i]; |
| 223 | nre->simple_ = true; |
| 224 | return nre; |
| 225 | } |
| 226 | |
| 227 | case kRegexpCapture: { |
| 228 | Regexp* newsub = child_args[0]; |
| 229 | if (newsub == re->sub()[0]) { |
| 230 | newsub->Decref(); |
| 231 | re->simple_ = true; |
| 232 | return re->Incref(); |
| 233 | } |
| 234 | Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags()); |
| 235 | nre->AllocSub(1); |
| 236 | nre->sub()[0] = newsub; |
| 237 | nre->cap_ = re->cap_; |
| 238 | nre->simple_ = true; |
| 239 | return nre; |
| 240 | } |
| 241 | |
| 242 | case kRegexpStar: |
| 243 | case kRegexpPlus: |
| 244 | case kRegexpQuest: { |
| 245 | Regexp* newsub = child_args[0]; |
| 246 | // Special case: repeat the empty string as much as |
| 247 | // you want, but it's still the empty string. |
| 248 | if (newsub->op() == kRegexpEmptyMatch) |
| 249 | return newsub; |
| 250 | |
| 251 | // These are simple as long as the subpiece is simple. |
| 252 | if (newsub == re->sub()[0]) { |
| 253 | newsub->Decref(); |
| 254 | re->simple_ = true; |
| 255 | return re->Incref(); |
| 256 | } |
| 257 | |
| 258 | // These are also idempotent if flags are constant. |
| 259 | if (re->op() == newsub->op() && |
| 260 | re->parse_flags() == newsub->parse_flags()) |
| 261 | return newsub; |
| 262 | |
| 263 | Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
| 264 | nre->AllocSub(1); |
| 265 | nre->sub()[0] = newsub; |
| 266 | nre->simple_ = true; |
| 267 | return nre; |
| 268 | } |
| 269 | |
| 270 | case kRegexpRepeat: { |
| 271 | Regexp* newsub = child_args[0]; |
| 272 | // Special case: repeat the empty string as much as |
| 273 | // you want, but it's still the empty string. |
| 274 | if (newsub->op() == kRegexpEmptyMatch) |
| 275 | return newsub; |
| 276 | |
| 277 | Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_, |
| 278 | re->parse_flags()); |
| 279 | newsub->Decref(); |
| 280 | nre->simple_ = true; |
| 281 | return nre; |
| 282 | } |
| 283 | |
| 284 | case kRegexpCharClass: { |
| 285 | Regexp* nre = SimplifyCharClass(re); |
| 286 | nre->simple_ = true; |
| 287 | return nre; |
| 288 | } |
| 289 | } |
| 290 | |
| 291 | LOG(ERROR) << "Simplify case not handled: " << re->op(); |
| 292 | return re->Incref(); |
| 293 | } |
| 294 | |
| 295 | // Creates a concatenation of two Regexp, consuming refs to re1 and re2. |
| 296 | // Returns a new Regexp, handing the ref to the caller. |
| 297 | Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2, |
| 298 | Regexp::ParseFlags parse_flags) { |
| 299 | Regexp* re = new Regexp(kRegexpConcat, parse_flags); |
| 300 | re->AllocSub(2); |
| 301 | Regexp** subs = re->sub(); |
| 302 | subs[0] = re1; |
| 303 | subs[1] = re2; |
| 304 | return re; |
| 305 | } |
| 306 | |
| 307 | // Simplifies the expression re{min,max} in terms of *, +, and ?. |
| 308 | // Returns a new regexp. Does not edit re. Does not consume reference to re. |
| 309 | // Caller must Decref return value when done with it. |
| 310 | // The result will *not* necessarily have the right capturing parens |
| 311 | // if you call ToString() and re-parse it: (x){2} becomes (x)(x), |
| 312 | // but in the Regexp* representation, both (x) are marked as $1. |
| 313 | Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, |
| 314 | Regexp::ParseFlags f) { |
| 315 | // x{n,} means at least n matches of x. |
| 316 | if (max == -1) { |
| 317 | // Special case: x{0,} is x* |
| 318 | if (min == 0) |
| 319 | return Regexp::Star(re->Incref(), f); |
| 320 | |
| 321 | // Special case: x{1,} is x+ |
| 322 | if (min == 1) |
| 323 | return Regexp::Plus(re->Incref(), f); |
| 324 | |
| 325 | // General case: x{4,} is xxxx+ |
| 326 | Regexp* nre = new Regexp(kRegexpConcat, f); |
| 327 | nre->AllocSub(min); |
| 328 | VLOG(1) << "Simplify " << min; |
| 329 | Regexp** nre_subs = nre->sub(); |
| 330 | for (int i = 0; i < min-1; i++) |
| 331 | nre_subs[i] = re->Incref(); |
| 332 | nre_subs[min-1] = Regexp::Plus(re->Incref(), f); |
| 333 | return nre; |
| 334 | } |
| 335 | |
| 336 | // Special case: (x){0} matches only empty string. |
| 337 | if (min == 0 && max == 0) |
| 338 | return new Regexp(kRegexpEmptyMatch, f); |
| 339 | |
| 340 | // Special case: x{1} is just x. |
| 341 | if (min == 1 && max == 1) |
| 342 | return re->Incref(); |
| 343 | |
| 344 | // General case: x{n,m} means n copies of x and m copies of x?. |
| 345 | // The machine will do less work if we nest the final m copies, |
| 346 | // so that x{2,5} = xx(x(x(x)?)?)? |
| 347 | |
| 348 | // Build leading prefix: xx. Capturing only on the last one. |
| 349 | Regexp* nre = NULL; |
| 350 | if (min > 0) { |
| 351 | nre = new Regexp(kRegexpConcat, f); |
| 352 | nre->AllocSub(min); |
| 353 | Regexp** nre_subs = nre->sub(); |
| 354 | for (int i = 0; i < min; i++) |
| 355 | nre_subs[i] = re->Incref(); |
| 356 | } |
| 357 | |
| 358 | // Build and attach suffix: (x(x(x)?)?)? |
| 359 | if (max > min) { |
| 360 | Regexp* suf = Regexp::Quest(re->Incref(), f); |
| 361 | for (int i = min+1; i < max; i++) |
| 362 | suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f); |
| 363 | if (nre == NULL) |
| 364 | nre = suf; |
| 365 | else |
| 366 | nre = Concat2(nre, suf, f); |
| 367 | } |
| 368 | |
| 369 | if (nre == NULL) { |
| 370 | // Some degenerate case, like min > max, or min < max < 0. |
| 371 | // This shouldn't happen, because the parser rejects such regexps. |
| 372 | LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max; |
| 373 | return new Regexp(kRegexpNoMatch, f); |
| 374 | } |
| 375 | |
| 376 | return nre; |
| 377 | } |
| 378 | |
| 379 | // Simplifies a character class. |
| 380 | // Caller must Decref return value when done with it. |
| 381 | Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) { |
| 382 | CharClass* cc = re->cc(); |
| 383 | |
| 384 | // Special cases |
| 385 | if (cc->empty()) |
| 386 | return new Regexp(kRegexpNoMatch, re->parse_flags()); |
| 387 | if (cc->full()) |
| 388 | return new Regexp(kRegexpAnyChar, re->parse_flags()); |
| 389 | |
| 390 | return re->Incref(); |
| 391 | } |
| 392 | |
| 393 | } // namespace re2 |