blob: 8c4f76136d51fbbfbeb9372b61dad0438c2d9886 [file] [log] [blame]
// Copyright 2006-2007 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Tested by search_test.cc.
//
// Prog::SearchNFA, an NFA search.
// This is an actual NFA like the theorists talk about,
// not the pseudo-NFA found in backtracking regexp implementations.
//
// IMPLEMENTATION
//
// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
// which is a variant of the one described in Thompson's 1968 CACM paper.
// See http://swtch.com/~rsc/regexp/ for various history.  The main feature
// over the DFA implementation is that it tracks submatch boundaries.
//
// When the choice of submatch boundaries is ambiguous, this particular
// implementation makes the same choices that traditional backtracking
// implementations (in particular, Perl and PCRE) do.
// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
// time in the length of the input.
//
// Like Thompson's original machine and like the DFA implementation, this
// implementation notices a match only once it is one byte past it.
26
27#include "re2/prog.h"
28#include "re2/regexp.h"
29#include "util/sparse_array.h"
30#include "util/sparse_set.h"
31
32namespace re2 {
33
34class NFA {
35 public:
36 NFA(Prog* prog);
37 ~NFA();
38
39 // Searches for a matching string.
40 // * If anchored is true, only considers matches starting at offset.
41 // Otherwise finds lefmost match at or after offset.
42 // * If longest is true, returns the longest match starting
43 // at the chosen start point. Otherwise returns the so-called
44 // left-biased match, the one traditional backtracking engines
45 // (like Perl and PCRE) find.
46 // Records submatch boundaries in submatch[1..nsubmatch-1].
47 // Submatch[0] is the entire match. When there is a choice in
48 // which text matches each subexpression, the submatch boundaries
49 // are chosen to match what a backtracking implementation would choose.
50 bool Search(const StringPiece& text, const StringPiece& context,
51 bool anchored, bool longest,
52 StringPiece* submatch, int nsubmatch);
53
54 static const int Debug = 0;
55
56 private:
57 struct Thread {
58 union {
59 int id;
60 Thread* next; // when on free list
61 };
62 const char** capture;
63 };
64
65 // State for explicit stack in AddToThreadq.
66 struct AddState {
67 int id; // Inst to process
68 int j;
69 const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip
70
71 AddState()
72 : id(0), j(-1), cap_j(NULL) {}
73 explicit AddState(int id)
74 : id(id), j(-1), cap_j(NULL) {}
75 AddState(int id, const char* cap_j, int j)
76 : id(id), j(j), cap_j(cap_j) {}
77 };
78
79 // Threadq is a list of threads. The list is sorted by the order
80 // in which Perl would explore that particular state -- the earlier
81 // choices appear earlier in the list.
82 typedef SparseArray<Thread*> Threadq;
83
84 inline Thread* AllocThread();
85 inline void FreeThread(Thread*);
86
Alexander Gutkin0d4c5232013-02-28 13:47:27 +000087 // Add id (or its children, following unlabeled arrows)
Ian Hodson2ee91b42012-05-14 12:29:36 +010088 // to the workqueue q with associated capture info.
89 void AddToThreadq(Threadq* q, int id, int flag,
90 const char* p, const char** capture);
91
92 // Run runq on byte c, appending new states to nextq.
93 // Updates matched_ and match_ as new, better matches are found.
94 // p is position of the next byte (the one after c)
95 // in the input string, used when processing capturing parens.
96 // flag is the bitwise or of Bol, Eol, etc., specifying whether
97 // ^, $ and \b match the current input point (after c).
98 inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
99
100 // Returns text version of capture information, for debugging.
101 string FormatCapture(const char** capture);
102
103 inline void CopyCapture(const char** dst, const char** src);
104
105 // Computes whether all matches must begin with the same first
106 // byte, and if so, returns that byte. If not, returns -1.
107 int ComputeFirstByte();
108
109 Prog* prog_; // underlying program
110 int start_; // start instruction in program
111 int ncapture_; // number of submatches to track
112 bool longest_; // whether searching for longest match
113 bool endmatch_; // whether match must end at text.end()
114 const char* btext_; // beginning of text being matched (for FormatSubmatch)
115 const char* etext_; // end of text being matched (for endmatch_)
116 Threadq q0_, q1_; // pre-allocated for Search.
117 const char** match_; // best match so far
118 bool matched_; // any match so far?
119 AddState* astack_; // pre-allocated for AddToThreadq
120 int nastack_;
121 int first_byte_; // required first byte for match, or -1 if none
122
123 Thread* free_threads_; // free list
124
125 DISALLOW_EVIL_CONSTRUCTORS(NFA);
126};
127
128NFA::NFA(Prog* prog) {
129 prog_ = prog;
130 start_ = prog->start();
131 ncapture_ = 0;
132 longest_ = false;
133 endmatch_ = false;
134 btext_ = NULL;
135 etext_ = NULL;
136 q0_.resize(prog_->size());
137 q1_.resize(prog_->size());
138 nastack_ = 2*prog_->size();
139 astack_ = new AddState[nastack_];
140 match_ = NULL;
141 matched_ = false;
142 free_threads_ = NULL;
143 first_byte_ = ComputeFirstByte();
144}
145
146NFA::~NFA() {
147 delete[] match_;
148 delete[] astack_;
149 Thread* next;
150 for (Thread* t = free_threads_; t; t = next) {
151 next = t->next;
152 delete[] t->capture;
153 delete t;
154 }
155}
156
157void NFA::FreeThread(Thread *t) {
158 if (t == NULL)
159 return;
160 t->next = free_threads_;
161 free_threads_ = t;
162}
163
164NFA::Thread* NFA::AllocThread() {
165 Thread* t = free_threads_;
166 if (t == NULL) {
167 t = new Thread;
168 t->capture = new const char*[ncapture_];
169 return t;
170 }
171 free_threads_ = t->next;
172 return t;
173}
174
175void NFA::CopyCapture(const char** dst, const char** src) {
176 for (int i = 0; i < ncapture_; i+=2) {
177 dst[i] = src[i];
178 dst[i+1] = src[i+1];
179 }
180}
181
Alexander Gutkin0d4c5232013-02-28 13:47:27 +0000182// Follows all empty arrows from id0 and enqueues all the states reached.
Ian Hodson2ee91b42012-05-14 12:29:36 +0100183// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
184// The pointer p is the current input position, and m is the
185// current set of match boundaries.
186void NFA::AddToThreadq(Threadq* q, int id0, int flag,
187 const char* p, const char** capture) {
188 if (id0 == 0)
189 return;
190
191 // Astack_ is pre-allocated to avoid resize operations.
192 // It has room for 2*prog_->size() entries, which is enough:
193 // Each inst in prog can be processed at most once,
194 // pushing at most two entries on stk.
195
196 int nstk = 0;
197 AddState* stk = astack_;
198 stk[nstk++] = AddState(id0);
199
200 while (nstk > 0) {
201 DCHECK_LE(nstk, nastack_);
202 const AddState& a = stk[--nstk];
203 if (a.j >= 0)
204 capture[a.j] = a.cap_j;
205
206 int id = a.id;
207 if (id == 0)
208 continue;
209 if (q->has_index(id)) {
210 if (Debug)
211 fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str());
212 continue;
213 }
214
215 // Create entry in q no matter what. We might fill it in below,
216 // or we might not. Even if not, it is necessary to have it,
Alexander Gutkin0d4c5232013-02-28 13:47:27 +0000217 // so that we don't revisit id0 during the recursion.
Ian Hodson2ee91b42012-05-14 12:29:36 +0100218 q->set_new(id, NULL);
219
220 Thread** tp = &q->find(id)->second;
221 int j;
222 Thread* t;
223 Prog::Inst* ip = prog_->inst(id);
224 switch (ip->opcode()) {
225 default:
226 LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
227 break;
228
229 case kInstFail:
230 break;
231
232 case kInstAltMatch:
233 // Save state; will pick up at next byte.
234 t = AllocThread();
235 t->id = id;
236 CopyCapture(t->capture, capture);
237 *tp = t;
238 // fall through
239
240 case kInstAlt:
241 // Explore alternatives.
242 stk[nstk++] = AddState(ip->out1());
243 stk[nstk++] = AddState(ip->out());
244 break;
245
246 case kInstNop:
247 // Continue on.
248 stk[nstk++] = AddState(ip->out());
249 break;
250
251 case kInstCapture:
252 if ((j=ip->cap()) < ncapture_) {
253 // Push a dummy whose only job is to restore capture[j]
254 // once we finish exploring this possibility.
255 stk[nstk++] = AddState(0, capture[j], j);
256
257 // Record capture.
258 capture[j] = p;
259 }
260 stk[nstk++] = AddState(ip->out());
261 break;
262
263 case kInstMatch:
264 case kInstByteRange:
265 // Save state; will pick up at next byte.
266 t = AllocThread();
267 t->id = id;
268 CopyCapture(t->capture, capture);
269 *tp = t;
270 if (Debug)
271 fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
272 break;
273
274 case kInstEmptyWidth:
275 // Continue on if we have all the right flag bits.
276 if (ip->empty() & ~flag)
277 break;
278 stk[nstk++] = AddState(ip->out());
279 break;
280 }
281 }
282}
283
284// Run runq on byte c, appending new states to nextq.
285// Updates match as new, better matches are found.
286// p is position of the byte c in the input string,
287// used when processing capturing parens.
288// flag is the bitwise or of Bol, Eol, etc., specifying whether
289// ^, $ and \b match the current input point (after c).
290// Frees all the threads on runq.
291// If there is a shortcut to the end, returns that shortcut.
292int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
293 nextq->clear();
294
295 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
296 Thread* t = i->second;
297 if (t == NULL)
298 continue;
299
300 if (longest_) {
301 // Can skip any threads started after our current best match.
302 if (matched_ && match_[0] < t->capture[0]) {
303 FreeThread(t);
304 continue;
305 }
306 }
307
308 int id = t->id;
309 Prog::Inst* ip = prog_->inst(id);
310
311 switch (ip->opcode()) {
312 default:
313 // Should only see the values handled below.
314 LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
315 break;
316
317 case kInstByteRange:
318 if (ip->Matches(c))
319 AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
320 break;
321
322 case kInstAltMatch:
323 if (i != runq->begin())
324 break;
325 // The match is ours if we want it.
326 if (ip->greedy(prog_) || longest_) {
327 CopyCapture((const char**)match_, t->capture);
328 FreeThread(t);
329 for (++i; i != runq->end(); ++i)
330 FreeThread(i->second);
331 runq->clear();
332 matched_ = true;
333 if (ip->greedy(prog_))
334 return ip->out1();
335 return ip->out();
336 }
337 break;
338
339 case kInstMatch:
340 if (endmatch_ && p != etext_)
341 break;
342
343 const char* old = t->capture[1]; // previous end pointer
344 t->capture[1] = p;
345 if (longest_) {
346 // Leftmost-longest mode: save this match only if
347 // it is either farther to the left or at the same
348 // point but longer than an existing match.
349 if (!matched_ || t->capture[0] < match_[0] ||
350 (t->capture[0] == match_[0] && t->capture[1] > match_[1]))
351 CopyCapture((const char**)match_, t->capture);
352 } else {
353 // Leftmost-biased mode: this match is by definition
354 // better than what we've already found (see next line).
355 CopyCapture((const char**)match_, t->capture);
356
357 // Cut off the threads that can only find matches
358 // worse than the one we just found: don't run the
359 // rest of the current Threadq.
360 t->capture[0] = old;
361 FreeThread(t);
362 for (++i; i != runq->end(); ++i)
363 FreeThread(i->second);
364 runq->clear();
365 matched_ = true;
366 return 0;
367 }
368 t->capture[0] = old;
369 matched_ = true;
370 break;
371 }
372 FreeThread(t);
373 }
374 runq->clear();
375 return 0;
376}
377
378string NFA::FormatCapture(const char** capture) {
379 string s;
380
381 for (int i = 0; i < ncapture_; i+=2) {
382 if (capture[i] == NULL)
383 StringAppendF(&s, "(?,?)");
384 else if (capture[i+1] == NULL)
385 StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_));
386 else
387 StringAppendF(&s, "(%d,%d)",
388 (int)(capture[i] - btext_),
389 (int)(capture[i+1] - btext_));
390 }
391 return s;
392}
393
394// Returns whether haystack contains needle's memory.
395static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
396 return haystack.begin() <= needle.begin() &&
397 haystack.end() >= needle.end();
398}
399
400bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
401 bool anchored, bool longest,
402 StringPiece* submatch, int nsubmatch) {
403 if (start_ == 0)
404 return false;
405
406 StringPiece context = const_context;
407 if (context.begin() == NULL)
408 context = text;
409
410 if (!StringPieceContains(context, text)) {
411 LOG(FATAL) << "Bad args: context does not contain text "
412 << reinterpret_cast<const void*>(context.begin())
413 << "+" << context.size() << " "
414 << reinterpret_cast<const void*>(text.begin())
415 << "+" << text.size();
416 return false;
417 }
418
419 if (prog_->anchor_start() && context.begin() != text.begin())
420 return false;
421 if (prog_->anchor_end() && context.end() != text.end())
422 return false;
423 anchored |= prog_->anchor_start();
424 if (prog_->anchor_end()) {
425 longest = true;
426 endmatch_ = true;
427 etext_ = text.end();
428 }
429
430 if (nsubmatch < 0) {
431 LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
432 return false;
433 }
434
435 // Save search parameters.
436 ncapture_ = 2*nsubmatch;
437 longest_ = longest;
438
439 if (nsubmatch == 0) {
440 // We need to maintain match[0], both to distinguish the
441 // longest match (if longest is true) and also to tell
442 // whether we've seen any matches at all.
443 ncapture_ = 2;
444 }
445
446 match_ = new const char*[ncapture_];
447 matched_ = false;
448 memset(match_, 0, ncapture_*sizeof match_[0]);
449
450 // For debugging prints.
451 btext_ = context.begin();
452
453 if (Debug) {
454 fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
455 text.as_string().c_str(), context.as_string().c_str(), anchored,
456 longest);
457 }
458
459 // Set up search.
460 Threadq* runq = &q0_;
461 Threadq* nextq = &q1_;
462 runq->clear();
463 nextq->clear();
464 memset(&match_[0], 0, ncapture_*sizeof match_[0]);
465 const char* bp = context.begin();
466 int c = -1;
467 int wasword = 0;
468
469 if (text.begin() > context.begin()) {
470 c = text.begin()[-1] & 0xFF;
471 wasword = Prog::IsWordChar(c);
472 }
473
474 // Loop over the text, stepping the machine.
475 for (const char* p = text.begin();; p++) {
476 // Check for empty-width specials.
477 int flag = 0;
478
479 // ^ and \A
480 if (p == context.begin())
481 flag |= kEmptyBeginText | kEmptyBeginLine;
482 else if (p <= context.end() && p[-1] == '\n')
483 flag |= kEmptyBeginLine;
484
485 // $ and \z
486 if (p == context.end())
487 flag |= kEmptyEndText | kEmptyEndLine;
488 else if (p < context.end() && p[0] == '\n')
489 flag |= kEmptyEndLine;
490
491 // \b and \B
492 int isword = 0;
493 if (p < context.end())
494 isword = Prog::IsWordChar(p[0] & 0xFF);
495
496 if (isword != wasword)
497 flag |= kEmptyWordBoundary;
498 else
499 flag |= kEmptyNonWordBoundary;
500
501 if (Debug) {
502 fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
503 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
504 Thread* t = i->second;
505 if (t == NULL)
506 continue;
507 fprintf(stderr, " %d%s", t->id,
508 FormatCapture((const char**)t->capture).c_str());
509 }
510 fprintf(stderr, "\n");
511 }
512
513 // Process previous character (waited until now to avoid
514 // repeating the flag computation above).
515 // This is a no-op the first time around the loop, because
516 // runq is empty.
517 int id = Step(runq, nextq, c, flag, p-1);
518 DCHECK_EQ(runq->size(), 0);
519 swap(nextq, runq);
520 nextq->clear();
521 if (id != 0) {
522 // We're done: full match ahead.
523 p = text.end();
524 for (;;) {
525 Prog::Inst* ip = prog_->inst(id);
526 switch (ip->opcode()) {
527 default:
528 LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
529 break;
530
531 case kInstCapture:
532 match_[ip->cap()] = p;
533 id = ip->out();
534 continue;
535
536 case kInstNop:
537 id = ip->out();
538 continue;
539
540 case kInstMatch:
541 match_[1] = p;
542 matched_ = true;
543 break;
544
545 case kInstEmptyWidth:
546 if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
547 LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
548 break;
549 }
550 id = ip->out();
551 continue;
552 }
553 break;
554 }
555 break;
556 }
557
558 if (p > text.end())
559 break;
560
561 // Start a new thread if there have not been any matches.
562 // (No point in starting a new thread if there have been
563 // matches, since it would be to the right of the match
564 // we already found.)
565 if (!matched_ && (!anchored || p == text.begin())) {
566 // If there's a required first byte for an unanchored search
567 // and we're not in the middle of any possible matches,
568 // use memchr to search for the byte quickly.
569 if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
570 p < text.end() && (p[0] & 0xFF) != first_byte_) {
571 p = reinterpret_cast<const char*>(memchr(p, first_byte_,
572 text.end() - p));
573 if (p == NULL) {
574 p = text.end();
575 isword = 0;
576 } else {
577 isword = Prog::IsWordChar(p[0] & 0xFF);
578 }
579 flag = Prog::EmptyFlags(context, p);
580 }
581
582 // Steal match storage (cleared but unused as of yet)
583 // temporarily to hold match boundaries for new thread.
584 match_[0] = p;
585 AddToThreadq(runq, start_, flag, p, match_);
586 match_[0] = NULL;
587 }
588
589 // If all the threads have died, stop early.
590 if (runq->size() == 0) {
591 if (Debug)
592 fprintf(stderr, "dead\n");
593 break;
594 }
595
596 if (p == text.end())
597 c = 0;
598 else
599 c = *p & 0xFF;
600 wasword = isword;
601
602 // Will run step(runq, nextq, c, ...) on next iteration. See above.
603 }
604
605 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
606 FreeThread(i->second);
607
608 if (matched_) {
609 for (int i = 0; i < nsubmatch; i++)
610 submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
611 if (Debug)
612 fprintf(stderr, "match (%d,%d)\n",
613 static_cast<int>(match_[0] - btext_),
614 static_cast<int>(match_[1] - btext_));
615 return true;
616 }
617 VLOG(1) << "No matches found";
618 return false;
619}
620
621// Computes whether all successful matches have a common first byte,
622// and if so, returns that byte. If not, returns -1.
623int NFA::ComputeFirstByte() {
624 if (start_ == 0)
625 return -1;
626
627 int b = -1; // first byte, not yet computed
628
629 typedef SparseSet Workq;
630 Workq q(prog_->size());
631 q.insert(start_);
632 for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
633 int id = *it;
634 Prog::Inst* ip = prog_->inst(id);
635 switch (ip->opcode()) {
636 default:
637 LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
638 break;
639
640 case kInstMatch:
641 // The empty string matches: no first byte.
642 return -1;
643
644 case kInstByteRange:
645 // Must match only a single byte
646 if (ip->lo() != ip->hi())
647 return -1;
648 if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
649 return -1;
650 // If we haven't seen any bytes yet, record it;
651 // otherwise must match the one we saw before.
652 if (b == -1)
653 b = ip->lo();
654 else if (b != ip->lo())
655 return -1;
656 break;
657
658 case kInstNop:
659 case kInstCapture:
660 case kInstEmptyWidth:
661 // Continue on.
662 // Ignore ip->empty() flags for kInstEmptyWidth
663 // in order to be as conservative as possible
664 // (assume all possible empty-width flags are true).
665 if (ip->out())
666 q.insert(ip->out());
667 break;
668
669 case kInstAlt:
670 case kInstAltMatch:
671 // Explore alternatives.
672 if (ip->out())
673 q.insert(ip->out());
674 if (ip->out1())
675 q.insert(ip->out1());
676 break;
677
678 case kInstFail:
679 break;
680 }
681 }
682 return b;
683}
684
685bool
686Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
687 Anchor anchor, MatchKind kind,
688 StringPiece* match, int nmatch) {
689 if (NFA::Debug)
690 Dump();
691
692 NFA nfa(this);
693 StringPiece sp;
694 if (kind == kFullMatch) {
695 anchor = kAnchored;
696 if (nmatch == 0) {
697 match = &sp;
698 nmatch = 1;
699 }
700 }
701 if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
702 return false;
703 if (kind == kFullMatch && match[0].end() != text.end())
704 return false;
705 return true;
706}
707
708} // namespace re2
709