blob: 9cddb7119ebc2db4ff89937a8666d60b6e372497 [file] [log] [blame]
Ian Hodson2ee91b42012-05-14 12:29:36 +01001// Copyright 2007 The RE2 Authors. All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Compile regular expression to Prog.
6//
7// Prog and Inst are defined in prog.h.
8// This file's external interface is just Regexp::CompileToProg.
9// The Compiler class defined in this file is private.
10
11#include "re2/prog.h"
12#include "re2/re2.h"
13#include "re2/regexp.h"
14#include "re2/walker-inl.h"
15
16namespace re2 {
17
18// List of pointers to Inst* that need to be filled in (patched).
19// Because the Inst* haven't been filled in yet,
20// we can use the Inst* word to hold the list's "next" pointer.
21// It's kind of sleazy, but it works well in practice.
22// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
23//
24// Because the out and out1 fields in Inst are no longer pointers,
25// we can't use pointers directly here either. Instead, p refers
26// to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1).
27// p == 0 represents the NULL list. This is okay because instruction #0
28// is always the fail instruction, which never appears on a list.
29
30struct PatchList {
31 uint32 p;
32
33 // Returns patch list containing just p.
34 static PatchList Mk(uint32 p);
35
36 // Patches all the entries on l to have value v.
37 // Caller must not ever use patch list again.
38 static void Patch(Prog::Inst *inst0, PatchList l, uint32 v);
39
40 // Deref returns the next pointer pointed at by p.
41 static PatchList Deref(Prog::Inst *inst0, PatchList l);
42
43 // Appends two patch lists and returns result.
44 static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2);
45};
46
Alexander Gutkin0d4c5232013-02-28 13:47:27 +000047static PatchList nullPatchList;
Ian Hodson2ee91b42012-05-14 12:29:36 +010048
49// Returns patch list containing just p.
50PatchList PatchList::Mk(uint32 p) {
51 PatchList l;
52 l.p = p;
53 return l;
54}
55
56// Returns the next pointer pointed at by l.
57PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {
58 Prog::Inst* ip = &inst0[l.p>>1];
59 if (l.p&1)
60 l.p = ip->out1();
61 else
62 l.p = ip->out();
63 return l;
64}
65
66// Patches all the entries on l to have value v.
67void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) {
68 while (l.p != 0) {
69 Prog::Inst* ip = &inst0[l.p>>1];
70 if (l.p&1) {
71 l.p = ip->out1();
72 ip->out1_ = val;
73 } else {
74 l.p = ip->out();
75 ip->set_out(val);
76 }
77 }
78}
79
80// Appends two patch lists and returns result.
81PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
82 if (l1.p == 0)
83 return l2;
84 if (l2.p == 0)
85 return l1;
86
87 PatchList l = l1;
88 for (;;) {
89 PatchList next = PatchList::Deref(inst0, l);
90 if (next.p == 0)
91 break;
92 l = next;
93 }
94
95 Prog::Inst* ip = &inst0[l.p>>1];
96 if (l.p&1)
97 ip->out1_ = l2.p;
98 else
99 ip->set_out(l2.p);
100
101 return l1;
102}
103
104// Compiled program fragment.
105struct Frag {
106 uint32 begin;
107 PatchList end;
108
Alexander Gutkin0d4c5232013-02-28 13:47:27 +0000109 explicit Frag(LinkerInitialized) {}
Ian Hodson2ee91b42012-05-14 12:29:36 +0100110 Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector
111 Frag(uint32 begin, PatchList end) : begin(begin), end(end) {}
112};
113
Alexander Gutkin0d4c5232013-02-28 13:47:27 +0000114static Frag kNullFrag(LINKER_INITIALIZED);
Ian Hodson2ee91b42012-05-14 12:29:36 +0100115
116// Input encodings.
117enum Encoding {
118 kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
119 kEncodingLatin1, // Latin1 (0-FF)
120};
121
122class Compiler : public Regexp::Walker<Frag> {
123 public:
124 explicit Compiler();
125 ~Compiler();
126
127 // Compiles Regexp to a new Prog.
128 // Caller is responsible for deleting Prog when finished with it.
129 // If reversed is true, compiles for walking over the input
130 // string backward (reverses all concatenations).
131 static Prog *Compile(Regexp* re, bool reversed, int64 max_mem);
132
133 // Compiles alternation of all the re to a new Prog.
134 // Each re has a match with an id equal to its index in the vector.
135 static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
136 Regexp* re);
137
138 // Interface for Regexp::Walker, which helps traverse the Regexp.
139 // The walk is purely post-recursive: given the machines for the
140 // children, PostVisit combines them to create the machine for
141 // the current node. The child_args are Frags.
142 // The Compiler traverses the Regexp parse tree, visiting
143 // each node in depth-first order. It invokes PreVisit before
144 // visiting the node's children and PostVisit after visiting
145 // the children.
146 Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
147 Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
148 int nchild_args);
149 Frag ShortVisit(Regexp* re, Frag parent_arg);
150 Frag Copy(Frag arg);
151
152 // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
153 Frag Plus(Frag a, bool nongreedy);
154 Frag Star(Frag a, bool nongreedy);
155 Frag Quest(Frag a, bool nongreedy);
156
157 // Given fragment a, returns (a) capturing as \n.
158 Frag Capture(Frag a, int n);
159
160 // Given fragments a and b, returns ab; a|b
161 Frag Cat(Frag a, Frag b);
162 Frag Alt(Frag a, Frag b);
163
164 // Returns a fragment that can't match anything.
165 Frag NoMatch();
166
167 // Returns a fragment that matches the empty string.
168 Frag Match(int32 id);
169
170 // Returns a no-op fragment.
171 Frag Nop();
172
173 // Returns a fragment matching the byte range lo-hi.
174 Frag ByteRange(int lo, int hi, bool foldcase);
175
176 // Returns a fragment matching an empty-width special op.
177 Frag EmptyWidth(EmptyOp op);
178
179 // Adds n instructions to the program.
180 // Returns the index of the first one.
181 // Returns -1 if no more instructions are available.
182 int AllocInst(int n);
183
184 // Deletes unused instructions.
185 void Trim();
186
187 // Rune range compiler.
188
189 // Begins a new alternation.
190 void BeginRange();
191
192 // Adds a fragment matching the rune range lo-hi.
193 void AddRuneRange(Rune lo, Rune hi, bool foldcase);
194 void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
195 void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
196 void Add_80_10ffff();
197
198 // New suffix that matches the byte range lo-hi, then goes to next.
199 int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
200 int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
201
202 // Adds a suffix to alternation.
203 void AddSuffix(int id);
204
205 // Returns the alternation of all the added suffixes.
206 Frag EndRange();
207
208 // Single rune.
209 Frag Literal(Rune r, bool foldcase);
210
211 void Setup(Regexp::ParseFlags, int64, RE2::Anchor);
212 Prog* Finish();
213
214 // Returns .* where dot = any byte
215 Frag DotStar();
216
217 private:
218 Prog* prog_; // Program being built.
219 bool failed_; // Did we give up compiling?
220 Encoding encoding_; // Input encoding
221 bool reversed_; // Should program run backward over text?
222
223 int max_inst_; // Maximum number of instructions.
224
225 Prog::Inst* inst_; // Pointer to first instruction.
226 int inst_len_; // Number of instructions used.
227 int inst_cap_; // Number of instructions allocated.
228
229 int64 max_mem_; // Total memory budget.
230
231 map<uint64, int> rune_cache_;
232 Frag rune_range_;
233
234 RE2::Anchor anchor_; // anchor mode for RE2::Set
235
236 DISALLOW_EVIL_CONSTRUCTORS(Compiler);
237};
238
239Compiler::Compiler() {
240 prog_ = new Prog();
241 failed_ = false;
242 encoding_ = kEncodingUTF8;
243 reversed_ = false;
244 inst_ = NULL;
245 inst_len_ = 0;
246 inst_cap_ = 0;
247 max_inst_ = 1; // make AllocInst for fail instruction okay
248 max_mem_ = 0;
249 int fail = AllocInst(1);
250 inst_[fail].InitFail();
251 max_inst_ = 0; // Caller must change
252}
253
254Compiler::~Compiler() {
255 delete prog_;
256 delete[] inst_;
257}
258
259int Compiler::AllocInst(int n) {
260 if (failed_ || inst_len_ + n > max_inst_) {
261 failed_ = true;
262 return -1;
263 }
264
265 if (inst_len_ + n > inst_cap_) {
266 if (inst_cap_ == 0)
267 inst_cap_ = 8;
268 while (inst_len_ + n > inst_cap_)
269 inst_cap_ *= 2;
270 Prog::Inst* ip = new Prog::Inst[inst_cap_];
271 memmove(ip, inst_, inst_len_ * sizeof ip[0]);
272 memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]);
273 delete[] inst_;
274 inst_ = ip;
275 }
276 int id = inst_len_;
277 inst_len_ += n;
278 return id;
279}
280
281void Compiler::Trim() {
282 if (inst_len_ < inst_cap_) {
283 Prog::Inst* ip = new Prog::Inst[inst_len_];
284 memmove(ip, inst_, inst_len_ * sizeof ip[0]);
285 delete[] inst_;
286 inst_ = ip;
287 inst_cap_ = inst_len_;
288 }
289}
290
291// These routines are somewhat hard to visualize in text --
292// see http://swtch.com/~rsc/regexp/regexp1.html for
293// pictures explaining what is going on here.
294
295// Returns an unmatchable fragment.
296Frag Compiler::NoMatch() {
297 return Frag(0, nullPatchList);
298}
299
300// Is a an unmatchable fragment?
301static bool IsNoMatch(Frag a) {
302 return a.begin == 0;
303}
304
305// Given fragments a and b, returns fragment for ab.
306Frag Compiler::Cat(Frag a, Frag b) {
307 if (IsNoMatch(a) || IsNoMatch(b))
308 return NoMatch();
309
310 // Elide no-op.
311 Prog::Inst* begin = &inst_[a.begin];
312 if (begin->opcode() == kInstNop &&
313 a.end.p == (a.begin << 1) &&
314 begin->out() == 0) {
315 PatchList::Patch(inst_, a.end, b.begin); // in case refs to a somewhere
316 return b;
317 }
318
319 // To run backward over string, reverse all concatenations.
320 if (reversed_) {
321 PatchList::Patch(inst_, b.end, a.begin);
322 return Frag(b.begin, a.end);
323 }
324
325 PatchList::Patch(inst_, a.end, b.begin);
326 return Frag(a.begin, b.end);
327}
328
329// Given fragments for a and b, returns fragment for a|b.
330Frag Compiler::Alt(Frag a, Frag b) {
331 // Special case for convenience in loops.
332 if (IsNoMatch(a))
333 return b;
334 if (IsNoMatch(b))
335 return a;
336
337 int id = AllocInst(1);
338 if (id < 0)
339 return NoMatch();
340
341 inst_[id].InitAlt(a.begin, b.begin);
342 return Frag(id, PatchList::Append(inst_, a.end, b.end));
343}
344
345// When capturing submatches in like-Perl mode, a kOpAlt Inst
346// treats out_ as the first choice, out1_ as the second.
347//
348// For *, +, and ?, if out_ causes another repetition,
349// then the operator is greedy. If out1_ is the repetition
350// (and out_ moves forward), then the operator is non-greedy.
351
352// Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
353Frag Compiler::Star(Frag a, bool nongreedy) {
354 int id = AllocInst(1);
355 if (id < 0)
356 return NoMatch();
357 inst_[id].InitAlt(0, 0);
358 PatchList::Patch(inst_, a.end, id);
359 if (nongreedy) {
360 inst_[id].out1_ = a.begin;
361 return Frag(id, PatchList::Mk(id << 1));
362 } else {
363 inst_[id].set_out(a.begin);
364 return Frag(id, PatchList::Mk((id << 1) | 1));
365 }
366}
367
368// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
369Frag Compiler::Plus(Frag a, bool nongreedy) {
370 // a+ is just a* with a different entry point.
371 Frag f = Star(a, nongreedy);
372 return Frag(a.begin, f.end);
373}
374
375// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
376Frag Compiler::Quest(Frag a, bool nongreedy) {
377 int id = AllocInst(1);
378 if (id < 0)
379 return NoMatch();
380 PatchList pl;
381 if (nongreedy) {
382 inst_[id].InitAlt(0, a.begin);
383 pl = PatchList::Mk(id << 1);
384 } else {
385 inst_[id].InitAlt(a.begin, 0);
386 pl = PatchList::Mk((id << 1) | 1);
387 }
388 return Frag(id, PatchList::Append(inst_, pl, a.end));
389}
390
391// Returns a fragment for the byte range lo-hi.
392Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
393 int id = AllocInst(1);
394 if (id < 0)
395 return NoMatch();
396 inst_[id].InitByteRange(lo, hi, foldcase, 0);
397 prog_->byte_inst_count_++;
398 prog_->MarkByteRange(lo, hi);
399 if (foldcase && lo <= 'z' && hi >= 'a') {
400 if (lo < 'a')
401 lo = 'a';
402 if (hi > 'z')
403 hi = 'z';
404 if (lo <= hi)
405 prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a');
406 }
407 return Frag(id, PatchList::Mk(id << 1));
408}
409
410// Returns a no-op fragment. Sometimes unavoidable.
411Frag Compiler::Nop() {
412 int id = AllocInst(1);
413 if (id < 0)
414 return NoMatch();
415 inst_[id].InitNop(0);
416 return Frag(id, PatchList::Mk(id << 1));
417}
418
419// Returns a fragment that signals a match.
420Frag Compiler::Match(int32 match_id) {
421 int id = AllocInst(1);
422 if (id < 0)
423 return NoMatch();
424 inst_[id].InitMatch(match_id);
425 return Frag(id, nullPatchList);
426}
427
428// Returns a fragment matching a particular empty-width op (like ^ or $)
429Frag Compiler::EmptyWidth(EmptyOp empty) {
430 int id = AllocInst(1);
431 if (id < 0)
432 return NoMatch();
433 inst_[id].InitEmptyWidth(empty, 0);
434 if (empty & (kEmptyBeginLine|kEmptyEndLine))
435 prog_->MarkByteRange('\n', '\n');
436 if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
437 int j;
438 for (int i = 0; i < 256; i = j) {
439 for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++)
440 ;
441 prog_->MarkByteRange(i, j-1);
442 }
443 }
444 return Frag(id, PatchList::Mk(id << 1));
445}
446
447// Given a fragment a, returns a fragment with capturing parens around a.
448Frag Compiler::Capture(Frag a, int n) {
449 int id = AllocInst(2);
450 if (id < 0)
451 return NoMatch();
452 inst_[id].InitCapture(2*n, a.begin);
453 inst_[id+1].InitCapture(2*n+1, 0);
454 PatchList::Patch(inst_, a.end, id+1);
455
456 return Frag(id, PatchList::Mk((id+1) << 1));
457}
458
459// A Rune is a name for a Unicode code point.
460// Returns maximum rune encoded by UTF-8 sequence of length len.
461static int MaxRune(int len) {
Alexander Gutkin0d4c5232013-02-28 13:47:27 +0000462 int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
Ian Hodson2ee91b42012-05-14 12:29:36 +0100463 if (len == 1)
464 b = 7;
465 else
466 b = 8-(len+1) + 6*(len-1);
467 return (1<<b) - 1; // maximum Rune for b bits.
468}
469
470// The rune range compiler caches common suffix fragments,
471// which are very common in UTF-8 (e.g., [80-bf]).
472// The fragment suffixes are identified by their start
473// instructions. NULL denotes the eventual end match.
474// The Frag accumulates in rune_range_. Caching common
475// suffixes reduces the UTF-8 "." from 32 to 24 instructions,
476// and it reduces the corresponding one-pass NFA from 16 nodes to 8.
477
478void Compiler::BeginRange() {
479 rune_cache_.clear();
480 rune_range_.begin = 0;
481 rune_range_.end = nullPatchList;
482}
483
484int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
485 int next) {
486 Frag f = ByteRange(lo, hi, foldcase);
487 if (next != 0) {
488 PatchList::Patch(inst_, f.end, next);
489 } else {
490 rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end);
491 }
492 return f.begin;
493}
494
495int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
496 // In Latin1 mode, there's no point in caching.
497 // In forward UTF-8 mode, only need to cache continuation bytes.
498 if (encoding_ == kEncodingLatin1 ||
499 (encoding_ == kEncodingUTF8 &&
500 !reversed_ &&
501 !(0x80 <= lo && hi <= 0xbf))) {
502 return UncachedRuneByteSuffix(lo, hi, foldcase, next);
503 }
504
505 uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase;
506 map<uint64, int>::iterator it = rune_cache_.find(key);
507 if (it != rune_cache_.end())
508 return it->second;
509 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
510 rune_cache_[key] = id;
511 return id;
512}
513
514void Compiler::AddSuffix(int id) {
515 if (rune_range_.begin == 0) {
516 rune_range_.begin = id;
517 return;
518 }
519
520 int alt = AllocInst(1);
521 if (alt < 0) {
522 rune_range_.begin = 0;
523 return;
524 }
525 inst_[alt].InitAlt(rune_range_.begin, id);
526 rune_range_.begin = alt;
527}
528
529Frag Compiler::EndRange() {
530 return rune_range_;
531}
532
533// Converts rune range lo-hi into a fragment that recognizes
534// the bytes that would make up those runes in the current
535// encoding (Latin 1 or UTF-8).
536// This lets the machine work byte-by-byte even when
537// using multibyte encodings.
538
539void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
540 switch (encoding_) {
541 default:
542 case kEncodingUTF8:
543 AddRuneRangeUTF8(lo, hi, foldcase);
544 break;
545 case kEncodingLatin1:
546 AddRuneRangeLatin1(lo, hi, foldcase);
547 break;
548 }
549}
550
551void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
552 // Latin1 is easy: runes *are* bytes.
553 if (lo > hi || lo > 0xFF)
554 return;
555 if (hi > 0xFF)
556 hi = 0xFF;
557 AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
558}
559
560// Table describing how to make a UTF-8 matching machine
561// for the rune range 80-10FFFF (Runeself-Runemax).
562// This range happens frequently enough (for example /./ and /[^a-z]/)
563// and the rune_cache_ map is slow enough that this is worth
564// special handling. Makes compilation of a small expression
565// with a dot in it about 10% faster.
566// The * in the comments below mark whole sequences.
567static struct ByteRangeProg {
568 int next;
569 int lo;
570 int hi;
571} prog_80_10ffff[] = {
572 // Two-byte
573 { -1, 0x80, 0xBF, }, // 0: 80-BF
574 { 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF*
575
576 // Three-byte
577 { 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF
578 { 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF*
579 { 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF
580 { 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF*
581
582 // Four-byte
583 { 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF
584 { 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF*
585 { 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF
586 { 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF*
587 { 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF
588 { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF*
589};
590
591void Compiler::Add_80_10ffff() {
Alexander Gutkin0d4c5232013-02-28 13:47:27 +0000592 int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning
Ian Hodson2ee91b42012-05-14 12:29:36 +0100593 for (int i = 0; i < arraysize(prog_80_10ffff); i++) {
594 const ByteRangeProg& p = prog_80_10ffff[i];
595 int next = 0;
596 if (p.next >= 0)
597 next = inst[p.next];
598 inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next);
599 if ((p.lo & 0xC0) != 0x80)
600 AddSuffix(inst[i]);
601 }
602}
603
604void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
605 if (lo > hi)
606 return;
607
608 // Pick off 80-10FFFF as a common special case
609 // that can bypass the slow rune_cache_.
610 if (lo == 0x80 && hi == 0x10ffff && !reversed_) {
611 Add_80_10ffff();
612 return;
613 }
614
615 // Split range into same-length sized ranges.
616 for (int i = 1; i < UTFmax; i++) {
617 Rune max = MaxRune(i);
618 if (lo <= max && max < hi) {
619 AddRuneRangeUTF8(lo, max, foldcase);
620 AddRuneRangeUTF8(max+1, hi, foldcase);
621 return;
622 }
623 }
624
625 // ASCII range is always a special case.
626 if (hi < Runeself) {
627 AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
628 return;
629 }
630
631 // Split range into sections that agree on leading bytes.
632 for (int i = 1; i < UTFmax; i++) {
633 uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence
634 if ((lo & ~m) != (hi & ~m)) {
635 if ((lo & m) != 0) {
636 AddRuneRangeUTF8(lo, lo|m, foldcase);
637 AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
638 return;
639 }
640 if ((hi & m) != m) {
641 AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
642 AddRuneRangeUTF8(hi&~m, hi, foldcase);
643 return;
644 }
645 }
646 }
647
648 // Finally. Generate byte matching equivalent for lo-hi.
649 uint8 ulo[UTFmax], uhi[UTFmax];
650 int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
651 int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
652 (void)m; // USED(m)
653 DCHECK_EQ(n, m);
654
655 int id = 0;
656 if (reversed_) {
657 for (int i = 0; i < n; i++)
658 id = RuneByteSuffix(ulo[i], uhi[i], false, id);
659 } else {
660 for (int i = n-1; i >= 0; i--)
661 id = RuneByteSuffix(ulo[i], uhi[i], false, id);
662 }
663 AddSuffix(id);
664}
665
666// Should not be called.
667Frag Compiler::Copy(Frag arg) {
668 // We're using WalkExponential; there should be no copying.
669 LOG(DFATAL) << "Compiler::Copy called!";
670 failed_ = true;
671 return NoMatch();
672}
673
674// Visits a node quickly; called once WalkExponential has
675// decided to cut this walk short.
676Frag Compiler::ShortVisit(Regexp* re, Frag) {
677 failed_ = true;
678 return NoMatch();
679}
680
681// Called before traversing a node's children during the walk.
682Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
683 // Cut off walk if we've already failed.
684 if (failed_)
685 *stop = true;
686
687 return kNullFrag; // not used by caller
688}
689
690Frag Compiler::Literal(Rune r, bool foldcase) {
691 switch (encoding_) {
692 default:
693 return kNullFrag;
694
695 case kEncodingLatin1:
696 return ByteRange(r, r, foldcase);
697
698 case kEncodingUTF8: {
699 if (r < Runeself) // Make common case fast.
700 return ByteRange(r, r, foldcase);
701 uint8 buf[UTFmax];
702 int n = runetochar(reinterpret_cast<char*>(buf), &r);
703 Frag f = ByteRange((uint8)buf[0], buf[0], false);
704 for (int i = 1; i < n; i++)
705 f = Cat(f, ByteRange((uint8)buf[i], buf[i], false));
706 return f;
707 }
708 }
709}
710
711// Called after traversing the node's children during the walk.
712// Given their frags, build and return the frag for this re.
713Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
714 int nchild_frags) {
715 // If a child failed, don't bother going forward, especially
716 // since the child_frags might contain Frags with NULLs in them.
717 if (failed_)
718 return NoMatch();
719
720 // Given the child fragments, return the fragment for this node.
721 switch (re->op()) {
722 case kRegexpRepeat:
723 // Should not see; code at bottom of function will print error
724 break;
725
726 case kRegexpNoMatch:
727 return NoMatch();
728
729 case kRegexpEmptyMatch:
730 return Nop();
731
732 case kRegexpHaveMatch: {
733 Frag f = Match(re->match_id());
734 // Remember unanchored match to end of string.
735 if (anchor_ != RE2::ANCHOR_BOTH)
Alexander Gutkin0d4c5232013-02-28 13:47:27 +0000736 f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f));
Ian Hodson2ee91b42012-05-14 12:29:36 +0100737 return f;
738 }
739
740 case kRegexpConcat: {
741 Frag f = child_frags[0];
742 for (int i = 1; i < nchild_frags; i++)
743 f = Cat(f, child_frags[i]);
744 return f;
745 }
746
747 case kRegexpAlternate: {
748 Frag f = child_frags[0];
749 for (int i = 1; i < nchild_frags; i++)
750 f = Alt(f, child_frags[i]);
751 return f;
752 }
753
754 case kRegexpStar:
755 return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
756
757 case kRegexpPlus:
758 return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
759
760 case kRegexpQuest:
761 return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
762
763 case kRegexpLiteral:
764 return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase);
765
766 case kRegexpLiteralString: {
767 // Concatenation of literals.
768 if (re->nrunes() == 0)
769 return Nop();
770 Frag f;
771 for (int i = 0; i < re->nrunes(); i++) {
772 Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase);
773 if (i == 0)
774 f = f1;
775 else
776 f = Cat(f, f1);
777 }
778 return f;
779 }
780
781 case kRegexpAnyChar:
782 BeginRange();
783 AddRuneRange(0, Runemax, false);
784 return EndRange();
785
786 case kRegexpAnyByte:
787 return ByteRange(0x00, 0xFF, false);
788
789 case kRegexpCharClass: {
790 CharClass* cc = re->cc();
791 if (cc->empty()) {
792 // This can't happen.
793 LOG(DFATAL) << "No ranges in char class";
794 failed_ = true;
795 return NoMatch();
796 }
797
798 // ASCII case-folding optimization: if the char class
799 // behaves the same on A-Z as it does on a-z,
800 // discard any ranges wholly contained in A-Z
801 // and mark the other ranges as foldascii.
802 // This reduces the size of a program for
803 // (?i)abc from 3 insts per letter to 1 per letter.
804 bool foldascii = cc->FoldsASCII();
805
806 // Character class is just a big OR of the different
807 // character ranges in the class.
808 BeginRange();
809 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
810 // ASCII case-folding optimization (see above).
811 if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
812 continue;
813
814 // If this range contains all of A-Za-z or none of it,
815 // the fold flag is unnecessary; don't bother.
816 bool fold = foldascii;
817 if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo)
818 fold = false;
819
820 AddRuneRange(i->lo, i->hi, fold);
821 }
822 return EndRange();
823 }
824
825 case kRegexpCapture:
826 // If this is a non-capturing parenthesis -- (?:foo) --
827 // just use the inner expression.
828 if (re->cap() < 0)
829 return child_frags[0];
830 return Capture(child_frags[0], re->cap());
831
832 case kRegexpBeginLine:
833 return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
834
835 case kRegexpEndLine:
836 return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
837
838 case kRegexpBeginText:
839 return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
840
841 case kRegexpEndText:
842 return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
843
844 case kRegexpWordBoundary:
845 return EmptyWidth(kEmptyWordBoundary);
846
847 case kRegexpNoWordBoundary:
848 return EmptyWidth(kEmptyNonWordBoundary);
849 }
850 LOG(DFATAL) << "Missing case in Compiler: " << re->op();
851 failed_ = true;
852 return NoMatch();
853}
854
855// Is this regexp required to start at the beginning of the text?
856// Only approximate; can return false for complicated regexps like (\Aa|\Ab),
857// but handles (\A(a|b)). Could use the Walker to write a more exact one.
858static bool IsAnchorStart(Regexp** pre, int depth) {
859 Regexp* re = *pre;
860 Regexp* sub;
861 // The depth limit makes sure that we don't overflow
862 // the stack on a deeply nested regexp. As the comment
863 // above says, IsAnchorStart is conservative, so returning
864 // a false negative is okay. The exact limit is somewhat arbitrary.
865 if (re == NULL || depth >= 4)
866 return false;
867 switch (re->op()) {
868 default:
869 break;
870 case kRegexpConcat:
871 if (re->nsub() > 0) {
872 sub = re->sub()[0]->Incref();
873 if (IsAnchorStart(&sub, depth+1)) {
874 Regexp** subcopy = new Regexp*[re->nsub()];
875 subcopy[0] = sub; // already have reference
876 for (int i = 1; i < re->nsub(); i++)
877 subcopy[i] = re->sub()[i]->Incref();
878 *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());
879 delete[] subcopy;
880 re->Decref();
881 return true;
882 }
883 sub->Decref();
884 }
885 break;
886 case kRegexpCapture:
887 sub = re->sub()[0]->Incref();
888 if (IsAnchorStart(&sub, depth+1)) {
889 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
890 re->Decref();
891 return true;
892 }
893 sub->Decref();
894 break;
895 case kRegexpBeginText:
896 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
897 re->Decref();
898 return true;
899 }
900 return false;
901}
902
903// Is this regexp required to start at the end of the text?
904// Only approximate; can return false for complicated regexps like (a\z|b\z),
905// but handles ((a|b)\z). Could use the Walker to write a more exact one.
906static bool IsAnchorEnd(Regexp** pre, int depth) {
907 Regexp* re = *pre;
908 Regexp* sub;
909 // The depth limit makes sure that we don't overflow
910 // the stack on a deeply nested regexp. As the comment
911 // above says, IsAnchorEnd is conservative, so returning
912 // a false negative is okay. The exact limit is somewhat arbitrary.
913 if (re == NULL || depth >= 4)
914 return false;
915 switch (re->op()) {
916 default:
917 break;
918 case kRegexpConcat:
919 if (re->nsub() > 0) {
920 sub = re->sub()[re->nsub() - 1]->Incref();
921 if (IsAnchorEnd(&sub, depth+1)) {
922 Regexp** subcopy = new Regexp*[re->nsub()];
923 subcopy[re->nsub() - 1] = sub; // already have reference
924 for (int i = 0; i < re->nsub() - 1; i++)
925 subcopy[i] = re->sub()[i]->Incref();
926 *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());
927 delete[] subcopy;
928 re->Decref();
929 return true;
930 }
931 sub->Decref();
932 }
933 break;
934 case kRegexpCapture:
935 sub = re->sub()[0]->Incref();
936 if (IsAnchorEnd(&sub, depth+1)) {
937 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
938 re->Decref();
939 return true;
940 }
941 sub->Decref();
942 break;
943 case kRegexpEndText:
944 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
945 re->Decref();
946 return true;
947 }
948 return false;
949}
950
951void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
952 RE2::Anchor anchor) {
953 prog_->set_flags(flags);
954
955 if (flags & Regexp::Latin1)
956 encoding_ = kEncodingLatin1;
957 max_mem_ = max_mem;
958 if (max_mem <= 0) {
959 max_inst_ = 100000; // more than enough
960 } else if (max_mem <= sizeof(Prog)) {
961 // No room for anything.
962 max_inst_ = 0;
963 } else {
964 int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
965 // Limit instruction count so that inst->id() fits nicely in an int.
966 // SparseArray also assumes that the indices (inst->id()) are ints.
967 // The call to WalkExponential uses 2*max_inst_ below,
968 // and other places in the code use 2 or 3 * prog->size().
969 // Limiting to 2^24 should avoid overflow in those places.
970 // (The point of allowing more than 32 bits of memory is to
971 // have plenty of room for the DFA states, not to use it up
972 // on the program.)
973 if (m >= 1<<24)
974 m = 1<<24;
975
976 // Inst imposes its own limit (currently bigger than 2^24 but be safe).
977 if (m > Prog::Inst::kMaxInst)
978 m = Prog::Inst::kMaxInst;
979
980 max_inst_ = m;
981 }
982
983 anchor_ = anchor;
984}
985
986// Compiles re, returning program.
987// Caller is responsible for deleting prog_.
988// If reversed is true, compiles a program that expects
989// to run over the input string backward (reverses all concatenations).
990// The reversed flag is also recorded in the returned program.
991Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
992 Compiler c;
993
994 c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */);
995 c.reversed_ = reversed;
996
997 // Simplify to remove things like counted repetitions
998 // and character classes like \d.
999 Regexp* sre = re->Simplify();
1000 if (sre == NULL)
1001 return NULL;
1002
1003 // Record whether prog is anchored, removing the anchors.
1004 // (They get in the way of other optimizations.)
1005 bool is_anchor_start = IsAnchorStart(&sre, 0);
1006 bool is_anchor_end = IsAnchorEnd(&sre, 0);
1007
1008 // Generate fragment for entire regexp.
1009 Frag f = c.WalkExponential(sre, kNullFrag, 2*c.max_inst_);
1010 sre->Decref();
1011 if (c.failed_)
1012 return NULL;
1013
1014 // Success! Finish by putting Match node at end, and record start.
1015 // Turn off c.reversed_ (if it is set) to force the remaining concatenations
1016 // to behave normally.
1017 c.reversed_ = false;
1018 Frag all = c.Cat(f, c.Match(0));
1019 c.prog_->set_start(all.begin);
1020
1021 if (reversed) {
1022 c.prog_->set_anchor_start(is_anchor_end);
1023 c.prog_->set_anchor_end(is_anchor_start);
1024 } else {
1025 c.prog_->set_anchor_start(is_anchor_start);
1026 c.prog_->set_anchor_end(is_anchor_end);
1027 }
1028
1029 // Also create unanchored version, which starts with a .*? loop.
1030 if (c.prog_->anchor_start()) {
1031 c.prog_->set_start_unanchored(c.prog_->start());
1032 } else {
1033 Frag unanchored = c.Cat(c.DotStar(), all);
1034 c.prog_->set_start_unanchored(unanchored.begin);
1035 }
1036
1037 c.prog_->set_reversed(reversed);
1038
1039 // Hand ownership of prog_ to caller.
1040 return c.Finish();
1041}
1042
1043Prog* Compiler::Finish() {
1044 if (failed_)
1045 return NULL;
1046
1047 if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
1048 // No possible matches; keep Fail instruction only.
1049 inst_len_ = 1;
1050 }
1051
1052 // Trim instruction to minimum array and transfer to Prog.
1053 Trim();
1054 prog_->inst_ = inst_;
1055 prog_->size_ = inst_len_;
1056 inst_ = NULL;
1057
1058 // Compute byte map.
1059 prog_->ComputeByteMap();
1060
1061 prog_->Optimize();
1062
1063 // Record remaining memory for DFA.
1064 if (max_mem_ <= 0) {
1065 prog_->set_dfa_mem(1<<20);
1066 } else {
1067 int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst);
1068 if (m < 0)
1069 m = 0;
1070 prog_->set_dfa_mem(m);
1071 }
1072
1073 Prog* p = prog_;
1074 prog_ = NULL;
1075 return p;
1076}
1077
1078// Converts Regexp to Prog.
1079Prog* Regexp::CompileToProg(int64 max_mem) {
1080 return Compiler::Compile(this, false, max_mem);
1081}
1082
1083Prog* Regexp::CompileToReverseProg(int64 max_mem) {
1084 return Compiler::Compile(this, true, max_mem);
1085}
1086
1087Frag Compiler::DotStar() {
1088 return Star(ByteRange(0x00, 0xff, false), true);
1089}
1090
1091// Compiles RE set to Prog.
1092Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
1093 Regexp* re) {
1094 Compiler c;
1095
1096 Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags());
1097 c.Setup(pf, options.max_mem(), anchor);
1098
1099 // Compile alternation of fragments.
1100 Frag all = c.WalkExponential(re, kNullFrag, 2*c.max_inst_);
1101 re->Decref();
1102 if (c.failed_)
1103 return NULL;
1104
1105 if (anchor == RE2::UNANCHORED) {
1106 // The trailing .* was added while handling kRegexpHaveMatch.
1107 // We just have to add the leading one.
1108 all = c.Cat(c.DotStar(), all);
1109 }
1110
1111 c.prog_->set_start(all.begin);
1112 c.prog_->set_start_unanchored(all.begin);
1113 c.prog_->set_anchor_start(true);
1114 c.prog_->set_anchor_end(true);
1115
1116 Prog* prog = c.Finish();
1117 if (prog == NULL)
1118 return NULL;
1119
1120 // Make sure DFA has enough memory to operate,
1121 // since we're not going to fall back to the NFA.
1122 bool failed;
1123 StringPiece sp = "hello, world";
1124 prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
1125 NULL, &failed, NULL);
1126 if (failed) {
1127 delete prog;
1128 return NULL;
1129 }
1130
1131 return prog;
1132}
1133
1134Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
1135 Regexp* re) {
1136 return Compiler::CompileSet(options, anchor, re);
1137}
1138
1139} // namespace re2