blob: 57aec04a15cfd2fc93dd75468841076a7042fc97 [file] [log] [blame]
This is a dump from Google's source control system of the change
that removed UCS-2 support from RE2. As the explanation below
says, UCS-2 mode is fundamentally at odds with things like ^ and $,
so it never really worked very well. But if you are interested in using
it without those operators, it did work for that. It assumed that the
UCS-2 data was in the native host byte order.

If you are interested in adding UCS-2 mode back, this patch might
be a good starting point.
10
11
Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15

 Retire UCS-2 mode.

 I added it as an experiment for V8, but it
 requires 2-byte lookahead to do completely,
 and RE2 has 1-byte lookahead (enough for UTF-8)
 as a fairly deep fundamental assumption,
 so it did not support ^ or $.
21
22==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ====
23re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319
24 cap_[0] = p;
25 if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
26 return true;
27- if (prog_->flags() & Regexp::UCS2)
28- p++;
29 }
30 return false;
31 }
32==== re2/compile.cc#17 - re2/compile.cc#18 ====
33re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100
34 // Input encodings.
35 enum Encoding {
36 kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
37- kEncodingUCS2, // UCS-2 (0-FFFF), native byte order
38 kEncodingLatin1, // Latin1 (0-FF)
39 };
40
41re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172
42 void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
43 void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
44 void Add_80_10ffff();
45- void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase);
46- void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
47- uint8 lo2, uint8 hi2, bool fold2);
48
49 // New suffix that matches the byte range lo-hi, then goes to next.
50 Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next);
51re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477
52
53 // Converts rune range lo-hi into a fragment that recognizes
54 // the bytes that would make up those runes in the current
55- // encoding (Latin 1, UTF-8, or UCS-2).
56+ // encoding (Latin 1 or UTF-8).
57 // This lets the machine work byte-by-byte even when
58 // using multibyte encodings.
59
60re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489
61 case kEncodingLatin1:
62 AddRuneRangeLatin1(lo, hi, foldcase);
63 break;
64- case kEncodingUCS2:
65- AddRuneRangeUCS2(lo, hi, foldcase);
66- break;
67 }
68 }
69
70re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501
71 AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL));
72 }
73
74- // Test whether 16-bit values are big or little endian.
75- static bool BigEndian() {
76- union {
77- char byte[2];
78- int16 endian;
79- } u;
80-
81- u.byte[0] = 1;
82- u.byte[1] = 2;
83- return u.endian == 0x0102;
84- }
85-
86- void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
87- uint8 lo2, uint8 hi2, bool fold2) {
88- Inst* ip;
89- if (reversed_) {
90- ip = RuneByteSuffix(lo1, hi1, fold1, NULL);
91- ip = RuneByteSuffix(lo2, hi2, fold2, ip);
92- } else {
93- ip = RuneByteSuffix(lo2, hi2, fold2, NULL);
94- ip = RuneByteSuffix(lo1, hi1, fold1, ip);
95- }
96- AddSuffix(ip);
97- }
98-
99- void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) {
100- if (lo > hi || lo > 0xFFFF)
101- return;
102- if (hi > 0xFFFF)
103- hi = 0xFFFF;
104-
105- // We'll assemble a pattern assuming big endian.
106- // If the machine isn't, tell Cat to reverse its arguments.
107- bool oldreversed = reversed_;
108- if (!BigEndian()) {
109- reversed_ = !oldreversed;
110- }
111-
112- // Split into bytes.
113- int lo1 = lo >> 8;
114- int lo2 = lo & 0xFF;
115- int hi1 = hi >> 8;
116- int hi2 = hi & 0xFF;
117-
118- if (lo1 == hi1) {
119- // Easy case: high bits are same in both.
120- // Only do ASCII case folding on the second byte if the top byte is 00.
121- AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase);
122- } else {
123- // Harder case: different second byte ranges depending on first byte.
124-
125- // Initial fragment.
126- if (lo2 > 0) {
127- AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase);
128- lo1++;
129- }
130-
131- // Trailing fragment.
132- if (hi2 < 0xFF) {
133- AddUCS2Pair(hi1, hi1, false, 0, hi2, false);
134- hi1--;
135- }
136-
137- // Inner ranges.
138- if (lo1 <= hi1) {
139- AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false);
140- }
141- }
142-
143- // Restore reverse setting.
144- reversed_ = oldreversed;
145- }
146-
147 // Table describing how to make a UTF-8 matching machine
148 // for the rune range 80-10FFFF (Runeself-Runemax).
149 // This range happens frequently enough (for example /./ and /[^a-z]/)
150re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634
151
152 Frag Compiler::Literal(Rune r, bool foldcase) {
153 switch (encoding_) {
154- default: // UCS-2 or something new
155- BeginRange();
156- AddRuneRange(r, r, foldcase);
157- return EndRange();
158+ default:
159+ return kNullFrag;
160
161 case kEncodingLatin1:
162 return ByteRange(r, r, foldcase);
163re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850
164
165 if (re->parse_flags() & Regexp::Latin1)
166 c.encoding_ = kEncodingLatin1;
167- else if (re->parse_flags() & Regexp::UCS2)
168- c.encoding_ = kEncodingUCS2;
169 c.reversed_ = reversed;
170 if (max_mem <= 0) {
171 c.max_inst_ = 100000; // more than enough
172re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905
173 c.prog_->set_start_unanchored(c.prog_->start());
174 } else {
175 Frag dot;
176- if (c.encoding_ == kEncodingUCS2) {
177- dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false));
178- } else {
179- dot = c.ByteRange(0x00, 0xFF, false);
180- }
181+ dot = c.ByteRange(0x00, 0xFF, false);
182 Frag dotloop = c.Star(dot, true);
183 Frag unanchored = c.Cat(dotloop, all);
184 c.prog_->set_start_unanchored(unanchored.begin);
185==== re2/nfa.cc#8 - re2/nfa.cc#9 ====
186re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431
187 const char* bp = context.begin();
188 int c = -1;
189 int wasword = 0;
190- bool ucs2 = prog_->flags() & Regexp::UCS2;
191
192 if (text.begin() > context.begin()) {
193 c = text.begin()[-1] & 0xFF;
194re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497
195 // If there's a required first byte for an unanchored search
196 // and we're not in the middle of any possible matches,
197 // use memchr to search for the byte quickly.
198- if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 &&
199+ if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
200 p < text.end() && (p[0] & 0xFF) != first_byte_) {
201 p = reinterpret_cast<const char*>(memchr(p, first_byte_,
202 text.end() - p));
203re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514
204 flag = Prog::EmptyFlags(context, p);
205 }
206
207- // In UCS-2 mode, if we need to start a new thread,
208- // make sure to do it on an even boundary.
209- if(ucs2 && runq->size() == 0 &&
210- (p - context.begin()) % 2 && p < text.end()) {
211- p++;
212- flag = Prog::EmptyFlags(context, p);
213- }
214-
215 // Steal match storage (cleared but unused as of yet)
216 // temporarily to hold match boundaries for new thread.
217- // In UCS-2 mode, only start the thread on a 2-byte boundary.
218- if(!ucs2 || (p - context.begin()) % 2 == 0) {
219- match_[0] = p;
220- AddToThreadq(runq, start_, flag, p, match_);
221- match_[0] = NULL;
222- }
223+ match_[0] = p;
224+ AddToThreadq(runq, start_, flag, p, match_);
225+ match_[0] = NULL;
226 }
227
228 // If all the threads have died, stop early.
229==== re2/parse.cc#22 - re2/parse.cc#23 ====
230re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165
231 status_(status), stacktop_(NULL), ncap_(0) {
232 if (flags_ & Latin1)
233 rune_max_ = 0xFF;
234- else if (flags & UCS2)
235- rune_max_ = 0xFFFF;
236 else
237 rune_max_ = Runemax;
238 }
239re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374
240 bool Regexp::ParseState::PushCarat() {
241 if (flags_ & OneLine) {
242 return PushSimpleOp(kRegexpBeginText);
243- } else {
244- if (flags_ & UCS2) {
245- status_->set_code(kRegexpUnsupported);
246- status_->set_error_arg("multiline ^ in UCS-2 mode");
247- return false;
248- }
249- return PushSimpleOp(kRegexpBeginLine);
250 }
251+ return PushSimpleOp(kRegexpBeginLine);
252 }
253
254 // Pushes a \b or \B onto the stack.
255 bool Regexp::ParseState::PushWordBoundary(bool word) {
256- if (flags_ & UCS2) {
257- status_->set_code(kRegexpUnsupported);
258- status_->set_error_arg("\\b or \\B in UCS-2 mode");
259- return false;
260- }
261 if (word)
262 return PushSimpleOp(kRegexpWordBoundary);
263 return PushSimpleOp(kRegexpNoWordBoundary);
264re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389
265 bool ret = PushSimpleOp(kRegexpEndText);
266 flags_ = oflags;
267 return ret;
268- }
269- if (flags_ & UCS2) {
270- status_->set_code(kRegexpUnsupported);
271- status_->set_error_arg("multiline $ in UCS-2 mode");
272- return false;
273 }
274 return PushSimpleOp(kRegexpEndLine);
275 }
276==== re2/re2.cc#34 - re2/re2.cc#35 ====
277re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84
278 return RE2::ErrorBadUTF8;
279 case re2::kRegexpBadNamedCapture:
280 return RE2::ErrorBadNamedCapture;
281- case re2::kRegexpUnsupported:
282- return RE2::ErrorUnsupported;
283 }
284 return RE2::ErrorInternal;
285 }
286re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125
287 break;
288 case RE2::Options::EncodingLatin1:
289 flags |= Regexp::Latin1;
290- break;
291- case RE2::Options::EncodingUCS2:
292- flags |= Regexp::UCS2;
293 break;
294 }
295
296==== re2/re2.h#36 - re2/re2.h#37 ====
297re2/re2.h#36:246,252 - re2/re2.h#37:246,251
298 ErrorBadUTF8, // invalid UTF-8 in regexp
299 ErrorBadNamedCapture, // bad named capture group
300 ErrorPatternTooLarge, // pattern too large (compile failed)
301- ErrorUnsupported, // unsupported feature (in UCS-2 mode)
302 };
303
304 // Predefined common options.
305re2/re2.h#36:570,576 - re2/re2.h#37:569,574
306
307 enum Encoding {
308 EncodingUTF8 = 1,
309- EncodingUCS2, // 16-bit Unicode 0-FFFF only
310 EncodingLatin1
311 };
312
313==== re2/regexp.cc#15 - re2/regexp.cc#16 ====
314re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329
315 // the regexp that remains after the prefix. The prefix might
316 // be ASCII case-insensitive.
317 bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
318- // Don't even bother for UCS-2; it's time to throw that code away.
319- if (parse_flags_ & UCS2)
320- return false;
321-
322 // No need for a walker: the regexp must be of the form
323 // 1. some number of ^ anchors
324 // 2. a literal char or string
325==== re2/regexp.h#20 - re2/regexp.h#21 ====
326re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192
327 kRegexpBadPerlOp, // bad perl operator
328 kRegexpBadUTF8, // invalid UTF-8 in regexp
329 kRegexpBadNamedCapture, // bad named capture
330- kRegexpUnsupported, // unsupported operator
331 };
332
333 // Error status for certain operations.
334re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314
335 // \Q and \E to disable/enable metacharacters
336 // (?P<name>expr) for named captures
337 // \C to match any single byte
338- UCS2 = 1<<10, // Text is in UCS-2, regexp is in UTF-8.
339- UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group
340+ UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
341 // and \P{Han} for its negation.
342- NeverNL = 1<<12, // Never match NL, even if the regexp mentions
343+ NeverNL = 1<<11, // Never match NL, even if the regexp mentions
344 // it explicitly.
345
346 // As close to Perl as we can get.
347==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ====
348re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139
349 cap_[0] = p;
350 if (Visit(prog_->start(), p)) // Match must be leftmost; done.
351 return true;
352- if (prog_->flags() & Regexp::UCS2)
353- p++;
354 }
355 return false;
356 }
357==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ====
358re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152
359 static ParseMode parse_modes[] = {
360 { single_line, "single-line" },
361 { single_line|Regexp::Latin1, "single-line, latin1" },
362- { single_line|Regexp::UCS2, "single-line, ucs2" },
363 { multi_line, "multiline" },
364 { multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
365 { multi_line|Regexp::Latin1, "multiline, latin1" },
366- { multi_line|Regexp::UCS2, "multiline, ucs2" },
367 };
368
369 static string FormatMode(Regexp::ParseFlags flags) {
370re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185
371 RegexpStatus status;
372 regexp_ = Regexp::Parse(regexp_str, flags, &status);
373 if (regexp_ == NULL) {
374- if (status.code() != kRegexpUnsupported) {
375- LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
376- << " mode: " << FormatMode(flags);
377- error_ = true;
378- }
379+ LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
380+ << " mode: " << FormatMode(flags);
381+ error_ = true;
382 return;
383 }
384 prog_ = regexp_->CompileToProg(0);
385re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231
386 RE2::Options options;
387 if (flags & Regexp::Latin1)
388 options.set_encoding(RE2::Options::EncodingLatin1);
389- else if (flags & Regexp::UCS2)
390- options.set_encoding(RE2::Options::EncodingUCS2);
391 if (kind_ == Prog::kLongestMatch)
392 options.set_longest_match(true);
393 re2_ = new RE2(re, options);
394re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280
395 delete re2_;
396 }
397
398- // Converts UTF-8 string in text into UCS-2 string in new_text.
399- static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) {
400- const char* p = text.begin();
401- const char* ep = text.end();
402- uint16* q = new uint16[ep - p];
403- uint16* q0 = q;
404-
405- int n;
406- Rune r;
407- for (; p < ep; p += n) {
408- if (!fullrune(p, ep - p)) {
409- delete[] q0;
410- return false;
411- }
412- n = chartorune(&r, p);
413- if (r > 0xFFFF) {
414- delete[] q0;
415- return false;
416- }
417- *q++ = r;
418- }
419- *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0));
420- return true;
421- }
422-
423- // Rewrites *sp from being a pointer into text8 (UTF-8)
424- // to being a pointer into text16 (equivalent text but in UCS-2).
425- static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16,
426- StringPiece *sp) {
427- if (sp->begin() == NULL && text8.begin() != NULL)
428- return;
429-
430- int nrune = 0;
431- int n;
432- Rune r;
433- const char* p = text8.begin();
434- const char* ep = text8.end();
435- const char* spbegin = NULL;
436- const char* spend = NULL;
437- for (;;) {
438- if (p == sp->begin())
439- spbegin = text16.begin() + sizeof(uint16)*nrune;
440- if (p == sp->end())
441- spend = text16.begin() + sizeof(uint16)*nrune;
442- if (p >= ep)
443- break;
444- n = chartorune(&r, p);
445- p += n;
446- nrune++;
447- }
448- if (spbegin == NULL || spend == NULL) {
449- LOG(FATAL) << "Error in AdjustUTF8ToUCS2 "
450- << CEscape(text8) << " "
451- << (int)(sp->begin() - text8.begin()) << " "
452- << (int)(sp->end() - text8.begin());
453- }
454- *sp = StringPiece(spbegin, spend - spbegin);
455- }
456-
457- // Rewrites *sp from begin a pointer into text16 (UCS-2)
458- // to being a pointer into text8 (equivalent text but in UTF-8).
459- static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8,
460- StringPiece* sp) {
461- if (sp->begin() == NULL)
462- return;
463-
464- int nrune = 0;
465- int n;
466- Rune r;
467- const char* p = text8.begin();
468- const char* ep = text8.end();
469- const char* spbegin = NULL;
470- const char* spend = NULL;
471- for (;;) {
472- if (nrune == (sp->begin() - text16.begin())/2)
473- spbegin = p;
474- if (nrune == (sp->end() - text16.begin())/2)
475- spend = p;
476- if (p >= ep)
477- break;
478- n = chartorune(&r, p);
479- p += n;
480- nrune++;
481- }
482- if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) {
483- LOG(FATAL) << "Error in AdjustUCS2ToUTF8 "
484- << CEscape(text16) << " "
485- << (int)(sp->begin() - text16.begin()) << " "
486- << (int)(sp->end() - text16.begin());
487- }
488- *sp = StringPiece(spbegin, spend - spbegin);
489- }
490-
491 // Runs a single search using the named engine type.
492 // This interface hides all the irregularities of the various
493 // engine interfaces from the rest of this file.
494re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300
495
496 StringPiece text = orig_text;
497 StringPiece context = orig_context;
498- bool ucs2 = false;
499
500- if ((flags() & Regexp::UCS2) && type != kEnginePCRE) {
501- if (!ConvertUTF8ToUCS2(orig_context, &context)) {
502- result->skipped = true;
503- return;
504- }
505-
506- // Rewrite context to refer to new text.
507- AdjustUTF8ToUCS2(orig_context, context, &text);
508- ucs2 = true;
509- }
510-
511 switch (type) {
512 default:
513 LOG(FATAL) << "Bad RunSearch type: " << (int)type;
514re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451
515 }
516 }
517
518- // If we did UCS-2 matching, rewrite the matches to refer
519- // to the original UTF-8 text.
520- if (ucs2) {
521- if (result->matched) {
522- if (result->have_submatch0) {
523- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]);
524- } else if (result->have_submatch) {
525- for (int i = 0; i < nsubmatch; i++) {
526- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]);
527- }
528- }
529- }
530- delete[] context.begin();
531- }
532-
533 if (!result->matched)
534 memset(result->submatch, 0, sizeof result->submatch);
535 }
536re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475
537 return true;
538 }
539
540- // Check whether text uses only Unicode points <= 0xFFFF
541- // (in the BMP).
542- static bool IsBMP(const StringPiece& text) {
543- const char* p = text.begin();
544- const char* ep = text.end();
545- while (p < ep) {
546- if (!fullrune(p, ep - p))
547- return false;
548- Rune r;
549- p += chartorune(&r, p);
550- if (r > 0xFFFF)
551- return false;
552- }
553- return true;
554- }
555-
556 // Runs a single test.
557 bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
558 Prog::Anchor anchor) {
559re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483
560 Result correct;
561 RunSearch(kEngineBacktrack, text, context, anchor, &correct);
562 if (correct.skipped) {
563- if (regexp_ == NULL || !IsBMP(context)) // okay to skip in UCS-2 mode
564+ if (regexp_ == NULL)
565 return true;
566 LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
567 << " " << FormatMode(flags_);