blob: cdc583c5d74c0078ac46e27ea919b859972857e4 [file] [log] [blame]
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -07001use std::collections::HashMap;
2use std::iter;
3use std::result;
4use std::sync::Arc;
5
6use syntax::hir::{self, Hir};
7use syntax::is_word_byte;
8use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
9
10use prog::{
11 EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
12 InstSave, InstSplit, Program,
13};
14
15use Error;
16
17type Result = result::Result<Patch, Error>;
Haibo Huang49cbe5f2020-05-28 20:14:24 -070018type ResultOrEmpty = result::Result<Option<Patch>, Error>;
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -070019
20#[derive(Debug)]
21struct Patch {
22 hole: Hole,
23 entry: InstPtr,
24}
25
26/// A compiler translates a regular expression AST to a sequence of
27/// instructions. The sequence of instructions represents an NFA.
28pub struct Compiler {
29 insts: Vec<MaybeInst>,
30 compiled: Program,
31 capture_name_idx: HashMap<String, usize>,
32 num_exprs: usize,
33 size_limit: usize,
34 suffix_cache: SuffixCache,
35 utf8_seqs: Option<Utf8Sequences>,
36 byte_classes: ByteClassSet,
37}
38
39impl Compiler {
40 /// Create a new regular expression compiler.
41 ///
42 /// Various options can be set before calling `compile` on an expression.
43 pub fn new() -> Self {
44 Compiler {
45 insts: vec![],
46 compiled: Program::new(),
47 capture_name_idx: HashMap::new(),
48 num_exprs: 0,
49 size_limit: 10 * (1 << 20),
50 suffix_cache: SuffixCache::new(1000),
51 utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
52 byte_classes: ByteClassSet::new(),
53 }
54 }
55
56 /// The size of the resulting program is limited by size_limit. If
57 /// the program approximately exceeds the given size (in bytes), then
58 /// compilation will stop and return an error.
59 pub fn size_limit(mut self, size_limit: usize) -> Self {
60 self.size_limit = size_limit;
61 self
62 }
63
64 /// If bytes is true, then the program is compiled as a byte based
65 /// automaton, which incorporates UTF-8 decoding into the machine. If it's
66 /// false, then the automaton is Unicode scalar value based, e.g., an
67 /// engine utilizing such an automaton is responsible for UTF-8 decoding.
68 ///
69 /// The specific invariant is that when returning a byte based machine,
70 /// the neither the `Char` nor `Ranges` instructions are produced.
71 /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
72 /// instruction is never produced.
73 ///
74 /// Note that `dfa(true)` implies `bytes(true)`.
75 pub fn bytes(mut self, yes: bool) -> Self {
76 self.compiled.is_bytes = yes;
77 self
78 }
79
80 /// When disabled, the program compiled may match arbitrary bytes.
81 ///
82 /// When enabled (the default), all compiled programs exclusively match
83 /// valid UTF-8 bytes.
84 pub fn only_utf8(mut self, yes: bool) -> Self {
85 self.compiled.only_utf8 = yes;
86 self
87 }
88
89 /// When set, the machine returned is suitable for use in the DFA matching
90 /// engine.
91 ///
92 /// In particular, this ensures that if the regex is not anchored in the
93 /// beginning, then a preceding `.*?` is included in the program. (The NFA
94 /// based engines handle the preceding `.*?` explicitly, which is difficult
95 /// or impossible in the DFA engine.)
96 pub fn dfa(mut self, yes: bool) -> Self {
97 self.compiled.is_dfa = yes;
98 self
99 }
100
101 /// When set, the machine returned is suitable for matching text in
102 /// reverse. In particular, all concatenations are flipped.
103 pub fn reverse(mut self, yes: bool) -> Self {
104 self.compiled.is_reverse = yes;
105 self
106 }
107
108 /// Compile a regular expression given its AST.
109 ///
110 /// The compiler is guaranteed to succeed unless the program exceeds the
111 /// specified size limit. If the size limit is exceeded, then compilation
112 /// stops and returns an error.
113 pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
114 debug_assert!(!exprs.is_empty());
115 self.num_exprs = exprs.len();
116 if exprs.len() == 1 {
117 self.compile_one(&exprs[0])
118 } else {
119 self.compile_many(exprs)
120 }
121 }
122
123 fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
124 // If we're compiling a forward DFA and we aren't anchored, then
125 // add a `.*?` before the first capture group.
126 // Other matching engines handle this by baking the logic into the
127 // matching engine itself.
128 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
129 self.compiled.is_anchored_start = expr.is_anchored_start();
130 self.compiled.is_anchored_end = expr.is_anchored_end();
131 if self.compiled.needs_dotstar() {
132 dotstar_patch = self.c_dotstar()?;
133 self.compiled.start = dotstar_patch.entry;
134 }
135 self.compiled.captures = vec![None];
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700136 let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst());
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700137 if self.compiled.needs_dotstar() {
138 self.fill(dotstar_patch.hole, patch.entry);
139 } else {
140 self.compiled.start = patch.entry;
141 }
142 self.fill_to_next(patch.hole);
143 self.compiled.matches = vec![self.insts.len()];
144 self.push_compiled(Inst::Match(0));
145 self.compile_finish()
146 }
147
148 fn compile_many(
149 mut self,
150 exprs: &[Hir],
151 ) -> result::Result<Program, Error> {
152 debug_assert!(exprs.len() > 1);
153
154 self.compiled.is_anchored_start =
155 exprs.iter().all(|e| e.is_anchored_start());
156 self.compiled.is_anchored_end =
157 exprs.iter().all(|e| e.is_anchored_end());
158 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
159 if self.compiled.needs_dotstar() {
160 dotstar_patch = self.c_dotstar()?;
161 self.compiled.start = dotstar_patch.entry;
162 } else {
163 self.compiled.start = 0; // first instruction is always split
164 }
165 self.fill_to_next(dotstar_patch.hole);
166
167 let mut prev_hole = Hole::None;
168 for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
169 self.fill_to_next(prev_hole);
170 let split = self.push_split_hole();
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700171 let Patch { hole, entry } =
172 self.c_capture(0, expr)?.unwrap_or(self.next_inst());
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700173 self.fill_to_next(hole);
174 self.compiled.matches.push(self.insts.len());
175 self.push_compiled(Inst::Match(i));
176 prev_hole = self.fill_split(split, Some(entry), None);
177 }
178 let i = exprs.len() - 1;
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700179 let Patch { hole, entry } =
180 self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst());
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700181 self.fill(prev_hole, entry);
182 self.fill_to_next(hole);
183 self.compiled.matches.push(self.insts.len());
184 self.push_compiled(Inst::Match(i));
185 self.compile_finish()
186 }
187
188 fn compile_finish(mut self) -> result::Result<Program, Error> {
189 self.compiled.insts =
190 self.insts.into_iter().map(|inst| inst.unwrap()).collect();
191 self.compiled.byte_classes = self.byte_classes.byte_classes();
192 self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
193 Ok(self.compiled)
194 }
195
196 /// Compile expr into self.insts, returning a patch on success,
197 /// or an error if we run out of memory.
198 ///
199 /// All of the c_* methods of the compiler share the contract outlined
200 /// here.
201 ///
202 /// The main thing that a c_* method does is mutate `self.insts`
203 /// to add a list of mostly compiled instructions required to execute
204 /// the given expression. `self.insts` contains MaybeInsts rather than
205 /// Insts because there is some backpatching required.
206 ///
207 /// The `Patch` value returned by each c_* method provides metadata
208 /// about the compiled instructions emitted to `self.insts`. The
209 /// `entry` member of the patch refers to the first instruction
210 /// (the entry point), while the `hole` member contains zero or
211 /// more offsets to partial instructions that need to be backpatched.
212 /// The c_* routine can't know where its list of instructions are going to
213 /// jump to after execution, so it is up to the caller to patch
214 /// these jumps to point to the right place. So compiling some
215 /// expression, e, we would end up with a situation that looked like:
216 ///
217 /// ```text
218 /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
219 /// ^ ^ ^
220 /// | \ /
221 /// entry \ /
222 /// hole
223 /// ```
224 ///
Chih-Hung Hsieh849e4452020-10-26 13:16:47 -0700225 /// To compile two expressions, e1 and e2, concatenated together we
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700226 /// would do:
227 ///
228 /// ```ignore
229 /// let patch1 = self.c(e1);
230 /// let patch2 = self.c(e2);
231 /// ```
232 ///
233 /// while leaves us with a situation that looks like
234 ///
235 /// ```text
236 /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
237 /// ^ ^ ^ ^
238 /// | | | |
239 /// entry1 hole1 entry2 hole2
240 /// ```
241 ///
242 /// Then to merge the two patches together into one we would backpatch
243 /// hole1 with entry2 and return a new patch that enters at entry1
244 /// and has hole2 for a hole. In fact, if you look at the c_concat
245 /// method you will see that it does exactly this, though it handles
246 /// a list of expressions rather than just the two that we use for
247 /// an example.
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700248 ///
249 /// Ok(None) is returned when an expression is compiled to no
250 /// instruction, and so no patch.entry value makes sense.
251 fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700252 use prog;
253 use syntax::hir::HirKind::*;
254
255 self.check_size()?;
256 match *expr.kind() {
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700257 Empty => Ok(None),
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700258 Literal(hir::Literal::Unicode(c)) => self.c_char(c),
259 Literal(hir::Literal::Byte(b)) => {
260 assert!(self.compiled.uses_bytes());
261 self.c_byte(b)
262 }
263 Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
264 Class(hir::Class::Bytes(ref cls)) => {
265 if self.compiled.uses_bytes() {
266 self.c_class_bytes(cls.ranges())
267 } else {
268 assert!(cls.is_all_ascii());
269 let mut char_ranges = vec![];
270 for r in cls.iter() {
271 let (s, e) = (r.start() as char, r.end() as char);
272 char_ranges.push(hir::ClassUnicodeRange::new(s, e));
273 }
274 self.c_class(&char_ranges)
275 }
276 }
277 Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
278 self.byte_classes.set_range(b'\n', b'\n');
279 self.c_empty_look(prog::EmptyLook::EndLine)
280 }
281 Anchor(hir::Anchor::StartLine) => {
282 self.byte_classes.set_range(b'\n', b'\n');
283 self.c_empty_look(prog::EmptyLook::StartLine)
284 }
285 Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
286 self.byte_classes.set_range(b'\n', b'\n');
287 self.c_empty_look(prog::EmptyLook::StartLine)
288 }
289 Anchor(hir::Anchor::EndLine) => {
290 self.byte_classes.set_range(b'\n', b'\n');
291 self.c_empty_look(prog::EmptyLook::EndLine)
292 }
293 Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
294 self.c_empty_look(prog::EmptyLook::EndText)
295 }
296 Anchor(hir::Anchor::StartText) => {
297 self.c_empty_look(prog::EmptyLook::StartText)
298 }
299 Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
300 self.c_empty_look(prog::EmptyLook::StartText)
301 }
302 Anchor(hir::Anchor::EndText) => {
303 self.c_empty_look(prog::EmptyLook::EndText)
304 }
305 WordBoundary(hir::WordBoundary::Unicode) => {
306 if !cfg!(feature = "unicode-perl") {
307 return Err(Error::Syntax(
308 "Unicode word boundaries are unavailable when \
309 the unicode-perl feature is disabled"
310 .to_string(),
311 ));
312 }
313 self.compiled.has_unicode_word_boundary = true;
314 self.byte_classes.set_word_boundary();
315 self.c_empty_look(prog::EmptyLook::WordBoundary)
316 }
317 WordBoundary(hir::WordBoundary::UnicodeNegate) => {
318 if !cfg!(feature = "unicode-perl") {
319 return Err(Error::Syntax(
320 "Unicode word boundaries are unavailable when \
321 the unicode-perl feature is disabled"
322 .to_string(),
323 ));
324 }
325 self.compiled.has_unicode_word_boundary = true;
326 self.byte_classes.set_word_boundary();
327 self.c_empty_look(prog::EmptyLook::NotWordBoundary)
328 }
329 WordBoundary(hir::WordBoundary::Ascii) => {
330 self.byte_classes.set_word_boundary();
331 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
332 }
333 WordBoundary(hir::WordBoundary::AsciiNegate) => {
334 self.byte_classes.set_word_boundary();
335 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
336 }
337 Group(ref g) => match g.kind {
338 hir::GroupKind::NonCapturing => self.c(&g.hir),
339 hir::GroupKind::CaptureIndex(index) => {
340 if index as usize >= self.compiled.captures.len() {
341 self.compiled.captures.push(None);
342 }
343 self.c_capture(2 * index as usize, &g.hir)
344 }
345 hir::GroupKind::CaptureName { index, ref name } => {
346 if index as usize >= self.compiled.captures.len() {
347 let n = name.to_string();
348 self.compiled.captures.push(Some(n.clone()));
349 self.capture_name_idx.insert(n, index as usize);
350 }
351 self.c_capture(2 * index as usize, &g.hir)
352 }
353 },
354 Concat(ref es) => {
355 if self.compiled.is_reverse {
356 self.c_concat(es.iter().rev())
357 } else {
358 self.c_concat(es)
359 }
360 }
361 Alternation(ref es) => self.c_alternate(&**es),
362 Repetition(ref rep) => self.c_repeat(rep),
363 }
364 }
365
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700366 fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700367 if self.num_exprs > 1 || self.compiled.is_dfa {
368 // Don't ever compile Save instructions for regex sets because
369 // they are never used. They are also never used in DFA programs
370 // because DFAs can't handle captures.
371 self.c(expr)
372 } else {
373 let entry = self.insts.len();
374 let hole = self.push_hole(InstHole::Save { slot: first_slot });
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700375 let patch = self.c(expr)?.unwrap_or(self.next_inst());
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700376 self.fill(hole, patch.entry);
377 self.fill_to_next(patch.hole);
378 let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700379 Ok(Some(Patch { hole: hole, entry: entry }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700380 }
381 }
382
383 fn c_dotstar(&mut self) -> Result {
384 Ok(if !self.compiled.only_utf8() {
385 self.c(&Hir::repetition(hir::Repetition {
386 kind: hir::RepetitionKind::ZeroOrMore,
387 greedy: false,
388 hir: Box::new(Hir::any(true)),
389 }))?
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700390 .unwrap()
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700391 } else {
392 self.c(&Hir::repetition(hir::Repetition {
393 kind: hir::RepetitionKind::ZeroOrMore,
394 greedy: false,
395 hir: Box::new(Hir::any(false)),
396 }))?
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700397 .unwrap()
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700398 })
399 }
400
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700401 fn c_char(&mut self, c: char) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700402 if self.compiled.uses_bytes() {
403 if c.is_ascii() {
404 let b = c as u8;
405 let hole =
406 self.push_hole(InstHole::Bytes { start: b, end: b });
407 self.byte_classes.set_range(b, b);
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700408 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700409 } else {
410 self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
411 }
412 } else {
413 let hole = self.push_hole(InstHole::Char { c: c });
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700414 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700415 }
416 }
417
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700418 fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700419 assert!(!ranges.is_empty());
420 if self.compiled.uses_bytes() {
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700421 Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700422 } else {
423 let ranges: Vec<(char, char)> =
424 ranges.iter().map(|r| (r.start(), r.end())).collect();
425 let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
426 self.push_hole(InstHole::Char { c: ranges[0].0 })
427 } else {
428 self.push_hole(InstHole::Ranges { ranges: ranges })
429 };
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700430 Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700431 }
432 }
433
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700434 fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700435 self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
436 }
437
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700438 fn c_class_bytes(
439 &mut self,
440 ranges: &[hir::ClassBytesRange],
441 ) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700442 debug_assert!(!ranges.is_empty());
443
444 let first_split_entry = self.insts.len();
445 let mut holes = vec![];
446 let mut prev_hole = Hole::None;
447 for r in &ranges[0..ranges.len() - 1] {
448 self.fill_to_next(prev_hole);
449 let split = self.push_split_hole();
450 let next = self.insts.len();
451 self.byte_classes.set_range(r.start(), r.end());
452 holes.push(self.push_hole(InstHole::Bytes {
453 start: r.start(),
454 end: r.end(),
455 }));
456 prev_hole = self.fill_split(split, Some(next), None);
457 }
458 let next = self.insts.len();
459 let r = &ranges[ranges.len() - 1];
460 self.byte_classes.set_range(r.start(), r.end());
461 holes.push(
462 self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
463 );
464 self.fill(prev_hole, next);
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700465 Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700466 }
467
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700468 fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700469 let hole = self.push_hole(InstHole::EmptyLook { look: look });
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700470 Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700471 }
472
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700473 fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700474 where
475 I: IntoIterator<Item = &'a Hir>,
476 {
477 let mut exprs = exprs.into_iter();
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700478 let Patch { mut hole, entry } = loop {
479 match exprs.next() {
480 None => return Ok(None),
481 Some(e) => {
482 if let Some(p) = self.c(e)? {
483 break p;
484 }
485 }
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700486 }
487 };
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700488 for e in exprs {
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700489 if let Some(p) = self.c(e)? {
490 self.fill(hole, p.entry);
491 hole = p.hole;
492 }
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700493 }
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700494 Ok(Some(Patch { hole: hole, entry: entry }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700495 }
496
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700497 fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700498 debug_assert!(
499 exprs.len() >= 2,
500 "alternates must have at least 2 exprs"
501 );
502
503 // Initial entry point is always the first split.
504 let first_split_entry = self.insts.len();
505
506 // Save up all of the holes from each alternate. They will all get
507 // patched to point to the same location.
508 let mut holes = vec![];
509
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700510 // true indicates that the hole is a split where we want to fill
511 // the second branch.
512 let mut prev_hole = (Hole::None, false);
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700513 for e in &exprs[0..exprs.len() - 1] {
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700514 if prev_hole.1 {
515 let next = self.insts.len();
516 self.fill_split(prev_hole.0, None, Some(next));
517 } else {
518 self.fill_to_next(prev_hole.0);
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700519 }
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700520 let split = self.push_split_hole();
521 if let Some(Patch { hole, entry }) = self.c(e)? {
522 holes.push(hole);
523 prev_hole = (self.fill_split(split, Some(entry), None), false);
524 } else {
525 let (split1, split2) = split.dup_one();
526 holes.push(split1);
527 prev_hole = (split2, true);
528 }
529 }
530 if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700531 holes.push(hole);
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700532 if prev_hole.1 {
533 self.fill_split(prev_hole.0, None, Some(entry));
534 } else {
535 self.fill(prev_hole.0, entry);
536 }
537 } else {
538 // We ignore prev_hole.1. When it's true, it means we have two
539 // empty branches both pushing prev_hole.0 into holes, so both
540 // branches will go to the same place anyway.
541 holes.push(prev_hole.0);
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700542 }
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700543 Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700544 }
545
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700546 fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700547 use syntax::hir::RepetitionKind::*;
548 match rep.kind {
549 ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
550 ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
551 OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
552 Range(hir::RepetitionRange::Exactly(min_max)) => {
553 self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
554 }
555 Range(hir::RepetitionRange::AtLeast(min)) => {
556 self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
557 }
558 Range(hir::RepetitionRange::Bounded(min, max)) => {
559 self.c_repeat_range(&rep.hir, rep.greedy, min, max)
560 }
561 }
562 }
563
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700564 fn c_repeat_zero_or_one(
565 &mut self,
566 expr: &Hir,
567 greedy: bool,
568 ) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700569 let split_entry = self.insts.len();
570 let split = self.push_split_hole();
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700571 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
572 Some(p) => p,
573 None => return self.pop_split_hole(),
574 };
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700575 let split_hole = if greedy {
576 self.fill_split(split, Some(entry_rep), None)
577 } else {
578 self.fill_split(split, None, Some(entry_rep))
579 };
580 let holes = vec![hole_rep, split_hole];
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700581 Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700582 }
583
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700584 fn c_repeat_zero_or_more(
585 &mut self,
586 expr: &Hir,
587 greedy: bool,
588 ) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700589 let split_entry = self.insts.len();
590 let split = self.push_split_hole();
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700591 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
592 Some(p) => p,
593 None => return self.pop_split_hole(),
594 };
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700595
596 self.fill(hole_rep, split_entry);
597 let split_hole = if greedy {
598 self.fill_split(split, Some(entry_rep), None)
599 } else {
600 self.fill_split(split, None, Some(entry_rep))
601 };
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700602 Ok(Some(Patch { hole: split_hole, entry: split_entry }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700603 }
604
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700605 fn c_repeat_one_or_more(
606 &mut self,
607 expr: &Hir,
608 greedy: bool,
609 ) -> ResultOrEmpty {
610 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
611 Some(p) => p,
612 None => return Ok(None),
613 };
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700614 self.fill_to_next(hole_rep);
615 let split = self.push_split_hole();
616
617 let split_hole = if greedy {
618 self.fill_split(split, Some(entry_rep), None)
619 } else {
620 self.fill_split(split, None, Some(entry_rep))
621 };
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700622 Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700623 }
624
625 fn c_repeat_range_min_or_more(
626 &mut self,
627 expr: &Hir,
628 greedy: bool,
629 min: u32,
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700630 ) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700631 let min = u32_to_usize(min);
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700632 // Using next_inst() is ok, because we can't return it (concat would
633 // have to return Some(_) while c_repeat_range_min_or_more returns
634 // None).
635 let patch_concat = self
636 .c_concat(iter::repeat(expr).take(min))?
637 .unwrap_or(self.next_inst());
638 if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
639 self.fill(patch_concat.hole, patch_rep.entry);
640 Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
641 } else {
642 Ok(None)
643 }
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700644 }
645
646 fn c_repeat_range(
647 &mut self,
648 expr: &Hir,
649 greedy: bool,
650 min: u32,
651 max: u32,
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700652 ) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700653 let (min, max) = (u32_to_usize(min), u32_to_usize(max));
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700654 debug_assert!(min <= max);
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700655 let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700656 if min == max {
657 return Ok(patch_concat);
658 }
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700659 // Same reasoning as in c_repeat_range_min_or_more (we know that min <
660 // max at this point).
661 let patch_concat = patch_concat.unwrap_or(self.next_inst());
662 let initial_entry = patch_concat.entry;
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700663 // It is much simpler to compile, e.g., `a{2,5}` as:
664 //
665 // aaa?a?a?
666 //
667 // But you end up with a sequence of instructions like this:
668 //
669 // 0: 'a'
670 // 1: 'a',
671 // 2: split(3, 4)
672 // 3: 'a'
673 // 4: split(5, 6)
674 // 5: 'a'
675 // 6: split(7, 8)
676 // 7: 'a'
677 // 8: MATCH
678 //
679 // This is *incredibly* inefficient because the splits end
680 // up forming a chain, which has to be resolved everything a
681 // transition is followed.
682 let mut holes = vec![];
683 let mut prev_hole = patch_concat.hole;
684 for _ in min..max {
685 self.fill_to_next(prev_hole);
686 let split = self.push_split_hole();
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700687 let Patch { hole, entry } = match self.c(expr)? {
688 Some(p) => p,
689 None => return self.pop_split_hole(),
690 };
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700691 prev_hole = hole;
692 if greedy {
693 holes.push(self.fill_split(split, Some(entry), None));
694 } else {
695 holes.push(self.fill_split(split, None, Some(entry)));
696 }
697 }
698 holes.push(prev_hole);
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700699 Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
700 }
701
702 /// Can be used as a default value for the c_* functions when the call to
703 /// c_function is followed by inserting at least one instruction that is
704 /// always executed after the ones written by the c* function.
705 fn next_inst(&self) -> Patch {
706 Patch { hole: Hole::None, entry: self.insts.len() }
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700707 }
708
709 fn fill(&mut self, hole: Hole, goto: InstPtr) {
710 match hole {
711 Hole::None => {}
712 Hole::One(pc) => {
713 self.insts[pc].fill(goto);
714 }
715 Hole::Many(holes) => {
716 for hole in holes {
717 self.fill(hole, goto);
718 }
719 }
720 }
721 }
722
723 fn fill_to_next(&mut self, hole: Hole) {
724 let next = self.insts.len();
725 self.fill(hole, next);
726 }
727
728 fn fill_split(
729 &mut self,
730 hole: Hole,
731 goto1: Option<InstPtr>,
732 goto2: Option<InstPtr>,
733 ) -> Hole {
734 match hole {
735 Hole::None => Hole::None,
736 Hole::One(pc) => match (goto1, goto2) {
737 (Some(goto1), Some(goto2)) => {
738 self.insts[pc].fill_split(goto1, goto2);
739 Hole::None
740 }
741 (Some(goto1), None) => {
742 self.insts[pc].half_fill_split_goto1(goto1);
743 Hole::One(pc)
744 }
745 (None, Some(goto2)) => {
746 self.insts[pc].half_fill_split_goto2(goto2);
747 Hole::One(pc)
748 }
749 (None, None) => unreachable!(
750 "at least one of the split \
751 holes must be filled"
752 ),
753 },
754 Hole::Many(holes) => {
755 let mut new_holes = vec![];
756 for hole in holes {
757 new_holes.push(self.fill_split(hole, goto1, goto2));
758 }
759 if new_holes.is_empty() {
760 Hole::None
761 } else if new_holes.len() == 1 {
762 new_holes.pop().unwrap()
763 } else {
764 Hole::Many(new_holes)
765 }
766 }
767 }
768 }
769
770 fn push_compiled(&mut self, inst: Inst) {
771 self.insts.push(MaybeInst::Compiled(inst));
772 }
773
774 fn push_hole(&mut self, inst: InstHole) -> Hole {
775 let hole = self.insts.len();
776 self.insts.push(MaybeInst::Uncompiled(inst));
777 Hole::One(hole)
778 }
779
780 fn push_split_hole(&mut self) -> Hole {
781 let hole = self.insts.len();
782 self.insts.push(MaybeInst::Split);
783 Hole::One(hole)
784 }
785
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700786 fn pop_split_hole(&mut self) -> ResultOrEmpty {
787 self.insts.pop();
788 Ok(None)
789 }
790
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700791 fn check_size(&self) -> result::Result<(), Error> {
792 use std::mem::size_of;
793
794 if self.insts.len() * size_of::<Inst>() > self.size_limit {
795 Err(Error::CompiledTooBig(self.size_limit))
796 } else {
797 Ok(())
798 }
799 }
800}
801
802#[derive(Debug)]
803enum Hole {
804 None,
805 One(InstPtr),
806 Many(Vec<Hole>),
807}
808
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700809impl Hole {
810 fn dup_one(self) -> (Self, Self) {
811 match self {
812 Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
813 Hole::None | Hole::Many(_) => {
814 unreachable!("must be called on single hole")
815 }
816 }
817 }
818}
819
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700820#[derive(Clone, Debug)]
821enum MaybeInst {
822 Compiled(Inst),
823 Uncompiled(InstHole),
824 Split,
825 Split1(InstPtr),
826 Split2(InstPtr),
827}
828
829impl MaybeInst {
830 fn fill(&mut self, goto: InstPtr) {
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700831 let maybeinst = match *self {
832 MaybeInst::Split => MaybeInst::Split1(goto),
833 MaybeInst::Uncompiled(ref inst) => {
834 MaybeInst::Compiled(inst.fill(goto))
835 }
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700836 MaybeInst::Split1(goto1) => {
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700837 MaybeInst::Compiled(Inst::Split(InstSplit {
838 goto1: goto1,
839 goto2: goto,
840 }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700841 }
842 MaybeInst::Split2(goto2) => {
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700843 MaybeInst::Compiled(Inst::Split(InstSplit {
844 goto1: goto,
845 goto2: goto2,
846 }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700847 }
848 _ => unreachable!(
849 "not all instructions were compiled! \
850 found uncompiled instruction: {:?}",
851 self
852 ),
853 };
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700854 *self = maybeinst;
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700855 }
856
857 fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
858 let filled = match *self {
859 MaybeInst::Split => {
860 Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
861 }
862 _ => unreachable!(
863 "must be called on Split instruction, \
864 instead it was called on: {:?}",
865 self
866 ),
867 };
868 *self = MaybeInst::Compiled(filled);
869 }
870
871 fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
872 let half_filled = match *self {
873 MaybeInst::Split => goto1,
874 _ => unreachable!(
875 "must be called on Split instruction, \
876 instead it was called on: {:?}",
877 self
878 ),
879 };
880 *self = MaybeInst::Split1(half_filled);
881 }
882
883 fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
884 let half_filled = match *self {
885 MaybeInst::Split => goto2,
886 _ => unreachable!(
887 "must be called on Split instruction, \
888 instead it was called on: {:?}",
889 self
890 ),
891 };
892 *self = MaybeInst::Split2(half_filled);
893 }
894
895 fn unwrap(self) -> Inst {
896 match self {
897 MaybeInst::Compiled(inst) => inst,
898 _ => unreachable!(
899 "must be called on a compiled instruction, \
900 instead it was called on: {:?}",
901 self
902 ),
903 }
904 }
905}
906
907#[derive(Clone, Debug)]
908enum InstHole {
909 Save { slot: usize },
910 EmptyLook { look: EmptyLook },
911 Char { c: char },
912 Ranges { ranges: Vec<(char, char)> },
913 Bytes { start: u8, end: u8 },
914}
915
916impl InstHole {
917 fn fill(&self, goto: InstPtr) -> Inst {
918 match *self {
919 InstHole::Save { slot } => {
920 Inst::Save(InstSave { goto: goto, slot: slot })
921 }
922 InstHole::EmptyLook { look } => {
923 Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
924 }
925 InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
926 InstHole::Ranges { ref ranges } => {
927 Inst::Ranges(InstRanges { goto: goto, ranges: ranges.clone() })
928 }
929 InstHole::Bytes { start, end } => {
930 Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
931 }
932 }
933 }
934}
935
936struct CompileClass<'a, 'b> {
937 c: &'a mut Compiler,
938 ranges: &'b [hir::ClassUnicodeRange],
939}
940
941impl<'a, 'b> CompileClass<'a, 'b> {
942 fn compile(mut self) -> Result {
943 let mut holes = vec![];
944 let mut initial_entry = None;
945 let mut last_split = Hole::None;
946 let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
947 self.c.suffix_cache.clear();
948
949 for (i, range) in self.ranges.iter().enumerate() {
950 let is_last_range = i + 1 == self.ranges.len();
951 utf8_seqs.reset(range.start(), range.end());
952 let mut it = (&mut utf8_seqs).peekable();
953 loop {
954 let utf8_seq = match it.next() {
955 None => break,
956 Some(utf8_seq) => utf8_seq,
957 };
958 if is_last_range && it.peek().is_none() {
959 let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
960 holes.push(hole);
961 self.c.fill(last_split, entry);
962 last_split = Hole::None;
963 if initial_entry.is_none() {
964 initial_entry = Some(entry);
965 }
966 } else {
967 if initial_entry.is_none() {
968 initial_entry = Some(self.c.insts.len());
969 }
970 self.c.fill_to_next(last_split);
971 last_split = self.c.push_split_hole();
972 let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
973 holes.push(hole);
974 last_split =
975 self.c.fill_split(last_split, Some(entry), None);
976 }
977 }
978 }
979 self.c.utf8_seqs = Some(utf8_seqs);
980 Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
981 }
982
983 fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
984 if self.c.compiled.is_reverse {
985 self.c_utf8_seq_(seq)
986 } else {
987 self.c_utf8_seq_(seq.into_iter().rev())
988 }
989 }
990
991 fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
992 where
993 I: IntoIterator<Item = &'r Utf8Range>,
994 {
995 // The initial instruction for each UTF-8 sequence should be the same.
996 let mut from_inst = ::std::usize::MAX;
997 let mut last_hole = Hole::None;
998 for byte_range in seq {
999 let key = SuffixCacheKey {
1000 from_inst: from_inst,
1001 start: byte_range.start,
1002 end: byte_range.end,
1003 };
1004 {
1005 let pc = self.c.insts.len();
1006 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
1007 from_inst = cached_pc;
1008 continue;
1009 }
1010 }
1011 self.c.byte_classes.set_range(byte_range.start, byte_range.end);
1012 if from_inst == ::std::usize::MAX {
1013 last_hole = self.c.push_hole(InstHole::Bytes {
1014 start: byte_range.start,
1015 end: byte_range.end,
1016 });
1017 } else {
1018 self.c.push_compiled(Inst::Bytes(InstBytes {
1019 goto: from_inst,
1020 start: byte_range.start,
1021 end: byte_range.end,
1022 }));
1023 }
1024 from_inst = self.c.insts.len().checked_sub(1).unwrap();
1025 debug_assert!(from_inst < ::std::usize::MAX);
1026 }
1027 debug_assert!(from_inst < ::std::usize::MAX);
1028 Ok(Patch { hole: last_hole, entry: from_inst })
1029 }
1030}
1031
1032/// `SuffixCache` is a simple bounded hash map for caching suffix entries in
1033/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
1034/// The set of byte ranges looks like this:
1035///
1036/// [0-7F]
1037/// [C2-DF][80-BF]
1038/// [E0][A0-BF][80-BF]
1039/// [E1-EC][80-BF][80-BF]
1040/// [ED][80-9F][80-BF]
1041/// [EE-EF][80-BF][80-BF]
1042///
1043/// Each line above translates to one alternate in the compiled regex program.
1044/// However, all but one of the alternates end in the same suffix, which is
1045/// a waste of an instruction. The suffix cache facilitates reusing them across
1046/// alternates.
1047///
1048/// Note that a HashMap could be trivially used for this, but we don't need its
1049/// overhead. Some small bounded space (LRU style) is more than enough.
1050///
1051/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
1052/// except it uses hashes as original indices and then compares full keys for
1053/// validation against `dense` array.
1054struct SuffixCache {
1055 sparse: Box<[usize]>,
1056 dense: Vec<SuffixCacheEntry>,
1057}
1058
1059#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1060struct SuffixCacheEntry {
1061 key: SuffixCacheKey,
1062 pc: InstPtr,
1063}
1064
1065#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1066struct SuffixCacheKey {
1067 from_inst: InstPtr,
1068 start: u8,
1069 end: u8,
1070}
1071
1072impl SuffixCache {
1073 fn new(size: usize) -> Self {
1074 SuffixCache {
1075 sparse: vec![0usize; size].into(),
1076 dense: Vec::with_capacity(size),
1077 }
1078 }
1079
1080 fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1081 let hash = self.hash(&key);
1082 let pos = &mut self.sparse[hash];
1083 if let Some(entry) = self.dense.get(*pos) {
1084 if entry.key == key {
1085 return Some(entry.pc);
1086 }
1087 }
1088 *pos = self.dense.len();
1089 self.dense.push(SuffixCacheEntry { key: key, pc: pc });
1090 None
1091 }
1092
1093 fn clear(&mut self) {
1094 self.dense.clear();
1095 }
1096
1097 fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1098 // Basic FNV-1a hash as described:
1099 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1100 const FNV_PRIME: u64 = 1099511628211;
1101 let mut h = 14695981039346656037;
1102 h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1103 h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1104 h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1105 (h as usize) % self.sparse.len()
1106 }
1107}
1108
1109struct ByteClassSet([bool; 256]);
1110
1111impl ByteClassSet {
1112 fn new() -> Self {
1113 ByteClassSet([false; 256])
1114 }
1115
1116 fn set_range(&mut self, start: u8, end: u8) {
1117 debug_assert!(start <= end);
1118 if start > 0 {
1119 self.0[start as usize - 1] = true;
1120 }
1121 self.0[end as usize] = true;
1122 }
1123
1124 fn set_word_boundary(&mut self) {
1125 // We need to mark all ranges of bytes whose pairs result in
1126 // evaluating \b differently.
1127 let iswb = is_word_byte;
1128 let mut b1: u16 = 0;
1129 let mut b2: u16;
1130 while b1 <= 255 {
1131 b2 = b1 + 1;
1132 while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1133 b2 += 1;
1134 }
1135 self.set_range(b1 as u8, (b2 - 1) as u8);
1136 b1 = b2;
1137 }
1138 }
1139
1140 fn byte_classes(&self) -> Vec<u8> {
1141 // N.B. If you're debugging the DFA, it's useful to simply return
1142 // `(0..256).collect()`, which effectively removes the byte classes
1143 // and makes the transitions easier to read.
1144 // (0usize..256).map(|x| x as u8).collect()
1145 let mut byte_classes = vec![0; 256];
1146 let mut class = 0u8;
1147 let mut i = 0;
1148 loop {
1149 byte_classes[i] = class as u8;
1150 if i >= 255 {
1151 break;
1152 }
1153 if self.0[i] {
1154 class = class.checked_add(1).unwrap();
1155 }
1156 i += 1;
1157 }
1158 byte_classes
1159 }
1160}
1161
1162fn u32_to_usize(n: u32) -> usize {
1163 // In case usize is less than 32 bits, we need to guard against overflow.
1164 // On most platforms this compiles to nothing.
1165 // TODO Use `std::convert::TryFrom` once it's stable.
1166 if (n as u64) > (::std::usize::MAX as u64) {
1167 panic!("BUG: {} is too big to be pointer sized", n)
1168 }
1169 n as usize
1170}
1171
1172#[cfg(test)]
1173mod tests {
1174 use super::ByteClassSet;
1175
1176 #[test]
1177 fn byte_classes() {
1178 let mut set = ByteClassSet::new();
1179 set.set_range(b'a', b'z');
1180 let classes = set.byte_classes();
1181 assert_eq!(classes[0], 0);
1182 assert_eq!(classes[1], 0);
1183 assert_eq!(classes[2], 0);
1184 assert_eq!(classes[b'a' as usize - 1], 0);
1185 assert_eq!(classes[b'a' as usize], 1);
1186 assert_eq!(classes[b'm' as usize], 1);
1187 assert_eq!(classes[b'z' as usize], 1);
1188 assert_eq!(classes[b'z' as usize + 1], 2);
1189 assert_eq!(classes[254], 2);
1190 assert_eq!(classes[255], 2);
1191
1192 let mut set = ByteClassSet::new();
1193 set.set_range(0, 2);
1194 set.set_range(4, 6);
1195 let classes = set.byte_classes();
1196 assert_eq!(classes[0], 0);
1197 assert_eq!(classes[1], 0);
1198 assert_eq!(classes[2], 0);
1199 assert_eq!(classes[3], 1);
1200 assert_eq!(classes[4], 2);
1201 assert_eq!(classes[5], 2);
1202 assert_eq!(classes[6], 2);
1203 assert_eq!(classes[7], 3);
1204 assert_eq!(classes[255], 3);
1205 }
1206
1207 #[test]
1208 fn full_byte_classes() {
1209 let mut set = ByteClassSet::new();
1210 for i in 0..256u16 {
1211 set.set_range(i as u8, i as u8);
1212 }
1213 assert_eq!(set.byte_classes().len(), 256);
1214 }
1215}