blob: 9ffd347044afbeb13d31eeb9adec9ab4d3389d3e [file] [log] [blame]
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -07001use std::collections::HashMap;
Haibo Huang47619dd2021-01-08 17:05:43 -08002use std::fmt;
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -07003use std::iter;
4use std::result;
5use std::sync::Arc;
6
7use syntax::hir::{self, Hir};
8use syntax::is_word_byte;
9use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
10
11use prog::{
12 EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
13 InstSave, InstSplit, Program,
14};
15
16use Error;
17
18type Result = result::Result<Patch, Error>;
Haibo Huang49cbe5f2020-05-28 20:14:24 -070019type ResultOrEmpty = result::Result<Option<Patch>, Error>;
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -070020
21#[derive(Debug)]
22struct Patch {
23 hole: Hole,
24 entry: InstPtr,
25}
26
27/// A compiler translates a regular expression AST to a sequence of
28/// instructions. The sequence of instructions represents an NFA.
Haibo Huang47619dd2021-01-08 17:05:43 -080029// `Compiler` is only public via the `internal` module, so avoid deriving
30// `Debug`.
31#[allow(missing_debug_implementations)]
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -070032pub struct Compiler {
33 insts: Vec<MaybeInst>,
34 compiled: Program,
35 capture_name_idx: HashMap<String, usize>,
36 num_exprs: usize,
37 size_limit: usize,
38 suffix_cache: SuffixCache,
39 utf8_seqs: Option<Utf8Sequences>,
40 byte_classes: ByteClassSet,
41}
42
43impl Compiler {
44 /// Create a new regular expression compiler.
45 ///
46 /// Various options can be set before calling `compile` on an expression.
47 pub fn new() -> Self {
48 Compiler {
49 insts: vec![],
50 compiled: Program::new(),
51 capture_name_idx: HashMap::new(),
52 num_exprs: 0,
53 size_limit: 10 * (1 << 20),
54 suffix_cache: SuffixCache::new(1000),
55 utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
56 byte_classes: ByteClassSet::new(),
57 }
58 }
59
60 /// The size of the resulting program is limited by size_limit. If
61 /// the program approximately exceeds the given size (in bytes), then
62 /// compilation will stop and return an error.
63 pub fn size_limit(mut self, size_limit: usize) -> Self {
64 self.size_limit = size_limit;
65 self
66 }
67
68 /// If bytes is true, then the program is compiled as a byte based
69 /// automaton, which incorporates UTF-8 decoding into the machine. If it's
70 /// false, then the automaton is Unicode scalar value based, e.g., an
71 /// engine utilizing such an automaton is responsible for UTF-8 decoding.
72 ///
73 /// The specific invariant is that when returning a byte based machine,
74 /// the neither the `Char` nor `Ranges` instructions are produced.
75 /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
76 /// instruction is never produced.
77 ///
78 /// Note that `dfa(true)` implies `bytes(true)`.
79 pub fn bytes(mut self, yes: bool) -> Self {
80 self.compiled.is_bytes = yes;
81 self
82 }
83
84 /// When disabled, the program compiled may match arbitrary bytes.
85 ///
86 /// When enabled (the default), all compiled programs exclusively match
87 /// valid UTF-8 bytes.
88 pub fn only_utf8(mut self, yes: bool) -> Self {
89 self.compiled.only_utf8 = yes;
90 self
91 }
92
93 /// When set, the machine returned is suitable for use in the DFA matching
94 /// engine.
95 ///
96 /// In particular, this ensures that if the regex is not anchored in the
97 /// beginning, then a preceding `.*?` is included in the program. (The NFA
98 /// based engines handle the preceding `.*?` explicitly, which is difficult
99 /// or impossible in the DFA engine.)
100 pub fn dfa(mut self, yes: bool) -> Self {
101 self.compiled.is_dfa = yes;
102 self
103 }
104
105 /// When set, the machine returned is suitable for matching text in
106 /// reverse. In particular, all concatenations are flipped.
107 pub fn reverse(mut self, yes: bool) -> Self {
108 self.compiled.is_reverse = yes;
109 self
110 }
111
112 /// Compile a regular expression given its AST.
113 ///
114 /// The compiler is guaranteed to succeed unless the program exceeds the
115 /// specified size limit. If the size limit is exceeded, then compilation
116 /// stops and returns an error.
117 pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
118 debug_assert!(!exprs.is_empty());
119 self.num_exprs = exprs.len();
120 if exprs.len() == 1 {
121 self.compile_one(&exprs[0])
122 } else {
123 self.compile_many(exprs)
124 }
125 }
126
127 fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
128 // If we're compiling a forward DFA and we aren't anchored, then
129 // add a `.*?` before the first capture group.
130 // Other matching engines handle this by baking the logic into the
131 // matching engine itself.
132 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
133 self.compiled.is_anchored_start = expr.is_anchored_start();
134 self.compiled.is_anchored_end = expr.is_anchored_end();
135 if self.compiled.needs_dotstar() {
136 dotstar_patch = self.c_dotstar()?;
137 self.compiled.start = dotstar_patch.entry;
138 }
139 self.compiled.captures = vec![None];
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700140 let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst());
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700141 if self.compiled.needs_dotstar() {
142 self.fill(dotstar_patch.hole, patch.entry);
143 } else {
144 self.compiled.start = patch.entry;
145 }
146 self.fill_to_next(patch.hole);
147 self.compiled.matches = vec![self.insts.len()];
148 self.push_compiled(Inst::Match(0));
149 self.compile_finish()
150 }
151
152 fn compile_many(
153 mut self,
154 exprs: &[Hir],
155 ) -> result::Result<Program, Error> {
156 debug_assert!(exprs.len() > 1);
157
158 self.compiled.is_anchored_start =
159 exprs.iter().all(|e| e.is_anchored_start());
160 self.compiled.is_anchored_end =
161 exprs.iter().all(|e| e.is_anchored_end());
162 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
163 if self.compiled.needs_dotstar() {
164 dotstar_patch = self.c_dotstar()?;
165 self.compiled.start = dotstar_patch.entry;
166 } else {
167 self.compiled.start = 0; // first instruction is always split
168 }
169 self.fill_to_next(dotstar_patch.hole);
170
171 let mut prev_hole = Hole::None;
172 for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
173 self.fill_to_next(prev_hole);
174 let split = self.push_split_hole();
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700175 let Patch { hole, entry } =
176 self.c_capture(0, expr)?.unwrap_or(self.next_inst());
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700177 self.fill_to_next(hole);
178 self.compiled.matches.push(self.insts.len());
179 self.push_compiled(Inst::Match(i));
180 prev_hole = self.fill_split(split, Some(entry), None);
181 }
182 let i = exprs.len() - 1;
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700183 let Patch { hole, entry } =
184 self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst());
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700185 self.fill(prev_hole, entry);
186 self.fill_to_next(hole);
187 self.compiled.matches.push(self.insts.len());
188 self.push_compiled(Inst::Match(i));
189 self.compile_finish()
190 }
191
192 fn compile_finish(mut self) -> result::Result<Program, Error> {
193 self.compiled.insts =
194 self.insts.into_iter().map(|inst| inst.unwrap()).collect();
195 self.compiled.byte_classes = self.byte_classes.byte_classes();
196 self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
197 Ok(self.compiled)
198 }
199
200 /// Compile expr into self.insts, returning a patch on success,
201 /// or an error if we run out of memory.
202 ///
203 /// All of the c_* methods of the compiler share the contract outlined
204 /// here.
205 ///
206 /// The main thing that a c_* method does is mutate `self.insts`
207 /// to add a list of mostly compiled instructions required to execute
208 /// the given expression. `self.insts` contains MaybeInsts rather than
209 /// Insts because there is some backpatching required.
210 ///
211 /// The `Patch` value returned by each c_* method provides metadata
212 /// about the compiled instructions emitted to `self.insts`. The
213 /// `entry` member of the patch refers to the first instruction
214 /// (the entry point), while the `hole` member contains zero or
215 /// more offsets to partial instructions that need to be backpatched.
216 /// The c_* routine can't know where its list of instructions are going to
217 /// jump to after execution, so it is up to the caller to patch
218 /// these jumps to point to the right place. So compiling some
219 /// expression, e, we would end up with a situation that looked like:
220 ///
221 /// ```text
222 /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
223 /// ^ ^ ^
224 /// | \ /
225 /// entry \ /
226 /// hole
227 /// ```
228 ///
Chih-Hung Hsieh849e4452020-10-26 13:16:47 -0700229 /// To compile two expressions, e1 and e2, concatenated together we
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700230 /// would do:
231 ///
232 /// ```ignore
233 /// let patch1 = self.c(e1);
234 /// let patch2 = self.c(e2);
235 /// ```
236 ///
237 /// while leaves us with a situation that looks like
238 ///
239 /// ```text
240 /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
241 /// ^ ^ ^ ^
242 /// | | | |
243 /// entry1 hole1 entry2 hole2
244 /// ```
245 ///
246 /// Then to merge the two patches together into one we would backpatch
247 /// hole1 with entry2 and return a new patch that enters at entry1
248 /// and has hole2 for a hole. In fact, if you look at the c_concat
249 /// method you will see that it does exactly this, though it handles
250 /// a list of expressions rather than just the two that we use for
251 /// an example.
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700252 ///
253 /// Ok(None) is returned when an expression is compiled to no
254 /// instruction, and so no patch.entry value makes sense.
255 fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700256 use prog;
257 use syntax::hir::HirKind::*;
258
259 self.check_size()?;
260 match *expr.kind() {
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700261 Empty => Ok(None),
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700262 Literal(hir::Literal::Unicode(c)) => self.c_char(c),
263 Literal(hir::Literal::Byte(b)) => {
264 assert!(self.compiled.uses_bytes());
265 self.c_byte(b)
266 }
267 Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
268 Class(hir::Class::Bytes(ref cls)) => {
269 if self.compiled.uses_bytes() {
270 self.c_class_bytes(cls.ranges())
271 } else {
272 assert!(cls.is_all_ascii());
273 let mut char_ranges = vec![];
274 for r in cls.iter() {
275 let (s, e) = (r.start() as char, r.end() as char);
276 char_ranges.push(hir::ClassUnicodeRange::new(s, e));
277 }
278 self.c_class(&char_ranges)
279 }
280 }
281 Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => {
282 self.byte_classes.set_range(b'\n', b'\n');
283 self.c_empty_look(prog::EmptyLook::EndLine)
284 }
285 Anchor(hir::Anchor::StartLine) => {
286 self.byte_classes.set_range(b'\n', b'\n');
287 self.c_empty_look(prog::EmptyLook::StartLine)
288 }
289 Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => {
290 self.byte_classes.set_range(b'\n', b'\n');
291 self.c_empty_look(prog::EmptyLook::StartLine)
292 }
293 Anchor(hir::Anchor::EndLine) => {
294 self.byte_classes.set_range(b'\n', b'\n');
295 self.c_empty_look(prog::EmptyLook::EndLine)
296 }
297 Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => {
298 self.c_empty_look(prog::EmptyLook::EndText)
299 }
300 Anchor(hir::Anchor::StartText) => {
301 self.c_empty_look(prog::EmptyLook::StartText)
302 }
303 Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => {
304 self.c_empty_look(prog::EmptyLook::StartText)
305 }
306 Anchor(hir::Anchor::EndText) => {
307 self.c_empty_look(prog::EmptyLook::EndText)
308 }
309 WordBoundary(hir::WordBoundary::Unicode) => {
310 if !cfg!(feature = "unicode-perl") {
311 return Err(Error::Syntax(
312 "Unicode word boundaries are unavailable when \
313 the unicode-perl feature is disabled"
314 .to_string(),
315 ));
316 }
317 self.compiled.has_unicode_word_boundary = true;
318 self.byte_classes.set_word_boundary();
319 self.c_empty_look(prog::EmptyLook::WordBoundary)
320 }
321 WordBoundary(hir::WordBoundary::UnicodeNegate) => {
322 if !cfg!(feature = "unicode-perl") {
323 return Err(Error::Syntax(
324 "Unicode word boundaries are unavailable when \
325 the unicode-perl feature is disabled"
326 .to_string(),
327 ));
328 }
329 self.compiled.has_unicode_word_boundary = true;
330 self.byte_classes.set_word_boundary();
331 self.c_empty_look(prog::EmptyLook::NotWordBoundary)
332 }
333 WordBoundary(hir::WordBoundary::Ascii) => {
334 self.byte_classes.set_word_boundary();
335 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
336 }
337 WordBoundary(hir::WordBoundary::AsciiNegate) => {
338 self.byte_classes.set_word_boundary();
339 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
340 }
341 Group(ref g) => match g.kind {
342 hir::GroupKind::NonCapturing => self.c(&g.hir),
343 hir::GroupKind::CaptureIndex(index) => {
344 if index as usize >= self.compiled.captures.len() {
345 self.compiled.captures.push(None);
346 }
347 self.c_capture(2 * index as usize, &g.hir)
348 }
349 hir::GroupKind::CaptureName { index, ref name } => {
350 if index as usize >= self.compiled.captures.len() {
351 let n = name.to_string();
352 self.compiled.captures.push(Some(n.clone()));
353 self.capture_name_idx.insert(n, index as usize);
354 }
355 self.c_capture(2 * index as usize, &g.hir)
356 }
357 },
358 Concat(ref es) => {
359 if self.compiled.is_reverse {
360 self.c_concat(es.iter().rev())
361 } else {
362 self.c_concat(es)
363 }
364 }
365 Alternation(ref es) => self.c_alternate(&**es),
366 Repetition(ref rep) => self.c_repeat(rep),
367 }
368 }
369
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700370 fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700371 if self.num_exprs > 1 || self.compiled.is_dfa {
372 // Don't ever compile Save instructions for regex sets because
373 // they are never used. They are also never used in DFA programs
374 // because DFAs can't handle captures.
375 self.c(expr)
376 } else {
377 let entry = self.insts.len();
378 let hole = self.push_hole(InstHole::Save { slot: first_slot });
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700379 let patch = self.c(expr)?.unwrap_or(self.next_inst());
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700380 self.fill(hole, patch.entry);
381 self.fill_to_next(patch.hole);
382 let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700383 Ok(Some(Patch { hole: hole, entry: entry }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700384 }
385 }
386
387 fn c_dotstar(&mut self) -> Result {
388 Ok(if !self.compiled.only_utf8() {
389 self.c(&Hir::repetition(hir::Repetition {
390 kind: hir::RepetitionKind::ZeroOrMore,
391 greedy: false,
392 hir: Box::new(Hir::any(true)),
393 }))?
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700394 .unwrap()
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700395 } else {
396 self.c(&Hir::repetition(hir::Repetition {
397 kind: hir::RepetitionKind::ZeroOrMore,
398 greedy: false,
399 hir: Box::new(Hir::any(false)),
400 }))?
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700401 .unwrap()
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700402 })
403 }
404
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700405 fn c_char(&mut self, c: char) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700406 if self.compiled.uses_bytes() {
407 if c.is_ascii() {
408 let b = c as u8;
409 let hole =
410 self.push_hole(InstHole::Bytes { start: b, end: b });
411 self.byte_classes.set_range(b, b);
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700412 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700413 } else {
414 self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
415 }
416 } else {
417 let hole = self.push_hole(InstHole::Char { c: c });
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700418 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700419 }
420 }
421
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700422 fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700423 assert!(!ranges.is_empty());
424 if self.compiled.uses_bytes() {
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700425 Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700426 } else {
427 let ranges: Vec<(char, char)> =
428 ranges.iter().map(|r| (r.start(), r.end())).collect();
429 let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
430 self.push_hole(InstHole::Char { c: ranges[0].0 })
431 } else {
432 self.push_hole(InstHole::Ranges { ranges: ranges })
433 };
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700434 Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700435 }
436 }
437
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700438 fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700439 self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
440 }
441
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700442 fn c_class_bytes(
443 &mut self,
444 ranges: &[hir::ClassBytesRange],
445 ) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700446 debug_assert!(!ranges.is_empty());
447
448 let first_split_entry = self.insts.len();
449 let mut holes = vec![];
450 let mut prev_hole = Hole::None;
451 for r in &ranges[0..ranges.len() - 1] {
452 self.fill_to_next(prev_hole);
453 let split = self.push_split_hole();
454 let next = self.insts.len();
455 self.byte_classes.set_range(r.start(), r.end());
456 holes.push(self.push_hole(InstHole::Bytes {
457 start: r.start(),
458 end: r.end(),
459 }));
460 prev_hole = self.fill_split(split, Some(next), None);
461 }
462 let next = self.insts.len();
463 let r = &ranges[ranges.len() - 1];
464 self.byte_classes.set_range(r.start(), r.end());
465 holes.push(
466 self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
467 );
468 self.fill(prev_hole, next);
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700469 Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700470 }
471
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700472 fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700473 let hole = self.push_hole(InstHole::EmptyLook { look: look });
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700474 Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700475 }
476
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700477 fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700478 where
479 I: IntoIterator<Item = &'a Hir>,
480 {
481 let mut exprs = exprs.into_iter();
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700482 let Patch { mut hole, entry } = loop {
483 match exprs.next() {
484 None => return Ok(None),
485 Some(e) => {
486 if let Some(p) = self.c(e)? {
487 break p;
488 }
489 }
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700490 }
491 };
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700492 for e in exprs {
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700493 if let Some(p) = self.c(e)? {
494 self.fill(hole, p.entry);
495 hole = p.hole;
496 }
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700497 }
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700498 Ok(Some(Patch { hole: hole, entry: entry }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700499 }
500
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700501 fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700502 debug_assert!(
503 exprs.len() >= 2,
504 "alternates must have at least 2 exprs"
505 );
506
507 // Initial entry point is always the first split.
508 let first_split_entry = self.insts.len();
509
510 // Save up all of the holes from each alternate. They will all get
511 // patched to point to the same location.
512 let mut holes = vec![];
513
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700514 // true indicates that the hole is a split where we want to fill
515 // the second branch.
516 let mut prev_hole = (Hole::None, false);
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700517 for e in &exprs[0..exprs.len() - 1] {
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700518 if prev_hole.1 {
519 let next = self.insts.len();
520 self.fill_split(prev_hole.0, None, Some(next));
521 } else {
522 self.fill_to_next(prev_hole.0);
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700523 }
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700524 let split = self.push_split_hole();
525 if let Some(Patch { hole, entry }) = self.c(e)? {
526 holes.push(hole);
527 prev_hole = (self.fill_split(split, Some(entry), None), false);
528 } else {
529 let (split1, split2) = split.dup_one();
530 holes.push(split1);
531 prev_hole = (split2, true);
532 }
533 }
534 if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700535 holes.push(hole);
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700536 if prev_hole.1 {
537 self.fill_split(prev_hole.0, None, Some(entry));
538 } else {
539 self.fill(prev_hole.0, entry);
540 }
541 } else {
542 // We ignore prev_hole.1. When it's true, it means we have two
543 // empty branches both pushing prev_hole.0 into holes, so both
544 // branches will go to the same place anyway.
545 holes.push(prev_hole.0);
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700546 }
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700547 Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700548 }
549
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700550 fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700551 use syntax::hir::RepetitionKind::*;
552 match rep.kind {
553 ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
554 ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
555 OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
556 Range(hir::RepetitionRange::Exactly(min_max)) => {
557 self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max)
558 }
559 Range(hir::RepetitionRange::AtLeast(min)) => {
560 self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
561 }
562 Range(hir::RepetitionRange::Bounded(min, max)) => {
563 self.c_repeat_range(&rep.hir, rep.greedy, min, max)
564 }
565 }
566 }
567
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700568 fn c_repeat_zero_or_one(
569 &mut self,
570 expr: &Hir,
571 greedy: bool,
572 ) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700573 let split_entry = self.insts.len();
574 let split = self.push_split_hole();
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700575 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
576 Some(p) => p,
577 None => return self.pop_split_hole(),
578 };
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700579 let split_hole = if greedy {
580 self.fill_split(split, Some(entry_rep), None)
581 } else {
582 self.fill_split(split, None, Some(entry_rep))
583 };
584 let holes = vec![hole_rep, split_hole];
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700585 Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700586 }
587
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700588 fn c_repeat_zero_or_more(
589 &mut self,
590 expr: &Hir,
591 greedy: bool,
592 ) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700593 let split_entry = self.insts.len();
594 let split = self.push_split_hole();
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700595 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
596 Some(p) => p,
597 None => return self.pop_split_hole(),
598 };
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700599
600 self.fill(hole_rep, split_entry);
601 let split_hole = if greedy {
602 self.fill_split(split, Some(entry_rep), None)
603 } else {
604 self.fill_split(split, None, Some(entry_rep))
605 };
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700606 Ok(Some(Patch { hole: split_hole, entry: split_entry }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700607 }
608
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700609 fn c_repeat_one_or_more(
610 &mut self,
611 expr: &Hir,
612 greedy: bool,
613 ) -> ResultOrEmpty {
614 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
615 Some(p) => p,
616 None => return Ok(None),
617 };
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700618 self.fill_to_next(hole_rep);
619 let split = self.push_split_hole();
620
621 let split_hole = if greedy {
622 self.fill_split(split, Some(entry_rep), None)
623 } else {
624 self.fill_split(split, None, Some(entry_rep))
625 };
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700626 Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700627 }
628
629 fn c_repeat_range_min_or_more(
630 &mut self,
631 expr: &Hir,
632 greedy: bool,
633 min: u32,
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700634 ) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700635 let min = u32_to_usize(min);
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700636 // Using next_inst() is ok, because we can't return it (concat would
637 // have to return Some(_) while c_repeat_range_min_or_more returns
638 // None).
639 let patch_concat = self
640 .c_concat(iter::repeat(expr).take(min))?
641 .unwrap_or(self.next_inst());
642 if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
643 self.fill(patch_concat.hole, patch_rep.entry);
644 Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
645 } else {
646 Ok(None)
647 }
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700648 }
649
650 fn c_repeat_range(
651 &mut self,
652 expr: &Hir,
653 greedy: bool,
654 min: u32,
655 max: u32,
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700656 ) -> ResultOrEmpty {
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700657 let (min, max) = (u32_to_usize(min), u32_to_usize(max));
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700658 debug_assert!(min <= max);
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700659 let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700660 if min == max {
661 return Ok(patch_concat);
662 }
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700663 // Same reasoning as in c_repeat_range_min_or_more (we know that min <
664 // max at this point).
665 let patch_concat = patch_concat.unwrap_or(self.next_inst());
666 let initial_entry = patch_concat.entry;
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700667 // It is much simpler to compile, e.g., `a{2,5}` as:
668 //
669 // aaa?a?a?
670 //
671 // But you end up with a sequence of instructions like this:
672 //
673 // 0: 'a'
674 // 1: 'a',
675 // 2: split(3, 4)
676 // 3: 'a'
677 // 4: split(5, 6)
678 // 5: 'a'
679 // 6: split(7, 8)
680 // 7: 'a'
681 // 8: MATCH
682 //
683 // This is *incredibly* inefficient because the splits end
684 // up forming a chain, which has to be resolved everything a
685 // transition is followed.
686 let mut holes = vec![];
687 let mut prev_hole = patch_concat.hole;
688 for _ in min..max {
689 self.fill_to_next(prev_hole);
690 let split = self.push_split_hole();
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700691 let Patch { hole, entry } = match self.c(expr)? {
692 Some(p) => p,
693 None => return self.pop_split_hole(),
694 };
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700695 prev_hole = hole;
696 if greedy {
697 holes.push(self.fill_split(split, Some(entry), None));
698 } else {
699 holes.push(self.fill_split(split, None, Some(entry)));
700 }
701 }
702 holes.push(prev_hole);
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700703 Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
704 }
705
706 /// Can be used as a default value for the c_* functions when the call to
707 /// c_function is followed by inserting at least one instruction that is
708 /// always executed after the ones written by the c* function.
709 fn next_inst(&self) -> Patch {
710 Patch { hole: Hole::None, entry: self.insts.len() }
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700711 }
712
713 fn fill(&mut self, hole: Hole, goto: InstPtr) {
714 match hole {
715 Hole::None => {}
716 Hole::One(pc) => {
717 self.insts[pc].fill(goto);
718 }
719 Hole::Many(holes) => {
720 for hole in holes {
721 self.fill(hole, goto);
722 }
723 }
724 }
725 }
726
727 fn fill_to_next(&mut self, hole: Hole) {
728 let next = self.insts.len();
729 self.fill(hole, next);
730 }
731
732 fn fill_split(
733 &mut self,
734 hole: Hole,
735 goto1: Option<InstPtr>,
736 goto2: Option<InstPtr>,
737 ) -> Hole {
738 match hole {
739 Hole::None => Hole::None,
740 Hole::One(pc) => match (goto1, goto2) {
741 (Some(goto1), Some(goto2)) => {
742 self.insts[pc].fill_split(goto1, goto2);
743 Hole::None
744 }
745 (Some(goto1), None) => {
746 self.insts[pc].half_fill_split_goto1(goto1);
747 Hole::One(pc)
748 }
749 (None, Some(goto2)) => {
750 self.insts[pc].half_fill_split_goto2(goto2);
751 Hole::One(pc)
752 }
753 (None, None) => unreachable!(
754 "at least one of the split \
755 holes must be filled"
756 ),
757 },
758 Hole::Many(holes) => {
759 let mut new_holes = vec![];
760 for hole in holes {
761 new_holes.push(self.fill_split(hole, goto1, goto2));
762 }
763 if new_holes.is_empty() {
764 Hole::None
765 } else if new_holes.len() == 1 {
766 new_holes.pop().unwrap()
767 } else {
768 Hole::Many(new_holes)
769 }
770 }
771 }
772 }
773
774 fn push_compiled(&mut self, inst: Inst) {
775 self.insts.push(MaybeInst::Compiled(inst));
776 }
777
778 fn push_hole(&mut self, inst: InstHole) -> Hole {
779 let hole = self.insts.len();
780 self.insts.push(MaybeInst::Uncompiled(inst));
781 Hole::One(hole)
782 }
783
784 fn push_split_hole(&mut self) -> Hole {
785 let hole = self.insts.len();
786 self.insts.push(MaybeInst::Split);
787 Hole::One(hole)
788 }
789
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700790 fn pop_split_hole(&mut self) -> ResultOrEmpty {
791 self.insts.pop();
792 Ok(None)
793 }
794
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700795 fn check_size(&self) -> result::Result<(), Error> {
796 use std::mem::size_of;
797
798 if self.insts.len() * size_of::<Inst>() > self.size_limit {
799 Err(Error::CompiledTooBig(self.size_limit))
800 } else {
801 Ok(())
802 }
803 }
804}
805
806#[derive(Debug)]
807enum Hole {
808 None,
809 One(InstPtr),
810 Many(Vec<Hole>),
811}
812
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700813impl Hole {
814 fn dup_one(self) -> (Self, Self) {
815 match self {
816 Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
817 Hole::None | Hole::Many(_) => {
818 unreachable!("must be called on single hole")
819 }
820 }
821 }
822}
823
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700824#[derive(Clone, Debug)]
825enum MaybeInst {
826 Compiled(Inst),
827 Uncompiled(InstHole),
828 Split,
829 Split1(InstPtr),
830 Split2(InstPtr),
831}
832
833impl MaybeInst {
834 fn fill(&mut self, goto: InstPtr) {
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700835 let maybeinst = match *self {
836 MaybeInst::Split => MaybeInst::Split1(goto),
837 MaybeInst::Uncompiled(ref inst) => {
838 MaybeInst::Compiled(inst.fill(goto))
839 }
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700840 MaybeInst::Split1(goto1) => {
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700841 MaybeInst::Compiled(Inst::Split(InstSplit {
842 goto1: goto1,
843 goto2: goto,
844 }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700845 }
846 MaybeInst::Split2(goto2) => {
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700847 MaybeInst::Compiled(Inst::Split(InstSplit {
848 goto1: goto,
849 goto2: goto2,
850 }))
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700851 }
852 _ => unreachable!(
853 "not all instructions were compiled! \
854 found uncompiled instruction: {:?}",
855 self
856 ),
857 };
Haibo Huang49cbe5f2020-05-28 20:14:24 -0700858 *self = maybeinst;
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -0700859 }
860
861 fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
862 let filled = match *self {
863 MaybeInst::Split => {
864 Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
865 }
866 _ => unreachable!(
867 "must be called on Split instruction, \
868 instead it was called on: {:?}",
869 self
870 ),
871 };
872 *self = MaybeInst::Compiled(filled);
873 }
874
875 fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
876 let half_filled = match *self {
877 MaybeInst::Split => goto1,
878 _ => unreachable!(
879 "must be called on Split instruction, \
880 instead it was called on: {:?}",
881 self
882 ),
883 };
884 *self = MaybeInst::Split1(half_filled);
885 }
886
887 fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
888 let half_filled = match *self {
889 MaybeInst::Split => goto2,
890 _ => unreachable!(
891 "must be called on Split instruction, \
892 instead it was called on: {:?}",
893 self
894 ),
895 };
896 *self = MaybeInst::Split2(half_filled);
897 }
898
899 fn unwrap(self) -> Inst {
900 match self {
901 MaybeInst::Compiled(inst) => inst,
902 _ => unreachable!(
903 "must be called on a compiled instruction, \
904 instead it was called on: {:?}",
905 self
906 ),
907 }
908 }
909}
910
911#[derive(Clone, Debug)]
912enum InstHole {
913 Save { slot: usize },
914 EmptyLook { look: EmptyLook },
915 Char { c: char },
916 Ranges { ranges: Vec<(char, char)> },
917 Bytes { start: u8, end: u8 },
918}
919
920impl InstHole {
921 fn fill(&self, goto: InstPtr) -> Inst {
922 match *self {
923 InstHole::Save { slot } => {
924 Inst::Save(InstSave { goto: goto, slot: slot })
925 }
926 InstHole::EmptyLook { look } => {
927 Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
928 }
929 InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
930 InstHole::Ranges { ref ranges } => {
931 Inst::Ranges(InstRanges { goto: goto, ranges: ranges.clone() })
932 }
933 InstHole::Bytes { start, end } => {
934 Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
935 }
936 }
937 }
938}
939
940struct CompileClass<'a, 'b> {
941 c: &'a mut Compiler,
942 ranges: &'b [hir::ClassUnicodeRange],
943}
944
945impl<'a, 'b> CompileClass<'a, 'b> {
946 fn compile(mut self) -> Result {
947 let mut holes = vec![];
948 let mut initial_entry = None;
949 let mut last_split = Hole::None;
950 let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
951 self.c.suffix_cache.clear();
952
953 for (i, range) in self.ranges.iter().enumerate() {
954 let is_last_range = i + 1 == self.ranges.len();
955 utf8_seqs.reset(range.start(), range.end());
956 let mut it = (&mut utf8_seqs).peekable();
957 loop {
958 let utf8_seq = match it.next() {
959 None => break,
960 Some(utf8_seq) => utf8_seq,
961 };
962 if is_last_range && it.peek().is_none() {
963 let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
964 holes.push(hole);
965 self.c.fill(last_split, entry);
966 last_split = Hole::None;
967 if initial_entry.is_none() {
968 initial_entry = Some(entry);
969 }
970 } else {
971 if initial_entry.is_none() {
972 initial_entry = Some(self.c.insts.len());
973 }
974 self.c.fill_to_next(last_split);
975 last_split = self.c.push_split_hole();
976 let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
977 holes.push(hole);
978 last_split =
979 self.c.fill_split(last_split, Some(entry), None);
980 }
981 }
982 }
983 self.c.utf8_seqs = Some(utf8_seqs);
984 Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
985 }
986
987 fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
988 if self.c.compiled.is_reverse {
989 self.c_utf8_seq_(seq)
990 } else {
991 self.c_utf8_seq_(seq.into_iter().rev())
992 }
993 }
994
995 fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
996 where
997 I: IntoIterator<Item = &'r Utf8Range>,
998 {
999 // The initial instruction for each UTF-8 sequence should be the same.
1000 let mut from_inst = ::std::usize::MAX;
1001 let mut last_hole = Hole::None;
1002 for byte_range in seq {
1003 let key = SuffixCacheKey {
1004 from_inst: from_inst,
1005 start: byte_range.start,
1006 end: byte_range.end,
1007 };
1008 {
1009 let pc = self.c.insts.len();
1010 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
1011 from_inst = cached_pc;
1012 continue;
1013 }
1014 }
1015 self.c.byte_classes.set_range(byte_range.start, byte_range.end);
1016 if from_inst == ::std::usize::MAX {
1017 last_hole = self.c.push_hole(InstHole::Bytes {
1018 start: byte_range.start,
1019 end: byte_range.end,
1020 });
1021 } else {
1022 self.c.push_compiled(Inst::Bytes(InstBytes {
1023 goto: from_inst,
1024 start: byte_range.start,
1025 end: byte_range.end,
1026 }));
1027 }
1028 from_inst = self.c.insts.len().checked_sub(1).unwrap();
1029 debug_assert!(from_inst < ::std::usize::MAX);
1030 }
1031 debug_assert!(from_inst < ::std::usize::MAX);
1032 Ok(Patch { hole: last_hole, entry: from_inst })
1033 }
1034}
1035
1036/// `SuffixCache` is a simple bounded hash map for caching suffix entries in
1037/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
1038/// The set of byte ranges looks like this:
1039///
1040/// [0-7F]
1041/// [C2-DF][80-BF]
1042/// [E0][A0-BF][80-BF]
1043/// [E1-EC][80-BF][80-BF]
1044/// [ED][80-9F][80-BF]
1045/// [EE-EF][80-BF][80-BF]
1046///
1047/// Each line above translates to one alternate in the compiled regex program.
1048/// However, all but one of the alternates end in the same suffix, which is
1049/// a waste of an instruction. The suffix cache facilitates reusing them across
1050/// alternates.
1051///
1052/// Note that a HashMap could be trivially used for this, but we don't need its
1053/// overhead. Some small bounded space (LRU style) is more than enough.
1054///
1055/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
1056/// except it uses hashes as original indices and then compares full keys for
1057/// validation against `dense` array.
Haibo Huang47619dd2021-01-08 17:05:43 -08001058#[derive(Debug)]
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -07001059struct SuffixCache {
1060 sparse: Box<[usize]>,
1061 dense: Vec<SuffixCacheEntry>,
1062}
1063
1064#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1065struct SuffixCacheEntry {
1066 key: SuffixCacheKey,
1067 pc: InstPtr,
1068}
1069
1070#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1071struct SuffixCacheKey {
1072 from_inst: InstPtr,
1073 start: u8,
1074 end: u8,
1075}
1076
1077impl SuffixCache {
1078 fn new(size: usize) -> Self {
1079 SuffixCache {
1080 sparse: vec![0usize; size].into(),
1081 dense: Vec::with_capacity(size),
1082 }
1083 }
1084
1085 fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1086 let hash = self.hash(&key);
1087 let pos = &mut self.sparse[hash];
1088 if let Some(entry) = self.dense.get(*pos) {
1089 if entry.key == key {
1090 return Some(entry.pc);
1091 }
1092 }
1093 *pos = self.dense.len();
1094 self.dense.push(SuffixCacheEntry { key: key, pc: pc });
1095 None
1096 }
1097
1098 fn clear(&mut self) {
1099 self.dense.clear();
1100 }
1101
1102 fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1103 // Basic FNV-1a hash as described:
1104 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1105 const FNV_PRIME: u64 = 1099511628211;
1106 let mut h = 14695981039346656037;
1107 h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1108 h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1109 h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1110 (h as usize) % self.sparse.len()
1111 }
1112}
1113
1114struct ByteClassSet([bool; 256]);
1115
1116impl ByteClassSet {
1117 fn new() -> Self {
1118 ByteClassSet([false; 256])
1119 }
1120
1121 fn set_range(&mut self, start: u8, end: u8) {
1122 debug_assert!(start <= end);
1123 if start > 0 {
1124 self.0[start as usize - 1] = true;
1125 }
1126 self.0[end as usize] = true;
1127 }
1128
1129 fn set_word_boundary(&mut self) {
1130 // We need to mark all ranges of bytes whose pairs result in
1131 // evaluating \b differently.
1132 let iswb = is_word_byte;
1133 let mut b1: u16 = 0;
1134 let mut b2: u16;
1135 while b1 <= 255 {
1136 b2 = b1 + 1;
1137 while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1138 b2 += 1;
1139 }
1140 self.set_range(b1 as u8, (b2 - 1) as u8);
1141 b1 = b2;
1142 }
1143 }
1144
1145 fn byte_classes(&self) -> Vec<u8> {
1146 // N.B. If you're debugging the DFA, it's useful to simply return
1147 // `(0..256).collect()`, which effectively removes the byte classes
1148 // and makes the transitions easier to read.
1149 // (0usize..256).map(|x| x as u8).collect()
1150 let mut byte_classes = vec![0; 256];
1151 let mut class = 0u8;
1152 let mut i = 0;
1153 loop {
1154 byte_classes[i] = class as u8;
1155 if i >= 255 {
1156 break;
1157 }
1158 if self.0[i] {
1159 class = class.checked_add(1).unwrap();
1160 }
1161 i += 1;
1162 }
1163 byte_classes
1164 }
1165}
1166
Haibo Huang47619dd2021-01-08 17:05:43 -08001167impl fmt::Debug for ByteClassSet {
1168 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1169 f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
1170 }
1171}
1172
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -07001173fn u32_to_usize(n: u32) -> usize {
1174 // In case usize is less than 32 bits, we need to guard against overflow.
1175 // On most platforms this compiles to nothing.
1176 // TODO Use `std::convert::TryFrom` once it's stable.
1177 if (n as u64) > (::std::usize::MAX as u64) {
1178 panic!("BUG: {} is too big to be pointer sized", n)
1179 }
1180 n as usize
1181}
1182
1183#[cfg(test)]
1184mod tests {
1185 use super::ByteClassSet;
1186
1187 #[test]
1188 fn byte_classes() {
1189 let mut set = ByteClassSet::new();
1190 set.set_range(b'a', b'z');
1191 let classes = set.byte_classes();
1192 assert_eq!(classes[0], 0);
1193 assert_eq!(classes[1], 0);
1194 assert_eq!(classes[2], 0);
1195 assert_eq!(classes[b'a' as usize - 1], 0);
1196 assert_eq!(classes[b'a' as usize], 1);
1197 assert_eq!(classes[b'm' as usize], 1);
1198 assert_eq!(classes[b'z' as usize], 1);
1199 assert_eq!(classes[b'z' as usize + 1], 2);
1200 assert_eq!(classes[254], 2);
1201 assert_eq!(classes[255], 2);
1202
1203 let mut set = ByteClassSet::new();
1204 set.set_range(0, 2);
1205 set.set_range(4, 6);
1206 let classes = set.byte_classes();
1207 assert_eq!(classes[0], 0);
1208 assert_eq!(classes[1], 0);
1209 assert_eq!(classes[2], 0);
1210 assert_eq!(classes[3], 1);
1211 assert_eq!(classes[4], 2);
1212 assert_eq!(classes[5], 2);
1213 assert_eq!(classes[6], 2);
1214 assert_eq!(classes[7], 3);
1215 assert_eq!(classes[255], 3);
1216 }
1217
1218 #[test]
1219 fn full_byte_classes() {
1220 let mut set = ByteClassSet::new();
1221 for i in 0..256u16 {
1222 set.set_range(i as u8, i as u8);
1223 }
1224 assert_eq!(set.byte_classes().len(), 256);
1225 }
1226}