Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame^] | 1 | use std::cmp::Ordering; |
| 2 | use std::collections::HashMap; |
| 3 | use std::fmt; |
| 4 | use std::mem; |
| 5 | use std::ops::Deref; |
| 6 | use std::slice; |
| 7 | use std::sync::Arc; |
| 8 | |
| 9 | use input::Char; |
| 10 | use literal::LiteralSearcher; |
| 11 | |
| 12 | /// `InstPtr` represents the index of an instruction in a regex program. |
| 13 | pub type InstPtr = usize; |
| 14 | |
| 15 | /// Program is a sequence of instructions and various facts about thos |
| 16 | /// instructions. |
| 17 | #[derive(Clone)] |
| 18 | pub struct Program { |
| 19 | /// A sequence of instructions that represents an NFA. |
| 20 | pub insts: Vec<Inst>, |
| 21 | /// Pointers to each Match instruction in the sequence. |
| 22 | /// |
| 23 | /// This is always length 1 unless this program represents a regex set. |
| 24 | pub matches: Vec<InstPtr>, |
| 25 | /// The ordered sequence of all capture groups extracted from the AST. |
| 26 | /// Unnamed groups are `None`. |
| 27 | pub captures: Vec<Option<String>>, |
| 28 | /// Pointers to all named capture groups into `captures`. |
| 29 | pub capture_name_idx: Arc<HashMap<String, usize>>, |
| 30 | /// A pointer to the start instruction. This can vary depending on how |
| 31 | /// the program was compiled. For example, programs for use with the DFA |
| 32 | /// engine have a `.*?` inserted at the beginning of unanchored regular |
| 33 | /// expressions. The actual starting point of the program is after the |
| 34 | /// `.*?`. |
| 35 | pub start: InstPtr, |
| 36 | /// A set of equivalence classes for discriminating bytes in the compiled |
| 37 | /// program. |
| 38 | pub byte_classes: Vec<u8>, |
| 39 | /// When true, this program can only match valid UTF-8. |
| 40 | pub only_utf8: bool, |
| 41 | /// When true, this program uses byte range instructions instead of Unicode |
| 42 | /// range instructions. |
| 43 | pub is_bytes: bool, |
| 44 | /// When true, the program is compiled for DFA matching. For example, this |
| 45 | /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored |
| 46 | /// regexes. |
| 47 | pub is_dfa: bool, |
| 48 | /// When true, the program matches text in reverse (for use only in the |
| 49 | /// DFA). |
| 50 | pub is_reverse: bool, |
| 51 | /// Whether the regex must match from the start of the input. |
| 52 | pub is_anchored_start: bool, |
| 53 | /// Whether the regex must match at the end of the input. |
| 54 | pub is_anchored_end: bool, |
| 55 | /// Whether this program contains a Unicode word boundary instruction. |
| 56 | pub has_unicode_word_boundary: bool, |
| 57 | /// A possibly empty machine for very quickly matching prefix literals. |
| 58 | pub prefixes: LiteralSearcher, |
| 59 | /// A limit on the size of the cache that the DFA is allowed to use while |
| 60 | /// matching. |
| 61 | /// |
| 62 | /// The cache limit specifies approximately how much space we're willing to |
| 63 | /// give to the state cache. Once the state cache exceeds the size, it is |
| 64 | /// wiped and all states must be re-computed. |
| 65 | /// |
| 66 | /// Note that this value does not impact correctness. It can be set to 0 |
| 67 | /// and the DFA will run just fine. (It will only ever store exactly one |
| 68 | /// state in the cache, and will likely run very slowly, but it will work.) |
| 69 | /// |
| 70 | /// Also note that this limit is *per thread of execution*. That is, |
| 71 | /// if the same regex is used to search text across multiple threads |
| 72 | /// simultaneously, then the DFA cache is not shared. Instead, copies are |
| 73 | /// made. |
| 74 | pub dfa_size_limit: usize, |
| 75 | } |
| 76 | |
| 77 | impl Program { |
| 78 | /// Creates an empty instruction sequence. Fields are given default |
| 79 | /// values. |
| 80 | pub fn new() -> Self { |
| 81 | Program { |
| 82 | insts: vec![], |
| 83 | matches: vec![], |
| 84 | captures: vec![], |
| 85 | capture_name_idx: Arc::new(HashMap::new()), |
| 86 | start: 0, |
| 87 | byte_classes: vec![0; 256], |
| 88 | only_utf8: true, |
| 89 | is_bytes: false, |
| 90 | is_dfa: false, |
| 91 | is_reverse: false, |
| 92 | is_anchored_start: false, |
| 93 | is_anchored_end: false, |
| 94 | has_unicode_word_boundary: false, |
| 95 | prefixes: LiteralSearcher::empty(), |
| 96 | dfa_size_limit: 2 * (1 << 20), |
| 97 | } |
| 98 | } |
| 99 | |
| 100 | /// If pc is an index to a no-op instruction (like Save), then return the |
| 101 | /// next pc that is not a no-op instruction. |
| 102 | pub fn skip(&self, mut pc: usize) -> usize { |
| 103 | loop { |
| 104 | match self[pc] { |
| 105 | Inst::Save(ref i) => pc = i.goto, |
| 106 | _ => return pc, |
| 107 | } |
| 108 | } |
| 109 | } |
| 110 | |
| 111 | /// Return true if and only if an execution engine at instruction `pc` will |
| 112 | /// always lead to a match. |
| 113 | pub fn leads_to_match(&self, pc: usize) -> bool { |
| 114 | if self.matches.len() > 1 { |
| 115 | // If we have a regex set, then we have more than one ending |
| 116 | // state, so leading to one of those states is generally |
| 117 | // meaningless. |
| 118 | return false; |
| 119 | } |
| 120 | match self[self.skip(pc)] { |
| 121 | Inst::Match(_) => true, |
| 122 | _ => false, |
| 123 | } |
| 124 | } |
| 125 | |
| 126 | /// Returns true if the current configuration demands that an implicit |
| 127 | /// `.*?` be prepended to the instruction sequence. |
| 128 | pub fn needs_dotstar(&self) -> bool { |
| 129 | self.is_dfa && !self.is_reverse && !self.is_anchored_start |
| 130 | } |
| 131 | |
| 132 | /// Returns true if this program uses Byte instructions instead of |
| 133 | /// Char/Range instructions. |
| 134 | pub fn uses_bytes(&self) -> bool { |
| 135 | self.is_bytes || self.is_dfa |
| 136 | } |
| 137 | |
| 138 | /// Returns true if this program exclusively matches valid UTF-8 bytes. |
| 139 | /// |
| 140 | /// That is, if an invalid UTF-8 byte is seen, then no match is possible. |
| 141 | pub fn only_utf8(&self) -> bool { |
| 142 | self.only_utf8 |
| 143 | } |
| 144 | |
| 145 | /// Return the approximate heap usage of this instruction sequence in |
| 146 | /// bytes. |
| 147 | pub fn approximate_size(&self) -> usize { |
| 148 | // The only instruction that uses heap space is Ranges (for |
| 149 | // Unicode codepoint programs) to store non-overlapping codepoint |
| 150 | // ranges. To keep this operation constant time, we ignore them. |
| 151 | (self.len() * mem::size_of::<Inst>()) |
| 152 | + (self.matches.len() * mem::size_of::<InstPtr>()) |
| 153 | + (self.captures.len() * mem::size_of::<Option<String>>()) |
| 154 | + (self.capture_name_idx.len() |
| 155 | * (mem::size_of::<String>() + mem::size_of::<usize>())) |
| 156 | + (self.byte_classes.len() * mem::size_of::<u8>()) |
| 157 | + self.prefixes.approximate_size() |
| 158 | } |
| 159 | } |
| 160 | |
| 161 | impl Deref for Program { |
| 162 | type Target = [Inst]; |
| 163 | |
| 164 | #[cfg_attr(feature = "perf-inline", inline(always))] |
| 165 | fn deref(&self) -> &Self::Target { |
| 166 | &*self.insts |
| 167 | } |
| 168 | } |
| 169 | |
| 170 | impl fmt::Debug for Program { |
| 171 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| 172 | use self::Inst::*; |
| 173 | |
| 174 | fn with_goto(cur: usize, goto: usize, fmtd: String) -> String { |
| 175 | if goto == cur + 1 { |
| 176 | fmtd |
| 177 | } else { |
| 178 | format!("{} (goto: {})", fmtd, goto) |
| 179 | } |
| 180 | } |
| 181 | |
| 182 | fn visible_byte(b: u8) -> String { |
| 183 | use std::ascii::escape_default; |
| 184 | let escaped = escape_default(b).collect::<Vec<u8>>(); |
| 185 | String::from_utf8_lossy(&escaped).into_owned() |
| 186 | } |
| 187 | |
| 188 | for (pc, inst) in self.iter().enumerate() { |
| 189 | match *inst { |
| 190 | Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?, |
| 191 | Save(ref inst) => { |
| 192 | let s = format!("{:04} Save({})", pc, inst.slot); |
| 193 | write!(f, "{}", with_goto(pc, inst.goto, s))?; |
| 194 | } |
| 195 | Split(ref inst) => { |
| 196 | write!( |
| 197 | f, |
| 198 | "{:04} Split({}, {})", |
| 199 | pc, inst.goto1, inst.goto2 |
| 200 | )?; |
| 201 | } |
| 202 | EmptyLook(ref inst) => { |
| 203 | let s = format!("{:?}", inst.look); |
| 204 | write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; |
| 205 | } |
| 206 | Char(ref inst) => { |
| 207 | let s = format!("{:?}", inst.c); |
| 208 | write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; |
| 209 | } |
| 210 | Ranges(ref inst) => { |
| 211 | let ranges = inst |
| 212 | .ranges |
| 213 | .iter() |
| 214 | .map(|r| format!("{:?}-{:?}", r.0, r.1)) |
| 215 | .collect::<Vec<String>>() |
| 216 | .join(", "); |
| 217 | write!( |
| 218 | f, |
| 219 | "{:04} {}", |
| 220 | pc, |
| 221 | with_goto(pc, inst.goto, ranges) |
| 222 | )?; |
| 223 | } |
| 224 | Bytes(ref inst) => { |
| 225 | let s = format!( |
| 226 | "Bytes({}, {})", |
| 227 | visible_byte(inst.start), |
| 228 | visible_byte(inst.end) |
| 229 | ); |
| 230 | write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; |
| 231 | } |
| 232 | } |
| 233 | if pc == self.start { |
| 234 | write!(f, " (start)")?; |
| 235 | } |
| 236 | write!(f, "\n")?; |
| 237 | } |
| 238 | Ok(()) |
| 239 | } |
| 240 | } |
| 241 | |
| 242 | impl<'a> IntoIterator for &'a Program { |
| 243 | type Item = &'a Inst; |
| 244 | type IntoIter = slice::Iter<'a, Inst>; |
| 245 | fn into_iter(self) -> Self::IntoIter { |
| 246 | self.iter() |
| 247 | } |
| 248 | } |
| 249 | |
| 250 | /// Inst is an instruction code in a Regex program. |
| 251 | /// |
| 252 | /// Regrettably, a regex program either contains Unicode codepoint |
| 253 | /// instructions (Char and Ranges) or it contains byte instructions (Bytes). |
| 254 | /// A regex program can never contain both. |
| 255 | /// |
| 256 | /// It would be worth investigating splitting this into two distinct types and |
| 257 | /// then figuring out how to make the matching engines polymorphic over those |
| 258 | /// types without sacrificing performance. |
| 259 | /// |
| 260 | /// Other than the benefit of moving invariants into the type system, another |
| 261 | /// benefit is the decreased size. If we remove the `Char` and `Ranges` |
| 262 | /// instructions from the `Inst` enum, then its size shrinks from 40 bytes to |
| 263 | /// 24 bytes. (This is because of the removal of a `Vec` in the `Ranges` |
| 264 | /// variant.) Given that byte based machines are typically much bigger than |
| 265 | /// their Unicode analogues (because they can decode UTF-8 directly), this ends |
| 266 | /// up being a pretty significant savings. |
| 267 | #[derive(Clone, Debug)] |
| 268 | pub enum Inst { |
| 269 | /// Match indicates that the program has reached a match state. |
| 270 | /// |
| 271 | /// The number in the match corresponds to the Nth logical regular |
| 272 | /// expression in this program. This index is always 0 for normal regex |
| 273 | /// programs. Values greater than 0 appear when compiling regex sets, and |
| 274 | /// each match instruction gets its own unique value. The value corresponds |
| 275 | /// to the Nth regex in the set. |
| 276 | Match(usize), |
| 277 | /// Save causes the program to save the current location of the input in |
| 278 | /// the slot indicated by InstSave. |
| 279 | Save(InstSave), |
| 280 | /// Split causes the program to diverge to one of two paths in the |
| 281 | /// program, preferring goto1 in InstSplit. |
| 282 | Split(InstSplit), |
| 283 | /// EmptyLook represents a zero-width assertion in a regex program. A |
| 284 | /// zero-width assertion does not consume any of the input text. |
| 285 | EmptyLook(InstEmptyLook), |
| 286 | /// Char requires the regex program to match the character in InstChar at |
| 287 | /// the current position in the input. |
| 288 | Char(InstChar), |
| 289 | /// Ranges requires the regex program to match the character at the current |
| 290 | /// position in the input with one of the ranges specified in InstRanges. |
| 291 | Ranges(InstRanges), |
| 292 | /// Bytes is like Ranges, except it expresses a single byte range. It is |
| 293 | /// used in conjunction with Split instructions to implement multi-byte |
| 294 | /// character classes. |
| 295 | Bytes(InstBytes), |
| 296 | } |
| 297 | |
| 298 | impl Inst { |
| 299 | /// Returns true if and only if this is a match instruction. |
| 300 | pub fn is_match(&self) -> bool { |
| 301 | match *self { |
| 302 | Inst::Match(_) => true, |
| 303 | _ => false, |
| 304 | } |
| 305 | } |
| 306 | } |
| 307 | |
| 308 | /// Representation of the Save instruction. |
| 309 | #[derive(Clone, Debug)] |
| 310 | pub struct InstSave { |
| 311 | /// The next location to execute in the program. |
| 312 | pub goto: InstPtr, |
| 313 | /// The capture slot (there are two slots for every capture in a regex, |
| 314 | /// including the zeroth capture for the entire match). |
| 315 | pub slot: usize, |
| 316 | } |
| 317 | |
| 318 | /// Representation of the Split instruction. |
| 319 | #[derive(Clone, Debug)] |
| 320 | pub struct InstSplit { |
| 321 | /// The first instruction to try. A match resulting from following goto1 |
| 322 | /// has precedence over a match resulting from following goto2. |
| 323 | pub goto1: InstPtr, |
| 324 | /// The second instruction to try. A match resulting from following goto1 |
| 325 | /// has precedence over a match resulting from following goto2. |
| 326 | pub goto2: InstPtr, |
| 327 | } |
| 328 | |
| 329 | /// Representation of the `EmptyLook` instruction. |
| 330 | #[derive(Clone, Debug)] |
| 331 | pub struct InstEmptyLook { |
| 332 | /// The next location to execute in the program if this instruction |
| 333 | /// succeeds. |
| 334 | pub goto: InstPtr, |
| 335 | /// The type of zero-width assertion to check. |
| 336 | pub look: EmptyLook, |
| 337 | } |
| 338 | |
| 339 | /// The set of zero-width match instructions. |
| 340 | #[derive(Clone, Copy, Debug, PartialEq, Eq)] |
| 341 | pub enum EmptyLook { |
| 342 | /// Start of line or input. |
| 343 | StartLine, |
| 344 | /// End of line or input. |
| 345 | EndLine, |
| 346 | /// Start of input. |
| 347 | StartText, |
| 348 | /// End of input. |
| 349 | EndText, |
| 350 | /// Word character on one side and non-word character on other. |
| 351 | WordBoundary, |
| 352 | /// Word character on both sides or non-word character on both sides. |
| 353 | NotWordBoundary, |
| 354 | /// ASCII word boundary. |
| 355 | WordBoundaryAscii, |
| 356 | /// Not ASCII word boundary. |
| 357 | NotWordBoundaryAscii, |
| 358 | } |
| 359 | |
| 360 | /// Representation of the Char instruction. |
| 361 | #[derive(Clone, Debug)] |
| 362 | pub struct InstChar { |
| 363 | /// The next location to execute in the program if this instruction |
| 364 | /// succeeds. |
| 365 | pub goto: InstPtr, |
| 366 | /// The character to test. |
| 367 | pub c: char, |
| 368 | } |
| 369 | |
| 370 | /// Representation of the Ranges instruction. |
| 371 | #[derive(Clone, Debug)] |
| 372 | pub struct InstRanges { |
| 373 | /// The next location to execute in the program if this instruction |
| 374 | /// succeeds. |
| 375 | pub goto: InstPtr, |
| 376 | /// The set of Unicode scalar value ranges to test. |
| 377 | pub ranges: Vec<(char, char)>, |
| 378 | } |
| 379 | |
| 380 | impl InstRanges { |
| 381 | /// Tests whether the given input character matches this instruction. |
| 382 | pub fn matches(&self, c: Char) -> bool { |
| 383 | // This speeds up the `match_class_unicode` benchmark by checking |
| 384 | // some common cases quickly without binary search. e.g., Matching |
| 385 | // a Unicode class on predominantly ASCII text. |
| 386 | for r in self.ranges.iter().take(4) { |
| 387 | if c < r.0 { |
| 388 | return false; |
| 389 | } |
| 390 | if c <= r.1 { |
| 391 | return true; |
| 392 | } |
| 393 | } |
| 394 | self.ranges |
| 395 | .binary_search_by(|r| { |
| 396 | if r.1 < c { |
| 397 | Ordering::Less |
| 398 | } else if r.0 > c { |
| 399 | Ordering::Greater |
| 400 | } else { |
| 401 | Ordering::Equal |
| 402 | } |
| 403 | }) |
| 404 | .is_ok() |
| 405 | } |
| 406 | |
| 407 | /// Return the number of distinct characters represented by all of the |
| 408 | /// ranges. |
| 409 | pub fn num_chars(&self) -> usize { |
| 410 | self.ranges |
| 411 | .iter() |
| 412 | .map(|&(s, e)| 1 + (e as u32) - (s as u32)) |
| 413 | .sum::<u32>() as usize |
| 414 | } |
| 415 | } |
| 416 | |
| 417 | /// Representation of the Bytes instruction. |
| 418 | #[derive(Clone, Debug)] |
| 419 | pub struct InstBytes { |
| 420 | /// The next location to execute in the program if this instruction |
| 421 | /// succeeds. |
| 422 | pub goto: InstPtr, |
| 423 | /// The start (inclusive) of this byte range. |
| 424 | pub start: u8, |
| 425 | /// The end (inclusive) of this byte range. |
| 426 | pub end: u8, |
| 427 | } |
| 428 | |
| 429 | impl InstBytes { |
| 430 | /// Returns true if and only if the given byte is in this range. |
| 431 | pub fn matches(&self, byte: u8) -> bool { |
| 432 | self.start <= byte && byte <= self.end |
| 433 | } |
| 434 | } |