Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 1 | use std::char; |
| 2 | use std::cmp::Ordering; |
| 3 | use std::fmt; |
| 4 | use std::ops; |
| 5 | use std::u32; |
| 6 | |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 7 | use crate::literal::LiteralSearcher; |
| 8 | use crate::prog::InstEmptyLook; |
| 9 | use crate::utf8::{decode_last_utf8, decode_utf8}; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 10 | |
| 11 | /// Represents a location in the input. |
| 12 | #[derive(Clone, Copy, Debug)] |
| 13 | pub struct InputAt { |
| 14 | pos: usize, |
| 15 | c: Char, |
| 16 | byte: Option<u8>, |
| 17 | len: usize, |
| 18 | } |
| 19 | |
| 20 | impl InputAt { |
| 21 | /// Returns true iff this position is at the beginning of the input. |
| 22 | pub fn is_start(&self) -> bool { |
| 23 | self.pos == 0 |
| 24 | } |
| 25 | |
| 26 | /// Returns true iff this position is past the end of the input. |
| 27 | pub fn is_end(&self) -> bool { |
| 28 | self.c.is_none() && self.byte.is_none() |
| 29 | } |
| 30 | |
| 31 | /// Returns the character at this position. |
| 32 | /// |
| 33 | /// If this position is just before or after the input, then an absent |
| 34 | /// character is returned. |
| 35 | pub fn char(&self) -> Char { |
| 36 | self.c |
| 37 | } |
| 38 | |
| 39 | /// Returns the byte at this position. |
| 40 | pub fn byte(&self) -> Option<u8> { |
| 41 | self.byte |
| 42 | } |
| 43 | |
| 44 | /// Returns the UTF-8 width of the character at this position. |
| 45 | pub fn len(&self) -> usize { |
| 46 | self.len |
| 47 | } |
| 48 | |
| 49 | /// Returns whether the UTF-8 width of the character at this position |
| 50 | /// is zero. |
| 51 | pub fn is_empty(&self) -> bool { |
| 52 | self.len == 0 |
| 53 | } |
| 54 | |
| 55 | /// Returns the byte offset of this position. |
| 56 | pub fn pos(&self) -> usize { |
| 57 | self.pos |
| 58 | } |
| 59 | |
| 60 | /// Returns the byte offset of the next position in the input. |
| 61 | pub fn next_pos(&self) -> usize { |
| 62 | self.pos + self.len |
| 63 | } |
| 64 | } |
| 65 | |
| 66 | /// An abstraction over input used in the matching engines. |
| 67 | pub trait Input: fmt::Debug { |
| 68 | /// Return an encoding of the position at byte offset `i`. |
| 69 | fn at(&self, i: usize) -> InputAt; |
| 70 | |
| 71 | /// Return the Unicode character occurring next to `at`. |
| 72 | /// |
| 73 | /// If no such character could be decoded, then `Char` is absent. |
| 74 | fn next_char(&self, at: InputAt) -> Char; |
| 75 | |
| 76 | /// Return the Unicode character occurring previous to `at`. |
| 77 | /// |
| 78 | /// If no such character could be decoded, then `Char` is absent. |
| 79 | fn previous_char(&self, at: InputAt) -> Char; |
| 80 | |
| 81 | /// Return true if the given empty width instruction matches at the |
| 82 | /// input position given. |
| 83 | fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool; |
| 84 | |
| 85 | /// Scan the input for a matching prefix. |
| 86 | fn prefix_at( |
| 87 | &self, |
| 88 | prefixes: &LiteralSearcher, |
| 89 | at: InputAt, |
| 90 | ) -> Option<InputAt>; |
| 91 | |
| 92 | /// The number of bytes in the input. |
| 93 | fn len(&self) -> usize; |
| 94 | |
| 95 | /// Whether the input is empty. |
| 96 | fn is_empty(&self) -> bool { |
| 97 | self.len() == 0 |
| 98 | } |
| 99 | |
| 100 | /// Return the given input as a sequence of bytes. |
| 101 | fn as_bytes(&self) -> &[u8]; |
| 102 | } |
| 103 | |
| 104 | impl<'a, T: Input> Input for &'a T { |
| 105 | fn at(&self, i: usize) -> InputAt { |
| 106 | (**self).at(i) |
| 107 | } |
| 108 | |
| 109 | fn next_char(&self, at: InputAt) -> Char { |
| 110 | (**self).next_char(at) |
| 111 | } |
| 112 | |
| 113 | fn previous_char(&self, at: InputAt) -> Char { |
| 114 | (**self).previous_char(at) |
| 115 | } |
| 116 | |
| 117 | fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { |
| 118 | (**self).is_empty_match(at, empty) |
| 119 | } |
| 120 | |
| 121 | fn prefix_at( |
| 122 | &self, |
| 123 | prefixes: &LiteralSearcher, |
| 124 | at: InputAt, |
| 125 | ) -> Option<InputAt> { |
| 126 | (**self).prefix_at(prefixes, at) |
| 127 | } |
| 128 | |
| 129 | fn len(&self) -> usize { |
| 130 | (**self).len() |
| 131 | } |
| 132 | |
| 133 | fn as_bytes(&self) -> &[u8] { |
| 134 | (**self).as_bytes() |
| 135 | } |
| 136 | } |
| 137 | |
| 138 | /// An input reader over characters. |
| 139 | #[derive(Clone, Copy, Debug)] |
| 140 | pub struct CharInput<'t>(&'t [u8]); |
| 141 | |
| 142 | impl<'t> CharInput<'t> { |
| 143 | /// Return a new character input reader for the given string. |
| 144 | pub fn new(s: &'t [u8]) -> CharInput<'t> { |
| 145 | CharInput(s) |
| 146 | } |
| 147 | } |
| 148 | |
| 149 | impl<'t> ops::Deref for CharInput<'t> { |
| 150 | type Target = [u8]; |
| 151 | |
| 152 | fn deref(&self) -> &[u8] { |
| 153 | self.0 |
| 154 | } |
| 155 | } |
| 156 | |
| 157 | impl<'t> Input for CharInput<'t> { |
| 158 | fn at(&self, i: usize) -> InputAt { |
| 159 | if i >= self.len() { |
| 160 | InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } |
| 161 | } else { |
| 162 | let c = decode_utf8(&self[i..]).map(|(c, _)| c).into(); |
| 163 | InputAt { pos: i, c: c, byte: None, len: c.len_utf8() } |
| 164 | } |
| 165 | } |
| 166 | |
| 167 | fn next_char(&self, at: InputAt) -> Char { |
| 168 | at.char() |
| 169 | } |
| 170 | |
| 171 | fn previous_char(&self, at: InputAt) -> Char { |
| 172 | decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() |
| 173 | } |
| 174 | |
| 175 | fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 176 | use crate::prog::EmptyLook::*; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 177 | match empty.look { |
| 178 | StartLine => { |
| 179 | let c = self.previous_char(at); |
| 180 | at.pos() == 0 || c == '\n' |
| 181 | } |
| 182 | EndLine => { |
| 183 | let c = self.next_char(at); |
| 184 | at.pos() == self.len() || c == '\n' |
| 185 | } |
| 186 | StartText => at.pos() == 0, |
| 187 | EndText => at.pos() == self.len(), |
| 188 | WordBoundary => { |
| 189 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
| 190 | c1.is_word_char() != c2.is_word_char() |
| 191 | } |
| 192 | NotWordBoundary => { |
| 193 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
| 194 | c1.is_word_char() == c2.is_word_char() |
| 195 | } |
| 196 | WordBoundaryAscii => { |
| 197 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
| 198 | c1.is_word_byte() != c2.is_word_byte() |
| 199 | } |
| 200 | NotWordBoundaryAscii => { |
| 201 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
| 202 | c1.is_word_byte() == c2.is_word_byte() |
| 203 | } |
| 204 | } |
| 205 | } |
| 206 | |
| 207 | fn prefix_at( |
| 208 | &self, |
| 209 | prefixes: &LiteralSearcher, |
| 210 | at: InputAt, |
| 211 | ) -> Option<InputAt> { |
| 212 | prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) |
| 213 | } |
| 214 | |
| 215 | fn len(&self) -> usize { |
| 216 | self.0.len() |
| 217 | } |
| 218 | |
| 219 | fn as_bytes(&self) -> &[u8] { |
| 220 | self.0 |
| 221 | } |
| 222 | } |
| 223 | |
| 224 | /// An input reader over bytes. |
| 225 | #[derive(Clone, Copy, Debug)] |
| 226 | pub struct ByteInput<'t> { |
| 227 | text: &'t [u8], |
| 228 | only_utf8: bool, |
| 229 | } |
| 230 | |
| 231 | impl<'t> ByteInput<'t> { |
| 232 | /// Return a new byte-based input reader for the given string. |
| 233 | pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> { |
| 234 | ByteInput { text: text, only_utf8: only_utf8 } |
| 235 | } |
| 236 | } |
| 237 | |
| 238 | impl<'t> ops::Deref for ByteInput<'t> { |
| 239 | type Target = [u8]; |
| 240 | |
| 241 | fn deref(&self) -> &[u8] { |
| 242 | self.text |
| 243 | } |
| 244 | } |
| 245 | |
| 246 | impl<'t> Input for ByteInput<'t> { |
| 247 | fn at(&self, i: usize) -> InputAt { |
| 248 | if i >= self.len() { |
| 249 | InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } |
| 250 | } else { |
| 251 | InputAt { |
| 252 | pos: i, |
| 253 | c: None.into(), |
| 254 | byte: self.get(i).cloned(), |
| 255 | len: 1, |
| 256 | } |
| 257 | } |
| 258 | } |
| 259 | |
| 260 | fn next_char(&self, at: InputAt) -> Char { |
| 261 | decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into() |
| 262 | } |
| 263 | |
| 264 | fn previous_char(&self, at: InputAt) -> Char { |
| 265 | decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() |
| 266 | } |
| 267 | |
| 268 | fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 269 | use crate::prog::EmptyLook::*; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 270 | match empty.look { |
| 271 | StartLine => { |
| 272 | let c = self.previous_char(at); |
| 273 | at.pos() == 0 || c == '\n' |
| 274 | } |
| 275 | EndLine => { |
| 276 | let c = self.next_char(at); |
| 277 | at.pos() == self.len() || c == '\n' |
| 278 | } |
| 279 | StartText => at.pos() == 0, |
| 280 | EndText => at.pos() == self.len(), |
| 281 | WordBoundary => { |
| 282 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
| 283 | c1.is_word_char() != c2.is_word_char() |
| 284 | } |
| 285 | NotWordBoundary => { |
| 286 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
| 287 | c1.is_word_char() == c2.is_word_char() |
| 288 | } |
| 289 | WordBoundaryAscii => { |
| 290 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
| 291 | if self.only_utf8 { |
| 292 | // If we must match UTF-8, then we can't match word |
| 293 | // boundaries at invalid UTF-8. |
| 294 | if c1.is_none() && !at.is_start() { |
| 295 | return false; |
| 296 | } |
| 297 | if c2.is_none() && !at.is_end() { |
| 298 | return false; |
| 299 | } |
| 300 | } |
| 301 | c1.is_word_byte() != c2.is_word_byte() |
| 302 | } |
| 303 | NotWordBoundaryAscii => { |
| 304 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
| 305 | if self.only_utf8 { |
| 306 | // If we must match UTF-8, then we can't match word |
| 307 | // boundaries at invalid UTF-8. |
| 308 | if c1.is_none() && !at.is_start() { |
| 309 | return false; |
| 310 | } |
| 311 | if c2.is_none() && !at.is_end() { |
| 312 | return false; |
| 313 | } |
| 314 | } |
| 315 | c1.is_word_byte() == c2.is_word_byte() |
| 316 | } |
| 317 | } |
| 318 | } |
| 319 | |
| 320 | fn prefix_at( |
| 321 | &self, |
| 322 | prefixes: &LiteralSearcher, |
| 323 | at: InputAt, |
| 324 | ) -> Option<InputAt> { |
| 325 | prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) |
| 326 | } |
| 327 | |
| 328 | fn len(&self) -> usize { |
| 329 | self.text.len() |
| 330 | } |
| 331 | |
| 332 | fn as_bytes(&self) -> &[u8] { |
| 333 | self.text |
| 334 | } |
| 335 | } |
| 336 | |
| 337 | /// An inline representation of `Option<char>`. |
| 338 | /// |
| 339 | /// This eliminates the need to do case analysis on `Option<char>` to determine |
| 340 | /// ordinality with other characters. |
| 341 | /// |
| 342 | /// (The `Option<char>` is not related to encoding. Instead, it is used in the |
| 343 | /// matching engines to represent the beginning and ending boundaries of the |
| 344 | /// search text.) |
| 345 | #[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] |
| 346 | pub struct Char(u32); |
| 347 | |
| 348 | impl fmt::Debug for Char { |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 349 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 350 | match char::from_u32(self.0) { |
| 351 | None => write!(f, "Empty"), |
| 352 | Some(c) => write!(f, "{:?}", c), |
| 353 | } |
| 354 | } |
| 355 | } |
| 356 | |
| 357 | impl Char { |
| 358 | /// Returns true iff the character is absent. |
| 359 | #[inline] |
| 360 | pub fn is_none(self) -> bool { |
| 361 | self.0 == u32::MAX |
| 362 | } |
| 363 | |
| 364 | /// Returns the length of the character's UTF-8 encoding. |
| 365 | /// |
| 366 | /// If the character is absent, then `1` is returned. |
| 367 | #[inline] |
| 368 | pub fn len_utf8(self) -> usize { |
| 369 | char::from_u32(self.0).map_or(1, |c| c.len_utf8()) |
| 370 | } |
| 371 | |
| 372 | /// Returns true iff the character is a word character. |
| 373 | /// |
| 374 | /// If the character is absent, then false is returned. |
| 375 | pub fn is_word_char(self) -> bool { |
| 376 | // is_word_character can panic if the Unicode data for \w isn't |
| 377 | // available. However, our compiler ensures that if a Unicode word |
| 378 | // boundary is used, then the data must also be available. If it isn't, |
| 379 | // then the compiler returns an error. |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 380 | char::from_u32(self.0).map_or(false, regex_syntax::is_word_character) |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 381 | } |
| 382 | |
| 383 | /// Returns true iff the byte is a word byte. |
| 384 | /// |
| 385 | /// If the byte is absent, then false is returned. |
| 386 | pub fn is_word_byte(self) -> bool { |
| 387 | match char::from_u32(self.0) { |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 388 | Some(c) if c <= '\u{7F}' => regex_syntax::is_word_byte(c as u8), |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 389 | None | Some(_) => false, |
| 390 | } |
| 391 | } |
| 392 | } |
| 393 | |
| 394 | impl From<char> for Char { |
| 395 | fn from(c: char) -> Char { |
| 396 | Char(c as u32) |
| 397 | } |
| 398 | } |
| 399 | |
| 400 | impl From<Option<char>> for Char { |
| 401 | fn from(c: Option<char>) -> Char { |
| 402 | c.map_or(Char(u32::MAX), |c| c.into()) |
| 403 | } |
| 404 | } |
| 405 | |
| 406 | impl PartialEq<char> for Char { |
| 407 | #[inline] |
| 408 | fn eq(&self, other: &char) -> bool { |
| 409 | self.0 == *other as u32 |
| 410 | } |
| 411 | } |
| 412 | |
| 413 | impl PartialEq<Char> for char { |
| 414 | #[inline] |
| 415 | fn eq(&self, other: &Char) -> bool { |
| 416 | *self as u32 == other.0 |
| 417 | } |
| 418 | } |
| 419 | |
| 420 | impl PartialOrd<char> for Char { |
| 421 | #[inline] |
| 422 | fn partial_cmp(&self, other: &char) -> Option<Ordering> { |
| 423 | self.0.partial_cmp(&(*other as u32)) |
| 424 | } |
| 425 | } |
| 426 | |
| 427 | impl PartialOrd<Char> for char { |
| 428 | #[inline] |
| 429 | fn partial_cmp(&self, other: &Char) -> Option<Ordering> { |
| 430 | (*self as u32).partial_cmp(&other.0) |
| 431 | } |
| 432 | } |