Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame^] | 1 | use std::char; |
| 2 | use std::cmp::Ordering; |
| 3 | use std::fmt; |
| 4 | use std::ops; |
| 5 | use std::u32; |
| 6 | |
| 7 | use syntax; |
| 8 | |
| 9 | use literal::LiteralSearcher; |
| 10 | use prog::InstEmptyLook; |
| 11 | use utf8::{decode_last_utf8, decode_utf8}; |
| 12 | |
| 13 | /// Represents a location in the input. |
| 14 | #[derive(Clone, Copy, Debug)] |
| 15 | pub struct InputAt { |
| 16 | pos: usize, |
| 17 | c: Char, |
| 18 | byte: Option<u8>, |
| 19 | len: usize, |
| 20 | } |
| 21 | |
| 22 | impl InputAt { |
| 23 | /// Returns true iff this position is at the beginning of the input. |
| 24 | pub fn is_start(&self) -> bool { |
| 25 | self.pos == 0 |
| 26 | } |
| 27 | |
| 28 | /// Returns true iff this position is past the end of the input. |
| 29 | pub fn is_end(&self) -> bool { |
| 30 | self.c.is_none() && self.byte.is_none() |
| 31 | } |
| 32 | |
| 33 | /// Returns the character at this position. |
| 34 | /// |
| 35 | /// If this position is just before or after the input, then an absent |
| 36 | /// character is returned. |
| 37 | pub fn char(&self) -> Char { |
| 38 | self.c |
| 39 | } |
| 40 | |
| 41 | /// Returns the byte at this position. |
| 42 | pub fn byte(&self) -> Option<u8> { |
| 43 | self.byte |
| 44 | } |
| 45 | |
| 46 | /// Returns the UTF-8 width of the character at this position. |
| 47 | pub fn len(&self) -> usize { |
| 48 | self.len |
| 49 | } |
| 50 | |
| 51 | /// Returns whether the UTF-8 width of the character at this position |
| 52 | /// is zero. |
| 53 | pub fn is_empty(&self) -> bool { |
| 54 | self.len == 0 |
| 55 | } |
| 56 | |
| 57 | /// Returns the byte offset of this position. |
| 58 | pub fn pos(&self) -> usize { |
| 59 | self.pos |
| 60 | } |
| 61 | |
| 62 | /// Returns the byte offset of the next position in the input. |
| 63 | pub fn next_pos(&self) -> usize { |
| 64 | self.pos + self.len |
| 65 | } |
| 66 | } |
| 67 | |
| 68 | /// An abstraction over input used in the matching engines. |
| 69 | pub trait Input: fmt::Debug { |
| 70 | /// Return an encoding of the position at byte offset `i`. |
| 71 | fn at(&self, i: usize) -> InputAt; |
| 72 | |
| 73 | /// Return the Unicode character occurring next to `at`. |
| 74 | /// |
| 75 | /// If no such character could be decoded, then `Char` is absent. |
| 76 | fn next_char(&self, at: InputAt) -> Char; |
| 77 | |
| 78 | /// Return the Unicode character occurring previous to `at`. |
| 79 | /// |
| 80 | /// If no such character could be decoded, then `Char` is absent. |
| 81 | fn previous_char(&self, at: InputAt) -> Char; |
| 82 | |
| 83 | /// Return true if the given empty width instruction matches at the |
| 84 | /// input position given. |
| 85 | fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool; |
| 86 | |
| 87 | /// Scan the input for a matching prefix. |
| 88 | fn prefix_at( |
| 89 | &self, |
| 90 | prefixes: &LiteralSearcher, |
| 91 | at: InputAt, |
| 92 | ) -> Option<InputAt>; |
| 93 | |
| 94 | /// The number of bytes in the input. |
| 95 | fn len(&self) -> usize; |
| 96 | |
| 97 | /// Whether the input is empty. |
| 98 | fn is_empty(&self) -> bool { |
| 99 | self.len() == 0 |
| 100 | } |
| 101 | |
| 102 | /// Return the given input as a sequence of bytes. |
| 103 | fn as_bytes(&self) -> &[u8]; |
| 104 | } |
| 105 | |
| 106 | impl<'a, T: Input> Input for &'a T { |
| 107 | fn at(&self, i: usize) -> InputAt { |
| 108 | (**self).at(i) |
| 109 | } |
| 110 | |
| 111 | fn next_char(&self, at: InputAt) -> Char { |
| 112 | (**self).next_char(at) |
| 113 | } |
| 114 | |
| 115 | fn previous_char(&self, at: InputAt) -> Char { |
| 116 | (**self).previous_char(at) |
| 117 | } |
| 118 | |
| 119 | fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { |
| 120 | (**self).is_empty_match(at, empty) |
| 121 | } |
| 122 | |
| 123 | fn prefix_at( |
| 124 | &self, |
| 125 | prefixes: &LiteralSearcher, |
| 126 | at: InputAt, |
| 127 | ) -> Option<InputAt> { |
| 128 | (**self).prefix_at(prefixes, at) |
| 129 | } |
| 130 | |
| 131 | fn len(&self) -> usize { |
| 132 | (**self).len() |
| 133 | } |
| 134 | |
| 135 | fn as_bytes(&self) -> &[u8] { |
| 136 | (**self).as_bytes() |
| 137 | } |
| 138 | } |
| 139 | |
| 140 | /// An input reader over characters. |
| 141 | #[derive(Clone, Copy, Debug)] |
| 142 | pub struct CharInput<'t>(&'t [u8]); |
| 143 | |
| 144 | impl<'t> CharInput<'t> { |
| 145 | /// Return a new character input reader for the given string. |
| 146 | pub fn new(s: &'t [u8]) -> CharInput<'t> { |
| 147 | CharInput(s) |
| 148 | } |
| 149 | } |
| 150 | |
| 151 | impl<'t> ops::Deref for CharInput<'t> { |
| 152 | type Target = [u8]; |
| 153 | |
| 154 | fn deref(&self) -> &[u8] { |
| 155 | self.0 |
| 156 | } |
| 157 | } |
| 158 | |
| 159 | impl<'t> Input for CharInput<'t> { |
| 160 | fn at(&self, i: usize) -> InputAt { |
| 161 | if i >= self.len() { |
| 162 | InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } |
| 163 | } else { |
| 164 | let c = decode_utf8(&self[i..]).map(|(c, _)| c).into(); |
| 165 | InputAt { pos: i, c: c, byte: None, len: c.len_utf8() } |
| 166 | } |
| 167 | } |
| 168 | |
| 169 | fn next_char(&self, at: InputAt) -> Char { |
| 170 | at.char() |
| 171 | } |
| 172 | |
| 173 | fn previous_char(&self, at: InputAt) -> Char { |
| 174 | decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() |
| 175 | } |
| 176 | |
| 177 | fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { |
| 178 | use prog::EmptyLook::*; |
| 179 | match empty.look { |
| 180 | StartLine => { |
| 181 | let c = self.previous_char(at); |
| 182 | at.pos() == 0 || c == '\n' |
| 183 | } |
| 184 | EndLine => { |
| 185 | let c = self.next_char(at); |
| 186 | at.pos() == self.len() || c == '\n' |
| 187 | } |
| 188 | StartText => at.pos() == 0, |
| 189 | EndText => at.pos() == self.len(), |
| 190 | WordBoundary => { |
| 191 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
| 192 | c1.is_word_char() != c2.is_word_char() |
| 193 | } |
| 194 | NotWordBoundary => { |
| 195 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
| 196 | c1.is_word_char() == c2.is_word_char() |
| 197 | } |
| 198 | WordBoundaryAscii => { |
| 199 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
| 200 | c1.is_word_byte() != c2.is_word_byte() |
| 201 | } |
| 202 | NotWordBoundaryAscii => { |
| 203 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
| 204 | c1.is_word_byte() == c2.is_word_byte() |
| 205 | } |
| 206 | } |
| 207 | } |
| 208 | |
| 209 | fn prefix_at( |
| 210 | &self, |
| 211 | prefixes: &LiteralSearcher, |
| 212 | at: InputAt, |
| 213 | ) -> Option<InputAt> { |
| 214 | prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) |
| 215 | } |
| 216 | |
| 217 | fn len(&self) -> usize { |
| 218 | self.0.len() |
| 219 | } |
| 220 | |
| 221 | fn as_bytes(&self) -> &[u8] { |
| 222 | self.0 |
| 223 | } |
| 224 | } |
| 225 | |
| 226 | /// An input reader over bytes. |
| 227 | #[derive(Clone, Copy, Debug)] |
| 228 | pub struct ByteInput<'t> { |
| 229 | text: &'t [u8], |
| 230 | only_utf8: bool, |
| 231 | } |
| 232 | |
| 233 | impl<'t> ByteInput<'t> { |
| 234 | /// Return a new byte-based input reader for the given string. |
| 235 | pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> { |
| 236 | ByteInput { text: text, only_utf8: only_utf8 } |
| 237 | } |
| 238 | } |
| 239 | |
| 240 | impl<'t> ops::Deref for ByteInput<'t> { |
| 241 | type Target = [u8]; |
| 242 | |
| 243 | fn deref(&self) -> &[u8] { |
| 244 | self.text |
| 245 | } |
| 246 | } |
| 247 | |
| 248 | impl<'t> Input for ByteInput<'t> { |
| 249 | fn at(&self, i: usize) -> InputAt { |
| 250 | if i >= self.len() { |
| 251 | InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } |
| 252 | } else { |
| 253 | InputAt { |
| 254 | pos: i, |
| 255 | c: None.into(), |
| 256 | byte: self.get(i).cloned(), |
| 257 | len: 1, |
| 258 | } |
| 259 | } |
| 260 | } |
| 261 | |
| 262 | fn next_char(&self, at: InputAt) -> Char { |
| 263 | decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into() |
| 264 | } |
| 265 | |
| 266 | fn previous_char(&self, at: InputAt) -> Char { |
| 267 | decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() |
| 268 | } |
| 269 | |
| 270 | fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { |
| 271 | use prog::EmptyLook::*; |
| 272 | match empty.look { |
| 273 | StartLine => { |
| 274 | let c = self.previous_char(at); |
| 275 | at.pos() == 0 || c == '\n' |
| 276 | } |
| 277 | EndLine => { |
| 278 | let c = self.next_char(at); |
| 279 | at.pos() == self.len() || c == '\n' |
| 280 | } |
| 281 | StartText => at.pos() == 0, |
| 282 | EndText => at.pos() == self.len(), |
| 283 | WordBoundary => { |
| 284 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
| 285 | c1.is_word_char() != c2.is_word_char() |
| 286 | } |
| 287 | NotWordBoundary => { |
| 288 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
| 289 | c1.is_word_char() == c2.is_word_char() |
| 290 | } |
| 291 | WordBoundaryAscii => { |
| 292 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
| 293 | if self.only_utf8 { |
| 294 | // If we must match UTF-8, then we can't match word |
| 295 | // boundaries at invalid UTF-8. |
| 296 | if c1.is_none() && !at.is_start() { |
| 297 | return false; |
| 298 | } |
| 299 | if c2.is_none() && !at.is_end() { |
| 300 | return false; |
| 301 | } |
| 302 | } |
| 303 | c1.is_word_byte() != c2.is_word_byte() |
| 304 | } |
| 305 | NotWordBoundaryAscii => { |
| 306 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
| 307 | if self.only_utf8 { |
| 308 | // If we must match UTF-8, then we can't match word |
| 309 | // boundaries at invalid UTF-8. |
| 310 | if c1.is_none() && !at.is_start() { |
| 311 | return false; |
| 312 | } |
| 313 | if c2.is_none() && !at.is_end() { |
| 314 | return false; |
| 315 | } |
| 316 | } |
| 317 | c1.is_word_byte() == c2.is_word_byte() |
| 318 | } |
| 319 | } |
| 320 | } |
| 321 | |
| 322 | fn prefix_at( |
| 323 | &self, |
| 324 | prefixes: &LiteralSearcher, |
| 325 | at: InputAt, |
| 326 | ) -> Option<InputAt> { |
| 327 | prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) |
| 328 | } |
| 329 | |
| 330 | fn len(&self) -> usize { |
| 331 | self.text.len() |
| 332 | } |
| 333 | |
| 334 | fn as_bytes(&self) -> &[u8] { |
| 335 | self.text |
| 336 | } |
| 337 | } |
| 338 | |
| 339 | /// An inline representation of `Option<char>`. |
| 340 | /// |
| 341 | /// This eliminates the need to do case analysis on `Option<char>` to determine |
| 342 | /// ordinality with other characters. |
| 343 | /// |
| 344 | /// (The `Option<char>` is not related to encoding. Instead, it is used in the |
| 345 | /// matching engines to represent the beginning and ending boundaries of the |
| 346 | /// search text.) |
| 347 | #[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] |
| 348 | pub struct Char(u32); |
| 349 | |
| 350 | impl fmt::Debug for Char { |
| 351 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| 352 | match char::from_u32(self.0) { |
| 353 | None => write!(f, "Empty"), |
| 354 | Some(c) => write!(f, "{:?}", c), |
| 355 | } |
| 356 | } |
| 357 | } |
| 358 | |
| 359 | impl Char { |
| 360 | /// Returns true iff the character is absent. |
| 361 | #[inline] |
| 362 | pub fn is_none(self) -> bool { |
| 363 | self.0 == u32::MAX |
| 364 | } |
| 365 | |
| 366 | /// Returns the length of the character's UTF-8 encoding. |
| 367 | /// |
| 368 | /// If the character is absent, then `1` is returned. |
| 369 | #[inline] |
| 370 | pub fn len_utf8(self) -> usize { |
| 371 | char::from_u32(self.0).map_or(1, |c| c.len_utf8()) |
| 372 | } |
| 373 | |
| 374 | /// Returns true iff the character is a word character. |
| 375 | /// |
| 376 | /// If the character is absent, then false is returned. |
| 377 | pub fn is_word_char(self) -> bool { |
| 378 | // is_word_character can panic if the Unicode data for \w isn't |
| 379 | // available. However, our compiler ensures that if a Unicode word |
| 380 | // boundary is used, then the data must also be available. If it isn't, |
| 381 | // then the compiler returns an error. |
| 382 | char::from_u32(self.0).map_or(false, syntax::is_word_character) |
| 383 | } |
| 384 | |
| 385 | /// Returns true iff the byte is a word byte. |
| 386 | /// |
| 387 | /// If the byte is absent, then false is returned. |
| 388 | pub fn is_word_byte(self) -> bool { |
| 389 | match char::from_u32(self.0) { |
| 390 | Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8), |
| 391 | None | Some(_) => false, |
| 392 | } |
| 393 | } |
| 394 | } |
| 395 | |
| 396 | impl From<char> for Char { |
| 397 | fn from(c: char) -> Char { |
| 398 | Char(c as u32) |
| 399 | } |
| 400 | } |
| 401 | |
| 402 | impl From<Option<char>> for Char { |
| 403 | fn from(c: Option<char>) -> Char { |
| 404 | c.map_or(Char(u32::MAX), |c| c.into()) |
| 405 | } |
| 406 | } |
| 407 | |
| 408 | impl PartialEq<char> for Char { |
| 409 | #[inline] |
| 410 | fn eq(&self, other: &char) -> bool { |
| 411 | self.0 == *other as u32 |
| 412 | } |
| 413 | } |
| 414 | |
| 415 | impl PartialEq<Char> for char { |
| 416 | #[inline] |
| 417 | fn eq(&self, other: &Char) -> bool { |
| 418 | *self as u32 == other.0 |
| 419 | } |
| 420 | } |
| 421 | |
| 422 | impl PartialOrd<char> for Char { |
| 423 | #[inline] |
| 424 | fn partial_cmp(&self, other: &char) -> Option<Ordering> { |
| 425 | self.0.partial_cmp(&(*other as u32)) |
| 426 | } |
| 427 | } |
| 428 | |
| 429 | impl PartialOrd<Char> for char { |
| 430 | #[inline] |
| 431 | fn partial_cmp(&self, other: &Char) -> Option<Ordering> { |
| 432 | (*self as u32).partial_cmp(&other.0) |
| 433 | } |
| 434 | } |