Blame - src/input.rs - platform/external/rust/crates/regex

blob: 5d50ee34094d1a05161d1ff62173c1959e5b7a90 [file] [log] [blame]

Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	1	use std::char;
				2	use std::cmp::Ordering;
				3	use std::fmt;
				4	use std::ops;
				5	use std::u32;
				6
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	7	use crate::literal::LiteralSearcher;
				8	use crate::prog::InstEmptyLook;
				9	use crate::utf8::{decode_last_utf8, decode_utf8};
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	10
				11	/// Represents a location in the input.
				12	#[derive(Clone, Copy, Debug)]
				13	pub struct InputAt {
				14	pos: usize,
				15	c: Char,
				16	byte: Option<u8>,
				17	len: usize,
				18	}
				19
				20	impl InputAt {
				21	/// Returns true iff this position is at the beginning of the input.
				22	pub fn is_start(&self) -> bool {
				23	self.pos == 0
				24	}
				25
				26	/// Returns true iff this position is past the end of the input.
				27	pub fn is_end(&self) -> bool {
				28	self.c.is_none() && self.byte.is_none()
				29	}
				30
				31	/// Returns the character at this position.
				32	///
				33	/// If this position is just before or after the input, then an absent
				34	/// character is returned.
				35	pub fn char(&self) -> Char {
				36	self.c
				37	}
				38
				39	/// Returns the byte at this position.
				40	pub fn byte(&self) -> Option<u8> {
				41	self.byte
				42	}
				43
				44	/// Returns the UTF-8 width of the character at this position.
				45	pub fn len(&self) -> usize {
				46	self.len
				47	}
				48
				49	/// Returns whether the UTF-8 width of the character at this position
				50	/// is zero.
				51	pub fn is_empty(&self) -> bool {
				52	self.len == 0
				53	}
				54
				55	/// Returns the byte offset of this position.
				56	pub fn pos(&self) -> usize {
				57	self.pos
				58	}
				59
				60	/// Returns the byte offset of the next position in the input.
				61	pub fn next_pos(&self) -> usize {
				62	self.pos + self.len
				63	}
				64	}
				65
				66	/// An abstraction over input used in the matching engines.
				67	pub trait Input: fmt::Debug {
				68	/// Return an encoding of the position at byte offset `i`.
				69	fn at(&self, i: usize) -> InputAt;
				70
				71	/// Return the Unicode character occurring next to `at`.
				72	///
				73	/// If no such character could be decoded, then `Char` is absent.
				74	fn next_char(&self, at: InputAt) -> Char;
				75
				76	/// Return the Unicode character occurring previous to `at`.
				77	///
				78	/// If no such character could be decoded, then `Char` is absent.
				79	fn previous_char(&self, at: InputAt) -> Char;
				80
				81	/// Return true if the given empty width instruction matches at the
				82	/// input position given.
				83	fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
				84
				85	/// Scan the input for a matching prefix.
				86	fn prefix_at(
				87	&self,
				88	prefixes: &LiteralSearcher,
				89	at: InputAt,
				90	) -> Option<InputAt>;
				91
				92	/// The number of bytes in the input.
				93	fn len(&self) -> usize;
				94
				95	/// Whether the input is empty.
				96	fn is_empty(&self) -> bool {
				97	self.len() == 0
				98	}
				99
				100	/// Return the given input as a sequence of bytes.
				101	fn as_bytes(&self) -> &[u8];
				102	}
				103
				104	impl<'a, T: Input> Input for &'a T {
				105	fn at(&self, i: usize) -> InputAt {
				106	(**self).at(i)
				107	}
				108
				109	fn next_char(&self, at: InputAt) -> Char {
				110	(**self).next_char(at)
				111	}
				112
				113	fn previous_char(&self, at: InputAt) -> Char {
				114	(**self).previous_char(at)
				115	}
				116
				117	fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
				118	(**self).is_empty_match(at, empty)
				119	}
				120
				121	fn prefix_at(
				122	&self,
				123	prefixes: &LiteralSearcher,
				124	at: InputAt,
				125	) -> Option<InputAt> {
				126	(**self).prefix_at(prefixes, at)
				127	}
				128
				129	fn len(&self) -> usize {
				130	(**self).len()
				131	}
				132
				133	fn as_bytes(&self) -> &[u8] {
				134	(**self).as_bytes()
				135	}
				136	}
				137
				138	/// An input reader over characters.
				139	#[derive(Clone, Copy, Debug)]
				140	pub struct CharInput<'t>(&'t [u8]);
				141
				142	impl<'t> CharInput<'t> {
				143	/// Return a new character input reader for the given string.
				144	pub fn new(s: &'t [u8]) -> CharInput<'t> {
				145	CharInput(s)
				146	}
				147	}
				148
				149	impl<'t> ops::Deref for CharInput<'t> {
				150	type Target = [u8];
				151
				152	fn deref(&self) -> &[u8] {
				153	self.0
				154	}
				155	}
				156
				157	impl<'t> Input for CharInput<'t> {
				158	fn at(&self, i: usize) -> InputAt {
				159	if i >= self.len() {
				160	InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
				161	} else {
				162	let c = decode_utf8(&self[i..]).map(\|(c, _)\| c).into();
				163	InputAt { pos: i, c: c, byte: None, len: c.len_utf8() }
				164	}
				165	}
				166
				167	fn next_char(&self, at: InputAt) -> Char {
				168	at.char()
				169	}
				170
				171	fn previous_char(&self, at: InputAt) -> Char {
				172	decode_last_utf8(&self[..at.pos()]).map(\|(c, _)\| c).into()
				173	}
				174
				175	fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	176	use crate::prog::EmptyLook::*;
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	177	match empty.look {
				178	StartLine => {
				179	let c = self.previous_char(at);
				180	at.pos() == 0 \|\| c == '\n'
				181	}
				182	EndLine => {
				183	let c = self.next_char(at);
				184	at.pos() == self.len() \|\| c == '\n'
				185	}
				186	StartText => at.pos() == 0,
				187	EndText => at.pos() == self.len(),
				188	WordBoundary => {
				189	let (c1, c2) = (self.previous_char(at), self.next_char(at));
				190	c1.is_word_char() != c2.is_word_char()
				191	}
				192	NotWordBoundary => {
				193	let (c1, c2) = (self.previous_char(at), self.next_char(at));
				194	c1.is_word_char() == c2.is_word_char()
				195	}
				196	WordBoundaryAscii => {
				197	let (c1, c2) = (self.previous_char(at), self.next_char(at));
				198	c1.is_word_byte() != c2.is_word_byte()
				199	}
				200	NotWordBoundaryAscii => {
				201	let (c1, c2) = (self.previous_char(at), self.next_char(at));
				202	c1.is_word_byte() == c2.is_word_byte()
				203	}
				204	}
				205	}
				206
				207	fn prefix_at(
				208	&self,
				209	prefixes: &LiteralSearcher,
				210	at: InputAt,
				211	) -> Option<InputAt> {
				212	prefixes.find(&self[at.pos()..]).map(\|(s, _)\| self.at(at.pos() + s))
				213	}
				214
				215	fn len(&self) -> usize {
				216	self.0.len()
				217	}
				218
				219	fn as_bytes(&self) -> &[u8] {
				220	self.0
				221	}
				222	}
				223
				224	/// An input reader over bytes.
				225	#[derive(Clone, Copy, Debug)]
				226	pub struct ByteInput<'t> {
				227	text: &'t [u8],
				228	only_utf8: bool,
				229	}
				230
				231	impl<'t> ByteInput<'t> {
				232	/// Return a new byte-based input reader for the given string.
				233	pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
				234	ByteInput { text: text, only_utf8: only_utf8 }
				235	}
				236	}
				237
				238	impl<'t> ops::Deref for ByteInput<'t> {
				239	type Target = [u8];
				240
				241	fn deref(&self) -> &[u8] {
				242	self.text
				243	}
				244	}
				245
				246	impl<'t> Input for ByteInput<'t> {
				247	fn at(&self, i: usize) -> InputAt {
				248	if i >= self.len() {
				249	InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
				250	} else {
				251	InputAt {
				252	pos: i,
				253	c: None.into(),
				254	byte: self.get(i).cloned(),
				255	len: 1,
				256	}
				257	}
				258	}
				259
				260	fn next_char(&self, at: InputAt) -> Char {
				261	decode_utf8(&self[at.pos()..]).map(\|(c, _)\| c).into()
				262	}
				263
				264	fn previous_char(&self, at: InputAt) -> Char {
				265	decode_last_utf8(&self[..at.pos()]).map(\|(c, _)\| c).into()
				266	}
				267
				268	fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	269	use crate::prog::EmptyLook::*;
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	270	match empty.look {
				271	StartLine => {
				272	let c = self.previous_char(at);
				273	at.pos() == 0 \|\| c == '\n'
				274	}
				275	EndLine => {
				276	let c = self.next_char(at);
				277	at.pos() == self.len() \|\| c == '\n'
				278	}
				279	StartText => at.pos() == 0,
				280	EndText => at.pos() == self.len(),
				281	WordBoundary => {
				282	let (c1, c2) = (self.previous_char(at), self.next_char(at));
				283	c1.is_word_char() != c2.is_word_char()
				284	}
				285	NotWordBoundary => {
				286	let (c1, c2) = (self.previous_char(at), self.next_char(at));
				287	c1.is_word_char() == c2.is_word_char()
				288	}
				289	WordBoundaryAscii => {
				290	let (c1, c2) = (self.previous_char(at), self.next_char(at));
				291	if self.only_utf8 {
				292	// If we must match UTF-8, then we can't match word
				293	// boundaries at invalid UTF-8.
				294	if c1.is_none() && !at.is_start() {
				295	return false;
				296	}
				297	if c2.is_none() && !at.is_end() {
				298	return false;
				299	}
				300	}
				301	c1.is_word_byte() != c2.is_word_byte()
				302	}
				303	NotWordBoundaryAscii => {
				304	let (c1, c2) = (self.previous_char(at), self.next_char(at));
				305	if self.only_utf8 {
				306	// If we must match UTF-8, then we can't match word
				307	// boundaries at invalid UTF-8.
				308	if c1.is_none() && !at.is_start() {
				309	return false;
				310	}
				311	if c2.is_none() && !at.is_end() {
				312	return false;
				313	}
				314	}
				315	c1.is_word_byte() == c2.is_word_byte()
				316	}
				317	}
				318	}
				319
				320	fn prefix_at(
				321	&self,
				322	prefixes: &LiteralSearcher,
				323	at: InputAt,
				324	) -> Option<InputAt> {
				325	prefixes.find(&self[at.pos()..]).map(\|(s, _)\| self.at(at.pos() + s))
				326	}
				327
				328	fn len(&self) -> usize {
				329	self.text.len()
				330	}
				331
				332	fn as_bytes(&self) -> &[u8] {
				333	self.text
				334	}
				335	}
				336
				337	/// An inline representation of `Option<char>`.
				338	///
				339	/// This eliminates the need to do case analysis on `Option<char>` to determine
				340	/// ordinality with other characters.
				341	///
				342	/// (The `Option<char>` is not related to encoding. Instead, it is used in the
				343	/// matching engines to represent the beginning and ending boundaries of the
				344	/// search text.)
				345	#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
				346	pub struct Char(u32);
				347
				348	impl fmt::Debug for Char {
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	349	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	350	match char::from_u32(self.0) {
				351	None => write!(f, "Empty"),
				352	Some(c) => write!(f, "{:?}", c),
				353	}
				354	}
				355	}
				356
				357	impl Char {
				358	/// Returns true iff the character is absent.
				359	#[inline]
				360	pub fn is_none(self) -> bool {
				361	self.0 == u32::MAX
				362	}
				363
				364	/// Returns the length of the character's UTF-8 encoding.
				365	///
				366	/// If the character is absent, then `1` is returned.
				367	#[inline]
				368	pub fn len_utf8(self) -> usize {
				369	char::from_u32(self.0).map_or(1, \|c\| c.len_utf8())
				370	}
				371
				372	/// Returns true iff the character is a word character.
				373	///
				374	/// If the character is absent, then false is returned.
				375	pub fn is_word_char(self) -> bool {
				376	// is_word_character can panic if the Unicode data for \w isn't
				377	// available. However, our compiler ensures that if a Unicode word
				378	// boundary is used, then the data must also be available. If it isn't,
				379	// then the compiler returns an error.
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	380	char::from_u32(self.0).map_or(false, regex_syntax::is_word_character)
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	381	}
				382
				383	/// Returns true iff the byte is a word byte.
				384	///
				385	/// If the byte is absent, then false is returned.
				386	pub fn is_word_byte(self) -> bool {
				387	match char::from_u32(self.0) {
Joel Galenson	3874808	2021-05-19 16:51:51 -0700	[diff] [blame^]	388	Some(c) if c <= '\u{7F}' => regex_syntax::is_word_byte(c as u8),
Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame]	389	None \| Some(_) => false,
				390	}
				391	}
				392	}
				393
				394	impl From<char> for Char {
				395	fn from(c: char) -> Char {
				396	Char(c as u32)
				397	}
				398	}
				399
				400	impl From<Option<char>> for Char {
				401	fn from(c: Option<char>) -> Char {
				402	c.map_or(Char(u32::MAX), \|c\| c.into())
				403	}
				404	}
				405
				406	impl PartialEq<char> for Char {
				407	#[inline]
				408	fn eq(&self, other: &char) -> bool {
				409	self.0 == *other as u32
				410	}
				411	}
				412
				413	impl PartialEq<Char> for char {
				414	#[inline]
				415	fn eq(&self, other: &Char) -> bool {
				416	*self as u32 == other.0
				417	}
				418	}
				419
				420	impl PartialOrd<char> for Char {
				421	#[inline]
				422	fn partial_cmp(&self, other: &char) -> Option<Ordering> {
				423	self.0.partial_cmp(&(*other as u32))
				424	}
				425	}
				426
				427	impl PartialOrd<Char> for char {
				428	#[inline]
				429	fn partial_cmp(&self, other: &Char) -> Option<Ordering> {
				430	(*self as u32).partial_cmp(&other.0)
				431	}
				432	}