Blame - src/input.rs - platform/external/rust/crates/regex

blob: 3afa2d0f6c955208b21eea4cee985c263d43288d [file] [log] [blame]

Chih-Hung Hsieh	e42c505	2020-04-16 10:44:21 -0700	[diff] [blame^]	1	use std::char;
				2	use std::cmp::Ordering;
				3	use std::fmt;
				4	use std::ops;
				5	use std::u32;
				6
				7	use syntax;
				8
				9	use literal::LiteralSearcher;
				10	use prog::InstEmptyLook;
				11	use utf8::{decode_last_utf8, decode_utf8};
				12
				13	/// Represents a location in the input.
				14	#[derive(Clone, Copy, Debug)]
				15	pub struct InputAt {
				16	pos: usize,
				17	c: Char,
				18	byte: Option<u8>,
				19	len: usize,
				20	}
				21
				22	impl InputAt {
				23	/// Returns true iff this position is at the beginning of the input.
				24	pub fn is_start(&self) -> bool {
				25	self.pos == 0
				26	}
				27
				28	/// Returns true iff this position is past the end of the input.
				29	pub fn is_end(&self) -> bool {
				30	self.c.is_none() && self.byte.is_none()
				31	}
				32
				33	/// Returns the character at this position.
				34	///
				35	/// If this position is just before or after the input, then an absent
				36	/// character is returned.
				37	pub fn char(&self) -> Char {
				38	self.c
				39	}
				40
				41	/// Returns the byte at this position.
				42	pub fn byte(&self) -> Option<u8> {
				43	self.byte
				44	}
				45
				46	/// Returns the UTF-8 width of the character at this position.
				47	pub fn len(&self) -> usize {
				48	self.len
				49	}
				50
				51	/// Returns whether the UTF-8 width of the character at this position
				52	/// is zero.
				53	pub fn is_empty(&self) -> bool {
				54	self.len == 0
				55	}
				56
				57	/// Returns the byte offset of this position.
				58	pub fn pos(&self) -> usize {
				59	self.pos
				60	}
				61
				62	/// Returns the byte offset of the next position in the input.
				63	pub fn next_pos(&self) -> usize {
				64	self.pos + self.len
				65	}
				66	}
				67
				68	/// An abstraction over input used in the matching engines.
				69	pub trait Input: fmt::Debug {
				70	/// Return an encoding of the position at byte offset `i`.
				71	fn at(&self, i: usize) -> InputAt;
				72
				73	/// Return the Unicode character occurring next to `at`.
				74	///
				75	/// If no such character could be decoded, then `Char` is absent.
				76	fn next_char(&self, at: InputAt) -> Char;
				77
				78	/// Return the Unicode character occurring previous to `at`.
				79	///
				80	/// If no such character could be decoded, then `Char` is absent.
				81	fn previous_char(&self, at: InputAt) -> Char;
				82
				83	/// Return true if the given empty width instruction matches at the
				84	/// input position given.
				85	fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
				86
				87	/// Scan the input for a matching prefix.
				88	fn prefix_at(
				89	&self,
				90	prefixes: &LiteralSearcher,
				91	at: InputAt,
				92	) -> Option<InputAt>;
				93
				94	/// The number of bytes in the input.
				95	fn len(&self) -> usize;
				96
				97	/// Whether the input is empty.
				98	fn is_empty(&self) -> bool {
				99	self.len() == 0
				100	}
				101
				102	/// Return the given input as a sequence of bytes.
				103	fn as_bytes(&self) -> &[u8];
				104	}
				105
				106	impl<'a, T: Input> Input for &'a T {
				107	fn at(&self, i: usize) -> InputAt {
				108	(**self).at(i)
				109	}
				110
				111	fn next_char(&self, at: InputAt) -> Char {
				112	(**self).next_char(at)
				113	}
				114
				115	fn previous_char(&self, at: InputAt) -> Char {
				116	(**self).previous_char(at)
				117	}
				118
				119	fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
				120	(**self).is_empty_match(at, empty)
				121	}
				122
				123	fn prefix_at(
				124	&self,
				125	prefixes: &LiteralSearcher,
				126	at: InputAt,
				127	) -> Option<InputAt> {
				128	(**self).prefix_at(prefixes, at)
				129	}
				130
				131	fn len(&self) -> usize {
				132	(**self).len()
				133	}
				134
				135	fn as_bytes(&self) -> &[u8] {
				136	(**self).as_bytes()
				137	}
				138	}
				139
				140	/// An input reader over characters.
				141	#[derive(Clone, Copy, Debug)]
				142	pub struct CharInput<'t>(&'t [u8]);
				143
				144	impl<'t> CharInput<'t> {
				145	/// Return a new character input reader for the given string.
				146	pub fn new(s: &'t [u8]) -> CharInput<'t> {
				147	CharInput(s)
				148	}
				149	}
				150
				151	impl<'t> ops::Deref for CharInput<'t> {
				152	type Target = [u8];
				153
				154	fn deref(&self) -> &[u8] {
				155	self.0
				156	}
				157	}
				158
				159	impl<'t> Input for CharInput<'t> {
				160	fn at(&self, i: usize) -> InputAt {
				161	if i >= self.len() {
				162	InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
				163	} else {
				164	let c = decode_utf8(&self[i..]).map(\|(c, _)\| c).into();
				165	InputAt { pos: i, c: c, byte: None, len: c.len_utf8() }
				166	}
				167	}
				168
				169	fn next_char(&self, at: InputAt) -> Char {
				170	at.char()
				171	}
				172
				173	fn previous_char(&self, at: InputAt) -> Char {
				174	decode_last_utf8(&self[..at.pos()]).map(\|(c, _)\| c).into()
				175	}
				176
				177	fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
				178	use prog::EmptyLook::*;
				179	match empty.look {
				180	StartLine => {
				181	let c = self.previous_char(at);
				182	at.pos() == 0 \|\| c == '\n'
				183	}
				184	EndLine => {
				185	let c = self.next_char(at);
				186	at.pos() == self.len() \|\| c == '\n'
				187	}
				188	StartText => at.pos() == 0,
				189	EndText => at.pos() == self.len(),
				190	WordBoundary => {
				191	let (c1, c2) = (self.previous_char(at), self.next_char(at));
				192	c1.is_word_char() != c2.is_word_char()
				193	}
				194	NotWordBoundary => {
				195	let (c1, c2) = (self.previous_char(at), self.next_char(at));
				196	c1.is_word_char() == c2.is_word_char()
				197	}
				198	WordBoundaryAscii => {
				199	let (c1, c2) = (self.previous_char(at), self.next_char(at));
				200	c1.is_word_byte() != c2.is_word_byte()
				201	}
				202	NotWordBoundaryAscii => {
				203	let (c1, c2) = (self.previous_char(at), self.next_char(at));
				204	c1.is_word_byte() == c2.is_word_byte()
				205	}
				206	}
				207	}
				208
				209	fn prefix_at(
				210	&self,
				211	prefixes: &LiteralSearcher,
				212	at: InputAt,
				213	) -> Option<InputAt> {
				214	prefixes.find(&self[at.pos()..]).map(\|(s, _)\| self.at(at.pos() + s))
				215	}
				216
				217	fn len(&self) -> usize {
				218	self.0.len()
				219	}
				220
				221	fn as_bytes(&self) -> &[u8] {
				222	self.0
				223	}
				224	}
				225
				226	/// An input reader over bytes.
				227	#[derive(Clone, Copy, Debug)]
				228	pub struct ByteInput<'t> {
				229	text: &'t [u8],
				230	only_utf8: bool,
				231	}
				232
				233	impl<'t> ByteInput<'t> {
				234	/// Return a new byte-based input reader for the given string.
				235	pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
				236	ByteInput { text: text, only_utf8: only_utf8 }
				237	}
				238	}
				239
				240	impl<'t> ops::Deref for ByteInput<'t> {
				241	type Target = [u8];
				242
				243	fn deref(&self) -> &[u8] {
				244	self.text
				245	}
				246	}
				247
				248	impl<'t> Input for ByteInput<'t> {
				249	fn at(&self, i: usize) -> InputAt {
				250	if i >= self.len() {
				251	InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
				252	} else {
				253	InputAt {
				254	pos: i,
				255	c: None.into(),
				256	byte: self.get(i).cloned(),
				257	len: 1,
				258	}
				259	}
				260	}
				261
				262	fn next_char(&self, at: InputAt) -> Char {
				263	decode_utf8(&self[at.pos()..]).map(\|(c, _)\| c).into()
				264	}
				265
				266	fn previous_char(&self, at: InputAt) -> Char {
				267	decode_last_utf8(&self[..at.pos()]).map(\|(c, _)\| c).into()
				268	}
				269
				270	fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
				271	use prog::EmptyLook::*;
				272	match empty.look {
				273	StartLine => {
				274	let c = self.previous_char(at);
				275	at.pos() == 0 \|\| c == '\n'
				276	}
				277	EndLine => {
				278	let c = self.next_char(at);
				279	at.pos() == self.len() \|\| c == '\n'
				280	}
				281	StartText => at.pos() == 0,
				282	EndText => at.pos() == self.len(),
				283	WordBoundary => {
				284	let (c1, c2) = (self.previous_char(at), self.next_char(at));
				285	c1.is_word_char() != c2.is_word_char()
				286	}
				287	NotWordBoundary => {
				288	let (c1, c2) = (self.previous_char(at), self.next_char(at));
				289	c1.is_word_char() == c2.is_word_char()
				290	}
				291	WordBoundaryAscii => {
				292	let (c1, c2) = (self.previous_char(at), self.next_char(at));
				293	if self.only_utf8 {
				294	// If we must match UTF-8, then we can't match word
				295	// boundaries at invalid UTF-8.
				296	if c1.is_none() && !at.is_start() {
				297	return false;
				298	}
				299	if c2.is_none() && !at.is_end() {
				300	return false;
				301	}
				302	}
				303	c1.is_word_byte() != c2.is_word_byte()
				304	}
				305	NotWordBoundaryAscii => {
				306	let (c1, c2) = (self.previous_char(at), self.next_char(at));
				307	if self.only_utf8 {
				308	// If we must match UTF-8, then we can't match word
				309	// boundaries at invalid UTF-8.
				310	if c1.is_none() && !at.is_start() {
				311	return false;
				312	}
				313	if c2.is_none() && !at.is_end() {
				314	return false;
				315	}
				316	}
				317	c1.is_word_byte() == c2.is_word_byte()
				318	}
				319	}
				320	}
				321
				322	fn prefix_at(
				323	&self,
				324	prefixes: &LiteralSearcher,
				325	at: InputAt,
				326	) -> Option<InputAt> {
				327	prefixes.find(&self[at.pos()..]).map(\|(s, _)\| self.at(at.pos() + s))
				328	}
				329
				330	fn len(&self) -> usize {
				331	self.text.len()
				332	}
				333
				334	fn as_bytes(&self) -> &[u8] {
				335	self.text
				336	}
				337	}
				338
				339	/// An inline representation of `Option<char>`.
				340	///
				341	/// This eliminates the need to do case analysis on `Option<char>` to determine
				342	/// ordinality with other characters.
				343	///
				344	/// (The `Option<char>` is not related to encoding. Instead, it is used in the
				345	/// matching engines to represent the beginning and ending boundaries of the
				346	/// search text.)
				347	#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
				348	pub struct Char(u32);
				349
				350	impl fmt::Debug for Char {
				351	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
				352	match char::from_u32(self.0) {
				353	None => write!(f, "Empty"),
				354	Some(c) => write!(f, "{:?}", c),
				355	}
				356	}
				357	}
				358
				359	impl Char {
				360	/// Returns true iff the character is absent.
				361	#[inline]
				362	pub fn is_none(self) -> bool {
				363	self.0 == u32::MAX
				364	}
				365
				366	/// Returns the length of the character's UTF-8 encoding.
				367	///
				368	/// If the character is absent, then `1` is returned.
				369	#[inline]
				370	pub fn len_utf8(self) -> usize {
				371	char::from_u32(self.0).map_or(1, \|c\| c.len_utf8())
				372	}
				373
				374	/// Returns true iff the character is a word character.
				375	///
				376	/// If the character is absent, then false is returned.
				377	pub fn is_word_char(self) -> bool {
				378	// is_word_character can panic if the Unicode data for \w isn't
				379	// available. However, our compiler ensures that if a Unicode word
				380	// boundary is used, then the data must also be available. If it isn't,
				381	// then the compiler returns an error.
				382	char::from_u32(self.0).map_or(false, syntax::is_word_character)
				383	}
				384
				385	/// Returns true iff the byte is a word byte.
				386	///
				387	/// If the byte is absent, then false is returned.
				388	pub fn is_word_byte(self) -> bool {
				389	match char::from_u32(self.0) {
				390	Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8),
				391	None \| Some(_) => false,
				392	}
				393	}
				394	}
				395
				396	impl From<char> for Char {
				397	fn from(c: char) -> Char {
				398	Char(c as u32)
				399	}
				400	}
				401
				402	impl From<Option<char>> for Char {
				403	fn from(c: Option<char>) -> Char {
				404	c.map_or(Char(u32::MAX), \|c\| c.into())
				405	}
				406	}
				407
				408	impl PartialEq<char> for Char {
				409	#[inline]
				410	fn eq(&self, other: &char) -> bool {
				411	self.0 == *other as u32
				412	}
				413	}
				414
				415	impl PartialEq<Char> for char {
				416	#[inline]
				417	fn eq(&self, other: &Char) -> bool {
				418	*self as u32 == other.0
				419	}
				420	}
				421
				422	impl PartialOrd<char> for Char {
				423	#[inline]
				424	fn partial_cmp(&self, other: &char) -> Option<Ordering> {
				425	self.0.partial_cmp(&(*other as u32))
				426	}
				427	}
				428
				429	impl PartialOrd<Char> for char {
				430	#[inline]
				431	fn partial_cmp(&self, other: &Char) -> Option<Ordering> {
				432	(*self as u32).partial_cmp(&other.0)
				433	}
				434	}