| /// Slot is a single saved capture location. Note that there are two slots for |
| /// every capture in a regular expression (one slot each for the start and end |
| /// of the capture). |
| pub type Slot = Option<usize>; |
| |
| /// Locations represents the offsets of each capturing group in a regex for |
| /// a single match. |
| /// |
| /// Unlike `Captures`, a `Locations` value only stores offsets. |
| #[doc(hidden)] |
| #[derive(Clone, Debug)] |
| pub struct Locations(Vec<Slot>); |
| |
| impl Locations { |
| /// Returns the start and end positions of the Nth capture group. Returns |
| /// `None` if `i` is not a valid capture group or if the capture group did |
| /// not match anything. The positions returned are *always* byte indices |
| /// with respect to the original string matched. |
| pub fn pos(&self, i: usize) -> Option<(usize, usize)> { |
| let (s, e) = (i * 2, i * 2 + 1); |
| match (self.0.get(s), self.0.get(e)) { |
| (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), |
| _ => None, |
| } |
| } |
| |
| /// Creates an iterator of all the capture group positions in order of |
| /// appearance in the regular expression. Positions are byte indices |
| /// in terms of the original string matched. |
| pub fn iter(&self) -> SubCapturesPosIter { |
| SubCapturesPosIter { idx: 0, locs: self } |
| } |
| |
| /// Returns the total number of capturing groups. |
| /// |
| /// This is always at least `1` since every regex has at least `1` |
| /// capturing group that corresponds to the entire match. |
| pub fn len(&self) -> usize { |
| self.0.len() / 2 |
| } |
| |
| /// Return the individual slots as a slice. |
| pub(crate) fn as_slots(&mut self) -> &mut [Slot] { |
| &mut self.0 |
| } |
| } |
| |
| /// An iterator over capture group positions for a particular match of a |
| /// regular expression. |
| /// |
| /// Positions are byte indices in terms of the original string matched. |
| /// |
| /// `'c` is the lifetime of the captures. |
| pub struct SubCapturesPosIter<'c> { |
| idx: usize, |
| locs: &'c Locations, |
| } |
| |
| impl<'c> Iterator for SubCapturesPosIter<'c> { |
| type Item = Option<(usize, usize)>; |
| |
| fn next(&mut self) -> Option<Option<(usize, usize)>> { |
| if self.idx >= self.locs.len() { |
| return None; |
| } |
| let x = match self.locs.pos(self.idx) { |
| None => Some(None), |
| Some((s, e)) => Some(Some((s, e))), |
| }; |
| self.idx += 1; |
| x |
| } |
| } |
| |
| /// `RegularExpression` describes types that can implement regex searching. |
| /// |
| /// This trait is my attempt at reducing code duplication and to standardize |
| /// the internal API. Specific duplication that is avoided are the `find` |
| /// and `capture` iterators, which are slightly tricky. |
| /// |
| /// It's not clear whether this trait is worth it, and it also isn't |
| /// clear whether it's useful as a public trait or not. Methods like |
| /// `next_after_empty` reak of bad design, but the rest of the methods seem |
| /// somewhat reasonable. One particular thing this trait would expose would be |
| /// the ability to start the search of a regex anywhere in a haystack, which |
| /// isn't possible in the current public API. |
| pub trait RegularExpression: Sized { |
| /// The type of the haystack. |
| type Text: ?Sized; |
| |
| /// The number of capture slots in the compiled regular expression. This is |
| /// always two times the number of capture groups (two slots per group). |
| fn slots_len(&self) -> usize; |
| |
| /// Allocates fresh space for all capturing groups in this regex. |
| fn locations(&self) -> Locations { |
| Locations(vec![None; self.slots_len()]) |
| } |
| |
| /// Returns the position of the next character after `i`. |
| /// |
| /// For example, a haystack with type `&[u8]` probably returns `i+1`, |
| /// whereas a haystack with type `&str` probably returns `i` plus the |
| /// length of the next UTF-8 sequence. |
| fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize; |
| |
| /// Returns the location of the shortest match. |
| fn shortest_match_at( |
| &self, |
| text: &Self::Text, |
| start: usize, |
| ) -> Option<usize>; |
| |
| /// Returns whether the regex matches the text given. |
| fn is_match_at(&self, text: &Self::Text, start: usize) -> bool; |
| |
| /// Returns the leftmost-first match location if one exists. |
| fn find_at( |
| &self, |
| text: &Self::Text, |
| start: usize, |
| ) -> Option<(usize, usize)>; |
| |
| /// Returns the leftmost-first match location if one exists, and also |
| /// fills in any matching capture slot locations. |
| fn captures_read_at( |
| &self, |
| locs: &mut Locations, |
| text: &Self::Text, |
| start: usize, |
| ) -> Option<(usize, usize)>; |
| |
| /// Returns an iterator over all non-overlapping successive leftmost-first |
| /// matches. |
| fn find_iter(self, text: &Self::Text) -> Matches<Self> { |
| Matches { re: self, text: text, last_end: 0, last_match: None } |
| } |
| |
| /// Returns an iterator over all non-overlapping successive leftmost-first |
| /// matches with captures. |
| fn captures_iter(self, text: &Self::Text) -> CaptureMatches<Self> { |
| CaptureMatches(self.find_iter(text)) |
| } |
| } |
| |
| /// An iterator over all non-overlapping successive leftmost-first matches. |
| pub struct Matches<'t, R> |
| where |
| R: RegularExpression, |
| R::Text: 't, |
| { |
| re: R, |
| text: &'t R::Text, |
| last_end: usize, |
| last_match: Option<usize>, |
| } |
| |
| impl<'t, R> Matches<'t, R> |
| where |
| R: RegularExpression, |
| R::Text: 't, |
| { |
| /// Return the text being searched. |
| pub fn text(&self) -> &'t R::Text { |
| self.text |
| } |
| |
| /// Return the underlying regex. |
| pub fn regex(&self) -> &R { |
| &self.re |
| } |
| } |
| |
| impl<'t, R> Iterator for Matches<'t, R> |
| where |
| R: RegularExpression, |
| R::Text: 't + AsRef<[u8]>, |
| { |
| type Item = (usize, usize); |
| |
| fn next(&mut self) -> Option<(usize, usize)> { |
| if self.last_end > self.text.as_ref().len() { |
| return None; |
| } |
| let (s, e) = match self.re.find_at(self.text, self.last_end) { |
| None => return None, |
| Some((s, e)) => (s, e), |
| }; |
| if s == e { |
| // This is an empty match. To ensure we make progress, start |
| // the next search at the smallest possible starting position |
| // of the next match following this one. |
| self.last_end = self.re.next_after_empty(self.text, e); |
| // Don't accept empty matches immediately following a match. |
| // Just move on to the next match. |
| if Some(e) == self.last_match { |
| return self.next(); |
| } |
| } else { |
| self.last_end = e; |
| } |
| self.last_match = Some(e); |
| Some((s, e)) |
| } |
| } |
| |
| /// An iterator over all non-overlapping successive leftmost-first matches with |
| /// captures. |
| pub struct CaptureMatches<'t, R>(Matches<'t, R>) |
| where |
| R: RegularExpression, |
| R::Text: 't; |
| |
| impl<'t, R> CaptureMatches<'t, R> |
| where |
| R: RegularExpression, |
| R::Text: 't, |
| { |
| /// Return the text being searched. |
| pub fn text(&self) -> &'t R::Text { |
| self.0.text() |
| } |
| |
| /// Return the underlying regex. |
| pub fn regex(&self) -> &R { |
| self.0.regex() |
| } |
| } |
| |
| impl<'t, R> Iterator for CaptureMatches<'t, R> |
| where |
| R: RegularExpression, |
| R::Text: 't + AsRef<[u8]>, |
| { |
| type Item = Locations; |
| |
| fn next(&mut self) -> Option<Locations> { |
| if self.0.last_end > self.0.text.as_ref().len() { |
| return None; |
| } |
| let mut locs = self.0.re.locations(); |
| let (s, e) = match self.0.re.captures_read_at( |
| &mut locs, |
| self.0.text, |
| self.0.last_end, |
| ) { |
| None => return None, |
| Some((s, e)) => (s, e), |
| }; |
| if s == e { |
| self.0.last_end = self.0.re.next_after_empty(self.0.text, e); |
| if Some(e) == self.0.last_match { |
| return self.next(); |
| } |
| } else { |
| self.0.last_end = e; |
| } |
| self.0.last_match = Some(e); |
| Some(locs) |
| } |
| } |