Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 1 | use std::fmt; |
| 2 | use std::iter::FusedIterator; |
| 3 | |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 4 | /// Slot is a single saved capture location. Note that there are two slots for |
| 5 | /// every capture in a regular expression (one slot each for the start and end |
| 6 | /// of the capture). |
| 7 | pub type Slot = Option<usize>; |
| 8 | |
| 9 | /// Locations represents the offsets of each capturing group in a regex for |
| 10 | /// a single match. |
| 11 | /// |
| 12 | /// Unlike `Captures`, a `Locations` value only stores offsets. |
| 13 | #[doc(hidden)] |
| 14 | #[derive(Clone, Debug)] |
| 15 | pub struct Locations(Vec<Slot>); |
| 16 | |
| 17 | impl Locations { |
| 18 | /// Returns the start and end positions of the Nth capture group. Returns |
| 19 | /// `None` if `i` is not a valid capture group or if the capture group did |
| 20 | /// not match anything. The positions returned are *always* byte indices |
| 21 | /// with respect to the original string matched. |
| 22 | pub fn pos(&self, i: usize) -> Option<(usize, usize)> { |
| 23 | let (s, e) = (i * 2, i * 2 + 1); |
| 24 | match (self.0.get(s), self.0.get(e)) { |
| 25 | (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), |
| 26 | _ => None, |
| 27 | } |
| 28 | } |
| 29 | |
| 30 | /// Creates an iterator of all the capture group positions in order of |
| 31 | /// appearance in the regular expression. Positions are byte indices |
| 32 | /// in terms of the original string matched. |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 33 | pub fn iter(&self) -> SubCapturesPosIter<'_> { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 34 | SubCapturesPosIter { idx: 0, locs: self } |
| 35 | } |
| 36 | |
| 37 | /// Returns the total number of capturing groups. |
| 38 | /// |
| 39 | /// This is always at least `1` since every regex has at least `1` |
| 40 | /// capturing group that corresponds to the entire match. |
| 41 | pub fn len(&self) -> usize { |
| 42 | self.0.len() / 2 |
| 43 | } |
| 44 | |
| 45 | /// Return the individual slots as a slice. |
| 46 | pub(crate) fn as_slots(&mut self) -> &mut [Slot] { |
| 47 | &mut self.0 |
| 48 | } |
| 49 | } |
| 50 | |
| 51 | /// An iterator over capture group positions for a particular match of a |
| 52 | /// regular expression. |
| 53 | /// |
| 54 | /// Positions are byte indices in terms of the original string matched. |
| 55 | /// |
| 56 | /// `'c` is the lifetime of the captures. |
Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 57 | #[derive(Clone, Debug)] |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 58 | pub struct SubCapturesPosIter<'c> { |
| 59 | idx: usize, |
| 60 | locs: &'c Locations, |
| 61 | } |
| 62 | |
| 63 | impl<'c> Iterator for SubCapturesPosIter<'c> { |
| 64 | type Item = Option<(usize, usize)>; |
| 65 | |
| 66 | fn next(&mut self) -> Option<Option<(usize, usize)>> { |
| 67 | if self.idx >= self.locs.len() { |
| 68 | return None; |
| 69 | } |
| 70 | let x = match self.locs.pos(self.idx) { |
| 71 | None => Some(None), |
| 72 | Some((s, e)) => Some(Some((s, e))), |
| 73 | }; |
| 74 | self.idx += 1; |
| 75 | x |
| 76 | } |
| 77 | } |
| 78 | |
Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 79 | impl<'c> FusedIterator for SubCapturesPosIter<'c> {} |
| 80 | |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 81 | /// `RegularExpression` describes types that can implement regex searching. |
| 82 | /// |
| 83 | /// This trait is my attempt at reducing code duplication and to standardize |
| 84 | /// the internal API. Specific duplication that is avoided are the `find` |
| 85 | /// and `capture` iterators, which are slightly tricky. |
| 86 | /// |
| 87 | /// It's not clear whether this trait is worth it, and it also isn't |
| 88 | /// clear whether it's useful as a public trait or not. Methods like |
| 89 | /// `next_after_empty` reak of bad design, but the rest of the methods seem |
| 90 | /// somewhat reasonable. One particular thing this trait would expose would be |
| 91 | /// the ability to start the search of a regex anywhere in a haystack, which |
| 92 | /// isn't possible in the current public API. |
Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 93 | pub trait RegularExpression: Sized + fmt::Debug { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 94 | /// The type of the haystack. |
Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 95 | type Text: ?Sized + fmt::Debug; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 96 | |
| 97 | /// The number of capture slots in the compiled regular expression. This is |
| 98 | /// always two times the number of capture groups (two slots per group). |
| 99 | fn slots_len(&self) -> usize; |
| 100 | |
| 101 | /// Allocates fresh space for all capturing groups in this regex. |
| 102 | fn locations(&self) -> Locations { |
| 103 | Locations(vec![None; self.slots_len()]) |
| 104 | } |
| 105 | |
| 106 | /// Returns the position of the next character after `i`. |
| 107 | /// |
| 108 | /// For example, a haystack with type `&[u8]` probably returns `i+1`, |
| 109 | /// whereas a haystack with type `&str` probably returns `i` plus the |
| 110 | /// length of the next UTF-8 sequence. |
| 111 | fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize; |
| 112 | |
| 113 | /// Returns the location of the shortest match. |
| 114 | fn shortest_match_at( |
| 115 | &self, |
| 116 | text: &Self::Text, |
| 117 | start: usize, |
| 118 | ) -> Option<usize>; |
| 119 | |
| 120 | /// Returns whether the regex matches the text given. |
| 121 | fn is_match_at(&self, text: &Self::Text, start: usize) -> bool; |
| 122 | |
| 123 | /// Returns the leftmost-first match location if one exists. |
| 124 | fn find_at( |
| 125 | &self, |
| 126 | text: &Self::Text, |
| 127 | start: usize, |
| 128 | ) -> Option<(usize, usize)>; |
| 129 | |
| 130 | /// Returns the leftmost-first match location if one exists, and also |
| 131 | /// fills in any matching capture slot locations. |
| 132 | fn captures_read_at( |
| 133 | &self, |
| 134 | locs: &mut Locations, |
| 135 | text: &Self::Text, |
| 136 | start: usize, |
| 137 | ) -> Option<(usize, usize)>; |
| 138 | |
| 139 | /// Returns an iterator over all non-overlapping successive leftmost-first |
| 140 | /// matches. |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 141 | fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 142 | Matches { re: self, text: text, last_end: 0, last_match: None } |
| 143 | } |
| 144 | |
| 145 | /// Returns an iterator over all non-overlapping successive leftmost-first |
| 146 | /// matches with captures. |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 147 | fn captures_iter(self, text: &Self::Text) -> CaptureMatches<'_, Self> { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 148 | CaptureMatches(self.find_iter(text)) |
| 149 | } |
| 150 | } |
| 151 | |
| 152 | /// An iterator over all non-overlapping successive leftmost-first matches. |
Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 153 | #[derive(Debug)] |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 154 | pub struct Matches<'t, R> |
| 155 | where |
| 156 | R: RegularExpression, |
| 157 | R::Text: 't, |
| 158 | { |
| 159 | re: R, |
| 160 | text: &'t R::Text, |
| 161 | last_end: usize, |
| 162 | last_match: Option<usize>, |
| 163 | } |
| 164 | |
| 165 | impl<'t, R> Matches<'t, R> |
| 166 | where |
| 167 | R: RegularExpression, |
| 168 | R::Text: 't, |
| 169 | { |
| 170 | /// Return the text being searched. |
| 171 | pub fn text(&self) -> &'t R::Text { |
| 172 | self.text |
| 173 | } |
| 174 | |
| 175 | /// Return the underlying regex. |
| 176 | pub fn regex(&self) -> &R { |
| 177 | &self.re |
| 178 | } |
| 179 | } |
| 180 | |
| 181 | impl<'t, R> Iterator for Matches<'t, R> |
| 182 | where |
| 183 | R: RegularExpression, |
| 184 | R::Text: 't + AsRef<[u8]>, |
| 185 | { |
| 186 | type Item = (usize, usize); |
| 187 | |
| 188 | fn next(&mut self) -> Option<(usize, usize)> { |
| 189 | if self.last_end > self.text.as_ref().len() { |
| 190 | return None; |
| 191 | } |
| 192 | let (s, e) = match self.re.find_at(self.text, self.last_end) { |
| 193 | None => return None, |
| 194 | Some((s, e)) => (s, e), |
| 195 | }; |
| 196 | if s == e { |
| 197 | // This is an empty match. To ensure we make progress, start |
| 198 | // the next search at the smallest possible starting position |
| 199 | // of the next match following this one. |
| 200 | self.last_end = self.re.next_after_empty(self.text, e); |
| 201 | // Don't accept empty matches immediately following a match. |
| 202 | // Just move on to the next match. |
| 203 | if Some(e) == self.last_match { |
| 204 | return self.next(); |
| 205 | } |
| 206 | } else { |
| 207 | self.last_end = e; |
| 208 | } |
| 209 | self.last_match = Some(e); |
| 210 | Some((s, e)) |
| 211 | } |
| 212 | } |
| 213 | |
Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 214 | impl<'t, R> FusedIterator for Matches<'t, R> |
| 215 | where |
| 216 | R: RegularExpression, |
| 217 | R::Text: 't + AsRef<[u8]>, |
| 218 | { |
| 219 | } |
| 220 | |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 221 | /// An iterator over all non-overlapping successive leftmost-first matches with |
| 222 | /// captures. |
Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 223 | #[derive(Debug)] |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 224 | pub struct CaptureMatches<'t, R>(Matches<'t, R>) |
| 225 | where |
| 226 | R: RegularExpression, |
| 227 | R::Text: 't; |
| 228 | |
| 229 | impl<'t, R> CaptureMatches<'t, R> |
| 230 | where |
| 231 | R: RegularExpression, |
| 232 | R::Text: 't, |
| 233 | { |
| 234 | /// Return the text being searched. |
| 235 | pub fn text(&self) -> &'t R::Text { |
| 236 | self.0.text() |
| 237 | } |
| 238 | |
| 239 | /// Return the underlying regex. |
| 240 | pub fn regex(&self) -> &R { |
| 241 | self.0.regex() |
| 242 | } |
| 243 | } |
| 244 | |
| 245 | impl<'t, R> Iterator for CaptureMatches<'t, R> |
| 246 | where |
| 247 | R: RegularExpression, |
| 248 | R::Text: 't + AsRef<[u8]>, |
| 249 | { |
| 250 | type Item = Locations; |
| 251 | |
| 252 | fn next(&mut self) -> Option<Locations> { |
| 253 | if self.0.last_end > self.0.text.as_ref().len() { |
| 254 | return None; |
| 255 | } |
| 256 | let mut locs = self.0.re.locations(); |
| 257 | let (s, e) = match self.0.re.captures_read_at( |
| 258 | &mut locs, |
| 259 | self.0.text, |
| 260 | self.0.last_end, |
| 261 | ) { |
| 262 | None => return None, |
| 263 | Some((s, e)) => (s, e), |
| 264 | }; |
| 265 | if s == e { |
| 266 | self.0.last_end = self.0.re.next_after_empty(self.0.text, e); |
| 267 | if Some(e) == self.0.last_match { |
| 268 | return self.next(); |
| 269 | } |
| 270 | } else { |
| 271 | self.0.last_end = e; |
| 272 | } |
| 273 | self.0.last_match = Some(e); |
| 274 | Some(locs) |
| 275 | } |
| 276 | } |
Haibo Huang | 47619dd | 2021-01-08 17:05:43 -0800 | [diff] [blame] | 277 | |
| 278 | impl<'t, R> FusedIterator for CaptureMatches<'t, R> |
| 279 | where |
| 280 | R: RegularExpression, |
| 281 | R::Text: 't + AsRef<[u8]>, |
| 282 | { |
| 283 | } |