blob: d14a9f76139ee523f8bcab55d481a8df0b7f541f [file] [log] [blame]
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -07001/// Slot is a single saved capture location. Note that there are two slots for
2/// every capture in a regular expression (one slot each for the start and end
3/// of the capture).
4pub type Slot = Option<usize>;
5
6/// Locations represents the offsets of each capturing group in a regex for
7/// a single match.
8///
9/// Unlike `Captures`, a `Locations` value only stores offsets.
10#[doc(hidden)]
11#[derive(Clone, Debug)]
12pub struct Locations(Vec<Slot>);
13
14impl Locations {
15 /// Returns the start and end positions of the Nth capture group. Returns
16 /// `None` if `i` is not a valid capture group or if the capture group did
17 /// not match anything. The positions returned are *always* byte indices
18 /// with respect to the original string matched.
19 pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
20 let (s, e) = (i * 2, i * 2 + 1);
21 match (self.0.get(s), self.0.get(e)) {
22 (Some(&Some(s)), Some(&Some(e))) => Some((s, e)),
23 _ => None,
24 }
25 }
26
27 /// Creates an iterator of all the capture group positions in order of
28 /// appearance in the regular expression. Positions are byte indices
29 /// in terms of the original string matched.
30 pub fn iter(&self) -> SubCapturesPosIter {
31 SubCapturesPosIter { idx: 0, locs: self }
32 }
33
34 /// Returns the total number of capturing groups.
35 ///
36 /// This is always at least `1` since every regex has at least `1`
37 /// capturing group that corresponds to the entire match.
38 pub fn len(&self) -> usize {
39 self.0.len() / 2
40 }
41
42 /// Return the individual slots as a slice.
43 pub(crate) fn as_slots(&mut self) -> &mut [Slot] {
44 &mut self.0
45 }
46}
47
48/// An iterator over capture group positions for a particular match of a
49/// regular expression.
50///
51/// Positions are byte indices in terms of the original string matched.
52///
53/// `'c` is the lifetime of the captures.
Chih-Hung Hsieh849e4452020-10-26 13:16:47 -070054#[derive(Clone)]
Chih-Hung Hsiehe42c5052020-04-16 10:44:21 -070055pub struct SubCapturesPosIter<'c> {
56 idx: usize,
57 locs: &'c Locations,
58}
59
60impl<'c> Iterator for SubCapturesPosIter<'c> {
61 type Item = Option<(usize, usize)>;
62
63 fn next(&mut self) -> Option<Option<(usize, usize)>> {
64 if self.idx >= self.locs.len() {
65 return None;
66 }
67 let x = match self.locs.pos(self.idx) {
68 None => Some(None),
69 Some((s, e)) => Some(Some((s, e))),
70 };
71 self.idx += 1;
72 x
73 }
74}
75
76/// `RegularExpression` describes types that can implement regex searching.
77///
78/// This trait is my attempt at reducing code duplication and to standardize
79/// the internal API. Specific duplication that is avoided are the `find`
80/// and `capture` iterators, which are slightly tricky.
81///
82/// It's not clear whether this trait is worth it, and it also isn't
83/// clear whether it's useful as a public trait or not. Methods like
84/// `next_after_empty` reak of bad design, but the rest of the methods seem
85/// somewhat reasonable. One particular thing this trait would expose would be
86/// the ability to start the search of a regex anywhere in a haystack, which
87/// isn't possible in the current public API.
88pub trait RegularExpression: Sized {
89 /// The type of the haystack.
90 type Text: ?Sized;
91
92 /// The number of capture slots in the compiled regular expression. This is
93 /// always two times the number of capture groups (two slots per group).
94 fn slots_len(&self) -> usize;
95
96 /// Allocates fresh space for all capturing groups in this regex.
97 fn locations(&self) -> Locations {
98 Locations(vec![None; self.slots_len()])
99 }
100
101 /// Returns the position of the next character after `i`.
102 ///
103 /// For example, a haystack with type `&[u8]` probably returns `i+1`,
104 /// whereas a haystack with type `&str` probably returns `i` plus the
105 /// length of the next UTF-8 sequence.
106 fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize;
107
108 /// Returns the location of the shortest match.
109 fn shortest_match_at(
110 &self,
111 text: &Self::Text,
112 start: usize,
113 ) -> Option<usize>;
114
115 /// Returns whether the regex matches the text given.
116 fn is_match_at(&self, text: &Self::Text, start: usize) -> bool;
117
118 /// Returns the leftmost-first match location if one exists.
119 fn find_at(
120 &self,
121 text: &Self::Text,
122 start: usize,
123 ) -> Option<(usize, usize)>;
124
125 /// Returns the leftmost-first match location if one exists, and also
126 /// fills in any matching capture slot locations.
127 fn captures_read_at(
128 &self,
129 locs: &mut Locations,
130 text: &Self::Text,
131 start: usize,
132 ) -> Option<(usize, usize)>;
133
134 /// Returns an iterator over all non-overlapping successive leftmost-first
135 /// matches.
136 fn find_iter(self, text: &Self::Text) -> Matches<Self> {
137 Matches { re: self, text: text, last_end: 0, last_match: None }
138 }
139
140 /// Returns an iterator over all non-overlapping successive leftmost-first
141 /// matches with captures.
142 fn captures_iter(self, text: &Self::Text) -> CaptureMatches<Self> {
143 CaptureMatches(self.find_iter(text))
144 }
145}
146
147/// An iterator over all non-overlapping successive leftmost-first matches.
148pub struct Matches<'t, R>
149where
150 R: RegularExpression,
151 R::Text: 't,
152{
153 re: R,
154 text: &'t R::Text,
155 last_end: usize,
156 last_match: Option<usize>,
157}
158
159impl<'t, R> Matches<'t, R>
160where
161 R: RegularExpression,
162 R::Text: 't,
163{
164 /// Return the text being searched.
165 pub fn text(&self) -> &'t R::Text {
166 self.text
167 }
168
169 /// Return the underlying regex.
170 pub fn regex(&self) -> &R {
171 &self.re
172 }
173}
174
175impl<'t, R> Iterator for Matches<'t, R>
176where
177 R: RegularExpression,
178 R::Text: 't + AsRef<[u8]>,
179{
180 type Item = (usize, usize);
181
182 fn next(&mut self) -> Option<(usize, usize)> {
183 if self.last_end > self.text.as_ref().len() {
184 return None;
185 }
186 let (s, e) = match self.re.find_at(self.text, self.last_end) {
187 None => return None,
188 Some((s, e)) => (s, e),
189 };
190 if s == e {
191 // This is an empty match. To ensure we make progress, start
192 // the next search at the smallest possible starting position
193 // of the next match following this one.
194 self.last_end = self.re.next_after_empty(self.text, e);
195 // Don't accept empty matches immediately following a match.
196 // Just move on to the next match.
197 if Some(e) == self.last_match {
198 return self.next();
199 }
200 } else {
201 self.last_end = e;
202 }
203 self.last_match = Some(e);
204 Some((s, e))
205 }
206}
207
208/// An iterator over all non-overlapping successive leftmost-first matches with
209/// captures.
210pub struct CaptureMatches<'t, R>(Matches<'t, R>)
211where
212 R: RegularExpression,
213 R::Text: 't;
214
215impl<'t, R> CaptureMatches<'t, R>
216where
217 R: RegularExpression,
218 R::Text: 't,
219{
220 /// Return the text being searched.
221 pub fn text(&self) -> &'t R::Text {
222 self.0.text()
223 }
224
225 /// Return the underlying regex.
226 pub fn regex(&self) -> &R {
227 self.0.regex()
228 }
229}
230
231impl<'t, R> Iterator for CaptureMatches<'t, R>
232where
233 R: RegularExpression,
234 R::Text: 't + AsRef<[u8]>,
235{
236 type Item = Locations;
237
238 fn next(&mut self) -> Option<Locations> {
239 if self.0.last_end > self.0.text.as_ref().len() {
240 return None;
241 }
242 let mut locs = self.0.re.locations();
243 let (s, e) = match self.0.re.captures_read_at(
244 &mut locs,
245 self.0.text,
246 self.0.last_end,
247 ) {
248 None => return None,
249 Some((s, e)) => (s, e),
250 };
251 if s == e {
252 self.0.last_end = self.0.re.next_after_empty(self.0.text, e);
253 if Some(e) == self.0.last_match {
254 return self.next();
255 }
256 } else {
257 self.0.last_end = e;
258 }
259 self.0.last_match = Some(e);
260 Some(locs)
261 }
262}