blob: 3fef99d435032fd1e99b9a98e043b3bb9d6d0748 [file] [log] [blame]
/// Every user-tunable knob that influences how zero or more regexes are
/// compiled.
///
/// The builders defined in this file each wrap one `RegexOptions` value and
/// mutate its fields through their setter methods.
#[derive(Clone, Debug)]
#[allow(missing_docs)]
pub struct RegexOptions {
    /// Pattern strings to compile.
    pub pats: Vec<String>,
    /// Approximate upper bound, in bytes, on a single compiled program.
    pub size_limit: usize,
    /// Approximate size, in bytes, of the cache used by the DFA.
    pub dfa_size_limit: usize,
    /// Maximum nesting depth the parser accepts for the syntax tree.
    pub nest_limit: u32,
    /// Default for the case insensitive (`i`) flag.
    pub case_insensitive: bool,
    /// Default for the multi-line (`m`) flag.
    pub multi_line: bool,
    /// Default for the "dot matches new line" (`s`) flag.
    pub dot_matches_new_line: bool,
    /// Default for the greedy swap (`U`) flag.
    pub swap_greed: bool,
    /// Default for the ignore whitespace (`x`) flag.
    pub ignore_whitespace: bool,
    /// Default for the Unicode (`u`) flag.
    pub unicode: bool,
    /// Whether octal escape syntax is accepted.
    pub octal: bool,
}

impl Default for RegexOptions {
    /// Returns the baseline configuration: no patterns, a 10 MiB program
    /// size limit, a 2 MiB DFA cache limit, a nest limit of 250, Unicode
    /// enabled and every other flag disabled.
    fn default() -> Self {
        // One mebibyte; both size limits are expressed as multiples of it.
        const MIB: usize = 1 << 20;

        Self {
            pats: Vec::new(),
            size_limit: 10 * MIB,
            dfa_size_limit: 2 * MIB,
            nest_limit: 250,
            case_insensitive: false,
            multi_line: false,
            dot_matches_new_line: false,
            swap_greed: false,
            ignore_whitespace: false,
            unicode: true,
            octal: false,
        }
    }
}
35
36macro_rules! define_builder {
37 ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
38 pub mod $name {
39 use super::RegexOptions;
40 use error::Error;
41 use exec::ExecBuilder;
42
43 use $regex_mod::Regex;
44
45 /// A configurable builder for a regular expression.
46 ///
47 /// A builder can be used to configure how the regex is built, for example, by
48 /// setting the default flags (which can be overridden in the expression
49 /// itself) or setting various limits.
50 pub struct RegexBuilder(RegexOptions);
51
52 impl RegexBuilder {
53 /// Create a new regular expression builder with the given pattern.
54 ///
55 /// If the pattern is invalid, then an error will be returned when
56 /// `build` is called.
57 pub fn new(pattern: &str) -> RegexBuilder {
58 let mut builder = RegexBuilder(RegexOptions::default());
59 builder.0.pats.push(pattern.to_owned());
60 builder
61 }
62
63 /// Consume the builder and compile the regular expression.
64 ///
65 /// Note that calling `as_str` on the resulting `Regex` will produce the
66 /// pattern given to `new` verbatim. Notably, it will not incorporate any
67 /// of the flags set on this builder.
68 pub fn build(&self) -> Result<Regex, Error> {
69 ExecBuilder::new_options(self.0.clone())
70 .only_utf8($only_utf8)
71 .build()
72 .map(Regex::from)
73 }
74
75 /// Set the value for the case insensitive (`i`) flag.
76 ///
77 /// When enabled, letters in the pattern will match both upper case and
78 /// lower case variants.
79 pub fn case_insensitive(
80 &mut self,
81 yes: bool,
82 ) -> &mut RegexBuilder {
83 self.0.case_insensitive = yes;
84 self
85 }
86
87 /// Set the value for the multi-line matching (`m`) flag.
88 ///
89 /// When enabled, `^` matches the beginning of lines and `$` matches the
90 /// end of lines.
91 ///
92 /// By default, they match beginning/end of the input.
93 pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
94 self.0.multi_line = yes;
95 self
96 }
97
98 /// Set the value for the any character (`s`) flag, where in `.` matches
99 /// anything when `s` is set and matches anything except for new line when
100 /// it is not set (the default).
101 ///
102 /// N.B. "matches anything" means "any byte" when Unicode is disabled and
103 /// means "any valid UTF-8 encoding of any Unicode scalar value" when
104 /// Unicode is enabled.
105 pub fn dot_matches_new_line(
106 &mut self,
107 yes: bool,
108 ) -> &mut RegexBuilder {
109 self.0.dot_matches_new_line = yes;
110 self
111 }
112
113 /// Set the value for the greedy swap (`U`) flag.
114 ///
115 /// When enabled, a pattern like `a*` is lazy (tries to find shortest
116 /// match) and `a*?` is greedy (tries to find longest match).
117 ///
118 /// By default, `a*` is greedy and `a*?` is lazy.
119 pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
120 self.0.swap_greed = yes;
121 self
122 }
123
124 /// Set the value for the ignore whitespace (`x`) flag.
125 ///
126 /// When enabled, whitespace such as new lines and spaces will be ignored
127 /// between expressions of the pattern, and `#` can be used to start a
128 /// comment until the next new line.
129 pub fn ignore_whitespace(
130 &mut self,
131 yes: bool,
132 ) -> &mut RegexBuilder {
133 self.0.ignore_whitespace = yes;
134 self
135 }
136
137 /// Set the value for the Unicode (`u`) flag.
138 ///
139 /// Enabled by default. When disabled, character classes such as `\w` only
140 /// match ASCII word characters instead of all Unicode word characters.
141 pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
142 self.0.unicode = yes;
143 self
144 }
145
146 /// Whether to support octal syntax or not.
147 ///
148 /// Octal syntax is a little-known way of uttering Unicode codepoints in
149 /// a regular expression. For example, `a`, `\x61`, `\u0061` and
150 /// `\141` are all equivalent regular expressions, where the last example
151 /// shows octal syntax.
152 ///
153 /// While supporting octal syntax isn't in and of itself a problem, it does
154 /// make good error messages harder. That is, in PCRE based regex engines,
155 /// syntax like `\0` invokes a backreference, which is explicitly
156 /// unsupported in Rust's regex engine. However, many users expect it to
157 /// be supported. Therefore, when octal support is disabled, the error
158 /// message will explicitly mention that backreferences aren't supported.
159 ///
160 /// Octal syntax is disabled by default.
161 pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
162 self.0.octal = yes;
163 self
164 }
165
166 /// Set the approximate size limit of the compiled regular expression.
167 ///
168 /// This roughly corresponds to the number of bytes occupied by a single
169 /// compiled program. If the program exceeds this number, then a
170 /// compilation error is returned.
171 pub fn size_limit(
172 &mut self,
173 limit: usize,
174 ) -> &mut RegexBuilder {
175 self.0.size_limit = limit;
176 self
177 }
178
179 /// Set the approximate size of the cache used by the DFA.
180 ///
181 /// This roughly corresponds to the number of bytes that the DFA will
182 /// use while searching.
183 ///
184 /// Note that this is a *per thread* limit. There is no way to set a global
185 /// limit. In particular, if a regex is used from multiple threads
186 /// simultaneously, then each thread may use up to the number of bytes
187 /// specified here.
188 pub fn dfa_size_limit(
189 &mut self,
190 limit: usize,
191 ) -> &mut RegexBuilder {
192 self.0.dfa_size_limit = limit;
193 self
194 }
195
196 /// Set the nesting limit for this parser.
197 ///
198 /// The nesting limit controls how deep the abstract syntax tree is allowed
199 /// to be. If the AST exceeds the given limit (e.g., with too many nested
200 /// groups), then an error is returned by the parser.
201 ///
202 /// The purpose of this limit is to act as a heuristic to prevent stack
203 /// overflow for consumers that do structural induction on an `Ast` using
204 /// explicit recursion. While this crate never does this (instead using
205 /// constant stack space and moving the call stack to the heap), other
206 /// crates may.
207 ///
208 /// This limit is not checked until the entire Ast is parsed. Therefore,
209 /// if callers want to put a limit on the amount of heap space used, then
210 /// they should impose a limit on the length, in bytes, of the concrete
211 /// pattern string. In particular, this is viable since this parser
212 /// implementation will limit itself to heap space proportional to the
213 /// length of the pattern string.
214 ///
215 /// Note that a nest limit of `0` will return a nest limit error for most
216 /// patterns but not all. For example, a nest limit of `0` permits `a` but
217 /// not `ab`, since `ab` requires a concatenation, which results in a nest
218 /// depth of `1`. In general, a nest limit is not something that manifests
219 /// in an obvious way in the concrete syntax, therefore, it should not be
220 /// used in a granular way.
221 pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
222 self.0.nest_limit = limit;
223 self
224 }
225 }
226 }
227 };
228}
229
230define_builder!(bytes, re_bytes, false);
231define_builder!(unicode, re_unicode, true);
232
233macro_rules! define_set_builder {
234 ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
235 pub mod $name {
236 use super::RegexOptions;
237 use error::Error;
238 use exec::ExecBuilder;
239
240 use re_set::$regex_mod::RegexSet;
241
242 /// A configurable builder for a set of regular expressions.
243 ///
244 /// A builder can be used to configure how the regexes are built, for example,
245 /// by setting the default flags (which can be overridden in the expression
246 /// itself) or setting various limits.
247 pub struct RegexSetBuilder(RegexOptions);
248
249 impl RegexSetBuilder {
250 /// Create a new regular expression builder with the given pattern.
251 ///
252 /// If the pattern is invalid, then an error will be returned when
253 /// `build` is called.
254 pub fn new<I, S>(patterns: I) -> RegexSetBuilder
255 where
256 S: AsRef<str>,
257 I: IntoIterator<Item = S>,
258 {
259 let mut builder = RegexSetBuilder(RegexOptions::default());
260 for pat in patterns {
261 builder.0.pats.push(pat.as_ref().to_owned());
262 }
263 builder
264 }
265
266 /// Consume the builder and compile the regular expressions into a set.
267 pub fn build(&self) -> Result<RegexSet, Error> {
268 ExecBuilder::new_options(self.0.clone())
269 .only_utf8($only_utf8)
270 .build()
271 .map(RegexSet::from)
272 }
273
274 /// Set the value for the case insensitive (`i`) flag.
275 pub fn case_insensitive(
276 &mut self,
277 yes: bool,
278 ) -> &mut RegexSetBuilder {
279 self.0.case_insensitive = yes;
280 self
281 }
282
283 /// Set the value for the multi-line matching (`m`) flag.
284 pub fn multi_line(
285 &mut self,
286 yes: bool,
287 ) -> &mut RegexSetBuilder {
288 self.0.multi_line = yes;
289 self
290 }
291
292 /// Set the value for the any character (`s`) flag, where in `.` matches
293 /// anything when `s` is set and matches anything except for new line when
294 /// it is not set (the default).
295 ///
296 /// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet`
297 /// expressions and means "any Unicode scalar value" for `regex::RegexSet`
298 /// expressions.
299 pub fn dot_matches_new_line(
300 &mut self,
301 yes: bool,
302 ) -> &mut RegexSetBuilder {
303 self.0.dot_matches_new_line = yes;
304 self
305 }
306
307 /// Set the value for the greedy swap (`U`) flag.
308 pub fn swap_greed(
309 &mut self,
310 yes: bool,
311 ) -> &mut RegexSetBuilder {
312 self.0.swap_greed = yes;
313 self
314 }
315
316 /// Set the value for the ignore whitespace (`x`) flag.
317 pub fn ignore_whitespace(
318 &mut self,
319 yes: bool,
320 ) -> &mut RegexSetBuilder {
321 self.0.ignore_whitespace = yes;
322 self
323 }
324
325 /// Set the value for the Unicode (`u`) flag.
326 pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
327 self.0.unicode = yes;
328 self
329 }
330
331 /// Whether to support octal syntax or not.
332 ///
333 /// Octal syntax is a little-known way of uttering Unicode codepoints in
334 /// a regular expression. For example, `a`, `\x61`, `\u0061` and
335 /// `\141` are all equivalent regular expressions, where the last example
336 /// shows octal syntax.
337 ///
338 /// While supporting octal syntax isn't in and of itself a problem, it does
339 /// make good error messages harder. That is, in PCRE based regex engines,
340 /// syntax like `\0` invokes a backreference, which is explicitly
341 /// unsupported in Rust's regex engine. However, many users expect it to
342 /// be supported. Therefore, when octal support is disabled, the error
343 /// message will explicitly mention that backreferences aren't supported.
344 ///
345 /// Octal syntax is disabled by default.
346 pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
347 self.0.octal = yes;
348 self
349 }
350
351 /// Set the approximate size limit of the compiled regular expression.
352 ///
353 /// This roughly corresponds to the number of bytes occupied by a single
354 /// compiled program. If the program exceeds this number, then a
355 /// compilation error is returned.
356 pub fn size_limit(
357 &mut self,
358 limit: usize,
359 ) -> &mut RegexSetBuilder {
360 self.0.size_limit = limit;
361 self
362 }
363
364 /// Set the approximate size of the cache used by the DFA.
365 ///
366 /// This roughly corresponds to the number of bytes that the DFA will
367 /// use while searching.
368 ///
369 /// Note that this is a *per thread* limit. There is no way to set a global
370 /// limit. In particular, if a regex is used from multiple threads
371 /// simultaneously, then each thread may use up to the number of bytes
372 /// specified here.
373 pub fn dfa_size_limit(
374 &mut self,
375 limit: usize,
376 ) -> &mut RegexSetBuilder {
377 self.0.dfa_size_limit = limit;
378 self
379 }
380
381 /// Set the nesting limit for this parser.
382 ///
383 /// The nesting limit controls how deep the abstract syntax tree is allowed
384 /// to be. If the AST exceeds the given limit (e.g., with too many nested
385 /// groups), then an error is returned by the parser.
386 ///
387 /// The purpose of this limit is to act as a heuristic to prevent stack
388 /// overflow for consumers that do structural induction on an `Ast` using
389 /// explicit recursion. While this crate never does this (instead using
390 /// constant stack space and moving the call stack to the heap), other
391 /// crates may.
392 ///
393 /// This limit is not checked until the entire Ast is parsed. Therefore,
394 /// if callers want to put a limit on the amount of heap space used, then
395 /// they should impose a limit on the length, in bytes, of the concrete
396 /// pattern string. In particular, this is viable since this parser
397 /// implementation will limit itself to heap space proportional to the
398 /// length of the pattern string.
399 ///
400 /// Note that a nest limit of `0` will return a nest limit error for most
401 /// patterns but not all. For example, a nest limit of `0` permits `a` but
402 /// not `ab`, since `ab` requires a concatenation, which results in a nest
403 /// depth of `1`. In general, a nest limit is not something that manifests
404 /// in an obvious way in the concrete syntax, therefore, it should not be
405 /// used in a granular way.
406 pub fn nest_limit(
407 &mut self,
408 limit: u32,
409 ) -> &mut RegexSetBuilder {
410 self.0.nest_limit = limit;
411 self
412 }
413 }
414 }
415 };
416}
417
418define_set_builder!(set_bytes, bytes, false);
419define_set_builder!(set_unicode, unicode, true);