/*!
This module provides a regular expression printer for `Hir`.
*/
4
5use std::fmt;
6
7use hir::visitor::{self, Visitor};
8use hir::{self, Hir, HirKind};
9use is_meta_character;
10
11/// A builder for constructing a printer.
12///
13/// Note that since a printer doesn't have any configuration knobs, this type
14/// remains unexported.
15#[derive(Clone, Debug)]
16struct PrinterBuilder {
17 _priv: (),
18}
19
20impl Default for PrinterBuilder {
21 fn default() -> PrinterBuilder {
22 PrinterBuilder::new()
23 }
24}
25
26impl PrinterBuilder {
27 fn new() -> PrinterBuilder {
28 PrinterBuilder { _priv: () }
29 }
30
31 fn build(&self) -> Printer {
32 Printer { _priv: () }
33 }
34}
35
36/// A printer for a regular expression's high-level intermediate
37/// representation.
38///
39/// A printer converts a high-level intermediate representation (HIR) to a
40/// regular expression pattern string. This particular printer uses constant
41/// stack space and heap space proportional to the size of the HIR.
42///
43/// Since this printer is only using the HIR, the pattern it prints will likely
44/// not resemble the original pattern at all. For example, a pattern like
45/// `\pL` will have its entire class written out.
46///
47/// The purpose of this printer is to provide a means to mutate an HIR and then
48/// build a regular expression from the result of that mutation. (A regex
49/// library could provide a constructor from this HIR explicitly, but that
50/// creates an unnecessary public coupling between the regex library and this
51/// specific HIR representation.)
52#[derive(Debug)]
53pub struct Printer {
54 _priv: (),
55}
56
57impl Printer {
58 /// Create a new printer.
59 pub fn new() -> Printer {
60 PrinterBuilder::new().build()
61 }
62
63 /// Print the given `Ast` to the given writer. The writer must implement
64 /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
65 /// here are a `fmt::Formatter` (which is available in `fmt::Display`
66 /// implementations) or a `&mut String`.
67 pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result {
68 visitor::visit(hir, Writer { printer: self, wtr: wtr })
69 }
70}
71
72#[derive(Debug)]
73struct Writer<'p, W> {
74 printer: &'p mut Printer,
75 wtr: W,
76}
77
78impl<'p, W: fmt::Write> Visitor for Writer<'p, W> {
79 type Output = ();
80 type Err = fmt::Error;
81
82 fn finish(self) -> fmt::Result {
83 Ok(())
84 }
85
86 fn visit_pre(&mut self, hir: &Hir) -> fmt::Result {
87 match *hir.kind() {
88 HirKind::Empty
89 | HirKind::Repetition(_)
90 | HirKind::Concat(_)
91 | HirKind::Alternation(_) => {}
92 HirKind::Literal(hir::Literal::Unicode(c)) => {
93 self.write_literal_char(c)?;
94 }
95 HirKind::Literal(hir::Literal::Byte(b)) => {
96 self.write_literal_byte(b)?;
97 }
98 HirKind::Class(hir::Class::Unicode(ref cls)) => {
99 self.wtr.write_str("[")?;
100 for range in cls.iter() {
101 if range.start() == range.end() {
102 self.write_literal_char(range.start())?;
103 } else {
104 self.write_literal_char(range.start())?;
105 self.wtr.write_str("-")?;
106 self.write_literal_char(range.end())?;
107 }
108 }
109 self.wtr.write_str("]")?;
110 }
111 HirKind::Class(hir::Class::Bytes(ref cls)) => {
112 self.wtr.write_str("(?-u:[")?;
113 for range in cls.iter() {
114 if range.start() == range.end() {
115 self.write_literal_class_byte(range.start())?;
116 } else {
117 self.write_literal_class_byte(range.start())?;
118 self.wtr.write_str("-")?;
119 self.write_literal_class_byte(range.end())?;
120 }
121 }
122 self.wtr.write_str("])")?;
123 }
124 HirKind::Anchor(hir::Anchor::StartLine) => {
125 self.wtr.write_str("(?m:^)")?;
126 }
127 HirKind::Anchor(hir::Anchor::EndLine) => {
128 self.wtr.write_str("(?m:$)")?;
129 }
130 HirKind::Anchor(hir::Anchor::StartText) => {
131 self.wtr.write_str(r"\A")?;
132 }
133 HirKind::Anchor(hir::Anchor::EndText) => {
134 self.wtr.write_str(r"\z")?;
135 }
136 HirKind::WordBoundary(hir::WordBoundary::Unicode) => {
137 self.wtr.write_str(r"\b")?;
138 }
139 HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => {
140 self.wtr.write_str(r"\B")?;
141 }
142 HirKind::WordBoundary(hir::WordBoundary::Ascii) => {
143 self.wtr.write_str(r"(?-u:\b)")?;
144 }
145 HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => {
146 self.wtr.write_str(r"(?-u:\B)")?;
147 }
148 HirKind::Group(ref x) => match x.kind {
149 hir::GroupKind::CaptureIndex(_) => {
150 self.wtr.write_str("(")?;
151 }
152 hir::GroupKind::CaptureName { ref name, .. } => {
153 write!(self.wtr, "(?P<{}>", name)?;
154 }
155 hir::GroupKind::NonCapturing => {
156 self.wtr.write_str("(?:")?;
157 }
158 },
159 }
160 Ok(())
161 }
162
163 fn visit_post(&mut self, hir: &Hir) -> fmt::Result {
164 match *hir.kind() {
165 // Handled during visit_pre
166 HirKind::Empty
167 | HirKind::Literal(_)
168 | HirKind::Class(_)
169 | HirKind::Anchor(_)
170 | HirKind::WordBoundary(_)
171 | HirKind::Concat(_)
172 | HirKind::Alternation(_) => {}
173 HirKind::Repetition(ref x) => {
174 match x.kind {
175 hir::RepetitionKind::ZeroOrOne => {
176 self.wtr.write_str("?")?;
177 }
178 hir::RepetitionKind::ZeroOrMore => {
179 self.wtr.write_str("*")?;
180 }
181 hir::RepetitionKind::OneOrMore => {
182 self.wtr.write_str("+")?;
183 }
184 hir::RepetitionKind::Range(ref x) => match *x {
185 hir::RepetitionRange::Exactly(m) => {
186 write!(self.wtr, "{{{}}}", m)?;
187 }
188 hir::RepetitionRange::AtLeast(m) => {
189 write!(self.wtr, "{{{},}}", m)?;
190 }
191 hir::RepetitionRange::Bounded(m, n) => {
192 write!(self.wtr, "{{{},{}}}", m, n)?;
193 }
194 },
195 }
196 if !x.greedy {
197 self.wtr.write_str("?")?;
198 }
199 }
200 HirKind::Group(_) => {
201 self.wtr.write_str(")")?;
202 }
203 }
204 Ok(())
205 }
206
207 fn visit_alternation_in(&mut self) -> fmt::Result {
208 self.wtr.write_str("|")
209 }
210}
211
212impl<'p, W: fmt::Write> Writer<'p, W> {
213 fn write_literal_char(&mut self, c: char) -> fmt::Result {
214 if is_meta_character(c) {
215 self.wtr.write_str("\\")?;
216 }
217 self.wtr.write_char(c)
218 }
219
220 fn write_literal_byte(&mut self, b: u8) -> fmt::Result {
221 let c = b as char;
222 if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
223 self.write_literal_char(c)
224 } else {
225 write!(self.wtr, "(?-u:\\x{:02X})", b)
226 }
227 }
228
229 fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result {
230 let c = b as char;
231 if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
232 self.write_literal_char(c)
233 } else {
234 write!(self.wtr, "\\x{:02X}", b)
235 }
236 }
237}
238
#[cfg(test)]
mod tests {
    use super::Printer;
    use ParserBuilder;

    /// Round-trips `given` through a default parser configuration.
    fn roundtrip(given: &str, expected: &str) {
        roundtrip_with(|b| b, given, expected);
    }

    /// Round-trips `given` with invalid UTF-8 permitted in the parser.
    fn roundtrip_bytes(given: &str, expected: &str) {
        roundtrip_with(|b| b.allow_invalid_utf8(true), given, expected);
    }

    /// Applies `roundtrip` to each `(given, expected)` pair, in order.
    fn roundtrip_all(cases: &[(&str, &str)]) {
        for &(given, expected) in cases {
            roundtrip(given, expected);
        }
    }

    /// Applies `roundtrip_bytes` to each `(given, expected)` pair, in order.
    fn roundtrip_bytes_all(cases: &[(&str, &str)]) {
        for &(given, expected) in cases {
            roundtrip_bytes(given, expected);
        }
    }

    /// Parses `given` with a parser configured by `f`, prints the resulting
    /// HIR, checks that the printed pattern parses under the same
    /// configuration, and finally compares it against `expected`.
    fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut builder = ParserBuilder::new();
        f(&mut builder);
        let hir = builder.build().parse(given).unwrap();

        let mut printed = String::new();
        Printer::new().print(&hir, &mut printed).unwrap();

        // The printed pattern must itself be a valid pattern.
        builder.build().parse(&printed).unwrap();

        assert_eq!(expected, printed);
    }

    #[test]
    fn print_literal() {
        roundtrip_all(&[("a", "a"), (r"\xff", "\u{FF}")]);
        roundtrip_bytes_all(&[
            (r"\xff", "\u{FF}"),
            (r"(?-u)\xff", r"(?-u:\xFF)"),
        ]);
        roundtrip("☃", "☃");
    }

    #[test]
    fn print_class() {
        roundtrip_all(&[
            (r"[a]", r"[a]"),
            (r"[a-z]", r"[a-z]"),
            (r"[a-z--b-c--x-y]", r"[ad-wz]"),
            (r"[^\x01-\u{10FFFF}]", "[\u{0}]"),
            (r"[-]", r"[\-]"),
            (r"[☃-⛄]", r"[☃-⛄]"),
            (r"(?-u)[a]", r"(?-u:[a])"),
            (r"(?-u)[a-z]", r"(?-u:[a-z])"),
        ]);
        roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");

        // Meta characters must be escaped inside Unicode classes.
        roundtrip_all(&[
            (r"[\[]", r"[\[]"),
            (r"[Z-_]", r"[Z-_]"),
            (r"[Z-_--Z]", r"[\[-_]"),
        ]);

        // Meta characters must be escaped inside byte oriented classes too.
        roundtrip_bytes_all(&[
            (r"(?-u)[\[]", r"(?-u:[\[])"),
            (r"(?-u)[Z-_]", r"(?-u:[Z-_])"),
            (r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])"),
        ]);
    }

    #[test]
    fn print_anchor() {
        roundtrip_all(&[
            (r"^", r"\A"),
            (r"$", r"\z"),
            (r"(?m)^", r"(?m:^)"),
            (r"(?m)$", r"(?m:$)"),
        ]);
    }

    #[test]
    fn print_word_boundary() {
        roundtrip_all(&[
            (r"\b", r"\b"),
            (r"\B", r"\B"),
            (r"(?-u)\b", r"(?-u:\b)"),
        ]);
        roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
    }

    #[test]
    fn print_repetition() {
        roundtrip_all(&[
            ("a?", "a?"),
            ("a??", "a??"),
            ("(?U)a?", "a??"),
            ("a*", "a*"),
            ("a*?", "a*?"),
            ("(?U)a*", "a*?"),
            ("a+", "a+"),
            ("a+?", "a+?"),
            ("(?U)a+", "a+?"),
            ("a{1}", "a{1}"),
            ("a{1,}", "a{1,}"),
            ("a{1,5}", "a{1,5}"),
            ("a{1}?", "a{1}?"),
            ("a{1,}?", "a{1,}?"),
            ("a{1,5}?", "a{1,5}?"),
            ("(?U)a{1}", "a{1}?"),
            ("(?U)a{1,}", "a{1,}?"),
            ("(?U)a{1,5}", "a{1,5}?"),
        ]);
    }

    #[test]
    fn print_group() {
        roundtrip_all(&[
            ("()", "()"),
            ("(?P<foo>)", "(?P<foo>)"),
            ("(?:)", "(?:)"),
            ("(a)", "(a)"),
            ("(?P<foo>a)", "(?P<foo>a)"),
            ("(?:a)", "(?:a)"),
            ("((((a))))", "((((a))))"),
        ]);
    }

    #[test]
    fn print_alternation() {
        roundtrip_all(&[
            ("|", "|"),
            ("||", "||"),
            ("a|b", "a|b"),
            ("a|b|c", "a|b|c"),
            ("foo|bar|quux", "foo|bar|quux"),
        ]);
    }
}