Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame^] | 1 | use std::str; |
| 2 | |
| 3 | use find_byte::find_byte; |
| 4 | |
| 5 | use re_bytes; |
| 6 | use re_unicode; |
| 7 | |
| 8 | pub fn expand_str( |
| 9 | caps: &re_unicode::Captures, |
| 10 | mut replacement: &str, |
| 11 | dst: &mut String, |
| 12 | ) { |
| 13 | while !replacement.is_empty() { |
| 14 | match find_byte(b'$', replacement.as_bytes()) { |
| 15 | None => break, |
| 16 | Some(i) => { |
| 17 | dst.push_str(&replacement[..i]); |
| 18 | replacement = &replacement[i..]; |
| 19 | } |
| 20 | } |
| 21 | if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') { |
| 22 | dst.push_str("$"); |
| 23 | replacement = &replacement[2..]; |
| 24 | continue; |
| 25 | } |
| 26 | debug_assert!(!replacement.is_empty()); |
| 27 | let cap_ref = match find_cap_ref(replacement) { |
| 28 | Some(cap_ref) => cap_ref, |
| 29 | None => { |
| 30 | dst.push_str("$"); |
| 31 | replacement = &replacement[1..]; |
| 32 | continue; |
| 33 | } |
| 34 | }; |
| 35 | replacement = &replacement[cap_ref.end..]; |
| 36 | match cap_ref.cap { |
| 37 | Ref::Number(i) => { |
| 38 | dst.push_str(caps.get(i).map(|m| m.as_str()).unwrap_or("")); |
| 39 | } |
| 40 | Ref::Named(name) => { |
| 41 | dst.push_str( |
| 42 | caps.name(name).map(|m| m.as_str()).unwrap_or(""), |
| 43 | ); |
| 44 | } |
| 45 | } |
| 46 | } |
| 47 | dst.push_str(replacement); |
| 48 | } |
| 49 | |
| 50 | pub fn expand_bytes( |
| 51 | caps: &re_bytes::Captures, |
| 52 | mut replacement: &[u8], |
| 53 | dst: &mut Vec<u8>, |
| 54 | ) { |
| 55 | while !replacement.is_empty() { |
| 56 | match find_byte(b'$', replacement) { |
| 57 | None => break, |
| 58 | Some(i) => { |
| 59 | dst.extend(&replacement[..i]); |
| 60 | replacement = &replacement[i..]; |
| 61 | } |
| 62 | } |
| 63 | if replacement.get(1).map_or(false, |&b| b == b'$') { |
| 64 | dst.push(b'$'); |
| 65 | replacement = &replacement[2..]; |
| 66 | continue; |
| 67 | } |
| 68 | debug_assert!(!replacement.is_empty()); |
| 69 | let cap_ref = match find_cap_ref(replacement) { |
| 70 | Some(cap_ref) => cap_ref, |
| 71 | None => { |
| 72 | dst.push(b'$'); |
| 73 | replacement = &replacement[1..]; |
| 74 | continue; |
| 75 | } |
| 76 | }; |
| 77 | replacement = &replacement[cap_ref.end..]; |
| 78 | match cap_ref.cap { |
| 79 | Ref::Number(i) => { |
| 80 | dst.extend(caps.get(i).map(|m| m.as_bytes()).unwrap_or(b"")); |
| 81 | } |
| 82 | Ref::Named(name) => { |
| 83 | dst.extend( |
| 84 | caps.name(name).map(|m| m.as_bytes()).unwrap_or(b""), |
| 85 | ); |
| 86 | } |
| 87 | } |
| 88 | } |
| 89 | dst.extend(replacement); |
| 90 | } |
| 91 | |
| 92 | /// `CaptureRef` represents a reference to a capture group inside some text. |
| 93 | /// The reference is either a capture group name or a number. |
| 94 | /// |
| 95 | /// It is also tagged with the position in the text following the |
| 96 | /// capture reference. |
| 97 | #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
| 98 | struct CaptureRef<'a> { |
| 99 | cap: Ref<'a>, |
| 100 | end: usize, |
| 101 | } |
| 102 | |
/// Identifies the capture group that a reference in some text points at.
///
/// Examples of references: `$2`, `$foo`, `${foo}`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Ref<'a> {
    /// The group is referred to by its name.
    Named(&'a str),
    /// The group is referred to by its index.
    Number(usize),
}
| 111 | |
| 112 | impl<'a> From<&'a str> for Ref<'a> { |
| 113 | fn from(x: &'a str) -> Ref<'a> { |
| 114 | Ref::Named(x) |
| 115 | } |
| 116 | } |
| 117 | |
| 118 | impl From<usize> for Ref<'static> { |
| 119 | fn from(x: usize) -> Ref<'static> { |
| 120 | Ref::Number(x) |
| 121 | } |
| 122 | } |
| 123 | |
| 124 | /// Parses a possible reference to a capture group name in the given text, |
| 125 | /// starting at the beginning of `replacement`. |
| 126 | /// |
| 127 | /// If no such valid reference could be found, None is returned. |
| 128 | fn find_cap_ref<T: ?Sized + AsRef<[u8]>>( |
| 129 | replacement: &T, |
| 130 | ) -> Option<CaptureRef> { |
| 131 | let mut i = 0; |
| 132 | let rep: &[u8] = replacement.as_ref(); |
| 133 | if rep.len() <= 1 || rep[0] != b'$' { |
| 134 | return None; |
| 135 | } |
| 136 | let mut brace = false; |
| 137 | i += 1; |
| 138 | if rep[i] == b'{' { |
| 139 | brace = true; |
| 140 | i += 1; |
| 141 | } |
| 142 | let mut cap_end = i; |
| 143 | while rep.get(cap_end).map_or(false, is_valid_cap_letter) { |
| 144 | cap_end += 1; |
| 145 | } |
| 146 | if cap_end == i { |
| 147 | return None; |
| 148 | } |
| 149 | // We just verified that the range 0..cap_end is valid ASCII, so it must |
| 150 | // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 |
| 151 | // check with either unsafe or by parsing the number straight from &[u8]. |
| 152 | let cap = |
| 153 | str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name"); |
| 154 | if brace { |
| 155 | if !rep.get(cap_end).map_or(false, |&b| b == b'}') { |
| 156 | return None; |
| 157 | } |
| 158 | cap_end += 1; |
| 159 | } |
| 160 | Some(CaptureRef { |
| 161 | cap: match cap.parse::<u32>() { |
| 162 | Ok(i) => Ref::Number(i as usize), |
| 163 | Err(_) => Ref::Named(cap), |
| 164 | }, |
| 165 | end: cap_end, |
| 166 | }) |
| 167 | } |
| 168 | |
/// Reports whether `b` may appear in a capture group name: an ASCII letter,
/// an ASCII digit, or an underscore.
fn is_valid_cap_letter(b: &u8) -> bool {
    b.is_ascii_alphanumeric() || *b == b'_'
}
| 176 | |
#[cfg(test)]
mod tests {
    use super::{find_cap_ref, CaptureRef};

    // Builds an expected `CaptureRef` from a group name or number and the
    // position at which the reference ends.
    macro_rules! cap {
        ($reference:expr, $end:expr) => {
            CaptureRef { cap: $reference.into(), end: $end }
        };
    }

    // Declares one `#[test]` per invocation. The two-argument form expects
    // `find_cap_ref` to return `None`; the three-argument form expects it to
    // return the given `CaptureRef`.
    macro_rules! find {
        (@case $name:ident, $text:expr, $expected:expr) => {
            #[test]
            fn $name() {
                assert_eq!($expected, find_cap_ref($text));
            }
        };
        ($name:ident, $text:expr) => {
            find!(@case $name, $text, None);
        };
        ($name:ident, $text:expr, $capref:expr) => {
            find!(@case $name, $text, Some($capref));
        };
    }

    find!(find_cap_ref1, "$foo", cap!("foo", 4));
    find!(find_cap_ref2, "${foo}", cap!("foo", 6));
    find!(find_cap_ref3, "$0", cap!(0, 2));
    find!(find_cap_ref4, "$5", cap!(5, 2));
    find!(find_cap_ref5, "$10", cap!(10, 3));
    // For the handling of characters that directly follow a number, see
    // https://github.com/rust-lang/regex/pull/585
    find!(find_cap_ref6, "$42a", cap!("42a", 4));
    find!(find_cap_ref7, "${42}a", cap!(42, 5));
    find!(find_cap_ref8, "${42");
    find!(find_cap_ref9, "${42 ");
    find!(find_cap_ref10, " $0 ");
    find!(find_cap_ref11, "$");
    find!(find_cap_ref12, " ");
    find!(find_cap_ref13, "");
    find!(find_cap_ref14, "$1-$2", cap!(1, 2));
    find!(find_cap_ref15, "$1_$2", cap!("1_", 3));
    find!(find_cap_ref16, "$x-$y", cap!("x", 2));
    find!(find_cap_ref17, "$x_$y", cap!("x_", 3));
}