Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 1 | use std::str; |
| 2 | |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 3 | use crate::find_byte::find_byte; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 4 | |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 5 | use crate::re_bytes; |
| 6 | use crate::re_unicode; |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 7 | |
| 8 | pub fn expand_str( |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 9 | caps: &re_unicode::Captures<'_>, |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 10 | mut replacement: &str, |
| 11 | dst: &mut String, |
| 12 | ) { |
| 13 | while !replacement.is_empty() { |
| 14 | match find_byte(b'$', replacement.as_bytes()) { |
| 15 | None => break, |
| 16 | Some(i) => { |
| 17 | dst.push_str(&replacement[..i]); |
| 18 | replacement = &replacement[i..]; |
| 19 | } |
| 20 | } |
| 21 | if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') { |
| 22 | dst.push_str("$"); |
| 23 | replacement = &replacement[2..]; |
| 24 | continue; |
| 25 | } |
| 26 | debug_assert!(!replacement.is_empty()); |
Chih-Hung Hsieh | 849e445 | 2020-10-26 13:16:47 -0700 | [diff] [blame] | 27 | let cap_ref = match find_cap_ref(replacement.as_bytes()) { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 28 | Some(cap_ref) => cap_ref, |
| 29 | None => { |
| 30 | dst.push_str("$"); |
| 31 | replacement = &replacement[1..]; |
| 32 | continue; |
| 33 | } |
| 34 | }; |
| 35 | replacement = &replacement[cap_ref.end..]; |
| 36 | match cap_ref.cap { |
| 37 | Ref::Number(i) => { |
| 38 | dst.push_str(caps.get(i).map(|m| m.as_str()).unwrap_or("")); |
| 39 | } |
| 40 | Ref::Named(name) => { |
| 41 | dst.push_str( |
| 42 | caps.name(name).map(|m| m.as_str()).unwrap_or(""), |
| 43 | ); |
| 44 | } |
| 45 | } |
| 46 | } |
| 47 | dst.push_str(replacement); |
| 48 | } |
| 49 | |
| 50 | pub fn expand_bytes( |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 51 | caps: &re_bytes::Captures<'_>, |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 52 | mut replacement: &[u8], |
| 53 | dst: &mut Vec<u8>, |
| 54 | ) { |
| 55 | while !replacement.is_empty() { |
| 56 | match find_byte(b'$', replacement) { |
| 57 | None => break, |
| 58 | Some(i) => { |
| 59 | dst.extend(&replacement[..i]); |
| 60 | replacement = &replacement[i..]; |
| 61 | } |
| 62 | } |
| 63 | if replacement.get(1).map_or(false, |&b| b == b'$') { |
| 64 | dst.push(b'$'); |
| 65 | replacement = &replacement[2..]; |
| 66 | continue; |
| 67 | } |
| 68 | debug_assert!(!replacement.is_empty()); |
| 69 | let cap_ref = match find_cap_ref(replacement) { |
| 70 | Some(cap_ref) => cap_ref, |
| 71 | None => { |
| 72 | dst.push(b'$'); |
| 73 | replacement = &replacement[1..]; |
| 74 | continue; |
| 75 | } |
| 76 | }; |
| 77 | replacement = &replacement[cap_ref.end..]; |
| 78 | match cap_ref.cap { |
| 79 | Ref::Number(i) => { |
| 80 | dst.extend(caps.get(i).map(|m| m.as_bytes()).unwrap_or(b"")); |
| 81 | } |
| 82 | Ref::Named(name) => { |
| 83 | dst.extend( |
| 84 | caps.name(name).map(|m| m.as_bytes()).unwrap_or(b""), |
| 85 | ); |
| 86 | } |
| 87 | } |
| 88 | } |
| 89 | dst.extend(replacement); |
| 90 | } |
| 91 | |
| 92 | /// `CaptureRef` represents a reference to a capture group inside some text. |
| 93 | /// The reference is either a capture group name or a number. |
| 94 | /// |
| 95 | /// It is also tagged with the position in the text following the |
| 96 | /// capture reference. |
| 97 | #[derive(Clone, Copy, Debug, Eq, PartialEq)] |
| 98 | struct CaptureRef<'a> { |
| 99 | cap: Ref<'a>, |
| 100 | end: usize, |
| 101 | } |
| 102 | |
/// Identifies a capture group mentioned in replacement text.
///
/// For example `$2` and `${2}` refer to a group by number, while `$foo` and
/// `${foo}` refer to a group by name.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Ref<'a> {
    /// The group is looked up by this name, borrowed from the parsed text.
    Named(&'a str),
    /// The group is looked up by this index.
    Number(usize),
}
| 111 | |
| 112 | impl<'a> From<&'a str> for Ref<'a> { |
| 113 | fn from(x: &'a str) -> Ref<'a> { |
| 114 | Ref::Named(x) |
| 115 | } |
| 116 | } |
| 117 | |
| 118 | impl From<usize> for Ref<'static> { |
| 119 | fn from(x: usize) -> Ref<'static> { |
| 120 | Ref::Number(x) |
| 121 | } |
| 122 | } |
| 123 | |
| 124 | /// Parses a possible reference to a capture group name in the given text, |
| 125 | /// starting at the beginning of `replacement`. |
| 126 | /// |
| 127 | /// If no such valid reference could be found, None is returned. |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 128 | fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> { |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 129 | let mut i = 0; |
| 130 | let rep: &[u8] = replacement.as_ref(); |
| 131 | if rep.len() <= 1 || rep[0] != b'$' { |
| 132 | return None; |
| 133 | } |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 134 | i += 1; |
| 135 | if rep[i] == b'{' { |
Chih-Hung Hsieh | 849e445 | 2020-10-26 13:16:47 -0700 | [diff] [blame] | 136 | return find_cap_ref_braced(rep, i + 1); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 137 | } |
| 138 | let mut cap_end = i; |
| 139 | while rep.get(cap_end).map_or(false, is_valid_cap_letter) { |
| 140 | cap_end += 1; |
| 141 | } |
| 142 | if cap_end == i { |
| 143 | return None; |
| 144 | } |
| 145 | // We just verified that the range 0..cap_end is valid ASCII, so it must |
| 146 | // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 |
Elliott Hughes | ffb6030 | 2021-04-01 17:11:40 -0700 | [diff] [blame] | 147 | // check via an unchecked conversion or by parsing the number straight from |
| 148 | // &[u8]. |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 149 | let cap = |
| 150 | str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name"); |
Chih-Hung Hsieh | e42c505 | 2020-04-16 10:44:21 -0700 | [diff] [blame] | 151 | Some(CaptureRef { |
| 152 | cap: match cap.parse::<u32>() { |
| 153 | Ok(i) => Ref::Number(i as usize), |
| 154 | Err(_) => Ref::Named(cap), |
| 155 | }, |
| 156 | end: cap_end, |
| 157 | }) |
| 158 | } |
| 159 | |
Joel Galenson | 3874808 | 2021-05-19 16:51:51 -0700 | [diff] [blame^] | 160 | fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> { |
Chih-Hung Hsieh | 849e445 | 2020-10-26 13:16:47 -0700 | [diff] [blame] | 161 | let start = i; |
| 162 | while rep.get(i).map_or(false, |&b| b != b'}') { |
| 163 | i += 1; |
| 164 | } |
| 165 | if !rep.get(i).map_or(false, |&b| b == b'}') { |
| 166 | return None; |
| 167 | } |
| 168 | // When looking at braced names, we don't put any restrictions on the name, |
| 169 | // so it's possible it could be invalid UTF-8. But a capture group name |
| 170 | // can never be invalid UTF-8, so if we have invalid UTF-8, then we can |
| 171 | // safely return None. |
| 172 | let cap = match str::from_utf8(&rep[start..i]) { |
| 173 | Err(_) => return None, |
| 174 | Ok(cap) => cap, |
| 175 | }; |
| 176 | Some(CaptureRef { |
| 177 | cap: match cap.parse::<u32>() { |
| 178 | Ok(i) => Ref::Number(i as usize), |
| 179 | Err(_) => Ref::Named(cap), |
| 180 | }, |
| 181 | end: i + 1, |
| 182 | }) |
| 183 | } |
| 184 | |
/// Reports whether `b` may appear in an unbraced capture name, i.e. whether
/// it is an ASCII letter, an ASCII digit or an underscore.
fn is_valid_cap_letter(b: &u8) -> bool {
    b.is_ascii_alphanumeric() || *b == b'_'
}
| 192 | |
#[cfg(test)]
mod tests {
    use super::{find_cap_ref, CaptureRef};

    /// Builds the expected `CaptureRef` from a name or a number and the
    /// offset at which the reference ends.
    macro_rules! c {
        ($reference:expr, $end:expr) => {
            CaptureRef { cap: $reference.into(), end: $end }
        };
    }

    /// Declares a test that runs `find_cap_ref` on the given text.
    ///
    /// With an expected value, the test asserts that this reference is
    /// found; without one, it asserts that no reference is found.
    macro_rules! find {
        ($test_name:ident, $input:expr, $expected:expr) => {
            #[test]
            fn $test_name() {
                assert_eq!(Some($expected), find_cap_ref($input.as_bytes()));
            }
        };
        ($test_name:ident, $input:expr) => {
            #[test]
            fn $test_name() {
                assert_eq!(None, find_cap_ref($input.as_bytes()));
            }
        };
    }

    find!(find_cap_ref1, "$foo", c!("foo", 4));
    find!(find_cap_ref2, "${foo}", c!("foo", 6));
    find!(find_cap_ref3, "$0", c!(0, 2));
    find!(find_cap_ref4, "$5", c!(5, 2));
    find!(find_cap_ref5, "$10", c!(10, 3));
    // See https://github.com/rust-lang/regex/pull/585
    // for more on characters following numbers
    find!(find_cap_ref6, "$42a", c!("42a", 4));
    find!(find_cap_ref7, "${42}a", c!(42, 5));
    find!(find_cap_ref8, "${42");
    find!(find_cap_ref9, "${42 ");
    find!(find_cap_ref10, " $0 ");
    find!(find_cap_ref11, "$");
    find!(find_cap_ref12, " ");
    find!(find_cap_ref13, "");
    find!(find_cap_ref14, "$1-$2", c!(1, 2));
    find!(find_cap_ref15, "$1_$2", c!("1_", 3));
    find!(find_cap_ref16, "$x-$y", c!("x", 2));
    find!(find_cap_ref17, "$x_$y", c!("x_", 3));
    find!(find_cap_ref18, "${#}", c!("#", 4));
    find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
}