Jakub Kotur | 3bceaeb | 2020-12-21 17:28:16 +0100 | [diff] [blame^] | 1 | use core::fmt; |
| 2 | |
| 3 | /// A representation of byte oriented equivalence classes. |
| 4 | /// |
| 5 | /// This is used in a DFA to reduce the size of the transition table. This can |
| 6 | /// have a particularly large impact not only on the total size of a dense DFA, |
| 7 | /// but also on compile times. |
| 8 | #[derive(Clone, Copy)] |
| 9 | pub struct ByteClasses([u8; 256]); |
| 10 | |
| 11 | impl ByteClasses { |
| 12 | /// Creates a new set of equivalence classes where all bytes are mapped to |
| 13 | /// the same class. |
| 14 | pub fn empty() -> ByteClasses { |
| 15 | ByteClasses([0; 256]) |
| 16 | } |
| 17 | |
| 18 | /// Creates a new set of equivalence classes where each byte belongs to |
| 19 | /// its own equivalence class. |
| 20 | pub fn singletons() -> ByteClasses { |
| 21 | let mut classes = ByteClasses::empty(); |
| 22 | for i in 0..256 { |
| 23 | classes.set(i as u8, i as u8); |
| 24 | } |
| 25 | classes |
| 26 | } |
| 27 | |
| 28 | /// Copies the byte classes given. The given slice must have length 0 or |
| 29 | /// length 256. Slices of length 0 are treated as singletons (every byte |
| 30 | /// is its own class). |
| 31 | pub fn from_slice(slice: &[u8]) -> ByteClasses { |
| 32 | assert!(slice.is_empty() || slice.len() == 256); |
| 33 | |
| 34 | if slice.is_empty() { |
| 35 | ByteClasses::singletons() |
| 36 | } else { |
| 37 | let mut classes = ByteClasses::empty(); |
| 38 | for (b, &class) in slice.iter().enumerate() { |
| 39 | classes.set(b as u8, class); |
| 40 | } |
| 41 | classes |
| 42 | } |
| 43 | } |
| 44 | |
| 45 | /// Set the equivalence class for the given byte. |
| 46 | #[inline] |
| 47 | pub fn set(&mut self, byte: u8, class: u8) { |
| 48 | self.0[byte as usize] = class; |
| 49 | } |
| 50 | |
| 51 | /// Get the equivalence class for the given byte. |
| 52 | #[inline] |
| 53 | pub fn get(&self, byte: u8) -> u8 { |
| 54 | self.0[byte as usize] |
| 55 | } |
| 56 | |
| 57 | /// Get the equivalence class for the given byte while forcefully |
| 58 | /// eliding bounds checks. |
| 59 | #[inline] |
| 60 | pub unsafe fn get_unchecked(&self, byte: u8) -> u8 { |
| 61 | *self.0.get_unchecked(byte as usize) |
| 62 | } |
| 63 | |
| 64 | /// Return the total number of elements in the alphabet represented by |
| 65 | /// these equivalence classes. Equivalently, this returns the total number |
| 66 | /// of equivalence classes. |
| 67 | #[inline] |
| 68 | pub fn alphabet_len(&self) -> usize { |
| 69 | self.0[255] as usize + 1 |
| 70 | } |
| 71 | |
| 72 | /// Returns true if and only if every byte in this class maps to its own |
| 73 | /// equivalence class. Equivalently, there are 256 equivalence classes |
| 74 | /// and each class contains exactly one byte. |
| 75 | #[inline] |
| 76 | pub fn is_singleton(&self) -> bool { |
| 77 | self.alphabet_len() == 256 |
| 78 | } |
| 79 | |
| 80 | /// Returns an iterator over a sequence of representative bytes from each |
| 81 | /// equivalence class. Namely, this yields exactly N items, where N is |
| 82 | /// equivalent to the number of equivalence classes. Each item is an |
| 83 | /// arbitrary byte drawn from each equivalence class. |
| 84 | /// |
| 85 | /// This is useful when one is determinizing an NFA and the NFA's alphabet |
| 86 | /// hasn't been converted to equivalence classes yet. Picking an arbitrary |
| 87 | /// byte from each equivalence class then permits a full exploration of |
| 88 | /// the NFA instead of using every possible byte value. |
| 89 | #[cfg(feature = "std")] |
| 90 | pub fn representatives(&self) -> ByteClassRepresentatives { |
| 91 | ByteClassRepresentatives { classes: self, byte: 0, last_class: None } |
| 92 | } |
| 93 | |
| 94 | /// Returns all of the bytes in the given equivalence class. |
| 95 | /// |
| 96 | /// The second element in the tuple indicates the number of elements in |
| 97 | /// the array. |
| 98 | fn elements(&self, equiv: u8) -> ([u8; 256], usize) { |
| 99 | let (mut array, mut len) = ([0; 256], 0); |
| 100 | for b in 0..256 { |
| 101 | if self.get(b as u8) == equiv { |
| 102 | array[len] = b as u8; |
| 103 | len += 1; |
| 104 | } |
| 105 | } |
| 106 | (array, len) |
| 107 | } |
| 108 | } |
| 109 | |
| 110 | impl fmt::Debug for ByteClasses { |
| 111 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| 112 | if self.is_singleton() { |
| 113 | write!(f, "ByteClasses({{singletons}})") |
| 114 | } else { |
| 115 | write!(f, "ByteClasses(")?; |
| 116 | for equiv in 0..self.alphabet_len() { |
| 117 | let (members, len) = self.elements(equiv as u8); |
| 118 | write!(f, "{} => {:?}", equiv, &members[..len])?; |
| 119 | } |
| 120 | write!(f, ")") |
| 121 | } |
| 122 | } |
| 123 | } |
| 124 | |
| 125 | /// An iterator over representative bytes from each equivalence class. |
| 126 | #[cfg(feature = "std")] |
| 127 | #[derive(Debug)] |
| 128 | pub struct ByteClassRepresentatives<'a> { |
| 129 | classes: &'a ByteClasses, |
| 130 | byte: usize, |
| 131 | last_class: Option<u8>, |
| 132 | } |
| 133 | |
| 134 | #[cfg(feature = "std")] |
| 135 | impl<'a> Iterator for ByteClassRepresentatives<'a> { |
| 136 | type Item = u8; |
| 137 | |
| 138 | fn next(&mut self) -> Option<u8> { |
| 139 | while self.byte < 256 { |
| 140 | let byte = self.byte as u8; |
| 141 | let class = self.classes.get(byte); |
| 142 | self.byte += 1; |
| 143 | |
| 144 | if self.last_class != Some(class) { |
| 145 | self.last_class = Some(class); |
| 146 | return Some(byte); |
| 147 | } |
| 148 | } |
| 149 | None |
| 150 | } |
| 151 | } |
| 152 | |
| 153 | /// A byte class set keeps track of an *approximation* of equivalence classes |
| 154 | /// of bytes during NFA construction. That is, every byte in an equivalence |
| 155 | /// class cannot discriminate between a match and a non-match. |
| 156 | /// |
| 157 | /// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the |
| 158 | /// same equivalence class because it never matters whether an `a` or a `b` is |
| 159 | /// seen, and no combination of `a`s and `b`s in the text can discriminate |
| 160 | /// a match. |
| 161 | /// |
| 162 | /// Note though that this does not compute the minimal set of equivalence |
| 163 | /// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the |
| 164 | /// same equivalence class for the same reason that `a` and `b` are in the |
| 165 | /// same equivalence class in the aforementioned regex. However, in this |
| 166 | /// implementation, `a` and `c` are put into distinct equivalence classes. |
| 167 | /// The reason for this is implementation complexity. In the future, we should |
| 168 | /// endeavor to compute the minimal equivalence classes since they can have a |
| 169 | /// rather large impact on the size of the DFA. |
| 170 | /// |
| 171 | /// The representation here is 256 booleans, all initially set to false. Each |
| 172 | /// boolean maps to its corresponding byte based on position. A `true` value |
| 173 | /// indicates the end of an equivalence class, where its corresponding byte |
| 174 | /// and all of the bytes corresponding to all previous contiguous `false` |
| 175 | /// values are in the same equivalence class. |
| 176 | /// |
| 177 | /// This particular representation only permits contiguous ranges of bytes to |
| 178 | /// be in the same equivalence class, which means that we can never discover |
| 179 | /// the true minimal set of equivalence classes. |
| 180 | #[cfg(feature = "std")] |
| 181 | #[derive(Debug)] |
| 182 | pub struct ByteClassSet(Vec<bool>); |
| 183 | |
| 184 | #[cfg(feature = "std")] |
| 185 | impl ByteClassSet { |
| 186 | /// Create a new set of byte classes where all bytes are part of the same |
| 187 | /// equivalence class. |
| 188 | pub fn new() -> Self { |
| 189 | ByteClassSet(vec![false; 256]) |
| 190 | } |
| 191 | |
| 192 | /// Indicate the the range of byte given (inclusive) can discriminate a |
| 193 | /// match between it and all other bytes outside of the range. |
| 194 | pub fn set_range(&mut self, start: u8, end: u8) { |
| 195 | debug_assert!(start <= end); |
| 196 | if start > 0 { |
| 197 | self.0[start as usize - 1] = true; |
| 198 | } |
| 199 | self.0[end as usize] = true; |
| 200 | } |
| 201 | |
| 202 | /// Convert this boolean set to a map that maps all byte values to their |
| 203 | /// corresponding equivalence class. The last mapping indicates the largest |
| 204 | /// equivalence class identifier (which is never bigger than 255). |
| 205 | pub fn byte_classes(&self) -> ByteClasses { |
| 206 | let mut classes = ByteClasses::empty(); |
| 207 | let mut class = 0u8; |
| 208 | let mut i = 0; |
| 209 | loop { |
| 210 | classes.set(i as u8, class as u8); |
| 211 | if i >= 255 { |
| 212 | break; |
| 213 | } |
| 214 | if self.0[i] { |
| 215 | class = class.checked_add(1).unwrap(); |
| 216 | } |
| 217 | i += 1; |
| 218 | } |
| 219 | classes |
| 220 | } |
| 221 | } |
| 222 | |
| 223 | #[cfg(test)] |
| 224 | mod tests { |
| 225 | #[cfg(feature = "std")] |
| 226 | #[test] |
| 227 | fn byte_classes() { |
| 228 | use super::ByteClassSet; |
| 229 | |
| 230 | let mut set = ByteClassSet::new(); |
| 231 | set.set_range(b'a', b'z'); |
| 232 | |
| 233 | let classes = set.byte_classes(); |
| 234 | assert_eq!(classes.get(0), 0); |
| 235 | assert_eq!(classes.get(1), 0); |
| 236 | assert_eq!(classes.get(2), 0); |
| 237 | assert_eq!(classes.get(b'a' - 1), 0); |
| 238 | assert_eq!(classes.get(b'a'), 1); |
| 239 | assert_eq!(classes.get(b'm'), 1); |
| 240 | assert_eq!(classes.get(b'z'), 1); |
| 241 | assert_eq!(classes.get(b'z' + 1), 2); |
| 242 | assert_eq!(classes.get(254), 2); |
| 243 | assert_eq!(classes.get(255), 2); |
| 244 | |
| 245 | let mut set = ByteClassSet::new(); |
| 246 | set.set_range(0, 2); |
| 247 | set.set_range(4, 6); |
| 248 | let classes = set.byte_classes(); |
| 249 | assert_eq!(classes.get(0), 0); |
| 250 | assert_eq!(classes.get(1), 0); |
| 251 | assert_eq!(classes.get(2), 0); |
| 252 | assert_eq!(classes.get(3), 1); |
| 253 | assert_eq!(classes.get(4), 2); |
| 254 | assert_eq!(classes.get(5), 2); |
| 255 | assert_eq!(classes.get(6), 2); |
| 256 | assert_eq!(classes.get(7), 3); |
| 257 | assert_eq!(classes.get(255), 3); |
| 258 | } |
| 259 | |
| 260 | #[cfg(feature = "std")] |
| 261 | #[test] |
| 262 | fn full_byte_classes() { |
| 263 | use super::ByteClassSet; |
| 264 | |
| 265 | let mut set = ByteClassSet::new(); |
| 266 | for i in 0..256u16 { |
| 267 | set.set_range(i as u8, i as u8); |
| 268 | } |
| 269 | assert_eq!(set.byte_classes().alphabet_len(), 256); |
| 270 | } |
| 271 | } |