blob: 489729e745eb9d892b9d82cfd1946d3a81adc314 [file] [log] [blame]
Chih-Hung Hsieh048fc042020-04-16 10:44:22 -07001/*!
2Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes.
3
4This is sub-module is useful for constructing byte based automatons that need
5to embed UTF-8 decoding. The most common use of this module is in conjunction
6with the [`hir::ClassUnicodeRange`](../hir/struct.ClassUnicodeRange.html) type.
7
8See the documentation on the `Utf8Sequences` iterator for more details and
9an example.
10
11# Wait, what is this?
12
13This is simplest to explain with an example. Let's say you wanted to test
14whether a particular byte sequence was a Cyrillic character. One possible
15scalar value range is `[0400-04FF]`. The set of allowed bytes for this
16range can be expressed as a sequence of byte ranges:
17
18```ignore
19[D0-D3][80-BF]
20```
21
22This is simple enough: simply encode the boundaries, `0400` encodes to
23`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
24corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.
25
26However, what if you wanted to add the Cyrillic Supplementary characters to
27your range? Your range might then become `[0400-052F]`. The same procedure
28as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
29you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
30this isn't quite correct because this range doesn't capture many characters,
31for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).
32
33Instead, you need multiple sequences of byte ranges:
34
35```ignore
36[D0-D3][80-BF] # matches codepoints 0400-04FF
37[D4][80-AF] # matches codepoints 0500-052F
38```
39
40This gets even more complicated if you want bigger ranges, particularly if
41they naively contain surrogate codepoints. For example, the sequence of byte
42ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:
43
44```ignore
45[0-7F]
46[C2-DF][80-BF]
47[E0][A0-BF][80-BF]
48[E1-EC][80-BF][80-BF]
49[ED][80-9F][80-BF]
50[EE-EF][80-BF][80-BF]
51```
52
53Note that the byte ranges above will *not* match any erroneous encoding of
54UTF-8, including encodings of surrogate codepoints.
55
56And, of course, for all of Unicode (`[000000-10FFFF]`):
57
58```ignore
59[0-7F]
60[C2-DF][80-BF]
61[E0][A0-BF][80-BF]
62[E1-EC][80-BF][80-BF]
63[ED][80-9F][80-BF]
64[EE-EF][80-BF][80-BF]
65[F0][90-BF][80-BF][80-BF]
66[F1-F3][80-BF][80-BF][80-BF]
67[F4][80-8F][80-BF][80-BF]
68```
69
70This module automates the process of creating these byte ranges from ranges of
71Unicode scalar values.
72
73# Lineage
74
75I got the idea and general implementation strategy from Russ Cox in his
76[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
77Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
78I also got the idea from
79[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
80which uses it for executing automata on their term index.
81*/
82
83#![deny(missing_docs)]
84
85use std::char;
86use std::fmt;
87use std::slice;
88
89const MAX_UTF8_BYTES: usize = 4;
90
91/// Utf8Sequence represents a sequence of byte ranges.
92///
93/// To match a Utf8Sequence, a candidate byte sequence must match each
94/// successive range.
95///
96/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
97/// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
98#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
99pub enum Utf8Sequence {
100 /// One byte range.
101 One(Utf8Range),
102 /// Two successive byte ranges.
103 Two([Utf8Range; 2]),
104 /// Three successive byte ranges.
105 Three([Utf8Range; 3]),
106 /// Four successive byte ranges.
107 Four([Utf8Range; 4]),
108}
109
110impl Utf8Sequence {
111 /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
112 /// range.
113 ///
114 /// This assumes that `start` and `end` have the same length.
115 fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
116 assert_eq!(start.len(), end.len());
117 match start.len() {
118 2 => Utf8Sequence::Two([
119 Utf8Range::new(start[0], end[0]),
120 Utf8Range::new(start[1], end[1]),
121 ]),
122 3 => Utf8Sequence::Three([
123 Utf8Range::new(start[0], end[0]),
124 Utf8Range::new(start[1], end[1]),
125 Utf8Range::new(start[2], end[2]),
126 ]),
127 4 => Utf8Sequence::Four([
128 Utf8Range::new(start[0], end[0]),
129 Utf8Range::new(start[1], end[1]),
130 Utf8Range::new(start[2], end[2]),
131 Utf8Range::new(start[3], end[3]),
132 ]),
133 n => unreachable!("invalid encoded length: {}", n),
134 }
135 }
136
137 /// Returns the underlying sequence of byte ranges as a slice.
138 pub fn as_slice(&self) -> &[Utf8Range] {
139 use self::Utf8Sequence::*;
140 match *self {
141 One(ref r) => slice::from_ref(r),
142 Two(ref r) => &r[..],
143 Three(ref r) => &r[..],
144 Four(ref r) => &r[..],
145 }
146 }
147
148 /// Returns the number of byte ranges in this sequence.
149 ///
150 /// The length is guaranteed to be in the closed interval `[1, 4]`.
151 pub fn len(&self) -> usize {
152 self.as_slice().len()
153 }
154
155 /// Reverses the ranges in this sequence.
156 ///
157 /// For example, if this corresponds to the following sequence:
158 ///
159 /// ```ignore
160 /// [D0-D3][80-BF]
161 /// ```
162 ///
163 /// Then after reversal, it will be
164 ///
165 /// ```ignore
166 /// [80-BF][D0-D3]
167 /// ```
168 ///
169 /// This is useful when one is constructing a UTF-8 automaton to match
170 /// character classes in reverse.
171 pub fn reverse(&mut self) {
172 match *self {
173 Utf8Sequence::One(_) => {}
174 Utf8Sequence::Two(ref mut x) => x.reverse(),
175 Utf8Sequence::Three(ref mut x) => x.reverse(),
176 Utf8Sequence::Four(ref mut x) => x.reverse(),
177 }
178 }
179
180 /// Returns true if and only if a prefix of `bytes` matches this sequence
181 /// of byte ranges.
182 pub fn matches(&self, bytes: &[u8]) -> bool {
183 if bytes.len() < self.len() {
184 return false;
185 }
186 for (&b, r) in bytes.iter().zip(self) {
187 if !r.matches(b) {
188 return false;
189 }
190 }
191 true
192 }
193}
194
195impl<'a> IntoIterator for &'a Utf8Sequence {
196 type IntoIter = slice::Iter<'a, Utf8Range>;
197 type Item = &'a Utf8Range;
198
199 fn into_iter(self) -> Self::IntoIter {
200 self.as_slice().into_iter()
201 }
202}
203
204impl fmt::Debug for Utf8Sequence {
205 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
206 use self::Utf8Sequence::*;
207 match *self {
208 One(ref r) => write!(f, "{:?}", r),
209 Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]),
210 Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
211 Four(ref r) => {
212 write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3])
213 }
214 }
215 }
216}
217
218/// A single inclusive range of UTF-8 bytes.
219#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
220pub struct Utf8Range {
221 /// Start of byte range (inclusive).
222 pub start: u8,
223 /// End of byte range (inclusive).
224 pub end: u8,
225}
226
227impl Utf8Range {
228 fn new(start: u8, end: u8) -> Self {
229 Utf8Range { start, end }
230 }
231
232 /// Returns true if and only if the given byte is in this range.
233 pub fn matches(&self, b: u8) -> bool {
234 self.start <= b && b <= self.end
235 }
236}
237
238impl fmt::Debug for Utf8Range {
239 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
240 if self.start == self.end {
241 write!(f, "[{:X}]", self.start)
242 } else {
243 write!(f, "[{:X}-{:X}]", self.start, self.end)
244 }
245 }
246}
247
248/// An iterator over ranges of matching UTF-8 byte sequences.
249///
250/// The iteration represents an alternation of comprehensive byte sequences
251/// that match precisely the set of UTF-8 encoded scalar values.
252///
253/// A byte sequence corresponds to one of the scalar values in the range given
254/// if and only if it completely matches exactly one of the sequences of byte
255/// ranges produced by this iterator.
256///
257/// Each sequence of byte ranges matches a unique set of bytes. That is, no two
258/// sequences will match the same bytes.
259///
260/// # Example
261///
262/// This shows how to match an arbitrary byte sequence against a range of
263/// scalar values.
264///
265/// ```rust
266/// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence};
267///
268/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
269/// for range in seqs {
270/// if range.matches(bytes) {
271/// return true;
272/// }
273/// }
274/// false
275/// }
276///
277/// // Test the basic multilingual plane.
278/// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
279///
280/// // UTF-8 encoding of 'a'.
281/// assert!(matches(&seqs, &[0x61]));
282/// // UTF-8 encoding of '☃' (`\u{2603}`).
283/// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
284/// // UTF-8 encoding of `\u{10348}` (outside the BMP).
285/// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
286/// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
287/// // which is invalid UTF-8, and therefore fails, despite the fact that
288/// // the corresponding codepoint (0xD800) falls in the range given.
289/// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
290/// // And fails against plain old invalid UTF-8.
291/// assert!(!matches(&seqs, &[0xFF, 0xFF]));
292/// ```
293///
294/// If this example seems circuitous, that's because it is! It's meant to be
295/// illustrative. In practice, you could just try to decode your byte sequence
296/// and compare it with the scalar value range directly. However, this is not
297/// always possible (for example, in a byte based automaton).
298pub struct Utf8Sequences {
299 range_stack: Vec<ScalarRange>,
300}
301
302impl Utf8Sequences {
303 /// Create a new iterator over UTF-8 byte ranges for the scalar value range
304 /// given.
305 pub fn new(start: char, end: char) -> Self {
306 let mut it = Utf8Sequences { range_stack: vec![] };
307 it.push(start as u32, end as u32);
308 it
309 }
310
311 /// reset resets the scalar value range.
312 /// Any existing state is cleared, but resources may be reused.
313 ///
314 /// N.B. Benchmarks say that this method is dubious.
315 #[doc(hidden)]
316 pub fn reset(&mut self, start: char, end: char) {
317 self.range_stack.clear();
318 self.push(start as u32, end as u32);
319 }
320
321 fn push(&mut self, start: u32, end: u32) {
322 self.range_stack.push(ScalarRange { start, end });
323 }
324}
325
326struct ScalarRange {
327 start: u32,
328 end: u32,
329}
330
331impl fmt::Debug for ScalarRange {
332 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
333 write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
334 }
335}
336
337impl Iterator for Utf8Sequences {
338 type Item = Utf8Sequence;
339
340 fn next(&mut self) -> Option<Self::Item> {
341 'TOP: while let Some(mut r) = self.range_stack.pop() {
342 'INNER: loop {
343 if let Some((r1, r2)) = r.split() {
344 self.push(r2.start, r2.end);
345 r.start = r1.start;
346 r.end = r1.end;
347 continue 'INNER;
348 }
349 if !r.is_valid() {
350 continue 'TOP;
351 }
352 for i in 1..MAX_UTF8_BYTES {
353 let max = max_scalar_value(i);
354 if r.start <= max && max < r.end {
355 self.push(max + 1, r.end);
356 r.end = max;
357 continue 'INNER;
358 }
359 }
360 if let Some(ascii_range) = r.as_ascii() {
361 return Some(Utf8Sequence::One(ascii_range));
362 }
363 for i in 1..MAX_UTF8_BYTES {
364 let m = (1 << (6 * i)) - 1;
365 if (r.start & !m) != (r.end & !m) {
366 if (r.start & m) != 0 {
367 self.push((r.start | m) + 1, r.end);
368 r.end = r.start | m;
369 continue 'INNER;
370 }
371 if (r.end & m) != m {
372 self.push(r.end & !m, r.end);
373 r.end = (r.end & !m) - 1;
374 continue 'INNER;
375 }
376 }
377 }
378 let mut start = [0; MAX_UTF8_BYTES];
379 let mut end = [0; MAX_UTF8_BYTES];
380 let n = r.encode(&mut start, &mut end);
381 return Some(Utf8Sequence::from_encoded_range(
382 &start[0..n],
383 &end[0..n],
384 ));
385 }
386 }
387 None
388 }
389}
390
391impl ScalarRange {
392 /// split splits this range if it overlaps with a surrogate codepoint.
393 ///
394 /// Either or both ranges may be invalid.
395 fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
396 if self.start < 0xE000 && self.end > 0xD7FF {
397 Some((
398 ScalarRange { start: self.start, end: 0xD7FF },
399 ScalarRange { start: 0xE000, end: self.end },
400 ))
401 } else {
402 None
403 }
404 }
405
406 /// is_valid returns true if and only if start <= end.
407 fn is_valid(&self) -> bool {
408 self.start <= self.end
409 }
410
411 /// as_ascii returns this range as a Utf8Range if and only if all scalar
412 /// values in this range can be encoded as a single byte.
413 fn as_ascii(&self) -> Option<Utf8Range> {
414 if self.is_ascii() {
415 Some(Utf8Range::new(self.start as u8, self.end as u8))
416 } else {
417 None
418 }
419 }
420
421 /// is_ascii returns true if the range is ASCII only (i.e., takes a single
422 /// byte to encode any scalar value).
423 fn is_ascii(&self) -> bool {
424 self.is_valid() && self.end <= 0x7f
425 }
426
427 /// encode writes the UTF-8 encoding of the start and end of this range
428 /// to the corresponding destination slices, and returns the number of
429 /// bytes written.
430 ///
431 /// The slices should have room for at least `MAX_UTF8_BYTES`.
432 fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
433 let cs = char::from_u32(self.start).unwrap();
434 let ce = char::from_u32(self.end).unwrap();
435 let ss = cs.encode_utf8(start);
436 let se = ce.encode_utf8(end);
437 assert_eq!(ss.len(), se.len());
438 ss.len()
439 }
440}
441
442fn max_scalar_value(nbytes: usize) -> u32 {
443 match nbytes {
444 1 => 0x007F,
445 2 => 0x07FF,
446 3 => 0xFFFF,
447 4 => 0x10FFFF,
448 _ => unreachable!("invalid UTF-8 byte sequence size"),
449 }
450}
451
452#[cfg(test)]
453mod tests {
454 use std::char;
455
456 use utf8::{Utf8Range, Utf8Sequences};
457
458 fn rutf8(s: u8, e: u8) -> Utf8Range {
459 Utf8Range::new(s, e)
460 }
461
462 fn never_accepts_surrogate_codepoints(start: char, end: char) {
463 for cp in 0xD800..0xE000 {
464 let buf = encode_surrogate(cp);
465 for r in Utf8Sequences::new(start, end) {
466 if r.matches(&buf) {
467 panic!(
468 "Sequence ({:X}, {:X}) contains range {:?}, \
469 which matches surrogate code point {:X} \
470 with encoded bytes {:?}",
471 start as u32, end as u32, r, cp, buf,
472 );
473 }
474 }
475 }
476 }
477
478 #[test]
479 fn codepoints_no_surrogates() {
480 never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
481 never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
482 never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
483 never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
484 never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
485 }
486
487 #[test]
488 fn single_codepoint_one_sequence() {
489 // Tests that every range of scalar values that contains a single
490 // scalar value is recognized by one sequence of byte ranges.
491 for i in 0x0..(0x10FFFF + 1) {
492 let c = match char::from_u32(i) {
493 None => continue,
494 Some(c) => c,
495 };
496 let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
497 assert_eq!(seqs.len(), 1);
498 }
499 }
500
501 #[test]
502 fn bmp() {
503 use utf8::Utf8Sequence::*;
504
505 let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>();
506 assert_eq!(
507 seqs,
508 vec![
509 One(rutf8(0x0, 0x7F)),
510 Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
511 Three([
512 rutf8(0xE0, 0xE0),
513 rutf8(0xA0, 0xBF),
514 rutf8(0x80, 0xBF)
515 ]),
516 Three([
517 rutf8(0xE1, 0xEC),
518 rutf8(0x80, 0xBF),
519 rutf8(0x80, 0xBF)
520 ]),
521 Three([
522 rutf8(0xED, 0xED),
523 rutf8(0x80, 0x9F),
524 rutf8(0x80, 0xBF)
525 ]),
526 Three([
527 rutf8(0xEE, 0xEF),
528 rutf8(0x80, 0xBF),
529 rutf8(0x80, 0xBF)
530 ]),
531 ]
532 );
533 }
534
535 #[test]
536 fn reverse() {
537 use utf8::Utf8Sequence::*;
538
539 let mut s = One(rutf8(0xA, 0xB));
540 s.reverse();
541 assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]);
542
543 let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]);
544 s.reverse();
545 assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]);
546
547 let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]);
548 s.reverse();
549 assert_eq!(
550 s.as_slice(),
551 &[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)]
552 );
553
554 let mut s = Four([
555 rutf8(0xA, 0xB),
556 rutf8(0xB, 0xC),
557 rutf8(0xC, 0xD),
558 rutf8(0xD, 0xE),
559 ]);
560 s.reverse();
561 assert_eq!(
562 s.as_slice(),
563 &[
564 rutf8(0xD, 0xE),
565 rutf8(0xC, 0xD),
566 rutf8(0xB, 0xC),
567 rutf8(0xA, 0xB)
568 ]
569 );
570 }
571
572 fn encode_surrogate(cp: u32) -> [u8; 3] {
573 const TAG_CONT: u8 = 0b1000_0000;
574 const TAG_THREE_B: u8 = 0b1110_0000;
575
576 assert!(0xD800 <= cp && cp < 0xE000);
577 let mut dst = [0; 3];
578 dst[0] = (cp >> 12 & 0x0F) as u8 | TAG_THREE_B;
579 dst[1] = (cp >> 6 & 0x3F) as u8 | TAG_CONT;
580 dst[2] = (cp & 0x3F) as u8 | TAG_CONT;
581 dst
582 }
583}