Chih-Hung Hsieh | 048fc04 | 2020-04-16 10:44:22 -0700 | [diff] [blame] | 1 | use std::error; |
| 2 | use std::fmt; |
| 3 | use std::result; |
| 4 | |
| 5 | use hir; |
| 6 | |
| 7 | /// A type alias for errors specific to Unicode handling of classes. |
| 8 | pub type Result<T> = result::Result<T, Error>; |
| 9 | |
/// A slice of inclusive `(start, end)` codepoint ranges taken from a
/// generated file, which is why the slice carries the static lifetime.
type Range = &'static [(char, char)];
| 13 | |
/// The ways in which a Unicode class lookup can fail.
///
/// The `Error` trait is intentionally not implemented for this type: it is
/// not exported, and its values always get converted into other public
/// errors.
#[derive(Debug)]
pub enum Error {
    PropertyNotFound,
    PropertyValueNotFound,
    // Not used when unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}
| 26 | |
/// Shorthand for the result of a Unicode case folding operation, whose
/// failure type is always `CaseFoldError`.
pub type FoldResult<T> = result::Result<T, CaseFoldError>;

/// The error reported when Unicode-aware simple case folding fails.
///
/// Folding fails when the case mapping tables it needs are unavailable,
/// which only happens when the `unicode-case` feature is disabled. (That
/// feature is enabled by default.)
#[derive(Debug)]
pub struct CaseFoldError(());

impl error::Error for CaseFoldError {}

impl fmt::Display for CaseFoldError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // The message has no placeholders, so it is written out directly.
        f.write_str(
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)",
        )
    }
}
| 49 | |
/// The error reported when the Unicode-aware `\w` class is unavailable.
///
/// The class is unavailable when the data tables backing the Unicode aware
/// Perl character class `\w` are missing, which only happens when the
/// `unicode-perl` feature is disabled. (That feature is enabled by default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

impl error::Error for UnicodeWordError {}

impl fmt::Display for UnicodeWordError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // The message has no placeholders, so it is written out directly.
        f.write_str(
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)",
        )
    }
}
| 69 | |
| 70 | /// Return an iterator over the equivalence class of simple case mappings |
| 71 | /// for the given codepoint. The equivalence class does not include the |
| 72 | /// given codepoint. |
| 73 | /// |
| 74 | /// If the equivalence class is empty, then this returns the next scalar |
| 75 | /// value that has a non-empty equivalence class, if it exists. If no such |
| 76 | /// scalar value exists, then `None` is returned. The point of this behavior |
| 77 | /// is to permit callers to avoid calling `simple_fold` more than they need |
| 78 | /// to, since there is some cost to fetching the equivalence class. |
| 79 | /// |
| 80 | /// This returns an error if the Unicode case folding tables are not available. |
| 81 | pub fn simple_fold( |
| 82 | c: char, |
| 83 | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> { |
| 84 | #[cfg(not(feature = "unicode-case"))] |
| 85 | fn imp( |
| 86 | _: char, |
| 87 | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> |
| 88 | { |
| 89 | use std::option::IntoIter; |
| 90 | Err::<result::Result<IntoIter<char>, _>, _>(CaseFoldError(())) |
| 91 | } |
| 92 | |
| 93 | #[cfg(feature = "unicode-case")] |
| 94 | fn imp( |
| 95 | c: char, |
| 96 | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> |
| 97 | { |
| 98 | use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; |
| 99 | |
| 100 | Ok(CASE_FOLDING_SIMPLE |
| 101 | .binary_search_by_key(&c, |&(c1, _)| c1) |
| 102 | .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().map(|&c| c)) |
| 103 | .map_err(|i| { |
| 104 | if i >= CASE_FOLDING_SIMPLE.len() { |
| 105 | None |
| 106 | } else { |
| 107 | Some(CASE_FOLDING_SIMPLE[i].0) |
| 108 | } |
| 109 | })) |
| 110 | } |
| 111 | |
| 112 | imp(c) |
| 113 | } |
| 114 | |
| 115 | /// Returns true if and only if the given (inclusive) range contains at least |
| 116 | /// one Unicode scalar value that has a non-empty non-trivial simple case |
| 117 | /// mapping. |
| 118 | /// |
| 119 | /// This function panics if `end < start`. |
| 120 | /// |
| 121 | /// This returns an error if the Unicode case folding tables are not available. |
| 122 | pub fn contains_simple_case_mapping( |
| 123 | start: char, |
| 124 | end: char, |
| 125 | ) -> FoldResult<bool> { |
| 126 | #[cfg(not(feature = "unicode-case"))] |
| 127 | fn imp(_: char, _: char) -> FoldResult<bool> { |
| 128 | Err(CaseFoldError(())) |
| 129 | } |
| 130 | |
| 131 | #[cfg(feature = "unicode-case")] |
| 132 | fn imp(start: char, end: char) -> FoldResult<bool> { |
| 133 | use std::cmp::Ordering; |
| 134 | use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; |
| 135 | |
| 136 | assert!(start <= end); |
| 137 | Ok(CASE_FOLDING_SIMPLE |
| 138 | .binary_search_by(|&(c, _)| { |
| 139 | if start <= c && c <= end { |
| 140 | Ordering::Equal |
| 141 | } else if c > end { |
| 142 | Ordering::Greater |
| 143 | } else { |
| 144 | Ordering::Less |
| 145 | } |
| 146 | }) |
| 147 | .is_ok()) |
| 148 | } |
| 149 | |
| 150 | imp(start, end) |
| 151 | } |
| 152 | |
/// Describes which Unicode-defined character class a caller wants.
///
/// A class can be requested either by naming a property directly or by
/// naming a property together with one of its values. Direct names generally
/// refer to Binary properties (see UTS#44, Table 8). As a special exception
/// (see UTS#18, Section 1.2), general categories (an enumeration) and
/// scripts (a catalog) are also accepted, as if each of their possible
/// values were a binary property.
///
/// Property names and values are always normalized and canonicalized, so
/// `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` is the shorter of the lifetimes of the property name
/// and the property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// A Unicode binary property identified by a single letter.
    OneLetter(char),
    /// A Unicode binary property identified by name.
    ///
    /// By special exception (see UTS#18, Section 1.2), general category
    /// values and script values are also permitted here, as if they were
    /// binary properties.
    Binary(&'a str),
    /// All codepoints whose property (`property_name`) has the given value
    /// (`property_value`).
    ByValue {
        /// The name of the property.
        property_name: &'a str,
        /// The value the property must have.
        property_value: &'a str,
    },
}
| 186 | |
| 187 | impl<'a> ClassQuery<'a> { |
| 188 | fn canonicalize(&self) -> Result<CanonicalClassQuery> { |
| 189 | match *self { |
| 190 | ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), |
| 191 | ClassQuery::Binary(name) => self.canonical_binary(name), |
| 192 | ClassQuery::ByValue { property_name, property_value } => { |
| 193 | let property_name = symbolic_name_normalize(property_name); |
| 194 | let property_value = symbolic_name_normalize(property_value); |
| 195 | |
| 196 | let canon_name = match canonical_prop(&property_name)? { |
| 197 | None => return Err(Error::PropertyNotFound), |
| 198 | Some(canon_name) => canon_name, |
| 199 | }; |
| 200 | Ok(match canon_name { |
| 201 | "General_Category" => { |
| 202 | let canon = match canonical_gencat(&property_value)? { |
| 203 | None => return Err(Error::PropertyValueNotFound), |
| 204 | Some(canon) => canon, |
| 205 | }; |
| 206 | CanonicalClassQuery::GeneralCategory(canon) |
| 207 | } |
| 208 | "Script" => { |
| 209 | let canon = match canonical_script(&property_value)? { |
| 210 | None => return Err(Error::PropertyValueNotFound), |
| 211 | Some(canon) => canon, |
| 212 | }; |
| 213 | CanonicalClassQuery::Script(canon) |
| 214 | } |
| 215 | _ => { |
| 216 | let vals = match property_values(canon_name)? { |
| 217 | None => return Err(Error::PropertyValueNotFound), |
| 218 | Some(vals) => vals, |
| 219 | }; |
| 220 | let canon_val = |
| 221 | match canonical_value(vals, &property_value) { |
| 222 | None => { |
| 223 | return Err(Error::PropertyValueNotFound) |
| 224 | } |
| 225 | Some(canon_val) => canon_val, |
| 226 | }; |
| 227 | CanonicalClassQuery::ByValue { |
| 228 | property_name: canon_name, |
| 229 | property_value: canon_val, |
| 230 | } |
| 231 | } |
| 232 | }) |
| 233 | } |
| 234 | } |
| 235 | } |
| 236 | |
| 237 | fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> { |
| 238 | let norm = symbolic_name_normalize(name); |
| 239 | |
| 240 | if let Some(canon) = canonical_prop(&norm)? { |
| 241 | return Ok(CanonicalClassQuery::Binary(canon)); |
| 242 | } |
| 243 | if let Some(canon) = canonical_gencat(&norm)? { |
| 244 | return Ok(CanonicalClassQuery::GeneralCategory(canon)); |
| 245 | } |
| 246 | if let Some(canon) = canonical_script(&norm)? { |
| 247 | return Ok(CanonicalClassQuery::Script(canon)); |
| 248 | } |
| 249 | Err(Error::PropertyNotFound) |
| 250 | } |
| 251 | } |
| 252 | |
/// A `ClassQuery` whose parameters have been canonicalized.
///
/// Unlike `ClassQuery`, this keeps binary properties apart from flattened
/// general categories and scripts.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// A binary property, by canonical name.
    Binary(&'static str),
    /// A general category, by canonical name.
    GeneralCategory(&'static str),
    /// A script, by canonical name.
    Script(&'static str),
    /// Any other pairing of property and value, both canonicalized.
    ///
    /// By construction, the property name here is never General_Category or
    /// Script. Those two cases are covered by the variants of the same name.
    ByValue {
        /// The canonical name of the property.
        property_name: &'static str,
        /// The canonical value of the property.
        property_value: &'static str,
    },
}
| 277 | |
| 278 | /// Looks up a Unicode class given a query. If one doesn't exist, then |
| 279 | /// `None` is returned. |
| 280 | pub fn class(query: ClassQuery) -> Result<hir::ClassUnicode> { |
| 281 | use self::CanonicalClassQuery::*; |
| 282 | |
| 283 | match query.canonicalize()? { |
| 284 | Binary(name) => bool_property(name), |
| 285 | GeneralCategory(name) => gencat(name), |
| 286 | Script(name) => script(name), |
| 287 | ByValue { property_name: "Age", property_value } => { |
| 288 | let mut class = hir::ClassUnicode::empty(); |
| 289 | for set in ages(property_value)? { |
| 290 | class.union(&hir_class(set)); |
| 291 | } |
| 292 | Ok(class) |
| 293 | } |
| 294 | ByValue { property_name: "Script_Extensions", property_value } => { |
| 295 | script_extension(property_value) |
| 296 | } |
| 297 | ByValue { |
| 298 | property_name: "Grapheme_Cluster_Break", |
| 299 | property_value, |
| 300 | } => gcb(property_value), |
| 301 | ByValue { property_name: "Sentence_Break", property_value } => { |
| 302 | sb(property_value) |
| 303 | } |
| 304 | ByValue { property_name: "Word_Break", property_value } => { |
| 305 | wb(property_value) |
| 306 | } |
| 307 | _ => { |
| 308 | // What else should we support? |
| 309 | Err(Error::PropertyNotFound) |
| 310 | } |
| 311 | } |
| 312 | } |
| 313 | |
| 314 | /// Returns a Unicode aware class for \w. |
| 315 | /// |
| 316 | /// This returns an error if the data is not available for \w. |
| 317 | pub fn perl_word() -> Result<hir::ClassUnicode> { |
| 318 | #[cfg(not(feature = "unicode-perl"))] |
| 319 | fn imp() -> Result<hir::ClassUnicode> { |
| 320 | Err(Error::PerlClassNotFound) |
| 321 | } |
| 322 | |
| 323 | #[cfg(feature = "unicode-perl")] |
| 324 | fn imp() -> Result<hir::ClassUnicode> { |
| 325 | use unicode_tables::perl_word::PERL_WORD; |
| 326 | Ok(hir_class(PERL_WORD)) |
| 327 | } |
| 328 | |
| 329 | imp() |
| 330 | } |
| 331 | |
| 332 | /// Returns a Unicode aware class for \s. |
| 333 | /// |
| 334 | /// This returns an error if the data is not available for \s. |
| 335 | pub fn perl_space() -> Result<hir::ClassUnicode> { |
| 336 | #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))] |
| 337 | fn imp() -> Result<hir::ClassUnicode> { |
| 338 | Err(Error::PerlClassNotFound) |
| 339 | } |
| 340 | |
| 341 | #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] |
| 342 | fn imp() -> Result<hir::ClassUnicode> { |
| 343 | use unicode_tables::perl_space::WHITE_SPACE; |
| 344 | Ok(hir_class(WHITE_SPACE)) |
| 345 | } |
| 346 | |
| 347 | #[cfg(feature = "unicode-bool")] |
| 348 | fn imp() -> Result<hir::ClassUnicode> { |
| 349 | use unicode_tables::property_bool::WHITE_SPACE; |
| 350 | Ok(hir_class(WHITE_SPACE)) |
| 351 | } |
| 352 | |
| 353 | imp() |
| 354 | } |
| 355 | |
| 356 | /// Returns a Unicode aware class for \d. |
| 357 | /// |
| 358 | /// This returns an error if the data is not available for \d. |
| 359 | pub fn perl_digit() -> Result<hir::ClassUnicode> { |
| 360 | #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))] |
| 361 | fn imp() -> Result<hir::ClassUnicode> { |
| 362 | Err(Error::PerlClassNotFound) |
| 363 | } |
| 364 | |
| 365 | #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] |
| 366 | fn imp() -> Result<hir::ClassUnicode> { |
| 367 | use unicode_tables::perl_decimal::DECIMAL_NUMBER; |
| 368 | Ok(hir_class(DECIMAL_NUMBER)) |
| 369 | } |
| 370 | |
| 371 | #[cfg(feature = "unicode-gencat")] |
| 372 | fn imp() -> Result<hir::ClassUnicode> { |
| 373 | use unicode_tables::general_category::DECIMAL_NUMBER; |
| 374 | Ok(hir_class(DECIMAL_NUMBER)) |
| 375 | } |
| 376 | |
| 377 | imp() |
| 378 | } |
| 379 | |
| 380 | /// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. |
| 381 | pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { |
| 382 | let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges |
| 383 | .iter() |
| 384 | .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) |
| 385 | .collect(); |
| 386 | hir::ClassUnicode::new(hir_ranges) |
| 387 | } |
| 388 | |
| 389 | /// Returns true only if the given codepoint is in the `\w` character class. |
| 390 | /// |
| 391 | /// If the `unicode-perl` feature is not enabled, then this returns an error. |
| 392 | pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> { |
| 393 | #[cfg(not(feature = "unicode-perl"))] |
| 394 | fn imp(_: char) -> result::Result<bool, UnicodeWordError> { |
| 395 | Err(UnicodeWordError(())) |
| 396 | } |
| 397 | |
| 398 | #[cfg(feature = "unicode-perl")] |
| 399 | fn imp(c: char) -> result::Result<bool, UnicodeWordError> { |
| 400 | use is_word_byte; |
| 401 | use std::cmp::Ordering; |
| 402 | use unicode_tables::perl_word::PERL_WORD; |
| 403 | |
| 404 | if c <= 0x7F as char && is_word_byte(c as u8) { |
| 405 | return Ok(true); |
| 406 | } |
| 407 | Ok(PERL_WORD |
| 408 | .binary_search_by(|&(start, end)| { |
| 409 | if start <= c && c <= end { |
| 410 | Ordering::Equal |
| 411 | } else if start > c { |
| 412 | Ordering::Greater |
| 413 | } else { |
| 414 | Ordering::Less |
| 415 | } |
| 416 | }) |
| 417 | .is_ok()) |
| 418 | } |
| 419 | |
| 420 | imp(c) |
| 421 | } |
| 422 | |
/// The value table of one specific property.
///
/// Each tuple pairs a normalized property value (first element) with the
/// canonical property value it stands for (second element).
type PropertyValues = &'static [(&'static str, &'static str)];
| 429 | |
| 430 | fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> { |
| 431 | Ok(match normalized_value { |
| 432 | "any" => Some("Any"), |
| 433 | "assigned" => Some("Assigned"), |
| 434 | "ascii" => Some("ASCII"), |
| 435 | _ => { |
| 436 | let gencats = property_values("General_Category")?.unwrap(); |
| 437 | canonical_value(gencats, normalized_value) |
| 438 | } |
| 439 | }) |
| 440 | } |
| 441 | |
| 442 | fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> { |
| 443 | let scripts = property_values("Script")?.unwrap(); |
| 444 | Ok(canonical_value(scripts, normalized_value)) |
| 445 | } |
| 446 | |
| 447 | /// Find the canonical property name for the given normalized property name. |
| 448 | /// |
| 449 | /// If no such property exists, then `None` is returned. |
| 450 | /// |
| 451 | /// The normalized property name must have been normalized according to |
| 452 | /// UAX44 LM3, which can be done using `symbolic_name_normalize`. |
| 453 | /// |
| 454 | /// If the property names data is not available, then an error is returned. |
| 455 | fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> { |
| 456 | #[cfg(not(any( |
| 457 | feature = "unicode-age", |
| 458 | feature = "unicode-bool", |
| 459 | feature = "unicode-gencat", |
| 460 | feature = "unicode-perl", |
| 461 | feature = "unicode-script", |
| 462 | feature = "unicode-segment", |
| 463 | )))] |
| 464 | fn imp(_: &str) -> Result<Option<&'static str>> { |
| 465 | Err(Error::PropertyNotFound) |
| 466 | } |
| 467 | |
| 468 | #[cfg(any( |
| 469 | feature = "unicode-age", |
| 470 | feature = "unicode-bool", |
| 471 | feature = "unicode-gencat", |
| 472 | feature = "unicode-perl", |
| 473 | feature = "unicode-script", |
| 474 | feature = "unicode-segment", |
| 475 | ))] |
| 476 | fn imp(name: &str) -> Result<Option<&'static str>> { |
| 477 | use unicode_tables::property_names::PROPERTY_NAMES; |
| 478 | |
| 479 | Ok(PROPERTY_NAMES |
| 480 | .binary_search_by_key(&name, |&(n, _)| n) |
| 481 | .ok() |
| 482 | .map(|i| PROPERTY_NAMES[i].1)) |
| 483 | } |
| 484 | |
| 485 | imp(normalized_name) |
| 486 | } |
| 487 | |
| 488 | /// Find the canonical property value for the given normalized property |
| 489 | /// value. |
| 490 | /// |
| 491 | /// The given property values should correspond to the values for the property |
| 492 | /// under question, which can be found using `property_values`. |
| 493 | /// |
| 494 | /// If no such property value exists, then `None` is returned. |
| 495 | /// |
| 496 | /// The normalized property value must have been normalized according to |
| 497 | /// UAX44 LM3, which can be done using `symbolic_name_normalize`. |
| 498 | fn canonical_value( |
| 499 | vals: PropertyValues, |
| 500 | normalized_value: &str, |
| 501 | ) -> Option<&'static str> { |
| 502 | vals.binary_search_by_key(&normalized_value, |&(n, _)| n) |
| 503 | .ok() |
| 504 | .map(|i| vals[i].1) |
| 505 | } |
| 506 | |
| 507 | /// Return the table of property values for the given property name. |
| 508 | /// |
| 509 | /// If the property values data is not available, then an error is returned. |
| 510 | fn property_values( |
| 511 | canonical_property_name: &'static str, |
| 512 | ) -> Result<Option<PropertyValues>> { |
| 513 | #[cfg(not(any( |
| 514 | feature = "unicode-age", |
| 515 | feature = "unicode-bool", |
| 516 | feature = "unicode-gencat", |
| 517 | feature = "unicode-perl", |
| 518 | feature = "unicode-script", |
| 519 | feature = "unicode-segment", |
| 520 | )))] |
| 521 | fn imp(_: &'static str) -> Result<Option<PropertyValues>> { |
| 522 | Err(Error::PropertyValueNotFound) |
| 523 | } |
| 524 | |
| 525 | #[cfg(any( |
| 526 | feature = "unicode-age", |
| 527 | feature = "unicode-bool", |
| 528 | feature = "unicode-gencat", |
| 529 | feature = "unicode-perl", |
| 530 | feature = "unicode-script", |
| 531 | feature = "unicode-segment", |
| 532 | ))] |
| 533 | fn imp(name: &'static str) -> Result<Option<PropertyValues>> { |
| 534 | use unicode_tables::property_values::PROPERTY_VALUES; |
| 535 | |
| 536 | Ok(PROPERTY_VALUES |
| 537 | .binary_search_by_key(&name, |&(n, _)| n) |
| 538 | .ok() |
| 539 | .map(|i| PROPERTY_VALUES[i].1)) |
| 540 | } |
| 541 | |
| 542 | imp(canonical_property_name) |
| 543 | } |
| 544 | |
| 545 | // This is only used in some cases, but small enough to just let it be dead |
| 546 | // instead of figuring out (and maintaining) the right set of features. |
| 547 | #[allow(dead_code)] |
| 548 | fn property_set( |
| 549 | name_map: &'static [(&'static str, Range)], |
| 550 | canonical: &'static str, |
| 551 | ) -> Option<Range> { |
| 552 | name_map |
| 553 | .binary_search_by_key(&canonical, |x| x.0) |
| 554 | .ok() |
| 555 | .map(|i| name_map[i].1) |
| 556 | } |
| 557 | |
| 558 | /// Returns an iterator over Unicode Age sets. Each item corresponds to a set |
| 559 | /// of codepoints that were added in a particular revision of Unicode. The |
| 560 | /// iterator yields items in chronological order. |
| 561 | /// |
| 562 | /// If the given age value isn't valid or if the data isn't available, then an |
| 563 | /// error is returned instead. |
| 564 | fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> { |
| 565 | #[cfg(not(feature = "unicode-age"))] |
| 566 | fn imp(_: &str) -> Result<impl Iterator<Item = Range>> { |
| 567 | use std::option::IntoIter; |
| 568 | Err::<IntoIter<Range>, _>(Error::PropertyNotFound) |
| 569 | } |
| 570 | |
| 571 | #[cfg(feature = "unicode-age")] |
| 572 | fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> { |
| 573 | use unicode_tables::age; |
| 574 | |
| 575 | const AGES: &'static [(&'static str, Range)] = &[ |
| 576 | ("V1_1", age::V1_1), |
| 577 | ("V2_0", age::V2_0), |
| 578 | ("V2_1", age::V2_1), |
| 579 | ("V3_0", age::V3_0), |
| 580 | ("V3_1", age::V3_1), |
| 581 | ("V3_2", age::V3_2), |
| 582 | ("V4_0", age::V4_0), |
| 583 | ("V4_1", age::V4_1), |
| 584 | ("V5_0", age::V5_0), |
| 585 | ("V5_1", age::V5_1), |
| 586 | ("V5_2", age::V5_2), |
| 587 | ("V6_0", age::V6_0), |
| 588 | ("V6_1", age::V6_1), |
| 589 | ("V6_2", age::V6_2), |
| 590 | ("V6_3", age::V6_3), |
| 591 | ("V7_0", age::V7_0), |
| 592 | ("V8_0", age::V8_0), |
| 593 | ("V9_0", age::V9_0), |
| 594 | ("V10_0", age::V10_0), |
| 595 | ("V11_0", age::V11_0), |
| 596 | ("V12_0", age::V12_0), |
| 597 | ("V12_1", age::V12_1), |
| 598 | ("V13_0", age::V13_0), |
| 599 | ]; |
| 600 | assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); |
| 601 | |
| 602 | let pos = AGES.iter().position(|&(age, _)| canonical_age == age); |
| 603 | match pos { |
| 604 | None => Err(Error::PropertyValueNotFound), |
| 605 | Some(i) => Ok(AGES[..i + 1].iter().map(|&(_, classes)| classes)), |
| 606 | } |
| 607 | } |
| 608 | |
| 609 | imp(canonical_age) |
| 610 | } |
| 611 | |
| 612 | /// Returns the Unicode HIR class corresponding to the given general category. |
| 613 | /// |
| 614 | /// Name canonicalization is assumed to be performed by the caller. |
| 615 | /// |
| 616 | /// If the given general category could not be found, or if the general |
| 617 | /// category data is not available, then an error is returned. |
| 618 | fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
| 619 | #[cfg(not(feature = "unicode-gencat"))] |
| 620 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
| 621 | Err(Error::PropertyNotFound) |
| 622 | } |
| 623 | |
| 624 | #[cfg(feature = "unicode-gencat")] |
| 625 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
| 626 | use unicode_tables::general_category::BY_NAME; |
| 627 | match name { |
| 628 | "ASCII" => Ok(hir_class(&[('\0', '\x7F')])), |
| 629 | "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])), |
| 630 | "Assigned" => { |
| 631 | let mut cls = gencat("Unassigned")?; |
| 632 | cls.negate(); |
| 633 | Ok(cls) |
| 634 | } |
| 635 | name => property_set(BY_NAME, name) |
| 636 | .map(hir_class) |
| 637 | .ok_or(Error::PropertyValueNotFound), |
| 638 | } |
| 639 | } |
| 640 | |
| 641 | match canonical_name { |
| 642 | "Decimal_Number" => perl_digit(), |
| 643 | name => imp(name), |
| 644 | } |
| 645 | } |
| 646 | |
| 647 | /// Returns the Unicode HIR class corresponding to the given script. |
| 648 | /// |
| 649 | /// Name canonicalization is assumed to be performed by the caller. |
| 650 | /// |
| 651 | /// If the given script could not be found, or if the script data is not |
| 652 | /// available, then an error is returned. |
| 653 | fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
| 654 | #[cfg(not(feature = "unicode-script"))] |
| 655 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
| 656 | Err(Error::PropertyNotFound) |
| 657 | } |
| 658 | |
| 659 | #[cfg(feature = "unicode-script")] |
| 660 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
| 661 | use unicode_tables::script::BY_NAME; |
| 662 | property_set(BY_NAME, name) |
| 663 | .map(hir_class) |
| 664 | .ok_or(Error::PropertyValueNotFound) |
| 665 | } |
| 666 | |
| 667 | imp(canonical_name) |
| 668 | } |
| 669 | |
| 670 | /// Returns the Unicode HIR class corresponding to the given script extension. |
| 671 | /// |
| 672 | /// Name canonicalization is assumed to be performed by the caller. |
| 673 | /// |
| 674 | /// If the given script extension could not be found, or if the script data is |
| 675 | /// not available, then an error is returned. |
| 676 | fn script_extension( |
| 677 | canonical_name: &'static str, |
| 678 | ) -> Result<hir::ClassUnicode> { |
| 679 | #[cfg(not(feature = "unicode-script"))] |
| 680 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
| 681 | Err(Error::PropertyNotFound) |
| 682 | } |
| 683 | |
| 684 | #[cfg(feature = "unicode-script")] |
| 685 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
| 686 | use unicode_tables::script_extension::BY_NAME; |
| 687 | property_set(BY_NAME, name) |
| 688 | .map(hir_class) |
| 689 | .ok_or(Error::PropertyValueNotFound) |
| 690 | } |
| 691 | |
| 692 | imp(canonical_name) |
| 693 | } |
| 694 | |
| 695 | /// Returns the Unicode HIR class corresponding to the given Unicode boolean |
| 696 | /// property. |
| 697 | /// |
| 698 | /// Name canonicalization is assumed to be performed by the caller. |
| 699 | /// |
| 700 | /// If the given boolean property could not be found, or if the boolean |
| 701 | /// property data is not available, then an error is returned. |
| 702 | fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
| 703 | #[cfg(not(feature = "unicode-bool"))] |
| 704 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
| 705 | Err(Error::PropertyNotFound) |
| 706 | } |
| 707 | |
| 708 | #[cfg(feature = "unicode-bool")] |
| 709 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
| 710 | use unicode_tables::property_bool::BY_NAME; |
| 711 | property_set(BY_NAME, name) |
| 712 | .map(hir_class) |
| 713 | .ok_or(Error::PropertyNotFound) |
| 714 | } |
| 715 | |
| 716 | match canonical_name { |
| 717 | "Decimal_Number" => perl_digit(), |
| 718 | "White_Space" => perl_space(), |
| 719 | name => imp(name), |
| 720 | } |
| 721 | } |
| 722 | |
| 723 | /// Returns the Unicode HIR class corresponding to the given grapheme cluster |
| 724 | /// break property. |
| 725 | /// |
| 726 | /// Name canonicalization is assumed to be performed by the caller. |
| 727 | /// |
| 728 | /// If the given property could not be found, or if the corresponding data is |
| 729 | /// not available, then an error is returned. |
| 730 | fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
| 731 | #[cfg(not(feature = "unicode-segment"))] |
| 732 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
| 733 | Err(Error::PropertyNotFound) |
| 734 | } |
| 735 | |
| 736 | #[cfg(feature = "unicode-segment")] |
| 737 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
| 738 | use unicode_tables::grapheme_cluster_break::BY_NAME; |
| 739 | property_set(BY_NAME, name) |
| 740 | .map(hir_class) |
| 741 | .ok_or(Error::PropertyValueNotFound) |
| 742 | } |
| 743 | |
| 744 | imp(canonical_name) |
| 745 | } |
| 746 | |
| 747 | /// Returns the Unicode HIR class corresponding to the given word break |
| 748 | /// property. |
| 749 | /// |
| 750 | /// Name canonicalization is assumed to be performed by the caller. |
| 751 | /// |
| 752 | /// If the given property could not be found, or if the corresponding data is |
| 753 | /// not available, then an error is returned. |
| 754 | fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
| 755 | #[cfg(not(feature = "unicode-segment"))] |
| 756 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
| 757 | Err(Error::PropertyNotFound) |
| 758 | } |
| 759 | |
| 760 | #[cfg(feature = "unicode-segment")] |
| 761 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
| 762 | use unicode_tables::word_break::BY_NAME; |
| 763 | property_set(BY_NAME, name) |
| 764 | .map(hir_class) |
| 765 | .ok_or(Error::PropertyValueNotFound) |
| 766 | } |
| 767 | |
| 768 | imp(canonical_name) |
| 769 | } |
| 770 | |
| 771 | /// Returns the Unicode HIR class corresponding to the given sentence |
| 772 | /// break property. |
| 773 | /// |
| 774 | /// Name canonicalization is assumed to be performed by the caller. |
| 775 | /// |
| 776 | /// If the given property could not be found, or if the corresponding data is |
| 777 | /// not available, then an error is returned. |
| 778 | fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
| 779 | #[cfg(not(feature = "unicode-segment"))] |
| 780 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
| 781 | Err(Error::PropertyNotFound) |
| 782 | } |
| 783 | |
| 784 | #[cfg(feature = "unicode-segment")] |
| 785 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
| 786 | use unicode_tables::sentence_break::BY_NAME; |
| 787 | property_set(BY_NAME, name) |
| 788 | .map(hir_class) |
| 789 | .ok_or(Error::PropertyValueNotFound) |
| 790 | } |
| 791 | |
| 792 | imp(canonical_name) |
| 793 | } |
| 794 | |
| 795 | /// Like symbolic_name_normalize_bytes, but operates on a string. |
| 796 | fn symbolic_name_normalize(x: &str) -> String { |
| 797 | let mut tmp = x.as_bytes().to_vec(); |
| 798 | let len = symbolic_name_normalize_bytes(&mut tmp).len(); |
| 799 | tmp.truncate(len); |
| 800 | // This should always succeed because `symbolic_name_normalize_bytes` |
| 801 | // guarantees that `&tmp[..len]` is always valid UTF-8. |
| 802 | // |
| 803 | // N.B. We could avoid the additional UTF-8 check here, but it's unlikely |
| 804 | // to be worth skipping the additional safety check. A benchmark must |
| 805 | // justify it first. |
| 806 | String::from_utf8(tmp).unwrap() |
| 807 | } |
| 808 | |
/// Normalizes a symbolic name in place according to UAX44-LM3 and returns
/// the normalized prefix of `slice`.
///
/// A "symbolic name" is typically a property name or a property value
/// alias. It should not be applied to property string values.
///
/// Normalization drops a leading `is` (in any letter case), drops spaces,
/// underscores and hyphens, drops every non-ASCII byte and lowercases ASCII
/// letters.
///
/// The returned slice is valid UTF-8 for every possible input, because it
/// only ever contains ASCII bytes.
///
/// See: http://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // Nothing in the standard seems to say that property names/aliases have
    // a particular structure (unlike character names), so they are assumed
    // to be ASCII only and anything that isn't ASCII is dropped.
    let starts_with_is =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    // An "is" prefix is ignored, so reading begins after it.
    let first = if starts_with_is { 2 } else { 0 };

    let mut written = 0;
    for read in first..slice.len() {
        let byte = slice[read];
        match byte {
            b' ' | b'_' | b'-' => {}
            // VALIDITY ARGUMENT: only ASCII bytes are ever written back, so
            // the written prefix is always valid UTF-8.
            _ if byte.is_ascii() => {
                slice[written] = byte.to_ascii_lowercase();
                written += 1;
            }
            _ => {}
        }
    }
    // Special case: ISO_Comment is abbreviated 'isc'. Stripping the 'is'
    // prefix would reduce that abbreviation to 'c', which is an alias for
    // the 'Other' general category, so the full abbreviation is restored.
    if starts_with_is && written == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        written = 3;
    }
    &mut slice[..written]
}
| 863 | |
#[cfg(test)]
mod tests {
    use super::{
        contains_simple_case_mapping, simple_fold, symbolic_name_normalize,
        symbolic_name_normalize_bytes,
    };

    /// Unwraps both result layers of `simple_fold` and returns the iterator
    /// of case folding equivalents for `c`. Panics if either layer is an
    /// error.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        simple_fold(c).unwrap().unwrap()
    }

    /// Returns the payload of the inner `Err` produced by `simple_fold` for
    /// `c`. Panics if `simple_fold` yields an iterator instead.
    ///
    /// Judging by the expectations in the tests below, the payload is the
    /// next codepoint after `c` that has a case mapping, if there is one.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_err(c: char) -> Option<char> {
        match simple_fold(c).unwrap() {
            Ok(_) => unreachable!("simple_fold returned Ok iterator"),
            Err(next) => next,
        }
    }

    /// Shorthand for `contains_simple_case_mapping` that panics on error.
    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        contains_simple_case_mapping(start, end).unwrap()
    }

    /// 'k', 'K' and the KELVIN SIGN (U+212A) form one simple case folding
    /// class, so folding any one of them must yield the other two.
    ///
    /// The Kelvin sign is written as an escape on purpose: the literal glyph
    /// is visually identical to ASCII 'K' and is turned into it by Unicode
    /// compatibility normalization, which would silently break this test.
    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, vec!['K', '\u{212A}']);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, vec!['k', '\u{212A}']);

        let xs: Vec<char> = simple_fold_ok('\u{212A}').collect();
        assert_eq!(xs, vec!['K', 'k']);
    }

    /// A plain ASCII letter folds only to its opposite-case counterpart.
    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, vec!['a']);
    }

    /// Codepoints without a case mapping produce the inner error, whose
    /// payload is checked against the following cased codepoint.
    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_empty() {
        assert_eq!(Some('A'), simple_fold_err('?'));
        assert_eq!(Some('A'), simple_fold_err('@'));
        assert_eq!(Some('a'), simple_fold_err('['));
        assert_eq!(Some('Ⰰ'), simple_fold_err('☃'));
    }

    /// At the very end of the codepoint space there is nothing left to
    /// report, so the payload is `None`.
    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_max() {
        assert_eq!(None, simple_fold_err('\u{10FFFE}'));
        assert_eq!(None, simple_fold_err('\u{10FFFF}'));
    }

    /// Without the case folding tables, `simple_fold` must report an error.
    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(simple_fold('a').is_err());
    }

    /// Checks range queries that do and do not overlap a codepoint with a
    /// simple case mapping, including single-codepoint ranges.
    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    /// Without the case folding tables, the range query must report an
    /// error.
    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn range_contains_disabled() {
        assert!(contains_simple_case_mapping('a', 'a').is_err());
    }

    /// The one-letter class 'C' must canonicalize to the 'Other' general
    /// category. Presumably named after the issue it guards against.
    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    /// Loose matching: case, separators and a leading "is" are ignored,
    /// except that 'isc' is preserved in full.
    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    /// Bytes that are not ASCII are dropped, so the output stays valid
    /// UTF-8 even for invalid input.
    #[test]
    fn valid_utf8_symbolic() {
        let mut x = b"abc\xFFxyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}