Chih-Hung Hsieh | 048fc04 | 2020-04-16 10:44:22 -0700 | [diff] [blame] | 1 | use std::error; |
| 2 | use std::fmt; |
| 3 | use std::result; |
| 4 | |
| 5 | use hir; |
| 6 | |
| 7 | /// A type alias for errors specific to Unicode handling of classes. |
| 8 | pub type Result<T> = result::Result<T, Error>; |
| 9 | |
/// A table of inclusive `(start, end)` codepoint ranges. The tables come from
/// a generated file, which is why the slice has a static lifetime.
type Range = &'static [(char, char)];
| 13 | |
/// The ways in which looking up Unicode data can fail.
///
/// The `Error` trait is intentionally not implemented for this type: it is
/// not exported, and its values always get converted into other public
/// errors.
#[derive(Debug)]
pub enum Error {
    /// The requested property could not be found (or its data is not
    /// available).
    PropertyNotFound,
    /// The requested property value could not be found (or its data is not
    /// available).
    PropertyValueNotFound,
    /// The data for a Perl character class is not available.
    // Not used when unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}
| 26 | |
| 27 | /// A type alias for errors specific to Unicode case folding. |
| 28 | pub type FoldResult<T> = result::Result<T, CaseFoldError>; |
| 29 | |
/// The error reported when Unicode-aware simple case folding cannot be
/// performed.
///
/// Case folding needs the Unicode case mapping tables. Those tables are only
/// missing when the `unicode-case` feature is disabled, so that is the only
/// situation in which this error occurs. (The feature is enabled by default.)
#[derive(Debug)]
pub struct CaseFoldError(());

impl fmt::Display for CaseFoldError {
    /// Writes a fixed, human readable explanation of the failure.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)",
        )
    }
}

impl error::Error for CaseFoldError {}
| 49 | |
/// The error reported when the Unicode-aware `\w` class cannot be used.
///
/// The Unicode-aware Perl class `\w` needs its data tables. Those tables are
/// only missing when the `unicode-perl` feature is disabled, so that is the
/// only situation in which this error occurs. (The feature is enabled by
/// default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

impl fmt::Display for UnicodeWordError {
    /// Writes a fixed, human readable explanation of the failure.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)",
        )
    }
}

impl error::Error for UnicodeWordError {}
| 69 | |
| 70 | /// Return an iterator over the equivalence class of simple case mappings |
| 71 | /// for the given codepoint. The equivalence class does not include the |
| 72 | /// given codepoint. |
| 73 | /// |
| 74 | /// If the equivalence class is empty, then this returns the next scalar |
| 75 | /// value that has a non-empty equivalence class, if it exists. If no such |
| 76 | /// scalar value exists, then `None` is returned. The point of this behavior |
| 77 | /// is to permit callers to avoid calling `simple_fold` more than they need |
| 78 | /// to, since there is some cost to fetching the equivalence class. |
| 79 | /// |
| 80 | /// This returns an error if the Unicode case folding tables are not available. |
| 81 | pub fn simple_fold( |
| 82 | c: char, |
| 83 | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> { |
| 84 | #[cfg(not(feature = "unicode-case"))] |
| 85 | fn imp( |
| 86 | _: char, |
| 87 | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> |
| 88 | { |
| 89 | use std::option::IntoIter; |
| 90 | Err::<result::Result<IntoIter<char>, _>, _>(CaseFoldError(())) |
| 91 | } |
| 92 | |
| 93 | #[cfg(feature = "unicode-case")] |
| 94 | fn imp( |
| 95 | c: char, |
| 96 | ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> |
| 97 | { |
| 98 | use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; |
| 99 | |
| 100 | Ok(CASE_FOLDING_SIMPLE |
| 101 | .binary_search_by_key(&c, |&(c1, _)| c1) |
| 102 | .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().map(|&c| c)) |
| 103 | .map_err(|i| { |
| 104 | if i >= CASE_FOLDING_SIMPLE.len() { |
| 105 | None |
| 106 | } else { |
| 107 | Some(CASE_FOLDING_SIMPLE[i].0) |
| 108 | } |
| 109 | })) |
| 110 | } |
| 111 | |
| 112 | imp(c) |
| 113 | } |
| 114 | |
| 115 | /// Returns true if and only if the given (inclusive) range contains at least |
| 116 | /// one Unicode scalar value that has a non-empty non-trivial simple case |
| 117 | /// mapping. |
| 118 | /// |
| 119 | /// This function panics if `end < start`. |
| 120 | /// |
| 121 | /// This returns an error if the Unicode case folding tables are not available. |
| 122 | pub fn contains_simple_case_mapping( |
| 123 | start: char, |
| 124 | end: char, |
| 125 | ) -> FoldResult<bool> { |
| 126 | #[cfg(not(feature = "unicode-case"))] |
| 127 | fn imp(_: char, _: char) -> FoldResult<bool> { |
| 128 | Err(CaseFoldError(())) |
| 129 | } |
| 130 | |
| 131 | #[cfg(feature = "unicode-case")] |
| 132 | fn imp(start: char, end: char) -> FoldResult<bool> { |
| 133 | use std::cmp::Ordering; |
| 134 | use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; |
| 135 | |
| 136 | assert!(start <= end); |
| 137 | Ok(CASE_FOLDING_SIMPLE |
| 138 | .binary_search_by(|&(c, _)| { |
| 139 | if start <= c && c <= end { |
| 140 | Ordering::Equal |
| 141 | } else if c > end { |
| 142 | Ordering::Greater |
| 143 | } else { |
| 144 | Ordering::Less |
| 145 | } |
| 146 | }) |
| 147 | .is_ok()) |
| 148 | } |
| 149 | |
| 150 | imp(start, end) |
| 151 | } |
| 152 | |
/// A request for a character class defined by Unicode.
///
/// A class can be requested either by naming a property directly or by
/// giving a property name together with one of its values. Naming a property
/// directly generally refers to Binary properties (see UTS#44, Table 8).
/// As a special exception (see UTS#18, Section 1.2), general categories (an
/// enumeration) and scripts (a catalog) may also be named directly, as if
/// each of their possible values were a binary property.
///
/// Property names and values are always normalized and canonicalized, so
/// `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` is the shorter of the lifetimes of the property name
/// and the property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// Request the class for a Unicode binary property that is named by a
    /// single letter.
    OneLetter(char),
    /// Request the class for a Unicode binary property.
    ///
    /// By special exception (see UTS#18, Section 1.2), general category
    /// values and script values are also accepted here, as if they were
    /// binary properties.
    Binary(&'a str),
    /// Request the class of all codepoints whose property (named by
    /// `property_name`) has the given value (named by `property_value`).
    ByValue {
        /// The name of the property.
        property_name: &'a str,
        /// The value of the property.
        property_value: &'a str,
    },
}
| 186 | |
| 187 | impl<'a> ClassQuery<'a> { |
| 188 | fn canonicalize(&self) -> Result<CanonicalClassQuery> { |
| 189 | match *self { |
| 190 | ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), |
| 191 | ClassQuery::Binary(name) => self.canonical_binary(name), |
| 192 | ClassQuery::ByValue { property_name, property_value } => { |
| 193 | let property_name = symbolic_name_normalize(property_name); |
| 194 | let property_value = symbolic_name_normalize(property_value); |
| 195 | |
| 196 | let canon_name = match canonical_prop(&property_name)? { |
| 197 | None => return Err(Error::PropertyNotFound), |
| 198 | Some(canon_name) => canon_name, |
| 199 | }; |
| 200 | Ok(match canon_name { |
| 201 | "General_Category" => { |
| 202 | let canon = match canonical_gencat(&property_value)? { |
| 203 | None => return Err(Error::PropertyValueNotFound), |
| 204 | Some(canon) => canon, |
| 205 | }; |
| 206 | CanonicalClassQuery::GeneralCategory(canon) |
| 207 | } |
| 208 | "Script" => { |
| 209 | let canon = match canonical_script(&property_value)? { |
| 210 | None => return Err(Error::PropertyValueNotFound), |
| 211 | Some(canon) => canon, |
| 212 | }; |
| 213 | CanonicalClassQuery::Script(canon) |
| 214 | } |
| 215 | _ => { |
| 216 | let vals = match property_values(canon_name)? { |
| 217 | None => return Err(Error::PropertyValueNotFound), |
| 218 | Some(vals) => vals, |
| 219 | }; |
| 220 | let canon_val = |
| 221 | match canonical_value(vals, &property_value) { |
| 222 | None => { |
| 223 | return Err(Error::PropertyValueNotFound) |
| 224 | } |
| 225 | Some(canon_val) => canon_val, |
| 226 | }; |
| 227 | CanonicalClassQuery::ByValue { |
| 228 | property_name: canon_name, |
| 229 | property_value: canon_val, |
| 230 | } |
| 231 | } |
| 232 | }) |
| 233 | } |
| 234 | } |
| 235 | } |
| 236 | |
| 237 | fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> { |
| 238 | let norm = symbolic_name_normalize(name); |
| 239 | |
Chih-Hung Hsieh | 31dfd7f | 2020-10-26 13:16:58 -0700 | [diff] [blame] | 240 | // This is a special case where 'cf' refers to the 'Format' general |
| 241 | // category, but where the 'cf' abbreviation is also an abbreviation |
| 242 | // for the 'Case_Folding' property. But we want to treat it as |
| 243 | // a general category. (Currently, we don't even support the |
| 244 | // 'Case_Folding' property. But if we do in the future, users will be |
| 245 | // required to spell it out.) |
| 246 | if norm != "cf" { |
| 247 | if let Some(canon) = canonical_prop(&norm)? { |
| 248 | return Ok(CanonicalClassQuery::Binary(canon)); |
| 249 | } |
Chih-Hung Hsieh | 048fc04 | 2020-04-16 10:44:22 -0700 | [diff] [blame] | 250 | } |
| 251 | if let Some(canon) = canonical_gencat(&norm)? { |
| 252 | return Ok(CanonicalClassQuery::GeneralCategory(canon)); |
| 253 | } |
| 254 | if let Some(canon) = canonical_script(&norm)? { |
| 255 | return Ok(CanonicalClassQuery::Script(canon)); |
| 256 | } |
| 257 | Err(Error::PropertyNotFound) |
| 258 | } |
| 259 | } |
| 260 | |
/// The canonicalized counterpart of `ClassQuery`.
///
/// Unlike `ClassQuery`, this keeps binary properties apart from the general
/// categories and scripts that were flattened into them.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// A binary property, by canonical name.
    Binary(&'static str),
    /// A general category, by canonical name.
    GeneralCategory(&'static str),
    /// A script, by canonical name.
    Script(&'static str),
    /// Any other pairing of a property with one of its values, where both
    /// have been canonicalized.
    ///
    /// By construction, the property name here is never General_Category or
    /// Script. Those two cases are covered by the variants of the same name.
    ByValue {
        /// The canonical name of the property.
        property_name: &'static str,
        /// The canonical value of the property.
        property_value: &'static str,
    },
}
| 285 | |
| 286 | /// Looks up a Unicode class given a query. If one doesn't exist, then |
| 287 | /// `None` is returned. |
| 288 | pub fn class(query: ClassQuery) -> Result<hir::ClassUnicode> { |
| 289 | use self::CanonicalClassQuery::*; |
| 290 | |
| 291 | match query.canonicalize()? { |
| 292 | Binary(name) => bool_property(name), |
| 293 | GeneralCategory(name) => gencat(name), |
| 294 | Script(name) => script(name), |
| 295 | ByValue { property_name: "Age", property_value } => { |
| 296 | let mut class = hir::ClassUnicode::empty(); |
| 297 | for set in ages(property_value)? { |
| 298 | class.union(&hir_class(set)); |
| 299 | } |
| 300 | Ok(class) |
| 301 | } |
| 302 | ByValue { property_name: "Script_Extensions", property_value } => { |
| 303 | script_extension(property_value) |
| 304 | } |
| 305 | ByValue { |
| 306 | property_name: "Grapheme_Cluster_Break", |
| 307 | property_value, |
| 308 | } => gcb(property_value), |
| 309 | ByValue { property_name: "Sentence_Break", property_value } => { |
| 310 | sb(property_value) |
| 311 | } |
| 312 | ByValue { property_name: "Word_Break", property_value } => { |
| 313 | wb(property_value) |
| 314 | } |
| 315 | _ => { |
| 316 | // What else should we support? |
| 317 | Err(Error::PropertyNotFound) |
| 318 | } |
| 319 | } |
| 320 | } |
| 321 | |
| 322 | /// Returns a Unicode aware class for \w. |
| 323 | /// |
| 324 | /// This returns an error if the data is not available for \w. |
| 325 | pub fn perl_word() -> Result<hir::ClassUnicode> { |
| 326 | #[cfg(not(feature = "unicode-perl"))] |
| 327 | fn imp() -> Result<hir::ClassUnicode> { |
| 328 | Err(Error::PerlClassNotFound) |
| 329 | } |
| 330 | |
| 331 | #[cfg(feature = "unicode-perl")] |
| 332 | fn imp() -> Result<hir::ClassUnicode> { |
| 333 | use unicode_tables::perl_word::PERL_WORD; |
| 334 | Ok(hir_class(PERL_WORD)) |
| 335 | } |
| 336 | |
| 337 | imp() |
| 338 | } |
| 339 | |
| 340 | /// Returns a Unicode aware class for \s. |
| 341 | /// |
| 342 | /// This returns an error if the data is not available for \s. |
| 343 | pub fn perl_space() -> Result<hir::ClassUnicode> { |
| 344 | #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))] |
| 345 | fn imp() -> Result<hir::ClassUnicode> { |
| 346 | Err(Error::PerlClassNotFound) |
| 347 | } |
| 348 | |
| 349 | #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] |
| 350 | fn imp() -> Result<hir::ClassUnicode> { |
| 351 | use unicode_tables::perl_space::WHITE_SPACE; |
| 352 | Ok(hir_class(WHITE_SPACE)) |
| 353 | } |
| 354 | |
| 355 | #[cfg(feature = "unicode-bool")] |
| 356 | fn imp() -> Result<hir::ClassUnicode> { |
| 357 | use unicode_tables::property_bool::WHITE_SPACE; |
| 358 | Ok(hir_class(WHITE_SPACE)) |
| 359 | } |
| 360 | |
| 361 | imp() |
| 362 | } |
| 363 | |
| 364 | /// Returns a Unicode aware class for \d. |
| 365 | /// |
| 366 | /// This returns an error if the data is not available for \d. |
| 367 | pub fn perl_digit() -> Result<hir::ClassUnicode> { |
| 368 | #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))] |
| 369 | fn imp() -> Result<hir::ClassUnicode> { |
| 370 | Err(Error::PerlClassNotFound) |
| 371 | } |
| 372 | |
| 373 | #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] |
| 374 | fn imp() -> Result<hir::ClassUnicode> { |
| 375 | use unicode_tables::perl_decimal::DECIMAL_NUMBER; |
| 376 | Ok(hir_class(DECIMAL_NUMBER)) |
| 377 | } |
| 378 | |
| 379 | #[cfg(feature = "unicode-gencat")] |
| 380 | fn imp() -> Result<hir::ClassUnicode> { |
| 381 | use unicode_tables::general_category::DECIMAL_NUMBER; |
| 382 | Ok(hir_class(DECIMAL_NUMBER)) |
| 383 | } |
| 384 | |
| 385 | imp() |
| 386 | } |
| 387 | |
| 388 | /// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. |
| 389 | pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { |
| 390 | let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges |
| 391 | .iter() |
| 392 | .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) |
| 393 | .collect(); |
| 394 | hir::ClassUnicode::new(hir_ranges) |
| 395 | } |
| 396 | |
| 397 | /// Returns true only if the given codepoint is in the `\w` character class. |
| 398 | /// |
| 399 | /// If the `unicode-perl` feature is not enabled, then this returns an error. |
| 400 | pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> { |
| 401 | #[cfg(not(feature = "unicode-perl"))] |
| 402 | fn imp(_: char) -> result::Result<bool, UnicodeWordError> { |
| 403 | Err(UnicodeWordError(())) |
| 404 | } |
| 405 | |
| 406 | #[cfg(feature = "unicode-perl")] |
| 407 | fn imp(c: char) -> result::Result<bool, UnicodeWordError> { |
| 408 | use is_word_byte; |
| 409 | use std::cmp::Ordering; |
| 410 | use unicode_tables::perl_word::PERL_WORD; |
| 411 | |
| 412 | if c <= 0x7F as char && is_word_byte(c as u8) { |
| 413 | return Ok(true); |
| 414 | } |
| 415 | Ok(PERL_WORD |
| 416 | .binary_search_by(|&(start, end)| { |
| 417 | if start <= c && c <= end { |
| 418 | Ordering::Equal |
| 419 | } else if start > c { |
| 420 | Ordering::Greater |
| 421 | } else { |
| 422 | Ordering::Less |
| 423 | } |
| 424 | }) |
| 425 | .is_ok()) |
| 426 | } |
| 427 | |
| 428 | imp(c) |
| 429 | } |
| 430 | |
/// The table of values that a single property can take.
///
/// Each entry is a pair: a normalized property value first, followed by the
/// canonical property value it corresponds to.
type PropertyValues = &'static [(&'static str, &'static str)];
| 437 | |
| 438 | fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> { |
| 439 | Ok(match normalized_value { |
| 440 | "any" => Some("Any"), |
| 441 | "assigned" => Some("Assigned"), |
| 442 | "ascii" => Some("ASCII"), |
| 443 | _ => { |
| 444 | let gencats = property_values("General_Category")?.unwrap(); |
| 445 | canonical_value(gencats, normalized_value) |
| 446 | } |
| 447 | }) |
| 448 | } |
| 449 | |
| 450 | fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> { |
| 451 | let scripts = property_values("Script")?.unwrap(); |
| 452 | Ok(canonical_value(scripts, normalized_value)) |
| 453 | } |
| 454 | |
| 455 | /// Find the canonical property name for the given normalized property name. |
| 456 | /// |
| 457 | /// If no such property exists, then `None` is returned. |
| 458 | /// |
| 459 | /// The normalized property name must have been normalized according to |
| 460 | /// UAX44 LM3, which can be done using `symbolic_name_normalize`. |
| 461 | /// |
| 462 | /// If the property names data is not available, then an error is returned. |
| 463 | fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> { |
| 464 | #[cfg(not(any( |
| 465 | feature = "unicode-age", |
| 466 | feature = "unicode-bool", |
| 467 | feature = "unicode-gencat", |
| 468 | feature = "unicode-perl", |
| 469 | feature = "unicode-script", |
| 470 | feature = "unicode-segment", |
| 471 | )))] |
| 472 | fn imp(_: &str) -> Result<Option<&'static str>> { |
| 473 | Err(Error::PropertyNotFound) |
| 474 | } |
| 475 | |
| 476 | #[cfg(any( |
| 477 | feature = "unicode-age", |
| 478 | feature = "unicode-bool", |
| 479 | feature = "unicode-gencat", |
| 480 | feature = "unicode-perl", |
| 481 | feature = "unicode-script", |
| 482 | feature = "unicode-segment", |
| 483 | ))] |
| 484 | fn imp(name: &str) -> Result<Option<&'static str>> { |
| 485 | use unicode_tables::property_names::PROPERTY_NAMES; |
| 486 | |
| 487 | Ok(PROPERTY_NAMES |
| 488 | .binary_search_by_key(&name, |&(n, _)| n) |
| 489 | .ok() |
| 490 | .map(|i| PROPERTY_NAMES[i].1)) |
| 491 | } |
| 492 | |
| 493 | imp(normalized_name) |
| 494 | } |
| 495 | |
| 496 | /// Find the canonical property value for the given normalized property |
| 497 | /// value. |
| 498 | /// |
| 499 | /// The given property values should correspond to the values for the property |
| 500 | /// under question, which can be found using `property_values`. |
| 501 | /// |
| 502 | /// If no such property value exists, then `None` is returned. |
| 503 | /// |
| 504 | /// The normalized property value must have been normalized according to |
| 505 | /// UAX44 LM3, which can be done using `symbolic_name_normalize`. |
| 506 | fn canonical_value( |
| 507 | vals: PropertyValues, |
| 508 | normalized_value: &str, |
| 509 | ) -> Option<&'static str> { |
| 510 | vals.binary_search_by_key(&normalized_value, |&(n, _)| n) |
| 511 | .ok() |
| 512 | .map(|i| vals[i].1) |
| 513 | } |
| 514 | |
| 515 | /// Return the table of property values for the given property name. |
| 516 | /// |
| 517 | /// If the property values data is not available, then an error is returned. |
| 518 | fn property_values( |
| 519 | canonical_property_name: &'static str, |
| 520 | ) -> Result<Option<PropertyValues>> { |
| 521 | #[cfg(not(any( |
| 522 | feature = "unicode-age", |
| 523 | feature = "unicode-bool", |
| 524 | feature = "unicode-gencat", |
| 525 | feature = "unicode-perl", |
| 526 | feature = "unicode-script", |
| 527 | feature = "unicode-segment", |
| 528 | )))] |
| 529 | fn imp(_: &'static str) -> Result<Option<PropertyValues>> { |
| 530 | Err(Error::PropertyValueNotFound) |
| 531 | } |
| 532 | |
| 533 | #[cfg(any( |
| 534 | feature = "unicode-age", |
| 535 | feature = "unicode-bool", |
| 536 | feature = "unicode-gencat", |
| 537 | feature = "unicode-perl", |
| 538 | feature = "unicode-script", |
| 539 | feature = "unicode-segment", |
| 540 | ))] |
| 541 | fn imp(name: &'static str) -> Result<Option<PropertyValues>> { |
| 542 | use unicode_tables::property_values::PROPERTY_VALUES; |
| 543 | |
| 544 | Ok(PROPERTY_VALUES |
| 545 | .binary_search_by_key(&name, |&(n, _)| n) |
| 546 | .ok() |
| 547 | .map(|i| PROPERTY_VALUES[i].1)) |
| 548 | } |
| 549 | |
| 550 | imp(canonical_property_name) |
| 551 | } |
| 552 | |
| 553 | // This is only used in some cases, but small enough to just let it be dead |
| 554 | // instead of figuring out (and maintaining) the right set of features. |
| 555 | #[allow(dead_code)] |
| 556 | fn property_set( |
| 557 | name_map: &'static [(&'static str, Range)], |
| 558 | canonical: &'static str, |
| 559 | ) -> Option<Range> { |
| 560 | name_map |
| 561 | .binary_search_by_key(&canonical, |x| x.0) |
| 562 | .ok() |
| 563 | .map(|i| name_map[i].1) |
| 564 | } |
| 565 | |
| 566 | /// Returns an iterator over Unicode Age sets. Each item corresponds to a set |
| 567 | /// of codepoints that were added in a particular revision of Unicode. The |
| 568 | /// iterator yields items in chronological order. |
| 569 | /// |
| 570 | /// If the given age value isn't valid or if the data isn't available, then an |
| 571 | /// error is returned instead. |
| 572 | fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> { |
| 573 | #[cfg(not(feature = "unicode-age"))] |
| 574 | fn imp(_: &str) -> Result<impl Iterator<Item = Range>> { |
| 575 | use std::option::IntoIter; |
| 576 | Err::<IntoIter<Range>, _>(Error::PropertyNotFound) |
| 577 | } |
| 578 | |
| 579 | #[cfg(feature = "unicode-age")] |
| 580 | fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> { |
| 581 | use unicode_tables::age; |
| 582 | |
| 583 | const AGES: &'static [(&'static str, Range)] = &[ |
| 584 | ("V1_1", age::V1_1), |
| 585 | ("V2_0", age::V2_0), |
| 586 | ("V2_1", age::V2_1), |
| 587 | ("V3_0", age::V3_0), |
| 588 | ("V3_1", age::V3_1), |
| 589 | ("V3_2", age::V3_2), |
| 590 | ("V4_0", age::V4_0), |
| 591 | ("V4_1", age::V4_1), |
| 592 | ("V5_0", age::V5_0), |
| 593 | ("V5_1", age::V5_1), |
| 594 | ("V5_2", age::V5_2), |
| 595 | ("V6_0", age::V6_0), |
| 596 | ("V6_1", age::V6_1), |
| 597 | ("V6_2", age::V6_2), |
| 598 | ("V6_3", age::V6_3), |
| 599 | ("V7_0", age::V7_0), |
| 600 | ("V8_0", age::V8_0), |
| 601 | ("V9_0", age::V9_0), |
| 602 | ("V10_0", age::V10_0), |
| 603 | ("V11_0", age::V11_0), |
| 604 | ("V12_0", age::V12_0), |
| 605 | ("V12_1", age::V12_1), |
| 606 | ("V13_0", age::V13_0), |
| 607 | ]; |
| 608 | assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); |
| 609 | |
| 610 | let pos = AGES.iter().position(|&(age, _)| canonical_age == age); |
| 611 | match pos { |
| 612 | None => Err(Error::PropertyValueNotFound), |
| 613 | Some(i) => Ok(AGES[..i + 1].iter().map(|&(_, classes)| classes)), |
| 614 | } |
| 615 | } |
| 616 | |
| 617 | imp(canonical_age) |
| 618 | } |
| 619 | |
| 620 | /// Returns the Unicode HIR class corresponding to the given general category. |
| 621 | /// |
| 622 | /// Name canonicalization is assumed to be performed by the caller. |
| 623 | /// |
| 624 | /// If the given general category could not be found, or if the general |
| 625 | /// category data is not available, then an error is returned. |
| 626 | fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
| 627 | #[cfg(not(feature = "unicode-gencat"))] |
| 628 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
| 629 | Err(Error::PropertyNotFound) |
| 630 | } |
| 631 | |
| 632 | #[cfg(feature = "unicode-gencat")] |
| 633 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
| 634 | use unicode_tables::general_category::BY_NAME; |
| 635 | match name { |
| 636 | "ASCII" => Ok(hir_class(&[('\0', '\x7F')])), |
| 637 | "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])), |
| 638 | "Assigned" => { |
| 639 | let mut cls = gencat("Unassigned")?; |
| 640 | cls.negate(); |
| 641 | Ok(cls) |
| 642 | } |
| 643 | name => property_set(BY_NAME, name) |
| 644 | .map(hir_class) |
| 645 | .ok_or(Error::PropertyValueNotFound), |
| 646 | } |
| 647 | } |
| 648 | |
| 649 | match canonical_name { |
| 650 | "Decimal_Number" => perl_digit(), |
| 651 | name => imp(name), |
| 652 | } |
| 653 | } |
| 654 | |
| 655 | /// Returns the Unicode HIR class corresponding to the given script. |
| 656 | /// |
| 657 | /// Name canonicalization is assumed to be performed by the caller. |
| 658 | /// |
| 659 | /// If the given script could not be found, or if the script data is not |
| 660 | /// available, then an error is returned. |
| 661 | fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
| 662 | #[cfg(not(feature = "unicode-script"))] |
| 663 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
| 664 | Err(Error::PropertyNotFound) |
| 665 | } |
| 666 | |
| 667 | #[cfg(feature = "unicode-script")] |
| 668 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
| 669 | use unicode_tables::script::BY_NAME; |
| 670 | property_set(BY_NAME, name) |
| 671 | .map(hir_class) |
| 672 | .ok_or(Error::PropertyValueNotFound) |
| 673 | } |
| 674 | |
| 675 | imp(canonical_name) |
| 676 | } |
| 677 | |
| 678 | /// Returns the Unicode HIR class corresponding to the given script extension. |
| 679 | /// |
| 680 | /// Name canonicalization is assumed to be performed by the caller. |
| 681 | /// |
| 682 | /// If the given script extension could not be found, or if the script data is |
| 683 | /// not available, then an error is returned. |
| 684 | fn script_extension( |
| 685 | canonical_name: &'static str, |
| 686 | ) -> Result<hir::ClassUnicode> { |
| 687 | #[cfg(not(feature = "unicode-script"))] |
| 688 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
| 689 | Err(Error::PropertyNotFound) |
| 690 | } |
| 691 | |
| 692 | #[cfg(feature = "unicode-script")] |
| 693 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
| 694 | use unicode_tables::script_extension::BY_NAME; |
| 695 | property_set(BY_NAME, name) |
| 696 | .map(hir_class) |
| 697 | .ok_or(Error::PropertyValueNotFound) |
| 698 | } |
| 699 | |
| 700 | imp(canonical_name) |
| 701 | } |
| 702 | |
| 703 | /// Returns the Unicode HIR class corresponding to the given Unicode boolean |
| 704 | /// property. |
| 705 | /// |
| 706 | /// Name canonicalization is assumed to be performed by the caller. |
| 707 | /// |
| 708 | /// If the given boolean property could not be found, or if the boolean |
| 709 | /// property data is not available, then an error is returned. |
| 710 | fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
| 711 | #[cfg(not(feature = "unicode-bool"))] |
| 712 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
| 713 | Err(Error::PropertyNotFound) |
| 714 | } |
| 715 | |
| 716 | #[cfg(feature = "unicode-bool")] |
| 717 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
| 718 | use unicode_tables::property_bool::BY_NAME; |
| 719 | property_set(BY_NAME, name) |
| 720 | .map(hir_class) |
| 721 | .ok_or(Error::PropertyNotFound) |
| 722 | } |
| 723 | |
| 724 | match canonical_name { |
| 725 | "Decimal_Number" => perl_digit(), |
| 726 | "White_Space" => perl_space(), |
| 727 | name => imp(name), |
| 728 | } |
| 729 | } |
| 730 | |
| 731 | /// Returns the Unicode HIR class corresponding to the given grapheme cluster |
| 732 | /// break property. |
| 733 | /// |
| 734 | /// Name canonicalization is assumed to be performed by the caller. |
| 735 | /// |
| 736 | /// If the given property could not be found, or if the corresponding data is |
| 737 | /// not available, then an error is returned. |
| 738 | fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
| 739 | #[cfg(not(feature = "unicode-segment"))] |
| 740 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
| 741 | Err(Error::PropertyNotFound) |
| 742 | } |
| 743 | |
| 744 | #[cfg(feature = "unicode-segment")] |
| 745 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
| 746 | use unicode_tables::grapheme_cluster_break::BY_NAME; |
| 747 | property_set(BY_NAME, name) |
| 748 | .map(hir_class) |
| 749 | .ok_or(Error::PropertyValueNotFound) |
| 750 | } |
| 751 | |
| 752 | imp(canonical_name) |
| 753 | } |
| 754 | |
| 755 | /// Returns the Unicode HIR class corresponding to the given word break |
| 756 | /// property. |
| 757 | /// |
| 758 | /// Name canonicalization is assumed to be performed by the caller. |
| 759 | /// |
| 760 | /// If the given property could not be found, or if the corresponding data is |
| 761 | /// not available, then an error is returned. |
| 762 | fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
| 763 | #[cfg(not(feature = "unicode-segment"))] |
| 764 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
| 765 | Err(Error::PropertyNotFound) |
| 766 | } |
| 767 | |
| 768 | #[cfg(feature = "unicode-segment")] |
| 769 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
| 770 | use unicode_tables::word_break::BY_NAME; |
| 771 | property_set(BY_NAME, name) |
| 772 | .map(hir_class) |
| 773 | .ok_or(Error::PropertyValueNotFound) |
| 774 | } |
| 775 | |
| 776 | imp(canonical_name) |
| 777 | } |
| 778 | |
| 779 | /// Returns the Unicode HIR class corresponding to the given sentence |
| 780 | /// break property. |
| 781 | /// |
| 782 | /// Name canonicalization is assumed to be performed by the caller. |
| 783 | /// |
| 784 | /// If the given property could not be found, or if the corresponding data is |
| 785 | /// not available, then an error is returned. |
| 786 | fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> { |
| 787 | #[cfg(not(feature = "unicode-segment"))] |
| 788 | fn imp(_: &'static str) -> Result<hir::ClassUnicode> { |
| 789 | Err(Error::PropertyNotFound) |
| 790 | } |
| 791 | |
| 792 | #[cfg(feature = "unicode-segment")] |
| 793 | fn imp(name: &'static str) -> Result<hir::ClassUnicode> { |
| 794 | use unicode_tables::sentence_break::BY_NAME; |
| 795 | property_set(BY_NAME, name) |
| 796 | .map(hir_class) |
| 797 | .ok_or(Error::PropertyValueNotFound) |
| 798 | } |
| 799 | |
| 800 | imp(canonical_name) |
| 801 | } |
| 802 | |
| 803 | /// Like symbolic_name_normalize_bytes, but operates on a string. |
| 804 | fn symbolic_name_normalize(x: &str) -> String { |
| 805 | let mut tmp = x.as_bytes().to_vec(); |
| 806 | let len = symbolic_name_normalize_bytes(&mut tmp).len(); |
| 807 | tmp.truncate(len); |
| 808 | // This should always succeed because `symbolic_name_normalize_bytes` |
| 809 | // guarantees that `&tmp[..len]` is always valid UTF-8. |
| 810 | // |
| 811 | // N.B. We could avoid the additional UTF-8 check here, but it's unlikely |
| 812 | // to be worth skipping the additional safety check. A benchmark must |
| 813 | // justify it first. |
| 814 | String::from_utf8(tmp).unwrap() |
| 815 | } |
| 816 | |
/// Normalizes a symbolic name in place, following UAX44-LM3.
///
/// "Symbolic names" are things like property names and property value
/// aliases. This must not be used on property string values.
///
/// Normalization drops a leading "is" (in any letter case), removes spaces,
/// underscores and hyphens, lowercases ASCII letters and discards every
/// non-ASCII byte. The result is written to the front of `slice`, and the
/// returned sub-slice covers exactly the normalized bytes.
///
/// The returned slice is valid UTF-8 no matter what `slice` contained on
/// entry.
///
/// See: http://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // The standard does not appear to pin down the structure of property
    // names/aliases (unlike character names), so they are assumed to be
    // ASCII only and anything else is dropped.

    // A leading "is" in any combination of cases is ignored.
    let has_is_prefix =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    let first_read = if has_is_prefix { 2 } else { 0 };

    // The write cursor never gets ahead of the read cursor, so compacting
    // within the same buffer cannot clobber bytes that are still unread.
    let mut written = 0;
    for read in first_read..slice.len() {
        // VALIDITY ARGUMENT: only ASCII bytes are ever kept, and every
        // non-ASCII byte is skipped, so the output is valid UTF-8.
        let normalized = match slice[read] {
            b' ' | b'_' | b'-' => continue,
            upper @ b'A'..=b'Z' => upper + (b'a' - b'A'),
            ascii @ 0x00..=0x7F => ascii,
            _ => continue,
        };
        slice[written] = normalized;
        written += 1;
    }

    // Special case: 'isc' is the abbreviation for ISO_Comment. Stripping the
    // 'is' prefix would reduce it to 'c', which is an alias for the 'Other'
    // general category instead, so the full 'isc' is put back.
    //
    // A stripped prefix plus one surviving byte means the input held at
    // least three bytes, so writing three bytes here stays in bounds.
    if has_is_prefix && written == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        written = 3;
    }
    &mut slice[..written]
}
| 871 | |
#[cfg(test)]
mod tests {
    use super::{
        contains_simple_case_mapping, simple_fold, symbolic_name_normalize,
        symbolic_name_normalize_bytes,
    };

    /// U+212A KELVIN SIGN.
    ///
    /// It is spelled as an escape because it renders identically to the
    /// ASCII letter 'K' and does not survive tools that normalize text.
    #[cfg(feature = "unicode-case")]
    const KELVIN_SIGN: char = '\u{212A}';

    /// Unwraps both layers of `simple_fold`'s result and returns the
    /// iterator over the characters that `c` folds to.
    ///
    /// Panics if case folding is unavailable or if `c` has no mapping.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        simple_fold(c).unwrap().unwrap()
    }

    /// Returns the payload of the inner `Err` that `simple_fold` yields for
    /// a character without a mapping.
    ///
    /// Judging by the expectations below, the payload is presumably the next
    /// character that does have a mapping, if any.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_err(c: char) -> Option<char> {
        match simple_fold(c).unwrap() {
            Ok(_) => unreachable!("simple_fold returned Ok iterator"),
            Err(next) => next,
        }
    }

    /// Unwraps `contains_simple_case_mapping` for the range `start..=end`.
    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        contains_simple_case_mapping(start, end).unwrap()
    }

    /// 'k', 'K' and the Kelvin sign all fold to one another.
    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        let xs: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(xs, vec!['K', KELVIN_SIGN]);

        let xs: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(xs, vec!['k', KELVIN_SIGN]);

        let xs: Vec<char> = simple_fold_ok(KELVIN_SIGN).collect();
        assert_eq!(xs, vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(xs, vec!['A']);

        let xs: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(xs, vec!['a']);
    }

    /// Characters without a mapping report an `Err` payload instead.
    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_empty() {
        assert_eq!(Some('A'), simple_fold_err('?'));
        assert_eq!(Some('A'), simple_fold_err('@'));
        assert_eq!(Some('a'), simple_fold_err('['));
        assert_eq!(Some('Ⰰ'), simple_fold_err('☃'));
    }

    /// At the very top of the codepoint range the `Err` payload is `None`.
    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_max() {
        assert_eq!(None, simple_fold_err('\u{10FFFE}'));
        assert_eq!(None, simple_fold_err('\u{10FFFF}'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        assert!(simple_fold('a').is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        // Ranges that include at least one character with a case mapping.
        assert!(contains_case_map('A', 'A'));
        assert!(contains_case_map('Z', 'Z'));
        assert!(contains_case_map('A', 'Z'));
        assert!(contains_case_map('@', 'A'));
        assert!(contains_case_map('Z', '['));
        assert!(contains_case_map('☃', 'Ⰰ'));

        // Ranges made up solely of characters without a case mapping.
        assert!(!contains_case_map('[', '['));
        assert!(!contains_case_map('[', '`'));

        assert!(!contains_case_map('☃', '☃'));
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn range_contains_disabled() {
        assert!(contains_simple_case_mapping('a', 'a').is_err());
    }

    /// The one-letter class 'C' must canonicalize to the 'Other' general
    /// category.
    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other")
        );
    }

    #[test]
    fn sym_normalize() {
        let sym_norm = symbolic_name_normalize;

        assert_eq!(sym_norm("Line_Break"), "linebreak");
        assert_eq!(sym_norm("Line-break"), "linebreak");
        assert_eq!(sym_norm("linebreak"), "linebreak");
        assert_eq!(sym_norm("BA"), "ba");
        assert_eq!(sym_norm("ba"), "ba");
        assert_eq!(sym_norm("Greek"), "greek");
        assert_eq!(sym_norm("isGreek"), "greek");
        assert_eq!(sym_norm("IS_Greek"), "greek");
        // 'isc' keeps its prefix; see the special case in
        // `symbolic_name_normalize_bytes`.
        assert_eq!(sym_norm("isc"), "isc");
        assert_eq!(sym_norm("is c"), "isc");
        assert_eq!(sym_norm("is_c"), "isc");
    }

    /// Non-ASCII bytes are dropped, so the output is always valid UTF-8.
    #[test]
    fn valid_utf8_symbolic() {
        let mut x = b"abc\xFFxyz".to_vec();
        let y = symbolic_name_normalize_bytes(&mut x);
        assert_eq!(y, b"abcxyz");
    }
}