Blame - src/unicode.rs - platform/external/rust/crates/regex-syntax

blob: a78362b2fb55624f10ad82709f9e8f5c9fb84137 [file] [log] [blame]

Chih-Hung Hsieh	048fc04	2020-04-16 10:44:22 -0700	[diff] [blame]	1	use std::error;
				2	use std::fmt;
				3	use std::result;
				4
				5	use hir;
				6
				7	/// A type alias for errors specific to Unicode handling of classes.
				8	pub type Result<T> = result::Result<T, Error>;
				9
				10	/// An inclusive range of codepoints from a generated file (hence the static
				11	/// lifetime).
				12	type Range = &'static [(char, char)];
				13
				14	/// An error that occurs when dealing with Unicode.
				15	///
				16	/// We don't impl the Error trait here because these always get converted
				17	/// into other public errors. (This error type isn't exported.)
				18	#[derive(Debug)]
				19	pub enum Error {
				20	PropertyNotFound,
				21	PropertyValueNotFound,
				22	// Not used when unicode-perl is enabled.
				23	#[allow(dead_code)]
				24	PerlClassNotFound,
				25	}
				26
				27	/// A type alias for errors specific to Unicode case folding.
				28	pub type FoldResult<T> = result::Result<T, CaseFoldError>;
				29
				30	/// An error that occurs when Unicode-aware simple case folding fails.
				31	///
				32	/// This error can occur when the case mapping tables necessary for Unicode
				33	/// aware case folding are unavailable. This only occurs when the
				34	/// `unicode-case` feature is disabled. (The feature is enabled by default.)
				35	#[derive(Debug)]
				36	pub struct CaseFoldError(());
				37
				38	impl error::Error for CaseFoldError {}
				39
				40	impl fmt::Display for CaseFoldError {
				41	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
				42	write!(
				43	f,
				44	"Unicode-aware case folding is not available \
				45	(probably because the unicode-case feature is not enabled)"
				46	)
				47	}
				48	}
				49
				50	/// An error that occurs when the Unicode-aware `\w` class is unavailable.
				51	///
				52	/// This error can occur when the data tables necessary for the Unicode aware
				53	/// Perl character class `\w` are unavailable. This only occurs when the
				54	/// `unicode-perl` feature is disabled. (The feature is enabled by default.)
				55	#[derive(Debug)]
				56	pub struct UnicodeWordError(());
				57
				58	impl error::Error for UnicodeWordError {}
				59
				60	impl fmt::Display for UnicodeWordError {
				61	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
				62	write!(
				63	f,
				64	"Unicode-aware \\w class is not available \
				65	(probably because the unicode-perl feature is not enabled)"
				66	)
				67	}
				68	}
				69
				70	/// Return an iterator over the equivalence class of simple case mappings
				71	/// for the given codepoint. The equivalence class does not include the
				72	/// given codepoint.
				73	///
				74	/// If the equivalence class is empty, then this returns the next scalar
				75	/// value that has a non-empty equivalence class, if it exists. If no such
				76	/// scalar value exists, then `None` is returned. The point of this behavior
				77	/// is to permit callers to avoid calling `simple_fold` more than they need
				78	/// to, since there is some cost to fetching the equivalence class.
				79	///
				80	/// This returns an error if the Unicode case folding tables are not available.
				81	pub fn simple_fold(
				82	c: char,
				83	) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> {
				84	#[cfg(not(feature = "unicode-case"))]
				85	fn imp(
				86	_: char,
				87	) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
				88	{
				89	use std::option::IntoIter;
				90	Err::<result::Result<IntoIter<char>, _>, _>(CaseFoldError(()))
				91	}
				92
				93	#[cfg(feature = "unicode-case")]
				94	fn imp(
				95	c: char,
				96	) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
				97	{
				98	use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
				99
				100	Ok(CASE_FOLDING_SIMPLE
				101	.binary_search_by_key(&c, \|&(c1, _)\| c1)
				102	.map(\|i\| CASE_FOLDING_SIMPLE[i].1.iter().map(\|&c\| c))
				103	.map_err(\|i\| {
				104	if i >= CASE_FOLDING_SIMPLE.len() {
				105	None
				106	} else {
				107	Some(CASE_FOLDING_SIMPLE[i].0)
				108	}
				109	}))
				110	}
				111
				112	imp(c)
				113	}
				114
				115	/// Returns true if and only if the given (inclusive) range contains at least
				116	/// one Unicode scalar value that has a non-empty non-trivial simple case
				117	/// mapping.
				118	///
				119	/// This function panics if `end < start`.
				120	///
				121	/// This returns an error if the Unicode case folding tables are not available.
				122	pub fn contains_simple_case_mapping(
				123	start: char,
				124	end: char,
				125	) -> FoldResult<bool> {
				126	#[cfg(not(feature = "unicode-case"))]
				127	fn imp(_: char, _: char) -> FoldResult<bool> {
				128	Err(CaseFoldError(()))
				129	}
				130
				131	#[cfg(feature = "unicode-case")]
				132	fn imp(start: char, end: char) -> FoldResult<bool> {
				133	use std::cmp::Ordering;
				134	use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
				135
				136	assert!(start <= end);
				137	Ok(CASE_FOLDING_SIMPLE
				138	.binary_search_by(\|&(c, _)\| {
				139	if start <= c && c <= end {
				140	Ordering::Equal
				141	} else if c > end {
				142	Ordering::Greater
				143	} else {
				144	Ordering::Less
				145	}
				146	})
				147	.is_ok())
				148	}
				149
				150	imp(start, end)
				151	}
				152
				153	/// A query for finding a character class defined by Unicode. This supports
				154	/// either use of a property name directly, or lookup by property value. The
				155	/// former generally refers to Binary properties (see UTS#44, Table 8), but
				156	/// as a special exception (see UTS#18, Section 1.2) both general categories
				157	/// (an enumeration) and scripts (a catalog) are supported as if each of their
				158	/// possible values were a binary property.
				159	///
				160	/// In all circumstances, property names and values are normalized and
				161	/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`.
				162	///
				163	/// The lifetime `'a` refers to the shorter of the lifetimes of property name
				164	/// and property value.
				165	#[derive(Debug)]
				166	pub enum ClassQuery<'a> {
				167	/// Return a class corresponding to a Unicode binary property, named by
				168	/// a single letter.
				169	OneLetter(char),
				170	/// Return a class corresponding to a Unicode binary property.
				171	///
				172	/// Note that, by special exception (see UTS#18, Section 1.2), both
				173	/// general category values and script values are permitted here as if
				174	/// they were a binary property.
				175	Binary(&'a str),
				176	/// Return a class corresponding to all codepoints whose property
				177	/// (identified by `property_name`) corresponds to the given value
				178	/// (identified by `property_value`).
				179	ByValue {
				180	/// A property name.
				181	property_name: &'a str,
				182	/// A property value.
				183	property_value: &'a str,
				184	},
				185	}
				186
				187	impl<'a> ClassQuery<'a> {
				188	fn canonicalize(&self) -> Result<CanonicalClassQuery> {
				189	match *self {
				190	ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
				191	ClassQuery::Binary(name) => self.canonical_binary(name),
				192	ClassQuery::ByValue { property_name, property_value } => {
				193	let property_name = symbolic_name_normalize(property_name);
				194	let property_value = symbolic_name_normalize(property_value);
				195
				196	let canon_name = match canonical_prop(&property_name)? {
				197	None => return Err(Error::PropertyNotFound),
				198	Some(canon_name) => canon_name,
				199	};
				200	Ok(match canon_name {
				201	"General_Category" => {
				202	let canon = match canonical_gencat(&property_value)? {
				203	None => return Err(Error::PropertyValueNotFound),
				204	Some(canon) => canon,
				205	};
				206	CanonicalClassQuery::GeneralCategory(canon)
				207	}
				208	"Script" => {
				209	let canon = match canonical_script(&property_value)? {
				210	None => return Err(Error::PropertyValueNotFound),
				211	Some(canon) => canon,
				212	};
				213	CanonicalClassQuery::Script(canon)
				214	}
				215	_ => {
				216	let vals = match property_values(canon_name)? {
				217	None => return Err(Error::PropertyValueNotFound),
				218	Some(vals) => vals,
				219	};
				220	let canon_val =
				221	match canonical_value(vals, &property_value) {
				222	None => {
				223	return Err(Error::PropertyValueNotFound)
				224	}
				225	Some(canon_val) => canon_val,
				226	};
				227	CanonicalClassQuery::ByValue {
				228	property_name: canon_name,
				229	property_value: canon_val,
				230	}
				231	}
				232	})
				233	}
				234	}
				235	}
				236
				237	fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> {
				238	let norm = symbolic_name_normalize(name);
				239
Chih-Hung Hsieh	31dfd7f	2020-10-26 13:16:58 -0700	[diff] [blame]	240	// This is a special case where 'cf' refers to the 'Format' general
				241	// category, but where the 'cf' abbreviation is also an abbreviation
				242	// for the 'Case_Folding' property. But we want to treat it as
				243	// a general category. (Currently, we don't even support the
				244	// 'Case_Folding' property. But if we do in the future, users will be
				245	// required to spell it out.)
				246	if norm != "cf" {
				247	if let Some(canon) = canonical_prop(&norm)? {
				248	return Ok(CanonicalClassQuery::Binary(canon));
				249	}
Chih-Hung Hsieh	048fc04	2020-04-16 10:44:22 -0700	[diff] [blame]	250	}
				251	if let Some(canon) = canonical_gencat(&norm)? {
				252	return Ok(CanonicalClassQuery::GeneralCategory(canon));
				253	}
				254	if let Some(canon) = canonical_script(&norm)? {
				255	return Ok(CanonicalClassQuery::Script(canon));
				256	}
				257	Err(Error::PropertyNotFound)
				258	}
				259	}
				260
				261	/// Like ClassQuery, but its parameters have been canonicalized. This also
				262	/// differentiates binary properties from flattened general categories and
				263	/// scripts.
				264	#[derive(Debug, Eq, PartialEq)]
				265	enum CanonicalClassQuery {
				266	/// The canonical binary property name.
				267	Binary(&'static str),
				268	/// The canonical general category name.
				269	GeneralCategory(&'static str),
				270	/// The canonical script name.
				271	Script(&'static str),
				272	/// An arbitrary association between property and value, both of which
				273	/// have been canonicalized.
				274	///
				275	/// Note that by construction, the property name of ByValue will never
				276	/// be General_Category or Script. Those two cases are subsumed by the
				277	/// eponymous variants.
				278	ByValue {
				279	/// The canonical property name.
				280	property_name: &'static str,
				281	/// The canonical property value.
				282	property_value: &'static str,
				283	},
				284	}
				285
				286	/// Looks up a Unicode class given a query. If one doesn't exist, then
				287	/// `None` is returned.
				288	pub fn class(query: ClassQuery) -> Result<hir::ClassUnicode> {
				289	use self::CanonicalClassQuery::*;
				290
				291	match query.canonicalize()? {
				292	Binary(name) => bool_property(name),
				293	GeneralCategory(name) => gencat(name),
				294	Script(name) => script(name),
				295	ByValue { property_name: "Age", property_value } => {
				296	let mut class = hir::ClassUnicode::empty();
				297	for set in ages(property_value)? {
				298	class.union(&hir_class(set));
				299	}
				300	Ok(class)
				301	}
				302	ByValue { property_name: "Script_Extensions", property_value } => {
				303	script_extension(property_value)
				304	}
				305	ByValue {
				306	property_name: "Grapheme_Cluster_Break",
				307	property_value,
				308	} => gcb(property_value),
				309	ByValue { property_name: "Sentence_Break", property_value } => {
				310	sb(property_value)
				311	}
				312	ByValue { property_name: "Word_Break", property_value } => {
				313	wb(property_value)
				314	}
				315	_ => {
				316	// What else should we support?
				317	Err(Error::PropertyNotFound)
				318	}
				319	}
				320	}
				321
				322	/// Returns a Unicode aware class for \w.
				323	///
				324	/// This returns an error if the data is not available for \w.
				325	pub fn perl_word() -> Result<hir::ClassUnicode> {
				326	#[cfg(not(feature = "unicode-perl"))]
				327	fn imp() -> Result<hir::ClassUnicode> {
				328	Err(Error::PerlClassNotFound)
				329	}
				330
				331	#[cfg(feature = "unicode-perl")]
				332	fn imp() -> Result<hir::ClassUnicode> {
				333	use unicode_tables::perl_word::PERL_WORD;
				334	Ok(hir_class(PERL_WORD))
				335	}
				336
				337	imp()
				338	}
				339
				340	/// Returns a Unicode aware class for \s.
				341	///
				342	/// This returns an error if the data is not available for \s.
				343	pub fn perl_space() -> Result<hir::ClassUnicode> {
				344	#[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))]
				345	fn imp() -> Result<hir::ClassUnicode> {
				346	Err(Error::PerlClassNotFound)
				347	}
				348
				349	#[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
				350	fn imp() -> Result<hir::ClassUnicode> {
				351	use unicode_tables::perl_space::WHITE_SPACE;
				352	Ok(hir_class(WHITE_SPACE))
				353	}
				354
				355	#[cfg(feature = "unicode-bool")]
				356	fn imp() -> Result<hir::ClassUnicode> {
				357	use unicode_tables::property_bool::WHITE_SPACE;
				358	Ok(hir_class(WHITE_SPACE))
				359	}
				360
				361	imp()
				362	}
				363
				364	/// Returns a Unicode aware class for \d.
				365	///
				366	/// This returns an error if the data is not available for \d.
				367	pub fn perl_digit() -> Result<hir::ClassUnicode> {
				368	#[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))]
				369	fn imp() -> Result<hir::ClassUnicode> {
				370	Err(Error::PerlClassNotFound)
				371	}
				372
				373	#[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
				374	fn imp() -> Result<hir::ClassUnicode> {
				375	use unicode_tables::perl_decimal::DECIMAL_NUMBER;
				376	Ok(hir_class(DECIMAL_NUMBER))
				377	}
				378
				379	#[cfg(feature = "unicode-gencat")]
				380	fn imp() -> Result<hir::ClassUnicode> {
				381	use unicode_tables::general_category::DECIMAL_NUMBER;
				382	Ok(hir_class(DECIMAL_NUMBER))
				383	}
				384
				385	imp()
				386	}
				387
				388	/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
				389	pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
				390	let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges
				391	.iter()
				392	.map(\|&(s, e)\| hir::ClassUnicodeRange::new(s, e))
				393	.collect();
				394	hir::ClassUnicode::new(hir_ranges)
				395	}
				396
				397	/// Returns true only if the given codepoint is in the `\w` character class.
				398	///
				399	/// If the `unicode-perl` feature is not enabled, then this returns an error.
				400	pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> {
				401	#[cfg(not(feature = "unicode-perl"))]
				402	fn imp(_: char) -> result::Result<bool, UnicodeWordError> {
				403	Err(UnicodeWordError(()))
				404	}
				405
				406	#[cfg(feature = "unicode-perl")]
				407	fn imp(c: char) -> result::Result<bool, UnicodeWordError> {
				408	use is_word_byte;
				409	use std::cmp::Ordering;
				410	use unicode_tables::perl_word::PERL_WORD;
				411
				412	if c <= 0x7F as char && is_word_byte(c as u8) {
				413	return Ok(true);
				414	}
				415	Ok(PERL_WORD
				416	.binary_search_by(\|&(start, end)\| {
				417	if start <= c && c <= end {
				418	Ordering::Equal
				419	} else if start > c {
				420	Ordering::Greater
				421	} else {
				422	Ordering::Less
				423	}
				424	})
				425	.is_ok())
				426	}
				427
				428	imp(c)
				429	}
				430
				431	/// A mapping of property values for a specific property.
				432	///
				433	/// The first element of each tuple is a normalized property value while the
				434	/// second element of each tuple is the corresponding canonical property
				435	/// value.
				436	type PropertyValues = &'static [(&'static str, &'static str)];
				437
				438	fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> {
				439	Ok(match normalized_value {
				440	"any" => Some("Any"),
				441	"assigned" => Some("Assigned"),
				442	"ascii" => Some("ASCII"),
				443	_ => {
				444	let gencats = property_values("General_Category")?.unwrap();
				445	canonical_value(gencats, normalized_value)
				446	}
				447	})
				448	}
				449
				450	fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> {
				451	let scripts = property_values("Script")?.unwrap();
				452	Ok(canonical_value(scripts, normalized_value))
				453	}
				454
				455	/// Find the canonical property name for the given normalized property name.
				456	///
				457	/// If no such property exists, then `None` is returned.
				458	///
				459	/// The normalized property name must have been normalized according to
				460	/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
				461	///
				462	/// If the property names data is not available, then an error is returned.
				463	fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> {
				464	#[cfg(not(any(
				465	feature = "unicode-age",
				466	feature = "unicode-bool",
				467	feature = "unicode-gencat",
				468	feature = "unicode-perl",
				469	feature = "unicode-script",
				470	feature = "unicode-segment",
				471	)))]
				472	fn imp(_: &str) -> Result<Option<&'static str>> {
				473	Err(Error::PropertyNotFound)
				474	}
				475
				476	#[cfg(any(
				477	feature = "unicode-age",
				478	feature = "unicode-bool",
				479	feature = "unicode-gencat",
				480	feature = "unicode-perl",
				481	feature = "unicode-script",
				482	feature = "unicode-segment",
				483	))]
				484	fn imp(name: &str) -> Result<Option<&'static str>> {
				485	use unicode_tables::property_names::PROPERTY_NAMES;
				486
				487	Ok(PROPERTY_NAMES
				488	.binary_search_by_key(&name, \|&(n, _)\| n)
				489	.ok()
				490	.map(\|i\| PROPERTY_NAMES[i].1))
				491	}
				492
				493	imp(normalized_name)
				494	}
				495
				496	/// Find the canonical property value for the given normalized property
				497	/// value.
				498	///
				499	/// The given property values should correspond to the values for the property
				500	/// under question, which can be found using `property_values`.
				501	///
				502	/// If no such property value exists, then `None` is returned.
				503	///
				504	/// The normalized property value must have been normalized according to
				505	/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
				506	fn canonical_value(
				507	vals: PropertyValues,
				508	normalized_value: &str,
				509	) -> Option<&'static str> {
				510	vals.binary_search_by_key(&normalized_value, \|&(n, _)\| n)
				511	.ok()
				512	.map(\|i\| vals[i].1)
				513	}
				514
				515	/// Return the table of property values for the given property name.
				516	///
				517	/// If the property values data is not available, then an error is returned.
				518	fn property_values(
				519	canonical_property_name: &'static str,
				520	) -> Result<Option<PropertyValues>> {
				521	#[cfg(not(any(
				522	feature = "unicode-age",
				523	feature = "unicode-bool",
				524	feature = "unicode-gencat",
				525	feature = "unicode-perl",
				526	feature = "unicode-script",
				527	feature = "unicode-segment",
				528	)))]
				529	fn imp(_: &'static str) -> Result<Option<PropertyValues>> {
				530	Err(Error::PropertyValueNotFound)
				531	}
				532
				533	#[cfg(any(
				534	feature = "unicode-age",
				535	feature = "unicode-bool",
				536	feature = "unicode-gencat",
				537	feature = "unicode-perl",
				538	feature = "unicode-script",
				539	feature = "unicode-segment",
				540	))]
				541	fn imp(name: &'static str) -> Result<Option<PropertyValues>> {
				542	use unicode_tables::property_values::PROPERTY_VALUES;
				543
				544	Ok(PROPERTY_VALUES
				545	.binary_search_by_key(&name, \|&(n, _)\| n)
				546	.ok()
				547	.map(\|i\| PROPERTY_VALUES[i].1))
				548	}
				549
				550	imp(canonical_property_name)
				551	}
				552
				553	// This is only used in some cases, but small enough to just let it be dead
				554	// instead of figuring out (and maintaining) the right set of features.
				555	#[allow(dead_code)]
				556	fn property_set(
				557	name_map: &'static [(&'static str, Range)],
				558	canonical: &'static str,
				559	) -> Option<Range> {
				560	name_map
				561	.binary_search_by_key(&canonical, \|x\| x.0)
				562	.ok()
				563	.map(\|i\| name_map[i].1)
				564	}
				565
				566	/// Returns an iterator over Unicode Age sets. Each item corresponds to a set
				567	/// of codepoints that were added in a particular revision of Unicode. The
				568	/// iterator yields items in chronological order.
				569	///
				570	/// If the given age value isn't valid or if the data isn't available, then an
				571	/// error is returned instead.
				572	fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
				573	#[cfg(not(feature = "unicode-age"))]
				574	fn imp(_: &str) -> Result<impl Iterator<Item = Range>> {
				575	use std::option::IntoIter;
				576	Err::<IntoIter<Range>, _>(Error::PropertyNotFound)
				577	}
				578
				579	#[cfg(feature = "unicode-age")]
				580	fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
				581	use unicode_tables::age;
				582
				583	const AGES: &'static [(&'static str, Range)] = &[
				584	("V1_1", age::V1_1),
				585	("V2_0", age::V2_0),
				586	("V2_1", age::V2_1),
				587	("V3_0", age::V3_0),
				588	("V3_1", age::V3_1),
				589	("V3_2", age::V3_2),
				590	("V4_0", age::V4_0),
				591	("V4_1", age::V4_1),
				592	("V5_0", age::V5_0),
				593	("V5_1", age::V5_1),
				594	("V5_2", age::V5_2),
				595	("V6_0", age::V6_0),
				596	("V6_1", age::V6_1),
				597	("V6_2", age::V6_2),
				598	("V6_3", age::V6_3),
				599	("V7_0", age::V7_0),
				600	("V8_0", age::V8_0),
				601	("V9_0", age::V9_0),
				602	("V10_0", age::V10_0),
				603	("V11_0", age::V11_0),
				604	("V12_0", age::V12_0),
				605	("V12_1", age::V12_1),
				606	("V13_0", age::V13_0),
				607	];
				608	assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
				609
				610	let pos = AGES.iter().position(\|&(age, _)\| canonical_age == age);
				611	match pos {
				612	None => Err(Error::PropertyValueNotFound),
				613	Some(i) => Ok(AGES[..i + 1].iter().map(\|&(_, classes)\| classes)),
				614	}
				615	}
				616
				617	imp(canonical_age)
				618	}
				619
				620	/// Returns the Unicode HIR class corresponding to the given general category.
				621	///
				622	/// Name canonicalization is assumed to be performed by the caller.
				623	///
				624	/// If the given general category could not be found, or if the general
				625	/// category data is not available, then an error is returned.
				626	fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
				627	#[cfg(not(feature = "unicode-gencat"))]
				628	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
				629	Err(Error::PropertyNotFound)
				630	}
				631
				632	#[cfg(feature = "unicode-gencat")]
				633	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
				634	use unicode_tables::general_category::BY_NAME;
				635	match name {
				636	"ASCII" => Ok(hir_class(&[('\0', '\x7F')])),
				637	"Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])),
				638	"Assigned" => {
				639	let mut cls = gencat("Unassigned")?;
				640	cls.negate();
				641	Ok(cls)
				642	}
				643	name => property_set(BY_NAME, name)
				644	.map(hir_class)
				645	.ok_or(Error::PropertyValueNotFound),
				646	}
				647	}
				648
				649	match canonical_name {
				650	"Decimal_Number" => perl_digit(),
				651	name => imp(name),
				652	}
				653	}
				654
				655	/// Returns the Unicode HIR class corresponding to the given script.
				656	///
				657	/// Name canonicalization is assumed to be performed by the caller.
				658	///
				659	/// If the given script could not be found, or if the script data is not
				660	/// available, then an error is returned.
				661	fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
				662	#[cfg(not(feature = "unicode-script"))]
				663	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
				664	Err(Error::PropertyNotFound)
				665	}
				666
				667	#[cfg(feature = "unicode-script")]
				668	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
				669	use unicode_tables::script::BY_NAME;
				670	property_set(BY_NAME, name)
				671	.map(hir_class)
				672	.ok_or(Error::PropertyValueNotFound)
				673	}
				674
				675	imp(canonical_name)
				676	}
				677
				678	/// Returns the Unicode HIR class corresponding to the given script extension.
				679	///
				680	/// Name canonicalization is assumed to be performed by the caller.
				681	///
				682	/// If the given script extension could not be found, or if the script data is
				683	/// not available, then an error is returned.
				684	fn script_extension(
				685	canonical_name: &'static str,
				686	) -> Result<hir::ClassUnicode> {
				687	#[cfg(not(feature = "unicode-script"))]
				688	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
				689	Err(Error::PropertyNotFound)
				690	}
				691
				692	#[cfg(feature = "unicode-script")]
				693	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
				694	use unicode_tables::script_extension::BY_NAME;
				695	property_set(BY_NAME, name)
				696	.map(hir_class)
				697	.ok_or(Error::PropertyValueNotFound)
				698	}
				699
				700	imp(canonical_name)
				701	}
				702
				703	/// Returns the Unicode HIR class corresponding to the given Unicode boolean
				704	/// property.
				705	///
				706	/// Name canonicalization is assumed to be performed by the caller.
				707	///
				708	/// If the given boolean property could not be found, or if the boolean
				709	/// property data is not available, then an error is returned.
				710	fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
				711	#[cfg(not(feature = "unicode-bool"))]
				712	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
				713	Err(Error::PropertyNotFound)
				714	}
				715
				716	#[cfg(feature = "unicode-bool")]
				717	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
				718	use unicode_tables::property_bool::BY_NAME;
				719	property_set(BY_NAME, name)
				720	.map(hir_class)
				721	.ok_or(Error::PropertyNotFound)
				722	}
				723
				724	match canonical_name {
				725	"Decimal_Number" => perl_digit(),
				726	"White_Space" => perl_space(),
				727	name => imp(name),
				728	}
				729	}
				730
				731	/// Returns the Unicode HIR class corresponding to the given grapheme cluster
				732	/// break property.
				733	///
				734	/// Name canonicalization is assumed to be performed by the caller.
				735	///
				736	/// If the given property could not be found, or if the corresponding data is
				737	/// not available, then an error is returned.
				738	fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
				739	#[cfg(not(feature = "unicode-segment"))]
				740	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
				741	Err(Error::PropertyNotFound)
				742	}
				743
				744	#[cfg(feature = "unicode-segment")]
				745	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
				746	use unicode_tables::grapheme_cluster_break::BY_NAME;
				747	property_set(BY_NAME, name)
				748	.map(hir_class)
				749	.ok_or(Error::PropertyValueNotFound)
				750	}
				751
				752	imp(canonical_name)
				753	}
				754
				755	/// Returns the Unicode HIR class corresponding to the given word break
				756	/// property.
				757	///
				758	/// Name canonicalization is assumed to be performed by the caller.
				759	///
				760	/// If the given property could not be found, or if the corresponding data is
				761	/// not available, then an error is returned.
				762	fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
				763	#[cfg(not(feature = "unicode-segment"))]
				764	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
				765	Err(Error::PropertyNotFound)
				766	}
				767
				768	#[cfg(feature = "unicode-segment")]
				769	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
				770	use unicode_tables::word_break::BY_NAME;
				771	property_set(BY_NAME, name)
				772	.map(hir_class)
				773	.ok_or(Error::PropertyValueNotFound)
				774	}
				775
				776	imp(canonical_name)
				777	}
				778
				779	/// Returns the Unicode HIR class corresponding to the given sentence
				780	/// break property.
				781	///
				782	/// Name canonicalization is assumed to be performed by the caller.
				783	///
				784	/// If the given property could not be found, or if the corresponding data is
				785	/// not available, then an error is returned.
				786	fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
				787	#[cfg(not(feature = "unicode-segment"))]
				788	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
				789	Err(Error::PropertyNotFound)
				790	}
				791
				792	#[cfg(feature = "unicode-segment")]
				793	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
				794	use unicode_tables::sentence_break::BY_NAME;
				795	property_set(BY_NAME, name)
				796	.map(hir_class)
				797	.ok_or(Error::PropertyValueNotFound)
				798	}
				799
				800	imp(canonical_name)
				801	}
				802
				803	/// Like symbolic_name_normalize_bytes, but operates on a string.
				804	fn symbolic_name_normalize(x: &str) -> String {
				805	let mut tmp = x.as_bytes().to_vec();
				806	let len = symbolic_name_normalize_bytes(&mut tmp).len();
				807	tmp.truncate(len);
				808	// This should always succeed because `symbolic_name_normalize_bytes`
				809	// guarantees that `&tmp[..len]` is always valid UTF-8.
				810	//
				811	// N.B. We could avoid the additional UTF-8 check here, but it's unlikely
				812	// to be worth skipping the additional safety check. A benchmark must
				813	// justify it first.
				814	String::from_utf8(tmp).unwrap()
				815	}
				816
				/// Normalize the given symbolic name in place according to UAX44-LM3.
				///
				/// A "symbolic name" typically corresponds to property names and property
				/// value aliases. Note, though, that it should not be applied to property
				/// string values.
				///
				/// Normalization lowercases ASCII letters, drops spaces, underscores and
				/// hyphens, drops every non-ASCII byte and strips a leading `is` prefix
				/// (matched case insensitively). The single exception is a name that would
				/// be reduced to just `c` once the prefix is stripped: it is kept as `isc`.
				///
				/// The slice returned is a prefix of `slice` holding the normalized name. It
				/// is guaranteed to be valid UTF-8 for all possible values of `slice`.
				///
				/// See: http://unicode.org/reports/tr44/#UAX44-LM3
				fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
				    // I couldn't find a place in the standard that specified that property
				    // names/aliases had a particular structure (unlike character names), but
				    // we assume that it's ASCII only and drop anything that isn't ASCII.

				    // Ignore any "is" prefix, regardless of the case of either letter.
				    let starts_with_is =
				        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
				    let start = if starts_with_is { 2 } else { 0 };

				    // The name is compacted in place: `next_write` never runs ahead of the
				    // read position `i`, so no unread byte is ever overwritten.
				    let mut next_write = 0;
				    for i in start..slice.len() {
				        // VALIDITY ARGUMENT: To guarantee that the resulting slice is valid
				        // UTF-8, we ensure that the slice contains only ASCII bytes. In
				        // particular, we drop every non-ASCII byte from the normalized string.
				        let b = slice[i];
				        match b {
				            // Separators are ignored by loose matching.
				            b' ' | b'_' | b'-' => {}
				            _ if b.is_ascii() => {
				                slice[next_write] = b.to_ascii_lowercase();
				                next_write += 1;
				            }
				            // Non-ASCII byte: dropped.
				            _ => {}
				        }
				    }

				    // Special case: ISO_Comment has a 'isc' abbreviation. Since we generally
				    // ignore 'is' prefixes, the 'isc' abbreviation gets caught in the cross
				    // fire and ends up creating an alias for 'c' to 'ISO_Comment', but it
				    // is actually an alias for the 'Other' general category.
				    //
				    // `starts_with_is` together with `next_write == 1` implies the input had
				    // at least three bytes, so writing three bytes back cannot go out of
				    // bounds.
				    if starts_with_is && next_write == 1 && slice[0] == b'c' {
				        slice[..3].copy_from_slice(b"isc");
				        next_write = 3;
				    }
				    &mut slice[..next_write]
				}
				871
				#[cfg(test)]
				mod tests {
				    use super::{
				        contains_simple_case_mapping, simple_fold, symbolic_name_normalize,
				        symbolic_name_normalize_bytes,
				    };

				    /// Returns the simple case folding iterator for `c`.
				    ///
				    /// Panics if `simple_fold` reports an error at either level of its nested
				    /// result.
				    #[cfg(feature = "unicode-case")]
				    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
				        simple_fold(c).unwrap().unwrap()
				    }

				    /// Returns the value carried by the inner `Err` of `simple_fold(c)`.
				    ///
				    /// Panics if the outer result is an error or if an iterator is returned.
				    /// Judging by the assertions in `simple_fold_empty`, the carried value
				    /// looks like the next codepoint after `c` that has a case mapping, if
				    /// any (confirm against `simple_fold`).
				    #[cfg(feature = "unicode-case")]
				    fn simple_fold_err(c: char) -> Option<char> {
				        match simple_fold(c).unwrap() {
				            Ok(_) => unreachable!("simple_fold returned Ok iterator"),
				            Err(next) => next,
				        }
				    }

				    /// Unwrapping shorthand for `contains_simple_case_mapping`.
				    #[cfg(feature = "unicode-case")]
				    fn contains_case_map(start: char, end: char) -> bool {
				        contains_simple_case_mapping(start, end).unwrap()
				    }

				    #[test]
				    #[cfg(feature = "unicode-case")]
				    fn simple_fold_k() {
				        // 'k', 'K' and the KELVIN SIGN (U+212A) fold to one another. The
				        // Kelvin sign is written as an escape because it is visually
				        // indistinguishable from ASCII 'K' and is easily lost when the file
				        // is copied or normalized.
				        let xs: Vec<char> = simple_fold_ok('k').collect();
				        assert_eq!(xs, vec!['K', '\u{212A}']);

				        let xs: Vec<char> = simple_fold_ok('K').collect();
				        assert_eq!(xs, vec!['k', '\u{212A}']);

				        let xs: Vec<char> = simple_fold_ok('\u{212A}').collect();
				        assert_eq!(xs, vec!['K', 'k']);
				    }

				    #[test]
				    #[cfg(feature = "unicode-case")]
				    fn simple_fold_a() {
				        let xs: Vec<char> = simple_fold_ok('a').collect();
				        assert_eq!(xs, vec!['A']);

				        let xs: Vec<char> = simple_fold_ok('A').collect();
				        assert_eq!(xs, vec!['a']);
				    }

				    #[test]
				    #[cfg(feature = "unicode-case")]
				    fn simple_fold_empty() {
				        // None of these inputs has a case mapping of its own.
				        assert_eq!(Some('A'), simple_fold_err('?'));
				        assert_eq!(Some('A'), simple_fold_err('@'));
				        assert_eq!(Some('a'), simple_fold_err('['));
				        // U+2603 SNOWMAN -> U+2C00 GLAGOLITIC CAPITAL LETTER AZU.
				        assert_eq!(Some('Ⰰ'), simple_fold_err('☃'));
				    }

				    #[test]
				    #[cfg(feature = "unicode-case")]
				    fn simple_fold_max() {
				        // At the very top of the codepoint range nothing is reported.
				        assert_eq!(None, simple_fold_err('\u{10FFFE}'));
				        assert_eq!(None, simple_fold_err('\u{10FFFF}'));
				    }

				    #[test]
				    #[cfg(not(feature = "unicode-case"))]
				    fn simple_fold_disabled() {
				        assert!(simple_fold('a').is_err());
				    }

				    #[test]
				    #[cfg(feature = "unicode-case")]
				    fn range_contains() {
				        // Ranges that include at least one cased codepoint.
				        assert!(contains_case_map('A', 'A'));
				        assert!(contains_case_map('Z', 'Z'));
				        assert!(contains_case_map('A', 'Z'));
				        assert!(contains_case_map('@', 'A'));
				        assert!(contains_case_map('Z', '['));
				        assert!(contains_case_map('☃', 'Ⰰ'));

				        // Ranges made up solely of uncased codepoints.
				        assert!(!contains_case_map('[', '['));
				        assert!(!contains_case_map('[', '`'));

				        assert!(!contains_case_map('☃', '☃'));
				    }

				    #[test]
				    #[cfg(not(feature = "unicode-case"))]
				    fn range_contains_disabled() {
				        assert!(contains_simple_case_mapping('a', 'a').is_err());
				    }

				    #[test]
				    #[cfg(feature = "unicode-gencat")]
				    fn regression_466() {
				        use super::{CanonicalClassQuery, ClassQuery};

				        // The one-letter class `C` must resolve to the `Other` general
				        // category.
				        let q = ClassQuery::OneLetter('C');
				        assert_eq!(
				            q.canonicalize().unwrap(),
				            CanonicalClassQuery::GeneralCategory("Other")
				        );
				    }

				    #[test]
				    fn sym_normalize() {
				        let sym_norm = symbolic_name_normalize;

				        assert_eq!(sym_norm("Line_Break"), "linebreak");
				        assert_eq!(sym_norm("Line-break"), "linebreak");
				        assert_eq!(sym_norm("linebreak"), "linebreak");
				        assert_eq!(sym_norm("BA"), "ba");
				        assert_eq!(sym_norm("ba"), "ba");
				        assert_eq!(sym_norm("Greek"), "greek");
				        assert_eq!(sym_norm("isGreek"), "greek");
				        assert_eq!(sym_norm("IS_Greek"), "greek");
				        // `isc` keeps its `is` prefix in every spelling.
				        assert_eq!(sym_norm("isc"), "isc");
				        assert_eq!(sym_norm("is c"), "isc");
				        assert_eq!(sym_norm("is_c"), "isc");
				    }

				    #[test]
				    fn valid_utf8_symbolic() {
				        // A byte that is not valid UTF-8 must be dropped, not passed through.
				        let mut x = b"abc\xFFxyz".to_vec();
				        let y = symbolic_name_normalize_bytes(&mut x);
				        assert_eq!(y, b"abcxyz");
				    }
				}