Blame - src/unicode.rs - platform/external/rust/crates/regex-syntax

blob: 7e414396f5e35b96ce8a2d870cd1a7cb72a40e28 [file] [log] [blame]

Chih-Hung Hsieh	048fc04	2020-04-16 10:44:22 -0700	[diff] [blame]	1	use std::error;
				2	use std::fmt;
				3	use std::result;
				4
				5	use hir;
				6
				7	/// A type alias for errors specific to Unicode handling of classes.
				8	pub type Result<T> = result::Result<T, Error>;
				9
				10	/// An inclusive range of codepoints from a generated file (hence the static
				11	/// lifetime).
				12	type Range = &'static [(char, char)];
				13
				14	/// An error that occurs when dealing with Unicode.
				15	///
				16	/// We don't impl the Error trait here because these always get converted
				17	/// into other public errors. (This error type isn't exported.)
				18	#[derive(Debug)]
				19	pub enum Error {
				20	PropertyNotFound,
				21	PropertyValueNotFound,
				22	// Not used when unicode-perl is enabled.
				23	#[allow(dead_code)]
				24	PerlClassNotFound,
				25	}
				26
				27	/// A type alias for errors specific to Unicode case folding.
				28	pub type FoldResult<T> = result::Result<T, CaseFoldError>;
				29
				30	/// An error that occurs when Unicode-aware simple case folding fails.
				31	///
				32	/// This error can occur when the case mapping tables necessary for Unicode
				33	/// aware case folding are unavailable. This only occurs when the
				34	/// `unicode-case` feature is disabled. (The feature is enabled by default.)
				35	#[derive(Debug)]
				36	pub struct CaseFoldError(());
				37
				38	impl error::Error for CaseFoldError {}
				39
				40	impl fmt::Display for CaseFoldError {
				41	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
				42	write!(
				43	f,
				44	"Unicode-aware case folding is not available \
				45	(probably because the unicode-case feature is not enabled)"
				46	)
				47	}
				48	}
				49
				50	/// An error that occurs when the Unicode-aware `\w` class is unavailable.
				51	///
				52	/// This error can occur when the data tables necessary for the Unicode aware
				53	/// Perl character class `\w` are unavailable. This only occurs when the
				54	/// `unicode-perl` feature is disabled. (The feature is enabled by default.)
				55	#[derive(Debug)]
				56	pub struct UnicodeWordError(());
				57
				58	impl error::Error for UnicodeWordError {}
				59
				60	impl fmt::Display for UnicodeWordError {
				61	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
				62	write!(
				63	f,
				64	"Unicode-aware \\w class is not available \
				65	(probably because the unicode-perl feature is not enabled)"
				66	)
				67	}
				68	}
				69
				70	/// Return an iterator over the equivalence class of simple case mappings
				71	/// for the given codepoint. The equivalence class does not include the
				72	/// given codepoint.
				73	///
				74	/// If the equivalence class is empty, then this returns the next scalar
				75	/// value that has a non-empty equivalence class, if it exists. If no such
				76	/// scalar value exists, then `None` is returned. The point of this behavior
				77	/// is to permit callers to avoid calling `simple_fold` more than they need
				78	/// to, since there is some cost to fetching the equivalence class.
				79	///
				80	/// This returns an error if the Unicode case folding tables are not available.
				81	pub fn simple_fold(
				82	c: char,
				83	) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> {
				84	#[cfg(not(feature = "unicode-case"))]
				85	fn imp(
				86	_: char,
				87	) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
				88	{
				89	use std::option::IntoIter;
				90	Err::<result::Result<IntoIter<char>, _>, _>(CaseFoldError(()))
				91	}
				92
				93	#[cfg(feature = "unicode-case")]
				94	fn imp(
				95	c: char,
				96	) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
				97	{
				98	use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
				99
				100	Ok(CASE_FOLDING_SIMPLE
				101	.binary_search_by_key(&c, \|&(c1, _)\| c1)
				102	.map(\|i\| CASE_FOLDING_SIMPLE[i].1.iter().map(\|&c\| c))
				103	.map_err(\|i\| {
				104	if i >= CASE_FOLDING_SIMPLE.len() {
				105	None
				106	} else {
				107	Some(CASE_FOLDING_SIMPLE[i].0)
				108	}
				109	}))
				110	}
				111
				112	imp(c)
				113	}
				114
				115	/// Returns true if and only if the given (inclusive) range contains at least
				116	/// one Unicode scalar value that has a non-empty non-trivial simple case
				117	/// mapping.
				118	///
				119	/// This function panics if `end < start`.
				120	///
				121	/// This returns an error if the Unicode case folding tables are not available.
				122	pub fn contains_simple_case_mapping(
				123	start: char,
				124	end: char,
				125	) -> FoldResult<bool> {
				126	#[cfg(not(feature = "unicode-case"))]
				127	fn imp(_: char, _: char) -> FoldResult<bool> {
				128	Err(CaseFoldError(()))
				129	}
				130
				131	#[cfg(feature = "unicode-case")]
				132	fn imp(start: char, end: char) -> FoldResult<bool> {
				133	use std::cmp::Ordering;
				134	use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
				135
				136	assert!(start <= end);
				137	Ok(CASE_FOLDING_SIMPLE
				138	.binary_search_by(\|&(c, _)\| {
				139	if start <= c && c <= end {
				140	Ordering::Equal
				141	} else if c > end {
				142	Ordering::Greater
				143	} else {
				144	Ordering::Less
				145	}
				146	})
				147	.is_ok())
				148	}
				149
				150	imp(start, end)
				151	}
				152
				153	/// A query for finding a character class defined by Unicode. This supports
				154	/// either use of a property name directly, or lookup by property value. The
				155	/// former generally refers to Binary properties (see UTS#44, Table 8), but
				156	/// as a special exception (see UTS#18, Section 1.2) both general categories
				157	/// (an enumeration) and scripts (a catalog) are supported as if each of their
				158	/// possible values were a binary property.
				159	///
				160	/// In all circumstances, property names and values are normalized and
				161	/// canonicalized. That is, `GC == gc == GeneralCategory == general_category`.
				162	///
				163	/// The lifetime `'a` refers to the shorter of the lifetimes of property name
				164	/// and property value.
				165	#[derive(Debug)]
				166	pub enum ClassQuery<'a> {
				167	/// Return a class corresponding to a Unicode binary property, named by
				168	/// a single letter.
				169	OneLetter(char),
				170	/// Return a class corresponding to a Unicode binary property.
				171	///
				172	/// Note that, by special exception (see UTS#18, Section 1.2), both
				173	/// general category values and script values are permitted here as if
				174	/// they were a binary property.
				175	Binary(&'a str),
				176	/// Return a class corresponding to all codepoints whose property
				177	/// (identified by `property_name`) corresponds to the given value
				178	/// (identified by `property_value`).
				179	ByValue {
				180	/// A property name.
				181	property_name: &'a str,
				182	/// A property value.
				183	property_value: &'a str,
				184	},
				185	}
				186
				187	impl<'a> ClassQuery<'a> {
				188	fn canonicalize(&self) -> Result<CanonicalClassQuery> {
				189	match *self {
				190	ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
				191	ClassQuery::Binary(name) => self.canonical_binary(name),
				192	ClassQuery::ByValue { property_name, property_value } => {
				193	let property_name = symbolic_name_normalize(property_name);
				194	let property_value = symbolic_name_normalize(property_value);
				195
				196	let canon_name = match canonical_prop(&property_name)? {
				197	None => return Err(Error::PropertyNotFound),
				198	Some(canon_name) => canon_name,
				199	};
				200	Ok(match canon_name {
				201	"General_Category" => {
				202	let canon = match canonical_gencat(&property_value)? {
				203	None => return Err(Error::PropertyValueNotFound),
				204	Some(canon) => canon,
				205	};
				206	CanonicalClassQuery::GeneralCategory(canon)
				207	}
				208	"Script" => {
				209	let canon = match canonical_script(&property_value)? {
				210	None => return Err(Error::PropertyValueNotFound),
				211	Some(canon) => canon,
				212	};
				213	CanonicalClassQuery::Script(canon)
				214	}
				215	_ => {
				216	let vals = match property_values(canon_name)? {
				217	None => return Err(Error::PropertyValueNotFound),
				218	Some(vals) => vals,
				219	};
				220	let canon_val =
				221	match canonical_value(vals, &property_value) {
				222	None => {
				223	return Err(Error::PropertyValueNotFound)
				224	}
				225	Some(canon_val) => canon_val,
				226	};
				227	CanonicalClassQuery::ByValue {
				228	property_name: canon_name,
				229	property_value: canon_val,
				230	}
				231	}
				232	})
				233	}
				234	}
				235	}
				236
				237	fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> {
				238	let norm = symbolic_name_normalize(name);
				239
				240	if let Some(canon) = canonical_prop(&norm)? {
				241	return Ok(CanonicalClassQuery::Binary(canon));
				242	}
				243	if let Some(canon) = canonical_gencat(&norm)? {
				244	return Ok(CanonicalClassQuery::GeneralCategory(canon));
				245	}
				246	if let Some(canon) = canonical_script(&norm)? {
				247	return Ok(CanonicalClassQuery::Script(canon));
				248	}
				249	Err(Error::PropertyNotFound)
				250	}
				251	}
				252
				253	/// Like ClassQuery, but its parameters have been canonicalized. This also
				254	/// differentiates binary properties from flattened general categories and
				255	/// scripts.
				256	#[derive(Debug, Eq, PartialEq)]
				257	enum CanonicalClassQuery {
				258	/// The canonical binary property name.
				259	Binary(&'static str),
				260	/// The canonical general category name.
				261	GeneralCategory(&'static str),
				262	/// The canonical script name.
				263	Script(&'static str),
				264	/// An arbitrary association between property and value, both of which
				265	/// have been canonicalized.
				266	///
				267	/// Note that by construction, the property name of ByValue will never
				268	/// be General_Category or Script. Those two cases are subsumed by the
				269	/// eponymous variants.
				270	ByValue {
				271	/// The canonical property name.
				272	property_name: &'static str,
				273	/// The canonical property value.
				274	property_value: &'static str,
				275	},
				276	}
				277
				278	/// Looks up a Unicode class given a query. If one doesn't exist, then
				279	/// `None` is returned.
				280	pub fn class(query: ClassQuery) -> Result<hir::ClassUnicode> {
				281	use self::CanonicalClassQuery::*;
				282
				283	match query.canonicalize()? {
				284	Binary(name) => bool_property(name),
				285	GeneralCategory(name) => gencat(name),
				286	Script(name) => script(name),
				287	ByValue { property_name: "Age", property_value } => {
				288	let mut class = hir::ClassUnicode::empty();
				289	for set in ages(property_value)? {
				290	class.union(&hir_class(set));
				291	}
				292	Ok(class)
				293	}
				294	ByValue { property_name: "Script_Extensions", property_value } => {
				295	script_extension(property_value)
				296	}
				297	ByValue {
				298	property_name: "Grapheme_Cluster_Break",
				299	property_value,
				300	} => gcb(property_value),
				301	ByValue { property_name: "Sentence_Break", property_value } => {
				302	sb(property_value)
				303	}
				304	ByValue { property_name: "Word_Break", property_value } => {
				305	wb(property_value)
				306	}
				307	_ => {
				308	// What else should we support?
				309	Err(Error::PropertyNotFound)
				310	}
				311	}
				312	}
				313
				314	/// Returns a Unicode aware class for \w.
				315	///
				316	/// This returns an error if the data is not available for \w.
				317	pub fn perl_word() -> Result<hir::ClassUnicode> {
				318	#[cfg(not(feature = "unicode-perl"))]
				319	fn imp() -> Result<hir::ClassUnicode> {
				320	Err(Error::PerlClassNotFound)
				321	}
				322
				323	#[cfg(feature = "unicode-perl")]
				324	fn imp() -> Result<hir::ClassUnicode> {
				325	use unicode_tables::perl_word::PERL_WORD;
				326	Ok(hir_class(PERL_WORD))
				327	}
				328
				329	imp()
				330	}
				331
				332	/// Returns a Unicode aware class for \s.
				333	///
				334	/// This returns an error if the data is not available for \s.
				335	pub fn perl_space() -> Result<hir::ClassUnicode> {
				336	#[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))]
				337	fn imp() -> Result<hir::ClassUnicode> {
				338	Err(Error::PerlClassNotFound)
				339	}
				340
				341	#[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
				342	fn imp() -> Result<hir::ClassUnicode> {
				343	use unicode_tables::perl_space::WHITE_SPACE;
				344	Ok(hir_class(WHITE_SPACE))
				345	}
				346
				347	#[cfg(feature = "unicode-bool")]
				348	fn imp() -> Result<hir::ClassUnicode> {
				349	use unicode_tables::property_bool::WHITE_SPACE;
				350	Ok(hir_class(WHITE_SPACE))
				351	}
				352
				353	imp()
				354	}
				355
				356	/// Returns a Unicode aware class for \d.
				357	///
				358	/// This returns an error if the data is not available for \d.
				359	pub fn perl_digit() -> Result<hir::ClassUnicode> {
				360	#[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))]
				361	fn imp() -> Result<hir::ClassUnicode> {
				362	Err(Error::PerlClassNotFound)
				363	}
				364
				365	#[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
				366	fn imp() -> Result<hir::ClassUnicode> {
				367	use unicode_tables::perl_decimal::DECIMAL_NUMBER;
				368	Ok(hir_class(DECIMAL_NUMBER))
				369	}
				370
				371	#[cfg(feature = "unicode-gencat")]
				372	fn imp() -> Result<hir::ClassUnicode> {
				373	use unicode_tables::general_category::DECIMAL_NUMBER;
				374	Ok(hir_class(DECIMAL_NUMBER))
				375	}
				376
				377	imp()
				378	}
				379
				380	/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
				381	pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
				382	let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges
				383	.iter()
				384	.map(\|&(s, e)\| hir::ClassUnicodeRange::new(s, e))
				385	.collect();
				386	hir::ClassUnicode::new(hir_ranges)
				387	}
				388
				389	/// Returns true only if the given codepoint is in the `\w` character class.
				390	///
				391	/// If the `unicode-perl` feature is not enabled, then this returns an error.
				392	pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> {
				393	#[cfg(not(feature = "unicode-perl"))]
				394	fn imp(_: char) -> result::Result<bool, UnicodeWordError> {
				395	Err(UnicodeWordError(()))
				396	}
				397
				398	#[cfg(feature = "unicode-perl")]
				399	fn imp(c: char) -> result::Result<bool, UnicodeWordError> {
				400	use is_word_byte;
				401	use std::cmp::Ordering;
				402	use unicode_tables::perl_word::PERL_WORD;
				403
				404	if c <= 0x7F as char && is_word_byte(c as u8) {
				405	return Ok(true);
				406	}
				407	Ok(PERL_WORD
				408	.binary_search_by(\|&(start, end)\| {
				409	if start <= c && c <= end {
				410	Ordering::Equal
				411	} else if start > c {
				412	Ordering::Greater
				413	} else {
				414	Ordering::Less
				415	}
				416	})
				417	.is_ok())
				418	}
				419
				420	imp(c)
				421	}
				422
				423	/// A mapping of property values for a specific property.
				424	///
				425	/// The first element of each tuple is a normalized property value while the
				426	/// second element of each tuple is the corresponding canonical property
				427	/// value.
				428	type PropertyValues = &'static [(&'static str, &'static str)];
				429
				430	fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> {
				431	Ok(match normalized_value {
				432	"any" => Some("Any"),
				433	"assigned" => Some("Assigned"),
				434	"ascii" => Some("ASCII"),
				435	_ => {
				436	let gencats = property_values("General_Category")?.unwrap();
				437	canonical_value(gencats, normalized_value)
				438	}
				439	})
				440	}
				441
				442	fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> {
				443	let scripts = property_values("Script")?.unwrap();
				444	Ok(canonical_value(scripts, normalized_value))
				445	}
				446
				447	/// Find the canonical property name for the given normalized property name.
				448	///
				449	/// If no such property exists, then `None` is returned.
				450	///
				451	/// The normalized property name must have been normalized according to
				452	/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
				453	///
				454	/// If the property names data is not available, then an error is returned.
				455	fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> {
				456	#[cfg(not(any(
				457	feature = "unicode-age",
				458	feature = "unicode-bool",
				459	feature = "unicode-gencat",
				460	feature = "unicode-perl",
				461	feature = "unicode-script",
				462	feature = "unicode-segment",
				463	)))]
				464	fn imp(_: &str) -> Result<Option<&'static str>> {
				465	Err(Error::PropertyNotFound)
				466	}
				467
				468	#[cfg(any(
				469	feature = "unicode-age",
				470	feature = "unicode-bool",
				471	feature = "unicode-gencat",
				472	feature = "unicode-perl",
				473	feature = "unicode-script",
				474	feature = "unicode-segment",
				475	))]
				476	fn imp(name: &str) -> Result<Option<&'static str>> {
				477	use unicode_tables::property_names::PROPERTY_NAMES;
				478
				479	Ok(PROPERTY_NAMES
				480	.binary_search_by_key(&name, \|&(n, _)\| n)
				481	.ok()
				482	.map(\|i\| PROPERTY_NAMES[i].1))
				483	}
				484
				485	imp(normalized_name)
				486	}
				487
				488	/// Find the canonical property value for the given normalized property
				489	/// value.
				490	///
				491	/// The given property values should correspond to the values for the property
				492	/// under question, which can be found using `property_values`.
				493	///
				494	/// If no such property value exists, then `None` is returned.
				495	///
				496	/// The normalized property value must have been normalized according to
				497	/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
				498	fn canonical_value(
				499	vals: PropertyValues,
				500	normalized_value: &str,
				501	) -> Option<&'static str> {
				502	vals.binary_search_by_key(&normalized_value, \|&(n, _)\| n)
				503	.ok()
				504	.map(\|i\| vals[i].1)
				505	}
				506
				507	/// Return the table of property values for the given property name.
				508	///
				509	/// If the property values data is not available, then an error is returned.
				510	fn property_values(
				511	canonical_property_name: &'static str,
				512	) -> Result<Option<PropertyValues>> {
				513	#[cfg(not(any(
				514	feature = "unicode-age",
				515	feature = "unicode-bool",
				516	feature = "unicode-gencat",
				517	feature = "unicode-perl",
				518	feature = "unicode-script",
				519	feature = "unicode-segment",
				520	)))]
				521	fn imp(_: &'static str) -> Result<Option<PropertyValues>> {
				522	Err(Error::PropertyValueNotFound)
				523	}
				524
				525	#[cfg(any(
				526	feature = "unicode-age",
				527	feature = "unicode-bool",
				528	feature = "unicode-gencat",
				529	feature = "unicode-perl",
				530	feature = "unicode-script",
				531	feature = "unicode-segment",
				532	))]
				533	fn imp(name: &'static str) -> Result<Option<PropertyValues>> {
				534	use unicode_tables::property_values::PROPERTY_VALUES;
				535
				536	Ok(PROPERTY_VALUES
				537	.binary_search_by_key(&name, \|&(n, _)\| n)
				538	.ok()
				539	.map(\|i\| PROPERTY_VALUES[i].1))
				540	}
				541
				542	imp(canonical_property_name)
				543	}
				544
				545	// This is only used in some cases, but small enough to just let it be dead
				546	// instead of figuring out (and maintaining) the right set of features.
				547	#[allow(dead_code)]
				548	fn property_set(
				549	name_map: &'static [(&'static str, Range)],
				550	canonical: &'static str,
				551	) -> Option<Range> {
				552	name_map
				553	.binary_search_by_key(&canonical, \|x\| x.0)
				554	.ok()
				555	.map(\|i\| name_map[i].1)
				556	}
				557
				558	/// Returns an iterator over Unicode Age sets. Each item corresponds to a set
				559	/// of codepoints that were added in a particular revision of Unicode. The
				560	/// iterator yields items in chronological order.
				561	///
				562	/// If the given age value isn't valid or if the data isn't available, then an
				563	/// error is returned instead.
				564	fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
				565	#[cfg(not(feature = "unicode-age"))]
				566	fn imp(_: &str) -> Result<impl Iterator<Item = Range>> {
				567	use std::option::IntoIter;
				568	Err::<IntoIter<Range>, _>(Error::PropertyNotFound)
				569	}
				570
				571	#[cfg(feature = "unicode-age")]
				572	fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
				573	use unicode_tables::age;
				574
				575	const AGES: &'static [(&'static str, Range)] = &[
				576	("V1_1", age::V1_1),
				577	("V2_0", age::V2_0),
				578	("V2_1", age::V2_1),
				579	("V3_0", age::V3_0),
				580	("V3_1", age::V3_1),
				581	("V3_2", age::V3_2),
				582	("V4_0", age::V4_0),
				583	("V4_1", age::V4_1),
				584	("V5_0", age::V5_0),
				585	("V5_1", age::V5_1),
				586	("V5_2", age::V5_2),
				587	("V6_0", age::V6_0),
				588	("V6_1", age::V6_1),
				589	("V6_2", age::V6_2),
				590	("V6_3", age::V6_3),
				591	("V7_0", age::V7_0),
				592	("V8_0", age::V8_0),
				593	("V9_0", age::V9_0),
				594	("V10_0", age::V10_0),
				595	("V11_0", age::V11_0),
				596	("V12_0", age::V12_0),
				597	("V12_1", age::V12_1),
				598	("V13_0", age::V13_0),
				599	];
				600	assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
				601
				602	let pos = AGES.iter().position(\|&(age, _)\| canonical_age == age);
				603	match pos {
				604	None => Err(Error::PropertyValueNotFound),
				605	Some(i) => Ok(AGES[..i + 1].iter().map(\|&(_, classes)\| classes)),
				606	}
				607	}
				608
				609	imp(canonical_age)
				610	}
				611
				612	/// Returns the Unicode HIR class corresponding to the given general category.
				613	///
				614	/// Name canonicalization is assumed to be performed by the caller.
				615	///
				616	/// If the given general category could not be found, or if the general
				617	/// category data is not available, then an error is returned.
				618	fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
				619	#[cfg(not(feature = "unicode-gencat"))]
				620	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
				621	Err(Error::PropertyNotFound)
				622	}
				623
				624	#[cfg(feature = "unicode-gencat")]
				625	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
				626	use unicode_tables::general_category::BY_NAME;
				627	match name {
				628	"ASCII" => Ok(hir_class(&[('\0', '\x7F')])),
				629	"Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])),
				630	"Assigned" => {
				631	let mut cls = gencat("Unassigned")?;
				632	cls.negate();
				633	Ok(cls)
				634	}
				635	name => property_set(BY_NAME, name)
				636	.map(hir_class)
				637	.ok_or(Error::PropertyValueNotFound),
				638	}
				639	}
				640
				641	match canonical_name {
				642	"Decimal_Number" => perl_digit(),
				643	name => imp(name),
				644	}
				645	}
				646
				647	/// Returns the Unicode HIR class corresponding to the given script.
				648	///
				649	/// Name canonicalization is assumed to be performed by the caller.
				650	///
				651	/// If the given script could not be found, or if the script data is not
				652	/// available, then an error is returned.
				653	fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
				654	#[cfg(not(feature = "unicode-script"))]
				655	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
				656	Err(Error::PropertyNotFound)
				657	}
				658
				659	#[cfg(feature = "unicode-script")]
				660	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
				661	use unicode_tables::script::BY_NAME;
				662	property_set(BY_NAME, name)
				663	.map(hir_class)
				664	.ok_or(Error::PropertyValueNotFound)
				665	}
				666
				667	imp(canonical_name)
				668	}
				669
				670	/// Returns the Unicode HIR class corresponding to the given script extension.
				671	///
				672	/// Name canonicalization is assumed to be performed by the caller.
				673	///
				674	/// If the given script extension could not be found, or if the script data is
				675	/// not available, then an error is returned.
				676	fn script_extension(
				677	canonical_name: &'static str,
				678	) -> Result<hir::ClassUnicode> {
				679	#[cfg(not(feature = "unicode-script"))]
				680	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
				681	Err(Error::PropertyNotFound)
				682	}
				683
				684	#[cfg(feature = "unicode-script")]
				685	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
				686	use unicode_tables::script_extension::BY_NAME;
				687	property_set(BY_NAME, name)
				688	.map(hir_class)
				689	.ok_or(Error::PropertyValueNotFound)
				690	}
				691
				692	imp(canonical_name)
				693	}
				694
				695	/// Returns the Unicode HIR class corresponding to the given Unicode boolean
				696	/// property.
				697	///
				698	/// Name canonicalization is assumed to be performed by the caller.
				699	///
				700	/// If the given boolean property could not be found, or if the boolean
				701	/// property data is not available, then an error is returned.
				702	fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
				703	#[cfg(not(feature = "unicode-bool"))]
				704	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
				705	Err(Error::PropertyNotFound)
				706	}
				707
				708	#[cfg(feature = "unicode-bool")]
				709	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
				710	use unicode_tables::property_bool::BY_NAME;
				711	property_set(BY_NAME, name)
				712	.map(hir_class)
				713	.ok_or(Error::PropertyNotFound)
				714	}
				715
				716	match canonical_name {
				717	"Decimal_Number" => perl_digit(),
				718	"White_Space" => perl_space(),
				719	name => imp(name),
				720	}
				721	}
				722
				723	/// Returns the Unicode HIR class corresponding to the given grapheme cluster
				724	/// break property.
				725	///
				726	/// Name canonicalization is assumed to be performed by the caller.
				727	///
				728	/// If the given property could not be found, or if the corresponding data is
				729	/// not available, then an error is returned.
				730	fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
				731	#[cfg(not(feature = "unicode-segment"))]
				732	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
				733	Err(Error::PropertyNotFound)
				734	}
				735
				736	#[cfg(feature = "unicode-segment")]
				737	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
				738	use unicode_tables::grapheme_cluster_break::BY_NAME;
				739	property_set(BY_NAME, name)
				740	.map(hir_class)
				741	.ok_or(Error::PropertyValueNotFound)
				742	}
				743
				744	imp(canonical_name)
				745	}
				746
				747	/// Returns the Unicode HIR class corresponding to the given word break
				748	/// property.
				749	///
				750	/// Name canonicalization is assumed to be performed by the caller.
				751	///
				752	/// If the given property could not be found, or if the corresponding data is
				753	/// not available, then an error is returned.
				754	fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
				755	#[cfg(not(feature = "unicode-segment"))]
				756	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
				757	Err(Error::PropertyNotFound)
				758	}
				759
				760	#[cfg(feature = "unicode-segment")]
				761	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
				762	use unicode_tables::word_break::BY_NAME;
				763	property_set(BY_NAME, name)
				764	.map(hir_class)
				765	.ok_or(Error::PropertyValueNotFound)
				766	}
				767
				768	imp(canonical_name)
				769	}
				770
				771	/// Returns the Unicode HIR class corresponding to the given sentence
				772	/// break property.
				773	///
				774	/// Name canonicalization is assumed to be performed by the caller.
				775	///
				776	/// If the given property could not be found, or if the corresponding data is
				777	/// not available, then an error is returned.
				778	fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
				779	#[cfg(not(feature = "unicode-segment"))]
				780	fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
				781	Err(Error::PropertyNotFound)
				782	}
				783
				784	#[cfg(feature = "unicode-segment")]
				785	fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
				786	use unicode_tables::sentence_break::BY_NAME;
				787	property_set(BY_NAME, name)
				788	.map(hir_class)
				789	.ok_or(Error::PropertyValueNotFound)
				790	}
				791
				792	imp(canonical_name)
				793	}
				794
				795	/// Like symbolic_name_normalize_bytes, but operates on a string.
				796	fn symbolic_name_normalize(x: &str) -> String {
				797	let mut tmp = x.as_bytes().to_vec();
				798	let len = symbolic_name_normalize_bytes(&mut tmp).len();
				799	tmp.truncate(len);
				800	// This should always succeed because `symbolic_name_normalize_bytes`
				801	// guarantees that `&tmp[..len]` is always valid UTF-8.
				802	//
				803	// N.B. We could avoid the additional UTF-8 check here, but it's unlikely
				804	// to be worth skipping the additional safety check. A benchmark must
				805	// justify it first.
				806	String::from_utf8(tmp).unwrap()
				807	}
				808
				809	/// Normalize the given symbolic name in place according to UAX44-LM3.
				810	///
				811	/// A "symbolic name" typically corresponds to property names and property
				812	/// value aliases. Note, though, that it should not be applied to property
				813	/// string values.
				814	///
				815	/// The slice returned is guaranteed to be valid UTF-8 for all possible values
				816	/// of `slice`.
				817	///
				818	/// See: http://unicode.org/reports/tr44/#UAX44-LM3
				819	fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
				820	// I couldn't find a place in the standard that specified that property
				821	// names/aliases had a particular structure (unlike character names), but
				822	// we assume that it's ASCII only and drop anything that isn't ASCII.
				823	let mut start = 0;
				824	let mut starts_with_is = false;
				825	if slice.len() >= 2 {
				826	// Ignore any "is" prefix.
				827	starts_with_is = slice[0..2] == b"is"[..]
				828	\|\| slice[0..2] == b"IS"[..]
				829	\|\| slice[0..2] == b"iS"[..]
				830	\|\| slice[0..2] == b"Is"[..];
				831	if starts_with_is {
				832	start = 2;
				833	}
				834	}
				835	let mut next_write = 0;
				836	for i in start..slice.len() {
				837	// VALIDITY ARGUMENT: To guarantee that the resulting slice is valid
				838	// UTF-8, we ensure that the slice contains only ASCII bytes. In
				839	// particular, we drop every non-ASCII byte from the normalized string.
				840	let b = slice[i];
				841	if b == b' ' \|\| b == b'_' \|\| b == b'-' {
				842	continue;
				843	} else if b'A' <= b && b <= b'Z' {
				844	slice[next_write] = b + (b'a' - b'A');
				845	next_write += 1;
				846	} else if b <= 0x7F {
				847	slice[next_write] = b;
				848	next_write += 1;
				849	}
				850	}
				851	// Special case: ISO_Comment has a 'isc' abbreviation. Since we generally
				852	// ignore 'is' prefixes, the 'isc' abbreviation gets caught in the cross
				853	// fire and ends up creating an alias for 'c' to 'ISO_Comment', but it
				854	// is actually an alias for the 'Other' general category.
				855	if starts_with_is && next_write == 1 && slice[0] == b'c' {
				856	slice[0] = b'i';
				857	slice[1] = b's';
				858	slice[2] = b'c';
				859	next_write = 3;
				860	}
				861	&mut slice[..next_write]
				862	}
				863
				864	#[cfg(test)]
				865	mod tests {
				866	use super::{
				867	contains_simple_case_mapping, simple_fold, symbolic_name_normalize,
				868	symbolic_name_normalize_bytes,
				869	};
				870
				871	#[cfg(feature = "unicode-case")]
				872	fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
				873	simple_fold(c).unwrap().unwrap()
				874	}
				875
				876	#[cfg(feature = "unicode-case")]
				877	fn simple_fold_err(c: char) -> Option<char> {
				878	match simple_fold(c).unwrap() {
				879	Ok(_) => unreachable!("simple_fold returned Ok iterator"),
				880	Err(next) => next,
				881	}
				882	}
				883
				884	#[cfg(feature = "unicode-case")]
				885	fn contains_case_map(start: char, end: char) -> bool {
				886	contains_simple_case_mapping(start, end).unwrap()
				887	}
				888
				889	#[test]
				890	#[cfg(feature = "unicode-case")]
				891	fn simple_fold_k() {
				892	let xs: Vec<char> = simple_fold_ok('k').collect();
				893	assert_eq!(xs, vec!['K', 'K']);
				894
				895	let xs: Vec<char> = simple_fold_ok('K').collect();
				896	assert_eq!(xs, vec!['k', 'K']);
				897
				898	let xs: Vec<char> = simple_fold_ok('K').collect();
				899	assert_eq!(xs, vec!['K', 'k']);
				900	}
				901
				902	#[test]
				903	#[cfg(feature = "unicode-case")]
				904	fn simple_fold_a() {
				905	let xs: Vec<char> = simple_fold_ok('a').collect();
				906	assert_eq!(xs, vec!['A']);
				907
				908	let xs: Vec<char> = simple_fold_ok('A').collect();
				909	assert_eq!(xs, vec!['a']);
				910	}
				911
				912	#[test]
				913	#[cfg(feature = "unicode-case")]
				914	fn simple_fold_empty() {
				915	assert_eq!(Some('A'), simple_fold_err('?'));
				916	assert_eq!(Some('A'), simple_fold_err('@'));
				917	assert_eq!(Some('a'), simple_fold_err('['));
				918	assert_eq!(Some('Ⰰ'), simple_fold_err('☃'));
				919	}
				920
				921	#[test]
				922	#[cfg(feature = "unicode-case")]
				923	fn simple_fold_max() {
				924	assert_eq!(None, simple_fold_err('\u{10FFFE}'));
				925	assert_eq!(None, simple_fold_err('\u{10FFFF}'));
				926	}
				927
				928	#[test]
				929	#[cfg(not(feature = "unicode-case"))]
				930	fn simple_fold_disabled() {
				931	assert!(simple_fold('a').is_err());
				932	}
				933
				934	#[test]
				935	#[cfg(feature = "unicode-case")]
				936	fn range_contains() {
				937	assert!(contains_case_map('A', 'A'));
				938	assert!(contains_case_map('Z', 'Z'));
				939	assert!(contains_case_map('A', 'Z'));
				940	assert!(contains_case_map('@', 'A'));
				941	assert!(contains_case_map('Z', '['));
				942	assert!(contains_case_map('☃', 'Ⰰ'));
				943
				944	assert!(!contains_case_map('[', '['));
				945	assert!(!contains_case_map('[', '`'));
				946
				947	assert!(!contains_case_map('☃', '☃'));
				948	}
				949
				950	#[test]
				951	#[cfg(not(feature = "unicode-case"))]
				952	fn range_contains_disabled() {
				953	assert!(contains_simple_case_mapping('a', 'a').is_err());
				954	}
				955
				956	#[test]
				957	#[cfg(feature = "unicode-gencat")]
				958	fn regression_466() {
				959	use super::{CanonicalClassQuery, ClassQuery};
				960
				961	let q = ClassQuery::OneLetter('C');
				962	assert_eq!(
				963	q.canonicalize().unwrap(),
				964	CanonicalClassQuery::GeneralCategory("Other")
				965	);
				966	}
				967
				968	#[test]
				969	fn sym_normalize() {
				970	let sym_norm = symbolic_name_normalize;
				971
				972	assert_eq!(sym_norm("Line_Break"), "linebreak");
				973	assert_eq!(sym_norm("Line-break"), "linebreak");
				974	assert_eq!(sym_norm("linebreak"), "linebreak");
				975	assert_eq!(sym_norm("BA"), "ba");
				976	assert_eq!(sym_norm("ba"), "ba");
				977	assert_eq!(sym_norm("Greek"), "greek");
				978	assert_eq!(sym_norm("isGreek"), "greek");
				979	assert_eq!(sym_norm("IS_Greek"), "greek");
				980	assert_eq!(sym_norm("isc"), "isc");
				981	assert_eq!(sym_norm("is c"), "isc");
				982	assert_eq!(sym_norm("is_c"), "isc");
				983	}
				984
				985	#[test]
				986	fn valid_utf8_symbolic() {
				987	let mut x = b"abc\xFFxyz".to_vec();
				988	let y = symbolic_name_normalize_bytes(&mut x);
				989	assert_eq!(y, b"abcxyz");
				990	}
				991	}