Blame - src/pcre2_script_run.c - platform/external/pcre

blob: 4926fa63bbf98708bf346af85db5ca8a5ef71342 [file] [log] [blame]

Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	1	/*************************************************
				2	* Perl-Compatible Regular Expressions *
				3	*************************************************/
				4
				5	/* PCRE is a library of functions to support regular expressions whose syntax
				6	and semantics are as close as possible to those of the Perl 5 language.
				7
				8	Written by Philip Hazel
				9	Original API code Copyright (c) 1997-2012 University of Cambridge
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	10	New API code Copyright (c) 2016-2021 University of Cambridge
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	11
				12	-----------------------------------------------------------------------------
				13	Redistribution and use in source and binary forms, with or without
				14	modification, are permitted provided that the following conditions are met:
				15
				16	* Redistributions of source code must retain the above copyright notice,
				17	this list of conditions and the following disclaimer.
				18
				19	* Redistributions in binary form must reproduce the above copyright
				20	notice, this list of conditions and the following disclaimer in the
				21	documentation and/or other materials provided with the distribution.
				22
				23	* Neither the name of the University of Cambridge nor the names of its
				24	contributors may be used to endorse or promote products derived from
				25	this software without specific prior written permission.
				26
				27	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
				28	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
				29	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
				30	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
				31	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
				32	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
				33	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
				34	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
				35	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
				36	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
				37	POSSIBILITY OF SUCH DAMAGE.
				38	-----------------------------------------------------------------------------
				39	*/
				40
				41	/* This module contains the function for checking a script run. */
				42
				43	#ifdef HAVE_CONFIG_H
				44	#include "config.h"
				45	#endif
				46
				47	#include "pcre2_internal.h"
				48
				49
				50	/*************************************************
				51	* Check script run *
				52	*************************************************/
				53
				54	/* A script run is conceptually a sequence of characters all in the same
				55	Unicode script. However, it isn't quite that simple. There are special rules
				56	for scripts that are commonly used together, and also special rules for digits.
				57	This function implements the appropriate checks, which is possible only when
				58	PCRE2 is compiled with Unicode support. The function returns TRUE if there is
				59	no Unicode support; however, it should never be called in that circumstance
				60	because an error is given by pcre2_compile() if a script run is called for in a
				61	version of PCRE2 compiled without Unicode support.
				62
				63	Arguments:
				64	ptr points to the first character
				65	endptr points after the last character
				66	utf TRUE if in UTF mode
				67
				68	Returns: TRUE if this is a valid script run
				69	*/
				70
/* These are the states of the checking process. The current state records
what has been established so far about the scripts that the remaining
characters of the string are allowed to use. */

enum {
  SCRIPT_UNSET,          /* Requirement as yet unknown */
  SCRIPT_MAP,            /* Bitmap contains acceptable scripts */
  SCRIPT_HANPENDING,     /* Have had only Han characters */
  SCRIPT_HANHIRAKATA,    /* Expect Han, Hiragana, or Katakana */
  SCRIPT_HANBOPOMOFO,    /* Expect Han or Bopomofo */
  SCRIPT_HANHANGUL       /* Expect Han or Hangul */
};
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	80
/* Sizes, in 32-bit words, of the two kinds of script bitmap used when checking
a script run. UCD_MAPSIZE is the size of a map that has bits only for the
scripts numbered below ucp_Unknown (those that appear in script extension
lists); FULL_MAPSIZE is the size of a map that has a bit for every script. */

#define UCD_MAPSIZE (ucp_Unknown/32 + 1)
#define FULL_MAPSIZE (ucp_Script_Count/32 + 1)
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	83
				84	BOOL
				85	PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
				86	{
				87	#ifdef SUPPORT_UNICODE
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	88	uint32_t require_state = SCRIPT_UNSET;
				89	uint32_t require_map[FULL_MAPSIZE];
				90	uint32_t map[FULL_MAPSIZE];
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	91	uint32_t require_digitset = 0;
				92	uint32_t c;
				93
				94	#if PCRE2_CODE_UNIT_WIDTH == 32
				95	(void)utf; /* Avoid compiler warning */
				96	#endif
				97
				98	/* Any string containing fewer than 2 characters is a valid script run. */
				99
				100	if (ptr >= endptr) return TRUE;
				101	GETCHARINCTEST(c, ptr);
				102	if (ptr >= endptr) return TRUE;
				103
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	104	/* Initialize the require map. This is a full-size bitmap that has a bit for
				105	every script, as opposed to the maps in ucd_script_sets, which only have bits
				106	for scripts less than ucp_Unknown - those that appear in script extension
				107	lists. */
				108
				109	for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
				110
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	111	/* Scan strings of two or more characters, checking the Unicode characteristics
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	112	of each code point. There is special code for scripts that can be combined with
				113	characters from the Han Chinese script. This may be used in conjunction with
				114	four other scripts in these combinations:
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	115
				116	. Han with Hiragana and Katakana is allowed (for Japanese).
				117	. Han with Bopomofo is allowed (for Taiwanese Mandarin).
				118	. Han with Hangul is allowed (for Korean).
				119
				120	If the first significant character's script is one of the four, the required
				121	script type is immediately known. However, if the first significant
				122	character's script is Han, we have to keep checking for a non-Han character.
				123	Hence the SCRIPT_HANPENDING state. */
				124
				125	for (;;)
				126	{
				127	const ucd_record *ucd = GET_UCD(c);
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	128	uint32_t script = ucd->script;
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	129
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	130	/* If the script is Unknown, the string is not a valid script run. Such
				131	characters can only form script runs of length one (see test above). */
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	132
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	133	if (script == ucp_Unknown) return FALSE;
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	134
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	135	/* A character without any script extensions whose script is Inherited or
				136	Common is always accepted with any script. If there are extensions, the
				137	following processing happens for all scripts. */
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	138
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	139	if (UCD_SCRIPTX_PROP(ucd) != 0 \|\| (script != ucp_Inherited && script != ucp_Common))
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	140	{
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	141	BOOL OK;
				142
				143	/* Set up a full-sized map for this character that can include bits for all
				144	scripts. Copy the scriptx map for this character (which covers those
				145	scripts that appear in script extension lists), set the remaining values to
				146	zero, and then, except for Common or Inherited, add this script's bit to
				147	the map. */
				148
				149	memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t));
				150	memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
				151	if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
				152
				153	/* Handle the different checking states */
				154
				155	switch(require_state)
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	156	{
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	157	/* First significant character - it might follow Common or Inherited
				158	characters that do not have any script extensions. */
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	159
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	160	case SCRIPT_UNSET:
				161	switch(script)
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	162	{
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	163	case ucp_Han:
				164	require_state = SCRIPT_HANPENDING;
				165	break;
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	166
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	167	case ucp_Hiragana:
				168	case ucp_Katakana:
				169	require_state = SCRIPT_HANHIRAKATA;
				170	break;
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	171
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	172	case ucp_Bopomofo:
				173	require_state = SCRIPT_HANBOPOMOFO;
				174	break;
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	175
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	176	case ucp_Hangul:
				177	require_state = SCRIPT_HANHANGUL;
				178	break;
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	179
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	180	default:
				181	memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
				182	require_state = SCRIPT_MAP;
				183	break;
				184	}
				185	break;
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	186
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	187	/* The first significant character was Han. An inspection of the Unicode
				188	11.0.0 files shows that there are the following types of Script Extension
				189	list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
				190	scripts:
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	191
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	192	. Bopomofo + Han
				193	. Han + Hiragana + Katakana
				194	. Hiragana + Katakana
				195	. Bopopmofo + Hangul + Han + Hiragana + Katakana
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	196
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	197	The following code tries to make sense of this. */
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	198
				199	#define FOUND_BOPOMOFO 1
				200	#define FOUND_HIRAGANA 2
				201	#define FOUND_KATAKANA 4
				202	#define FOUND_HANGUL 8
				203
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	204	case SCRIPT_HANPENDING:
				205	if (script != ucp_Han) /* Another Han does nothing */
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	206	{
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	207	uint32_t chspecial = 0;
				208
				209	if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial \|= FOUND_BOPOMOFO;
				210	if (MAPBIT(map, ucp_Hiragana) != 0) chspecial \|= FOUND_HIRAGANA;
				211	if (MAPBIT(map, ucp_Katakana) != 0) chspecial \|= FOUND_KATAKANA;
				212	if (MAPBIT(map, ucp_Hangul) != 0) chspecial \|= FOUND_HANGUL;
				213
				214	if (chspecial == 0) return FALSE; /* Not allowed with Han */
				215
				216	if (chspecial == FOUND_BOPOMOFO)
				217	require_state = SCRIPT_HANBOPOMOFO;
				218	else if (chspecial == (FOUND_HIRAGANA\|FOUND_KATAKANA))
				219	require_state = SCRIPT_HANHIRAKATA;
				220
				221	/* Otherwise this character must be allowed with all of them, so remain
				222	in the pending state. */
				223	}
				224	break;
				225
				226	/* Previously encountered one of the "with Han" scripts. Check that
				227	this character is appropriate. */
				228
				229	case SCRIPT_HANHIRAKATA:
				230	if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
				231	MAPBIT(map, ucp_Katakana) == 0) return FALSE;
				232	break;
				233
				234	case SCRIPT_HANBOPOMOFO:
				235	if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
				236	break;
				237
				238	case SCRIPT_HANHANGUL:
				239	if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
				240	break;
				241
				242	/* Previously encountered one or more characters that are allowed with a
				243	list of scripts. */
				244
				245	case SCRIPT_MAP:
				246	OK = FALSE;
				247
				248	for (int i = 0; i < FULL_MAPSIZE; i++)
				249	{
				250	if ((require_map[i] & map[i]) != 0)
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	251	{
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	252	OK = TRUE;
				253	break;
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	254	}
				255	}
				256
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	257	if (!OK) return FALSE;
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	258
Elliott Hughes	4e19c8e	2022-04-15 15:11:02 -0700	[diff] [blame]	259	/* The rest of the string must be in this script, but we have to
				260	allow for the Han complications. */
				261
				262	switch(script)
				263	{
				264	case ucp_Han:
				265	require_state = SCRIPT_HANPENDING;
				266	break;
				267
				268	case ucp_Hiragana:
				269	case ucp_Katakana:
				270	require_state = SCRIPT_HANHIRAKATA;
				271	break;
				272
				273	case ucp_Bopomofo:
				274	require_state = SCRIPT_HANBOPOMOFO;
				275	break;
				276
				277	case ucp_Hangul:
				278	require_state = SCRIPT_HANHANGUL;
				279	break;
				280
				281	/* Compute the intersection of the required list of scripts and the
				282	allowed scripts for this character. */
				283
				284	default:
				285	for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
				286	break;
				287	}
				288
				289	break;
				290	}
				291	} /* End checking character's script and extensions. */
				292
				293	/* The character is in an acceptable script. We must now ensure that all
				294	decimal digits in the string come from the same set. Some scripts (e.g.
				295	Common, Arabic) have more than one set of decimal digits. This code does
				296	not allow mixing sets, even within the same script. The vector called
				297	PRIV(ucd_digit_sets)[] contains, in its first element, the number of
				298	following elements, and then, in ascending order, the code points of the
				299	'9' characters in every set of 10 digits. Each set is identified by the
				300	offset in the vector of its '9' character. An initial check of the first
				301	value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
				302
				303	if (ucd->chartype == ucp_Nd)
				304	{
				305	uint32_t digitset;
				306
				307	if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
				308	{
				309	int mid;
				310	int bot = 1;
				311	int top = PRIV(ucd_digit_sets)[0];
				312	for (;;)
				313	{
				314	if (top <= bot + 1) /* <= rather than == is paranoia */
				315	{
				316	digitset = top;
				317	break;
				318	}
				319	mid = (top + bot) / 2;
				320	if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
				321	}
				322	}
				323
				324	/* A required value of 0 means "unset". */
				325
				326	if (require_digitset == 0) require_digitset = digitset;
				327	else if (digitset != require_digitset) return FALSE;
				328	} /* End digit handling */
Elliott Hughes	0c26e19	2019-08-07 12:24:46 -0700	[diff] [blame]	329
				330	/* If we haven't yet got to the end, pick up the next character. */
				331
				332	if (ptr >= endptr) return TRUE;
				333	GETCHARINCTEST(c, ptr);
				334	} /* End checking loop */
				335
				336	#else /* NOT SUPPORT_UNICODE */
				337	(void)ptr;
				338	(void)endptr;
				339	(void)utf;
				340	return TRUE;
				341	#endif /* SUPPORT_UNICODE */
				342	}
				343
				344	/* End of pcre2_script_run.c */