/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2021 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/

/* This module contains the function for checking a script run. */

43#ifdef HAVE_CONFIG_H
44#include "config.h"
45#endif
46
47#include "pcre2_internal.h"
48
49
/*************************************************
*                Check script run                *
*************************************************/

/* A script run is conceptually a sequence of characters all in the same
Unicode script. However, it isn't quite that simple. There are special rules
for scripts that are commonly used together, and also special rules for digits.
This function implements the appropriate checks, which is possible only when
PCRE2 is compiled with Unicode support. The function returns TRUE if there is
no Unicode support; however, it should never be called in that circumstance
because an error is given by pcre2_compile() if a script run is called for in a
version of PCRE2 compiled without Unicode support.

Arguments:
  ptr        points to the first character
  endptr     points after the last character
  utf        TRUE if in UTF mode

Returns:    TRUE if this is a valid script run
*/

/* These are states in the checking process. They record what has been learnt
so far about which scripts the remainder of the string may use. */

enum { SCRIPT_UNSET,          /* Requirement as yet unknown */
       SCRIPT_MAP,            /* Bitmap contains acceptable scripts */
       SCRIPT_HANPENDING,     /* Have had only Han characters */
       SCRIPT_HANHIRAKATA,    /* Expect Han, Hiragana, or Katakana */
       SCRIPT_HANBOPOMOFO,    /* Expect Han or Bopomofo */
       SCRIPT_HANHANGUL       /* Expect Han or Hangul */
       };

/* Sizes, in 32-bit words, of the two kinds of script bitmap. UCD_MAPSIZE is
the size of the maps that are copied out of ucd_script_sets, sized from
ucp_Unknown. FULL_MAPSIZE is the size of a map sized from ucp_Script_Count,
which has a bit for every script. */

#define UCD_MAPSIZE (ucp_Unknown/32 + 1)
#define FULL_MAPSIZE (ucp_Script_Count/32 + 1)

84BOOL
85PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
86{
87#ifdef SUPPORT_UNICODE
Elliott Hughes4e19c8e2022-04-15 15:11:02 -070088uint32_t require_state = SCRIPT_UNSET;
89uint32_t require_map[FULL_MAPSIZE];
90uint32_t map[FULL_MAPSIZE];
Elliott Hughes0c26e192019-08-07 12:24:46 -070091uint32_t require_digitset = 0;
92uint32_t c;
93
94#if PCRE2_CODE_UNIT_WIDTH == 32
95(void)utf; /* Avoid compiler warning */
96#endif
97
98/* Any string containing fewer than 2 characters is a valid script run. */
99
100if (ptr >= endptr) return TRUE;
101GETCHARINCTEST(c, ptr);
102if (ptr >= endptr) return TRUE;
103
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700104/* Initialize the require map. This is a full-size bitmap that has a bit for
105every script, as opposed to the maps in ucd_script_sets, which only have bits
106for scripts less than ucp_Unknown - those that appear in script extension
107lists. */
108
109for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
110
Elliott Hughes0c26e192019-08-07 12:24:46 -0700111/* Scan strings of two or more characters, checking the Unicode characteristics
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700112of each code point. There is special code for scripts that can be combined with
113characters from the Han Chinese script. This may be used in conjunction with
114four other scripts in these combinations:
Elliott Hughes0c26e192019-08-07 12:24:46 -0700115
116. Han with Hiragana and Katakana is allowed (for Japanese).
117. Han with Bopomofo is allowed (for Taiwanese Mandarin).
118. Han with Hangul is allowed (for Korean).
119
120If the first significant character's script is one of the four, the required
121script type is immediately known. However, if the first significant
122character's script is Han, we have to keep checking for a non-Han character.
123Hence the SCRIPT_HANPENDING state. */
124
125for (;;)
126 {
127 const ucd_record *ucd = GET_UCD(c);
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700128 uint32_t script = ucd->script;
Elliott Hughes0c26e192019-08-07 12:24:46 -0700129
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700130 /* If the script is Unknown, the string is not a valid script run. Such
131 characters can only form script runs of length one (see test above). */
Elliott Hughes0c26e192019-08-07 12:24:46 -0700132
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700133 if (script == ucp_Unknown) return FALSE;
Elliott Hughes0c26e192019-08-07 12:24:46 -0700134
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700135 /* A character without any script extensions whose script is Inherited or
136 Common is always accepted with any script. If there are extensions, the
137 following processing happens for all scripts. */
Elliott Hughes0c26e192019-08-07 12:24:46 -0700138
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700139 if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common))
Elliott Hughes0c26e192019-08-07 12:24:46 -0700140 {
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700141 BOOL OK;
142
143 /* Set up a full-sized map for this character that can include bits for all
144 scripts. Copy the scriptx map for this character (which covers those
145 scripts that appear in script extension lists), set the remaining values to
146 zero, and then, except for Common or Inherited, add this script's bit to
147 the map. */
148
149 memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t));
150 memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
151 if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
152
153 /* Handle the different checking states */
154
155 switch(require_state)
Elliott Hughes0c26e192019-08-07 12:24:46 -0700156 {
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700157 /* First significant character - it might follow Common or Inherited
158 characters that do not have any script extensions. */
Elliott Hughes0c26e192019-08-07 12:24:46 -0700159
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700160 case SCRIPT_UNSET:
161 switch(script)
Elliott Hughes0c26e192019-08-07 12:24:46 -0700162 {
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700163 case ucp_Han:
164 require_state = SCRIPT_HANPENDING;
165 break;
Elliott Hughes0c26e192019-08-07 12:24:46 -0700166
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700167 case ucp_Hiragana:
168 case ucp_Katakana:
169 require_state = SCRIPT_HANHIRAKATA;
170 break;
Elliott Hughes0c26e192019-08-07 12:24:46 -0700171
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700172 case ucp_Bopomofo:
173 require_state = SCRIPT_HANBOPOMOFO;
174 break;
Elliott Hughes0c26e192019-08-07 12:24:46 -0700175
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700176 case ucp_Hangul:
177 require_state = SCRIPT_HANHANGUL;
178 break;
Elliott Hughes0c26e192019-08-07 12:24:46 -0700179
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700180 default:
181 memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
182 require_state = SCRIPT_MAP;
183 break;
184 }
185 break;
Elliott Hughes0c26e192019-08-07 12:24:46 -0700186
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700187 /* The first significant character was Han. An inspection of the Unicode
188 11.0.0 files shows that there are the following types of Script Extension
189 list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
190 scripts:
Elliott Hughes0c26e192019-08-07 12:24:46 -0700191
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700192 . Bopomofo + Han
193 . Han + Hiragana + Katakana
194 . Hiragana + Katakana
195 . Bopopmofo + Hangul + Han + Hiragana + Katakana
Elliott Hughes0c26e192019-08-07 12:24:46 -0700196
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700197 The following code tries to make sense of this. */
Elliott Hughes0c26e192019-08-07 12:24:46 -0700198
199#define FOUND_BOPOMOFO 1
200#define FOUND_HIRAGANA 2
201#define FOUND_KATAKANA 4
202#define FOUND_HANGUL 8
203
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700204 case SCRIPT_HANPENDING:
205 if (script != ucp_Han) /* Another Han does nothing */
Elliott Hughes0c26e192019-08-07 12:24:46 -0700206 {
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700207 uint32_t chspecial = 0;
208
209 if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
210 if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
211 if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
212 if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL;
213
214 if (chspecial == 0) return FALSE; /* Not allowed with Han */
215
216 if (chspecial == FOUND_BOPOMOFO)
217 require_state = SCRIPT_HANBOPOMOFO;
218 else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
219 require_state = SCRIPT_HANHIRAKATA;
220
221 /* Otherwise this character must be allowed with all of them, so remain
222 in the pending state. */
223 }
224 break;
225
226 /* Previously encountered one of the "with Han" scripts. Check that
227 this character is appropriate. */
228
229 case SCRIPT_HANHIRAKATA:
230 if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
231 MAPBIT(map, ucp_Katakana) == 0) return FALSE;
232 break;
233
234 case SCRIPT_HANBOPOMOFO:
235 if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
236 break;
237
238 case SCRIPT_HANHANGUL:
239 if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
240 break;
241
242 /* Previously encountered one or more characters that are allowed with a
243 list of scripts. */
244
245 case SCRIPT_MAP:
246 OK = FALSE;
247
248 for (int i = 0; i < FULL_MAPSIZE; i++)
249 {
250 if ((require_map[i] & map[i]) != 0)
Elliott Hughes0c26e192019-08-07 12:24:46 -0700251 {
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700252 OK = TRUE;
253 break;
Elliott Hughes0c26e192019-08-07 12:24:46 -0700254 }
255 }
256
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700257 if (!OK) return FALSE;
Elliott Hughes0c26e192019-08-07 12:24:46 -0700258
Elliott Hughes4e19c8e2022-04-15 15:11:02 -0700259 /* The rest of the string must be in this script, but we have to
260 allow for the Han complications. */
261
262 switch(script)
263 {
264 case ucp_Han:
265 require_state = SCRIPT_HANPENDING;
266 break;
267
268 case ucp_Hiragana:
269 case ucp_Katakana:
270 require_state = SCRIPT_HANHIRAKATA;
271 break;
272
273 case ucp_Bopomofo:
274 require_state = SCRIPT_HANBOPOMOFO;
275 break;
276
277 case ucp_Hangul:
278 require_state = SCRIPT_HANHANGUL;
279 break;
280
281 /* Compute the intersection of the required list of scripts and the
282 allowed scripts for this character. */
283
284 default:
285 for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
286 break;
287 }
288
289 break;
290 }
291 } /* End checking character's script and extensions. */
292
293 /* The character is in an acceptable script. We must now ensure that all
294 decimal digits in the string come from the same set. Some scripts (e.g.
295 Common, Arabic) have more than one set of decimal digits. This code does
296 not allow mixing sets, even within the same script. The vector called
297 PRIV(ucd_digit_sets)[] contains, in its first element, the number of
298 following elements, and then, in ascending order, the code points of the
299 '9' characters in every set of 10 digits. Each set is identified by the
300 offset in the vector of its '9' character. An initial check of the first
301 value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
302
303 if (ucd->chartype == ucp_Nd)
304 {
305 uint32_t digitset;
306
307 if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
308 {
309 int mid;
310 int bot = 1;
311 int top = PRIV(ucd_digit_sets)[0];
312 for (;;)
313 {
314 if (top <= bot + 1) /* <= rather than == is paranoia */
315 {
316 digitset = top;
317 break;
318 }
319 mid = (top + bot) / 2;
320 if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
321 }
322 }
323
324 /* A required value of 0 means "unset". */
325
326 if (require_digitset == 0) require_digitset = digitset;
327 else if (digitset != require_digitset) return FALSE;
328 } /* End digit handling */
Elliott Hughes0c26e192019-08-07 12:24:46 -0700329
330 /* If we haven't yet got to the end, pick up the next character. */
331
332 if (ptr >= endptr) return TRUE;
333 GETCHARINCTEST(c, ptr);
334 } /* End checking loop */
335
336#else /* NOT SUPPORT_UNICODE */
337(void)ptr;
338(void)endptr;
339(void)utf;
340return TRUE;
341#endif /* SUPPORT_UNICODE */
342}

/* End of pcre2_script_run.c */