/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2021 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/

/* This module contains the function for checking a script run. */

| 43 | #ifdef HAVE_CONFIG_H |
| 44 | #include "config.h" |
| 45 | #endif |
| 46 | |
| 47 | #include "pcre2_internal.h" |
| 48 | |
| 49 | |
/*************************************************
*                Check script run                *
*************************************************/

/* A script run is conceptually a sequence of characters all in the same
Unicode script. However, it isn't quite that simple. There are special rules
for scripts that are commonly used together, and also special rules for digits.
This function implements the appropriate checks, which is possible only when
PCRE2 is compiled with Unicode support. The function returns TRUE if there is
no Unicode support; however, it should never be called in that circumstance
because an error is given by pcre2_compile() if a script run is called for in a
version of PCRE2 compiled without Unicode support.

Arguments:
  ptr       points to the first character
  endptr    points just after the last character
  utf       TRUE if in UTF mode

Returns:    TRUE if this is a valid script run
*/

/* These are the states of the script-run checking process. The order (and
hence the numeric values, starting at zero) is unchanged from the original
list. */

enum {
  SCRIPT_UNSET,          /* Requirement as yet unknown */
  SCRIPT_MAP,            /* Bitmap contains acceptable scripts */
  SCRIPT_HANPENDING,     /* Have had only Han characters */
  SCRIPT_HANHIRAKATA,    /* Expect Han, Hiragana, or Katakana */
  SCRIPT_HANBOPOMOFO,    /* Expect Han or Bopomofo */
  SCRIPT_HANHANGUL       /* Expect Han or Hangul */
};

/* Sizes, in 32-bit words, of the two kinds of script bitmap used below.
UCD_MAPSIZE is the number of words copied from each ucd_script_sets entry;
FULL_MAPSIZE is the number of words in a local map that has one bit for every
script. */

#define UCD_MAPSIZE (ucp_Unknown/32 + 1)
#define FULL_MAPSIZE (ucp_Script_Count/32 + 1)

| 84 | BOOL |
| 85 | PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf) |
| 86 | { |
| 87 | #ifdef SUPPORT_UNICODE |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 88 | uint32_t require_state = SCRIPT_UNSET; |
| 89 | uint32_t require_map[FULL_MAPSIZE]; |
| 90 | uint32_t map[FULL_MAPSIZE]; |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 91 | uint32_t require_digitset = 0; |
| 92 | uint32_t c; |
| 93 | |
| 94 | #if PCRE2_CODE_UNIT_WIDTH == 32 |
| 95 | (void)utf; /* Avoid compiler warning */ |
| 96 | #endif |
| 97 | |
| 98 | /* Any string containing fewer than 2 characters is a valid script run. */ |
| 99 | |
| 100 | if (ptr >= endptr) return TRUE; |
| 101 | GETCHARINCTEST(c, ptr); |
| 102 | if (ptr >= endptr) return TRUE; |
| 103 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 104 | /* Initialize the require map. This is a full-size bitmap that has a bit for |
| 105 | every script, as opposed to the maps in ucd_script_sets, which only have bits |
| 106 | for scripts less than ucp_Unknown - those that appear in script extension |
| 107 | lists. */ |
| 108 | |
| 109 | for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0; |
| 110 | |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 111 | /* Scan strings of two or more characters, checking the Unicode characteristics |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 112 | of each code point. There is special code for scripts that can be combined with |
| 113 | characters from the Han Chinese script. This may be used in conjunction with |
| 114 | four other scripts in these combinations: |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 115 | |
| 116 | . Han with Hiragana and Katakana is allowed (for Japanese). |
| 117 | . Han with Bopomofo is allowed (for Taiwanese Mandarin). |
| 118 | . Han with Hangul is allowed (for Korean). |
| 119 | |
| 120 | If the first significant character's script is one of the four, the required |
| 121 | script type is immediately known. However, if the first significant |
| 122 | character's script is Han, we have to keep checking for a non-Han character. |
| 123 | Hence the SCRIPT_HANPENDING state. */ |
| 124 | |
| 125 | for (;;) |
| 126 | { |
| 127 | const ucd_record *ucd = GET_UCD(c); |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 128 | uint32_t script = ucd->script; |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 129 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 130 | /* If the script is Unknown, the string is not a valid script run. Such |
| 131 | characters can only form script runs of length one (see test above). */ |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 132 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 133 | if (script == ucp_Unknown) return FALSE; |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 134 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 135 | /* A character without any script extensions whose script is Inherited or |
| 136 | Common is always accepted with any script. If there are extensions, the |
| 137 | following processing happens for all scripts. */ |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 138 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 139 | if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common)) |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 140 | { |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 141 | BOOL OK; |
| 142 | |
| 143 | /* Set up a full-sized map for this character that can include bits for all |
| 144 | scripts. Copy the scriptx map for this character (which covers those |
| 145 | scripts that appear in script extension lists), set the remaining values to |
| 146 | zero, and then, except for Common or Inherited, add this script's bit to |
| 147 | the map. */ |
| 148 | |
| 149 | memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t)); |
| 150 | memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t)); |
| 151 | if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script); |
| 152 | |
| 153 | /* Handle the different checking states */ |
| 154 | |
| 155 | switch(require_state) |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 156 | { |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 157 | /* First significant character - it might follow Common or Inherited |
| 158 | characters that do not have any script extensions. */ |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 159 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 160 | case SCRIPT_UNSET: |
| 161 | switch(script) |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 162 | { |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 163 | case ucp_Han: |
| 164 | require_state = SCRIPT_HANPENDING; |
| 165 | break; |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 166 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 167 | case ucp_Hiragana: |
| 168 | case ucp_Katakana: |
| 169 | require_state = SCRIPT_HANHIRAKATA; |
| 170 | break; |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 171 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 172 | case ucp_Bopomofo: |
| 173 | require_state = SCRIPT_HANBOPOMOFO; |
| 174 | break; |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 175 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 176 | case ucp_Hangul: |
| 177 | require_state = SCRIPT_HANHANGUL; |
| 178 | break; |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 179 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 180 | default: |
| 181 | memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t)); |
| 182 | require_state = SCRIPT_MAP; |
| 183 | break; |
| 184 | } |
| 185 | break; |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 186 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 187 | /* The first significant character was Han. An inspection of the Unicode |
| 188 | 11.0.0 files shows that there are the following types of Script Extension |
| 189 | list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul |
| 190 | scripts: |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 191 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 192 | . Bopomofo + Han |
| 193 | . Han + Hiragana + Katakana |
| 194 | . Hiragana + Katakana |
| 195 | . Bopopmofo + Hangul + Han + Hiragana + Katakana |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 196 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 197 | The following code tries to make sense of this. */ |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 198 | |
| 199 | #define FOUND_BOPOMOFO 1 |
| 200 | #define FOUND_HIRAGANA 2 |
| 201 | #define FOUND_KATAKANA 4 |
| 202 | #define FOUND_HANGUL 8 |
| 203 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 204 | case SCRIPT_HANPENDING: |
| 205 | if (script != ucp_Han) /* Another Han does nothing */ |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 206 | { |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 207 | uint32_t chspecial = 0; |
| 208 | |
| 209 | if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO; |
| 210 | if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA; |
| 211 | if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA; |
| 212 | if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL; |
| 213 | |
| 214 | if (chspecial == 0) return FALSE; /* Not allowed with Han */ |
| 215 | |
| 216 | if (chspecial == FOUND_BOPOMOFO) |
| 217 | require_state = SCRIPT_HANBOPOMOFO; |
| 218 | else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA)) |
| 219 | require_state = SCRIPT_HANHIRAKATA; |
| 220 | |
| 221 | /* Otherwise this character must be allowed with all of them, so remain |
| 222 | in the pending state. */ |
| 223 | } |
| 224 | break; |
| 225 | |
| 226 | /* Previously encountered one of the "with Han" scripts. Check that |
| 227 | this character is appropriate. */ |
| 228 | |
| 229 | case SCRIPT_HANHIRAKATA: |
| 230 | if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) + |
| 231 | MAPBIT(map, ucp_Katakana) == 0) return FALSE; |
| 232 | break; |
| 233 | |
| 234 | case SCRIPT_HANBOPOMOFO: |
| 235 | if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE; |
| 236 | break; |
| 237 | |
| 238 | case SCRIPT_HANHANGUL: |
| 239 | if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE; |
| 240 | break; |
| 241 | |
| 242 | /* Previously encountered one or more characters that are allowed with a |
| 243 | list of scripts. */ |
| 244 | |
| 245 | case SCRIPT_MAP: |
| 246 | OK = FALSE; |
| 247 | |
| 248 | for (int i = 0; i < FULL_MAPSIZE; i++) |
| 249 | { |
| 250 | if ((require_map[i] & map[i]) != 0) |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 251 | { |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 252 | OK = TRUE; |
| 253 | break; |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 254 | } |
| 255 | } |
| 256 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 257 | if (!OK) return FALSE; |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 258 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 259 | /* The rest of the string must be in this script, but we have to |
| 260 | allow for the Han complications. */ |
| 261 | |
| 262 | switch(script) |
| 263 | { |
| 264 | case ucp_Han: |
| 265 | require_state = SCRIPT_HANPENDING; |
| 266 | break; |
| 267 | |
| 268 | case ucp_Hiragana: |
| 269 | case ucp_Katakana: |
| 270 | require_state = SCRIPT_HANHIRAKATA; |
| 271 | break; |
| 272 | |
| 273 | case ucp_Bopomofo: |
| 274 | require_state = SCRIPT_HANBOPOMOFO; |
| 275 | break; |
| 276 | |
| 277 | case ucp_Hangul: |
| 278 | require_state = SCRIPT_HANHANGUL; |
| 279 | break; |
| 280 | |
| 281 | /* Compute the intersection of the required list of scripts and the |
| 282 | allowed scripts for this character. */ |
| 283 | |
| 284 | default: |
| 285 | for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i]; |
| 286 | break; |
| 287 | } |
| 288 | |
| 289 | break; |
| 290 | } |
| 291 | } /* End checking character's script and extensions. */ |
| 292 | |
| 293 | /* The character is in an acceptable script. We must now ensure that all |
| 294 | decimal digits in the string come from the same set. Some scripts (e.g. |
| 295 | Common, Arabic) have more than one set of decimal digits. This code does |
| 296 | not allow mixing sets, even within the same script. The vector called |
| 297 | PRIV(ucd_digit_sets)[] contains, in its first element, the number of |
| 298 | following elements, and then, in ascending order, the code points of the |
| 299 | '9' characters in every set of 10 digits. Each set is identified by the |
| 300 | offset in the vector of its '9' character. An initial check of the first |
| 301 | value picks up ASCII digits quickly. Otherwise, a binary chop is used. */ |
| 302 | |
| 303 | if (ucd->chartype == ucp_Nd) |
| 304 | { |
| 305 | uint32_t digitset; |
| 306 | |
| 307 | if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else |
| 308 | { |
| 309 | int mid; |
| 310 | int bot = 1; |
| 311 | int top = PRIV(ucd_digit_sets)[0]; |
| 312 | for (;;) |
| 313 | { |
| 314 | if (top <= bot + 1) /* <= rather than == is paranoia */ |
| 315 | { |
| 316 | digitset = top; |
| 317 | break; |
| 318 | } |
| 319 | mid = (top + bot) / 2; |
| 320 | if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid; |
| 321 | } |
| 322 | } |
| 323 | |
| 324 | /* A required value of 0 means "unset". */ |
| 325 | |
| 326 | if (require_digitset == 0) require_digitset = digitset; |
| 327 | else if (digitset != require_digitset) return FALSE; |
| 328 | } /* End digit handling */ |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 329 | |
| 330 | /* If we haven't yet got to the end, pick up the next character. */ |
| 331 | |
| 332 | if (ptr >= endptr) return TRUE; |
| 333 | GETCHARINCTEST(c, ptr); |
| 334 | } /* End checking loop */ |
| 335 | |
| 336 | #else /* NOT SUPPORT_UNICODE */ |
| 337 | (void)ptr; |
| 338 | (void)endptr; |
| 339 | (void)utf; |
| 340 | return TRUE; |
| 341 | #endif /* SUPPORT_UNICODE */ |
| 342 | } |
| 343 | |
/* End of pcre2_script_run.c */