Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 1 | /************************************************* |
| 2 | * Perl-Compatible Regular Expressions * |
| 3 | *************************************************/ |
| 4 | |
| 5 | /* PCRE is a library of functions to support regular expressions whose syntax |
| 6 | and semantics are as close as possible to those of the Perl 5 language. |
| 7 | |
| 8 | Written by Philip Hazel |
| 9 | Original API code Copyright (c) 1997-2012 University of Cambridge |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 10 | New API code Copyright (c) 2016-2018 University of Cambridge |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 11 | |
| 12 | ----------------------------------------------------------------------------- |
| 13 | Redistribution and use in source and binary forms, with or without |
| 14 | modification, are permitted provided that the following conditions are met: |
| 15 | |
| 16 | * Redistributions of source code must retain the above copyright notice, |
| 17 | this list of conditions and the following disclaimer. |
| 18 | |
| 19 | * Redistributions in binary form must reproduce the above copyright |
| 20 | notice, this list of conditions and the following disclaimer in the |
| 21 | documentation and/or other materials provided with the distribution. |
| 22 | |
| 23 | * Neither the name of the University of Cambridge nor the names of its |
| 24 | contributors may be used to endorse or promote products derived from |
| 25 | this software without specific prior written permission. |
| 26 | |
| 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 37 | POSSIBILITY OF SUCH DAMAGE. |
| 38 | ----------------------------------------------------------------------------- |
| 39 | */ |
| 40 | |
| 41 | |
/* This module contains mode-dependent macro and structure definitions. The
file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined.
These mode-dependent items are kept in a separate file so that they can also be
#included multiple times for different code unit widths by pcre2test in order
to have access to the hidden structures at all supported widths.

Some of the mode-dependent macros are required at different widths for
different parts of the pcre2test code (in particular, the included
pcre2_printint.c file). We undefine them here so that they can be re-defined
for multiple inclusions. Not all of these are used in pcre2test, but it's
easier just to undefine them all. */
| 53 | |
/* Drop any earlier definitions of the mode-dependent macros so that this file
can be included again for a different code unit width. */

#undef ACROSSCHAR
#undef BACKCHAR
#undef BYTES2CU
#undef CHMAX_255
#undef CU2BYTES
#undef FORWARDCHAR
#undef FORWARDCHARTEST
#undef GET
#undef GET2
#undef GETCHAR
#undef GETCHARINC
#undef GETCHARINCTEST
#undef GETCHARLEN
#undef GETCHARLENTEST
#undef GETCHARTEST
#undef GET_EXTRALEN
#undef HAS_EXTRALEN
#undef IMM2_SIZE
#undef MAX_255
#undef MAX_MARK
#undef MAX_PATTERN_SIZE
#undef MAX_UTF_SINGLE_CU
#undef NOT_FIRSTCU
#undef PUT
#undef PUT2
#undef PUT2INC
#undef PUTCHAR
#undef PUTINC
#undef TABLE_GET
| 83 | |
| 84 | |
| 85 | |
| 86 | /* -------------------------- MACROS ----------------------------- */ |
| 87 | |
| 88 | /* PCRE keeps offsets in its compiled code as at least 16-bit quantities |
| 89 | (always stored in big-endian order in 8-bit mode) by default. These are used, |
| 90 | for example, to link from the start of a subpattern to its alternatives and its |
| 91 | end. The use of 16 bits per offset limits the size of an 8-bit compiled regex |
| 92 | to around 64K, which is big enough for almost everybody. However, I received a |
| 93 | request for an even bigger limit. For this reason, and also to make the code |
| 94 | easier to maintain, the storing and loading of offsets from the compiled code |
| 95 | unit string is now handled by the macros that are defined here. |
| 96 | |
| 97 | The macros are controlled by the value of LINK_SIZE. This defaults to 2, but |
Janis Danisevskis | 8b979b2 | 2016-08-15 16:09:16 +0100 | [diff] [blame] | 98 | values of 3 or 4 are also supported. */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 99 | |
| 100 | /* ------------------- 8-bit support ------------------ */ |
| 101 | |
| 102 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
| 103 | |
| 104 | #if LINK_SIZE == 2 |
/* LINK_SIZE == 2: an offset occupies two code units, most significant first.
PUT stores d at a[n..n+1]; GET reads it back as an unsigned int. */
#define PUT(a,n,d)   \
  (((a)[n] = (PCRE2_UCHAR)((d) >> 8)), \
  ((a)[(n)+1] = (PCRE2_UCHAR)((d) & 255)))
#define GET(a,n) \
  ((unsigned int)(((a)[n] << 8) | (a)[(n)+1]))
#define MAX_PATTERN_SIZE (1 << 16)
| 111 | |
| 112 | #elif LINK_SIZE == 3 |
/* LINK_SIZE == 3: an offset occupies three code units, most significant
first. */
#define PUT(a,n,d)   \
  (((a)[n] = (PCRE2_UCHAR)((d) >> 16)), \
  ((a)[(n)+1] = (PCRE2_UCHAR)((d) >> 8)), \
  ((a)[(n)+2] = (PCRE2_UCHAR)((d) & 255)))
#define GET(a,n) \
  ((unsigned int)(((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2]))
#define MAX_PATTERN_SIZE (1 << 24)
| 120 | |
| 121 | #elif LINK_SIZE == 4 |
/* LINK_SIZE == 4: an offset occupies four code units, most significant first.
The top unit is converted to unsigned int before shifting by 24, because a
promoted (signed) int holding a value >= 0x80 cannot be shifted that far
without overflowing. */
#define PUT(a,n,d)   \
  (((a)[n] = (PCRE2_UCHAR)((d) >> 24)), \
  ((a)[(n)+1] = (PCRE2_UCHAR)((d) >> 16)), \
  ((a)[(n)+2] = (PCRE2_UCHAR)((d) >> 8)), \
  ((a)[(n)+3] = (PCRE2_UCHAR)((d) & 255)))
#define GET(a,n) \
  (((unsigned int)(a)[n] << 24) | ((unsigned int)(a)[(n)+1] << 16) | \
   ((unsigned int)(a)[(n)+2] << 8) | (unsigned int)(a)[(n)+3])
#define MAX_PATTERN_SIZE (1 << 30)   /* Keep it positive */
| 130 | |
| 131 | #else |
| 132 | #error LINK_SIZE must be 2, 3, or 4 |
| 133 | #endif |
| 134 | |
| 135 | |
| 136 | /* ------------------- 16-bit support ------------------ */ |
| 137 | |
| 138 | #elif PCRE2_CODE_UNIT_WIDTH == 16 |
| 139 | |
| 140 | #if LINK_SIZE == 2 |
/* A configured LINK_SIZE of 2 is re-expressed as one 16-bit code unit. */
#undef LINK_SIZE
#define LINK_SIZE 1
#define PUT(a,n,d)   \
  ((a)[n] = (PCRE2_UCHAR)(d))
#define GET(a,n) \
  ((a)[n])
#define MAX_PATTERN_SIZE (1 << 16)
| 148 | |
| 149 | #elif LINK_SIZE == 3 || LINK_SIZE == 4 |
/* A configured LINK_SIZE of 3 or 4 is re-expressed as two 16-bit code units,
most significant first. The high unit is converted to unsigned int before the
shift, because a promoted (signed) int holding a value >= 0x8000 cannot be
shifted left by 16 without overflowing. */
#undef LINK_SIZE
#define LINK_SIZE 2
#define PUT(a,n,d)   \
  (((a)[n] = (PCRE2_UCHAR)((d) >> 16)), \
  ((a)[(n)+1] = (PCRE2_UCHAR)((d) & 65535)))
#define GET(a,n) \
  (((unsigned int)(a)[n] << 16) | (unsigned int)(a)[(n)+1])
#define MAX_PATTERN_SIZE (1 << 30)  /* Keep it positive */
| 158 | |
| 159 | #else |
| 160 | #error LINK_SIZE must be 2, 3, or 4 |
| 161 | #endif |
| 162 | |
| 163 | |
| 164 | /* ------------------- 32-bit support ------------------ */ |
| 165 | |
| 166 | #elif PCRE2_CODE_UNIT_WIDTH == 32 |
/* In 32-bit mode an offset always occupies exactly one code unit, whatever
LINK_SIZE was configured. */
#undef LINK_SIZE
#define LINK_SIZE 1
#define PUT(a,n,d)   \
  ((a)[n] = (d))
#define GET(a,n) \
  ((a)[n])
#define MAX_PATTERN_SIZE (1 << 30)  /* Keep it positive */
| 174 | |
| 175 | #else |
| 176 | #error Unsupported compiling mode |
| 177 | #endif |
| 178 | |
| 179 | |
| 180 | /* --------------- Other mode-specific macros ----------------- */ |
| 181 | |
/* PCRE uses some other (at least) 16-bit quantities that do not change when
the size of offsets changes. These are used for repeat counts and for other
things such as capturing parenthesis numbers in back references.

Define the number of code units required to hold a 16-bit count/offset, and
macros to load and store such a value. For reasons that I do not understand,
the expression in the 8-bit GET2 macro is treated by gcc as a signed
expression, even when a is declared as unsigned. It seems that any kind of
arithmetic results in a signed value. Hence the cast. */
| 191 | |
| 192 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
/* 8-bit code units: a 16-bit value takes two units, most significant first.
The macro arguments and the whole expansion are parenthesized so that pointer
expressions such as "p + 1" can be passed safely. */
#define IMM2_SIZE 2
#define GET2(a,n) ((unsigned int)(((a)[n] << 8) | (a)[(n)+1]))
#define PUT2(a,n,d) ((a)[n] = (d) >> 8, (a)[(n)+1] = (d) & 255)
| 196 | |
| 197 | #else /* Code units are 16 or 32 bits */ |
/* 16-bit or 32-bit code units: a 16-bit value fits in a single unit. */
#define IMM2_SIZE 1
#define GET2(a,n) ((a)[n])
#define PUT2(a,n,d) ((a)[n] = (d))
| 201 | #endif |
| 202 | |
| 203 | /* Other macros that are different for 8-bit mode. The MAX_255 macro checks |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 204 | whether its argument, which is assumed to be one code unit, is less than 256. |
| 205 | The CHMAX_255 macro does not assume one code unit. The maximum length of a MARK |
| 206 | name must fit in one code unit; currently it is set to 255 or 65535. The |
| 207 | TABLE_GET macro is used to access elements of tables containing exactly 256 |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 208 | items. Its argument is a code unit. When code points can be greater than 255, a |
| 209 | check is needed before accessing these tables. */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 210 | |
| 211 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
/* 8-bit code units: MAX_255 is unconditionally TRUE and TABLE_GET indexes the
table directly, ignoring its default argument. CHMAX_255 performs a real
comparison only when SUPPORT_UNICODE is defined. */
#define MAX_255(c) TRUE
#define MAX_MARK ((1u << 8) - 1)
#define TABLE_GET(c, table, default) ((table)[c])
#ifdef SUPPORT_UNICODE
#define SUPPORT_WIDE_CHARS
#define CHMAX_255(c) ((c) <= 255u)
#else
#define CHMAX_255(c) TRUE
#endif  /* SUPPORT_UNICODE */
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 221 | |
| 222 | #else /* Code units are 16 or 32 bits */ |
/* 16-bit or 32-bit code units: a code unit may exceed 255, so TABLE_GET checks
the range and yields the default for out-of-range values. Note that TABLE_GET
evaluates c twice. */
#define CHMAX_255(c) ((c) <= 255u)
#define MAX_255(c) ((c) <= 255u)
#define MAX_MARK ((1u << 16) - 1)
#define SUPPORT_WIDE_CHARS
#define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default))
| 228 | #endif |
| 229 | |
| 230 | |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 231 | /* ----------------- Character-handling macros ----------------- */ |
| 232 | |
| 233 | /* There is a proposed future special "UTF-21" mode, in which only the lowest |
| 234 | 21 bits of a 32-bit character are interpreted as UTF, with the remaining 11 |
| 235 | high-order bits available to the application for other uses. In preparation for |
| 236 | the future implementation of this mode, there are macros that load a data item |
| 237 | and, if in this special mode, mask it to 21 bits. These macros all have names |
| 238 | starting with UCHAR21. In all other modes, including the normal 32-bit |
| 239 | library, the macros all have the same simple definitions. When the new mode is |
| 240 | implemented, it is expected that these definitions will be varied appropriately |
| 241 | using #ifdef when compiling the library that supports the special mode. */ |
| 242 | |
/* Plain loads of one code unit. The INC forms also advance the pointer past
the unit that was read. */
#define UCHAR21(eptr)        (*(eptr))
#define UCHAR21TEST(eptr)    (*(eptr))
#define UCHAR21INC(eptr)     (*(eptr)++)
#define UCHAR21INCTEST(eptr) (*(eptr)++)
| 247 | |
| 248 | /* When UTF encoding is being used, a character is no longer just a single |
| 249 | byte in 8-bit mode or a single short in 16-bit mode. The macros for character |
| 250 | handling generate simple sequences when used in the basic mode, and more |
| 251 | complicated ones for UTF characters. GETCHARLENTEST and other macros are not |
| 252 | used when UTF is not supported. To make sure they can never even appear when |
| 253 | UTF support is omitted, we don't even define them. */ |
| 254 | |
| 255 | #ifndef SUPPORT_UNICODE |
| 256 | |
/* Without Unicode support every character is exactly one code unit, so the
fetch macros are plain loads. Each fetch macro expands to a complete statement
(it carries its own trailing semicolon). The macros shown commented out are
deliberately left undefined in this configuration. */

/* #define MAX_UTF_SINGLE_CU */
/* #define HAS_EXTRALEN(c) */
/* #define GET_EXTRALEN(c) */
/* #define NOT_FIRSTCU(c) */
#define GETCHAR(c, eptr) (c) = *(eptr);
#define GETCHARTEST(c, eptr) (c) = *(eptr);
#define GETCHARINC(c, eptr) (c) = *(eptr)++;
#define GETCHARINCTEST(c, eptr) (c) = *(eptr)++;
#define GETCHARLEN(c, eptr, len) (c) = *(eptr);
#define PUTCHAR(c, p) (*(p) = (c), 1)
/* #define GETCHARLENTEST(c, eptr, len) */
/* #define BACKCHAR(eptr) */
/* #define FORWARDCHAR(eptr) */
/* #define FORWARDCHARTEST(eptr,end) */
/* #define ACROSSCHAR(condition, eptr, action) */
| 272 | |
| 273 | #else /* SUPPORT_UNICODE */ |
| 274 | |
| 275 | /* ------------------- 8-bit support ------------------ */ |
| 276 | |
| 277 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
#define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */

/* The largest UTF code point that can be encoded as a single code unit. */

#define MAX_UTF_SINGLE_CU 127

/* Tests whether the code point needs extra characters to decode. */

#define HAS_EXTRALEN(c) HASUTF8EXTRALEN(c)

/* Returns with the additional number of characters if HAS_EXTRALEN(c) is TRUE.
Otherwise it has an undefined behaviour. The value is looked up in utf8_table4
using the low six bits of c. */

#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3fu])

/* Returns TRUE, if the given value is not the first code unit of a UTF
sequence, that is, when its top two bits are 10. */

#define NOT_FIRSTCU(c) (((c) & 0xc0u) == 0x80u)
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 297 | |
| 298 | /* Get the next UTF-8 character, not advancing the pointer. This is called when |
| 299 | we know we are in UTF-8 mode. */ |
| 300 | |
/* NOTE: each macro below expands to two statements (a load followed by an
"if"), so it must only be used where a statement sequence is valid, not as the
unbraced body of an if/else/loop. Arguments are parenthesized so that pointer
expressions can be passed to the non-incrementing forms. */

#define GETCHAR(c, eptr) \
  (c) = *(eptr); \
  if ((c) >= 0xc0u) GETUTF8(c, eptr);

/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
pointer. */

#define GETCHARTEST(c, eptr) \
  (c) = *(eptr); \
  if (utf && (c) >= 0xc0u) GETUTF8(c, eptr);

/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode. */

#define GETCHARINC(c, eptr) \
  (c) = *(eptr)++; \
  if ((c) >= 0xc0u) GETUTF8INC(c, eptr);

/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-8 mode. */

#define GETCHARINCTEST(c, eptr) \
  (c) = *(eptr)++; \
  if (utf && (c) >= 0xc0u) GETUTF8INC(c, eptr);

/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode. */

#define GETCHARLEN(c, eptr, len) \
  (c) = *(eptr); \
  if ((c) >= 0xc0u) GETUTF8LEN(c, eptr, len);

/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
pointer, incrementing length if there are extra bytes. This is called when we
do not know if we are in UTF-8 mode. */

#define GETCHARLENTEST(c, eptr, len) \
  (c) = *(eptr); \
  if (utf && (c) >= 0xc0u) GETUTF8LEN(c, eptr, len);
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 340 | |
| 341 | /* If the pointer is not at the start of a character, move it back until |
| 342 | it is. This is called only in UTF-8 mode - we don't put a test within the macro |
| 343 | because almost all calls are already within a block of UTF-8 only code. */ |
| 344 | |
/* Step eptr backwards while it points at a code unit whose top two bits
are 10. */
#define BACKCHAR(eptr) while((*(eptr) & 0xc0u) == 0x80u) (eptr)--

/* Same as above, just in the other direction. */
#define FORWARDCHAR(eptr) while((*(eptr) & 0xc0u) == 0x80u) (eptr)++
#define FORWARDCHARTEST(eptr,end) while((eptr) < (end) && (*(eptr) & 0xc0u) == 0x80u) (eptr)++

/* Same as above, but it allows a fully customizable form. */
#define ACROSSCHAR(condition, eptr, action) \
  while((condition) && ((*(eptr)) & 0xc0u) == 0x80u) action

/* Deposit a character into memory, returning the number of code units. */

#define PUTCHAR(c, p) ((utf && (c) > MAX_UTF_SINGLE_CU)? \
  PRIV(ord2utf)(c,p) : (*(p) = (c), 1))
| 359 | |
| 360 | |
| 361 | /* ------------------- 16-bit support ------------------ */ |
| 362 | |
| 363 | #elif PCRE2_CODE_UNIT_WIDTH == 16 |
#define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */

/* The largest UTF code point that can be encoded as a single code unit. */

#define MAX_UTF_SINGLE_CU 65535

/* Tests whether the code point needs extra characters to decode, that is,
whether c lies in the range 0xd800-0xdbff. */

#define HAS_EXTRALEN(c) (((c) & 0xfc00u) == 0xd800u)

/* Returns with the additional number of characters if HAS_EXTRALEN(c) is TRUE.
Otherwise it has an undefined behaviour. */

#define GET_EXTRALEN(c) 1

/* Returns TRUE, if the given value is not the first code unit of a UTF
sequence, that is, whether c lies in the range 0xdc00-0xdfff. */

#define NOT_FIRSTCU(c) (((c) & 0xfc00u) == 0xdc00u)
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 383 | |
| 384 | /* Base macro to pick up the low surrogate of a UTF-16 character, not |
| 385 | advancing the pointer. */ |
| 386 | |
/* NOTE: the GETCHAR* macros below each expand to two statements (a load
followed by an "if"), so they must only be used where a statement sequence is
valid. Arguments are parenthesized so that pointer expressions can be passed
to the non-incrementing forms. */

#define GETUTF16(c, eptr) \
   { (c) = ((((c) & 0x3ffu) << 10) | ((eptr)[1] & 0x3ffu)) + 0x10000u; }

/* Get the next UTF-16 character, not advancing the pointer. This is called when
we know we are in UTF-16 mode. */

#define GETCHAR(c, eptr) \
  (c) = *(eptr); \
  if (((c) & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);

/* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the
pointer. */

#define GETCHARTEST(c, eptr) \
  (c) = *(eptr); \
  if (utf && ((c) & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);

/* Base macro to pick up the low surrogate of a UTF-16 character, advancing
the pointer. */

#define GETUTF16INC(c, eptr) \
   { (c) = ((((c) & 0x3ffu) << 10) | (*(eptr)++ & 0x3ffu)) + 0x10000u; }

/* Get the next UTF-16 character, advancing the pointer. This is called when we
know we are in UTF-16 mode. */

#define GETCHARINC(c, eptr) \
  (c) = *(eptr)++; \
  if (((c) & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);

/* Get the next character, testing for UTF-16 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-16 mode. */

#define GETCHARINCTEST(c, eptr) \
  (c) = *(eptr)++; \
  if (utf && ((c) & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);

/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer, incrementing the length. */

#define GETUTF16LEN(c, eptr, len) \
   { (c) = ((((c) & 0x3ffu) << 10) | ((eptr)[1] & 0x3ffu)) + 0x10000u; (len)++; }

/* Get the next UTF-16 character, not advancing the pointer, incrementing
length if there is a low surrogate. This is called when we know we are in
UTF-16 mode. */

#define GETCHARLEN(c, eptr, len) \
  (c) = *(eptr); \
  if (((c) & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);

/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the
pointer, incrementing length if there is a low surrogate. This is called when
we do not know if we are in UTF-16 mode. */

#define GETCHARLENTEST(c, eptr, len) \
  (c) = *(eptr); \
  if (utf && ((c) & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 445 | |
| 446 | /* If the pointer is not at the start of a character, move it back until |
| 447 | it is. This is called only in UTF-16 mode - we don't put a test within the |
| 448 | macro because almost all calls are already within a block of UTF-16 only |
| 449 | code. */ |
| 450 | |
/* Step eptr back by one unit if it points at a unit in 0xdc00-0xdfff. The
body is wrapped in do { } while (0), as the 32-bit definitions are, so that
a following "else" cannot attach to the macro's internal "if". */
#define BACKCHAR(eptr) \
  do { if ((*(eptr) & 0xfc00u) == 0xdc00u) (eptr)--; } while (0)

/* Same as above, just in the other direction. */
#define FORWARDCHAR(eptr) \
  do { if ((*(eptr) & 0xfc00u) == 0xdc00u) (eptr)++; } while (0)
#define FORWARDCHARTEST(eptr,end) \
  do { if ((eptr) < (end) && (*(eptr) & 0xfc00u) == 0xdc00u) (eptr)++; } while (0)

/* Same as above, but it allows a fully customizable form. This one is left as
a bare "if" so that the caller-supplied action keeps its original meaning; do
not follow it directly with "else". */
#define ACROSSCHAR(condition, eptr, action) \
  if ((condition) && ((*(eptr)) & 0xfc00u) == 0xdc00u) action

/* Deposit a character into memory, returning the number of code units. */

#define PUTCHAR(c, p) ((utf && (c) > MAX_UTF_SINGLE_CU)? \
  PRIV(ord2utf)(c,p) : (*(p) = (c), 1))
| 465 | |
| 466 | |
| 467 | /* ------------------- 32-bit support ------------------ */ |
| 468 | |
| 469 | #else |
| 470 | |
| 471 | /* These are trivial for the 32-bit library, since all UTF-32 characters fit |
| 472 | into one PCRE2_UCHAR unit. */ |
| 473 | |
/* In 32-bit mode no code unit needs extra units and every unit is treated as
the first unit of a character, so these tests are constant. */
#define MAX_UTF_SINGLE_CU (0x10ffffu)
#define HAS_EXTRALEN(c) (0)
#define GET_EXTRALEN(c) (0)
#define NOT_FIRSTCU(c) (0)
| 478 | |
| 479 | /* Get the next UTF-32 character, not advancing the pointer. This is called when |
| 480 | we know we are in UTF-32 mode. */ |
| 481 | |
/* Each macro below expands to a complete statement (it carries its own
trailing semicolon). None of them consults "utf" or modifies "len". */

#define GETCHAR(c, eptr) \
  (c) = *(eptr);

/* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing the
pointer. */

#define GETCHARTEST(c, eptr) \
  (c) = *(eptr);

/* Get the next UTF-32 character, advancing the pointer. This is called when we
know we are in UTF-32 mode. */

#define GETCHARINC(c, eptr) \
  (c) = *((eptr)++);

/* Get the next character, testing for UTF-32 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-32 mode. */

#define GETCHARINCTEST(c, eptr) \
  (c) = *((eptr)++);

/* Get the next UTF-32 character, not advancing the pointer, not incrementing
length (since all UTF-32 is of length 1). This is called when we know we are in
UTF-32 mode. */

#define GETCHARLEN(c, eptr, len) \
  GETCHAR(c, eptr)

/* Get the next UTF-32 character, testing for UTF-32 mode, not advancing the
pointer, not incrementing the length (since all UTF-32 is of length 1).
This is called when we do not know if we are in UTF-32 mode. */

#define GETCHARLENTEST(c, eptr, len) \
  GETCHARTEST(c, eptr)
| 516 | |
| 517 | /* If the pointer is not at the start of a character, move it back until |
| 518 | it is. This is called only in UTF-32 mode - we don't put a test within the |
| 519 | macro because almost all calls are already within a block of UTF-32 only |
| 520 | code. |
| 521 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 522 | These are all no-ops since all UTF-32 characters fit into one PCRE2_UCHAR. */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 523 | |
/* The four movement macros expand to an empty statement; their arguments
(including ACROSSCHAR's condition and action) are not evaluated. */
#define BACKCHAR(eptr) do { } while (0)

/* Same as above, just in the other direction. */

#define FORWARDCHAR(eptr) do { } while (0)
#define FORWARDCHARTEST(eptr,end) do { } while (0)

/* Same as above, but it allows a fully customizable form. */

#define ACROSSCHAR(condition, eptr, action) do { } while (0)

/* Deposit a character into memory, returning the number of code units. */

#define PUTCHAR(c, p) (*(p) = (c), 1)
| 538 | |
| 539 | #endif /* UTF-32 character handling */ |
| 540 | #endif /* SUPPORT_UNICODE */ |
| 541 | |
| 542 | |
| 543 | /* Mode-dependent macros that have the same definition in all modes. */ |
| 544 | |
/* CU2BYTES and BYTES2CU convert between counts of code units and counts of
bytes. PUTINC and PUT2INC store a value with PUT or PUT2 and then advance the
pointer a by LINK_SIZE or IMM2_SIZE units. The whole expansion is
parenthesized so that the store and the increment always stay together when
the macro is used inside a larger expression. */
#define CU2BYTES(x)     ((x)*((PCRE2_CODE_UNIT_WIDTH/8)))
#define BYTES2CU(x)     ((x)/((PCRE2_CODE_UNIT_WIDTH/8)))
#define PUTINC(a,n,d)   (PUT(a,n,d), (a) += LINK_SIZE)
#define PUT2INC(a,n,d)  (PUT2(a,n,d), (a) += IMM2_SIZE)
| 549 | |
| 550 | |
| 551 | /* ----------------------- HIDDEN STRUCTURES ----------------------------- */ |
| 552 | |
| 553 | /* NOTE: All these structures *must* start with a pcre2_memctl structure. The |
| 554 | code that uses them is simpler because it assumes this. */ |
| 555 | |
| 556 | /* The real general context structure. At present it holds only data for custom |
| 557 | memory control. */ |
| 558 | |
| 559 | typedef struct pcre2_real_general_context { |
| 560 | pcre2_memctl memctl; |
| 561 | } pcre2_real_general_context; |
| 562 | |
| 563 | /* The real compile context structure */ |
| 564 | |
| 565 | typedef struct pcre2_real_compile_context { |
| 566 | pcre2_memctl memctl; |
| 567 | int (*stack_guard)(uint32_t, void *); |
| 568 | void *stack_guard_data; |
| 569 | const uint8_t *tables; |
| 570 | PCRE2_SIZE max_pattern_length; |
| 571 | uint16_t bsr_convention; |
| 572 | uint16_t newline_convention; |
| 573 | uint32_t parens_nest_limit; |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 574 | uint32_t extra_options; |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 575 | } pcre2_real_compile_context; |
| 576 | |
| 577 | /* The real match context structure. */ |
| 578 | |
| 579 | typedef struct pcre2_real_match_context { |
| 580 | pcre2_memctl memctl; |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 581 | #ifdef SUPPORT_JIT |
| 582 | pcre2_jit_callback jit_callback; |
| 583 | void *jit_callback_data; |
| 584 | #endif |
| 585 | int (*callout)(pcre2_callout_block *, void *); |
| 586 | void *callout_data; |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 587 | int (*substitute_callout)(pcre2_substitute_callout_block *, void *); |
| 588 | void *substitute_callout_data; |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 589 | PCRE2_SIZE offset_limit; |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 590 | uint32_t heap_limit; |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 591 | uint32_t match_limit; |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 592 | uint32_t depth_limit; |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 593 | } pcre2_real_match_context; |
| 594 | |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 595 | /* The real convert context structure. */ |
| 596 | |
| 597 | typedef struct pcre2_real_convert_context { |
| 598 | pcre2_memctl memctl; |
| 599 | uint32_t glob_separator; |
| 600 | uint32_t glob_escape; |
| 601 | } pcre2_real_convert_context; |
| 602 | |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 603 | /* The real compiled code structure. The type for the blocksize field is |
| 604 | defined specially because it is required in pcre2_serialize_decode() when |
| 605 | copying the size from possibly unaligned memory into a variable of the same |
| 606 | type. Use a macro rather than a typedef to avoid compiler warnings when this |
| 607 | file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the |
| 608 | largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit |
| 609 | argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field |
| 610 | here.) */ |
| 611 | |
| 612 | #undef CODE_BLOCKSIZE_TYPE |
| 613 | #define CODE_BLOCKSIZE_TYPE size_t |
| 614 | |
| 615 | #undef LOOKBEHIND_MAX |
| 616 | #define LOOKBEHIND_MAX UINT16_MAX |
| 617 | |
| 618 | typedef struct pcre2_real_code { |
| 619 | pcre2_memctl memctl; /* Memory control fields */ |
| 620 | const uint8_t *tables; /* The character tables */ |
| 621 | void *executable_jit; /* Pointer to JIT code */ |
| 622 | uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */ |
| 623 | CODE_BLOCKSIZE_TYPE blocksize; /* Total (bytes) that was malloc-ed */ |
| 624 | uint32_t magic_number; /* Paranoid and endianness check */ |
| 625 | uint32_t compile_options; /* Options passed to pcre2_compile() */ |
| 626 | uint32_t overall_options; /* Options after processing the pattern */ |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 627 | uint32_t extra_options; /* Taken from compile_context */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 628 | uint32_t flags; /* Various state flags */ |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 629 | uint32_t limit_heap; /* Limit set in the pattern */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 630 | uint32_t limit_match; /* Limit set in the pattern */ |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 631 | uint32_t limit_depth; /* Limit set in the pattern */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 632 | uint32_t first_codeunit; /* Starting code unit */ |
| 633 | uint32_t last_codeunit; /* This codeunit must be seen */ |
| 634 | uint16_t bsr_convention; /* What \R matches */ |
| 635 | uint16_t newline_convention; /* What is a newline? */ |
| 636 | uint16_t max_lookbehind; /* Longest lookbehind (characters) */ |
| 637 | uint16_t minlength; /* Minimum length of match */ |
| 638 | uint16_t top_bracket; /* Highest numbered group */ |
| 639 | uint16_t top_backref; /* Highest numbered back reference */ |
| 640 | uint16_t name_entry_size; /* Size (code units) of table entries */ |
| 641 | uint16_t name_count; /* Number of name entries in the table */ |
| 642 | } pcre2_real_code; |
| 643 | |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 644 | /* The real match data structure. Define ovector as large as it can ever |
| 645 | actually be so that array bound checkers don't grumble. Memory for this |
| 646 | structure is obtained by calling pcre2_match_data_create(), which sets the size |
| 647 | as the offset of ovector plus a pair of elements for each capturable string, so |
| 648 | the size varies from call to call. As the maximum number of capturing |
| 649 | subpatterns is 65535 we must allow for 65536 strings to include the overall |
| 650 | match. (See also the heapframe structure below.) */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 651 | |
| 652 | typedef struct pcre2_real_match_data { |
| 653 | pcre2_memctl memctl; |
| 654 | const pcre2_real_code *code; /* The pattern used for the match */ |
| 655 | PCRE2_SPTR subject; /* The subject that was matched */ |
| 656 | PCRE2_SPTR mark; /* Pointer to last mark */ |
| 657 | PCRE2_SIZE leftchar; /* Offset to leftmost code unit */ |
| 658 | PCRE2_SIZE rightchar; /* Offset to rightmost code unit */ |
| 659 | PCRE2_SIZE startchar; /* Offset to starting code unit */ |
Elliott Hughes | 0c26e19 | 2019-08-07 12:24:46 -0700 | [diff] [blame] | 660 | uint8_t matchedby; /* Type of match (normal, JIT, DFA) */ |
| 661 | uint8_t flags; /* Various flags */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 662 | uint16_t oveccount; /* Number of pairs */ |
| 663 | int rc; /* The return code from the match */ |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 664 | PCRE2_SIZE ovector[131072]; /* Must be last in the structure */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 665 | } pcre2_real_match_data; |
| 666 | |
| 667 | |
| 668 | /* ----------------------- PRIVATE STRUCTURES ----------------------------- */ |
| 669 | |
| 670 | /* These structures are not needed for pcre2test. */ |
| 671 | |
| 672 | #ifndef PCRE2_PCRE2TEST |
| 673 | |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 674 | /* Structures for checking for mutual recursion when scanning compiled or |
| 675 | parsed code. */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 676 | |
| 677 | typedef struct recurse_check { |
| 678 | struct recurse_check *prev; |
| 679 | PCRE2_SPTR group; |
| 680 | } recurse_check; |
| 681 | |
/* Node pairing a link to another node of the same type with a pointer to a
32-bit item for a group. */

typedef struct parsed_recurse_check {
  struct parsed_recurse_check *prev;      /* Link to another node */
  uint32_t                    *groupptr;  /* Pointer for the group concerned */
} parsed_recurse_check;
| 686 | |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 687 | /* Structure for building a cache when filling in recursion offsets. */ |
| 688 | |
| 689 | typedef struct recurse_cache { |
| 690 | PCRE2_SPTR group; |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 691 | int groupnumber; |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 692 | } recurse_cache; |
| 693 | |
| 694 | /* Structure for maintaining a chain of pointers to the currently incomplete |
| 695 | branches, for testing for left recursion while compiling. */ |
| 696 | |
| 697 | typedef struct branch_chain { |
| 698 | struct branch_chain *outer; |
| 699 | PCRE2_UCHAR *current_branch; |
| 700 | } branch_chain; |
| 701 | |
| 702 | /* Structure for building a list of named groups during the first pass of |
| 703 | compiling. */ |
| 704 | |
| 705 | typedef struct named_group { |
| 706 | PCRE2_SPTR name; /* Points to the name in the pattern */ |
| 707 | uint32_t number; /* Group number */ |
| 708 | uint16_t length; /* Length of the name */ |
| 709 | uint16_t isdup; /* TRUE if a duplicate */ |
| 710 | } named_group; |
| 711 | |
| 712 | /* Structure for passing "static" information around between the functions |
| 713 | doing the compiling, so that they are thread-safe. */ |
| 714 | |
| 715 | typedef struct compile_block { |
| 716 | pcre2_real_compile_context *cx; /* Points to the compile context */ |
| 717 | const uint8_t *lcc; /* Points to lower casing table */ |
| 718 | const uint8_t *fcc; /* Points to case-flipping table */ |
| 719 | const uint8_t *cbits; /* Points to character type table */ |
| 720 | const uint8_t *ctypes; /* Points to table of type maps */ |
| 721 | PCRE2_SPTR start_workspace; /* The start of working space */ |
| 722 | PCRE2_SPTR start_code; /* The start of the compiled code */ |
| 723 | PCRE2_SPTR start_pattern; /* The start of the pattern */ |
| 724 | PCRE2_SPTR end_pattern; /* The end of the pattern */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 725 | PCRE2_UCHAR *name_table; /* The name/number table */ |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 726 | PCRE2_SIZE workspace_size; /* Size of workspace */ |
| 727 | PCRE2_SIZE small_ref_offset[10]; /* Offsets for \1 to \9 */ |
| 728 | PCRE2_SIZE erroroffset; /* Offset of error in pattern */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 729 | uint16_t names_found; /* Number of entries so far */ |
| 730 | uint16_t name_entry_size; /* Size of each entry */ |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 731 | uint16_t parens_depth; /* Depth of nested parentheses */ |
| 732 | uint16_t assert_depth; /* Depth of nested assertions */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 733 | open_capitem *open_caps; /* Chain of open capture items */ |
| 734 | named_group *named_groups; /* Points to vector in pre-compile */ |
| 735 | uint32_t named_group_list_size; /* Number of entries in the list */ |
| 736 | uint32_t external_options; /* External (initial) options */ |
| 737 | uint32_t external_flags; /* External flag bits to be set */ |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 738 | uint32_t bracount; /* Count of capturing parentheses */ |
| 739 | uint32_t lastcapture; /* Last capture encountered */ |
| 740 | uint32_t *parsed_pattern; /* Parsed pattern buffer */ |
| 741 | uint32_t *parsed_pattern_end; /* Parsed pattern should not get here */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 742 | uint32_t *groupinfo; /* Group info vector */ |
| 743 | uint32_t top_backref; /* Maximum back reference */ |
| 744 | uint32_t backref_map; /* Bitmap of low back refs */ |
| 745 | uint32_t nltype; /* Newline type */ |
| 746 | uint32_t nllen; /* Newline string length */ |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 747 | uint32_t class_range_start; /* Overall class range start */ |
| 748 | uint32_t class_range_end; /* Overall class range end */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 749 | PCRE2_UCHAR nl[4]; /* Newline string when fixed length */ |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 750 | uint32_t req_varyopt; /* "After variable item" flag for reqbyte */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 751 | int max_lookbehind; /* Maximum lookbehind (characters) */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 752 | BOOL had_accept; /* (*ACCEPT) encountered */ |
| 753 | BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */ |
| 754 | BOOL had_recurse; /* Had a recursion or subroutine call */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 755 | BOOL dupnames; /* Duplicate names exist */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 756 | } compile_block; |
| 757 | |
| 758 | /* Structure for keeping the properties of the in-memory stack used |
| 759 | by the JIT matcher. */ |
| 760 | |
| 761 | typedef struct pcre2_real_jit_stack { |
| 762 | pcre2_memctl memctl; |
| 763 | void* stack; |
| 764 | } pcre2_real_jit_stack; |
| 765 | |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 766 | /* Structure for items in a linked list that represents an explicit recursive |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 767 | call within the pattern when running pcre2_dfa_match(). */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 768 | |
| 769 | typedef struct dfa_recursion_info { |
| 770 | struct dfa_recursion_info *prevrec; |
| 771 | PCRE2_SPTR subject_position; |
| 772 | uint32_t group_num; |
| 773 | } dfa_recursion_info; |
| 774 | |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 775 | /* Structure for "stack" frames that are used for remembering backtracking |
| 776 | positions during matching. As these are used in a vector, with the ovector item |
| 777 | being extended, the size of the structure must be a multiple of PCRE2_SIZE. The |
| 778 | only way to check this at compile time is to force an error by generating an |
| 779 | array with a negative size. By putting this in a typedef (which is never used), |
| 780 | we don't generate any code when all is well. */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 781 | |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 782 | typedef struct heapframe { |
| 783 | |
| 784 | /* The first set of fields are variables that have to be preserved over calls |
| 785 | to RRMATCH(), but which do not need to be copied to new frames. */ |
| 786 | |
| 787 | PCRE2_SPTR ecode; /* The current position in the pattern */ |
| 788 | PCRE2_SPTR temp_sptr[2]; /* Used for short-term PCRE_SPTR values */ |
| 789 | PCRE2_SIZE length; /* Used for character, string, or code lengths */ |
| 790 | PCRE2_SIZE back_frame; /* Amount to subtract on RRETURN */ |
| 791 | PCRE2_SIZE temp_size; /* Used for short-term PCRE2_SIZE values */ |
| 792 | uint32_t rdepth; /* "Recursion" depth */ |
| 793 | uint32_t group_frame_type; /* Type information for group frames */ |
| 794 | uint32_t temp_32[4]; /* Used for short-term 32-bit or BOOL values */ |
| 795 | uint8_t return_id; /* Where to go on in internal "return" */ |
| 796 | uint8_t op; /* Processing opcode */ |
| 797 | |
Elliott Hughes | 653c210 | 2019-01-09 15:41:36 -0800 | [diff] [blame] | 798 | /* At this point, the structure is 16-bit aligned. On most architectures |
| 799 | the alignment requirement for a pointer will ensure that the eptr field below |
| 800 | is 32-bit or 64-bit aligned. However, on m68k it is fine to have a pointer |
| 801 | that is 16-bit aligned. We must therefore ensure that what comes between here |
| 802 | and eptr is an odd multiple of 16 bits so as to get back into 32-bit |
| 803 | alignment. This happens naturally when PCRE2_UCHAR is 8 bits wide, but needs |
| 804 | fudges in the other cases. In the 32-bit case the padding comes first so that |
| 805 | the occu field itself is 32-bit aligned. Without the padding, this structure |
| 806 | is no longer a multiple of PCRE2_SIZE on m68k, and the check below fails. */ |
| 807 | |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 808 | #if PCRE2_CODE_UNIT_WIDTH == 8 |
| 809 | PCRE2_UCHAR occu[6]; /* Used for other case code units */ |
| 810 | #elif PCRE2_CODE_UNIT_WIDTH == 16 |
| 811 | PCRE2_UCHAR occu[2]; /* Used for other case code units */ |
Elliott Hughes | 653c210 | 2019-01-09 15:41:36 -0800 | [diff] [blame] | 812 | uint8_t unused[2]; /* Ensure 32-bit alignment (see above) */ |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 813 | #else |
Elliott Hughes | 653c210 | 2019-01-09 15:41:36 -0800 | [diff] [blame] | 814 | uint8_t unused[2]; /* Ensure 32-bit alignment (see above) */ |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 815 | PCRE2_UCHAR occu[1]; /* Used for other case code units */ |
| 816 | #endif |
| 817 | |
| 818 | /* The rest have to be copied from the previous frame whenever a new frame |
| 819 | becomes current. The final field is specified as a large vector so that |
| 820 | runtime array bound checks don't catch references to it. However, for any |
| 821 | specific call to pcre2_match() the memory allocated for each frame structure |
| 822 | allows for exactly the right size ovector for the number of capturing |
| 823 | parentheses. (See also the comment for pcre2_real_match_data above.) */ |
| 824 | |
| 825 | PCRE2_SPTR eptr; /* MUST BE FIRST */ |
| 826 | PCRE2_SPTR start_match; /* Can be adjusted by \K */ |
| 827 | PCRE2_SPTR mark; /* Most recent mark on the success path */ |
| 828 | uint32_t current_recurse; /* Current (deepest) recursion number */ |
| 829 | uint32_t capture_last; /* Most recent capture */ |
| 830 | PCRE2_SIZE last_group_offset; /* Saved offset to most recent group frame */ |
| 831 | PCRE2_SIZE offset_top; /* Offset after highest capture */ |
| 832 | PCRE2_SIZE ovector[131072]; /* Must be last in the structure */ |
| 833 | } heapframe; |
| 834 | |
Elliott Hughes | 653c210 | 2019-01-09 15:41:36 -0800 | [diff] [blame] | 835 | /* This typedef is a check that the size of the heapframe structure is a |
| 836 | multiple of PCRE2_SIZE. See various comments above. */ |
| 837 | |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 838 | typedef char check_heapframe_size[ |
| 839 | ((sizeof(heapframe) % sizeof(PCRE2_SIZE)) == 0)? (+1):(-1)]; |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 840 | |
Elliott Hughes | 4e19c8e | 2022-04-15 15:11:02 -0700 | [diff] [blame] | 841 | /* Structure for computing the alignment of heapframe. */ |
| 842 | |
| 843 | typedef struct heapframe_align { |
| 844 | char unalign; /* Completely unalign the current offset */ |
| 845 | heapframe frame; /* Offset is its alignment */ |
| 846 | } heapframe_align; |
| 847 | |
| 848 | /* This define is the minimum alignment required for a heapframe, in bytes. */ |
| 849 | |
| 850 | #define HEAPFRAME_ALIGNMENT offsetof(heapframe_align, frame) |
| 851 | |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 852 | /* Structure for passing "static" information around between the functions |
| 853 | doing traditional NFA matching (pcre2_match() and friends). */ |
| 854 | |
| 855 | typedef struct match_block { |
| 856 | pcre2_memctl memctl; /* For general use */ |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 857 | PCRE2_SIZE frame_vector_size; /* Size of a backtracking frame */ |
| 858 | heapframe *match_frames; /* Points to vector of frames */ |
| 859 | heapframe *match_frames_top; /* Points after the end of the vector */ |
| 860 | heapframe *stack_frames; /* The original vector on the stack */ |
| 861 | PCRE2_SIZE heap_limit; /* As it says */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 862 | uint32_t match_limit; /* As it says */ |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 863 | uint32_t match_limit_depth; /* As it says */ |
| 864 | uint32_t match_call_count; /* Number of times a new frame is created */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 865 | BOOL hitend; /* Hit the end of the subject at some point */ |
| 866 | BOOL hasthen; /* Pattern contains (*THEN) */ |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 867 | BOOL allowemptypartial; /* Allow empty hard partial */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 868 | const uint8_t *lcc; /* Points to lower casing table */ |
| 869 | const uint8_t *fcc; /* Points to case-flipping table */ |
| 870 | const uint8_t *ctypes; /* Points to table of type maps */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 871 | PCRE2_SIZE start_offset; /* The start offset value */ |
| 872 | PCRE2_SIZE end_offset_top; /* Highwater mark at end of match */ |
| 873 | uint16_t partial; /* PARTIAL options */ |
| 874 | uint16_t bsr_convention; /* \R interpretation */ |
| 875 | uint16_t name_count; /* Number of names in name table */ |
| 876 | uint16_t name_entry_size; /* Size of entry in names table */ |
| 877 | PCRE2_SPTR name_table; /* Table of group names */ |
| 878 | PCRE2_SPTR start_code; /* For use when recursing */ |
| 879 | PCRE2_SPTR start_subject; /* Start of the subject string */ |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 880 | PCRE2_SPTR check_subject; /* Where UTF-checked from */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 881 | PCRE2_SPTR end_subject; /* End of the subject string */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 882 | PCRE2_SPTR end_match_ptr; /* Subject position at end match */ |
| 883 | PCRE2_SPTR start_used_ptr; /* Earliest consulted character */ |
| 884 | PCRE2_SPTR last_used_ptr; /* Latest consulted character */ |
| 885 | PCRE2_SPTR mark; /* Mark pointer to pass back on success */ |
| 886 | PCRE2_SPTR nomatch_mark; /* Mark pointer to pass back on failure */ |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 887 | PCRE2_SPTR verb_ecode_ptr; /* For passing back info */ |
| 888 | PCRE2_SPTR verb_skip_ptr; /* For passing back a (*SKIP) name */ |
| 889 | uint32_t verb_current_recurse; /* Current recurse when (*VERB) happens */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 890 | uint32_t moptions; /* Match options */ |
| 891 | uint32_t poptions; /* Pattern options */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 892 | uint32_t skip_arg_count; /* For counting SKIP_ARGs */ |
| 893 | uint32_t ignore_skip_arg; /* For re-run when SKIP arg name not found */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 894 | uint32_t nltype; /* Newline type */ |
| 895 | uint32_t nllen; /* Newline string length */ |
| 896 | PCRE2_UCHAR nl[4]; /* Newline string when fixed */ |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 897 | pcre2_callout_block *cb; /* Points to a callout block */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 898 | void *callout_data; /* To pass back to callouts */ |
| 899 | int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 900 | } match_block; |
| 901 | |
| 902 | /* A similar structure is used for the same purpose by the DFA matching |
| 903 | functions. */ |
| 904 | |
| 905 | typedef struct dfa_match_block { |
| 906 | pcre2_memctl memctl; /* For general use */ |
| 907 | PCRE2_SPTR start_code; /* Start of the compiled pattern */ |
| 908 | PCRE2_SPTR start_subject ; /* Start of the subject string */ |
| 909 | PCRE2_SPTR end_subject; /* End of subject string */ |
| 910 | PCRE2_SPTR start_used_ptr; /* Earliest consulted character */ |
| 911 | PCRE2_SPTR last_used_ptr; /* Latest consulted character */ |
| 912 | const uint8_t *tables; /* Character tables */ |
| 913 | PCRE2_SIZE start_offset; /* The start offset value */ |
Elliott Hughes | 653c210 | 2019-01-09 15:41:36 -0800 | [diff] [blame] | 914 | PCRE2_SIZE heap_limit; /* As it says */ |
| 915 | PCRE2_SIZE heap_used; /* As it says */ |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 916 | uint32_t match_limit; /* As it says */ |
| 917 | uint32_t match_limit_depth; /* As it says */ |
| 918 | uint32_t match_call_count; /* Number of calls of internal function */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 919 | uint32_t moptions; /* Match options */ |
| 920 | uint32_t poptions; /* Pattern options */ |
| 921 | uint32_t nltype; /* Newline type */ |
| 922 | uint32_t nllen; /* Newline string length */ |
Elliott Hughes | 2dbd7d2 | 2020-06-03 14:32:37 -0700 | [diff] [blame] | 923 | BOOL allowemptypartial; /* Allow empty hard partial */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 924 | PCRE2_UCHAR nl[4]; /* Newline string when fixed */ |
| 925 | uint16_t bsr_convention; /* \R interpretation */ |
Elliott Hughes | 9bc971b | 2018-07-27 13:23:14 -0700 | [diff] [blame] | 926 | pcre2_callout_block *cb; /* Points to a callout block */ |
Janis Danisevskis | 53e448c | 2016-03-31 13:35:25 +0100 | [diff] [blame] | 927 | void *callout_data; /* To pass back to callouts */ |
| 928 | int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */ |
| 929 | dfa_recursion_info *recursive; /* Linked list of recursion data */ |
| 930 | } dfa_match_block; |
| 931 | |
| 932 | #endif /* PCRE2_PCRE2TEST */ |
| 933 | |
| 934 | /* End of pcre2_intmodedep.h */ |